llama_cpp 0.0.2 → 0.0.3

@@ -16,6 +16,7 @@
  #include <stdlib.h>
  #include <string.h>
  #include <stdint.h>
+ #include <inttypes.h>
  #include <stdio.h>
  #include <float.h>
 
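The new <inttypes.h> include supports a change made throughout this release: tensor dimensions are widened from int to int64_t, and the PRId64 macros are the portable way to print such values. A minimal sketch (this helper is illustrative, not part of the diff):

#include <inttypes.h>
#include <stdio.h>

/* Hypothetical debug helper: print the four dimensions of a tensor whose
   ne[] entries are now int64_t. PRId64 expands to the right printf length
   modifier on every platform. */
static void print_ne(const char * name, const int64_t ne[4]) {
    printf("%s: [%" PRId64 ", %" PRId64 ", %" PRId64 ", %" PRId64 "]\n",
           name, ne[0], ne[1], ne[2], ne[3]);
}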
@@ -1961,42 +1962,71 @@ static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void * rest
  // Initialize accumulator with zeros
  __m256 acc = _mm256_setzero_ps();
 
- // Main loop
- // TODO: figure a way to do this in a portable way
- #ifdef __GNUC__
- #pragma GCC unroll 16
- #endif
- for (int i = 0; i < nb; ++i) {
- // Compute combined scale for the block
- const __m256 d = _mm256_mul_ps( _mm256_broadcast_ss( &x[i].d ), _mm256_broadcast_ss( &y[i].d ) );
-
- // Load 16 bytes, and unpack 4 bit fields into bytes, making 32 bytes
- __m256i bx = bytesFromNibbles( x[i].qs );
- __m256i by = bytesFromNibbles( y[i].qs );
-
- // Now we have a vector with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval.
- const __m256i off = _mm256_set1_epi8( 8 );
- bx = _mm256_sub_epi8( bx, off );
- by = _mm256_sub_epi8( by, off );
-
- // Get absolute values of x vectors
- const __m256i ax = _mm256_sign_epi8(bx, bx);
-
- // Sign the values of the y vectors
- const __m256i sy = _mm256_sign_epi8(by, bx);
-
- // Perform multiplication and create 16-bit values
- const __m256i dot = _mm256_maddubs_epi16(ax, sy);
-
- const __m256i ones = _mm256_set1_epi16(1);
- const __m256i i32 = _mm256_madd_epi16(ones, dot);
+ /* Prepare the constants we will need during execution */
+ const __m256i lowMask = _mm256_set1_epi8( 0xF );
+ const __m256i offset_8 = _mm256_set1_epi16( 8 );
 
- // Convert int32_t to float
- const __m256 p = _mm256_cvtepi32_ps( i32 );
+ #define UNROLL_COUNT 8
+ // make sure we only unroll multiples of the block count
+ assert(nb % UNROLL_COUNT == 0);
 
- // Apply the scale, and accumulate
- acc = _mm256_fmadd_ps( d, p, acc );
- }
+ // Main loop
+ for (int i = 0; i < nb; i+=UNROLL_COUNT) {
+
+ // This loop will be unrolled by the compiler
+ for (int u=0;u<UNROLL_COUNT;u++) {
+ /* Compute combined scale for the block */
+ const __m256 scale = _mm256_mul_ps(
+ _mm256_broadcast_ss( &x[i+u].d ),
+ _mm256_broadcast_ss( &y[i+u].d ) );
+
+ /* get input from x
+ Input: 32 Nibbles (16 bytes) at *x[i+u]
+ Output: 2 vectors with 16 values of type int16_t (x_high_q, x_low_q) */
+
+ /* Load 16 bytes from memory */
+ const __m128i tmp_x = _mm_loadu_si128( ( const __m128i* ) x[i+u].qs);
+ /* Expand bytes into uint16_t values */
+ const __m256i bytes_x = _mm256_cvtepu8_epi16(tmp_x);
+ /* Unpack values into individual bytes */
+ __m256i x_low_q = _mm256_and_si256( lowMask, bytes_x );
+ const __m256i pre_shift_x_high_q = _mm256_andnot_si256( lowMask, bytes_x );
+ __m256i x_high_q = _mm256_srli_epi16( pre_shift_x_high_q, 4 );
+ /* Now we have two vectors with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval. */
+ x_high_q = _mm256_sub_epi16( x_high_q, offset_8 );
+ x_low_q = _mm256_sub_epi16( x_low_q, offset_8 );
+
+ /* get input from y
+ Input: 32 Nibbles (16 bytes) at *y[i+u]
+ Output: 2 vectors with 16 values of type int16_t (y_high_q, y_low_q) */
+
+ /* Load 16 bytes from memory */
+ const __m128i tmp_y = _mm_loadu_si128( (const __m128i* ) y[i+u].qs);
+ /* Expand bytes into uint16_t values */
+ const __m256i bytes_y = _mm256_cvtepu8_epi16(tmp_y);
+ /* Unpack values into individual bytes */
+ const __m256i pre_shift_y_high_q = _mm256_andnot_si256( lowMask, bytes_y );
+ __m256i y_high_q = _mm256_srli_epi16( pre_shift_y_high_q, 4 );
+ __m256i y_low_q = _mm256_and_si256( lowMask, bytes_y );
+ /* Now we have two vectors with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval. */
+ y_high_q = _mm256_sub_epi16( y_high_q, offset_8 );
+ y_low_q = _mm256_sub_epi16( y_low_q, offset_8 );
+
+ /* Compute products of int16_t integers, add pairwise, store as int32_t */
+ __m256i xy_high_q = _mm256_madd_epi16( x_high_q, y_high_q );
+ __m256i xy_low_q = _mm256_madd_epi16( x_low_q, y_low_q );
+
+ /* Accumulate the products of int32_t integers -> we now have a vector of 8 int32_t */
+ __m256i xy_q = _mm256_add_epi32( xy_high_q, xy_low_q );
+
+ /* Convert the vector of 8 int32_t to 8 floats */
+ __m256 q = _mm256_cvtepi32_ps( xy_q );
+
+ /* Multiply q with scale and accumulate */
+ acc = _mm256_fmadd_ps( scale, q, acc );
+ }
+
+ }
 
  // Return horizontal sum of the acc vector
  __m128 res = _mm256_extractf128_ps( acc, 1 );
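For readers untangling the intrinsics above, here is a scalar sketch of what the unrolled AVX2 loop computes per pair of Q4_0 blocks: unpack two 4-bit quants from each byte, re-center them from [0..15] to [-8..7], multiply pairwise, and scale the integer sum by the product of the two block scales. It assumes the ggml block layout (float d; uint8_t qs[QK/2]; with QK == 32) and is only an illustration, not the shipped kernel:

static float dot_q4_0_block(const block_q4_0 * x, const block_q4_0 * y) {
    int sumi = 0;
    for (int j = 0; j < QK/2; j++) {
        const int x0 = (x->qs[j] & 0x0F) - 8;  /* low nibble, re-centered  */
        const int x1 = (x->qs[j] >>   4) - 8;  /* high nibble, re-centered */
        const int y0 = (y->qs[j] & 0x0F) - 8;
        const int y1 = (y->qs[j] >>   4) - 8;
        sumi += x0*y0 + x1*y1;
    }
    return x->d * y->d * (float) sumi;         /* apply combined block scale */
}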
@@ -2025,7 +2055,7 @@ static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void * rest
  bx = _mm_sub_epi8( bx, off );
  by = _mm_sub_epi8( by, off );
 
- // Get absolute values of x vectors
+ // Get absolute values of x vectors
  const __m128i ax = _mm_sign_epi8(bx, bx);
 
  // Sign the values of the y vectors
@@ -2774,7 +2804,7 @@ void ggml_print_objects(const struct ggml_context * ctx) {
  GGML_PRINT("%s: --- end ---\n", __func__);
  }
 
- int ggml_nelements(const struct ggml_tensor * tensor) {
+ int64_t ggml_nelements(const struct ggml_tensor * tensor) {
  static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
 
  return tensor->ne[0]*tensor->ne[1]*tensor->ne[2]*tensor->ne[3];
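Returning int64_t here (and, further down, accepting int64_t dimensions in the ggml_new_tensor_* constructors) means an element count is no longer truncated once its product passes INT32_MAX. A hedged usage sketch, with made-up sizes and an already-initialized ctx:

/* ~4.3e9 elements: the product overflows a 32-bit int but is now representable. */
const int64_t ne0 = 65536;
const int64_t ne1 = 65536;
struct ggml_tensor * t = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, ne0, ne1);

const int64_t n = ggml_nelements(t);   /* 4294967296, no overflow */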
@@ -3090,7 +3120,7 @@ struct ggml_tensor * ggml_new_tensor_impl(
  struct ggml_context * ctx,
  enum ggml_type type,
  int n_dims,
- const int* ne,
+ const int64_t* ne,
  void* data) {
  // always insert objects at the end of the context's memory pool
  struct ggml_object * obj_cur = ctx->objects_end;
@@ -3189,7 +3219,8 @@ struct ggml_tensor * ggml_new_tensor_impl(
  /*.pad =*/ { 0 },
  };
 
- ggml_assert_aligned(result->data);
+ // TODO: this should not be needed as long as we don't rely on aligned SIMD loads
+ //ggml_assert_aligned(result->data);
 
  for (int i = 0; i < n_dims; i++) {
  result->ne[i] = ne[i];
@@ -3210,44 +3241,44 @@ struct ggml_tensor * ggml_new_tensor(
  struct ggml_context * ctx,
  enum ggml_type type,
  int n_dims,
- const int * ne) {
+ const int64_t * ne) {
  return ggml_new_tensor_impl(ctx, type, n_dims, ne, NULL);
  }
 
  struct ggml_tensor * ggml_new_tensor_1d(
  struct ggml_context * ctx,
  enum ggml_type type,
- int ne0) {
+ int64_t ne0) {
  return ggml_new_tensor(ctx, type, 1, &ne0);
  }
 
  struct ggml_tensor * ggml_new_tensor_2d(
  struct ggml_context * ctx,
  enum ggml_type type,
- int ne0,
- int ne1) {
- const int ne[2] = { ne0, ne1 };
+ int64_t ne0,
+ int64_t ne1) {
+ const int64_t ne[2] = { ne0, ne1 };
  return ggml_new_tensor(ctx, type, 2, ne);
  }
 
  struct ggml_tensor * ggml_new_tensor_3d(
  struct ggml_context * ctx,
  enum ggml_type type,
- int ne0,
- int ne1,
- int ne2) {
- const int ne[3] = { ne0, ne1, ne2 };
+ int64_t ne0,
+ int64_t ne1,
+ int64_t ne2) {
+ const int64_t ne[3] = { ne0, ne1, ne2 };
  return ggml_new_tensor(ctx, type, 3, ne);
  }
 
  struct ggml_tensor * ggml_new_tensor_4d(
  struct ggml_context * ctx,
  enum ggml_type type,
- int ne0,
- int ne1,
- int ne2,
- int ne3) {
- const int ne[4] = { ne0, ne1, ne2, ne3 };
+ int64_t ne0,
+ int64_t ne1,
+ int64_t ne2,
+ int64_t ne3) {
+ const int64_t ne[4] = { ne0, ne1, ne2, ne3 };
  return ggml_new_tensor(ctx, type, 4, ne);
  }
 
@@ -3590,7 +3621,14 @@ float * ggml_get_data_f32(const struct ggml_tensor * tensor) {
  struct ggml_tensor * ggml_view_tensor(
  struct ggml_context * ctx,
  const struct ggml_tensor * src) {
- return ggml_new_tensor_impl(ctx, src->type, src->n_dims, src->ne, src->data);
+ struct ggml_tensor * result = ggml_new_tensor_impl(ctx, src->type, src->n_dims, src->ne, src->data);
+
+ result->nb[0] = src->nb[0];
+ result->nb[1] = src->nb[1];
+ result->nb[2] = src->nb[2];
+ result->nb[3] = src->nb[3];
+
+ return result;
  }
 
  ////////////////////////////////////////////////////////////////////////////////
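Copying the nb[] strides matters when the source is not contiguous (for example a transposed tensor): before this change the view received freshly computed contiguous strides, which do not match a permuted or transposed source. A small check under that assumption (names invented for illustration):

/* A view of a transposed (non-contiguous) tensor now inherits the source
   strides instead of recomputing contiguous ones. */
struct ggml_tensor * t  = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4, 3);
struct ggml_tensor * tt = ggml_transpose(ctx, t);     /* swaps ne/nb, shares data */
struct ggml_tensor * v  = ggml_view_tensor(ctx, tt);  /* nb[] copied from tt      */

assert(v->nb[0] == tt->nb[0] && v->nb[1] == tt->nb[1]);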
@@ -3894,7 +3932,7 @@ struct ggml_tensor * ggml_mean(
  is_node = true;
  }
 
- int ne[GGML_MAX_DIMS] = { 1, a->ne[1], a->ne[2], a->ne[3] };
+ int64_t ne[GGML_MAX_DIMS] = { 1, a->ne[1], a->ne[2], a->ne[3] };
  struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, a->n_dims, ne);
 
  result->op = GGML_OP_MEAN;
@@ -4255,7 +4293,7 @@ struct ggml_tensor * ggml_mul_mat(
  is_node = true;
  }
 
- const int ne[4] = { a->ne[1], b->ne[1], a->ne[2], b->ne[3] };
+ const int64_t ne[4] = { a->ne[1], b->ne[1], a->ne[2], b->ne[3] };
  struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, MIN(a->n_dims, b->n_dims), ne);
 
  result->op = GGML_OP_MUL_MAT;
@@ -4380,8 +4418,8 @@ struct ggml_tensor * ggml_reshape(
  struct ggml_tensor * ggml_reshape_2d(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
- int ne0,
- int ne1) {
+ int64_t ne0,
+ int64_t ne1) {
  GGML_ASSERT(ggml_is_contiguous(a));
  GGML_ASSERT(ggml_nelements(a) == ne0*ne1);
 
@@ -4392,7 +4430,7 @@ struct ggml_tensor * ggml_reshape_2d(
  is_node = true;
  }
 
- const int ne[2] = { ne0, ne1 };
+ const int64_t ne[2] = { ne0, ne1 };
  struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 2, ne, a->data);
 
  result->op = GGML_OP_RESHAPE;
@@ -4406,9 +4444,9 @@ struct ggml_tensor * ggml_reshape_2d(
  struct ggml_tensor * ggml_reshape_3d(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
- int ne0,
- int ne1,
- int ne2) {
+ int64_t ne0,
+ int64_t ne1,
+ int64_t ne2) {
  GGML_ASSERT(ggml_is_contiguous(a));
  GGML_ASSERT(ggml_nelements(a) == ne0*ne1*ne2);
 
@@ -4419,7 +4457,7 @@ struct ggml_tensor * ggml_reshape_3d(
  is_node = true;
  }
 
- const int ne[3] = { ne0, ne1, ne2 };
+ const int64_t ne[3] = { ne0, ne1, ne2 };
  struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 3, ne, a->data);
 
  result->op = GGML_OP_RESHAPE;
@@ -4435,7 +4473,7 @@ struct ggml_tensor * ggml_reshape_3d(
  struct ggml_tensor * ggml_view_1d(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
- int ne0,
+ int64_t ne0,
  size_t offset) {
  if (a->grad) {
  GGML_ASSERT(false); // gradient propagation is not supported
@@ -4456,15 +4494,15 @@ struct ggml_tensor * ggml_view_1d(
  struct ggml_tensor * ggml_view_2d(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
- int ne0,
- int ne1,
+ int64_t ne0,
+ int64_t ne1,
  size_t nb1,
  size_t offset) {
  if (a->grad) {
  GGML_ASSERT(false); // gradient propagation is not supported
  }
 
- const int ne[GGML_MAX_DIMS] = { ne0, ne1, 1, 1 };
+ const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, 1, 1 };
 
  struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 2, ne, (char *) a->data + offset);
 
@@ -4480,6 +4518,37 @@ struct ggml_tensor * ggml_view_2d(
  return result;
  }
 
+ // ggml_view_3d
+
+ struct ggml_tensor * ggml_view_3d(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ int64_t ne0,
+ int64_t ne1,
+ int64_t ne2,
+ size_t nb1,
+ size_t nb2,
+ size_t offset) {
+ if (a->grad) {
+ GGML_ASSERT(false); // gradient propagation is not supported
+ }
+
+ const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, ne2, 1 };
+
+ struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 3, ne, (char *) a->data + offset);
+
+ result->nb[1] = nb1;
+ result->nb[2] = nb2;
+ result->nb[3] = result->nb[2]*ne2;
+
+ result->op = GGML_OP_VIEW;
+ result->grad = NULL;
+ result->src0 = a;
+ result->src1 = NULL; // TODO: maybe store the offset here?
+
+ return result;
+ }
+
  // ggml_permute
 
  struct ggml_tensor * ggml_permute(
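ggml_view_3d is new in this release: it exposes a 3-D window into an existing tensor by taking explicit byte strides nb1/nb2 plus a byte offset, without copying data. A hedged sketch of the call shape (the names and the "skip r0 planes" scenario are invented for illustration):

/* View tensor a as its planes r0..ne[2]-1, keeping the original strides.
   Strides and the offset are in bytes. */
struct ggml_tensor * v = ggml_view_3d(ctx, a,
        a->ne[0], a->ne[1], a->ne[2] - r0,  /* ne0, ne1, ne2 of the view   */
        a->nb[1],                           /* nb1: row stride             */
        a->nb[2],                           /* nb2: plane stride           */
        r0*a->nb[2]);                       /* byte offset: skip r0 planes */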
@@ -4695,7 +4764,7 @@ struct ggml_tensor * ggml_conv_1d_1s(
  is_node = true;
  }
 
- const int ne[4] = { b->ne[0], a->ne[2], 1, 1, };
+ const int64_t ne[4] = { b->ne[0], a->ne[2], 1, 1, };
  struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne);
 
  result->op = GGML_OP_CONV_1D_1S;
@@ -4722,7 +4791,7 @@ struct ggml_tensor * ggml_conv_1d_2s(
  is_node = true;
  }
 
- const int ne[4] = { b->ne[0]/2, a->ne[2], 1, 1, };
+ const int64_t ne[4] = { b->ne[0]/2, a->ne[2], 1, 1, };
  struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne);
 
  result->op = GGML_OP_CONV_1D_2S;
@@ -4815,102 +4884,112 @@ static void ggml_compute_forward_dup_f16(
4815
4884
  const struct ggml_tensor * src0,
4816
4885
  struct ggml_tensor * dst) {
4817
4886
  GGML_ASSERT(params->ith == 0);
4818
- GGML_ASSERT(ggml_is_contiguous(dst));
4819
4887
  GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0));
4820
4888
 
4821
4889
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
4822
4890
  return;
4823
4891
  }
4824
4892
 
4825
- const int ne00 = src0->ne[0];
4826
- const int ne01 = src0->ne[1];
4827
- const int ne02 = src0->ne[2];
4828
- const int ne03 = src0->ne[3];
4893
+ const int64_t ne00 = src0->ne[0];
4894
+ const int64_t ne01 = src0->ne[1];
4895
+ const int64_t ne02 = src0->ne[2];
4896
+ const int64_t ne03 = src0->ne[3];
4829
4897
 
4830
4898
  const size_t nb00 = src0->nb[0];
4831
4899
  const size_t nb01 = src0->nb[1];
4832
4900
  const size_t nb02 = src0->nb[2];
4833
4901
  const size_t nb03 = src0->nb[3];
4834
4902
 
4835
- if (ggml_is_contiguous(src0) && src0->type == dst->type) {
4903
+ const size_t nb0 = dst->nb[0];
4904
+ const size_t nb1 = dst->nb[1];
4905
+ const size_t nb2 = dst->nb[2];
4906
+ const size_t nb3 = dst->nb[3];
4907
+
4908
+ if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst) && src0->type == dst->type) {
4836
4909
  memcpy(dst->data, src0->data, ggml_nelements(dst) * GGML_TYPE_SIZE[src0->type]);
4837
4910
  return;
4838
4911
  }
4839
4912
 
4840
- if (src0->nb[0] == sizeof(ggml_fp16_t)) {
4841
- if (dst->type == GGML_TYPE_F16) {
4842
- size_t id = 0;
4843
- const size_t rs = ne00*nb00;
4844
-
4845
- for (int i03 = 0; i03 < ne03; i03++) {
4846
- for (int i02 = 0; i02 < ne02; i02++) {
4847
- for (int i01 = 0; i01 < ne01; i01++) {
4848
- const char * src0_ptr = (char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03;
4849
- char * dst_ptr = (char *) dst->data + id*rs;
4850
-
4851
- memcpy(dst_ptr, src0_ptr, rs);
4852
-
4853
- id++;
4854
- }
4855
- }
4856
- }
4857
- } else if (dst->type == GGML_TYPE_F32) {
4858
- size_t id = 0;
4859
- float * dst_ptr = (float *) dst->data;
4860
-
4861
- for (int i03 = 0; i03 < ne03; i03++) {
4862
- for (int i02 = 0; i02 < ne02; i02++) {
4863
- for (int i01 = 0; i01 < ne01; i01++) {
4864
- for (int i00 = 0; i00 < ne00; i00++) {
4865
- const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
4866
-
4867
- dst_ptr[id] = GGML_FP16_TO_FP32(*src0_ptr);
4868
- id++;
4869
- }
4870
- }
4913
+ if (src0->type == dst->type &&
4914
+ src0->ne[0] == dst->ne[0] &&
4915
+ src0->nb[0] == GGML_TYPE_SIZE[src0->type] && dst->nb[0] == GGML_TYPE_SIZE[dst->type]) {
4916
+ // copy by rows
4917
+ const size_t rs = ne00*nb00;
4918
+ for (int64_t i03 = 0; i03 < ne03; i03++) {
4919
+ for (int64_t i02 = 0; i02 < ne02; i02++) {
4920
+ for (int64_t i01 = 0; i01 < ne01; i01++) {
4921
+ memcpy(
4922
+ ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3),
4923
+ ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03),
4924
+ rs);
4871
4925
  }
4872
4926
  }
4873
- } else {
4874
- GGML_ASSERT(false); // TODO: implement
4875
4927
  }
4876
- } else {
4877
- //printf("%s: this is not optimal - fix me\n", __func__);
4878
-
4879
- if (dst->type == GGML_TYPE_F32) {
4880
- size_t id = 0;
4881
- float * dst_ptr = (float *) dst->data;
4882
-
4883
- for (int i03 = 0; i03 < ne03; i03++) {
4884
- for (int i02 = 0; i02 < ne02; i02++) {
4885
- for (int i01 = 0; i01 < ne01; i01++) {
4886
- for (int i00 = 0; i00 < ne00; i00++) {
4887
- const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
4928
+ return;
4929
+ }
4888
4930
 
4889
- dst_ptr[id] = GGML_FP16_TO_FP32(*src0_ptr);
4890
- id++;
4931
+ // TODO: add more special-case implementations for tensor shapes/strides that can benefit from memcpy
4932
+
4933
+ // dst counters
4934
+ int64_t i10 = 0;
4935
+ int64_t i11 = 0;
4936
+ int64_t i12 = 0;
4937
+ int64_t i13 = 0;
4938
+
4939
+ if (dst->type == GGML_TYPE_F16) {
4940
+ for (int64_t i03 = 0; i03 < ne03; i03++) {
4941
+ for (int64_t i02 = 0; i02 < ne02; i02++) {
4942
+ for (int64_t i01 = 0; i01 < ne01; i01++) {
4943
+ for (int64_t i00 = 0; i00 < ne00; i00++) {
4944
+ const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
4945
+ char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3);
4946
+
4947
+ memcpy(dst_ptr, src0_ptr, sizeof(ggml_fp16_t));
4948
+
4949
+ if (++i10 == ne00) {
4950
+ i10 = 0;
4951
+ if (++i11 == ne01) {
4952
+ i11 = 0;
4953
+ if (++i12 == ne02) {
4954
+ i12 = 0;
4955
+ if (++i13 == ne03) {
4956
+ i13 = 0;
4957
+ }
4958
+ }
4959
+ }
4891
4960
  }
4892
4961
  }
4893
4962
  }
4894
4963
  }
4895
- } else if (dst->type == GGML_TYPE_F16) {
4896
- size_t id = 0;
4897
- ggml_fp16_t * dst_ptr = (ggml_fp16_t *) dst->data;
4898
-
4899
- for (int i03 = 0; i03 < ne03; i03++) {
4900
- for (int i02 = 0; i02 < ne02; i02++) {
4901
- for (int i01 = 0; i01 < ne01; i01++) {
4902
- for (int i00 = 0; i00 < ne00; i00++) {
4903
- const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
4904
-
4905
- dst_ptr[id] = *src0_ptr;
4906
- id++;
4964
+ }
4965
+ } else if (dst->type == GGML_TYPE_F32) {
4966
+ for (int64_t i03 = 0; i03 < ne03; i03++) {
4967
+ for (int64_t i02 = 0; i02 < ne02; i02++) {
4968
+ for (int64_t i01 = 0; i01 < ne01; i01++) {
4969
+ for (int64_t i00 = 0; i00 < ne00; i00++) {
4970
+ const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
4971
+ char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3);
4972
+
4973
+ *(float *) dst_ptr = GGML_FP16_TO_FP32(*(const ggml_fp16_t *) src0_ptr);
4974
+
4975
+ if (++i10 == ne00) {
4976
+ i10 = 0;
4977
+ if (++i11 == ne01) {
4978
+ i11 = 0;
4979
+ if (++i12 == ne02) {
4980
+ i12 = 0;
4981
+ if (++i13 == ne03) {
4982
+ i13 = 0;
4983
+ }
4984
+ }
4985
+ }
4907
4986
  }
4908
4987
  }
4909
4988
  }
4910
4989
  }
4911
- } else {
4912
- GGML_ASSERT(false); // TODO: implement
4913
4990
  }
4991
+ } else {
4992
+ GGML_ASSERT(false); // TODO: implement
4914
4993
  }
4915
4994
  }
4916
4995
 
@@ -4919,102 +4998,92 @@ static void ggml_compute_forward_dup_f32(
4919
4998
  const struct ggml_tensor * src0,
4920
4999
  struct ggml_tensor * dst) {
4921
5000
  GGML_ASSERT(params->ith == 0);
4922
- GGML_ASSERT(ggml_is_contiguous(dst));
4923
5001
  GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0));
4924
5002
 
4925
5003
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
4926
5004
  return;
4927
5005
  }
4928
5006
 
4929
- const int ne00 = src0->ne[0];
4930
- const int ne01 = src0->ne[1];
4931
- const int ne02 = src0->ne[2];
4932
- const int ne03 = src0->ne[3];
5007
+ const int64_t ne00 = src0->ne[0];
5008
+ const int64_t ne01 = src0->ne[1];
5009
+ const int64_t ne02 = src0->ne[2];
5010
+ const int64_t ne03 = src0->ne[3];
4933
5011
 
4934
5012
  const size_t nb00 = src0->nb[0];
4935
5013
  const size_t nb01 = src0->nb[1];
4936
5014
  const size_t nb02 = src0->nb[2];
4937
5015
  const size_t nb03 = src0->nb[3];
4938
5016
 
4939
- if (ggml_is_contiguous(src0) && src0->type == dst->type) {
5017
+ const size_t nb0 = dst->nb[0];
5018
+ const size_t nb1 = dst->nb[1];
5019
+ const size_t nb2 = dst->nb[2];
5020
+ const size_t nb3 = dst->nb[3];
5021
+
5022
+ if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst) && src0->type == dst->type) {
4940
5023
  memcpy(dst->data, src0->data, ggml_nelements(dst) * GGML_TYPE_SIZE[src0->type]);
4941
5024
  return;
4942
5025
  }
4943
5026
 
4944
- if (src0->nb[0] == sizeof(float)) {
4945
- if (dst->type == GGML_TYPE_F32) {
4946
- size_t id = 0;
4947
- const size_t rs = ne00*nb00;
4948
-
4949
- for (int i03 = 0; i03 < ne03; i03++) {
4950
- for (int i02 = 0; i02 < ne02; i02++) {
4951
- for (int i01 = 0; i01 < ne01; i01++) {
4952
- const char * src0_ptr = (char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03;
4953
- char * dst_ptr = (char *) dst->data + id*rs;
4954
-
4955
- memcpy(dst_ptr, src0_ptr, rs);
4956
-
4957
- id++;
4958
- }
4959
- }
4960
- }
4961
- } else if (dst->type == GGML_TYPE_F16) {
4962
- size_t id = 0;
4963
- ggml_fp16_t * dst_ptr = (ggml_fp16_t *) dst->data;
4964
-
4965
- for (int i03 = 0; i03 < ne03; i03++) {
4966
- for (int i02 = 0; i02 < ne02; i02++) {
4967
- for (int i01 = 0; i01 < ne01; i01++) {
4968
- for (int i00 = 0; i00 < ne00; i00++) {
4969
- const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
4970
-
4971
- dst_ptr[id] = GGML_FP32_TO_FP16(*src0_ptr);
4972
- id++;
5027
+ // dst counters
5028
+ int64_t i10 = 0;
5029
+ int64_t i11 = 0;
5030
+ int64_t i12 = 0;
5031
+ int64_t i13 = 0;
5032
+
5033
+ if (dst->type == GGML_TYPE_F32) {
5034
+ for (int64_t i03 = 0; i03 < ne03; i03++) {
5035
+ for (int64_t i02 = 0; i02 < ne02; i02++) {
5036
+ for (int64_t i01 = 0; i01 < ne01; i01++) {
5037
+ for (int64_t i00 = 0; i00 < ne00; i00++) {
5038
+ const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
5039
+ char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3);
5040
+
5041
+ memcpy(dst_ptr, src0_ptr, sizeof(float));
5042
+
5043
+ if (++i10 == dst->ne[0]) {
5044
+ i10 = 0;
5045
+ if (++i11 == dst->ne[1]) {
5046
+ i11 = 0;
5047
+ if (++i12 == dst->ne[2]) {
5048
+ i12 = 0;
5049
+ if (++i13 == dst->ne[3]) {
5050
+ i13 = 0;
5051
+ }
5052
+ }
5053
+ }
4973
5054
  }
4974
5055
  }
4975
5056
  }
4976
5057
  }
4977
- } else {
4978
- GGML_ASSERT(false); // TODO: implement
4979
5058
  }
4980
- } else {
4981
- //printf("%s: this is not optimal - fix me\n", __func__);
4982
-
4983
- if (dst->type == GGML_TYPE_F32) {
4984
- size_t id = 0;
4985
- float * dst_ptr = (float *) dst->data;
4986
-
4987
- for (int i03 = 0; i03 < ne03; i03++) {
4988
- for (int i02 = 0; i02 < ne02; i02++) {
4989
- for (int i01 = 0; i01 < ne01; i01++) {
4990
- for (int i00 = 0; i00 < ne00; i00++) {
4991
- const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
4992
-
4993
- dst_ptr[id] = *src0_ptr;
4994
- id++;
4995
- }
4996
- }
4997
- }
4998
- }
4999
- } else if (dst->type == GGML_TYPE_F16) {
5000
- size_t id = 0;
5001
- ggml_fp16_t * dst_ptr = (ggml_fp16_t *) dst->data;
5002
-
5003
- for (int i03 = 0; i03 < ne03; i03++) {
5004
- for (int i02 = 0; i02 < ne02; i02++) {
5005
- for (int i01 = 0; i01 < ne01; i01++) {
5006
- for (int i00 = 0; i00 < ne00; i00++) {
5007
- const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
5008
-
5009
- dst_ptr[id] = GGML_FP32_TO_FP16(*src0_ptr);
5010
- id++;
5059
+ } else if (dst->type == GGML_TYPE_F16) {
5060
+ for (int64_t i03 = 0; i03 < ne03; i03++) {
5061
+ for (int64_t i02 = 0; i02 < ne02; i02++) {
5062
+ for (int64_t i01 = 0; i01 < ne01; i01++) {
5063
+ for (int64_t i00 = 0; i00 < ne00; i00++) {
5064
+ const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
5065
+ char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3);
5066
+
5067
+ *(ggml_fp16_t *) dst_ptr = GGML_FP32_TO_FP16(*(const float *) src0_ptr);
5068
+
5069
+ if (++i10 == dst->ne[0]) {
5070
+ i10 = 0;
5071
+ if (++i11 == dst->ne[1]) {
5072
+ i11 = 0;
5073
+ if (++i12 == dst->ne[2]) {
5074
+ i12 = 0;
5075
+ if (++i13 == dst->ne[3]) {
5076
+ i13 = 0;
5077
+ }
5078
+ }
5079
+ }
5011
5080
  }
5012
5081
  }
5013
5082
  }
5014
5083
  }
5015
- } else {
5016
- GGML_ASSERT(false); // TODO: implement
5017
5084
  }
5085
+ } else {
5086
+ GGML_ASSERT(false); // TODO: implement
5018
5087
  }
5019
5088
  }
5020
5089
 
@@ -5389,18 +5458,18 @@ static void ggml_compute_forward_sum_f32(
5389
5458
  assert(ggml_is_scalar(dst));
5390
5459
  assert(src0->nb[0] == sizeof(float));
5391
5460
 
5392
- const int ne00 = src0->ne[0];
5393
- const int ne01 = src0->ne[1];
5394
- const int ne02 = src0->ne[2];
5395
- const int ne03 = src0->ne[3];
5461
+ const int64_t ne00 = src0->ne[0];
5462
+ const int64_t ne01 = src0->ne[1];
5463
+ const int64_t ne02 = src0->ne[2];
5464
+ const int64_t ne03 = src0->ne[3];
5396
5465
 
5397
5466
  const size_t nb01 = src0->nb[1];
5398
5467
  const size_t nb02 = src0->nb[2];
5399
5468
  const size_t nb03 = src0->nb[3];
5400
5469
 
5401
- for (int i03 = 0; i03 < ne03; i03++) {
5402
- for (int i02 = 0; i02 < ne02; i02++) {
5403
- for (int i01 = 0; i01 < ne01; i01++) {
5470
+ for (int64_t i03 = 0; i03 < ne03; i03++) {
5471
+ for (int64_t i02 = 0; i02 < ne02; i02++) {
5472
+ for (int64_t i01 = 0; i01 < ne01; i01++) {
5404
5473
  ggml_vec_sum_f32(ne00,
5405
5474
  (float *) (dst->data),
5406
5475
  (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03));
@@ -5445,19 +5514,19 @@ static void ggml_compute_forward_mean_f32(
5445
5514
 
5446
5515
  assert(src0->nb[0] == sizeof(float));
5447
5516
 
5448
- const int ne00 = src0->ne[0];
5449
- const int ne01 = src0->ne[1];
5450
- const int ne02 = src0->ne[2];
5451
- const int ne03 = src0->ne[3];
5517
+ const int64_t ne00 = src0->ne[0];
5518
+ const int64_t ne01 = src0->ne[1];
5519
+ const int64_t ne02 = src0->ne[2];
5520
+ const int64_t ne03 = src0->ne[3];
5452
5521
 
5453
5522
  const size_t nb01 = src0->nb[1];
5454
5523
  const size_t nb02 = src0->nb[2];
5455
5524
  const size_t nb03 = src0->nb[3];
5456
5525
 
5457
- const int ne0 = dst->ne[0];
5458
- const int ne1 = dst->ne[1];
5459
- const int ne2 = dst->ne[2];
5460
- const int ne3 = dst->ne[3];
5526
+ const int64_t ne0 = dst->ne[0];
5527
+ const int64_t ne1 = dst->ne[1];
5528
+ const int64_t ne2 = dst->ne[2];
5529
+ const int64_t ne3 = dst->ne[3];
5461
5530
 
5462
5531
  assert(ne0 == 1);
5463
5532
  assert(ne1 == ne01);
@@ -5473,9 +5542,9 @@ static void ggml_compute_forward_mean_f32(
5473
5542
  const size_t nb2 = dst->nb[2];
5474
5543
  const size_t nb3 = dst->nb[3];
5475
5544
 
5476
- for (int i03 = 0; i03 < ne03; i03++) {
5477
- for (int i02 = 0; i02 < ne02; i02++) {
5478
- for (int i01 = 0; i01 < ne01; i01++) {
5545
+ for (int64_t i03 = 0; i03 < ne03; i03++) {
5546
+ for (int64_t i02 = 0; i02 < ne02; i02++) {
5547
+ for (int64_t i01 = 0; i01 < ne01; i01++) {
5479
5548
  ggml_vec_sum_f32(ne00,
5480
5549
  (float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3),
5481
5550
  (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03));
@@ -5962,10 +6031,10 @@ static void ggml_compute_forward_norm_f32(
5962
6031
  const int ith = params->ith;
5963
6032
  const int nth = params->nth;
5964
6033
 
5965
- const int ne00 = src0->ne[0];
5966
- const int ne01 = src0->ne[1];
5967
- const int ne02 = src0->ne[2];
5968
- const int ne03 = src0->ne[3];
6034
+ const int64_t ne00 = src0->ne[0];
6035
+ const int64_t ne01 = src0->ne[1];
6036
+ const int64_t ne02 = src0->ne[2];
6037
+ const int64_t ne03 = src0->ne[3];
5969
6038
 
5970
6039
  const size_t nb01 = src0->nb[1];
5971
6040
  const size_t nb02 = src0->nb[2];
@@ -5978,13 +6047,13 @@ static void ggml_compute_forward_norm_f32(
5978
6047
  const float eps = 1e-5f; // TODO: make this a parameter
5979
6048
 
5980
6049
  // TODO: optimize
5981
- for (int i03 = 0; i03 < ne03; i03++) {
5982
- for (int i02 = 0; i02 < ne02; i02++) {
5983
- for (int i01 = ith; i01 < ne01; i01 += nth) {
6050
+ for (int64_t i03 = 0; i03 < ne03; i03++) {
6051
+ for (int64_t i02 = 0; i02 < ne02; i02++) {
6052
+ for (int64_t i01 = ith; i01 < ne01; i01 += nth) {
5984
6053
  const float * x = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03);
5985
6054
 
5986
6055
  ggml_float sum = 0.0;
5987
- for (int i00 = 0; i00 < ne00; i00++) {
6056
+ for (int64_t i00 = 0; i00 < ne00; i00++) {
5988
6057
  sum += (ggml_float)x[i00];
5989
6058
  }
5990
6059
 
@@ -5993,7 +6062,7 @@ static void ggml_compute_forward_norm_f32(
5993
6062
  float * y = (float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3);
5994
6063
 
5995
6064
  ggml_float sum2 = 0.0;
5996
- for (int i00 = 0; i00 < ne00; i00++) {
6065
+ for (int64_t i00 = 0; i00 < ne00; i00++) {
5997
6066
  float v = x[i00] - mean;
5998
6067
  y[i00] = v;
5999
6068
  sum2 += (ggml_float)(v*v);
@@ -6045,10 +6114,10 @@ static void ggml_compute_forward_rms_norm_f32(
6045
6114
  const int ith = params->ith;
6046
6115
  const int nth = params->nth;
6047
6116
 
6048
- const int ne00 = src0->ne[0];
6049
- const int ne01 = src0->ne[1];
6050
- const int ne02 = src0->ne[2];
6051
- const int ne03 = src0->ne[3];
6117
+ const int64_t ne00 = src0->ne[0];
6118
+ const int64_t ne01 = src0->ne[1];
6119
+ const int64_t ne02 = src0->ne[2];
6120
+ const int64_t ne03 = src0->ne[3];
6052
6121
 
6053
6122
  const size_t nb01 = src0->nb[1];
6054
6123
  const size_t nb02 = src0->nb[2];
@@ -6061,13 +6130,13 @@ static void ggml_compute_forward_rms_norm_f32(
6061
6130
  const float eps = 1e-6f; // TODO: make this a parameter
6062
6131
 
6063
6132
  // TODO: optimize
6064
- for (int i03 = 0; i03 < ne03; i03++) {
6065
- for (int i02 = 0; i02 < ne02; i02++) {
6066
- for (int i01 = ith; i01 < ne01; i01 += nth) {
6133
+ for (int64_t i03 = 0; i03 < ne03; i03++) {
6134
+ for (int64_t i02 = 0; i02 < ne02; i02++) {
6135
+ for (int64_t i01 = ith; i01 < ne01; i01 += nth) {
6067
6136
  const float * x = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03);
6068
6137
 
6069
6138
  ggml_float sum = 0.0;
6070
- for (int i00 = 0; i00 < ne00; i00++) {
6139
+ for (int64_t i00 = 0; i00 < ne00; i00++) {
6071
6140
  sum += (ggml_float)(x[i00] * x[i00]);
6072
6141
  }
6073
6142
 
@@ -6120,13 +6189,13 @@ static bool ggml_compute_forward_mul_mat_use_blas(
6120
6189
  const struct ggml_tensor * src0,
6121
6190
  const struct ggml_tensor * src1,
6122
6191
  struct ggml_tensor * dst) {
6123
- //const int ne00 = src0->ne[0];
6124
- //const int ne01 = src0->ne[1];
6192
+ //const int64_t ne00 = src0->ne[0];
6193
+ //const int64_t ne01 = src0->ne[1];
6125
6194
 
6126
- const int ne10 = src1->ne[0];
6195
+ const int64_t ne10 = src1->ne[0];
6127
6196
 
6128
- const int ne0 = dst->ne[0];
6129
- const int ne1 = dst->ne[1];
6197
+ const int64_t ne0 = dst->ne[0];
6198
+ const int64_t ne1 = dst->ne[1];
6130
6199
 
6131
6200
  // TODO: find the optimal values for these
6132
6201
  if (ggml_is_contiguous(src0) &&
@@ -6148,23 +6217,23 @@ static void ggml_compute_forward_mul_mat_f32(
6148
6217
  int64_t t0 = ggml_perf_time_us();
6149
6218
  UNUSED(t0);
6150
6219
 
6151
- const int ne00 = src0->ne[0];
6152
- const int ne01 = src0->ne[1];
6153
- const int ne02 = src0->ne[2];
6154
- const int ne03 = src0->ne[3];
6220
+ const int64_t ne00 = src0->ne[0];
6221
+ const int64_t ne01 = src0->ne[1];
6222
+ const int64_t ne02 = src0->ne[2];
6223
+ const int64_t ne03 = src0->ne[3];
6155
6224
 
6156
6225
  #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
6157
- const int ne10 = src1->ne[0];
6226
+ const int64_t ne10 = src1->ne[0];
6158
6227
  #endif
6159
- const int ne11 = src1->ne[1];
6228
+ const int64_t ne11 = src1->ne[1];
6160
6229
  #ifndef NDEBUG
6161
- const int ne12 = src1->ne[2];
6162
- const int ne13 = src1->ne[3];
6230
+ const int64_t ne12 = src1->ne[2];
6231
+ const int64_t ne13 = src1->ne[3];
6163
6232
 
6164
- const int ne0 = dst->ne[0];
6165
- const int ne1 = dst->ne[1];
6166
- const int ne2 = dst->ne[2];
6167
- const int ne3 = dst->ne[3];
6233
+ const int64_t ne0 = dst->ne[0];
6234
+ const int64_t ne1 = dst->ne[1];
6235
+ const int64_t ne2 = dst->ne[2];
6236
+ const int64_t ne3 = dst->ne[3];
6168
6237
 
6169
6238
  const int nb00 = src0->nb[0];
6170
6239
  #endif
@@ -6224,8 +6293,8 @@ static void ggml_compute_forward_mul_mat_f32(
6224
6293
  return;
6225
6294
  }
6226
6295
 
6227
- for (int i03 = 0; i03 < ne03; i03++) {
6228
- for (int i02 = 0; i02 < ne02; i02++) {
6296
+ for (int64_t i03 = 0; i03 < ne03; i03++) {
6297
+ for (int64_t i02 = 0; i02 < ne02; i02++) {
6229
6298
  const float * x = (float *) ((char *) src0->data + i02*nb02 + i03*nb03);
6230
6299
  const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13);
6231
6300
 
@@ -6272,7 +6341,7 @@ static void ggml_compute_forward_mul_mat_f32(
6272
6341
  const int i02 = (ir - i03*ne02*ne01)/ne01;
6273
6342
  const int i01 = (ir - i03*ne02*ne01 - i02*ne01);
6274
6343
 
6275
- for (int ic = 0; ic < ne11; ++ic) {
6344
+ for (int64_t ic = 0; ic < ne11; ++ic) {
6276
6345
  // src1 indices
6277
6346
  const int i13 = i03;
6278
6347
  const int i12 = i02;
@@ -6313,21 +6382,21 @@ static void ggml_compute_forward_mul_mat_f16_f32(
6313
6382
  int64_t t0 = ggml_perf_time_us();
6314
6383
  UNUSED(t0);
6315
6384
 
6316
- const int ne00 = src0->ne[0];
6317
- const int ne01 = src0->ne[1];
6318
- const int ne02 = src0->ne[2];
6319
- const int ne03 = src0->ne[3];
6385
+ const int64_t ne00 = src0->ne[0];
6386
+ const int64_t ne01 = src0->ne[1];
6387
+ const int64_t ne02 = src0->ne[2];
6388
+ const int64_t ne03 = src0->ne[3];
6320
6389
 
6321
- const int ne10 = src1->ne[0];
6322
- const int ne11 = src1->ne[1];
6323
- const int ne12 = src1->ne[2];
6324
- const int ne13 = src1->ne[3];
6390
+ const int64_t ne10 = src1->ne[0];
6391
+ const int64_t ne11 = src1->ne[1];
6392
+ const int64_t ne12 = src1->ne[2];
6393
+ const int64_t ne13 = src1->ne[3];
6325
6394
 
6326
- const int ne0 = dst->ne[0];
6327
- const int ne1 = dst->ne[1];
6328
- const int ne2 = dst->ne[2];
6329
- const int ne3 = dst->ne[3];
6330
- //const int ne = ne0*ne1*ne2*ne3;
6395
+ const int64_t ne0 = dst->ne[0];
6396
+ const int64_t ne1 = dst->ne[1];
6397
+ const int64_t ne2 = dst->ne[2];
6398
+ const int64_t ne3 = dst->ne[3];
6399
+ //const int64_t ne = ne0*ne1*ne2*ne3;
6331
6400
 
6332
6401
  const int nb00 = src0->nb[0];
6333
6402
  const int nb01 = src0->nb[1];
@@ -6387,12 +6456,12 @@ static void ggml_compute_forward_mul_mat_f16_f32(
6387
6456
 
6388
6457
  float * const wdata = params->wdata;
6389
6458
 
6390
- for (int i03 = 0; i03 < ne03; i03++) {
6391
- for (int i02 = 0; i02 < ne02; i02++) {
6459
+ for (int64_t i03 = 0; i03 < ne03; i03++) {
6460
+ for (int64_t i02 = 0; i02 < ne02; i02++) {
6392
6461
  {
6393
6462
  size_t id = 0;
6394
- for (int i01 = 0; i01 < ne01; ++i01) {
6395
- for (int i00 = 0; i00 < ne00; ++i00) {
6463
+ for (int64_t i01 = 0; i01 < ne01; ++i01) {
6464
+ for (int64_t i00 = 0; i00 < ne00; ++i00) {
6396
6465
  wdata[id++] = GGML_FP16_TO_FP32(*(ggml_fp16_t *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00));
6397
6466
  }
6398
6467
  }
@@ -6422,10 +6491,10 @@ static void ggml_compute_forward_mul_mat_f16_f32(
6422
6491
  ggml_fp16_t * const wdata = params->wdata;
6423
6492
 
6424
6493
  size_t id = 0;
6425
- for (int i13 = 0; i13 < ne13; ++i13) {
6426
- for (int i12 = 0; i12 < ne12; ++i12) {
6427
- for (int i11 = 0; i11 < ne11; ++i11) {
6428
- for (int i10 = 0; i10 < ne10; ++i10) {
6494
+ for (int64_t i13 = 0; i13 < ne13; ++i13) {
6495
+ for (int64_t i12 = 0; i12 < ne12; ++i12) {
6496
+ for (int64_t i11 = 0; i11 < ne11; ++i11) {
6497
+ for (int64_t i10 = 0; i10 < ne10; ++i10) {
6429
6498
  wdata[id++] = GGML_FP32_TO_FP16(*(float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i10*nb10));
6430
6499
  }
6431
6500
  }
@@ -6477,7 +6546,7 @@ static void ggml_compute_forward_mul_mat_f16_f32(
6477
6546
 
6478
6547
  float * dst_col = (float *) ((char *) dst->data + (i0*nb0 + 0*nb1 + i2*nb2 + i3*nb3));
6479
6548
 
6480
- for (int ic = 0; ic < ne11; ++ic) {
6549
+ for (int64_t ic = 0; ic < ne11; ++ic) {
6481
6550
  ggml_vec_dot_f16(ne00, &dst_col[ic*ne0], src0_row, src1_col + ic*ne00);
6482
6551
  }
6483
6552
  }
@@ -6526,20 +6595,20 @@ static void ggml_compute_forward_mul_mat_q_f32(
6526
6595
  int64_t t0 = ggml_perf_time_us();
6527
6596
  UNUSED(t0);
6528
6597
 
6529
- const int ne00 = src0->ne[0];
6530
- const int ne01 = src0->ne[1];
6531
- const int ne02 = src0->ne[2];
6532
- const int ne03 = src0->ne[3];
6598
+ const int64_t ne00 = src0->ne[0];
6599
+ const int64_t ne01 = src0->ne[1];
6600
+ const int64_t ne02 = src0->ne[2];
6601
+ const int64_t ne03 = src0->ne[3];
6533
6602
 
6534
- const int ne10 = src1->ne[0];
6535
- const int ne11 = src1->ne[1];
6536
- const int ne12 = src1->ne[2];
6537
- const int ne13 = src1->ne[3];
6603
+ const int64_t ne10 = src1->ne[0];
6604
+ const int64_t ne11 = src1->ne[1];
6605
+ const int64_t ne12 = src1->ne[2];
6606
+ const int64_t ne13 = src1->ne[3];
6538
6607
 
6539
- const int ne0 = dst->ne[0];
6540
- const int ne1 = dst->ne[1];
6541
- const int ne2 = dst->ne[2];
6542
- const int ne3 = dst->ne[3];
6608
+ const int64_t ne0 = dst->ne[0];
6609
+ const int64_t ne1 = dst->ne[1];
6610
+ const int64_t ne2 = dst->ne[2];
6611
+ const int64_t ne3 = dst->ne[3];
6543
6612
 
6544
6613
  const int nb00 = src0->nb[0];
6545
6614
  const int nb01 = src0->nb[1];
@@ -6603,11 +6672,11 @@ static void ggml_compute_forward_mul_mat_q_f32(
6603
6672
  float * const wdata = params->wdata;
6604
6673
  dequantize_row_q_t const dequantize_row_q = quantize_fns[type].dequantize_row_q;
6605
6674
 
6606
- for (int i03 = 0; i03 < ne03; i03++) {
6607
- for (int i02 = 0; i02 < ne02; i02++) {
6675
+ for (int64_t i03 = 0; i03 < ne03; i03++) {
6676
+ for (int64_t i02 = 0; i02 < ne02; i02++) {
6608
6677
  {
6609
6678
  size_t id = 0;
6610
- for (int i01 = 0; i01 < ne01; ++i01) {
6679
+ for (int64_t i01 = 0; i01 < ne01; ++i01) {
6611
6680
  dequantize_row_q((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01, wdata + id, ne00);
6612
6681
  id += ne00;
6613
6682
  }
@@ -6637,9 +6706,9 @@ static void ggml_compute_forward_mul_mat_q_f32(
6637
6706
  char * wdata = params->wdata;
6638
6707
  const size_t row_size = ne10*GGML_TYPE_SIZE[type]/GGML_BLCK_SIZE[type];
6639
6708
 
6640
- for (int i13 = 0; i13 < ne13; ++i13) {
6641
- for (int i12 = 0; i12 < ne12; ++i12) {
6642
- for (int i11 = 0; i11 < ne11; ++i11) {
6709
+ for (int64_t i13 = 0; i13 < ne13; ++i13) {
6710
+ for (int64_t i12 = 0; i12 < ne12; ++i12) {
6711
+ for (int64_t i11 = 0; i11 < ne11; ++i11) {
6643
6712
  quantize_row_q((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11), (void *) wdata, ne10);
6644
6713
  wdata += row_size;
6645
6714
  }
@@ -6688,7 +6757,7 @@ static void ggml_compute_forward_mul_mat_q_f32(
6688
6757
 
6689
6758
  assert(ne00 % 32 == 0);
6690
6759
 
6691
- for (int ic = 0; ic < ne11; ++ic) {
6760
+ for (int64_t ic = 0; ic < ne11; ++ic) {
6692
6761
  vec_dot_q(ne00, &dst_col[ic*ne0], src0_row, (void *) (src1_col + ic*row_size));
6693
6762
  }
6694
6763
  }
@@ -7169,7 +7238,6 @@ static void ggml_compute_forward_rope_f32(
  const struct ggml_tensor * src0,
  const struct ggml_tensor * src1,
  struct ggml_tensor * dst) {
- assert(params->ith == 0);
  assert(src1->type == GGML_TYPE_I32);
  assert(ggml_nelements(src1) == 3);
 
@@ -7181,10 +7249,10 @@ static void ggml_compute_forward_rope_f32(
  const int n_dims = ((int32_t *) src1->data)[1];
  const int mode = ((int32_t *) src1->data)[2];
 
- //const int ne0 = src0->ne[0];
- const int ne1 = src0->ne[1];
- const int ne2 = src0->ne[2];
- const int ne3 = src0->ne[3];
+ //const int64_t ne0 = src0->ne[0];
+ const int64_t ne1 = src0->ne[1];
+ const int64_t ne2 = src0->ne[2];
+ const int64_t ne3 = src0->ne[3];
 
  const int nb0 = src0->nb[0];
  const int nb1 = src0->nb[1];
@@ -7196,11 +7264,28 @@
 
  assert(nb0 == sizeof(float));
 
- // TODO: optimize
- for (int i3 = 0; i3 < ne3; i3++) {
- for (int i2 = (mode == 0 ? 0 : n_past); i2 < ne2; i2++) {
+ const int ith = params->ith;
+ const int nth = params->nth;
+
+ const int nr = ggml_nrows(src0);
+
+ // rows per thread
+ const int dr = (nr + nth - 1)/nth;
+
+ // row range for this thread
+ const int ir0 = dr*ith;
+ const int ir1 = MIN(ir0 + dr, nr);
+
+ // row index used to determine which thread to use
+ int ir = 0;
+
+ for (int64_t i3 = 0; i3 < ne3; i3++) {
+ for (int64_t i2 = (mode == 0 ? 0 : n_past); i2 < ne2; i2++) {
  const int p = (mode == 0 ? n_past + i2 : i2);
- for (int i1 = 0; i1 < ne1; i1++) {
+ for (int64_t i1 = 0; i1 < ne1; i1++) {
+ if (ir++ < ir0) continue;
+ if (ir > ir1) break;
+
  for (int i0 = 0; i0 < n_dims; i0 += 2) {
  const float theta = powf(10000.0, ((float)-i0)/n_dims);
 
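Dropping assert(params->ith == 0) is what lets RoPE run on more than one thread; the new dr/ir0/ir1 arithmetic hands each of the nth workers a contiguous range of rows, and the ir counter skips rows outside that range. The partitioning can be checked in isolation (a standalone sketch, not code from the diff):

#include <assert.h>

#define MIN(a, b) ((a) < (b) ? (a) : (b))

/* Split nr rows over nth threads the way the new RoPE code does:
   thread ith handles the half-open range [ir0, ir1). */
static void rope_row_range(int nr, int nth, int ith, int * ir0, int * ir1) {
    const int dr = (nr + nth - 1)/nth;  /* rows per thread, rounded up */
    *ir0 = dr*ith;
    *ir1 = MIN(*ir0 + dr, nr);
}

/* Example: 10 rows over 4 threads -> [0,3), [3,6), [6,9), [9,10). */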
@@ -7226,7 +7311,6 @@ static void ggml_compute_forward_rope_f16(
7226
7311
  const struct ggml_tensor * src0,
7227
7312
  const struct ggml_tensor * src1,
7228
7313
  struct ggml_tensor * dst) {
7229
- assert(params->ith == 0);
7230
7314
  assert(src1->type == GGML_TYPE_I32);
7231
7315
  assert(ggml_nelements(src1) == 3);
7232
7316
 
@@ -7238,10 +7322,10 @@ static void ggml_compute_forward_rope_f16(
7238
7322
  const int n_dims = ((int32_t *) src1->data)[1];
7239
7323
  const int mode = ((int32_t *) src1->data)[2];
7240
7324
 
7241
- //const int ne0 = src0->ne[0];
7242
- const int ne1 = src0->ne[1];
7243
- const int ne2 = src0->ne[2];
7244
- const int ne3 = src0->ne[3];
7325
+ //const int64_t ne0 = src0->ne[0];
7326
+ const int64_t ne1 = src0->ne[1];
7327
+ const int64_t ne2 = src0->ne[2];
7328
+ const int64_t ne3 = src0->ne[3];
7245
7329
 
7246
7330
  const int nb0 = src0->nb[0];
7247
7331
  const int nb1 = src0->nb[1];
@@ -7253,10 +7337,28 @@ static void ggml_compute_forward_rope_f16(
7253
7337
 
7254
7338
  assert(nb0 == sizeof(ggml_fp16_t));
7255
7339
 
7256
- for (int i3 = 0; i3 < ne3; i3++) {
7257
- for (int i2 = (mode == 0 ? 0 : n_past); i2 < ne2; i2++) {
7340
+ const int ith = params->ith;
7341
+ const int nth = params->nth;
7342
+
7343
+ const int nr = ggml_nrows(src0);
7344
+
7345
+ // rows per thread
7346
+ const int dr = (nr + nth - 1)/nth;
7347
+
7348
+ // row range for this thread
7349
+ const int ir0 = dr*ith;
7350
+ const int ir1 = MIN(ir0 + dr, nr);
7351
+
7352
+ // row index used to determine which thread to use
7353
+ int ir = 0;
7354
+
7355
+ for (int64_t i3 = 0; i3 < ne3; i3++) {
7356
+ for (int64_t i2 = (mode == 0 ? 0 : n_past); i2 < ne2; i2++) {
7258
7357
  const int p = (mode == 0 ? n_past + i2 : i2);
7259
- for (int i1 = 0; i1 < ne1; i1++) {
7358
+ for (int64_t i1 = 0; i1 < ne1; i1++) {
7359
+ if (ir++ < ir0) continue;
7360
+ if (ir > ir1) break;
7361
+
7260
7362
  for (int i0 = 0; i0 < n_dims; i0 += 2) {
7261
7363
  const float theta = powf(10000.0, ((float)-i0)/n_dims);
7262
7364
 
@@ -7317,21 +7419,21 @@ static void ggml_compute_forward_conv_1d_1s_f16_f32(
7317
7419
  int64_t t0 = ggml_perf_time_us();
7318
7420
  UNUSED(t0);
7319
7421
 
7320
- const int ne00 = src0->ne[0];
7321
- const int ne01 = src0->ne[1];
7322
- const int ne02 = src0->ne[2];
7323
- //const int ne03 = src0->ne[3];
7422
+ const int64_t ne00 = src0->ne[0];
7423
+ const int64_t ne01 = src0->ne[1];
7424
+ const int64_t ne02 = src0->ne[2];
7425
+ //const int64_t ne03 = src0->ne[3];
7324
7426
 
7325
- const int ne10 = src1->ne[0];
7326
- const int ne11 = src1->ne[1];
7327
- //const int ne12 = src1->ne[2];
7328
- //const int ne13 = src1->ne[3];
7427
+ const int64_t ne10 = src1->ne[0];
7428
+ const int64_t ne11 = src1->ne[1];
7429
+ //const int64_t ne12 = src1->ne[2];
7430
+ //const int64_t ne13 = src1->ne[3];
7329
7431
 
7330
- //const int ne0 = dst->ne[0];
7331
- //const int ne1 = dst->ne[1];
7332
- //const int ne2 = dst->ne[2];
7333
- //const int ne3 = dst->ne[3];
7334
- //const int ne = ne0*ne1*ne2*ne3;
7432
+ //const int64_t ne0 = dst->ne[0];
7433
+ //const int64_t ne1 = dst->ne[1];
7434
+ //const int64_t ne2 = dst->ne[2];
7435
+ //const int64_t ne3 = dst->ne[3];
7436
+ //const int64_t ne = ne0*ne1*ne2*ne3;
7335
7437
 
7336
7438
  const int nb00 = src0->nb[0];
7337
7439
  const int nb01 = src0->nb[1];
@@ -7368,11 +7470,11 @@ static void ggml_compute_forward_conv_1d_1s_f16_f32(
7368
7470
  {
7369
7471
  ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
7370
7472
 
7371
- for (int i02 = 0; i02 < ne02; i02++) {
7372
- for (int i01 = 0; i01 < ne01; i01++) {
7473
+ for (int64_t i02 = 0; i02 < ne02; i02++) {
7474
+ for (int64_t i01 = 0; i01 < ne01; i01++) {
7373
7475
  const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i02*nb02 + i01*nb01);
7374
7476
  ggml_fp16_t * dst_data = wdata + i02*ew0*ne00;
7375
- for (int i00 = 0; i00 < ne00; i00++) {
7477
+ for (int64_t i00 = 0; i00 < ne00; i00++) {
7376
7478
  dst_data[i00*ew0 + i01] = src[i00];
7377
7479
  }
7378
7480
  }
@@ -7383,10 +7485,10 @@ static void ggml_compute_forward_conv_1d_1s_f16_f32(
7383
7485
  {
7384
7486
  ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + ne02*ew0*ne00;
7385
7487
 
7386
- for (int i11 = 0; i11 < ne11; i11++) {
7488
+ for (int64_t i11 = 0; i11 < ne11; i11++) {
7387
7489
  const float * const src = (float *)((char *) src1->data + i11*nb11);
7388
7490
  ggml_fp16_t * dst_data = wdata;
7389
- for (int i10 = 0; i10 < ne10; i10++) {
7491
+ for (int64_t i10 = 0; i10 < ne10; i10++) {
7390
7492
  dst_data[(i10 + nh)*ew0 + i11] = GGML_FP32_TO_FP16(src[i10]);
7391
7493
  }
7392
7494
  }
@@ -7411,7 +7513,7 @@ static void ggml_compute_forward_conv_1d_1s_f16_f32(
7411
7513
 
7412
7514
  for (int i1 = ir0; i1 < ir1; i1++) {
7413
7515
  float * dst_data = (float *)((char *) dst->data + i1*nb1);
7414
- for (int i0 = 0; i0 < ne10; ++i0) {
7516
+ for (int64_t i0 = 0; i0 < ne10; ++i0) {
7415
7517
  dst_data[i0] = 0;
7416
7518
  for (int k = -nh; k <= nh; k++) {
7417
7519
  float v = 0.0f;
@@ -7437,21 +7539,21 @@ static void ggml_compute_forward_conv_1d_1s_f32(
7437
7539
  int64_t t0 = ggml_perf_time_us();
7438
7540
  UNUSED(t0);
7439
7541
 
7440
- const int ne00 = src0->ne[0];
7441
- const int ne01 = src0->ne[1];
7442
- const int ne02 = src0->ne[2];
7443
- //const int ne03 = src0->ne[3];
7542
+ const int64_t ne00 = src0->ne[0];
7543
+ const int64_t ne01 = src0->ne[1];
7544
+ const int64_t ne02 = src0->ne[2];
7545
+ //const int64_t ne03 = src0->ne[3];
7444
7546
 
7445
- const int ne10 = src1->ne[0];
7446
- const int ne11 = src1->ne[1];
7447
- //const int ne12 = src1->ne[2];
7448
- //const int ne13 = src1->ne[3];
7547
+ const int64_t ne10 = src1->ne[0];
7548
+ const int64_t ne11 = src1->ne[1];
7549
+ //const int64_t ne12 = src1->ne[2];
7550
+ //const int64_t ne13 = src1->ne[3];
7449
7551
 
7450
- //const int ne0 = dst->ne[0];
7451
- //const int ne1 = dst->ne[1];
7452
- //const int ne2 = dst->ne[2];
7453
- //const int ne3 = dst->ne[3];
7454
- //const int ne = ne0*ne1*ne2*ne3;
7552
+ //const int64_t ne0 = dst->ne[0];
7553
+ //const int64_t ne1 = dst->ne[1];
7554
+ //const int64_t ne2 = dst->ne[2];
7555
+ //const int64_t ne3 = dst->ne[3];
7556
+ //const int64_t ne = ne0*ne1*ne2*ne3;
7455
7557
 
7456
7558
  const int nb00 = src0->nb[0];
7457
7559
  const int nb01 = src0->nb[1];
@@ -7488,11 +7590,11 @@ static void ggml_compute_forward_conv_1d_1s_f32(
7488
7590
  {
7489
7591
  float * const wdata = (float *) params->wdata + 0;
7490
7592
 
7491
- for (int i02 = 0; i02 < ne02; i02++) {
7492
- for (int i01 = 0; i01 < ne01; i01++) {
7593
+ for (int64_t i02 = 0; i02 < ne02; i02++) {
7594
+ for (int64_t i01 = 0; i01 < ne01; i01++) {
7493
7595
  const float * const src = (float *)((char *) src0->data + i02*nb02 + i01*nb01);
7494
7596
  float * dst_data = wdata + i02*ew0*ne00;
7495
- for (int i00 = 0; i00 < ne00; i00++) {
7597
+ for (int64_t i00 = 0; i00 < ne00; i00++) {
7496
7598
  dst_data[i00*ew0 + i01] = src[i00];
7497
7599
  }
7498
7600
  }
@@ -7503,10 +7605,10 @@ static void ggml_compute_forward_conv_1d_1s_f32(
7503
7605
  {
7504
7606
  float * const wdata = (float *) params->wdata + ne02*ew0*ne00;
7505
7607
 
7506
- for (int i11 = 0; i11 < ne11; i11++) {
7608
+ for (int64_t i11 = 0; i11 < ne11; i11++) {
7507
7609
  const float * const src = (float *)((char *) src1->data + i11*nb11);
7508
7610
  float * dst_data = wdata;
7509
- for (int i10 = 0; i10 < ne10; i10++) {
7611
+ for (int64_t i10 = 0; i10 < ne10; i10++) {
7510
7612
  dst_data[(i10 + nh)*ew0 + i11] = src[i10];
7511
7613
  }
7512
7614
  }
@@ -7531,7 +7633,7 @@ static void ggml_compute_forward_conv_1d_1s_f32(
7531
7633
 
7532
7634
  for (int i1 = ir0; i1 < ir1; i1++) {
7533
7635
  float * dst_data = (float *)((char *) dst->data + i1*nb1);
7534
- for (int i0 = 0; i0 < ne10; ++i0) {
7636
+ for (int64_t i0 = 0; i0 < ne10; ++i0) {
7535
7637
  dst_data[i0] = 0;
7536
7638
  for (int k = -nh; k <= nh; k++) {
7537
7639
  float v = 0.0f;
@@ -7585,21 +7687,21 @@ static void ggml_compute_forward_conv_1d_2s_f16_f32(
7585
7687
  int64_t t0 = ggml_perf_time_us();
7586
7688
  UNUSED(t0);
7587
7689
 
7588
- const int ne00 = src0->ne[0];
7589
- const int ne01 = src0->ne[1];
7590
- const int ne02 = src0->ne[2];
7591
- //const int ne03 = src0->ne[3];
7690
+ const int64_t ne00 = src0->ne[0];
7691
+ const int64_t ne01 = src0->ne[1];
7692
+ const int64_t ne02 = src0->ne[2];
7693
+ //const int64_t ne03 = src0->ne[3];
7592
7694
 
7593
- const int ne10 = src1->ne[0];
7594
- const int ne11 = src1->ne[1];
7595
- //const int ne12 = src1->ne[2];
7596
- //const int ne13 = src1->ne[3];
7695
+ const int64_t ne10 = src1->ne[0];
7696
+ const int64_t ne11 = src1->ne[1];
7697
+ //const int64_t ne12 = src1->ne[2];
7698
+ //const int64_t ne13 = src1->ne[3];
7597
7699
 
7598
- //const int ne0 = dst->ne[0];
7599
- //const int ne1 = dst->ne[1];
7600
- //const int ne2 = dst->ne[2];
7601
- //const int ne3 = dst->ne[3];
7602
- //const int ne = ne0*ne1*ne2*ne3;
7700
+ //const int64_t ne0 = dst->ne[0];
7701
+ //const int64_t ne1 = dst->ne[1];
7702
+ //const int64_t ne2 = dst->ne[2];
7703
+ //const int64_t ne3 = dst->ne[3];
7704
+ //const int64_t ne = ne0*ne1*ne2*ne3;
7603
7705
 
7604
7706
  const int nb00 = src0->nb[0];
7605
7707
  const int nb01 = src0->nb[1];
@@ -7636,11 +7738,11 @@ static void ggml_compute_forward_conv_1d_2s_f16_f32(
7636
7738
  {
7637
7739
  ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
7638
7740
 
7639
- for (int i02 = 0; i02 < ne02; i02++) {
7640
- for (int i01 = 0; i01 < ne01; i01++) {
7741
+ for (int64_t i02 = 0; i02 < ne02; i02++) {
7742
+ for (int64_t i01 = 0; i01 < ne01; i01++) {
7641
7743
  const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i02*nb02 + i01*nb01);
7642
7744
  ggml_fp16_t * dst_data = wdata + i02*ew0*ne00;
7643
- for (int i00 = 0; i00 < ne00; i00++) {
7745
+ for (int64_t i00 = 0; i00 < ne00; i00++) {
7644
7746
  dst_data[i00*ew0 + i01] = src[i00];
7645
7747
  }
7646
7748
  }
@@ -7651,10 +7753,10 @@ static void ggml_compute_forward_conv_1d_2s_f16_f32(
7651
7753
  {
7652
7754
  ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + ne02*ew0*ne00;
7653
7755
 
7654
- for (int i11 = 0; i11 < ne11; i11++) {
7756
+ for (int64_t i11 = 0; i11 < ne11; i11++) {
7655
7757
  const float * const src = (float *)((char *) src1->data + i11*nb11);
7656
7758
  ggml_fp16_t * dst_data = wdata;
7657
- for (int i10 = 0; i10 < ne10; i10++) {
7759
+ for (int64_t i10 = 0; i10 < ne10; i10++) {
7658
7760
  dst_data[(i10 + nh)*ew0 + i11] = GGML_FP32_TO_FP16(src[i10]);
7659
7761
  }
7660
7762
  }
@@ -7679,7 +7781,7 @@ static void ggml_compute_forward_conv_1d_2s_f16_f32(
7679
7781
 
7680
7782
  for (int i1 = ir0; i1 < ir1; i1++) {
7681
7783
  float * dst_data = (float *)((char *) dst->data + i1*nb1);
7682
- for (int i0 = 0; i0 < ne10; i0 += 2) {
7784
+ for (int64_t i0 = 0; i0 < ne10; i0 += 2) {
7683
7785
  dst_data[i0/2] = 0;
7684
7786
  for (int k = -nh; k <= nh; k++) {
7685
7787
  float v = 0.0f;
@@ -7705,21 +7807,21 @@ static void ggml_compute_forward_conv_1d_2s_f32(
7705
7807
  int64_t t0 = ggml_perf_time_us();
7706
7808
  UNUSED(t0);
7707
7809
 
7708
- const int ne00 = src0->ne[0];
7709
- const int ne01 = src0->ne[1];
7710
- const int ne02 = src0->ne[2];
7711
- //const int ne03 = src0->ne[3];
7810
+ const int64_t ne00 = src0->ne[0];
7811
+ const int64_t ne01 = src0->ne[1];
7812
+ const int64_t ne02 = src0->ne[2];
7813
+ //const int64_t ne03 = src0->ne[3];
7712
7814
 
7713
- const int ne10 = src1->ne[0];
7714
- const int ne11 = src1->ne[1];
7715
- //const int ne12 = src1->ne[2];
7716
- //const int ne13 = src1->ne[3];
7815
+ const int64_t ne10 = src1->ne[0];
7816
+ const int64_t ne11 = src1->ne[1];
7817
+ //const int64_t ne12 = src1->ne[2];
7818
+ //const int64_t ne13 = src1->ne[3];
7717
7819
 
7718
- //const int ne0 = dst->ne[0];
7719
- //const int ne1 = dst->ne[1];
7720
- //const int ne2 = dst->ne[2];
7721
- //const int ne3 = dst->ne[3];
7722
- //const int ne = ne0*ne1*ne2*ne3;
7820
+ //const int64_t ne0 = dst->ne[0];
7821
+ //const int64_t ne1 = dst->ne[1];
7822
+ //const int64_t ne2 = dst->ne[2];
7823
+ //const int64_t ne3 = dst->ne[3];
7824
+ //const int64_t ne = ne0*ne1*ne2*ne3;
7723
7825
 
7724
7826
  const int nb00 = src0->nb[0];
7725
7827
  const int nb01 = src0->nb[1];
@@ -7756,11 +7858,11 @@ static void ggml_compute_forward_conv_1d_2s_f32(
7756
7858
  {
7757
7859
  float * const wdata = (float *) params->wdata + 0;
7758
7860
 
7759
- for (int i02 = 0; i02 < ne02; i02++) {
7760
- for (int i01 = 0; i01 < ne01; i01++) {
7861
+ for (int64_t i02 = 0; i02 < ne02; i02++) {
7862
+ for (int64_t i01 = 0; i01 < ne01; i01++) {
7761
7863
  const float * const src = (float *)((char *) src0->data + i02*nb02 + i01*nb01);
7762
7864
  float * dst_data = wdata + i02*ew0*ne00;
7763
- for (int i00 = 0; i00 < ne00; i00++) {
7865
+ for (int64_t i00 = 0; i00 < ne00; i00++) {
7764
7866
  dst_data[i00*ew0 + i01] = src[i00];
7765
7867
  }
7766
7868
  }
@@ -7771,10 +7873,10 @@ static void ggml_compute_forward_conv_1d_2s_f32(
7771
7873
  {
7772
7874
  float * const wdata = (float *) params->wdata + ne02*ew0*ne00;
7773
7875
 
7774
- for (int i11 = 0; i11 < ne11; i11++) {
7876
+ for (int64_t i11 = 0; i11 < ne11; i11++) {
7775
7877
  const float * const src = (float *)((char *) src1->data + i11*nb11);
7776
7878
  float * dst_data = wdata;
7777
- for (int i10 = 0; i10 < ne10; i10++) {
7879
+ for (int64_t i10 = 0; i10 < ne10; i10++) {
7778
7880
  dst_data[(i10 + nh)*ew0 + i11] = src[i10];
7779
7881
  }
7780
7882
  }
@@ -7799,7 +7901,7 @@ static void ggml_compute_forward_conv_1d_2s_f32(
7799
7901
 
7800
7902
  for (int i1 = ir0; i1 < ir1; i1++) {
7801
7903
  float * dst_data = (float *)((char *) dst->data + i1*nb1);
7802
- for (int i0 = 0; i0 < ne10; i0 += 2) {
7904
+ for (int64_t i0 = 0; i0 < ne10; i0 += 2) {
7803
7905
  dst_data[i0/2] = 0;
7804
7906
  for (int k = -nh; k <= nh; k++) {
7805
7907
  float v = 0.0f;
@@ -7851,25 +7953,25 @@ static void ggml_compute_forward_flash_attn_f32(
  int64_t t0 = ggml_perf_time_us();
  UNUSED(t0);
 
- const int neq0 = q->ne[0];
- const int neq1 = q->ne[1];
- const int neq2 = q->ne[2];
- const int neq3 = q->ne[3];
+ const int64_t neq0 = q->ne[0];
+ const int64_t neq1 = q->ne[1];
+ const int64_t neq2 = q->ne[2];
+ const int64_t neq3 = q->ne[3];
 
- const int nek0 = k->ne[0];
- const int nek1 = k->ne[1];
- //const int nek2 = k->ne[2];
- //const int nek3 = k->ne[3];
+ const int64_t nek0 = k->ne[0];
+ const int64_t nek1 = k->ne[1];
+ //const int64_t nek2 = k->ne[2];
+ //const int64_t nek3 = k->ne[3];
 
- //const int nev0 = v->ne[0];
- const int nev1 = v->ne[1];
- //const int nev2 = v->ne[2];
- //const int nev3 = v->ne[3];
+ //const int64_t nev0 = v->ne[0];
+ const int64_t nev1 = v->ne[1];
+ //const int64_t nev2 = v->ne[2];
+ //const int64_t nev3 = v->ne[3];
 
- const int ne0 = dst->ne[0];
- const int ne1 = dst->ne[1];
- //const int ne2 = dst->ne[2];
- //const int ne3 = dst->ne[3];
+ const int64_t ne0 = dst->ne[0];
+ const int64_t ne1 = dst->ne[1];
+ //const int64_t ne2 = dst->ne[2];
+ //const int64_t ne3 = dst->ne[3];
 
  const int nbk0 = k->nb[0];
  const int nbk1 = k->nb[1];
@@ -7894,10 +7996,10 @@ static void ggml_compute_forward_flash_attn_f32(
  const int ith = params->ith;
  const int nth = params->nth;
 
- const int D = neq0;
- const int N = neq1;
- const int P = nek1 - N;
- const int M = P + N;
+ const int64_t D = neq0;
+ const int64_t N = neq1;
+ const int64_t P = nek1 - N;
+ const int64_t M = P + N;
 
  const int Mup = ggml_up(M, GGML_SOFT_MAX_UNROLL);
 
@@ -7959,7 +8061,7 @@ static void ggml_compute_forward_flash_attn_f32(
  S[i] = -INFINITY;
  }
 
- for (int ic = 0; ic < nek1; ++ic) {
+ for (int64_t ic = 0; ic < nek1; ++ic) {
  // k indices
  const int ik3 = iq3;
  const int ik2 = iq2;
@@ -7978,7 +8080,7 @@ static void ggml_compute_forward_flash_attn_f32(
  ggml_vec_scale_f32(nek1, S, scale);
 
  if (masked) {
- for (int i = P; i < M; i++) {
+ for (int64_t i = P; i < M; i++) {
  if (i > P + iq1) {
  S[i] = -INFINITY;
  }
@@ -8036,7 +8138,7 @@ static void ggml_compute_forward_flash_attn_f32(
  #endif
  }
 
- for (int ic = 0; ic < nev1; ++ic) {
+ for (int64_t ic = 0; ic < nev1; ++ic) {
  // dst indices
  const int i1 = iq1;
  const int i2 = iq2;
@@ -8060,25 +8162,25 @@ static void ggml_compute_forward_flash_attn_f16(
  int64_t t0 = ggml_perf_time_us();
  UNUSED(t0);
 
- const int neq0 = q->ne[0];
- const int neq1 = q->ne[1];
- const int neq2 = q->ne[2];
- const int neq3 = q->ne[3];
+ const int64_t neq0 = q->ne[0];
+ const int64_t neq1 = q->ne[1];
+ const int64_t neq2 = q->ne[2];
+ const int64_t neq3 = q->ne[3];
 
- const int nek0 = k->ne[0];
- const int nek1 = k->ne[1];
- //const int nek2 = k->ne[2];
- //const int nek3 = k->ne[3];
+ const int64_t nek0 = k->ne[0];
+ const int64_t nek1 = k->ne[1];
+ //const int64_t nek2 = k->ne[2];
+ //const int64_t nek3 = k->ne[3];
 
- //const int nev0 = v->ne[0];
- const int nev1 = v->ne[1];
- //const int nev2 = v->ne[2];
- //const int nev3 = v->ne[3];
+ //const int64_t nev0 = v->ne[0];
+ const int64_t nev1 = v->ne[1];
+ //const int64_t nev2 = v->ne[2];
+ //const int64_t nev3 = v->ne[3];
 
- const int ne0 = dst->ne[0];
- const int ne1 = dst->ne[1];
- //const int ne2 = dst->ne[2];
- //const int ne3 = dst->ne[3];
+ const int64_t ne0 = dst->ne[0];
+ const int64_t ne1 = dst->ne[1];
+ //const int64_t ne2 = dst->ne[2];
+ //const int64_t ne3 = dst->ne[3];
 
  const int nbk0 = k->nb[0];
  const int nbk1 = k->nb[1];
@@ -8103,10 +8205,10 @@ static void ggml_compute_forward_flash_attn_f16(
  const int ith = params->ith;
  const int nth = params->nth;
 
- const int D = neq0;
- const int N = neq1;
- const int P = nek1 - N;
- const int M = P + N;
+ const int64_t D = neq0;
+ const int64_t N = neq1;
+ const int64_t P = nek1 - N;
+ const int64_t M = P + N;
 
  const int Mup = ggml_up(M, GGML_SOFT_MAX_UNROLL);
 
@@ -8169,7 +8271,7 @@ static void ggml_compute_forward_flash_attn_f16(
  }
 
  if (GGML_VEC_DOT_UNROLL > 2 || nek1 % GGML_VEC_DOT_UNROLL != 0) {
- for (int ic = 0; ic < nek1; ++ic) {
+ for (int64_t ic = 0; ic < nek1; ++ic) {
  // k indices
  const int ik3 = iq3;
  const int ik2 = iq2;
@@ -8184,7 +8286,7 @@ static void ggml_compute_forward_flash_attn_f16(
  (ggml_fp16_t *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)));
  }
  } else {
- for (int ic = 0; ic < nek1; ic += GGML_VEC_DOT_UNROLL) {
+ for (int64_t ic = 0; ic < nek1; ic += GGML_VEC_DOT_UNROLL) {
  // k indices
  const int ik3 = iq3;
  const int ik2 = iq2;
@@ -8204,7 +8306,7 @@ static void ggml_compute_forward_flash_attn_f16(
  ggml_vec_scale_f32(nek1, S, scale);
 
  if (masked) {
- for (int i = P; i < M; i++) {
+ for (int64_t i = P; i < M; i++) {
  if (i > P + iq1) {
  S[i] = -INFINITY;
  }
@@ -8264,12 +8366,12 @@ static void ggml_compute_forward_flash_attn_f16(
 
  ggml_fp16_t * S16 = (ggml_fp16_t *) ((float *) params->wdata + ith*(2*Mup + CACHE_LINE_SIZE_F32) + Mup);
 
- for (int i = 0; i < M; i++) {
+ for (int64_t i = 0; i < M; i++) {
  S16[i] = GGML_FP32_TO_FP16(S[i]);
  }
 
  if (GGML_VEC_DOT_UNROLL == 1 || (nev1 % GGML_VEC_DOT_UNROLL != 0)) {
- for (int ic = 0; ic < nev1; ++ic) {
+ for (int64_t ic = 0; ic < nev1; ++ic) {
  // dst indices
  const int i1 = iq1;
  const int i2 = iq2;
@@ -8281,7 +8383,7 @@ static void ggml_compute_forward_flash_attn_f16(
  S16);
  }
  } else {
- for (int ic = 0; ic < nev1; ic += GGML_VEC_DOT_UNROLL) {
+ for (int64_t ic = 0; ic < nev1; ic += GGML_VEC_DOT_UNROLL) {
  // dst indices
  const int i1 = iq1;
  const int i2 = iq2;
@@ -8337,35 +8439,35 @@ static void ggml_compute_forward_flash_ff_f16(
  int64_t t0 = ggml_perf_time_us();
  UNUSED(t0);
 
- const int nea0 = a->ne[0];
- const int nea1 = a->ne[1];
- const int nea2 = a->ne[2];
- const int nea3 = a->ne[3];
+ const int64_t nea0 = a->ne[0];
+ const int64_t nea1 = a->ne[1];
+ const int64_t nea2 = a->ne[2];
+ const int64_t nea3 = a->ne[3];
 
- const int neb00 = b0->ne[0];
- const int neb01 = b0->ne[1];
- //const int neb02 = b0->ne[2];
- //const int neb03 = b0->ne[3];
+ const int64_t neb00 = b0->ne[0];
+ const int64_t neb01 = b0->ne[1];
+ //const int64_t neb02 = b0->ne[2];
+ //const int64_t neb03 = b0->ne[3];
 
- const int neb10 = b1->ne[0];
- const int neb11 = b1->ne[1];
- //const int neb12 = b1->ne[2];
- //const int neb13 = b1->ne[3];
+ const int64_t neb10 = b1->ne[0];
+ const int64_t neb11 = b1->ne[1];
+ //const int64_t neb12 = b1->ne[2];
+ //const int64_t neb13 = b1->ne[3];
 
- const int nec00 = c0->ne[0];
- const int nec01 = c0->ne[1];
- //const int nec02 = c0->ne[2];
- //const int nec03 = c0->ne[3];
+ const int64_t nec00 = c0->ne[0];
+ const int64_t nec01 = c0->ne[1];
+ //const int64_t nec02 = c0->ne[2];
+ //const int64_t nec03 = c0->ne[3];
 
- const int nec10 = c1->ne[0];
- const int nec11 = c1->ne[1];
- //const int nec12 = c1->ne[2];
- //const int nec13 = c1->ne[3];
+ const int64_t nec10 = c1->ne[0];
+ const int64_t nec11 = c1->ne[1];
+ //const int64_t nec12 = c1->ne[2];
+ //const int64_t nec13 = c1->ne[3];
 
- const int ne0 = dst->ne[0];
- const int ne1 = dst->ne[1];
- const int ne2 = dst->ne[2];
- //const int ne3 = dst->ne[3];
+ const int64_t ne0 = dst->ne[0];
+ const int64_t ne1 = dst->ne[1];
+ const int64_t ne2 = dst->ne[2];
+ //const int64_t ne3 = dst->ne[3];
 
  const int nba0 = a->nb[0];
  const int nba1 = a->nb[1];
@@ -8400,9 +8502,9 @@ static void ggml_compute_forward_flash_ff_f16(
  const int ith = params->ith;
  const int nth = params->nth;
 
- const int D = nea0;
- //const int N = nea1;
- const int M = neb01;
+ const int64_t D = nea0;
+ //const int64_t N = nea1;
+ const int64_t M = neb01;
 
  GGML_ASSERT(ne0 == nea0);
  GGML_ASSERT(ne1 == nea1);
@@ -8458,7 +8560,7 @@ static void ggml_compute_forward_flash_ff_f16(
 
  float * S = (float *) params->wdata + ith*(2*M + CACHE_LINE_SIZE_F32);
 
- for (int ic = 0; ic < neb01; ++ic) {
+ for (int64_t ic = 0; ic < neb01; ++ic) {
  // b0 indices
  const int ib03 = ia3;
  const int ib02 = ia2;
@@ -8478,7 +8580,7 @@ static void ggml_compute_forward_flash_ff_f16(
 
  ggml_fp16_t * S16 = (ggml_fp16_t *) ((float *) params->wdata + ith*(2*M + CACHE_LINE_SIZE_F32) + M);
 
- for (int i = 0; i < M; i++) {
+ for (int64_t i = 0; i < M; i++) {
  S16[i] = GGML_FP32_TO_FP16(S[i]);
  }
 
@@ -8490,7 +8592,7 @@ static void ggml_compute_forward_flash_ff_f16(
  const int i2 = ia2;
  const int i3 = ia3;
 
- for (int ic = 0; ic < nec01; ++ic) {
+ for (int64_t ic = 0; ic < nec01; ++ic) {
 
  ggml_vec_dot_f16(neb01,
  (float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)),
@@ -9355,7 +9457,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
  } break;
  case GGML_OP_ROPE:
  {
- node->n_tasks = 1;
+ node->n_tasks = n_threads;
  } break;
  case GGML_OP_CONV_1D_1S:
  case GGML_OP_CONV_1D_2S:
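The hunk above raises the ROPE op's n_tasks from 1 to n_threads, so its work can be spread over all worker threads instead of running on a single one. For orientation only, the sketch below shows the usual way a row-parallel op divides its rows once several workers are assigned; the function and variable names are illustrative, not the exact ggml code.

#include <stdint.h>

// Illustrative sketch: nr rows shared across nth workers, ith = this worker's index.
static void process_rows_for_thread(int64_t nr, int ith, int nth) {
    const int64_t dr  = (nr + nth - 1) / nth;             // rows per worker, rounded up
    const int64_t ir0 = dr * ith;                          // first row owned by this worker
    const int64_t ir1 = (ir0 + dr < nr) ? ir0 + dr : nr;   // one past the last owned row
    for (int64_t ir = ir0; ir < ir1; ++ir) {
        // ... apply the op (e.g. the rotary embedding) to row ir ...
        (void) ir;
    }
}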
@@ -9393,7 +9495,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
 
  size_t cur = 0;
 
- const int ne11 = ggml_up(node->src1->ne[1], GGML_SOFT_MAX_UNROLL);
+ const int64_t ne11 = ggml_up(node->src1->ne[1], GGML_SOFT_MAX_UNROLL);
 
  if (node->src1->type == GGML_TYPE_F32) {
  cur = sizeof(float)*ne11*node->n_tasks; // TODO: this can become (n_tasks-1)
@@ -9652,7 +9754,7 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) {
 
  perf_total_per_op_us[node->op] += node->perf_time_us;
 
- GGML_PRINT(" - %3d: [ %6d, %6d, %6d] %16s %s (%3d) cpu = %7.3f / %7.3f ms, wall = %7.3f / %7.3f ms\n",
+ GGML_PRINT(" - %3d: [ %" PRId64 ", %" PRId64 ", %" PRId64 "] %16s %s (%3d) cpu = %7.3f / %7.3f ms, wall = %7.3f / %7.3f ms\n",
  i,
  node->ne[0], node->ne[1], node->ne[2],
  GGML_OP_LABEL[node->op], node->is_param ? "x" : node->grad ? "g" : " ", node->perf_runs,
@@ -9666,7 +9768,7 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) {
  for (int i = 0; i < cgraph->n_leafs; i++) {
  struct ggml_tensor * node = cgraph->leafs[i];
 
- GGML_PRINT(" - %3d: [ %6d, %6d] %8s\n",
+ GGML_PRINT(" - %3d: [ %" PRId64 ", %" PRId64 "] %8s\n",
  i,
  node->ne[0], node->ne[1],
  GGML_OP_LABEL[node->op]);
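These format strings move from %6d to the PRId64 macro because node->ne[] is now int64_t, and passing an int64_t where printf expects an int is undefined behavior on common ABIs. As a standalone reminder of the portable idiom (the shape values here are made up for the demo):

#include <inttypes.h>
#include <stdio.h>

int main(void) {
    // Hypothetical tensor shape, just to demonstrate the format macro.
    int64_t ne[2] = { 4096, 32000 };
    // "%d" is not a valid conversion for int64_t; PRId64 expands to the
    // correct length modifier for the platform (e.g. "ld" or "lld").
    printf(" - [ %" PRId64 ", %" PRId64 " ]\n", ne[0], ne[1]);
    return 0;
}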
@@ -9737,7 +9839,7 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
 
  fprintf(fp, " \"%p\" [ \
  style = filled; fillcolor = %s; shape = record; \
- label=\"%d [%d, %d] | <x>%s",
+ label=\"%d [%" PRId64 ", %" PRId64 "] | <x>%s",
  (void *) node, color,
  i, node->ne[0], node->ne[1],
  GGML_OP_SYMBOL[node->op]);
@@ -9762,7 +9864,7 @@ label=\"<x>%.1e\"; ]\n",
  } else {
  fprintf(fp, " \"%p\" [ \
  style = filled; fillcolor = %s; shape = record; \
- label=\"<x>CONST %d [%d, %d]\"; ]\n",
+ label=\"<x>CONST %d [%" PRId64 ", %" PRId64 "]\"; ]\n",
  (void *) node, color,
  i, node->ne[0], node->ne[1]);
  }
@@ -9826,9 +9928,9 @@ label=\"<x>CONST %d [%d, %d]\"; ]\n",
  static void ggml_opt_set_params(int np, struct ggml_tensor * const ps[], const float * x) {
  int i = 0;
  for (int p = 0; p < np; ++p) {
- const int ne = ggml_nelements(ps[p]) ;
+ const int64_t ne = ggml_nelements(ps[p]) ;
  // TODO: add function to set tensor from array
- for (int j = 0; j < ne; ++j) {
+ for (int64_t j = 0; j < ne; ++j) {
  ggml_set_f32_1d(ps[p], j, x[i++]);
  }
  }
@@ -9837,9 +9939,9 @@ static void ggml_opt_set_params(int np, struct ggml_tensor * const ps[], const f
  static void ggml_opt_get_params(int np, struct ggml_tensor * const ps[], float * x) {
  int i = 0;
  for (int p = 0; p < np; ++p) {
- const int ne = ggml_nelements(ps[p]) ;
+ const int64_t ne = ggml_nelements(ps[p]) ;
  // TODO: add function to get all elements at once
- for (int j = 0; j < ne; ++j) {
+ for (int64_t j = 0; j < ne; ++j) {
  x[i++] = ggml_get_f32_1d(ps[p], j);
  }
  }
@@ -9848,9 +9950,9 @@ static void ggml_opt_get_params(int np, struct ggml_tensor * const ps[], float *
  static void ggml_opt_get_grad(int np, struct ggml_tensor * const ps[], float * g) {
  int i = 0;
  for (int p = 0; p < np; ++p) {
- const int ne = ggml_nelements(ps[p]) ;
+ const int64_t ne = ggml_nelements(ps[p]) ;
  // TODO: add function to get all elements at once
- for (int j = 0; j < ne; ++j) {
+ for (int64_t j = 0; j < ne; ++j) {
  g[i++] = ggml_get_f32_1d(ps[p]->grad, j);
  }
  }