llama-rb 0.1.0 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -16,6 +16,7 @@
16
16
  #include <stdlib.h>
17
17
  #include <string.h>
18
18
  #include <stdint.h>
19
+ #include <inttypes.h>
19
20
  #include <stdio.h>
20
21
  #include <float.h>
21
22
 
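Presumably the new `<inttypes.h>` include supports the `int` → `int64_t` migration that runs through the rest of this diff: printing 64-bit counts portably needs the `PRId64` format macros rather than `%d`. A minimal sketch (the variable names are illustrative, not taken from ggml):

```c
#include <inttypes.h>
#include <stdio.h>

int main(void) {
    // Tensor dimensions are now 64-bit, so element counts can exceed INT_MAX.
    const int64_t ne[4] = { 32000, 5120, 1, 1 };
    const int64_t nelements = ne[0]*ne[1]*ne[2]*ne[3];

    // "%" PRId64 expands to the correct conversion specifier for int64_t
    // on every platform ("%lld" on most, "%ld" on LP64 Linux).
    printf("nelements = %" PRId64 "\n", nelements);
    return 0;
}
```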
@@ -1961,42 +1962,71 @@ static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void * rest
1961
1962
  // Initialize accumulator with zeros
1962
1963
  __m256 acc = _mm256_setzero_ps();
1963
1964
 
1964
- // Main loop
1965
- // TODO: figure a way to do this in a portable way
1966
- #ifdef __GNUC__
1967
- #pragma GCC unroll 16
1968
- #endif
1969
- for (int i = 0; i < nb; ++i) {
1970
- // Compute combined scale for the block
1971
- const __m256 d = _mm256_mul_ps( _mm256_broadcast_ss( &x[i].d ), _mm256_broadcast_ss( &y[i].d ) );
1972
-
1973
- // Load 16 bytes, and unpack 4 bit fields into bytes, making 32 bytes
1974
- __m256i bx = bytesFromNibbles( x[i].qs );
1975
- __m256i by = bytesFromNibbles( y[i].qs );
1976
-
1977
- // Now we have a vector with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval.
1978
- const __m256i off = _mm256_set1_epi8( 8 );
1979
- bx = _mm256_sub_epi8( bx, off );
1980
- by = _mm256_sub_epi8( by, off );
1981
-
1982
- // Get absolute values of x vectors
1983
- const __m256i ax = _mm256_sign_epi8(bx, bx);
1984
-
1985
- // Sign the values of the y vectors
1986
- const __m256i sy = _mm256_sign_epi8(by, bx);
1987
-
1988
- // Perform multiplication and create 16-bit values
1989
- const __m256i dot = _mm256_maddubs_epi16(ax, sy);
1990
-
1991
- const __m256i ones = _mm256_set1_epi16(1);
1992
- const __m256i i32 = _mm256_madd_epi16(ones, dot);
1965
+ /* Prepare the constants we will need during execution */
1966
+ const __m256i lowMask = _mm256_set1_epi8( 0xF );
1967
+ const __m256i offset_8 = _mm256_set1_epi16( 8 );
1993
1968
 
1994
- // Convert int32_t to float
1995
- const __m256 p = _mm256_cvtepi32_ps( i32 );
1969
+ #define UNROLL_COUNT 8
1970
+ // make sure the number of blocks is a multiple of the unroll count
1971
+ assert(nb % UNROLL_COUNT == 0);
1996
1972
 
1997
- // Apply the scale, and accumulate
1998
- acc = _mm256_fmadd_ps( d, p, acc );
1999
- }
1973
+ // Main loop
1974
+ for (int i = 0; i < nb; i+=UNROLL_COUNT) {
1975
+
1976
+ // This loop will be unrolled by the compiler
1977
+ for (int u=0;u<UNROLL_COUNT;u++) {
1978
+ /* Compute combined scale for the block */
1979
+ const __m256 scale = _mm256_mul_ps(
1980
+ _mm256_broadcast_ss( &x[i+u].d ),
1981
+ _mm256_broadcast_ss( &y[i+u].d ) );
1982
+
1983
+ /* get input from x
1984
+ Input: 32 Nibbles (16 bytes) at *x[i+u]
1985
+ Output: 2 vectors with 16 values of type int16_t (x_high_q, x_low_q) */
1986
+
1987
+ /* Load 16 bytes from memory */
1988
+ const __m128i tmp_x = _mm_loadu_si128( ( const __m128i* ) x[i+u].qs);
1989
+ /* Expand bytes into uint16_t values */
1990
+ const __m256i bytes_x = _mm256_cvtepu8_epi16(tmp_x);
1991
+ /* Unpack values into individual bytes */
1992
+ __m256i x_low_q = _mm256_and_si256( lowMask, bytes_x );
1993
+ const __m256i pre_shift_x_high_q = _mm256_andnot_si256( lowMask, bytes_x );
1994
+ __m256i x_high_q = _mm256_srli_epi16( pre_shift_x_high_q, 4 );
1995
+ /* Now we have two vectors with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval. */
1996
+ x_high_q = _mm256_sub_epi16( x_high_q, offset_8 );
1997
+ x_low_q = _mm256_sub_epi16( x_low_q, offset_8 );
1998
+
1999
+ /* get input from y
2000
+ Input: 32 Nibbles (16 bytes) at *y[i+u]
2001
+ Output: 2 vectors with 16 values of type int16_t (y_high_q, y_low_q) */
2002
+
2003
+ /* Load 16 bytes from memory */
2004
+ const __m128i tmp_y = _mm_loadu_si128( (const __m128i* ) y[i+u].qs);
2005
+ /* Expand bytes into uint16_t values */
2006
+ const __m256i bytes_y = _mm256_cvtepu8_epi16(tmp_y);
2007
+ /* Unpack values into individual bytes */
2008
+ const __m256i pre_shift_y_high_q = _mm256_andnot_si256( lowMask, bytes_y );
2009
+ __m256i y_high_q = _mm256_srli_epi16( pre_shift_y_high_q, 4 );
2010
+ __m256i y_low_q = _mm256_and_si256( lowMask, bytes_y );
2011
+ /* Now we have two vectors with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval. */
2012
+ y_high_q = _mm256_sub_epi16( y_high_q, offset_8 );
2013
+ y_low_q = _mm256_sub_epi16( y_low_q, offset_8 );
2014
+
2015
+ /* Compute products of int16_t integers, add pairwise, store as int32_t */
2016
+ __m256i xy_high_q = _mm256_madd_epi16( x_high_q, y_high_q );
2017
+ __m256i xy_low_q = _mm256_madd_epi16( x_low_q, y_low_q );
2018
+
2019
+ /* Accumulate the products of int32_t integers -> we now have a vector of 8 int32_t */
2020
+ __m256i xy_q = _mm256_add_epi32( xy_high_q, xy_low_q );
2021
+
2022
+ /* Convert the vector of 8 int32_t to 8 floats */
2023
+ __m256 q = _mm256_cvtepi32_ps( xy_q );
2024
+
2025
+ /* Multiply q with scale and accumulate */
2026
+ acc = _mm256_fmadd_ps( scale, q, acc );
2027
+ }
2028
+
2029
+ }
2000
2030
 
2001
2031
  // Return horizontal sum of the acc vector
2002
2032
  __m128 res = _mm256_extractf128_ps( acc, 1 );
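Both the removed and the added AVX2 paths compute the same block-wise Q4_0 dot product; as a reference for what the intrinsics above implement, here is a scalar sketch of one block (the struct is a simplified stand-in for the nibble-packed block layout the code assumes):

```c
#include <stdint.h>

// Scalar model of the Q4_0 block dot product that the AVX2 code vectorizes.
// QK (block size) and the layout mirror what the intrinsics assume: each byte
// of qs holds two 4-bit quants, low nibble first, plus one float scale per block.
#define QK 32

typedef struct {
    float   d;          // block scale
    uint8_t qs[QK / 2]; // 32 quants packed as nibbles
} block_q4_0;

static float dot_q4_0_block(const block_q4_0 *x, const block_q4_0 *y) {
    int sum = 0;
    for (int j = 0; j < QK / 2; j++) {
        // unpack the two nibbles and shift them from [0, 15] to [-8, +7]
        const int x0 = (x->qs[j] & 0x0F) - 8;
        const int x1 = (x->qs[j] >>   4) - 8;
        const int y0 = (y->qs[j] & 0x0F) - 8;
        const int y1 = (y->qs[j] >>   4) - 8;
        sum += x0*y0 + x1*y1;
    }
    // apply the combined scale, like _mm256_fmadd_ps(scale, q, acc) above
    return x->d * y->d * (float) sum;
}
```

The new path widens the nibbles to 16-bit lanes before multiplying, which lets it pair low and high nibbles directly with `_mm256_madd_epi16` and unroll the per-block body by `UNROLL_COUNT`.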
@@ -2025,7 +2055,7 @@ static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void * rest
2025
2055
  bx = _mm_sub_epi8( bx, off );
2026
2056
  by = _mm_sub_epi8( by, off );
2027
2057
 
2028
- // Get absolute values of x vectors
2058
+ // Get absolute values of x vectors
2029
2059
  const __m128i ax = _mm_sign_epi8(bx, bx);
2030
2060
 
2031
2061
  // Sign the values of the y vectors
@@ -2774,7 +2804,7 @@ void ggml_print_objects(const struct ggml_context * ctx) {
2774
2804
  GGML_PRINT("%s: --- end ---\n", __func__);
2775
2805
  }
2776
2806
 
2777
- int ggml_nelements(const struct ggml_tensor * tensor) {
2807
+ int64_t ggml_nelements(const struct ggml_tensor * tensor) {
2778
2808
  static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
2779
2809
 
2780
2810
  return tensor->ne[0]*tensor->ne[1]*tensor->ne[2]*tensor->ne[3];
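The widening to `int64_t` matters because the element count is a product of four dimensions and can overflow a 32-bit `int` for large tensors. A quick illustration of the failure mode the change avoids (the sizes are hypothetical):

```c
#include <inttypes.h>
#include <stdio.h>

int main(void) {
    // Hypothetical tensor shape: 32000 x 67200 elements.
    // 32000 * 67200 = 2,150,400,000 > INT32_MAX (2,147,483,647).
    const int64_t ne0 = 32000, ne1 = 67200;

    const int64_t n64 = ne0*ne1;        // what ggml_nelements now returns
    const int32_t n32 = (int32_t) n64;  // a 32-bit result would wrap (implementation-defined)

    printf("as int64_t: %" PRId64 "\n", n64);  // 2150400000
    printf("as int32_t: %d\n", n32);           // wraps to a negative value on typical targets
    return 0;
}
```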
@@ -3090,7 +3120,7 @@ struct ggml_tensor * ggml_new_tensor_impl(
3090
3120
  struct ggml_context * ctx,
3091
3121
  enum ggml_type type,
3092
3122
  int n_dims,
3093
- const int* ne,
3123
+ const int64_t* ne,
3094
3124
  void* data) {
3095
3125
  // always insert objects at the end of the context's memory pool
3096
3126
  struct ggml_object * obj_cur = ctx->objects_end;
@@ -3189,7 +3219,8 @@ struct ggml_tensor * ggml_new_tensor_impl(
3189
3219
  /*.pad =*/ { 0 },
3190
3220
  };
3191
3221
 
3192
- ggml_assert_aligned(result->data);
3222
+ // TODO: this should not be needed as long as we don't rely on aligned SIMD loads
3223
+ //ggml_assert_aligned(result->data);
3193
3224
 
3194
3225
  for (int i = 0; i < n_dims; i++) {
3195
3226
  result->ne[i] = ne[i];
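Per the TODO, the alignment assert can be dropped as long as the compute kernels stick to unaligned SIMD loads. A minimal contrast between the two load flavors, assuming AVX (not ggml code):

```c
#include <immintrin.h>

// _mm256_loadu_ps accepts any address; _mm256_load_ps requires 32-byte
// alignment and may fault otherwise. If the kernels use only the unaligned
// variants, tensor data no longer has to satisfy an alignment assert at creation.
static float sum8_unaligned(const float *p) {   // p may be unaligned
    const __m256 v = _mm256_loadu_ps(p);        // safe for any p
    // const __m256 v = _mm256_load_ps(p);      // would require ((uintptr_t)p % 32) == 0
    const __m128 lo = _mm256_castps256_ps128(v);
    const __m128 hi = _mm256_extractf128_ps(v, 1);
    __m128 s = _mm_add_ps(lo, hi);
    s = _mm_hadd_ps(s, s);
    s = _mm_hadd_ps(s, s);
    return _mm_cvtss_f32(s);
}
```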
@@ -3210,44 +3241,44 @@ struct ggml_tensor * ggml_new_tensor(
3210
3241
  struct ggml_context * ctx,
3211
3242
  enum ggml_type type,
3212
3243
  int n_dims,
3213
- const int * ne) {
3244
+ const int64_t * ne) {
3214
3245
  return ggml_new_tensor_impl(ctx, type, n_dims, ne, NULL);
3215
3246
  }
3216
3247
 
3217
3248
  struct ggml_tensor * ggml_new_tensor_1d(
3218
3249
  struct ggml_context * ctx,
3219
3250
  enum ggml_type type,
3220
- int ne0) {
3251
+ int64_t ne0) {
3221
3252
  return ggml_new_tensor(ctx, type, 1, &ne0);
3222
3253
  }
3223
3254
 
3224
3255
  struct ggml_tensor * ggml_new_tensor_2d(
3225
3256
  struct ggml_context * ctx,
3226
3257
  enum ggml_type type,
3227
- int ne0,
3228
- int ne1) {
3229
- const int ne[2] = { ne0, ne1 };
3258
+ int64_t ne0,
3259
+ int64_t ne1) {
3260
+ const int64_t ne[2] = { ne0, ne1 };
3230
3261
  return ggml_new_tensor(ctx, type, 2, ne);
3231
3262
  }
3232
3263
 
3233
3264
  struct ggml_tensor * ggml_new_tensor_3d(
3234
3265
  struct ggml_context * ctx,
3235
3266
  enum ggml_type type,
3236
- int ne0,
3237
- int ne1,
3238
- int ne2) {
3239
- const int ne[3] = { ne0, ne1, ne2 };
3267
+ int64_t ne0,
3268
+ int64_t ne1,
3269
+ int64_t ne2) {
3270
+ const int64_t ne[3] = { ne0, ne1, ne2 };
3240
3271
  return ggml_new_tensor(ctx, type, 3, ne);
3241
3272
  }
3242
3273
 
3243
3274
  struct ggml_tensor * ggml_new_tensor_4d(
3244
3275
  struct ggml_context * ctx,
3245
3276
  enum ggml_type type,
3246
- int ne0,
3247
- int ne1,
3248
- int ne2,
3249
- int ne3) {
3250
- const int ne[4] = { ne0, ne1, ne2, ne3 };
3277
+ int64_t ne0,
3278
+ int64_t ne1,
3279
+ int64_t ne2,
3280
+ int64_t ne3) {
3281
+ const int64_t ne[4] = { ne0, ne1, ne2, ne3 };
3251
3282
  return ggml_new_tensor(ctx, type, 4, ne);
3252
3283
  }
3253
3284
 
@@ -3590,7 +3621,14 @@ float * ggml_get_data_f32(const struct ggml_tensor * tensor) {
3590
3621
  struct ggml_tensor * ggml_view_tensor(
3591
3622
  struct ggml_context * ctx,
3592
3623
  const struct ggml_tensor * src) {
3593
- return ggml_new_tensor_impl(ctx, src->type, src->n_dims, src->ne, src->data);
3624
+ struct ggml_tensor * result = ggml_new_tensor_impl(ctx, src->type, src->n_dims, src->ne, src->data);
3625
+
3626
+ result->nb[0] = src->nb[0];
3627
+ result->nb[1] = src->nb[1];
3628
+ result->nb[2] = src->nb[2];
3629
+ result->nb[3] = src->nb[3];
3630
+
3631
+ return result;
3594
3632
  }
3595
3633
 
3596
3634
  ////////////////////////////////////////////////////////////////////////////////
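Copying `nb[]` matters because a view of a permuted or otherwise non-contiguous tensor must keep the source's strides instead of the contiguous defaults that `ggml_new_tensor_impl` computes. A much simplified sketch of the idea, using a hypothetical 2-D descriptor rather than the real `ggml_tensor`:

```c
#include <stdint.h>
#include <stddef.h>

// Hypothetical, simplified 2-D descriptor: ne = extents, nb = strides in bytes
// (same naming convention as ggml_tensor, but not the real struct).
struct view2d {
    int64_t ne[2];
    size_t  nb[2];
    char   *data;
};

// Strided addressing: the byte offset of element (i0, i1) is i0*nb[0] + i1*nb[1].
static float get_f32(const struct view2d *t, int64_t i0, int64_t i1) {
    return *(const float *)(t->data + i0*t->nb[0] + i1*t->nb[1]);
}

// A view shares the data pointer, so it must also inherit the strides.
// Recomputing "contiguous" strides for a transposed/permuted source would make
// get_f32 read the wrong elements -- which is why ggml_view_tensor now copies nb[].
static struct view2d make_view(const struct view2d *src) {
    struct view2d v = *src;   // copies ne, nb and data together
    return v;
}
```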
@@ -3894,7 +3932,7 @@ struct ggml_tensor * ggml_mean(
3894
3932
  is_node = true;
3895
3933
  }
3896
3934
 
3897
- int ne[GGML_MAX_DIMS] = { 1, a->ne[1], a->ne[2], a->ne[3] };
3935
+ int64_t ne[GGML_MAX_DIMS] = { 1, a->ne[1], a->ne[2], a->ne[3] };
3898
3936
  struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, a->n_dims, ne);
3899
3937
 
3900
3938
  result->op = GGML_OP_MEAN;
@@ -4255,7 +4293,7 @@ struct ggml_tensor * ggml_mul_mat(
4255
4293
  is_node = true;
4256
4294
  }
4257
4295
 
4258
- const int ne[4] = { a->ne[1], b->ne[1], a->ne[2], b->ne[3] };
4296
+ const int64_t ne[4] = { a->ne[1], b->ne[1], a->ne[2], b->ne[3] };
4259
4297
  struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, MIN(a->n_dims, b->n_dims), ne);
4260
4298
 
4261
4299
  result->op = GGML_OP_MUL_MAT;
@@ -4380,8 +4418,8 @@ struct ggml_tensor * ggml_reshape(
4380
4418
  struct ggml_tensor * ggml_reshape_2d(
4381
4419
  struct ggml_context * ctx,
4382
4420
  struct ggml_tensor * a,
4383
- int ne0,
4384
- int ne1) {
4421
+ int64_t ne0,
4422
+ int64_t ne1) {
4385
4423
  GGML_ASSERT(ggml_is_contiguous(a));
4386
4424
  GGML_ASSERT(ggml_nelements(a) == ne0*ne1);
4387
4425
 
@@ -4392,7 +4430,7 @@ struct ggml_tensor * ggml_reshape_2d(
4392
4430
  is_node = true;
4393
4431
  }
4394
4432
 
4395
- const int ne[2] = { ne0, ne1 };
4433
+ const int64_t ne[2] = { ne0, ne1 };
4396
4434
  struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 2, ne, a->data);
4397
4435
 
4398
4436
  result->op = GGML_OP_RESHAPE;
@@ -4406,9 +4444,9 @@ struct ggml_tensor * ggml_reshape_2d(
4406
4444
  struct ggml_tensor * ggml_reshape_3d(
4407
4445
  struct ggml_context * ctx,
4408
4446
  struct ggml_tensor * a,
4409
- int ne0,
4410
- int ne1,
4411
- int ne2) {
4447
+ int64_t ne0,
4448
+ int64_t ne1,
4449
+ int64_t ne2) {
4412
4450
  GGML_ASSERT(ggml_is_contiguous(a));
4413
4451
  GGML_ASSERT(ggml_nelements(a) == ne0*ne1*ne2);
4414
4452
 
@@ -4419,7 +4457,7 @@ struct ggml_tensor * ggml_reshape_3d(
4419
4457
  is_node = true;
4420
4458
  }
4421
4459
 
4422
- const int ne[3] = { ne0, ne1, ne2 };
4460
+ const int64_t ne[3] = { ne0, ne1, ne2 };
4423
4461
  struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 3, ne, a->data);
4424
4462
 
4425
4463
  result->op = GGML_OP_RESHAPE;
@@ -4435,7 +4473,7 @@ struct ggml_tensor * ggml_reshape_3d(
4435
4473
  struct ggml_tensor * ggml_view_1d(
4436
4474
  struct ggml_context * ctx,
4437
4475
  struct ggml_tensor * a,
4438
- int ne0,
4476
+ int64_t ne0,
4439
4477
  size_t offset) {
4440
4478
  if (a->grad) {
4441
4479
  GGML_ASSERT(false); // gradient propagation is not supported
@@ -4456,15 +4494,15 @@ struct ggml_tensor * ggml_view_1d(
4456
4494
  struct ggml_tensor * ggml_view_2d(
4457
4495
  struct ggml_context * ctx,
4458
4496
  struct ggml_tensor * a,
4459
- int ne0,
4460
- int ne1,
4497
+ int64_t ne0,
4498
+ int64_t ne1,
4461
4499
  size_t nb1,
4462
4500
  size_t offset) {
4463
4501
  if (a->grad) {
4464
4502
  GGML_ASSERT(false); // gradient propagation is not supported
4465
4503
  }
4466
4504
 
4467
- const int ne[GGML_MAX_DIMS] = { ne0, ne1, 1, 1 };
4505
+ const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, 1, 1 };
4468
4506
 
4469
4507
  struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 2, ne, (char *) a->data + offset);
4470
4508
 
@@ -4480,6 +4518,37 @@ struct ggml_tensor * ggml_view_2d(
4480
4518
  return result;
4481
4519
  }
4482
4520
 
4521
+ // ggml_view_3d
4522
+
4523
+ struct ggml_tensor * ggml_view_3d(
4524
+ struct ggml_context * ctx,
4525
+ struct ggml_tensor * a,
4526
+ int64_t ne0,
4527
+ int64_t ne1,
4528
+ int64_t ne2,
4529
+ size_t nb1,
4530
+ size_t nb2,
4531
+ size_t offset) {
4532
+ if (a->grad) {
4533
+ GGML_ASSERT(false); // gradient propagation is not supported
4534
+ }
4535
+
4536
+ const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, ne2, 1 };
4537
+
4538
+ struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 3, ne, (char *) a->data + offset);
4539
+
4540
+ result->nb[1] = nb1;
4541
+ result->nb[2] = nb2;
4542
+ result->nb[3] = result->nb[2]*ne2;
4543
+
4544
+ result->op = GGML_OP_VIEW;
4545
+ result->grad = NULL;
4546
+ result->src0 = a;
4547
+ result->src1 = NULL; // TODO: maybe store the offset here?
4548
+
4549
+ return result;
4550
+ }
4551
+
4483
4552
  // ggml_permute
4484
4553
 
4485
4554
  struct ggml_tensor * ggml_permute(
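A hedged usage sketch for the newly added `ggml_view_3d`, assuming a valid `ggml_context` and made-up sizes:

```c
// Illustrative only; matches the signature added above:
//   ggml_view_3d(ctx, a, ne0, ne1, ne2, nb1, nb2, offset)
static struct ggml_tensor * make_window(struct ggml_context * ctx) {
    struct ggml_tensor * cur = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 64, 32, 8);

    // Take the first 16 elements of every row in every plane. The row stride
    // (nb1) and plane stride (nb2) are carried over from the parent so the
    // view keeps the same memory layout; offset is in bytes from cur->data.
    return ggml_view_3d(ctx, cur, 16, 32, 8, cur->nb[1], cur->nb[2], 0);
}
```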
@@ -4695,7 +4764,7 @@ struct ggml_tensor * ggml_conv_1d_1s(
4695
4764
  is_node = true;
4696
4765
  }
4697
4766
 
4698
- const int ne[4] = { b->ne[0], a->ne[2], 1, 1, };
4767
+ const int64_t ne[4] = { b->ne[0], a->ne[2], 1, 1, };
4699
4768
  struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne);
4700
4769
 
4701
4770
  result->op = GGML_OP_CONV_1D_1S;
@@ -4722,7 +4791,7 @@ struct ggml_tensor * ggml_conv_1d_2s(
4722
4791
  is_node = true;
4723
4792
  }
4724
4793
 
4725
- const int ne[4] = { b->ne[0]/2, a->ne[2], 1, 1, };
4794
+ const int64_t ne[4] = { b->ne[0]/2, a->ne[2], 1, 1, };
4726
4795
  struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne);
4727
4796
 
4728
4797
  result->op = GGML_OP_CONV_1D_2S;
@@ -4815,102 +4884,112 @@ static void ggml_compute_forward_dup_f16(
4815
4884
  const struct ggml_tensor * src0,
4816
4885
  struct ggml_tensor * dst) {
4817
4886
  GGML_ASSERT(params->ith == 0);
4818
- GGML_ASSERT(ggml_is_contiguous(dst));
4819
4887
  GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0));
4820
4888
 
4821
4889
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
4822
4890
  return;
4823
4891
  }
4824
4892
 
4825
- const int ne00 = src0->ne[0];
4826
- const int ne01 = src0->ne[1];
4827
- const int ne02 = src0->ne[2];
4828
- const int ne03 = src0->ne[3];
4893
+ const int64_t ne00 = src0->ne[0];
4894
+ const int64_t ne01 = src0->ne[1];
4895
+ const int64_t ne02 = src0->ne[2];
4896
+ const int64_t ne03 = src0->ne[3];
4829
4897
 
4830
4898
  const size_t nb00 = src0->nb[0];
4831
4899
  const size_t nb01 = src0->nb[1];
4832
4900
  const size_t nb02 = src0->nb[2];
4833
4901
  const size_t nb03 = src0->nb[3];
4834
4902
 
4835
- if (ggml_is_contiguous(src0) && src0->type == dst->type) {
4903
+ const size_t nb0 = dst->nb[0];
4904
+ const size_t nb1 = dst->nb[1];
4905
+ const size_t nb2 = dst->nb[2];
4906
+ const size_t nb3 = dst->nb[3];
4907
+
4908
+ if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst) && src0->type == dst->type) {
4836
4909
  memcpy(dst->data, src0->data, ggml_nelements(dst) * GGML_TYPE_SIZE[src0->type]);
4837
4910
  return;
4838
4911
  }
4839
4912
 
4840
- if (src0->nb[0] == sizeof(ggml_fp16_t)) {
4841
- if (dst->type == GGML_TYPE_F16) {
4842
- size_t id = 0;
4843
- const size_t rs = ne00*nb00;
4844
-
4845
- for (int i03 = 0; i03 < ne03; i03++) {
4846
- for (int i02 = 0; i02 < ne02; i02++) {
4847
- for (int i01 = 0; i01 < ne01; i01++) {
4848
- const char * src0_ptr = (char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03;
4849
- char * dst_ptr = (char *) dst->data + id*rs;
4850
-
4851
- memcpy(dst_ptr, src0_ptr, rs);
4852
-
4853
- id++;
4854
- }
4855
- }
4856
- }
4857
- } else if (dst->type == GGML_TYPE_F32) {
4858
- size_t id = 0;
4859
- float * dst_ptr = (float *) dst->data;
4860
-
4861
- for (int i03 = 0; i03 < ne03; i03++) {
4862
- for (int i02 = 0; i02 < ne02; i02++) {
4863
- for (int i01 = 0; i01 < ne01; i01++) {
4864
- for (int i00 = 0; i00 < ne00; i00++) {
4865
- const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
4866
-
4867
- dst_ptr[id] = GGML_FP16_TO_FP32(*src0_ptr);
4868
- id++;
4869
- }
4870
- }
4913
+ if (src0->type == dst->type &&
4914
+ src0->ne[0] == dst->ne[0] &&
4915
+ src0->nb[0] == GGML_TYPE_SIZE[src0->type] && dst->nb[0] == GGML_TYPE_SIZE[dst->type]) {
4916
+ // copy by rows
4917
+ const size_t rs = ne00*nb00;
4918
+ for (int64_t i03 = 0; i03 < ne03; i03++) {
4919
+ for (int64_t i02 = 0; i02 < ne02; i02++) {
4920
+ for (int64_t i01 = 0; i01 < ne01; i01++) {
4921
+ memcpy(
4922
+ ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3),
4923
+ ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03),
4924
+ rs);
4871
4925
  }
4872
4926
  }
4873
- } else {
4874
- GGML_ASSERT(false); // TODO: implement
4875
4927
  }
4876
- } else {
4877
- //printf("%s: this is not optimal - fix me\n", __func__);
4878
-
4879
- if (dst->type == GGML_TYPE_F32) {
4880
- size_t id = 0;
4881
- float * dst_ptr = (float *) dst->data;
4882
-
4883
- for (int i03 = 0; i03 < ne03; i03++) {
4884
- for (int i02 = 0; i02 < ne02; i02++) {
4885
- for (int i01 = 0; i01 < ne01; i01++) {
4886
- for (int i00 = 0; i00 < ne00; i00++) {
4887
- const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
4928
+ return;
4929
+ }
4888
4930
 
4889
- dst_ptr[id] = GGML_FP16_TO_FP32(*src0_ptr);
4890
- id++;
4931
+ // TODO: add more special-case implementations for tensor shapes/strides that can benefit from memcpy
4932
+
4933
+ // dst counters
4934
+ int64_t i10 = 0;
4935
+ int64_t i11 = 0;
4936
+ int64_t i12 = 0;
4937
+ int64_t i13 = 0;
4938
+
4939
+ if (dst->type == GGML_TYPE_F16) {
4940
+ for (int64_t i03 = 0; i03 < ne03; i03++) {
4941
+ for (int64_t i02 = 0; i02 < ne02; i02++) {
4942
+ for (int64_t i01 = 0; i01 < ne01; i01++) {
4943
+ for (int64_t i00 = 0; i00 < ne00; i00++) {
4944
+ const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
4945
+ char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3);
4946
+
4947
+ memcpy(dst_ptr, src0_ptr, sizeof(ggml_fp16_t));
4948
+
4949
+ if (++i10 == ne00) {
4950
+ i10 = 0;
4951
+ if (++i11 == ne01) {
4952
+ i11 = 0;
4953
+ if (++i12 == ne02) {
4954
+ i12 = 0;
4955
+ if (++i13 == ne03) {
4956
+ i13 = 0;
4957
+ }
4958
+ }
4959
+ }
4891
4960
  }
4892
4961
  }
4893
4962
  }
4894
4963
  }
4895
- } else if (dst->type == GGML_TYPE_F16) {
4896
- size_t id = 0;
4897
- ggml_fp16_t * dst_ptr = (ggml_fp16_t *) dst->data;
4898
-
4899
- for (int i03 = 0; i03 < ne03; i03++) {
4900
- for (int i02 = 0; i02 < ne02; i02++) {
4901
- for (int i01 = 0; i01 < ne01; i01++) {
4902
- for (int i00 = 0; i00 < ne00; i00++) {
4903
- const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
4904
-
4905
- dst_ptr[id] = *src0_ptr;
4906
- id++;
4964
+ }
4965
+ } else if (dst->type == GGML_TYPE_F32) {
4966
+ for (int64_t i03 = 0; i03 < ne03; i03++) {
4967
+ for (int64_t i02 = 0; i02 < ne02; i02++) {
4968
+ for (int64_t i01 = 0; i01 < ne01; i01++) {
4969
+ for (int64_t i00 = 0; i00 < ne00; i00++) {
4970
+ const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
4971
+ char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3);
4972
+
4973
+ *(float *) dst_ptr = GGML_FP16_TO_FP32(*(const ggml_fp16_t *) src0_ptr);
4974
+
4975
+ if (++i10 == ne00) {
4976
+ i10 = 0;
4977
+ if (++i11 == ne01) {
4978
+ i11 = 0;
4979
+ if (++i12 == ne02) {
4980
+ i12 = 0;
4981
+ if (++i13 == ne03) {
4982
+ i13 = 0;
4983
+ }
4984
+ }
4985
+ }
4907
4986
  }
4908
4987
  }
4909
4988
  }
4910
4989
  }
4911
- } else {
4912
- GGML_ASSERT(false); // TODO: implement
4913
4990
  }
4991
+ } else {
4992
+ GGML_ASSERT(false); // TODO: implement
4914
4993
  }
4915
4994
  }
4916
4995
 
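The rewritten `ggml_compute_forward_dup_f16` no longer requires `dst` to be contiguous: it walks `dst` with its own strides, advancing the destination counters `i10..i13` with carry, like a mixed-radix odometer, in lockstep with the source loops. A stripped-down 2-D sketch of that addressing pattern (plain `float`, not the ggml function):

```c
#include <stdint.h>
#include <string.h>

// Copy an ne0 x ne1 block from src (byte strides sb0/sb1) to dst (byte strides
// db0/db1), element by element, without assuming either side is contiguous --
// the same "counter with carry" idea as the new dup code.
static void copy_strided_2d(char *dst, size_t db0, size_t db1,
                            const char *src, size_t sb0, size_t sb1,
                            int64_t ne0, int64_t ne1) {
    int64_t i10 = 0, i11 = 0;              // dst counters
    for (int64_t i01 = 0; i01 < ne1; i01++) {
        for (int64_t i00 = 0; i00 < ne0; i00++) {
            memcpy(dst + i10*db0 + i11*db1,
                   src + i00*sb0 + i01*sb1, sizeof(float));
            if (++i10 == ne0) {            // carry into the next dst dimension
                i10 = 0;
                ++i11;
            }
        }
    }
}
```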
@@ -4919,102 +4998,92 @@ static void ggml_compute_forward_dup_f32(
4919
4998
  const struct ggml_tensor * src0,
4920
4999
  struct ggml_tensor * dst) {
4921
5000
  GGML_ASSERT(params->ith == 0);
4922
- GGML_ASSERT(ggml_is_contiguous(dst));
4923
5001
  GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0));
4924
5002
 
4925
5003
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
4926
5004
  return;
4927
5005
  }
4928
5006
 
4929
- const int ne00 = src0->ne[0];
4930
- const int ne01 = src0->ne[1];
4931
- const int ne02 = src0->ne[2];
4932
- const int ne03 = src0->ne[3];
5007
+ const int64_t ne00 = src0->ne[0];
5008
+ const int64_t ne01 = src0->ne[1];
5009
+ const int64_t ne02 = src0->ne[2];
5010
+ const int64_t ne03 = src0->ne[3];
4933
5011
 
4934
5012
  const size_t nb00 = src0->nb[0];
4935
5013
  const size_t nb01 = src0->nb[1];
4936
5014
  const size_t nb02 = src0->nb[2];
4937
5015
  const size_t nb03 = src0->nb[3];
4938
5016
 
4939
- if (ggml_is_contiguous(src0) && src0->type == dst->type) {
5017
+ const size_t nb0 = dst->nb[0];
5018
+ const size_t nb1 = dst->nb[1];
5019
+ const size_t nb2 = dst->nb[2];
5020
+ const size_t nb3 = dst->nb[3];
5021
+
5022
+ if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst) && src0->type == dst->type) {
4940
5023
  memcpy(dst->data, src0->data, ggml_nelements(dst) * GGML_TYPE_SIZE[src0->type]);
4941
5024
  return;
4942
5025
  }
4943
5026
 
4944
- if (src0->nb[0] == sizeof(float)) {
4945
- if (dst->type == GGML_TYPE_F32) {
4946
- size_t id = 0;
4947
- const size_t rs = ne00*nb00;
4948
-
4949
- for (int i03 = 0; i03 < ne03; i03++) {
4950
- for (int i02 = 0; i02 < ne02; i02++) {
4951
- for (int i01 = 0; i01 < ne01; i01++) {
4952
- const char * src0_ptr = (char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03;
4953
- char * dst_ptr = (char *) dst->data + id*rs;
4954
-
4955
- memcpy(dst_ptr, src0_ptr, rs);
4956
-
4957
- id++;
4958
- }
4959
- }
4960
- }
4961
- } else if (dst->type == GGML_TYPE_F16) {
4962
- size_t id = 0;
4963
- ggml_fp16_t * dst_ptr = (ggml_fp16_t *) dst->data;
4964
-
4965
- for (int i03 = 0; i03 < ne03; i03++) {
4966
- for (int i02 = 0; i02 < ne02; i02++) {
4967
- for (int i01 = 0; i01 < ne01; i01++) {
4968
- for (int i00 = 0; i00 < ne00; i00++) {
4969
- const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
4970
-
4971
- dst_ptr[id] = GGML_FP32_TO_FP16(*src0_ptr);
4972
- id++;
5027
+ // dst counters
5028
+ int64_t i10 = 0;
5029
+ int64_t i11 = 0;
5030
+ int64_t i12 = 0;
5031
+ int64_t i13 = 0;
5032
+
5033
+ if (dst->type == GGML_TYPE_F32) {
5034
+ for (int64_t i03 = 0; i03 < ne03; i03++) {
5035
+ for (int64_t i02 = 0; i02 < ne02; i02++) {
5036
+ for (int64_t i01 = 0; i01 < ne01; i01++) {
5037
+ for (int64_t i00 = 0; i00 < ne00; i00++) {
5038
+ const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
5039
+ char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3);
5040
+
5041
+ memcpy(dst_ptr, src0_ptr, sizeof(float));
5042
+
5043
+ if (++i10 == dst->ne[0]) {
5044
+ i10 = 0;
5045
+ if (++i11 == dst->ne[1]) {
5046
+ i11 = 0;
5047
+ if (++i12 == dst->ne[2]) {
5048
+ i12 = 0;
5049
+ if (++i13 == dst->ne[3]) {
5050
+ i13 = 0;
5051
+ }
5052
+ }
5053
+ }
4973
5054
  }
4974
5055
  }
4975
5056
  }
4976
5057
  }
4977
- } else {
4978
- GGML_ASSERT(false); // TODO: implement
4979
5058
  }
4980
- } else {
4981
- //printf("%s: this is not optimal - fix me\n", __func__);
4982
-
4983
- if (dst->type == GGML_TYPE_F32) {
4984
- size_t id = 0;
4985
- float * dst_ptr = (float *) dst->data;
4986
-
4987
- for (int i03 = 0; i03 < ne03; i03++) {
4988
- for (int i02 = 0; i02 < ne02; i02++) {
4989
- for (int i01 = 0; i01 < ne01; i01++) {
4990
- for (int i00 = 0; i00 < ne00; i00++) {
4991
- const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
4992
-
4993
- dst_ptr[id] = *src0_ptr;
4994
- id++;
4995
- }
4996
- }
4997
- }
4998
- }
4999
- } else if (dst->type == GGML_TYPE_F16) {
5000
- size_t id = 0;
5001
- ggml_fp16_t * dst_ptr = (ggml_fp16_t *) dst->data;
5002
-
5003
- for (int i03 = 0; i03 < ne03; i03++) {
5004
- for (int i02 = 0; i02 < ne02; i02++) {
5005
- for (int i01 = 0; i01 < ne01; i01++) {
5006
- for (int i00 = 0; i00 < ne00; i00++) {
5007
- const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
5008
-
5009
- dst_ptr[id] = GGML_FP32_TO_FP16(*src0_ptr);
5010
- id++;
5059
+ } else if (dst->type == GGML_TYPE_F16) {
5060
+ for (int64_t i03 = 0; i03 < ne03; i03++) {
5061
+ for (int64_t i02 = 0; i02 < ne02; i02++) {
5062
+ for (int64_t i01 = 0; i01 < ne01; i01++) {
5063
+ for (int64_t i00 = 0; i00 < ne00; i00++) {
5064
+ const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
5065
+ char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3);
5066
+
5067
+ *(ggml_fp16_t *) dst_ptr = GGML_FP32_TO_FP16(*(const float *) src0_ptr);
5068
+
5069
+ if (++i10 == dst->ne[0]) {
5070
+ i10 = 0;
5071
+ if (++i11 == dst->ne[1]) {
5072
+ i11 = 0;
5073
+ if (++i12 == dst->ne[2]) {
5074
+ i12 = 0;
5075
+ if (++i13 == dst->ne[3]) {
5076
+ i13 = 0;
5077
+ }
5078
+ }
5079
+ }
5011
5080
  }
5012
5081
  }
5013
5082
  }
5014
5083
  }
5015
- } else {
5016
- GGML_ASSERT(false); // TODO: implement
5017
5084
  }
5085
+ } else {
5086
+ GGML_ASSERT(false); // TODO: implement
5018
5087
  }
5019
5088
  }
5020
5089
 
@@ -5389,18 +5458,18 @@ static void ggml_compute_forward_sum_f32(
5389
5458
  assert(ggml_is_scalar(dst));
5390
5459
  assert(src0->nb[0] == sizeof(float));
5391
5460
 
5392
- const int ne00 = src0->ne[0];
5393
- const int ne01 = src0->ne[1];
5394
- const int ne02 = src0->ne[2];
5395
- const int ne03 = src0->ne[3];
5461
+ const int64_t ne00 = src0->ne[0];
5462
+ const int64_t ne01 = src0->ne[1];
5463
+ const int64_t ne02 = src0->ne[2];
5464
+ const int64_t ne03 = src0->ne[3];
5396
5465
 
5397
5466
  const size_t nb01 = src0->nb[1];
5398
5467
  const size_t nb02 = src0->nb[2];
5399
5468
  const size_t nb03 = src0->nb[3];
5400
5469
 
5401
- for (int i03 = 0; i03 < ne03; i03++) {
5402
- for (int i02 = 0; i02 < ne02; i02++) {
5403
- for (int i01 = 0; i01 < ne01; i01++) {
5470
+ for (int64_t i03 = 0; i03 < ne03; i03++) {
5471
+ for (int64_t i02 = 0; i02 < ne02; i02++) {
5472
+ for (int64_t i01 = 0; i01 < ne01; i01++) {
5404
5473
  ggml_vec_sum_f32(ne00,
5405
5474
  (float *) (dst->data),
5406
5475
  (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03));
@@ -5445,19 +5514,19 @@ static void ggml_compute_forward_mean_f32(
5445
5514
 
5446
5515
  assert(src0->nb[0] == sizeof(float));
5447
5516
 
5448
- const int ne00 = src0->ne[0];
5449
- const int ne01 = src0->ne[1];
5450
- const int ne02 = src0->ne[2];
5451
- const int ne03 = src0->ne[3];
5517
+ const int64_t ne00 = src0->ne[0];
5518
+ const int64_t ne01 = src0->ne[1];
5519
+ const int64_t ne02 = src0->ne[2];
5520
+ const int64_t ne03 = src0->ne[3];
5452
5521
 
5453
5522
  const size_t nb01 = src0->nb[1];
5454
5523
  const size_t nb02 = src0->nb[2];
5455
5524
  const size_t nb03 = src0->nb[3];
5456
5525
 
5457
- const int ne0 = dst->ne[0];
5458
- const int ne1 = dst->ne[1];
5459
- const int ne2 = dst->ne[2];
5460
- const int ne3 = dst->ne[3];
5526
+ const int64_t ne0 = dst->ne[0];
5527
+ const int64_t ne1 = dst->ne[1];
5528
+ const int64_t ne2 = dst->ne[2];
5529
+ const int64_t ne3 = dst->ne[3];
5461
5530
 
5462
5531
  assert(ne0 == 1);
5463
5532
  assert(ne1 == ne01);
@@ -5473,9 +5542,9 @@ static void ggml_compute_forward_mean_f32(
5473
5542
  const size_t nb2 = dst->nb[2];
5474
5543
  const size_t nb3 = dst->nb[3];
5475
5544
 
5476
- for (int i03 = 0; i03 < ne03; i03++) {
5477
- for (int i02 = 0; i02 < ne02; i02++) {
5478
- for (int i01 = 0; i01 < ne01; i01++) {
5545
+ for (int64_t i03 = 0; i03 < ne03; i03++) {
5546
+ for (int64_t i02 = 0; i02 < ne02; i02++) {
5547
+ for (int64_t i01 = 0; i01 < ne01; i01++) {
5479
5548
  ggml_vec_sum_f32(ne00,
5480
5549
  (float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3),
5481
5550
  (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03));
@@ -5962,10 +6031,10 @@ static void ggml_compute_forward_norm_f32(
5962
6031
  const int ith = params->ith;
5963
6032
  const int nth = params->nth;
5964
6033
 
5965
- const int ne00 = src0->ne[0];
5966
- const int ne01 = src0->ne[1];
5967
- const int ne02 = src0->ne[2];
5968
- const int ne03 = src0->ne[3];
6034
+ const int64_t ne00 = src0->ne[0];
6035
+ const int64_t ne01 = src0->ne[1];
6036
+ const int64_t ne02 = src0->ne[2];
6037
+ const int64_t ne03 = src0->ne[3];
5969
6038
 
5970
6039
  const size_t nb01 = src0->nb[1];
5971
6040
  const size_t nb02 = src0->nb[2];
@@ -5978,13 +6047,13 @@ static void ggml_compute_forward_norm_f32(
5978
6047
  const float eps = 1e-5f; // TODO: make this a parameter
5979
6048
 
5980
6049
  // TODO: optimize
5981
- for (int i03 = 0; i03 < ne03; i03++) {
5982
- for (int i02 = 0; i02 < ne02; i02++) {
5983
- for (int i01 = ith; i01 < ne01; i01 += nth) {
6050
+ for (int64_t i03 = 0; i03 < ne03; i03++) {
6051
+ for (int64_t i02 = 0; i02 < ne02; i02++) {
6052
+ for (int64_t i01 = ith; i01 < ne01; i01 += nth) {
5984
6053
  const float * x = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03);
5985
6054
 
5986
6055
  ggml_float sum = 0.0;
5987
- for (int i00 = 0; i00 < ne00; i00++) {
6056
+ for (int64_t i00 = 0; i00 < ne00; i00++) {
5988
6057
  sum += (ggml_float)x[i00];
5989
6058
  }
5990
6059
 
@@ -5993,7 +6062,7 @@ static void ggml_compute_forward_norm_f32(
5993
6062
  float * y = (float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3);
5994
6063
 
5995
6064
  ggml_float sum2 = 0.0;
5996
- for (int i00 = 0; i00 < ne00; i00++) {
6065
+ for (int64_t i00 = 0; i00 < ne00; i00++) {
5997
6066
  float v = x[i00] - mean;
5998
6067
  y[i00] = v;
5999
6068
  sum2 += (ggml_float)(v*v);
@@ -6045,10 +6114,10 @@ static void ggml_compute_forward_rms_norm_f32(
6045
6114
  const int ith = params->ith;
6046
6115
  const int nth = params->nth;
6047
6116
 
6048
- const int ne00 = src0->ne[0];
6049
- const int ne01 = src0->ne[1];
6050
- const int ne02 = src0->ne[2];
6051
- const int ne03 = src0->ne[3];
6117
+ const int64_t ne00 = src0->ne[0];
6118
+ const int64_t ne01 = src0->ne[1];
6119
+ const int64_t ne02 = src0->ne[2];
6120
+ const int64_t ne03 = src0->ne[3];
6052
6121
 
6053
6122
  const size_t nb01 = src0->nb[1];
6054
6123
  const size_t nb02 = src0->nb[2];
@@ -6061,13 +6130,13 @@ static void ggml_compute_forward_rms_norm_f32(
6061
6130
  const float eps = 1e-6f; // TODO: make this a parameter
6062
6131
 
6063
6132
  // TODO: optimize
6064
- for (int i03 = 0; i03 < ne03; i03++) {
6065
- for (int i02 = 0; i02 < ne02; i02++) {
6066
- for (int i01 = ith; i01 < ne01; i01 += nth) {
6133
+ for (int64_t i03 = 0; i03 < ne03; i03++) {
6134
+ for (int64_t i02 = 0; i02 < ne02; i02++) {
6135
+ for (int64_t i01 = ith; i01 < ne01; i01 += nth) {
6067
6136
  const float * x = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03);
6068
6137
 
6069
6138
  ggml_float sum = 0.0;
6070
- for (int i00 = 0; i00 < ne00; i00++) {
6139
+ for (int64_t i00 = 0; i00 < ne00; i00++) {
6071
6140
  sum += (ggml_float)(x[i00] * x[i00]);
6072
6141
  }
6073
6142
 
@@ -6120,13 +6189,13 @@ static bool ggml_compute_forward_mul_mat_use_blas(
6120
6189
  const struct ggml_tensor * src0,
6121
6190
  const struct ggml_tensor * src1,
6122
6191
  struct ggml_tensor * dst) {
6123
- //const int ne00 = src0->ne[0];
6124
- //const int ne01 = src0->ne[1];
6192
+ //const int64_t ne00 = src0->ne[0];
6193
+ //const int64_t ne01 = src0->ne[1];
6125
6194
 
6126
- const int ne10 = src1->ne[0];
6195
+ const int64_t ne10 = src1->ne[0];
6127
6196
 
6128
- const int ne0 = dst->ne[0];
6129
- const int ne1 = dst->ne[1];
6197
+ const int64_t ne0 = dst->ne[0];
6198
+ const int64_t ne1 = dst->ne[1];
6130
6199
 
6131
6200
  // TODO: find the optimal values for these
6132
6201
  if (ggml_is_contiguous(src0) &&
@@ -6148,23 +6217,23 @@ static void ggml_compute_forward_mul_mat_f32(
6148
6217
  int64_t t0 = ggml_perf_time_us();
6149
6218
  UNUSED(t0);
6150
6219
 
6151
- const int ne00 = src0->ne[0];
6152
- const int ne01 = src0->ne[1];
6153
- const int ne02 = src0->ne[2];
6154
- const int ne03 = src0->ne[3];
6220
+ const int64_t ne00 = src0->ne[0];
6221
+ const int64_t ne01 = src0->ne[1];
6222
+ const int64_t ne02 = src0->ne[2];
6223
+ const int64_t ne03 = src0->ne[3];
6155
6224
 
6156
6225
  #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
6157
- const int ne10 = src1->ne[0];
6226
+ const int64_t ne10 = src1->ne[0];
6158
6227
  #endif
6159
- const int ne11 = src1->ne[1];
6228
+ const int64_t ne11 = src1->ne[1];
6160
6229
  #ifndef NDEBUG
6161
- const int ne12 = src1->ne[2];
6162
- const int ne13 = src1->ne[3];
6230
+ const int64_t ne12 = src1->ne[2];
6231
+ const int64_t ne13 = src1->ne[3];
6163
6232
 
6164
- const int ne0 = dst->ne[0];
6165
- const int ne1 = dst->ne[1];
6166
- const int ne2 = dst->ne[2];
6167
- const int ne3 = dst->ne[3];
6233
+ const int64_t ne0 = dst->ne[0];
6234
+ const int64_t ne1 = dst->ne[1];
6235
+ const int64_t ne2 = dst->ne[2];
6236
+ const int64_t ne3 = dst->ne[3];
6168
6237
 
6169
6238
  const int nb00 = src0->nb[0];
6170
6239
  #endif
@@ -6224,8 +6293,8 @@ static void ggml_compute_forward_mul_mat_f32(
6224
6293
  return;
6225
6294
  }
6226
6295
 
6227
- for (int i03 = 0; i03 < ne03; i03++) {
6228
- for (int i02 = 0; i02 < ne02; i02++) {
6296
+ for (int64_t i03 = 0; i03 < ne03; i03++) {
6297
+ for (int64_t i02 = 0; i02 < ne02; i02++) {
6229
6298
  const float * x = (float *) ((char *) src0->data + i02*nb02 + i03*nb03);
6230
6299
  const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13);
6231
6300
 
@@ -6272,7 +6341,7 @@ static void ggml_compute_forward_mul_mat_f32(
6272
6341
  const int i02 = (ir - i03*ne02*ne01)/ne01;
6273
6342
  const int i01 = (ir - i03*ne02*ne01 - i02*ne01);
6274
6343
 
6275
- for (int ic = 0; ic < ne11; ++ic) {
6344
+ for (int64_t ic = 0; ic < ne11; ++ic) {
6276
6345
  // src1 indices
6277
6346
  const int i13 = i03;
6278
6347
  const int i12 = i02;
@@ -6313,21 +6382,21 @@ static void ggml_compute_forward_mul_mat_f16_f32(
6313
6382
  int64_t t0 = ggml_perf_time_us();
6314
6383
  UNUSED(t0);
6315
6384
 
6316
- const int ne00 = src0->ne[0];
6317
- const int ne01 = src0->ne[1];
6318
- const int ne02 = src0->ne[2];
6319
- const int ne03 = src0->ne[3];
6385
+ const int64_t ne00 = src0->ne[0];
6386
+ const int64_t ne01 = src0->ne[1];
6387
+ const int64_t ne02 = src0->ne[2];
6388
+ const int64_t ne03 = src0->ne[3];
6320
6389
 
6321
- const int ne10 = src1->ne[0];
6322
- const int ne11 = src1->ne[1];
6323
- const int ne12 = src1->ne[2];
6324
- const int ne13 = src1->ne[3];
6390
+ const int64_t ne10 = src1->ne[0];
6391
+ const int64_t ne11 = src1->ne[1];
6392
+ const int64_t ne12 = src1->ne[2];
6393
+ const int64_t ne13 = src1->ne[3];
6325
6394
 
6326
- const int ne0 = dst->ne[0];
6327
- const int ne1 = dst->ne[1];
6328
- const int ne2 = dst->ne[2];
6329
- const int ne3 = dst->ne[3];
6330
- //const int ne = ne0*ne1*ne2*ne3;
6395
+ const int64_t ne0 = dst->ne[0];
6396
+ const int64_t ne1 = dst->ne[1];
6397
+ const int64_t ne2 = dst->ne[2];
6398
+ const int64_t ne3 = dst->ne[3];
6399
+ //const int64_t ne = ne0*ne1*ne2*ne3;
6331
6400
 
6332
6401
  const int nb00 = src0->nb[0];
6333
6402
  const int nb01 = src0->nb[1];
@@ -6387,12 +6456,12 @@ static void ggml_compute_forward_mul_mat_f16_f32(
6387
6456
 
6388
6457
  float * const wdata = params->wdata;
6389
6458
 
6390
- for (int i03 = 0; i03 < ne03; i03++) {
6391
- for (int i02 = 0; i02 < ne02; i02++) {
6459
+ for (int64_t i03 = 0; i03 < ne03; i03++) {
6460
+ for (int64_t i02 = 0; i02 < ne02; i02++) {
6392
6461
  {
6393
6462
  size_t id = 0;
6394
- for (int i01 = 0; i01 < ne01; ++i01) {
6395
- for (int i00 = 0; i00 < ne00; ++i00) {
6463
+ for (int64_t i01 = 0; i01 < ne01; ++i01) {
6464
+ for (int64_t i00 = 0; i00 < ne00; ++i00) {
6396
6465
  wdata[id++] = GGML_FP16_TO_FP32(*(ggml_fp16_t *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00));
6397
6466
  }
6398
6467
  }
@@ -6422,10 +6491,10 @@ static void ggml_compute_forward_mul_mat_f16_f32(
6422
6491
  ggml_fp16_t * const wdata = params->wdata;
6423
6492
 
6424
6493
  size_t id = 0;
6425
- for (int i13 = 0; i13 < ne13; ++i13) {
6426
- for (int i12 = 0; i12 < ne12; ++i12) {
6427
- for (int i11 = 0; i11 < ne11; ++i11) {
6428
- for (int i10 = 0; i10 < ne10; ++i10) {
6494
+ for (int64_t i13 = 0; i13 < ne13; ++i13) {
6495
+ for (int64_t i12 = 0; i12 < ne12; ++i12) {
6496
+ for (int64_t i11 = 0; i11 < ne11; ++i11) {
6497
+ for (int64_t i10 = 0; i10 < ne10; ++i10) {
6429
6498
  wdata[id++] = GGML_FP32_TO_FP16(*(float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i10*nb10));
6430
6499
  }
6431
6500
  }
@@ -6477,7 +6546,7 @@ static void ggml_compute_forward_mul_mat_f16_f32(
6477
6546
 
6478
6547
  float * dst_col = (float *) ((char *) dst->data + (i0*nb0 + 0*nb1 + i2*nb2 + i3*nb3));
6479
6548
 
6480
- for (int ic = 0; ic < ne11; ++ic) {
6549
+ for (int64_t ic = 0; ic < ne11; ++ic) {
6481
6550
  ggml_vec_dot_f16(ne00, &dst_col[ic*ne0], src0_row, src1_col + ic*ne00);
6482
6551
  }
6483
6552
  }
@@ -6526,20 +6595,20 @@ static void ggml_compute_forward_mul_mat_q_f32(
6526
6595
  int64_t t0 = ggml_perf_time_us();
6527
6596
  UNUSED(t0);
6528
6597
 
6529
- const int ne00 = src0->ne[0];
6530
- const int ne01 = src0->ne[1];
6531
- const int ne02 = src0->ne[2];
6532
- const int ne03 = src0->ne[3];
6598
+ const int64_t ne00 = src0->ne[0];
6599
+ const int64_t ne01 = src0->ne[1];
6600
+ const int64_t ne02 = src0->ne[2];
6601
+ const int64_t ne03 = src0->ne[3];
6533
6602
 
6534
- const int ne10 = src1->ne[0];
6535
- const int ne11 = src1->ne[1];
6536
- const int ne12 = src1->ne[2];
6537
- const int ne13 = src1->ne[3];
6603
+ const int64_t ne10 = src1->ne[0];
6604
+ const int64_t ne11 = src1->ne[1];
6605
+ const int64_t ne12 = src1->ne[2];
6606
+ const int64_t ne13 = src1->ne[3];
6538
6607
 
6539
- const int ne0 = dst->ne[0];
6540
- const int ne1 = dst->ne[1];
6541
- const int ne2 = dst->ne[2];
6542
- const int ne3 = dst->ne[3];
6608
+ const int64_t ne0 = dst->ne[0];
6609
+ const int64_t ne1 = dst->ne[1];
6610
+ const int64_t ne2 = dst->ne[2];
6611
+ const int64_t ne3 = dst->ne[3];
6543
6612
 
6544
6613
  const int nb00 = src0->nb[0];
6545
6614
  const int nb01 = src0->nb[1];
@@ -6603,11 +6672,11 @@ static void ggml_compute_forward_mul_mat_q_f32(
6603
6672
  float * const wdata = params->wdata;
6604
6673
  dequantize_row_q_t const dequantize_row_q = quantize_fns[type].dequantize_row_q;
6605
6674
 
6606
- for (int i03 = 0; i03 < ne03; i03++) {
6607
- for (int i02 = 0; i02 < ne02; i02++) {
6675
+ for (int64_t i03 = 0; i03 < ne03; i03++) {
6676
+ for (int64_t i02 = 0; i02 < ne02; i02++) {
6608
6677
  {
6609
6678
  size_t id = 0;
6610
- for (int i01 = 0; i01 < ne01; ++i01) {
6679
+ for (int64_t i01 = 0; i01 < ne01; ++i01) {
6611
6680
  dequantize_row_q((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01, wdata + id, ne00);
6612
6681
  id += ne00;
6613
6682
  }
@@ -6637,9 +6706,9 @@ static void ggml_compute_forward_mul_mat_q_f32(
6637
6706
  char * wdata = params->wdata;
6638
6707
  const size_t row_size = ne10*GGML_TYPE_SIZE[type]/GGML_BLCK_SIZE[type];
6639
6708
 
6640
- for (int i13 = 0; i13 < ne13; ++i13) {
6641
- for (int i12 = 0; i12 < ne12; ++i12) {
6642
- for (int i11 = 0; i11 < ne11; ++i11) {
6709
+ for (int64_t i13 = 0; i13 < ne13; ++i13) {
6710
+ for (int64_t i12 = 0; i12 < ne12; ++i12) {
6711
+ for (int64_t i11 = 0; i11 < ne11; ++i11) {
6643
6712
  quantize_row_q((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11), (void *) wdata, ne10);
6644
6713
  wdata += row_size;
6645
6714
  }
@@ -6688,7 +6757,7 @@ static void ggml_compute_forward_mul_mat_q_f32(
6688
6757
 
6689
6758
  assert(ne00 % 32 == 0);
6690
6759
 
6691
- for (int ic = 0; ic < ne11; ++ic) {
6760
+ for (int64_t ic = 0; ic < ne11; ++ic) {
6692
6761
  vec_dot_q(ne00, &dst_col[ic*ne0], src0_row, (void *) (src1_col + ic*row_size));
6693
6762
  }
6694
6763
  }
@@ -7169,7 +7238,6 @@ static void ggml_compute_forward_rope_f32(
7169
7238
  const struct ggml_tensor * src0,
7170
7239
  const struct ggml_tensor * src1,
7171
7240
  struct ggml_tensor * dst) {
7172
- assert(params->ith == 0);
7173
7241
  assert(src1->type == GGML_TYPE_I32);
7174
7242
  assert(ggml_nelements(src1) == 3);
7175
7243
 
@@ -7181,10 +7249,10 @@ static void ggml_compute_forward_rope_f32(
7181
7249
  const int n_dims = ((int32_t *) src1->data)[1];
7182
7250
  const int mode = ((int32_t *) src1->data)[2];
7183
7251
 
7184
- //const int ne0 = src0->ne[0];
7185
- const int ne1 = src0->ne[1];
7186
- const int ne2 = src0->ne[2];
7187
- const int ne3 = src0->ne[3];
7252
+ //const int64_t ne0 = src0->ne[0];
7253
+ const int64_t ne1 = src0->ne[1];
7254
+ const int64_t ne2 = src0->ne[2];
7255
+ const int64_t ne3 = src0->ne[3];
7188
7256
 
7189
7257
  const int nb0 = src0->nb[0];
7190
7258
  const int nb1 = src0->nb[1];
@@ -7196,11 +7264,28 @@ static void ggml_compute_forward_rope_f32(
7196
7264
 
7197
7265
  assert(nb0 == sizeof(float));
7198
7266
 
7199
- // TODO: optimize
7200
- for (int i3 = 0; i3 < ne3; i3++) {
7201
- for (int i2 = (mode == 0 ? 0 : n_past); i2 < ne2; i2++) {
7267
+ const int ith = params->ith;
7268
+ const int nth = params->nth;
7269
+
7270
+ const int nr = ggml_nrows(src0);
7271
+
7272
+ // rows per thread
7273
+ const int dr = (nr + nth - 1)/nth;
7274
+
7275
+ // row range for this thread
7276
+ const int ir0 = dr*ith;
7277
+ const int ir1 = MIN(ir0 + dr, nr);
7278
+
7279
+ // row index used to determine which thread to use
7280
+ int ir = 0;
7281
+
7282
+ for (int64_t i3 = 0; i3 < ne3; i3++) {
7283
+ for (int64_t i2 = (mode == 0 ? 0 : n_past); i2 < ne2; i2++) {
7202
7284
  const int p = (mode == 0 ? n_past + i2 : i2);
7203
- for (int i1 = 0; i1 < ne1; i1++) {
7285
+ for (int64_t i1 = 0; i1 < ne1; i1++) {
7286
+ if (ir++ < ir0) continue;
7287
+ if (ir > ir1) break;
7288
+
7204
7289
  for (int i0 = 0; i0 < n_dims; i0 += 2) {
7205
7290
  const float theta = powf(10000.0, ((float)-i0)/n_dims);
7206
7291
 
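The rope kernels previously asserted single-threaded execution (`params->ith == 0`); they now split rows across threads with the usual ceiling-division partition. A small self-contained sketch of that partition (the rope math itself is omitted):

```c
#include <stdio.h>

#define MIN(a, b) ((a) < (b) ? (a) : (b))

// Give thread `ith` of `nth` its contiguous slice [ir0, ir1) of nr rows,
// mirroring: dr = (nr + nth - 1)/nth; ir0 = dr*ith; ir1 = MIN(ir0 + dr, nr).
static void thread_range(int nr, int ith, int nth, int *ir0, int *ir1) {
    const int dr = (nr + nth - 1)/nth;   // rows per thread, rounded up
    *ir0 = dr*ith;
    *ir1 = MIN(*ir0 + dr, nr);           // the last thread may get fewer rows
}

int main(void) {
    // e.g. 10 rows over 4 threads -> slices of 3, 3, 3, 1
    for (int ith = 0; ith < 4; ith++) {
        int ir0, ir1;
        thread_range(10, ith, 4, &ir0, &ir1);
        printf("thread %d: rows [%d, %d)\n", ith, ir0, ir1);
    }
    return 0;
}
```

In the diff itself the kernel keeps the original nested loops and simply skips rows outside its `[ir0, ir1)` slice with the `ir` counter, which avoids restructuring the loop bounds per thread.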
@@ -7226,7 +7311,6 @@ static void ggml_compute_forward_rope_f16(
7226
7311
  const struct ggml_tensor * src0,
7227
7312
  const struct ggml_tensor * src1,
7228
7313
  struct ggml_tensor * dst) {
7229
- assert(params->ith == 0);
7230
7314
  assert(src1->type == GGML_TYPE_I32);
7231
7315
  assert(ggml_nelements(src1) == 3);
7232
7316
 
@@ -7238,10 +7322,10 @@ static void ggml_compute_forward_rope_f16(
7238
7322
  const int n_dims = ((int32_t *) src1->data)[1];
7239
7323
  const int mode = ((int32_t *) src1->data)[2];
7240
7324
 
7241
- //const int ne0 = src0->ne[0];
7242
- const int ne1 = src0->ne[1];
7243
- const int ne2 = src0->ne[2];
7244
- const int ne3 = src0->ne[3];
7325
+ //const int64_t ne0 = src0->ne[0];
7326
+ const int64_t ne1 = src0->ne[1];
7327
+ const int64_t ne2 = src0->ne[2];
7328
+ const int64_t ne3 = src0->ne[3];
7245
7329
 
7246
7330
  const int nb0 = src0->nb[0];
7247
7331
  const int nb1 = src0->nb[1];
@@ -7253,10 +7337,28 @@ static void ggml_compute_forward_rope_f16(
7253
7337
 
7254
7338
  assert(nb0 == sizeof(ggml_fp16_t));
7255
7339
 
7256
- for (int i3 = 0; i3 < ne3; i3++) {
7257
- for (int i2 = (mode == 0 ? 0 : n_past); i2 < ne2; i2++) {
7340
+ const int ith = params->ith;
7341
+ const int nth = params->nth;
7342
+
7343
+ const int nr = ggml_nrows(src0);
7344
+
7345
+ // rows per thread
7346
+ const int dr = (nr + nth - 1)/nth;
7347
+
7348
+ // row range for this thread
7349
+ const int ir0 = dr*ith;
7350
+ const int ir1 = MIN(ir0 + dr, nr);
7351
+
7352
+ // row index used to determine which thread to use
7353
+ int ir = 0;
7354
+
7355
+ for (int64_t i3 = 0; i3 < ne3; i3++) {
7356
+ for (int64_t i2 = (mode == 0 ? 0 : n_past); i2 < ne2; i2++) {
7258
7357
  const int p = (mode == 0 ? n_past + i2 : i2);
7259
- for (int i1 = 0; i1 < ne1; i1++) {
7358
+ for (int64_t i1 = 0; i1 < ne1; i1++) {
7359
+ if (ir++ < ir0) continue;
7360
+ if (ir > ir1) break;
7361
+
7260
7362
  for (int i0 = 0; i0 < n_dims; i0 += 2) {
7261
7363
  const float theta = powf(10000.0, ((float)-i0)/n_dims);
7262
7364
 
@@ -7317,21 +7419,21 @@ static void ggml_compute_forward_conv_1d_1s_f16_f32(
7317
7419
  int64_t t0 = ggml_perf_time_us();
7318
7420
  UNUSED(t0);
7319
7421
 
7320
- const int ne00 = src0->ne[0];
7321
- const int ne01 = src0->ne[1];
7322
- const int ne02 = src0->ne[2];
7323
- //const int ne03 = src0->ne[3];
7422
+ const int64_t ne00 = src0->ne[0];
7423
+ const int64_t ne01 = src0->ne[1];
7424
+ const int64_t ne02 = src0->ne[2];
7425
+ //const int64_t ne03 = src0->ne[3];
7324
7426
 
7325
- const int ne10 = src1->ne[0];
7326
- const int ne11 = src1->ne[1];
7327
- //const int ne12 = src1->ne[2];
7328
- //const int ne13 = src1->ne[3];
7427
+ const int64_t ne10 = src1->ne[0];
7428
+ const int64_t ne11 = src1->ne[1];
7429
+ //const int64_t ne12 = src1->ne[2];
7430
+ //const int64_t ne13 = src1->ne[3];
7329
7431
 
7330
- //const int ne0 = dst->ne[0];
7331
- //const int ne1 = dst->ne[1];
7332
- //const int ne2 = dst->ne[2];
7333
- //const int ne3 = dst->ne[3];
7334
- //const int ne = ne0*ne1*ne2*ne3;
7432
+ //const int64_t ne0 = dst->ne[0];
7433
+ //const int64_t ne1 = dst->ne[1];
7434
+ //const int64_t ne2 = dst->ne[2];
7435
+ //const int64_t ne3 = dst->ne[3];
7436
+ //const int64_t ne = ne0*ne1*ne2*ne3;
7335
7437
 
7336
7438
  const int nb00 = src0->nb[0];
7337
7439
  const int nb01 = src0->nb[1];
@@ -7368,11 +7470,11 @@ static void ggml_compute_forward_conv_1d_1s_f16_f32(
7368
7470
  {
7369
7471
  ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
7370
7472
 
7371
- for (int i02 = 0; i02 < ne02; i02++) {
7372
- for (int i01 = 0; i01 < ne01; i01++) {
7473
+ for (int64_t i02 = 0; i02 < ne02; i02++) {
7474
+ for (int64_t i01 = 0; i01 < ne01; i01++) {
7373
7475
  const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i02*nb02 + i01*nb01);
7374
7476
  ggml_fp16_t * dst_data = wdata + i02*ew0*ne00;
7375
- for (int i00 = 0; i00 < ne00; i00++) {
7477
+ for (int64_t i00 = 0; i00 < ne00; i00++) {
7376
7478
  dst_data[i00*ew0 + i01] = src[i00];
7377
7479
  }
7378
7480
  }
@@ -7383,10 +7485,10 @@ static void ggml_compute_forward_conv_1d_1s_f16_f32(
7383
7485
  {
7384
7486
  ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + ne02*ew0*ne00;
7385
7487
 
7386
- for (int i11 = 0; i11 < ne11; i11++) {
7488
+ for (int64_t i11 = 0; i11 < ne11; i11++) {
7387
7489
  const float * const src = (float *)((char *) src1->data + i11*nb11);
7388
7490
  ggml_fp16_t * dst_data = wdata;
7389
- for (int i10 = 0; i10 < ne10; i10++) {
7491
+ for (int64_t i10 = 0; i10 < ne10; i10++) {
7390
7492
  dst_data[(i10 + nh)*ew0 + i11] = GGML_FP32_TO_FP16(src[i10]);
7391
7493
  }
7392
7494
  }
@@ -7411,7 +7513,7 @@ static void ggml_compute_forward_conv_1d_1s_f16_f32(
7411
7513
 
7412
7514
  for (int i1 = ir0; i1 < ir1; i1++) {
7413
7515
  float * dst_data = (float *)((char *) dst->data + i1*nb1);
7414
- for (int i0 = 0; i0 < ne10; ++i0) {
7516
+ for (int64_t i0 = 0; i0 < ne10; ++i0) {
7415
7517
  dst_data[i0] = 0;
7416
7518
  for (int k = -nh; k <= nh; k++) {
7417
7519
  float v = 0.0f;
@@ -7437,21 +7539,21 @@ static void ggml_compute_forward_conv_1d_1s_f32(
7437
7539
  int64_t t0 = ggml_perf_time_us();
7438
7540
  UNUSED(t0);
7439
7541
 
7440
- const int ne00 = src0->ne[0];
7441
- const int ne01 = src0->ne[1];
7442
- const int ne02 = src0->ne[2];
7443
- //const int ne03 = src0->ne[3];
7542
+ const int64_t ne00 = src0->ne[0];
7543
+ const int64_t ne01 = src0->ne[1];
7544
+ const int64_t ne02 = src0->ne[2];
7545
+ //const int64_t ne03 = src0->ne[3];
7444
7546
 
7445
- const int ne10 = src1->ne[0];
7446
- const int ne11 = src1->ne[1];
7447
- //const int ne12 = src1->ne[2];
7448
- //const int ne13 = src1->ne[3];
7547
+ const int64_t ne10 = src1->ne[0];
7548
+ const int64_t ne11 = src1->ne[1];
7549
+ //const int64_t ne12 = src1->ne[2];
7550
+ //const int64_t ne13 = src1->ne[3];
7449
7551
 
7450
- //const int ne0 = dst->ne[0];
7451
- //const int ne1 = dst->ne[1];
7452
- //const int ne2 = dst->ne[2];
7453
- //const int ne3 = dst->ne[3];
7454
- //const int ne = ne0*ne1*ne2*ne3;
7552
+ //const int64_t ne0 = dst->ne[0];
7553
+ //const int64_t ne1 = dst->ne[1];
7554
+ //const int64_t ne2 = dst->ne[2];
7555
+ //const int64_t ne3 = dst->ne[3];
7556
+ //const int64_t ne = ne0*ne1*ne2*ne3;
7455
7557
 
7456
7558
  const int nb00 = src0->nb[0];
7457
7559
  const int nb01 = src0->nb[1];
@@ -7488,11 +7590,11 @@ static void ggml_compute_forward_conv_1d_1s_f32(
7488
7590
  {
7489
7591
  float * const wdata = (float *) params->wdata + 0;
7490
7592
 
7491
- for (int i02 = 0; i02 < ne02; i02++) {
7492
- for (int i01 = 0; i01 < ne01; i01++) {
7593
+ for (int64_t i02 = 0; i02 < ne02; i02++) {
7594
+ for (int64_t i01 = 0; i01 < ne01; i01++) {
7493
7595
  const float * const src = (float *)((char *) src0->data + i02*nb02 + i01*nb01);
7494
7596
  float * dst_data = wdata + i02*ew0*ne00;
7495
- for (int i00 = 0; i00 < ne00; i00++) {
7597
+ for (int64_t i00 = 0; i00 < ne00; i00++) {
7496
7598
  dst_data[i00*ew0 + i01] = src[i00];
7497
7599
  }
7498
7600
  }
@@ -7503,10 +7605,10 @@ static void ggml_compute_forward_conv_1d_1s_f32(
7503
7605
  {
7504
7606
  float * const wdata = (float *) params->wdata + ne02*ew0*ne00;
7505
7607
 
7506
- for (int i11 = 0; i11 < ne11; i11++) {
7608
+ for (int64_t i11 = 0; i11 < ne11; i11++) {
7507
7609
  const float * const src = (float *)((char *) src1->data + i11*nb11);
7508
7610
  float * dst_data = wdata;
7509
- for (int i10 = 0; i10 < ne10; i10++) {
7611
+ for (int64_t i10 = 0; i10 < ne10; i10++) {
7510
7612
  dst_data[(i10 + nh)*ew0 + i11] = src[i10];
7511
7613
  }
7512
7614
  }
@@ -7531,7 +7633,7 @@ static void ggml_compute_forward_conv_1d_1s_f32(
7531
7633
 
7532
7634
  for (int i1 = ir0; i1 < ir1; i1++) {
7533
7635
  float * dst_data = (float *)((char *) dst->data + i1*nb1);
7534
- for (int i0 = 0; i0 < ne10; ++i0) {
7636
+ for (int64_t i0 = 0; i0 < ne10; ++i0) {
7535
7637
  dst_data[i0] = 0;
7536
7638
  for (int k = -nh; k <= nh; k++) {
7537
7639
  float v = 0.0f;
@@ -7585,21 +7687,21 @@ static void ggml_compute_forward_conv_1d_2s_f16_f32(
7585
7687
  int64_t t0 = ggml_perf_time_us();
7586
7688
  UNUSED(t0);
7587
7689
 
7588
- const int ne00 = src0->ne[0];
7589
- const int ne01 = src0->ne[1];
7590
- const int ne02 = src0->ne[2];
7591
- //const int ne03 = src0->ne[3];
7690
+ const int64_t ne00 = src0->ne[0];
7691
+ const int64_t ne01 = src0->ne[1];
7692
+ const int64_t ne02 = src0->ne[2];
7693
+ //const int64_t ne03 = src0->ne[3];
7592
7694
 
7593
- const int ne10 = src1->ne[0];
7594
- const int ne11 = src1->ne[1];
7595
- //const int ne12 = src1->ne[2];
7596
- //const int ne13 = src1->ne[3];
7695
+ const int64_t ne10 = src1->ne[0];
7696
+ const int64_t ne11 = src1->ne[1];
7697
+ //const int64_t ne12 = src1->ne[2];
7698
+ //const int64_t ne13 = src1->ne[3];
7597
7699
 
7598
- //const int ne0 = dst->ne[0];
7599
- //const int ne1 = dst->ne[1];
7600
- //const int ne2 = dst->ne[2];
7601
- //const int ne3 = dst->ne[3];
7602
- //const int ne = ne0*ne1*ne2*ne3;
7700
+ //const int64_t ne0 = dst->ne[0];
7701
+ //const int64_t ne1 = dst->ne[1];
7702
+ //const int64_t ne2 = dst->ne[2];
7703
+ //const int64_t ne3 = dst->ne[3];
7704
+ //const int64_t ne = ne0*ne1*ne2*ne3;
7603
7705
 
7604
7706
  const int nb00 = src0->nb[0];
7605
7707
  const int nb01 = src0->nb[1];
@@ -7636,11 +7738,11 @@ static void ggml_compute_forward_conv_1d_2s_f16_f32(
7636
7738
  {
7637
7739
  ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
7638
7740
 
7639
- for (int i02 = 0; i02 < ne02; i02++) {
7640
- for (int i01 = 0; i01 < ne01; i01++) {
7741
+ for (int64_t i02 = 0; i02 < ne02; i02++) {
7742
+ for (int64_t i01 = 0; i01 < ne01; i01++) {
7641
7743
  const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i02*nb02 + i01*nb01);
7642
7744
  ggml_fp16_t * dst_data = wdata + i02*ew0*ne00;
7643
- for (int i00 = 0; i00 < ne00; i00++) {
7745
+ for (int64_t i00 = 0; i00 < ne00; i00++) {
7644
7746
  dst_data[i00*ew0 + i01] = src[i00];
7645
7747
  }
7646
7748
  }
@@ -7651,10 +7753,10 @@ static void ggml_compute_forward_conv_1d_2s_f16_f32(
7651
7753
  {
7652
7754
  ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + ne02*ew0*ne00;
7653
7755
 
7654
- for (int i11 = 0; i11 < ne11; i11++) {
7756
+ for (int64_t i11 = 0; i11 < ne11; i11++) {
7655
7757
  const float * const src = (float *)((char *) src1->data + i11*nb11);
7656
7758
  ggml_fp16_t * dst_data = wdata;
7657
- for (int i10 = 0; i10 < ne10; i10++) {
7759
+ for (int64_t i10 = 0; i10 < ne10; i10++) {
7658
7760
  dst_data[(i10 + nh)*ew0 + i11] = GGML_FP32_TO_FP16(src[i10]);
7659
7761
  }
7660
7762
  }
@@ -7679,7 +7781,7 @@ static void ggml_compute_forward_conv_1d_2s_f16_f32(
7679
7781
 
7680
7782
  for (int i1 = ir0; i1 < ir1; i1++) {
7681
7783
  float * dst_data = (float *)((char *) dst->data + i1*nb1);
7682
- for (int i0 = 0; i0 < ne10; i0 += 2) {
7784
+ for (int64_t i0 = 0; i0 < ne10; i0 += 2) {
7683
7785
  dst_data[i0/2] = 0;
7684
7786
  for (int k = -nh; k <= nh; k++) {
7685
7787
  float v = 0.0f;
@@ -7705,21 +7807,21 @@ static void ggml_compute_forward_conv_1d_2s_f32(
7705
7807
  int64_t t0 = ggml_perf_time_us();
7706
7808
  UNUSED(t0);
7707
7809
 
7708
- const int ne00 = src0->ne[0];
7709
- const int ne01 = src0->ne[1];
7710
- const int ne02 = src0->ne[2];
7711
- //const int ne03 = src0->ne[3];
7810
+ const int64_t ne00 = src0->ne[0];
7811
+ const int64_t ne01 = src0->ne[1];
7812
+ const int64_t ne02 = src0->ne[2];
7813
+ //const int64_t ne03 = src0->ne[3];
7712
7814
 
7713
- const int ne10 = src1->ne[0];
7714
- const int ne11 = src1->ne[1];
7715
- //const int ne12 = src1->ne[2];
7716
- //const int ne13 = src1->ne[3];
7815
+ const int64_t ne10 = src1->ne[0];
7816
+ const int64_t ne11 = src1->ne[1];
7817
+ //const int64_t ne12 = src1->ne[2];
7818
+ //const int64_t ne13 = src1->ne[3];
7717
7819
 
7718
- //const int ne0 = dst->ne[0];
7719
- //const int ne1 = dst->ne[1];
7720
- //const int ne2 = dst->ne[2];
7721
- //const int ne3 = dst->ne[3];
7722
- //const int ne = ne0*ne1*ne2*ne3;
7820
+ //const int64_t ne0 = dst->ne[0];
7821
+ //const int64_t ne1 = dst->ne[1];
7822
+ //const int64_t ne2 = dst->ne[2];
7823
+ //const int64_t ne3 = dst->ne[3];
7824
+ //const int64_t ne = ne0*ne1*ne2*ne3;
7723
7825
 
7724
7826
  const int nb00 = src0->nb[0];
7725
7827
  const int nb01 = src0->nb[1];
@@ -7756,11 +7858,11 @@ static void ggml_compute_forward_conv_1d_2s_f32(
7756
7858
  {
7757
7859
  float * const wdata = (float *) params->wdata + 0;
7758
7860
 
7759
- for (int i02 = 0; i02 < ne02; i02++) {
7760
- for (int i01 = 0; i01 < ne01; i01++) {
7861
+ for (int64_t i02 = 0; i02 < ne02; i02++) {
7862
+ for (int64_t i01 = 0; i01 < ne01; i01++) {
7761
7863
  const float * const src = (float *)((char *) src0->data + i02*nb02 + i01*nb01);
7762
7864
  float * dst_data = wdata + i02*ew0*ne00;
7763
- for (int i00 = 0; i00 < ne00; i00++) {
7865
+ for (int64_t i00 = 0; i00 < ne00; i00++) {
7764
7866
  dst_data[i00*ew0 + i01] = src[i00];
7765
7867
  }
7766
7868
  }
@@ -7771,10 +7873,10 @@ static void ggml_compute_forward_conv_1d_2s_f32(
7771
7873
  {
7772
7874
  float * const wdata = (float *) params->wdata + ne02*ew0*ne00;
7773
7875
 
7774
- for (int i11 = 0; i11 < ne11; i11++) {
7876
+ for (int64_t i11 = 0; i11 < ne11; i11++) {
7775
7877
  const float * const src = (float *)((char *) src1->data + i11*nb11);
7776
7878
  float * dst_data = wdata;
7777
- for (int i10 = 0; i10 < ne10; i10++) {
7879
+ for (int64_t i10 = 0; i10 < ne10; i10++) {
7778
7880
  dst_data[(i10 + nh)*ew0 + i11] = src[i10];
7779
7881
  }
7780
7882
  }
@@ -7799,7 +7901,7 @@ static void ggml_compute_forward_conv_1d_2s_f32(
7799
7901
 
7800
7902
  for (int i1 = ir0; i1 < ir1; i1++) {
7801
7903
  float * dst_data = (float *)((char *) dst->data + i1*nb1);
7802
- for (int i0 = 0; i0 < ne10; i0 += 2) {
7904
+ for (int64_t i0 = 0; i0 < ne10; i0 += 2) {
7803
7905
  dst_data[i0/2] = 0;
7804
7906
  for (int k = -nh; k <= nh; k++) {
7805
7907
  float v = 0.0f;
@@ -7851,25 +7953,25 @@ static void ggml_compute_forward_flash_attn_f32(
7851
7953
  int64_t t0 = ggml_perf_time_us();
7852
7954
  UNUSED(t0);
7853
7955
 
7854
- const int neq0 = q->ne[0];
7855
- const int neq1 = q->ne[1];
7856
- const int neq2 = q->ne[2];
7857
- const int neq3 = q->ne[3];
7956
+ const int64_t neq0 = q->ne[0];
7957
+ const int64_t neq1 = q->ne[1];
7958
+ const int64_t neq2 = q->ne[2];
7959
+ const int64_t neq3 = q->ne[3];
7858
7960
 
7859
- const int nek0 = k->ne[0];
7860
- const int nek1 = k->ne[1];
7861
- //const int nek2 = k->ne[2];
7862
- //const int nek3 = k->ne[3];
7961
+ const int64_t nek0 = k->ne[0];
7962
+ const int64_t nek1 = k->ne[1];
7963
+ //const int64_t nek2 = k->ne[2];
7964
+ //const int64_t nek3 = k->ne[3];
7863
7965
 
7864
- //const int nev0 = v->ne[0];
7865
- const int nev1 = v->ne[1];
7866
- //const int nev2 = v->ne[2];
7867
- //const int nev3 = v->ne[3];
7966
+ //const int64_t nev0 = v->ne[0];
7967
+ const int64_t nev1 = v->ne[1];
7968
+ //const int64_t nev2 = v->ne[2];
7969
+ //const int64_t nev3 = v->ne[3];
7868
7970
 
7869
- const int ne0 = dst->ne[0];
7870
- const int ne1 = dst->ne[1];
7871
- //const int ne2 = dst->ne[2];
7872
- //const int ne3 = dst->ne[3];
7971
+ const int64_t ne0 = dst->ne[0];
7972
+ const int64_t ne1 = dst->ne[1];
7973
+ //const int64_t ne2 = dst->ne[2];
7974
+ //const int64_t ne3 = dst->ne[3];
7873
7975
 
7874
7976
  const int nbk0 = k->nb[0];
7875
7977
  const int nbk1 = k->nb[1];
@@ -7894,10 +7996,10 @@ static void ggml_compute_forward_flash_attn_f32(
7894
7996
  const int ith = params->ith;
7895
7997
  const int nth = params->nth;
7896
7998
 
7897
- const int D = neq0;
7898
- const int N = neq1;
7899
- const int P = nek1 - N;
7900
- const int M = P + N;
7999
+ const int64_t D = neq0;
8000
+ const int64_t N = neq1;
8001
+ const int64_t P = nek1 - N;
8002
+ const int64_t M = P + N;
7901
8003
 
7902
8004
  const int Mup = ggml_up(M, GGML_SOFT_MAX_UNROLL);
7903
8005
 
@@ -7959,7 +8061,7 @@ static void ggml_compute_forward_flash_attn_f32(
7959
8061
  S[i] = -INFINITY;
7960
8062
  }
7961
8063
 
7962
- for (int ic = 0; ic < nek1; ++ic) {
8064
+ for (int64_t ic = 0; ic < nek1; ++ic) {
7963
8065
  // k indices
7964
8066
  const int ik3 = iq3;
7965
8067
  const int ik2 = iq2;
@@ -7978,7 +8080,7 @@ static void ggml_compute_forward_flash_attn_f32(
7978
8080
  ggml_vec_scale_f32(nek1, S, scale);
7979
8081
 
7980
8082
  if (masked) {
7981
- for (int i = P; i < M; i++) {
8083
+ for (int64_t i = P; i < M; i++) {
7982
8084
  if (i > P + iq1) {
7983
8085
  S[i] = -INFINITY;
7984
8086
  }
@@ -8036,7 +8138,7 @@ static void ggml_compute_forward_flash_attn_f32(
8036
8138
  #endif
8037
8139
  }
8038
8140
 
8039
- for (int ic = 0; ic < nev1; ++ic) {
8141
+ for (int64_t ic = 0; ic < nev1; ++ic) {
8040
8142
  // dst indices
8041
8143
  const int i1 = iq1;
8042
8144
  const int i2 = iq2;
@@ -8060,25 +8162,25 @@ static void ggml_compute_forward_flash_attn_f16(
8060
8162
  int64_t t0 = ggml_perf_time_us();
8061
8163
  UNUSED(t0);
8062
8164
 
8063
- const int neq0 = q->ne[0];
8064
- const int neq1 = q->ne[1];
8065
- const int neq2 = q->ne[2];
8066
- const int neq3 = q->ne[3];
8165
+ const int64_t neq0 = q->ne[0];
8166
+ const int64_t neq1 = q->ne[1];
8167
+ const int64_t neq2 = q->ne[2];
8168
+ const int64_t neq3 = q->ne[3];
8067
8169
 
8068
- const int nek0 = k->ne[0];
8069
- const int nek1 = k->ne[1];
8070
- //const int nek2 = k->ne[2];
8071
- //const int nek3 = k->ne[3];
8170
+ const int64_t nek0 = k->ne[0];
8171
+ const int64_t nek1 = k->ne[1];
8172
+ //const int64_t nek2 = k->ne[2];
8173
+ //const int64_t nek3 = k->ne[3];
8072
8174
 
8073
- //const int nev0 = v->ne[0];
8074
- const int nev1 = v->ne[1];
8075
- //const int nev2 = v->ne[2];
8076
- //const int nev3 = v->ne[3];
8175
+ //const int64_t nev0 = v->ne[0];
8176
+ const int64_t nev1 = v->ne[1];
8177
+ //const int64_t nev2 = v->ne[2];
8178
+ //const int64_t nev3 = v->ne[3];
8077
8179
 
8078
- const int ne0 = dst->ne[0];
8079
- const int ne1 = dst->ne[1];
8080
- //const int ne2 = dst->ne[2];
8081
- //const int ne3 = dst->ne[3];
8180
+ const int64_t ne0 = dst->ne[0];
8181
+ const int64_t ne1 = dst->ne[1];
8182
+ //const int64_t ne2 = dst->ne[2];
8183
+ //const int64_t ne3 = dst->ne[3];
8082
8184
 
8083
8185
  const int nbk0 = k->nb[0];
8084
8186
  const int nbk1 = k->nb[1];
@@ -8103,10 +8205,10 @@ static void ggml_compute_forward_flash_attn_f16(
8103
8205
  const int ith = params->ith;
8104
8206
  const int nth = params->nth;
8105
8207
 
8106
- const int D = neq0;
8107
- const int N = neq1;
8108
- const int P = nek1 - N;
8109
- const int M = P + N;
8208
+ const int64_t D = neq0;
8209
+ const int64_t N = neq1;
8210
+ const int64_t P = nek1 - N;
8211
+ const int64_t M = P + N;
8110
8212
 
8111
8213
  const int Mup = ggml_up(M, GGML_SOFT_MAX_UNROLL);
8112
8214
 
@@ -8169,7 +8271,7 @@ static void ggml_compute_forward_flash_attn_f16(
8169
8271
  }
8170
8272
 
8171
8273
  if (GGML_VEC_DOT_UNROLL > 2 || nek1 % GGML_VEC_DOT_UNROLL != 0) {
8172
- for (int ic = 0; ic < nek1; ++ic) {
8274
+ for (int64_t ic = 0; ic < nek1; ++ic) {
8173
8275
  // k indices
8174
8276
  const int ik3 = iq3;
8175
8277
  const int ik2 = iq2;
@@ -8184,7 +8286,7 @@ static void ggml_compute_forward_flash_attn_f16(
8184
8286
  (ggml_fp16_t *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)));
8185
8287
  }
8186
8288
  } else {
8187
- for (int ic = 0; ic < nek1; ic += GGML_VEC_DOT_UNROLL) {
8289
+ for (int64_t ic = 0; ic < nek1; ic += GGML_VEC_DOT_UNROLL) {
8188
8290
  // k indices
8189
8291
  const int ik3 = iq3;
8190
8292
  const int ik2 = iq2;
@@ -8204,7 +8306,7 @@ static void ggml_compute_forward_flash_attn_f16(
8204
8306
  ggml_vec_scale_f32(nek1, S, scale);
8205
8307
 
8206
8308
  if (masked) {
8207
- for (int i = P; i < M; i++) {
8309
+ for (int64_t i = P; i < M; i++) {
8208
8310
  if (i > P + iq1) {
8209
8311
  S[i] = -INFINITY;
8210
8312
  }
@@ -8264,12 +8366,12 @@ static void ggml_compute_forward_flash_attn_f16(
8264
8366
 
8265
8367
  ggml_fp16_t * S16 = (ggml_fp16_t *) ((float *) params->wdata + ith*(2*Mup + CACHE_LINE_SIZE_F32) + Mup);
8266
8368
 
8267
- for (int i = 0; i < M; i++) {
8369
+ for (int64_t i = 0; i < M; i++) {
8268
8370
  S16[i] = GGML_FP32_TO_FP16(S[i]);
8269
8371
  }
8270
8372
 
8271
8373
  if (GGML_VEC_DOT_UNROLL == 1 || (nev1 % GGML_VEC_DOT_UNROLL != 0)) {
8272
- for (int ic = 0; ic < nev1; ++ic) {
8374
+ for (int64_t ic = 0; ic < nev1; ++ic) {
8273
8375
  // dst indices
8274
8376
  const int i1 = iq1;
8275
8377
  const int i2 = iq2;
@@ -8281,7 +8383,7 @@ static void ggml_compute_forward_flash_attn_f16(
8281
8383
  S16);
8282
8384
  }
8283
8385
  } else {
8284
- for (int ic = 0; ic < nev1; ic += GGML_VEC_DOT_UNROLL) {
8386
+ for (int64_t ic = 0; ic < nev1; ic += GGML_VEC_DOT_UNROLL) {
8285
8387
  // dst indices
8286
8388
  const int i1 = iq1;
8287
8389
  const int i2 = iq2;
@@ -8337,35 +8439,35 @@ static void ggml_compute_forward_flash_ff_f16(
8337
8439
  int64_t t0 = ggml_perf_time_us();
8338
8440
  UNUSED(t0);
8339
8441
 
8340
- const int nea0 = a->ne[0];
8341
- const int nea1 = a->ne[1];
8342
- const int nea2 = a->ne[2];
8343
- const int nea3 = a->ne[3];
8442
+ const int64_t nea0 = a->ne[0];
8443
+ const int64_t nea1 = a->ne[1];
8444
+ const int64_t nea2 = a->ne[2];
8445
+ const int64_t nea3 = a->ne[3];
8344
8446
 
8345
- const int neb00 = b0->ne[0];
8346
- const int neb01 = b0->ne[1];
8347
- //const int neb02 = b0->ne[2];
8348
- //const int neb03 = b0->ne[3];
8447
+ const int64_t neb00 = b0->ne[0];
8448
+ const int64_t neb01 = b0->ne[1];
8449
+ //const int64_t neb02 = b0->ne[2];
8450
+ //const int64_t neb03 = b0->ne[3];
8349
8451
 
8350
- const int neb10 = b1->ne[0];
8351
- const int neb11 = b1->ne[1];
8352
- //const int neb12 = b1->ne[2];
8353
- //const int neb13 = b1->ne[3];
8452
+ const int64_t neb10 = b1->ne[0];
8453
+ const int64_t neb11 = b1->ne[1];
8454
+ //const int64_t neb12 = b1->ne[2];
8455
+ //const int64_t neb13 = b1->ne[3];
8354
8456
 
8355
- const int nec00 = c0->ne[0];
8356
- const int nec01 = c0->ne[1];
8357
- //const int nec02 = c0->ne[2];
8358
- //const int nec03 = c0->ne[3];
8457
+ const int64_t nec00 = c0->ne[0];
8458
+ const int64_t nec01 = c0->ne[1];
8459
+ //const int64_t nec02 = c0->ne[2];
8460
+ //const int64_t nec03 = c0->ne[3];
8359
8461
 
8360
- const int nec10 = c1->ne[0];
8361
- const int nec11 = c1->ne[1];
8362
- //const int nec12 = c1->ne[2];
8363
- //const int nec13 = c1->ne[3];
8462
+ const int64_t nec10 = c1->ne[0];
8463
+ const int64_t nec11 = c1->ne[1];
8464
+ //const int64_t nec12 = c1->ne[2];
8465
+ //const int64_t nec13 = c1->ne[3];
8364
8466
 
8365
- const int ne0 = dst->ne[0];
8366
- const int ne1 = dst->ne[1];
8367
- const int ne2 = dst->ne[2];
8368
- //const int ne3 = dst->ne[3];
8467
+ const int64_t ne0 = dst->ne[0];
8468
+ const int64_t ne1 = dst->ne[1];
8469
+ const int64_t ne2 = dst->ne[2];
8470
+ //const int64_t ne3 = dst->ne[3];
8369
8471
 
8370
8472
  const int nba0 = a->nb[0];
8371
8473
  const int nba1 = a->nb[1];
@@ -8400,9 +8502,9 @@ static void ggml_compute_forward_flash_ff_f16(
8400
8502
  const int ith = params->ith;
8401
8503
  const int nth = params->nth;
8402
8504
 
8403
- const int D = nea0;
8404
- //const int N = nea1;
8405
- const int M = neb01;
8505
+ const int64_t D = nea0;
8506
+ //const int64_t N = nea1;
8507
+ const int64_t M = neb01;
8406
8508
 
8407
8509
  GGML_ASSERT(ne0 == nea0);
8408
8510
  GGML_ASSERT(ne1 == nea1);
@@ -8458,7 +8560,7 @@ static void ggml_compute_forward_flash_ff_f16(
8458
8560
 
8459
8561
  float * S = (float *) params->wdata + ith*(2*M + CACHE_LINE_SIZE_F32);
8460
8562
 
8461
- for (int ic = 0; ic < neb01; ++ic) {
8563
+ for (int64_t ic = 0; ic < neb01; ++ic) {
8462
8564
  // b0 indices
8463
8565
  const int ib03 = ia3;
8464
8566
  const int ib02 = ia2;
@@ -8478,7 +8580,7 @@ static void ggml_compute_forward_flash_ff_f16(
8478
8580
 
8479
8581
  ggml_fp16_t * S16 = (ggml_fp16_t *) ((float *) params->wdata + ith*(2*M + CACHE_LINE_SIZE_F32) + M);
8480
8582
 
8481
- for (int i = 0; i < M; i++) {
8583
+ for (int64_t i = 0; i < M; i++) {
8482
8584
  S16[i] = GGML_FP32_TO_FP16(S[i]);
8483
8585
  }
8484
8586
 
@@ -8490,7 +8592,7 @@ static void ggml_compute_forward_flash_ff_f16(
8490
8592
  const int i2 = ia2;
8491
8593
  const int i3 = ia3;
8492
8594
 
8493
- for (int ic = 0; ic < nec01; ++ic) {
8595
+ for (int64_t ic = 0; ic < nec01; ++ic) {
8494
8596
 
8495
8597
  ggml_vec_dot_f16(neb01,
8496
8598
  (float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)),
@@ -9355,7 +9457,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
9355
9457
  } break;
9356
9458
  case GGML_OP_ROPE:
9357
9459
  {
9358
- node->n_tasks = 1;
9460
+ node->n_tasks = n_threads;
9359
9461
  } break;
9360
9462
  case GGML_OP_CONV_1D_1S:
9361
9463
  case GGML_OP_CONV_1D_2S:
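Setting n_tasks to n_threads for GGML_OP_ROPE lets the RoPE kernel be split across the worker threads like the other parallel ops, instead of running as a single task. Kernels in this file typically derive a per-thread row range from params->ith and params->nth; the helper below is an illustrative sketch of that partitioning convention, not code from the package:

    #include <stdint.h>

    /* Illustrative only: the [ir0, ir1) row range handled by thread `ith`
       out of `nth`, mirroring the ir0/ir1 ranges the parallelized kernels
       in this file derive from params->ith and params->nth. */
    static void rows_for_thread(int64_t nr, int ith, int nth,
                                int64_t * ir0, int64_t * ir1) {
        const int64_t dr = (nr + nth - 1) / nth;   /* rows per thread, rounded up */
        *ir0 = dr * ith;                           /* first row for this thread   */
        *ir1 = (*ir0 + dr < nr) ? *ir0 + dr : nr;  /* one past the last row       */
    }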
@@ -9393,7 +9495,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
9393
9495
 
9394
9496
  size_t cur = 0;
9395
9497
 
9396
- const int ne11 = ggml_up(node->src1->ne[1], GGML_SOFT_MAX_UNROLL);
9498
+ const int64_t ne11 = ggml_up(node->src1->ne[1], GGML_SOFT_MAX_UNROLL);
9397
9499
 
9398
9500
  if (node->src1->type == GGML_TYPE_F32) {
9399
9501
  cur = sizeof(float)*ne11*node->n_tasks; // TODO: this can become (n_tasks-1)
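In the work-size estimate above, ne11 is rounded up to a multiple of GGML_SOFT_MAX_UNROLL via ggml_up and is now held in an int64_t before being multiplied into the scratch-buffer size. As an illustration of round-up-to-a-multiple (the real ggml_up may be implemented differently, e.g. with a power-of-two bit trick):

    #include <stdint.h>

    /* Illustrative round-up: smallest multiple of m that is >= n, for m > 0. */
    static int64_t round_up_to_multiple(int64_t n, int64_t m) {
        return ((n + m - 1) / m) * m;
    }
    /* round_up_to_multiple(10, 4) == 12, round_up_to_multiple(12, 4) == 12. */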
@@ -9652,7 +9754,7 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) {
9652
9754
 
9653
9755
  perf_total_per_op_us[node->op] += node->perf_time_us;
9654
9756
 
9655
- GGML_PRINT(" - %3d: [ %6d, %6d, %6d] %16s %s (%3d) cpu = %7.3f / %7.3f ms, wall = %7.3f / %7.3f ms\n",
9757
+ GGML_PRINT(" - %3d: [ %" PRId64 ", %" PRId64 ", %" PRId64 "] %16s %s (%3d) cpu = %7.3f / %7.3f ms, wall = %7.3f / %7.3f ms\n",
9656
9758
  i,
9657
9759
  node->ne[0], node->ne[1], node->ne[2],
9658
9760
  GGML_OP_LABEL[node->op], node->is_param ? "x" : node->grad ? "g" : " ", node->perf_runs,
@@ -9666,7 +9768,7 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) {
9666
9768
  for (int i = 0; i < cgraph->n_leafs; i++) {
9667
9769
  struct ggml_tensor * node = cgraph->leafs[i];
9668
9770
 
9669
- GGML_PRINT(" - %3d: [ %6d, %6d] %8s\n",
9771
+ GGML_PRINT(" - %3d: [ %" PRId64 ", %" PRId64 "] %8s\n",
9670
9772
  i,
9671
9773
  node->ne[0], node->ne[1],
9672
9774
  GGML_OP_LABEL[node->op]);
@@ -9737,7 +9839,7 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
9737
9839
 
9738
9840
  fprintf(fp, " \"%p\" [ \
9739
9841
  style = filled; fillcolor = %s; shape = record; \
9740
- label=\"%d [%d, %d] | <x>%s",
9842
+ label=\"%d [%" PRId64 ", %" PRId64 "] | <x>%s",
9741
9843
  (void *) node, color,
9742
9844
  i, node->ne[0], node->ne[1],
9743
9845
  GGML_OP_SYMBOL[node->op]);
@@ -9762,7 +9864,7 @@ label=\"<x>%.1e\"; ]\n",
9762
9864
  } else {
9763
9865
  fprintf(fp, " \"%p\" [ \
9764
9866
  style = filled; fillcolor = %s; shape = record; \
9765
- label=\"<x>CONST %d [%d, %d]\"; ]\n",
9867
+ label=\"<x>CONST %d [%" PRId64 ", %" PRId64 "]\"; ]\n",
9766
9868
  (void *) node, color,
9767
9869
  i, node->ne[0], node->ne[1]);
9768
9870
  }
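The format strings in ggml_graph_print and ggml_graph_dump_dot above switch from %d to the PRId64 macro because node->ne[] now holds int64_t values. PRId64 is defined in <inttypes.h> and expands to the correct length modifier for the platform, relying on adjacent string-literal concatenation. A minimal usage sketch with hypothetical values:

    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
        int64_t ne[3] = {4096, 32, 1};   /* hypothetical tensor dimensions */
        /* Adjacent string literals concatenate, so PRId64 splices the right
           length modifier into the format string at compile time. */
        printf("[ %" PRId64 ", %" PRId64 ", %" PRId64 " ]\n",
               ne[0], ne[1], ne[2]);
        return 0;
    }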
@@ -9826,9 +9928,9 @@ label=\"<x>CONST %d [%d, %d]\"; ]\n",
9826
9928
  static void ggml_opt_set_params(int np, struct ggml_tensor * const ps[], const float * x) {
9827
9929
  int i = 0;
9828
9930
  for (int p = 0; p < np; ++p) {
9829
- const int ne = ggml_nelements(ps[p]) ;
9931
+ const int64_t ne = ggml_nelements(ps[p]) ;
9830
9932
  // TODO: add function to set tensor from array
9831
- for (int j = 0; j < ne; ++j) {
9933
+ for (int64_t j = 0; j < ne; ++j) {
9832
9934
  ggml_set_f32_1d(ps[p], j, x[i++]);
9833
9935
  }
9834
9936
  }
@@ -9837,9 +9939,9 @@ static void ggml_opt_set_params(int np, struct ggml_tensor * const ps[], const f
9837
9939
  static void ggml_opt_get_params(int np, struct ggml_tensor * const ps[], float * x) {
9838
9940
  int i = 0;
9839
9941
  for (int p = 0; p < np; ++p) {
9840
- const int ne = ggml_nelements(ps[p]) ;
9942
+ const int64_t ne = ggml_nelements(ps[p]) ;
9841
9943
  // TODO: add function to get all elements at once
9842
- for (int j = 0; j < ne; ++j) {
9944
+ for (int64_t j = 0; j < ne; ++j) {
9843
9945
  x[i++] = ggml_get_f32_1d(ps[p], j);
9844
9946
  }
9845
9947
  }
@@ -9848,9 +9950,9 @@ static void ggml_opt_get_params(int np, struct ggml_tensor * const ps[], float *
9848
9950
  static void ggml_opt_get_grad(int np, struct ggml_tensor * const ps[], float * g) {
9849
9951
  int i = 0;
9850
9952
  for (int p = 0; p < np; ++p) {
9851
- const int ne = ggml_nelements(ps[p]) ;
9953
+ const int64_t ne = ggml_nelements(ps[p]) ;
9852
9954
  // TODO: add function to get all elements at once
9853
- for (int j = 0; j < ne; ++j) {
9955
+ for (int64_t j = 0; j < ne; ++j) {
9854
9956
  g[i++] = ggml_get_f32_1d(ps[p]->grad, j);
9855
9957
  }
9856
9958
  }
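Finally, the optimizer helpers copy every parameter tensor element by element to and from a flat float array, so their element counts and loop indices are widened as well. The sketch below mirrors that copy loop with plain arrays standing in for ggml_nelements and ggml_get_f32_1d/ggml_set_f32_1d; it is illustrative, not the package's code:

    #include <stdint.h>

    /* Illustrative flattening: copy np parameter buffers of ne[p] floats each
       into one contiguous array, using 64-bit counts and indices throughout. */
    static void flatten_params(int np, const float * const ps[],
                               const int64_t ne[], float * x) {
        int64_t i = 0;
        for (int p = 0; p < np; ++p) {
            for (int64_t j = 0; j < ne[p]; ++j) {
                x[i++] = ps[p][j];
            }
        }
    }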