llama_cpp 0.12.5 → 0.12.6

This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
@@ -428,8 +428,8 @@ int64_t ggml_cycles_per_ms(void) {
 
  static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float);
 
- static void ggml_vec_dot_f32(const int n, float * restrict s, const float * restrict x, const float * restrict y);
- static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t * restrict x, ggml_fp16_t * restrict y);
+ static void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float * restrict x, size_t bx, const float * restrict y, size_t by, int nrc);
+ static void ggml_vec_dot_f16(int n, float * restrict s, size_t bs, ggml_fp16_t * restrict x, size_t bx, ggml_fp16_t * restrict y, size_t by, int nrc);
 
  static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
  [GGML_TYPE_I8] = {
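The widened dot-product signature above adds byte strides for the result (bs), the first operand (bx), and the second operand (by), plus nrc, the number of rows computed per call. A rough scalar sketch of that shape for the plain nrc == 1 case (hypothetical helper, not code from the package):

    #include <assert.h>
    #include <stddef.h>

    // Sketch only: same parameter layout as the new ggml_vec_dot_f32 above.
    // With nrc == 1 the stride arguments are unused, which is why the call
    // sites later in this diff pass 0 for bs/bx/by and 1 for nrc.
    static void vec_dot_f32_sketch(int n, float *s, size_t bs,
                                   const float *x, size_t bx,
                                   const float *y, size_t by, int nrc) {
        assert(nrc == 1);
        (void) bs; (void) bx; (void) by;
        float sum = 0.0f;
        for (int i = 0; i < n; ++i) {
            sum += x[i] * y[i];
        }
        *s = sum;
    }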
@@ -457,6 +457,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
  .is_quantized = false,
  .vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f32,
  .vec_dot_type = GGML_TYPE_F32,
+ .nrows = 1,
  },
  [GGML_TYPE_F16] = {
  .type_name = "f16",
@@ -468,6 +469,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
  .from_float_reference = (ggml_from_float_t) ggml_fp32_to_fp16_row,
  .vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f16,
  .vec_dot_type = GGML_TYPE_F16,
+ .nrows = 1,
  },
  [GGML_TYPE_Q4_0] = {
  .type_name = "q4_0",
@@ -479,6 +481,11 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
  .from_float_reference = (ggml_from_float_t) quantize_row_q4_0_reference,
  .vec_dot = ggml_vec_dot_q4_0_q8_0,
  .vec_dot_type = GGML_TYPE_Q8_0,
+ #if defined (__ARM_FEATURE_MATMUL_INT8)
+ .nrows = 2,
+ #else
+ .nrows = 1,
+ #endif
  },
  [GGML_TYPE_Q4_1] = {
  .type_name = "q4_1",
@@ -490,6 +497,11 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
  .from_float_reference = (ggml_from_float_t) quantize_row_q4_1_reference,
  .vec_dot = ggml_vec_dot_q4_1_q8_1,
  .vec_dot_type = GGML_TYPE_Q8_1,
+ #if defined (__ARM_FEATURE_MATMUL_INT8)
+ .nrows = 2,
+ #else
+ .nrows = 1,
+ #endif
  },
  [4] = { // GGML_TYPE_Q4_2
  .type_name = "DEPRECATED",
@@ -501,6 +513,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
  .from_float_reference = NULL,
  .vec_dot = NULL,
  .vec_dot_type = GGML_TYPE_COUNT,
+ .nrows = 1,
  },
  [5] = { // GGML_TYPE_Q4_3
  .type_name = "DEPRECATED",
@@ -512,6 +525,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
  .from_float_reference = NULL,
  .vec_dot = NULL,
  .vec_dot_type = GGML_TYPE_COUNT,
+ .nrows = 1,
  },
  [GGML_TYPE_Q5_0] = {
  .type_name = "q5_0",
@@ -523,6 +537,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
  .from_float_reference = (ggml_from_float_t) quantize_row_q5_0_reference,
  .vec_dot = ggml_vec_dot_q5_0_q8_0,
  .vec_dot_type = GGML_TYPE_Q8_0,
+ .nrows = 1,
  },
  [GGML_TYPE_Q5_1] = {
  .type_name = "q5_1",
@@ -534,6 +549,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
  .from_float_reference = (ggml_from_float_t) quantize_row_q5_1_reference,
  .vec_dot = ggml_vec_dot_q5_1_q8_1,
  .vec_dot_type = GGML_TYPE_Q8_1,
+ .nrows = 1,
  },
  [GGML_TYPE_Q8_0] = {
  .type_name = "q8_0",
@@ -545,6 +561,11 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
  .from_float_reference = (ggml_from_float_t) quantize_row_q8_0_reference,
  .vec_dot = ggml_vec_dot_q8_0_q8_0,
  .vec_dot_type = GGML_TYPE_Q8_0,
+ #if defined (__ARM_FEATURE_MATMUL_INT8)
+ .nrows = 2,
+ #else
+ .nrows = 1,
+ #endif
  },
  [GGML_TYPE_Q8_1] = {
  .type_name = "q8_1",
@@ -554,6 +575,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
  .from_float = quantize_row_q8_1,
  .from_float_reference = (ggml_from_float_t) quantize_row_q8_1_reference,
  .vec_dot_type = GGML_TYPE_Q8_1,
+ .nrows = 1,
  },
  [GGML_TYPE_Q2_K] = {
  .type_name = "q2_K",
@@ -565,6 +587,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
  .from_float_reference = (ggml_from_float_t) quantize_row_q2_K_reference,
  .vec_dot = ggml_vec_dot_q2_K_q8_K,
  .vec_dot_type = GGML_TYPE_Q8_K,
+ .nrows = 1,
  },
  [GGML_TYPE_Q3_K] = {
  .type_name = "q3_K",
@@ -576,6 +599,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
  .from_float_reference = (ggml_from_float_t) quantize_row_q3_K_reference,
  .vec_dot = ggml_vec_dot_q3_K_q8_K,
  .vec_dot_type = GGML_TYPE_Q8_K,
+ .nrows = 1,
  },
  [GGML_TYPE_Q4_K] = {
  .type_name = "q4_K",
@@ -587,6 +611,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
  .from_float_reference = (ggml_from_float_t) quantize_row_q4_K_reference,
  .vec_dot = ggml_vec_dot_q4_K_q8_K,
  .vec_dot_type = GGML_TYPE_Q8_K,
+ .nrows = 1,
  },
  [GGML_TYPE_Q5_K] = {
  .type_name = "q5_K",
@@ -598,6 +623,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
  .from_float_reference = (ggml_from_float_t) quantize_row_q5_K_reference,
  .vec_dot = ggml_vec_dot_q5_K_q8_K,
  .vec_dot_type = GGML_TYPE_Q8_K,
+ .nrows = 1,
  },
  [GGML_TYPE_Q6_K] = {
  .type_name = "q6_K",
@@ -609,6 +635,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
  .from_float_reference = (ggml_from_float_t) quantize_row_q6_K_reference,
  .vec_dot = ggml_vec_dot_q6_K_q8_K,
  .vec_dot_type = GGML_TYPE_Q8_K,
+ .nrows = 1,
  },
  [GGML_TYPE_IQ2_XXS] = {
  .type_name = "iq2_xxs",
@@ -620,6 +647,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
  .from_float_reference = NULL,
  .vec_dot = ggml_vec_dot_iq2_xxs_q8_K,
  .vec_dot_type = GGML_TYPE_Q8_K,
+ .nrows = 1,
  },
  [GGML_TYPE_IQ2_XS] = {
  .type_name = "iq2_xs",
@@ -631,6 +659,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
  .from_float_reference = NULL,
  .vec_dot = ggml_vec_dot_iq2_xs_q8_K,
  .vec_dot_type = GGML_TYPE_Q8_K,
+ .nrows = 1,
  },
  [GGML_TYPE_IQ3_XXS] = {
  .type_name = "iq3_xxs",
@@ -642,6 +671,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
  .from_float_reference = (ggml_from_float_t)quantize_row_iq3_xxs_reference,
  .vec_dot = ggml_vec_dot_iq3_xxs_q8_K,
  .vec_dot_type = GGML_TYPE_Q8_K,
+ .nrows = 1,
  },
  [GGML_TYPE_Q8_K] = {
  .type_name = "q8_K",
@@ -1212,7 +1242,13 @@ inline static void ggml_vec_neg_f32 (const int n, float * y, const float * x)
  inline static void ggml_vec_mul_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]*y[i]; }
  inline static void ggml_vec_div_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]/y[i]; }
 
- static void ggml_vec_dot_f32(const int n, float * restrict s, const float * restrict x, const float * restrict y) {
+ static void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float * restrict x, size_t bx, const float * restrict y, size_t by, int nrc) {
+ assert(nrc == 1);
+ UNUSED(nrc);
+ UNUSED(bx);
+ UNUSED(by);
+ UNUSED(bs);
+
 #ifdef GGML_SIMD
  float sumf = 0.0f;
  const int np = (n & ~(GGML_F32_STEP - 1));
@@ -1249,7 +1285,13 @@ static void ggml_vec_dot_f32(const int n, float * restrict s, const float * rest
  *s = sumf;
  }
 
- static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t * restrict x, ggml_fp16_t * restrict y) {
+ static void ggml_vec_dot_f16(int n, float * restrict s, size_t bs, ggml_fp16_t * restrict x, size_t bx, ggml_fp16_t * restrict y, size_t by, int nrc) {
+ assert(nrc == 1);
+ UNUSED(nrc);
+ UNUSED(bx);
+ UNUSED(by);
+ UNUSED(bs);
+
  ggml_float sumf = 0.0;
 
 #if defined(GGML_SIMD)
@@ -1455,7 +1497,7 @@ inline static void ggml_vec_scale_f32(const int n, float * y, const float v) {
 #endif
  }
 
- inline static void ggml_vec_norm_f32 (const int n, float * s, const float * x) { ggml_vec_dot_f32(n, s, x, x); *s = sqrtf(*s); }
+ inline static void ggml_vec_norm_f32 (const int n, float * s, const float * x) { ggml_vec_dot_f32(n, s, 0, x, 0, x, 0, 1); *s = sqrtf(*s); }
  inline static void ggml_vec_sqr_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i]*x[i]; }
  inline static void ggml_vec_sqrt_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = sqrtf(x[i]); }
  inline static void ggml_vec_log_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = logf(x[i]); }
@@ -2607,7 +2649,7 @@ static struct ggml_tensor * ggml_new_tensor_impl(
  /*.nb =*/ { 0, 0, 0, 0 },
  /*.op =*/ GGML_OP_NONE,
  /*.op_params =*/ { 0 },
- /*.is_param =*/ false,
+ /*.flags =*/ 0,
  /*.grad =*/ NULL,
  /*.src =*/ { NULL },
  /*.perf_runs =*/ 0,
@@ -6509,7 +6551,7 @@ struct ggml_tensor * ggml_cross_entropy_loss_back(
  void ggml_set_param(
  struct ggml_context * ctx,
  struct ggml_tensor * tensor) {
- tensor->is_param = true;
+ tensor->flags |= GGML_TENSOR_FLAG_PARAM;
 
  GGML_ASSERT(tensor->grad == NULL);
  tensor->grad = ggml_dup_tensor(ctx, tensor);
@@ -9992,6 +10034,7 @@ static void ggml_compute_forward_mul_mat(
  ggml_vec_dot_t const vec_dot = type_traits[type].vec_dot;
  enum ggml_type const vec_dot_type = type_traits[type].vec_dot_type;
  ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float;
+ int64_t const vec_dot_num_rows = type_traits[type].nrows;
 
  GGML_ASSERT(ne0 == ne01);
  GGML_ASSERT(ne1 == ne11);
@@ -10159,12 +10202,23 @@ static void ggml_compute_forward_mul_mat(
  const int64_t blck_0 = 16;
  const int64_t blck_1 = 16;
 
+ // dot kernels can handle 1 row and col at a time, but mmla kernels can process 2 rows and cols
+ int64_t nrc = vec_dot_num_rows;
+ // TODO: currently the mmla kernels support only even numbered rows/cols.
+ // this check can be removed once they are extended to support odd numbered rows/cols too
+ if ((nr0 % 2 != 0) || (ne11 % 2 != 0)) {
+ nrc = 1;
+ }
+
+ const size_t src1_col_stride = src1_cont || src1->type != vec_dot_type ? row_size : nb11;
+
  // attempt to reduce false-sharing (does not seem to make a difference)
- float tmp[16];
+ // 16 * 2, accounting for mmla kernels
+ float tmp[32];
 
  for (int64_t iir1 = ir110; iir1 < ir111; iir1 += blck_1) {
  for (int64_t iir0 = ir010; iir0 < ir011; iir0 += blck_0) {
- for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir111; ++ir1) {
+ for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir111; ir1 += nrc) {
  const int64_t i13 = (ir1/(ne12*ne1));
  const int64_t i12 = (ir1 - i13*ne12*ne1)/ne1;
  const int64_t i11 = (ir1 - i13*ne12*ne1 - i12*ne1);
@@ -10187,17 +10241,19 @@ static void ggml_compute_forward_mul_mat(
  (src1_cont || src1->type != vec_dot_type
  ? (i11 + i12*ne11 + i13*ne12*ne11)*row_size
  : (i11*nb11 + i12*nb12 + i13*nb13));
-
  float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3));
 
  //for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) {
  // vec_dot(ne00, &dst_col[ir0], src0_row + ir0*nb01, src1_col);
  //}
 
- for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) {
- vec_dot(ne00, &tmp[ir0 - iir0], src0_row + ir0*nb01, src1_col);
+ for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ir0 += nrc) {
+ vec_dot(ne00, &tmp[ir0 - iir0], (nrc>1 ? 16 : 0), src0_row + ir0*nb01, (nrc>1 ? nb01 : 0), src1_col, (nrc>1 ? src1_col_stride : 0), nrc);
+ }
+
+ for (int cn = 0; cn < nrc; ++cn) {
+ memcpy(&dst_col[iir0 + cn*nb1/nb0], tmp + (cn*16), (MIN(iir0 + blck_0, ir011) - iir0)*sizeof(float));
  }
- memcpy(&dst_col[iir0], tmp, (MIN(iir0 + blck_0, ir011) - iir0)*sizeof(float));
  }
  }
  }
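The comment in the hunk above states the intent: when a type's nrows is 2 and the row and column counts are even, each vec_dot call covers two src0 rows and two src1 columns at once, with nb01 as the byte stride between src0 rows, src1_col_stride between src1 columns, and 16 floats (the bs argument) between the two result columns in tmp. A scalar model of that call shape under the layout suggested by the copy loop above (hypothetical helper, not the package's kernel):

    #include <stddef.h>

    // Sketch only: what one vec_dot call is assumed to produce when nrc == 2.
    // bx/by are byte strides to the next src0 row / src1 column; bs is the
    // element stride between result columns in the tmp buffer (16 above).
    static void vec_dot_2x2_sketch(int n, float *s, size_t bs,
                                   const float *x, size_t bx,
                                   const float *y, size_t by) {
        for (int col = 0; col < 2; ++col) {
            for (int row = 0; row < 2; ++row) {
                const float *xr = (const float *)((const char *)x + row*bx);
                const float *yc = (const float *)((const char *)y + col*by);
                float sum = 0.0f;
                for (int i = 0; i < n; ++i) {
                    sum += xr[i] * yc[i];
                }
                s[col*bs + row] = sum;
            }
        }
    }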
@@ -10386,7 +10442,7 @@ static void ggml_compute_forward_mul_mat_id(
  //}
 
  for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) {
- vec_dot(ne00, &tmp[ir0 - iir0], src0_row + ir0*nb01, src1_col);
+ vec_dot(ne00, &tmp[ir0 - iir0], 0, src0_row + ir0*nb01, 0, src1_col, 0, 1);
  }
  memcpy(&dst_col[iir0], tmp, (MIN(iir0 + blck_0, ir011) - iir0)*sizeof(float));
  }
@@ -11568,7 +11624,7 @@ static void ggml_compute_forward_soft_max_back_f32(
 
  // linear runtime, no additional memory
  float dot_y_dy = 0;
- ggml_vec_dot_f32 (nc, &dot_y_dy, y, dy);
+ ggml_vec_dot_f32 (nc, &dot_y_dy, 0, y, 0, dy, 0, 1);
  ggml_vec_cpy_f32 (nc, dx, dy);
  ggml_vec_acc1_f32(nc, dx, -dot_y_dy);
  ggml_vec_mul_f32 (nc, dx, dx, y);
@@ -12369,9 +12425,9 @@ static void ggml_compute_forward_conv_transpose_1d_f16_f32(
  const int i1n = i10*ne11;
  for (int i00 = 0; i00 < ne00; i00++) {
  float v = 0;
- ggml_vec_dot_f16(ne02, &v,
- (ggml_fp16_t *) wdata_src + i1n,
- (ggml_fp16_t *) wdata_kernel + i00*ne02);
+ ggml_vec_dot_f16(ne02, &v, 0,
+ (ggml_fp16_t *) wdata_src + i1n, 0,
+ (ggml_fp16_t *) wdata_kernel + i00*ne02, 0, 1);
  dst_data[i10*s0 + i00] += v;
  }
  }
@@ -12466,9 +12522,9 @@ static void ggml_compute_forward_conv_transpose_1d_f32(
  const int i1n = i10*ne11;
  for (int i00 = 0; i00 < ne00; i00++) {
  float v = 0;
- ggml_vec_dot_f32(ne02, &v,
- wdata_src + i1n,
- wdata_kernel + i00*ne02);
+ ggml_vec_dot_f32(ne02, &v, 0,
+ wdata_src + i1n, 0,
+ wdata_kernel + i00*ne02, 0, 1);
  dst_data[i10*s0 + i00] += v;
  }
  }
@@ -12783,9 +12839,9 @@ static void ggml_compute_forward_conv_transpose_2d(
  for (int i01 = 0; i01 < ne01; i01++) {
  for (int i00 = 0; i00 < ne00; i00++) {
  float v = 0;
- ggml_vec_dot_f16(ne03, &v,
- wdata_src + i1n,
- wdata_kernel + i01*ne00*ne03 + i00*ne03);
+ ggml_vec_dot_f16(ne03, &v, 0,
+ wdata_src + i1n, 0,
+ wdata_kernel + i01*ne00*ne03 + i00*ne03, 0, 1);
  dst_data[(i11*stride + i01)*ne0 + i10*stride + i00] += v;
  }
  }
@@ -13214,9 +13270,9 @@ static void ggml_compute_forward_flash_attn_f32(
  const int i1 = ik1;
 
  ggml_vec_dot_f32(neq0,
- S + i1,
- (float *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)),
- (float *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)));
+ S + i1, 0,
+ (float *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)), 0,
+ (float *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)), 0, 1);
  }
 
  // scale
@@ -13299,9 +13355,9 @@ static void ggml_compute_forward_flash_attn_f32(
  const int iv3 = iq3;
 
  ggml_vec_dot_f32(masked_begin,
- (float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)),
- (float *) ((char *) v->data + ( ic*nbv1 + iv2*nbv2 + iv3*nbv3)),
- S);
+ (float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)), 0,
+ (float *) ((char *) v->data + ( ic*nbv1 + iv2*nbv2 + iv3*nbv3)), 0,
+ S, 0, 1);
  }
  }
  }
@@ -13404,9 +13460,9 @@ static void ggml_compute_forward_flash_attn_f16(
  const int i1 = ik1;
 
  ggml_vec_dot_f16(neq0,
- S + i1,
- (ggml_fp16_t *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)),
- (ggml_fp16_t *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)));
+ S + i1, 0,
+ (ggml_fp16_t *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)), 0,
+ (ggml_fp16_t *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)), 0, 1);
  }
  } else {
  for (int64_t ic = 0; ic < nek1; ic += GGML_VEC_DOT_UNROLL) {
@@ -13508,9 +13564,9 @@ static void ggml_compute_forward_flash_attn_f16(
  const int iv3 = iq3;
 
  ggml_vec_dot_f16(nev0,
- (float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)),
- (ggml_fp16_t *) ((char *) v->data + ( ic*nbv1 + iv2*nbv2 + iv3*nbv3)),
- S16);
+ (float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)), 0,
+ (ggml_fp16_t *) ((char *) v->data + ( ic*nbv1 + iv2*nbv2 + iv3*nbv3)), 0,
+ S16, 0, 1);
  }
  } else {
  for (int64_t ic = 0; ic < nev1; ic += GGML_VEC_DOT_UNROLL) {
@@ -13652,9 +13708,9 @@ static void ggml_compute_forward_flash_ff_f16(
  const int i1 = ib01;
 
  ggml_vec_dot_f16(nea0,
- S + i1,
- (ggml_fp16_t *) ((char *) b0->data + (ib01*nbb01 + ib02*nbb02 + ib03*nbb03)),
- (ggml_fp16_t *) ((char *) a->data + ( ia1*nba1 + ia2*nba2 + ia3*nba3)));
+ S + i1, 0,
+ (ggml_fp16_t *) ((char *) b0->data + (ib01*nbb01 + ib02*nbb02 + ib03*nbb03)), 0,
+ (ggml_fp16_t *) ((char *) a->data + ( ia1*nba1 + ia2*nba2 + ia3*nba3)), 0, 1);
  }
 
  ggml_vec_add_f32(neb01, S, S, (float *) b1->data);
@@ -13677,9 +13733,9 @@ static void ggml_compute_forward_flash_ff_f16(
  for (int64_t ic = 0; ic < nec01; ++ic) {
 
  ggml_vec_dot_f16(neb01,
- (float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)),
- (ggml_fp16_t *) ((char *) c0->data + ( ic*nbc01 + i2*nbc02 + i3*nbc03)),
- S16);
+ (float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)), 0,
+ (ggml_fp16_t *) ((char *) c0->data + ( ic*nbc01 + i2*nbc02 + i3*nbc03)), 0,
+ S16, 0, 1);
  }
 
  ggml_vec_add_f32(nec01,
@@ -13866,9 +13922,9 @@ static void ggml_compute_forward_flash_attn_back_f32(
  const int i1 = ik1;
 
  ggml_vec_dot_f32(neq0,
- S + i1,
- (float *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)),
- (float *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)));
+ S + i1, 0,
+ (float *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)), 0,
+ (float *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)), 0, 1);
  }
 
  // scale
@@ -14013,7 +14069,7 @@ static void ggml_compute_forward_flash_attn_back_f32(
 
  // S = SM * (S - dot(SM, S))
  float dot_SM_gradSM = 0;
- ggml_vec_dot_f32 (masked_begin, &dot_SM_gradSM, SM, S);
+ ggml_vec_dot_f32 (masked_begin, &dot_SM_gradSM, 0, SM, 0, S, 0, 1);
  ggml_vec_acc1_f32(M, S, -dot_SM_gradSM);
  ggml_vec_mul_f32 (masked_begin, S, S, SM);
 
@@ -15311,7 +15367,7 @@ static struct ggml_tensor * ggml_recompute_graph_node(
  return NULL;
  }
 
- if (node->is_param) {
+ if (node->flags & GGML_TENSOR_FLAG_PARAM) {
  return node;
  }
 
@@ -15345,7 +15401,7 @@ static struct ggml_tensor * ggml_recompute_graph_node(
 
  clone->op = node->op;
  clone->grad = node->grad;
- clone->is_param = node->is_param;
+ clone->flags = node->flags;
  clone->extra = node->extra;
  for (int k = 0; k < GGML_MAX_DIMS; ++k) {
  clone->nb[k] = node->nb[k];
@@ -16377,7 +16433,7 @@ void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph *
  for (int i = 0; i < gf->n_nodes; i++) {
  struct ggml_tensor * node = gf->nodes[i];
 
- if (node->is_param) {
+ if (node->flags & GGML_TENSOR_FLAG_PARAM) {
  GGML_PRINT_DEBUG("%s: found root node %p\n", __func__, (void *) node);
  ggml_build_forward_expand(gb, node->grad);
  }
@@ -16649,7 +16705,7 @@ struct ggml_compute_state_shared {
  atomic_int node_n; // active graph node
  atomic_int node_task; // active graph node task phase
 
- bool (*abort_callback)(void * data); // abort ggml_graph_compute when true
+ ggml_abort_callback abort_callback; // abort ggml_graph_compute when true
  void * abort_callback_data;
  };
 
@@ -17862,7 +17918,7 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) {
  GGML_PRINT(" - %3d: [ %5" PRId64 ", %5" PRId64 ", %5" PRId64 "] %16s %s (%3d) cpu = %7.3f / %7.3f ms, wall = %7.3f / %7.3f ms\n",
  i,
  node->ne[0], node->ne[1], node->ne[2],
- ggml_op_name(node->op), node->is_param ? "x" : node->grad ? "g" : " ", node->perf_runs,
+ ggml_op_name(node->op), (node->flags & GGML_TENSOR_FLAG_PARAM) ? "x" : node->grad ? "g" : " ", node->perf_runs,
  (double) node->perf_cycles / (double) ggml_cycles_per_ms(),
  (double) node->perf_cycles / (double) ggml_cycles_per_ms() / (double) node->perf_runs,
  (double) node->perf_time_us / 1000.0,
@@ -17955,7 +18011,7 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
  continue;
  }
 
- if (node->is_param) {
+ if (node->flags & GGML_TENSOR_FLAG_PARAM) {
  snprintf(color, sizeof(color), "yellow");
  } else if (node->grad) {
  if (ggml_graph_find(gf, node)) {
@@ -18129,7 +18185,7 @@ static enum ggml_opt_result ggml_opt_adam(
  int np = 0;
  int64_t nx = 0;
  for (int i = 0; i < gf->n_nodes; ++i) {
- if (gf->nodes[i]->is_param) {
+ if (gf->nodes[i]->flags & GGML_TENSOR_FLAG_PARAM) {
  GGML_PRINT_DEBUG("found param %d: grad->op = %d\n", np, gf->nodes[i]->grad->op);
 
  GGML_ASSERT(np < GGML_MAX_PARAMS);
@@ -18382,7 +18438,7 @@ static enum ggml_opt_result linesearch_backtracking(
  }
 
  // compute the initial gradient in the search direction
- ggml_vec_dot_f32(nx, &dginit, g, d);
+ ggml_vec_dot_f32(nx, &dginit, 0, g, 0, d, 0, 1);
 
  // make sure that d points to a descent direction
  if (0 < dginit) {
@@ -18432,7 +18488,7 @@ static enum ggml_opt_result linesearch_backtracking(
  return count;
  }
 
- ggml_vec_dot_f32(nx, &dg, g, d);
+ ggml_vec_dot_f32(nx, &dg, 0, g, 0, d, 0, 1);
 
  // check the Wolfe condition
  if (dg < params->lbfgs.wolfe * dginit) {
@@ -18492,7 +18548,7 @@ static enum ggml_opt_result ggml_opt_lbfgs(
  int np = 0;
  int nx = 0;
  for (int i = 0; i < gf->n_nodes; ++i) {
- if (gf->nodes[i]->is_param) {
+ if (gf->nodes[i]->flags & GGML_TENSOR_FLAG_PARAM) {
  GGML_PRINT_DEBUG("found param %d: grad->op = %d\n", np, gf->nodes[i]->grad->op);
 
  GGML_ASSERT(np < GGML_MAX_PARAMS);
@@ -18693,8 +18749,8 @@ static enum ggml_opt_result ggml_opt_lbfgs(
  // ys = y^t \cdot s -> 1 / \rho.
  // yy = y^t \cdot y.
  //
- ggml_vec_dot_f32(nx, &ys, &lm_y[end[0]*nx], &lm_s[end[0]*nx]);
- ggml_vec_dot_f32(nx, &yy, &lm_y[end[0]*nx], &lm_y[end[0]*nx]);
+ ggml_vec_dot_f32(nx, &ys, 0, &lm_y[end[0]*nx], 0, &lm_s[end[0]*nx], 0, 1);
+ ggml_vec_dot_f32(nx, &yy, 0, &lm_y[end[0]*nx], 0, &lm_y[end[0]*nx], 0, 1);
 
  lm_ys[end[0]] = ys;
 
@@ -18713,7 +18769,7 @@ static enum ggml_opt_result ggml_opt_lbfgs(
  for (int i = 0; i < bound; ++i) {
  j[0] = (j[0] + m - 1) % m;
  // \alpha_{j} = \rho_{j} s^{t}_{j} \cdot q_{k+1}
- ggml_vec_dot_f32(nx, &lm_alpha[j[0]], &lm_s[j[0]*nx], d);
+ ggml_vec_dot_f32(nx, &lm_alpha[j[0]], 0, &lm_s[j[0]*nx], 0, d, 0, 1);
  lm_alpha[j[0]] /= lm_ys[j[0]];
  // q_{i} = q_{i+1} - \alpha_{i} y_{i}
  ggml_vec_mad_f32(nx, d, &lm_y[j[0]*nx], -lm_alpha[j[0]]);
@@ -18723,7 +18779,7 @@ static enum ggml_opt_result ggml_opt_lbfgs(
 
  for (int i = 0; i < bound; ++i) {
  // \beta_{j} = \rho_{j} y^t_{j} \cdot \gamma_{i}
- ggml_vec_dot_f32(nx, &beta, &lm_y[j[0]*nx], d);
+ ggml_vec_dot_f32(nx, &beta, 0, &lm_y[j[0]*nx], 0, d, 0, 1);
  beta /= lm_ys[j[0]];
  // \gamma_{i+1} = \gamma_{i} + (\alpha_{j} - \beta_{j}) s_{j}
  ggml_vec_mad_f32(nx, d, &lm_s[j[0]*nx], lm_alpha[j[0]] - beta);
@@ -18967,6 +19023,16 @@ enum ggml_opt_result ggml_opt_resume_g(
 
  ////////////////////////////////////////////////////////////////////////////////
 
+ void ggml_set_input(struct ggml_tensor * tensor) {
+ tensor->flags |= GGML_TENSOR_FLAG_INPUT;
+ }
+
+ void ggml_set_output(struct ggml_tensor * tensor) {
+ tensor->flags |= GGML_TENSOR_FLAG_OUTPUT;
+ }
+
+ ////////////////////////////////////////////////////////////////////////////////
+
  void ggml_quantize_init(enum ggml_type type) {
  ggml_critical_section_start();
 
@@ -20611,4 +20677,12 @@ int ggml_cpu_has_vsx(void) {
 #endif
  }
 
+ int ggml_cpu_has_matmul_int8(void) {
+ #if defined(__ARM_FEATURE_MATMUL_INT8)
+ return 1;
+ #else
+ return 0;
+ #endif
+ }
+
  ////////////////////////////////////////////////////////////////////////////////
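The new capability probe mirrors the existing ggml_cpu_has_* family and reports whether the build carries the ARM int8 matrix-multiply (i8mm) path that the nrows == 2 kernels rely on. A minimal check (sketch, not code from the package):

    #include <stdio.h>
    #include "ggml.h"

    int main(void) {
        // 1 when compiled with __ARM_FEATURE_MATMUL_INT8, 0 otherwise
        printf("matmul_int8 (i8mm): %d\n", ggml_cpu_has_matmul_int8());
        return 0;
    }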
@@ -505,11 +505,17 @@ extern "C" {
 
  enum ggml_log_level {
  GGML_LOG_LEVEL_ERROR = 2,
- GGML_LOG_LEVEL_WARN = 3,
- GGML_LOG_LEVEL_INFO = 4,
+ GGML_LOG_LEVEL_WARN = 3,
+ GGML_LOG_LEVEL_INFO = 4,
  GGML_LOG_LEVEL_DEBUG = 5
  };
 
+ enum ggml_tensor_flag {
+ GGML_TENSOR_FLAG_INPUT = 1,
+ GGML_TENSOR_FLAG_OUTPUT = 2,
+ GGML_TENSOR_FLAG_PARAM = 4,
+ };
+
  // ggml object
  struct ggml_object {
  size_t offs;
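These flag bits back the ggml_set_input / ggml_set_output helpers added earlier in this diff and replace the old bool is_param (removed in the next hunk). A usage sketch (assumes an existing tensor; not code from the package):

    #include "ggml.h"

    // Sketch only: mark a tensor with the new flag helpers and test the PARAM
    // bit the same way the updated call sites in this diff do.
    static void mark_graph_io(struct ggml_tensor * t) {
        ggml_set_input(t);   // sets GGML_TENSOR_FLAG_INPUT
        ggml_set_output(t);  // sets GGML_TENSOR_FLAG_OUTPUT

        if (t->flags & GGML_TENSOR_FLAG_PARAM) {
            // set elsewhere via ggml_set_param(ctx, t); replaces t->is_param
        }
    }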
@@ -543,7 +549,7 @@ extern "C" {
  // op params - allocated as int32_t for alignment
  int32_t op_params[GGML_MAX_OP_PARAMS / sizeof(int32_t)];
 
- bool is_param;
+ int32_t flags;
 
  struct ggml_tensor * grad;
  struct ggml_tensor * src[GGML_MAX_SRC];
@@ -567,6 +573,11 @@ extern "C" {
 
  static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
 
+ // Abort callback
+ // If not NULL, called before ggml computation
+ // If it returns true, the computation is aborted
+ typedef bool (*ggml_abort_callback)(void * data);
+
  // the compute plan that needs to be prepared for ggml_graph_compute()
  // since https://github.com/ggerganov/ggml/issues/287
  struct ggml_cplan {
@@ -576,8 +587,8 @@ extern "C" {
 
  int n_threads;
  // abort ggml_graph_compute when true
- bool (*abort_callback)(void * data);
- void * abort_callback_data;
+ ggml_abort_callback abort_callback;
+ void * abort_callback_data;
  };
 
  enum ggml_cgraph_eval_order {
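With the named ggml_abort_callback typedef, a cancellation hook can be wired into a compute plan like this (sketch, assuming a cplan obtained from ggml_graph_plan; not code from the package):

    #include <stdbool.h>
    #include "ggml.h"

    // Sketch only: returning true asks ggml_graph_compute to stop early.
    static bool should_abort(void * data) {
        return *(volatile bool *) data;
    }

    static void attach_abort(struct ggml_cplan * cplan, bool * stop_flag) {
        cplan->abort_callback      = should_abort;
        cplan->abort_callback_data = stop_flag;
    }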
@@ -2087,6 +2098,12 @@ extern "C" {
  ggml_opt_callback callback,
  void * callback_data);
 
+ //
+ // tensor flags
+ //
+ GGML_API void ggml_set_input(struct ggml_tensor * tensor);
+ GGML_API void ggml_set_output(struct ggml_tensor * tensor);
+
  //
  // quantization
  //
@@ -2273,6 +2290,7 @@ extern "C" {
  GGML_API int ggml_cpu_has_ssse3 (void);
  GGML_API int ggml_cpu_has_sycl (void);
  GGML_API int ggml_cpu_has_vsx (void);
+ GGML_API int ggml_cpu_has_matmul_int8(void);
 
  //
  // Internal types and functions exposed for tests and benchmarks
@@ -2286,7 +2304,8 @@ extern "C" {
 #endif
  typedef void (*ggml_to_float_t) (const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
  typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
- typedef void (*ggml_vec_dot_t) (const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT x, const void * GGML_RESTRICT y);
+ typedef void (*ggml_vec_dot_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx,
+ const void * GGML_RESTRICT y, size_t by, int nrc);
 
  typedef struct {
  const char * type_name;
@@ -2298,6 +2317,7 @@ extern "C" {
  ggml_from_float_t from_float_reference;
  ggml_vec_dot_t vec_dot;
  enum ggml_type vec_dot_type;
+ int64_t nrows; // number of rows to process simultaneously;
  } ggml_type_traits_t;
 
  GGML_API ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type);
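The nrows field added to ggml_type_traits_t tells callers how many rows a type's vec_dot kernel can produce per call (2 for the i8mm-enabled quantized types in this release, 1 otherwise). A sketch of reading the traits through the test/benchmark entry point declared above (assumes suitably quantized x/y buffers prepared elsewhere; not code from the package):

    #include "ggml.h"

    // Sketch only: single-row dot product through the per-type traits table,
    // using the widened signature (zero strides, one row).
    static float dot_one_row(enum ggml_type type, int n, const void * x, const void * y) {
        ggml_type_traits_t traits = ggml_internal_get_type_traits(type);
        float result = 0.0f;
        if (traits.vec_dot != NULL) {
            // traits.nrows would be consulted before batching more rows per call
            traits.vec_dot(n, &result, 0, x, 0, y, 0, 1);
        }
        return result;
    }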