llama_cpp 0.12.5 → 0.12.6

@@ -428,8 +428,8 @@ int64_t ggml_cycles_per_ms(void) {
 
 static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float);
 
-static void ggml_vec_dot_f32(const int n, float * restrict s, const float * restrict x, const float * restrict y);
-static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t * restrict x, ggml_fp16_t * restrict y);
+static void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float * restrict x, size_t bx, const float * restrict y, size_t by, int nrc);
+static void ggml_vec_dot_f16(int n, float * restrict s, size_t bs, ggml_fp16_t * restrict x, size_t bx, ggml_fp16_t * restrict y, size_t by, int nrc);
 
 static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
     [GGML_TYPE_I8] = {
@@ -457,6 +457,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .is_quantized = false,
         .vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f32,
         .vec_dot_type = GGML_TYPE_F32,
+        .nrows = 1,
     },
     [GGML_TYPE_F16] = {
         .type_name = "f16",
@@ -468,6 +469,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .from_float_reference = (ggml_from_float_t) ggml_fp32_to_fp16_row,
         .vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f16,
         .vec_dot_type = GGML_TYPE_F16,
+        .nrows = 1,
     },
     [GGML_TYPE_Q4_0] = {
         .type_name = "q4_0",
@@ -479,6 +481,11 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .from_float_reference = (ggml_from_float_t) quantize_row_q4_0_reference,
         .vec_dot = ggml_vec_dot_q4_0_q8_0,
         .vec_dot_type = GGML_TYPE_Q8_0,
+#if defined (__ARM_FEATURE_MATMUL_INT8)
+        .nrows = 2,
+#else
+        .nrows = 1,
+#endif
     },
     [GGML_TYPE_Q4_1] = {
         .type_name = "q4_1",
@@ -490,6 +497,11 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .from_float_reference = (ggml_from_float_t) quantize_row_q4_1_reference,
         .vec_dot = ggml_vec_dot_q4_1_q8_1,
         .vec_dot_type = GGML_TYPE_Q8_1,
+#if defined (__ARM_FEATURE_MATMUL_INT8)
+        .nrows = 2,
+#else
+        .nrows = 1,
+#endif
     },
     [4] = { // GGML_TYPE_Q4_2
         .type_name = "DEPRECATED",
@@ -501,6 +513,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .from_float_reference = NULL,
         .vec_dot = NULL,
         .vec_dot_type = GGML_TYPE_COUNT,
+        .nrows = 1,
     },
     [5] = { // GGML_TYPE_Q4_3
         .type_name = "DEPRECATED",
@@ -512,6 +525,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .from_float_reference = NULL,
         .vec_dot = NULL,
         .vec_dot_type = GGML_TYPE_COUNT,
+        .nrows = 1,
     },
     [GGML_TYPE_Q5_0] = {
         .type_name = "q5_0",
@@ -523,6 +537,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .from_float_reference = (ggml_from_float_t) quantize_row_q5_0_reference,
         .vec_dot = ggml_vec_dot_q5_0_q8_0,
         .vec_dot_type = GGML_TYPE_Q8_0,
+        .nrows = 1,
     },
     [GGML_TYPE_Q5_1] = {
         .type_name = "q5_1",
@@ -534,6 +549,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .from_float_reference = (ggml_from_float_t) quantize_row_q5_1_reference,
         .vec_dot = ggml_vec_dot_q5_1_q8_1,
         .vec_dot_type = GGML_TYPE_Q8_1,
+        .nrows = 1,
     },
     [GGML_TYPE_Q8_0] = {
         .type_name = "q8_0",
@@ -545,6 +561,11 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .from_float_reference = (ggml_from_float_t) quantize_row_q8_0_reference,
         .vec_dot = ggml_vec_dot_q8_0_q8_0,
         .vec_dot_type = GGML_TYPE_Q8_0,
+#if defined (__ARM_FEATURE_MATMUL_INT8)
+        .nrows = 2,
+#else
+        .nrows = 1,
+#endif
     },
     [GGML_TYPE_Q8_1] = {
         .type_name = "q8_1",
@@ -554,6 +575,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .from_float = quantize_row_q8_1,
         .from_float_reference = (ggml_from_float_t) quantize_row_q8_1_reference,
         .vec_dot_type = GGML_TYPE_Q8_1,
+        .nrows = 1,
     },
     [GGML_TYPE_Q2_K] = {
         .type_name = "q2_K",
@@ -565,6 +587,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .from_float_reference = (ggml_from_float_t) quantize_row_q2_K_reference,
         .vec_dot = ggml_vec_dot_q2_K_q8_K,
         .vec_dot_type = GGML_TYPE_Q8_K,
+        .nrows = 1,
     },
     [GGML_TYPE_Q3_K] = {
         .type_name = "q3_K",
@@ -576,6 +599,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .from_float_reference = (ggml_from_float_t) quantize_row_q3_K_reference,
         .vec_dot = ggml_vec_dot_q3_K_q8_K,
         .vec_dot_type = GGML_TYPE_Q8_K,
+        .nrows = 1,
     },
     [GGML_TYPE_Q4_K] = {
         .type_name = "q4_K",
@@ -587,6 +611,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .from_float_reference = (ggml_from_float_t) quantize_row_q4_K_reference,
         .vec_dot = ggml_vec_dot_q4_K_q8_K,
         .vec_dot_type = GGML_TYPE_Q8_K,
+        .nrows = 1,
     },
     [GGML_TYPE_Q5_K] = {
         .type_name = "q5_K",
@@ -598,6 +623,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .from_float_reference = (ggml_from_float_t) quantize_row_q5_K_reference,
         .vec_dot = ggml_vec_dot_q5_K_q8_K,
         .vec_dot_type = GGML_TYPE_Q8_K,
+        .nrows = 1,
     },
     [GGML_TYPE_Q6_K] = {
         .type_name = "q6_K",
@@ -609,6 +635,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .from_float_reference = (ggml_from_float_t) quantize_row_q6_K_reference,
         .vec_dot = ggml_vec_dot_q6_K_q8_K,
         .vec_dot_type = GGML_TYPE_Q8_K,
+        .nrows = 1,
     },
     [GGML_TYPE_IQ2_XXS] = {
         .type_name = "iq2_xxs",
@@ -620,6 +647,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .from_float_reference = NULL,
         .vec_dot = ggml_vec_dot_iq2_xxs_q8_K,
         .vec_dot_type = GGML_TYPE_Q8_K,
+        .nrows = 1,
     },
     [GGML_TYPE_IQ2_XS] = {
         .type_name = "iq2_xs",
@@ -631,6 +659,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .from_float_reference = NULL,
         .vec_dot = ggml_vec_dot_iq2_xs_q8_K,
         .vec_dot_type = GGML_TYPE_Q8_K,
+        .nrows = 1,
     },
     [GGML_TYPE_IQ3_XXS] = {
         .type_name = "iq3_xxs",
@@ -642,6 +671,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .from_float_reference = (ggml_from_float_t)quantize_row_iq3_xxs_reference,
         .vec_dot = ggml_vec_dot_iq3_xxs_q8_K,
         .vec_dot_type = GGML_TYPE_Q8_K,
+        .nrows = 1,
     },
     [GGML_TYPE_Q8_K] = {
         .type_name = "q8_K",
@@ -1212,7 +1242,13 @@ inline static void ggml_vec_neg_f32 (const int n, float * y, const float * x)
 inline static void ggml_vec_mul_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]*y[i]; }
 inline static void ggml_vec_div_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]/y[i]; }
 
-static void ggml_vec_dot_f32(const int n, float * restrict s, const float * restrict x, const float * restrict y) {
+static void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float * restrict x, size_t bx, const float * restrict y, size_t by, int nrc) {
+    assert(nrc == 1);
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
+
 #ifdef GGML_SIMD
     float sumf = 0.0f;
     const int np = (n & ~(GGML_F32_STEP - 1));
@@ -1249,7 +1285,13 @@ static void ggml_vec_dot_f32(const int n, float * restrict s, const float * rest
     *s = sumf;
 }
 
-static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t * restrict x, ggml_fp16_t * restrict y) {
+static void ggml_vec_dot_f16(int n, float * restrict s, size_t bs, ggml_fp16_t * restrict x, size_t bx, ggml_fp16_t * restrict y, size_t by, int nrc) {
+    assert(nrc == 1);
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
+
     ggml_float sumf = 0.0;
 
 #if defined(GGML_SIMD)
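The dot-product kernels gain three stride arguments and a row count: bs is the element stride between results written to s, bx and by are byte strides between consecutive rows of x and y, and nrc says how many rows to process in one call. In this release the F32 and F16 kernels above just assert nrc == 1 and ignore the strides; the multi-row path is only taken by the ARM __ARM_FEATURE_MATMUL_INT8 (mmla) quantized kernels, which live outside these hunks. Judging from how the updated ggml_compute_forward_mul_mat further down passes and scatters the results, a call with nrc == 2 fills a 2x2 tile of dot products. A minimal plain-C sketch of that calling convention (a hypothetical helper, not the library kernel):

#include <stddef.h>
#include <stdio.h>

// s[j*bs + i] = dot(row i of x, row j of y) for i, j < nrc;
// with nrc == 1 and zero strides this is the old single dot product
static void vec_dot_f32_tile(int n, float * s, size_t bs,
                             const float * x, size_t bx,
                             const float * y, size_t by, int nrc) {
    for (int j = 0; j < nrc; ++j) {
        const float * yj = (const float *) ((const char *) y + j*by);
        for (int i = 0; i < nrc; ++i) {
            const float * xi = (const float *) ((const char *) x + i*bx);
            float sum = 0.0f;
            for (int k = 0; k < n; ++k) {
                sum += xi[k]*yj[k];
            }
            s[j*bs + i] = sum;
        }
    }
}

int main(void) {
    const float x[8] = {1, 2, 3, 4,  5, 6, 7, 8};  // rows x0, x1
    const float y[8] = {1, 1, 1, 1,  2, 2, 2, 2};  // rows y0, y1
    float tile[4];
    vec_dot_f32_tile(4, tile, 2, x, 4*sizeof(float), y, 4*sizeof(float), 2);
    printf("%.0f %.0f %.0f %.0f\n", tile[0], tile[1], tile[2], tile[3]); // 10 26 20 52
    return 0;
}

Every non-matmul call site later in the diff was updated mechanically to the single-row form, passing zeros for the strides and 1 for nrc.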
@@ -1455,7 +1497,7 @@ inline static void ggml_vec_scale_f32(const int n, float * y, const float v) {
 #endif
 }
 
-inline static void ggml_vec_norm_f32 (const int n, float * s, const float * x) { ggml_vec_dot_f32(n, s, x, x); *s = sqrtf(*s); }
+inline static void ggml_vec_norm_f32 (const int n, float * s, const float * x) { ggml_vec_dot_f32(n, s, 0, x, 0, x, 0, 1); *s = sqrtf(*s); }
 inline static void ggml_vec_sqr_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i]*x[i]; }
 inline static void ggml_vec_sqrt_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = sqrtf(x[i]); }
 inline static void ggml_vec_log_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = logf(x[i]); }
@@ -2607,7 +2649,7 @@ static struct ggml_tensor * ggml_new_tensor_impl(
         /*.nb =*/ { 0, 0, 0, 0 },
         /*.op =*/ GGML_OP_NONE,
         /*.op_params =*/ { 0 },
-        /*.is_param =*/ false,
+        /*.flags =*/ 0,
         /*.grad =*/ NULL,
         /*.src =*/ { NULL },
         /*.perf_runs =*/ 0,
@@ -6509,7 +6551,7 @@ struct ggml_tensor * ggml_cross_entropy_loss_back(
 void ggml_set_param(
         struct ggml_context * ctx,
         struct ggml_tensor * tensor) {
-    tensor->is_param = true;
+    tensor->flags |= GGML_TENSOR_FLAG_PARAM;
 
     GGML_ASSERT(tensor->grad == NULL);
     tensor->grad = ggml_dup_tensor(ctx, tensor);
@@ -9992,6 +10034,7 @@ static void ggml_compute_forward_mul_mat(
     ggml_vec_dot_t const vec_dot = type_traits[type].vec_dot;
     enum ggml_type const vec_dot_type = type_traits[type].vec_dot_type;
     ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float;
+    int64_t const vec_dot_num_rows = type_traits[type].nrows;
 
     GGML_ASSERT(ne0 == ne01);
     GGML_ASSERT(ne1 == ne11);
@@ -10159,12 +10202,23 @@ static void ggml_compute_forward_mul_mat(
     const int64_t blck_0 = 16;
     const int64_t blck_1 = 16;
 
+    // dot kernels can handle 1 row and col at a time, but mmla kernels can process 2 rows and cols
+    int64_t nrc = vec_dot_num_rows;
+    // TODO: currently the mmla kernels support only even numbered rows/cols.
+    // this check can be removed once they are extended to support odd numbered rows/cols too
+    if ((nr0 % 2 != 0) || (ne11 % 2 != 0)) {
+        nrc = 1;
+    }
+
+    const size_t src1_col_stride = src1_cont || src1->type != vec_dot_type ? row_size : nb11;
+
     // attempt to reduce false-sharing (does not seem to make a difference)
-    float tmp[16];
+    // 16 * 2, accounting for mmla kernels
+    float tmp[32];
 
     for (int64_t iir1 = ir110; iir1 < ir111; iir1 += blck_1) {
         for (int64_t iir0 = ir010; iir0 < ir011; iir0 += blck_0) {
-            for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir111; ++ir1) {
+            for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir111; ir1 += nrc) {
                 const int64_t i13 = (ir1/(ne12*ne1));
                 const int64_t i12 = (ir1 - i13*ne12*ne1)/ne1;
                 const int64_t i11 = (ir1 - i13*ne12*ne1 - i12*ne1);
@@ -10187,17 +10241,19 @@ static void ggml_compute_forward_mul_mat(
                     (src1_cont || src1->type != vec_dot_type
                      ? (i11 + i12*ne11 + i13*ne12*ne11)*row_size
                      : (i11*nb11 + i12*nb12 + i13*nb13));
-
                 float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3));
 
                 //for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) {
                 //    vec_dot(ne00, &dst_col[ir0], src0_row + ir0*nb01, src1_col);
                 //}
 
-                for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) {
-                    vec_dot(ne00, &tmp[ir0 - iir0], src0_row + ir0*nb01, src1_col);
+                for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ir0 += nrc) {
+                    vec_dot(ne00, &tmp[ir0 - iir0], (nrc>1 ? 16 : 0), src0_row + ir0*nb01, (nrc>1 ? nb01 : 0), src1_col, (nrc>1 ? src1_col_stride : 0), nrc);
+                }
+
+                for (int cn = 0; cn < nrc; ++cn) {
+                    memcpy(&dst_col[iir0 + cn*nb1/nb0], tmp + (cn*16), (MIN(iir0 + blck_0, ir011) - iir0)*sizeof(float));
                 }
-                memcpy(&dst_col[iir0], tmp, (MIN(iir0 + blck_0, ir011) - iir0)*sizeof(float));
             }
         }
     }
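When the type's nrows is 2 (ARM i8mm builds) and both the output row count nr0 and ne11 are even, the two inner loops advance by two and each vec_dot call writes a 2x2 tile into the enlarged tmp[32] buffer, 16 entries apart per destination column. The new cn loop then copies column cn of that buffer into the dst column cn*nb1/nb0 floats beyond dst_col. A small standalone illustration of that layout and scatter, reusing the tile values from the sketch above (illustrative arrays, not ggml code):

#include <stdio.h>
#include <string.h>

int main(void) {
    // pretend one vec_dot call with nrc == 2 and bs == 16 just wrote this tile
    float tmp[32] = {0};
    tmp[0]  = 10.0f; tmp[1]  = 26.0f;   // column 0: dot(x0,y0), dot(x1,y0)
    tmp[16] = 20.0f; tmp[17] = 52.0f;   // column 1: dot(x0,y1), dot(x1,y1)

    float dst[2][2];                    // two destination columns, two rows each
    const int nrc = 2;
    for (int cn = 0; cn < nrc; ++cn) {
        // the real code reaches the next dst column via cn*nb1/nb0;
        // here the columns are simply dst[0] and dst[1]
        memcpy(&dst[cn][0], tmp + cn*16, 2*sizeof(float));
    }
    printf("%.0f %.0f | %.0f %.0f\n", dst[0][0], dst[0][1], dst[1][0], dst[1][1]); // 10 26 | 20 52
    return 0;
}

If either count is odd, nrc falls back to 1 and the loop behaves exactly like the previous code, one dot product and one destination column at a time.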
@@ -10386,7 +10442,7 @@ static void ggml_compute_forward_mul_mat_id(
                 //}
 
                 for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) {
-                    vec_dot(ne00, &tmp[ir0 - iir0], src0_row + ir0*nb01, src1_col);
+                    vec_dot(ne00, &tmp[ir0 - iir0], 0, src0_row + ir0*nb01, 0, src1_col, 0, 1);
                 }
                 memcpy(&dst_col[iir0], tmp, (MIN(iir0 + blck_0, ir011) - iir0)*sizeof(float));
             }
@@ -11568,7 +11624,7 @@ static void ggml_compute_forward_soft_max_back_f32(
 
         // linear runtime, no additional memory
        float dot_y_dy = 0;
-        ggml_vec_dot_f32 (nc, &dot_y_dy, y, dy);
+        ggml_vec_dot_f32 (nc, &dot_y_dy, 0, y, 0, dy, 0, 1);
         ggml_vec_cpy_f32 (nc, dx, dy);
         ggml_vec_acc1_f32(nc, dx, -dot_y_dy);
         ggml_vec_mul_f32 (nc, dx, dx, y);
@@ -12369,9 +12425,9 @@ static void ggml_compute_forward_conv_transpose_1d_f16_f32(
             const int i1n = i10*ne11;
             for (int i00 = 0; i00 < ne00; i00++) {
                 float v = 0;
-                ggml_vec_dot_f16(ne02, &v,
-                        (ggml_fp16_t *) wdata_src + i1n,
-                        (ggml_fp16_t *) wdata_kernel + i00*ne02);
+                ggml_vec_dot_f16(ne02, &v, 0,
+                        (ggml_fp16_t *) wdata_src + i1n, 0,
+                        (ggml_fp16_t *) wdata_kernel + i00*ne02, 0, 1);
                 dst_data[i10*s0 + i00] += v;
             }
         }
@@ -12466,9 +12522,9 @@ static void ggml_compute_forward_conv_transpose_1d_f32(
             const int i1n = i10*ne11;
             for (int i00 = 0; i00 < ne00; i00++) {
                 float v = 0;
-                ggml_vec_dot_f32(ne02, &v,
-                        wdata_src + i1n,
-                        wdata_kernel + i00*ne02);
+                ggml_vec_dot_f32(ne02, &v, 0,
+                        wdata_src + i1n, 0,
+                        wdata_kernel + i00*ne02, 0, 1);
                 dst_data[i10*s0 + i00] += v;
             }
         }
@@ -12783,9 +12839,9 @@ static void ggml_compute_forward_conv_transpose_2d(
             for (int i01 = 0; i01 < ne01; i01++) {
                 for (int i00 = 0; i00 < ne00; i00++) {
                     float v = 0;
-                    ggml_vec_dot_f16(ne03, &v,
-                            wdata_src + i1n,
-                            wdata_kernel + i01*ne00*ne03 + i00*ne03);
+                    ggml_vec_dot_f16(ne03, &v, 0,
+                            wdata_src + i1n, 0,
+                            wdata_kernel + i01*ne00*ne03 + i00*ne03, 0, 1);
                     dst_data[(i11*stride + i01)*ne0 + i10*stride + i00] += v;
                 }
             }
@@ -13214,9 +13270,9 @@ static void ggml_compute_forward_flash_attn_f32(
                 const int i1 = ik1;
 
                 ggml_vec_dot_f32(neq0,
-                        S + i1,
-                        (float *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)),
-                        (float *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)));
+                        S + i1, 0,
+                        (float *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)), 0,
+                        (float *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)), 0, 1);
             }
 
             // scale
@@ -13299,9 +13355,9 @@ static void ggml_compute_forward_flash_attn_f32(
                 const int iv3 = iq3;
 
                 ggml_vec_dot_f32(masked_begin,
-                        (float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)),
-                        (float *) ((char *) v->data + ( ic*nbv1 + iv2*nbv2 + iv3*nbv3)),
-                        S);
+                        (float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)), 0,
+                        (float *) ((char *) v->data + ( ic*nbv1 + iv2*nbv2 + iv3*nbv3)), 0,
+                        S, 0, 1);
             }
         }
     }
@@ -13404,9 +13460,9 @@ static void ggml_compute_forward_flash_attn_f16(
                 const int i1 = ik1;
 
                 ggml_vec_dot_f16(neq0,
-                        S + i1,
-                        (ggml_fp16_t *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)),
-                        (ggml_fp16_t *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)));
+                        S + i1, 0,
+                        (ggml_fp16_t *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)), 0,
+                        (ggml_fp16_t *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)), 0, 1);
             }
         } else {
             for (int64_t ic = 0; ic < nek1; ic += GGML_VEC_DOT_UNROLL) {
@@ -13508,9 +13564,9 @@ static void ggml_compute_forward_flash_attn_f16(
                 const int iv3 = iq3;
 
                 ggml_vec_dot_f16(nev0,
-                        (float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)),
-                        (ggml_fp16_t *) ((char *) v->data + ( ic*nbv1 + iv2*nbv2 + iv3*nbv3)),
-                        S16);
+                        (float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)), 0,
+                        (ggml_fp16_t *) ((char *) v->data + ( ic*nbv1 + iv2*nbv2 + iv3*nbv3)), 0,
+                        S16, 0, 1);
             }
         } else {
             for (int64_t ic = 0; ic < nev1; ic += GGML_VEC_DOT_UNROLL) {
@@ -13652,9 +13708,9 @@ static void ggml_compute_forward_flash_ff_f16(
                 const int i1 = ib01;
 
                 ggml_vec_dot_f16(nea0,
-                        S + i1,
-                        (ggml_fp16_t *) ((char *) b0->data + (ib01*nbb01 + ib02*nbb02 + ib03*nbb03)),
-                        (ggml_fp16_t *) ((char *) a->data + ( ia1*nba1 + ia2*nba2 + ia3*nba3)));
+                        S + i1, 0,
+                        (ggml_fp16_t *) ((char *) b0->data + (ib01*nbb01 + ib02*nbb02 + ib03*nbb03)), 0,
+                        (ggml_fp16_t *) ((char *) a->data + ( ia1*nba1 + ia2*nba2 + ia3*nba3)), 0, 1);
             }
 
             ggml_vec_add_f32(neb01, S, S, (float *) b1->data);
@@ -13677,9 +13733,9 @@ static void ggml_compute_forward_flash_ff_f16(
             for (int64_t ic = 0; ic < nec01; ++ic) {
 
                 ggml_vec_dot_f16(neb01,
-                        (float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)),
-                        (ggml_fp16_t *) ((char *) c0->data + ( ic*nbc01 + i2*nbc02 + i3*nbc03)),
-                        S16);
+                        (float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)), 0,
+                        (ggml_fp16_t *) ((char *) c0->data + ( ic*nbc01 + i2*nbc02 + i3*nbc03)), 0,
+                        S16, 0, 1);
             }
 
             ggml_vec_add_f32(nec01,
@@ -13866,9 +13922,9 @@ static void ggml_compute_forward_flash_attn_back_f32(
                 const int i1 = ik1;
 
                 ggml_vec_dot_f32(neq0,
-                        S + i1,
-                        (float *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)),
-                        (float *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)));
+                        S + i1, 0,
+                        (float *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)), 0,
+                        (float *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)), 0, 1);
             }
 
             // scale
@@ -14013,7 +14069,7 @@ static void ggml_compute_forward_flash_attn_back_f32(
 
                     // S = SM * (S - dot(SM, S))
                     float dot_SM_gradSM = 0;
-                    ggml_vec_dot_f32 (masked_begin, &dot_SM_gradSM, SM, S);
+                    ggml_vec_dot_f32 (masked_begin, &dot_SM_gradSM, 0, SM, 0, S, 0, 1);
                     ggml_vec_acc1_f32(M, S, -dot_SM_gradSM);
                     ggml_vec_mul_f32 (masked_begin, S, S, SM);
 
@@ -15311,7 +15367,7 @@ static struct ggml_tensor * ggml_recompute_graph_node(
         return NULL;
     }
 
-    if (node->is_param) {
+    if (node->flags & GGML_TENSOR_FLAG_PARAM) {
         return node;
     }
 
@@ -15345,7 +15401,7 @@ static struct ggml_tensor * ggml_recompute_graph_node(
 
     clone->op = node->op;
     clone->grad = node->grad;
-    clone->is_param = node->is_param;
+    clone->flags = node->flags;
     clone->extra = node->extra;
     for (int k = 0; k < GGML_MAX_DIMS; ++k) {
         clone->nb[k] = node->nb[k];
@@ -16377,7 +16433,7 @@ void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph *
     for (int i = 0; i < gf->n_nodes; i++) {
         struct ggml_tensor * node = gf->nodes[i];
 
-        if (node->is_param) {
+        if (node->flags & GGML_TENSOR_FLAG_PARAM) {
             GGML_PRINT_DEBUG("%s: found root node %p\n", __func__, (void *) node);
             ggml_build_forward_expand(gb, node->grad);
         }
@@ -16649,7 +16705,7 @@ struct ggml_compute_state_shared {
     atomic_int node_n; // active graph node
     atomic_int node_task; // active graph node task phase
 
-    bool (*abort_callback)(void * data); // abort ggml_graph_compute when true
+    ggml_abort_callback abort_callback; // abort ggml_graph_compute when true
     void * abort_callback_data;
 };
 
@@ -17862,7 +17918,7 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) {
         GGML_PRINT(" - %3d: [ %5" PRId64 ", %5" PRId64 ", %5" PRId64 "] %16s %s (%3d) cpu = %7.3f / %7.3f ms, wall = %7.3f / %7.3f ms\n",
                 i,
                 node->ne[0], node->ne[1], node->ne[2],
-                ggml_op_name(node->op), node->is_param ? "x" : node->grad ? "g" : " ", node->perf_runs,
+                ggml_op_name(node->op), (node->flags & GGML_TENSOR_FLAG_PARAM) ? "x" : node->grad ? "g" : " ", node->perf_runs,
                 (double) node->perf_cycles / (double) ggml_cycles_per_ms(),
                 (double) node->perf_cycles / (double) ggml_cycles_per_ms() / (double) node->perf_runs,
                 (double) node->perf_time_us / 1000.0,
@@ -17955,7 +18011,7 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
             continue;
         }
 
-        if (node->is_param) {
+        if (node->flags & GGML_TENSOR_FLAG_PARAM) {
             snprintf(color, sizeof(color), "yellow");
         } else if (node->grad) {
             if (ggml_graph_find(gf, node)) {
@@ -18129,7 +18185,7 @@ static enum ggml_opt_result ggml_opt_adam(
     int np = 0;
     int64_t nx = 0;
     for (int i = 0; i < gf->n_nodes; ++i) {
-        if (gf->nodes[i]->is_param) {
+        if (gf->nodes[i]->flags & GGML_TENSOR_FLAG_PARAM) {
             GGML_PRINT_DEBUG("found param %d: grad->op = %d\n", np, gf->nodes[i]->grad->op);
 
             GGML_ASSERT(np < GGML_MAX_PARAMS);
@@ -18382,7 +18438,7 @@ static enum ggml_opt_result linesearch_backtracking(
     }
 
     // compute the initial gradient in the search direction
-    ggml_vec_dot_f32(nx, &dginit, g, d);
+    ggml_vec_dot_f32(nx, &dginit, 0, g, 0, d, 0, 1);
 
     // make sure that d points to a descent direction
     if (0 < dginit) {
@@ -18432,7 +18488,7 @@ static enum ggml_opt_result linesearch_backtracking(
             return count;
         }
 
-        ggml_vec_dot_f32(nx, &dg, g, d);
+        ggml_vec_dot_f32(nx, &dg, 0, g, 0, d, 0, 1);
 
         // check the Wolfe condition
         if (dg < params->lbfgs.wolfe * dginit) {
@@ -18492,7 +18548,7 @@ static enum ggml_opt_result ggml_opt_lbfgs(
     int np = 0;
     int nx = 0;
    for (int i = 0; i < gf->n_nodes; ++i) {
-        if (gf->nodes[i]->is_param) {
+        if (gf->nodes[i]->flags & GGML_TENSOR_FLAG_PARAM) {
             GGML_PRINT_DEBUG("found param %d: grad->op = %d\n", np, gf->nodes[i]->grad->op);
 
             GGML_ASSERT(np < GGML_MAX_PARAMS);
@@ -18693,8 +18749,8 @@ static enum ggml_opt_result ggml_opt_lbfgs(
         // ys = y^t \cdot s -> 1 / \rho.
         // yy = y^t \cdot y.
         //
-        ggml_vec_dot_f32(nx, &ys, &lm_y[end[0]*nx], &lm_s[end[0]*nx]);
-        ggml_vec_dot_f32(nx, &yy, &lm_y[end[0]*nx], &lm_y[end[0]*nx]);
+        ggml_vec_dot_f32(nx, &ys, 0, &lm_y[end[0]*nx], 0, &lm_s[end[0]*nx], 0, 1);
+        ggml_vec_dot_f32(nx, &yy, 0, &lm_y[end[0]*nx], 0, &lm_y[end[0]*nx], 0, 1);
 
         lm_ys[end[0]] = ys;
 
@@ -18713,7 +18769,7 @@ static enum ggml_opt_result ggml_opt_lbfgs(
         for (int i = 0; i < bound; ++i) {
             j[0] = (j[0] + m - 1) % m;
             // \alpha_{j} = \rho_{j} s^{t}_{j} \cdot q_{k+1}
-            ggml_vec_dot_f32(nx, &lm_alpha[j[0]], &lm_s[j[0]*nx], d);
+            ggml_vec_dot_f32(nx, &lm_alpha[j[0]], 0, &lm_s[j[0]*nx], 0, d, 0, 1);
             lm_alpha[j[0]] /= lm_ys[j[0]];
             // q_{i} = q_{i+1} - \alpha_{i} y_{i}
             ggml_vec_mad_f32(nx, d, &lm_y[j[0]*nx], -lm_alpha[j[0]]);
@@ -18723,7 +18779,7 @@ static enum ggml_opt_result ggml_opt_lbfgs(
 
         for (int i = 0; i < bound; ++i) {
             // \beta_{j} = \rho_{j} y^t_{j} \cdot \gamma_{i}
-            ggml_vec_dot_f32(nx, &beta, &lm_y[j[0]*nx], d);
+            ggml_vec_dot_f32(nx, &beta, 0, &lm_y[j[0]*nx], 0, d, 0, 1);
             beta /= lm_ys[j[0]];
             // \gamma_{i+1} = \gamma_{i} + (\alpha_{j} - \beta_{j}) s_{j}
             ggml_vec_mad_f32(nx, d, &lm_s[j[0]*nx], lm_alpha[j[0]] - beta);
@@ -18967,6 +19023,16 @@ enum ggml_opt_result ggml_opt_resume_g(
 
 ////////////////////////////////////////////////////////////////////////////////
 
+void ggml_set_input(struct ggml_tensor * tensor) {
+    tensor->flags |= GGML_TENSOR_FLAG_INPUT;
+}
+
+void ggml_set_output(struct ggml_tensor * tensor) {
+    tensor->flags |= GGML_TENSOR_FLAG_OUTPUT;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
 void ggml_quantize_init(enum ggml_type type) {
     ggml_critical_section_start();
 
@@ -20611,4 +20677,12 @@ int ggml_cpu_has_vsx(void) {
 #endif
 }
 
+int ggml_cpu_has_matmul_int8(void) {
+#if defined(__ARM_FEATURE_MATMUL_INT8)
+    return 1;
+#else
+    return 0;
+#endif
+}
+
 ////////////////////////////////////////////////////////////////////////////////
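The new probe mirrors the existing ggml_cpu_has_* helpers and reports whether the build was compiled with ARM's int8 matrix-multiply extension. A hedged usage sketch (assuming ggml.h is on the include path):

#include <stdio.h>
#include "ggml.h"

int main(void) {
    // 1 when compiled with __ARM_FEATURE_MATMUL_INT8, 0 otherwise
    printf("ARM int8 matmul (i8mm): %d\n", ggml_cpu_has_matmul_int8());
    return 0;
}

The remaining hunks are against the public header (ggml.h), where the matching declarations, typedefs and struct fields change.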
@@ -505,11 +505,17 @@ extern "C" {
 
     enum ggml_log_level {
         GGML_LOG_LEVEL_ERROR = 2,
-        GGML_LOG_LEVEL_WARN = 3,
-        GGML_LOG_LEVEL_INFO = 4,
+        GGML_LOG_LEVEL_WARN  = 3,
+        GGML_LOG_LEVEL_INFO  = 4,
         GGML_LOG_LEVEL_DEBUG = 5
     };
 
+    enum ggml_tensor_flag {
+        GGML_TENSOR_FLAG_INPUT = 1,
+        GGML_TENSOR_FLAG_OUTPUT = 2,
+        GGML_TENSOR_FLAG_PARAM = 4,
+    };
+
     // ggml object
     struct ggml_object {
         size_t offs;
@@ -543,7 +549,7 @@ extern "C" {
         // op params - allocated as int32_t for alignment
         int32_t op_params[GGML_MAX_OP_PARAMS / sizeof(int32_t)];
 
-        bool is_param;
+        int32_t flags;
 
         struct ggml_tensor * grad;
         struct ggml_tensor * src[GGML_MAX_SRC];
@@ -567,6 +573,11 @@ extern "C" {
 
     static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
 
+    // Abort callback
+    // If not NULL, called before ggml computation
+    // If it returns true, the computation is aborted
+    typedef bool (*ggml_abort_callback)(void * data);
+
     // the compute plan that needs to be prepared for ggml_graph_compute()
     // since https://github.com/ggerganov/ggml/issues/287
     struct ggml_cplan {
@@ -576,8 +587,8 @@ extern "C" {
         int n_threads;
 
         // abort ggml_graph_compute when true
-        bool (*abort_callback)(void * data);
-        void * abort_callback_data;
+        ggml_abort_callback abort_callback;
+        void *              abort_callback_data;
     };
 
     enum ggml_cgraph_eval_order {
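The raw function-pointer field in ggml_cplan (and in the internal ggml_compute_state_shared earlier in the diff) is replaced by the named ggml_abort_callback typedef; a callback that returns true makes graph computation stop early. A sketch of a conforming callback; the plan/graph wiring in the trailing comment is illustrative, not taken from this diff:

#include <stdbool.h>
#include "ggml.h"

// matches: typedef bool (*ggml_abort_callback)(void * data);
// aborts once the flag pointed to by `data` is set, e.g. from a signal handler
static bool should_abort(void * data) {
    return *(volatile bool *) data;
}

// given a graph gf and a thread count, the callback is attached to the plan:
//     struct ggml_cplan plan = ggml_graph_plan(gf, n_threads);
//     plan.abort_callback      = should_abort;
//     plan.abort_callback_data = &abort_flag;
//     ggml_graph_compute(gf, &plan);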
@@ -2087,6 +2098,12 @@ extern "C" {
             ggml_opt_callback callback,
             void * callback_data);
 
+    //
+    // tensor flags
+    //
+    GGML_API void ggml_set_input(struct ggml_tensor * tensor);
+    GGML_API void ggml_set_output(struct ggml_tensor * tensor);
+
     //
     // quantization
     //
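ggml_set_param already existed; the new ggml_set_input and ggml_set_output just set bits in the tensor's new flags field, so graph consumers can tell which tensors are user-facing endpoints. A hedged sketch of marking a small graph (the surrounding calls are long-standing ggml API, but the graph itself is made up):

#include "ggml.h"

struct ggml_tensor * build_marked_graph(struct ggml_context * ctx, struct ggml_tensor * w) {
    struct ggml_tensor * x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, w->ne[0]);
    ggml_set_input(x);                 // x->flags |= GGML_TENSOR_FLAG_INPUT
    ggml_set_param(ctx, w);            // w->flags |= GGML_TENSOR_FLAG_PARAM, grad tensor allocated
    struct ggml_tensor * out = ggml_mul_mat(ctx, w, x);
    ggml_set_output(out);              // out->flags |= GGML_TENSOR_FLAG_OUTPUT
    return out;
}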
@@ -2273,6 +2290,7 @@ extern "C" {
     GGML_API int ggml_cpu_has_ssse3 (void);
     GGML_API int ggml_cpu_has_sycl (void);
     GGML_API int ggml_cpu_has_vsx (void);
+    GGML_API int ggml_cpu_has_matmul_int8(void);
 
     //
     // Internal types and functions exposed for tests and benchmarks
@@ -2286,7 +2304,8 @@ extern "C" {
 #endif
     typedef void (*ggml_to_float_t) (const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
     typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
-    typedef void (*ggml_vec_dot_t) (const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT x, const void * GGML_RESTRICT y);
+    typedef void (*ggml_vec_dot_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx,
+                                    const void * GGML_RESTRICT y, size_t by, int nrc);
 
     typedef struct {
         const char * type_name;
@@ -2298,6 +2317,7 @@ extern "C" {
         ggml_from_float_t from_float_reference;
         ggml_vec_dot_t vec_dot;
         enum ggml_type vec_dot_type;
+        int64_t nrows; // number of rows to process simultaneously;
     } ggml_type_traits_t;
 
     GGML_API ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type);
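The per-type nrows field tells the mul_mat dispatcher how many rows the type's vec_dot kernel produces per call: 2 for Q4_0, Q4_1 and Q8_0 when built with __ARM_FEATURE_MATMUL_INT8, 1 for everything else. It can be read through the traits accessor declared on the last line; a short sketch:

#include <stdio.h>
#include <inttypes.h>
#include "ggml.h"

int main(void) {
    ggml_type_traits_t t = ggml_internal_get_type_traits(GGML_TYPE_Q4_0);
    printf("%s: vec_dot handles %" PRId64 " row(s) per call\n", t.type_name, t.nrows);
    return 0;
}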