llama_cpp 0.12.4 → 0.12.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -428,8 +428,8 @@ int64_t ggml_cycles_per_ms(void) {
428
428
 
429
429
  static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float);
430
430
 
431
- static void ggml_vec_dot_f32(const int n, float * restrict s, const float * restrict x, const float * restrict y);
432
- static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t * restrict x, ggml_fp16_t * restrict y);
431
+ static void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float * restrict x, size_t bx, const float * restrict y, size_t by, int nrc);
432
+ static void ggml_vec_dot_f16(int n, float * restrict s, size_t bs, ggml_fp16_t * restrict x, size_t bx, ggml_fp16_t * restrict y, size_t by, int nrc);
433
433
 
434
434
  static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
435
435
  [GGML_TYPE_I8] = {
@@ -457,6 +457,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
457
457
  .is_quantized = false,
458
458
  .vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f32,
459
459
  .vec_dot_type = GGML_TYPE_F32,
460
+ .nrows = 1,
460
461
  },
461
462
  [GGML_TYPE_F16] = {
462
463
  .type_name = "f16",
@@ -468,6 +469,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
468
469
  .from_float_reference = (ggml_from_float_t) ggml_fp32_to_fp16_row,
469
470
  .vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f16,
470
471
  .vec_dot_type = GGML_TYPE_F16,
472
+ .nrows = 1,
471
473
  },
472
474
  [GGML_TYPE_Q4_0] = {
473
475
  .type_name = "q4_0",
@@ -479,6 +481,11 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
479
481
  .from_float_reference = (ggml_from_float_t) quantize_row_q4_0_reference,
480
482
  .vec_dot = ggml_vec_dot_q4_0_q8_0,
481
483
  .vec_dot_type = GGML_TYPE_Q8_0,
484
+ #if defined (__ARM_FEATURE_MATMUL_INT8)
485
+ .nrows = 2,
486
+ #else
487
+ .nrows = 1,
488
+ #endif
482
489
  },
483
490
  [GGML_TYPE_Q4_1] = {
484
491
  .type_name = "q4_1",
@@ -490,6 +497,11 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
490
497
  .from_float_reference = (ggml_from_float_t) quantize_row_q4_1_reference,
491
498
  .vec_dot = ggml_vec_dot_q4_1_q8_1,
492
499
  .vec_dot_type = GGML_TYPE_Q8_1,
500
+ #if defined (__ARM_FEATURE_MATMUL_INT8)
501
+ .nrows = 2,
502
+ #else
503
+ .nrows = 1,
504
+ #endif
493
505
  },
494
506
  [4] = { // GGML_TYPE_Q4_2
495
507
  .type_name = "DEPRECATED",
@@ -501,6 +513,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
501
513
  .from_float_reference = NULL,
502
514
  .vec_dot = NULL,
503
515
  .vec_dot_type = GGML_TYPE_COUNT,
516
+ .nrows = 1,
504
517
  },
505
518
  [5] = { // GGML_TYPE_Q4_3
506
519
  .type_name = "DEPRECATED",
@@ -512,6 +525,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
512
525
  .from_float_reference = NULL,
513
526
  .vec_dot = NULL,
514
527
  .vec_dot_type = GGML_TYPE_COUNT,
528
+ .nrows = 1,
515
529
  },
516
530
  [GGML_TYPE_Q5_0] = {
517
531
  .type_name = "q5_0",
@@ -523,6 +537,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
523
537
  .from_float_reference = (ggml_from_float_t) quantize_row_q5_0_reference,
524
538
  .vec_dot = ggml_vec_dot_q5_0_q8_0,
525
539
  .vec_dot_type = GGML_TYPE_Q8_0,
540
+ .nrows = 1,
526
541
  },
527
542
  [GGML_TYPE_Q5_1] = {
528
543
  .type_name = "q5_1",
@@ -534,6 +549,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
534
549
  .from_float_reference = (ggml_from_float_t) quantize_row_q5_1_reference,
535
550
  .vec_dot = ggml_vec_dot_q5_1_q8_1,
536
551
  .vec_dot_type = GGML_TYPE_Q8_1,
552
+ .nrows = 1,
537
553
  },
538
554
  [GGML_TYPE_Q8_0] = {
539
555
  .type_name = "q8_0",
@@ -545,6 +561,11 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
545
561
  .from_float_reference = (ggml_from_float_t) quantize_row_q8_0_reference,
546
562
  .vec_dot = ggml_vec_dot_q8_0_q8_0,
547
563
  .vec_dot_type = GGML_TYPE_Q8_0,
564
+ #if defined (__ARM_FEATURE_MATMUL_INT8)
565
+ .nrows = 2,
566
+ #else
567
+ .nrows = 1,
568
+ #endif
548
569
  },
549
570
  [GGML_TYPE_Q8_1] = {
550
571
  .type_name = "q8_1",
@@ -554,6 +575,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
554
575
  .from_float = quantize_row_q8_1,
555
576
  .from_float_reference = (ggml_from_float_t) quantize_row_q8_1_reference,
556
577
  .vec_dot_type = GGML_TYPE_Q8_1,
578
+ .nrows = 1,
557
579
  },
558
580
  [GGML_TYPE_Q2_K] = {
559
581
  .type_name = "q2_K",
@@ -565,6 +587,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
565
587
  .from_float_reference = (ggml_from_float_t) quantize_row_q2_K_reference,
566
588
  .vec_dot = ggml_vec_dot_q2_K_q8_K,
567
589
  .vec_dot_type = GGML_TYPE_Q8_K,
590
+ .nrows = 1,
568
591
  },
569
592
  [GGML_TYPE_Q3_K] = {
570
593
  .type_name = "q3_K",
@@ -576,6 +599,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
576
599
  .from_float_reference = (ggml_from_float_t) quantize_row_q3_K_reference,
577
600
  .vec_dot = ggml_vec_dot_q3_K_q8_K,
578
601
  .vec_dot_type = GGML_TYPE_Q8_K,
602
+ .nrows = 1,
579
603
  },
580
604
  [GGML_TYPE_Q4_K] = {
581
605
  .type_name = "q4_K",
@@ -587,6 +611,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
587
611
  .from_float_reference = (ggml_from_float_t) quantize_row_q4_K_reference,
588
612
  .vec_dot = ggml_vec_dot_q4_K_q8_K,
589
613
  .vec_dot_type = GGML_TYPE_Q8_K,
614
+ .nrows = 1,
590
615
  },
591
616
  [GGML_TYPE_Q5_K] = {
592
617
  .type_name = "q5_K",
@@ -598,6 +623,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
598
623
  .from_float_reference = (ggml_from_float_t) quantize_row_q5_K_reference,
599
624
  .vec_dot = ggml_vec_dot_q5_K_q8_K,
600
625
  .vec_dot_type = GGML_TYPE_Q8_K,
626
+ .nrows = 1,
601
627
  },
602
628
  [GGML_TYPE_Q6_K] = {
603
629
  .type_name = "q6_K",
@@ -609,6 +635,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
609
635
  .from_float_reference = (ggml_from_float_t) quantize_row_q6_K_reference,
610
636
  .vec_dot = ggml_vec_dot_q6_K_q8_K,
611
637
  .vec_dot_type = GGML_TYPE_Q8_K,
638
+ .nrows = 1,
612
639
  },
613
640
  [GGML_TYPE_IQ2_XXS] = {
614
641
  .type_name = "iq2_xxs",
@@ -620,6 +647,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
620
647
  .from_float_reference = NULL,
621
648
  .vec_dot = ggml_vec_dot_iq2_xxs_q8_K,
622
649
  .vec_dot_type = GGML_TYPE_Q8_K,
650
+ .nrows = 1,
623
651
  },
624
652
  [GGML_TYPE_IQ2_XS] = {
625
653
  .type_name = "iq2_xs",
@@ -631,6 +659,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
631
659
  .from_float_reference = NULL,
632
660
  .vec_dot = ggml_vec_dot_iq2_xs_q8_K,
633
661
  .vec_dot_type = GGML_TYPE_Q8_K,
662
+ .nrows = 1,
634
663
  },
635
664
  [GGML_TYPE_IQ3_XXS] = {
636
665
  .type_name = "iq3_xxs",
@@ -642,6 +671,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
642
671
  .from_float_reference = (ggml_from_float_t)quantize_row_iq3_xxs_reference,
643
672
  .vec_dot = ggml_vec_dot_iq3_xxs_q8_K,
644
673
  .vec_dot_type = GGML_TYPE_Q8_K,
674
+ .nrows = 1,
645
675
  },
646
676
  [GGML_TYPE_Q8_K] = {
647
677
  .type_name = "q8_K",
@@ -1212,7 +1242,13 @@ inline static void ggml_vec_neg_f32 (const int n, float * y, const float * x)
1212
1242
  inline static void ggml_vec_mul_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]*y[i]; }
1213
1243
  inline static void ggml_vec_div_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]/y[i]; }
1214
1244
 
1215
- static void ggml_vec_dot_f32(const int n, float * restrict s, const float * restrict x, const float * restrict y) {
1245
+ static void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float * restrict x, size_t bx, const float * restrict y, size_t by, int nrc) {
1246
+ assert(nrc == 1);
1247
+ UNUSED(nrc);
1248
+ UNUSED(bx);
1249
+ UNUSED(by);
1250
+ UNUSED(bs);
1251
+
1216
1252
  #ifdef GGML_SIMD
1217
1253
  float sumf = 0.0f;
1218
1254
  const int np = (n & ~(GGML_F32_STEP - 1));
@@ -1249,7 +1285,13 @@ static void ggml_vec_dot_f32(const int n, float * restrict s, const float * rest
1249
1285
  *s = sumf;
1250
1286
  }
1251
1287
 
1252
- static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t * restrict x, ggml_fp16_t * restrict y) {
1288
+ static void ggml_vec_dot_f16(int n, float * restrict s, size_t bs, ggml_fp16_t * restrict x, size_t bx, ggml_fp16_t * restrict y, size_t by, int nrc) {
1289
+ assert(nrc == 1);
1290
+ UNUSED(nrc);
1291
+ UNUSED(bx);
1292
+ UNUSED(by);
1293
+ UNUSED(bs);
1294
+
1253
1295
  ggml_float sumf = 0.0;
1254
1296
 
1255
1297
  #if defined(GGML_SIMD)
@@ -1455,7 +1497,7 @@ inline static void ggml_vec_scale_f32(const int n, float * y, const float v) {
1455
1497
  #endif
1456
1498
  }
1457
1499
 
1458
- inline static void ggml_vec_norm_f32 (const int n, float * s, const float * x) { ggml_vec_dot_f32(n, s, x, x); *s = sqrtf(*s); }
1500
+ inline static void ggml_vec_norm_f32 (const int n, float * s, const float * x) { ggml_vec_dot_f32(n, s, 0, x, 0, x, 0, 1); *s = sqrtf(*s); }
1459
1501
  inline static void ggml_vec_sqr_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i]*x[i]; }
1460
1502
  inline static void ggml_vec_sqrt_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = sqrtf(x[i]); }
1461
1503
  inline static void ggml_vec_log_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = logf(x[i]); }
@@ -2343,7 +2385,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
2343
2385
  #elif defined(GGML_USE_CLBLAST)
2344
2386
  ggml_cl_init();
2345
2387
  #elif defined(GGML_USE_VULKAN)
2346
- ggml_vk_init();
2388
+ ggml_vk_init_cpu_assist();
2347
2389
  #elif defined(GGML_USE_SYCL)
2348
2390
  ggml_init_sycl();
2349
2391
  #endif
@@ -2470,7 +2512,8 @@ size_t ggml_get_max_tensor_size(const struct ggml_context * ctx) {
2470
2512
  size_t max_size = 0;
2471
2513
 
2472
2514
  for (struct ggml_tensor * tensor = ggml_get_first_tensor(ctx); tensor != NULL; tensor = ggml_get_next_tensor(ctx, tensor)) {
2473
- max_size = MAX(max_size, ggml_nbytes(tensor));
2515
+ size_t bytes = ggml_nbytes(tensor);
2516
+ max_size = MAX(max_size, bytes);
2474
2517
  }
2475
2518
 
2476
2519
  return max_size;
@@ -2606,7 +2649,7 @@ static struct ggml_tensor * ggml_new_tensor_impl(
2606
2649
  /*.nb =*/ { 0, 0, 0, 0 },
2607
2650
  /*.op =*/ GGML_OP_NONE,
2608
2651
  /*.op_params =*/ { 0 },
2609
- /*.is_param =*/ false,
2652
+ /*.flags =*/ 0,
2610
2653
  /*.grad =*/ NULL,
2611
2654
  /*.src =*/ { NULL },
2612
2655
  /*.perf_runs =*/ 0,
@@ -6508,7 +6551,7 @@ struct ggml_tensor * ggml_cross_entropy_loss_back(
6508
6551
  void ggml_set_param(
6509
6552
  struct ggml_context * ctx,
6510
6553
  struct ggml_tensor * tensor) {
6511
- tensor->is_param = true;
6554
+ tensor->flags |= GGML_TENSOR_FLAG_PARAM;
6512
6555
 
6513
6556
  GGML_ASSERT(tensor->grad == NULL);
6514
6557
  tensor->grad = ggml_dup_tensor(ctx, tensor);
@@ -9991,6 +10034,7 @@ static void ggml_compute_forward_mul_mat(
9991
10034
  ggml_vec_dot_t const vec_dot = type_traits[type].vec_dot;
9992
10035
  enum ggml_type const vec_dot_type = type_traits[type].vec_dot_type;
9993
10036
  ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float;
10037
+ int64_t const vec_dot_num_rows = type_traits[type].nrows;
9994
10038
 
9995
10039
  GGML_ASSERT(ne0 == ne01);
9996
10040
  GGML_ASSERT(ne1 == ne11);
@@ -10158,12 +10202,23 @@ static void ggml_compute_forward_mul_mat(
10158
10202
  const int64_t blck_0 = 16;
10159
10203
  const int64_t blck_1 = 16;
10160
10204
 
10205
+ // dot kernels can handle 1 row and col at a time, but mmla kernels can process 2 rows and cols
10206
+ int64_t nrc = vec_dot_num_rows;
10207
+ // TODO: currently the mmla kernels support only even numbered rows/cols.
10208
+ // this check can be removed once they are extended to support odd numbered rows/cols too
10209
+ if ((nr0 % 2 != 0) || (ne11 % 2 != 0)) {
10210
+ nrc = 1;
10211
+ }
10212
+
10213
+ const size_t src1_col_stride = src1_cont || src1->type != vec_dot_type ? row_size : nb11;
10214
+
10161
10215
  // attempt to reduce false-sharing (does not seem to make a difference)
10162
- float tmp[16];
10216
+ // 16 * 2, accounting for mmla kernels
10217
+ float tmp[32];
10163
10218
 
10164
10219
  for (int64_t iir1 = ir110; iir1 < ir111; iir1 += blck_1) {
10165
10220
  for (int64_t iir0 = ir010; iir0 < ir011; iir0 += blck_0) {
10166
- for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir111; ++ir1) {
10221
+ for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir111; ir1 += nrc) {
10167
10222
  const int64_t i13 = (ir1/(ne12*ne1));
10168
10223
  const int64_t i12 = (ir1 - i13*ne12*ne1)/ne1;
10169
10224
  const int64_t i11 = (ir1 - i13*ne12*ne1 - i12*ne1);
@@ -10186,17 +10241,19 @@ static void ggml_compute_forward_mul_mat(
10186
10241
  (src1_cont || src1->type != vec_dot_type
10187
10242
  ? (i11 + i12*ne11 + i13*ne12*ne11)*row_size
10188
10243
  : (i11*nb11 + i12*nb12 + i13*nb13));
10189
-
10190
10244
  float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3));
10191
10245
 
10192
10246
  //for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) {
10193
10247
  // vec_dot(ne00, &dst_col[ir0], src0_row + ir0*nb01, src1_col);
10194
10248
  //}
10195
10249
 
10196
- for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) {
10197
- vec_dot(ne00, &tmp[ir0 - iir0], src0_row + ir0*nb01, src1_col);
10250
+ for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ir0 += nrc) {
10251
+ vec_dot(ne00, &tmp[ir0 - iir0], (nrc>1 ? 16 : 0), src0_row + ir0*nb01, (nrc>1 ? nb01 : 0), src1_col, (nrc>1 ? src1_col_stride : 0), nrc);
10252
+ }
10253
+
10254
+ for (int cn = 0; cn < nrc; ++cn) {
10255
+ memcpy(&dst_col[iir0 + cn*nb1/nb0], tmp + (cn*16), (MIN(iir0 + blck_0, ir011) - iir0)*sizeof(float));
10198
10256
  }
10199
- memcpy(&dst_col[iir0], tmp, (MIN(iir0 + blck_0, ir011) - iir0)*sizeof(float));
10200
10257
  }
10201
10258
  }
10202
10259
  }
@@ -10385,7 +10442,7 @@ static void ggml_compute_forward_mul_mat_id(
10385
10442
  //}
10386
10443
 
10387
10444
  for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) {
10388
- vec_dot(ne00, &tmp[ir0 - iir0], src0_row + ir0*nb01, src1_col);
10445
+ vec_dot(ne00, &tmp[ir0 - iir0], 0, src0_row + ir0*nb01, 0, src1_col, 0, 1);
10389
10446
  }
10390
10447
  memcpy(&dst_col[iir0], tmp, (MIN(iir0 + blck_0, ir011) - iir0)*sizeof(float));
10391
10448
  }
@@ -11567,7 +11624,7 @@ static void ggml_compute_forward_soft_max_back_f32(
11567
11624
 
11568
11625
  // linear runtime, no additional memory
11569
11626
  float dot_y_dy = 0;
11570
- ggml_vec_dot_f32 (nc, &dot_y_dy, y, dy);
11627
+ ggml_vec_dot_f32 (nc, &dot_y_dy, 0, y, 0, dy, 0, 1);
11571
11628
  ggml_vec_cpy_f32 (nc, dx, dy);
11572
11629
  ggml_vec_acc1_f32(nc, dx, -dot_y_dy);
11573
11630
  ggml_vec_mul_f32 (nc, dx, dx, y);
@@ -11887,8 +11944,10 @@ GGML_CALL void ggml_rope_yarn_corr_dims(
11887
11944
  int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2]
11888
11945
  ) {
11889
11946
  // start and end correction dims
11890
- dims[0] = MAX(0, floorf(ggml_rope_yarn_corr_dim(n_dims, n_orig_ctx, beta_fast, freq_base)));
11891
- dims[1] = MIN(n_dims - 1, ceilf(ggml_rope_yarn_corr_dim(n_dims, n_orig_ctx, beta_slow, freq_base)));
11947
+ float start = floorf(ggml_rope_yarn_corr_dim(n_dims, n_orig_ctx, beta_fast, freq_base));
11948
+ float end = ceilf(ggml_rope_yarn_corr_dim(n_dims, n_orig_ctx, beta_slow, freq_base));
11949
+ dims[0] = MAX(0, start);
11950
+ dims[1] = MIN(n_dims - 1, end);
11892
11951
  }
11893
11952
 
11894
11953
  static void ggml_compute_forward_rope_f32(
@@ -12366,9 +12425,9 @@ static void ggml_compute_forward_conv_transpose_1d_f16_f32(
12366
12425
  const int i1n = i10*ne11;
12367
12426
  for (int i00 = 0; i00 < ne00; i00++) {
12368
12427
  float v = 0;
12369
- ggml_vec_dot_f16(ne02, &v,
12370
- (ggml_fp16_t *) wdata_src + i1n,
12371
- (ggml_fp16_t *) wdata_kernel + i00*ne02);
12428
+ ggml_vec_dot_f16(ne02, &v, 0,
12429
+ (ggml_fp16_t *) wdata_src + i1n, 0,
12430
+ (ggml_fp16_t *) wdata_kernel + i00*ne02, 0, 1);
12372
12431
  dst_data[i10*s0 + i00] += v;
12373
12432
  }
12374
12433
  }
@@ -12463,9 +12522,9 @@ static void ggml_compute_forward_conv_transpose_1d_f32(
12463
12522
  const int i1n = i10*ne11;
12464
12523
  for (int i00 = 0; i00 < ne00; i00++) {
12465
12524
  float v = 0;
12466
- ggml_vec_dot_f32(ne02, &v,
12467
- wdata_src + i1n,
12468
- wdata_kernel + i00*ne02);
12525
+ ggml_vec_dot_f32(ne02, &v, 0,
12526
+ wdata_src + i1n, 0,
12527
+ wdata_kernel + i00*ne02, 0, 1);
12469
12528
  dst_data[i10*s0 + i00] += v;
12470
12529
  }
12471
12530
  }
@@ -12780,9 +12839,9 @@ static void ggml_compute_forward_conv_transpose_2d(
12780
12839
  for (int i01 = 0; i01 < ne01; i01++) {
12781
12840
  for (int i00 = 0; i00 < ne00; i00++) {
12782
12841
  float v = 0;
12783
- ggml_vec_dot_f16(ne03, &v,
12784
- wdata_src + i1n,
12785
- wdata_kernel + i01*ne00*ne03 + i00*ne03);
12842
+ ggml_vec_dot_f16(ne03, &v, 0,
12843
+ wdata_src + i1n, 0,
12844
+ wdata_kernel + i01*ne00*ne03 + i00*ne03, 0, 1);
12786
12845
  dst_data[(i11*stride + i01)*ne0 + i10*stride + i00] += v;
12787
12846
  }
12788
12847
  }
@@ -13211,9 +13270,9 @@ static void ggml_compute_forward_flash_attn_f32(
13211
13270
  const int i1 = ik1;
13212
13271
 
13213
13272
  ggml_vec_dot_f32(neq0,
13214
- S + i1,
13215
- (float *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)),
13216
- (float *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)));
13273
+ S + i1, 0,
13274
+ (float *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)), 0,
13275
+ (float *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)), 0, 1);
13217
13276
  }
13218
13277
 
13219
13278
  // scale
@@ -13296,9 +13355,9 @@ static void ggml_compute_forward_flash_attn_f32(
13296
13355
  const int iv3 = iq3;
13297
13356
 
13298
13357
  ggml_vec_dot_f32(masked_begin,
13299
- (float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)),
13300
- (float *) ((char *) v->data + ( ic*nbv1 + iv2*nbv2 + iv3*nbv3)),
13301
- S);
13358
+ (float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)), 0,
13359
+ (float *) ((char *) v->data + ( ic*nbv1 + iv2*nbv2 + iv3*nbv3)), 0,
13360
+ S, 0, 1);
13302
13361
  }
13303
13362
  }
13304
13363
  }
@@ -13401,9 +13460,9 @@ static void ggml_compute_forward_flash_attn_f16(
13401
13460
  const int i1 = ik1;
13402
13461
 
13403
13462
  ggml_vec_dot_f16(neq0,
13404
- S + i1,
13405
- (ggml_fp16_t *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)),
13406
- (ggml_fp16_t *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)));
13463
+ S + i1, 0,
13464
+ (ggml_fp16_t *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)), 0,
13465
+ (ggml_fp16_t *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)), 0, 1);
13407
13466
  }
13408
13467
  } else {
13409
13468
  for (int64_t ic = 0; ic < nek1; ic += GGML_VEC_DOT_UNROLL) {
@@ -13505,9 +13564,9 @@ static void ggml_compute_forward_flash_attn_f16(
13505
13564
  const int iv3 = iq3;
13506
13565
 
13507
13566
  ggml_vec_dot_f16(nev0,
13508
- (float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)),
13509
- (ggml_fp16_t *) ((char *) v->data + ( ic*nbv1 + iv2*nbv2 + iv3*nbv3)),
13510
- S16);
13567
+ (float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)), 0,
13568
+ (ggml_fp16_t *) ((char *) v->data + ( ic*nbv1 + iv2*nbv2 + iv3*nbv3)), 0,
13569
+ S16, 0, 1);
13511
13570
  }
13512
13571
  } else {
13513
13572
  for (int64_t ic = 0; ic < nev1; ic += GGML_VEC_DOT_UNROLL) {
@@ -13649,9 +13708,9 @@ static void ggml_compute_forward_flash_ff_f16(
13649
13708
  const int i1 = ib01;
13650
13709
 
13651
13710
  ggml_vec_dot_f16(nea0,
13652
- S + i1,
13653
- (ggml_fp16_t *) ((char *) b0->data + (ib01*nbb01 + ib02*nbb02 + ib03*nbb03)),
13654
- (ggml_fp16_t *) ((char *) a->data + ( ia1*nba1 + ia2*nba2 + ia3*nba3)));
13711
+ S + i1, 0,
13712
+ (ggml_fp16_t *) ((char *) b0->data + (ib01*nbb01 + ib02*nbb02 + ib03*nbb03)), 0,
13713
+ (ggml_fp16_t *) ((char *) a->data + ( ia1*nba1 + ia2*nba2 + ia3*nba3)), 0, 1);
13655
13714
  }
13656
13715
 
13657
13716
  ggml_vec_add_f32(neb01, S, S, (float *) b1->data);
@@ -13674,9 +13733,9 @@ static void ggml_compute_forward_flash_ff_f16(
13674
13733
  for (int64_t ic = 0; ic < nec01; ++ic) {
13675
13734
 
13676
13735
  ggml_vec_dot_f16(neb01,
13677
- (float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)),
13678
- (ggml_fp16_t *) ((char *) c0->data + ( ic*nbc01 + i2*nbc02 + i3*nbc03)),
13679
- S16);
13736
+ (float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)), 0,
13737
+ (ggml_fp16_t *) ((char *) c0->data + ( ic*nbc01 + i2*nbc02 + i3*nbc03)), 0,
13738
+ S16, 0, 1);
13680
13739
  }
13681
13740
 
13682
13741
  ggml_vec_add_f32(nec01,
@@ -13863,9 +13922,9 @@ static void ggml_compute_forward_flash_attn_back_f32(
13863
13922
  const int i1 = ik1;
13864
13923
 
13865
13924
  ggml_vec_dot_f32(neq0,
13866
- S + i1,
13867
- (float *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)),
13868
- (float *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)));
13925
+ S + i1, 0,
13926
+ (float *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)), 0,
13927
+ (float *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)), 0, 1);
13869
13928
  }
13870
13929
 
13871
13930
  // scale
@@ -14010,7 +14069,7 @@ static void ggml_compute_forward_flash_attn_back_f32(
14010
14069
 
14011
14070
  // S = SM * (S - dot(SM, S))
14012
14071
  float dot_SM_gradSM = 0;
14013
- ggml_vec_dot_f32 (masked_begin, &dot_SM_gradSM, SM, S);
14072
+ ggml_vec_dot_f32 (masked_begin, &dot_SM_gradSM, 0, SM, 0, S, 0, 1);
14014
14073
  ggml_vec_acc1_f32(M, S, -dot_SM_gradSM);
14015
14074
  ggml_vec_mul_f32 (masked_begin, S, S, SM);
14016
14075
 
@@ -14847,10 +14906,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
14847
14906
  GGML_ASSERT(tensor->src[0] == NULL || tensor->src[0]->backend == GGML_BACKEND_CPU);
14848
14907
  GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_CPU);
14849
14908
  #elif defined(GGML_USE_VULKAN)
14850
- const bool skip_cpu = ggml_vk_compute_forward(params, tensor);
14909
+ const bool skip_cpu = ggml_vk_compute_forward_cpu_assist(params, tensor);
14851
14910
  #ifdef GGML_VULKAN_CHECK_RESULTS
14852
14911
  if (skip_cpu) {
14853
- ggml_vk_check_results_1(params, tensor);
14912
+ ggml_vk_check_results_1_cpu_assist(params, tensor);
14854
14913
  }
14855
14914
  #endif
14856
14915
  if (skip_cpu) {
@@ -15308,7 +15367,7 @@ static struct ggml_tensor * ggml_recompute_graph_node(
15308
15367
  return NULL;
15309
15368
  }
15310
15369
 
15311
- if (node->is_param) {
15370
+ if (node->flags & GGML_TENSOR_FLAG_PARAM) {
15312
15371
  return node;
15313
15372
  }
15314
15373
 
@@ -15342,7 +15401,7 @@ static struct ggml_tensor * ggml_recompute_graph_node(
15342
15401
 
15343
15402
  clone->op = node->op;
15344
15403
  clone->grad = node->grad;
15345
- clone->is_param = node->is_param;
15404
+ clone->flags = node->flags;
15346
15405
  clone->extra = node->extra;
15347
15406
  for (int k = 0; k < GGML_MAX_DIMS; ++k) {
15348
15407
  clone->nb[k] = node->nb[k];
@@ -16374,7 +16433,7 @@ void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph *
16374
16433
  for (int i = 0; i < gf->n_nodes; i++) {
16375
16434
  struct ggml_tensor * node = gf->nodes[i];
16376
16435
 
16377
- if (node->is_param) {
16436
+ if (node->flags & GGML_TENSOR_FLAG_PARAM) {
16378
16437
  GGML_PRINT_DEBUG("%s: found root node %p\n", __func__, (void *) node);
16379
16438
  ggml_build_forward_expand(gb, node->grad);
16380
16439
  }
@@ -16646,7 +16705,7 @@ struct ggml_compute_state_shared {
16646
16705
  atomic_int node_n; // active graph node
16647
16706
  atomic_int node_task; // active graph node task phase
16648
16707
 
16649
- bool (*abort_callback)(void * data); // abort ggml_graph_compute when true
16708
+ ggml_abort_callback abort_callback; // abort ggml_graph_compute when true
16650
16709
  void * abort_callback_data;
16651
16710
  };
16652
16711
 
@@ -17266,12 +17325,12 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
17266
17325
 
17267
17326
  #ifdef GGML_USE_VULKAN
17268
17327
  for (int i = 0; i < cgraph->n_nodes; i++) {
17269
- ggml_vk_preallocate_buffers_graph(cgraph->nodes[i]);
17328
+ ggml_vk_preallocate_buffers_graph_cpu_assist(cgraph->nodes[i]);
17270
17329
  }
17271
- ggml_vk_preallocate_buffers();
17330
+ ggml_vk_preallocate_buffers_cpu_assist();
17272
17331
 
17273
17332
  for (int i = 0; i < cgraph->n_nodes; i++) {
17274
- ggml_vk_build_graph(cgraph->nodes[i], i == cgraph->n_nodes - 1);
17333
+ ggml_vk_build_graph_cpu_assist(cgraph->nodes[i], i == cgraph->n_nodes - 1);
17275
17334
  }
17276
17335
  #endif
17277
17336
 
@@ -17327,7 +17386,7 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
17327
17386
  }
17328
17387
 
17329
17388
  #ifdef GGML_USE_VULKAN
17330
- ggml_vk_graph_cleanup();
17389
+ ggml_vk_graph_cleanup_cpu_assist();
17331
17390
  #endif
17332
17391
 
17333
17392
  // performance stats (graph)
@@ -17859,7 +17918,7 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) {
17859
17918
  GGML_PRINT(" - %3d: [ %5" PRId64 ", %5" PRId64 ", %5" PRId64 "] %16s %s (%3d) cpu = %7.3f / %7.3f ms, wall = %7.3f / %7.3f ms\n",
17860
17919
  i,
17861
17920
  node->ne[0], node->ne[1], node->ne[2],
17862
- ggml_op_name(node->op), node->is_param ? "x" : node->grad ? "g" : " ", node->perf_runs,
17921
+ ggml_op_name(node->op), (node->flags & GGML_TENSOR_FLAG_PARAM) ? "x" : node->grad ? "g" : " ", node->perf_runs,
17863
17922
  (double) node->perf_cycles / (double) ggml_cycles_per_ms(),
17864
17923
  (double) node->perf_cycles / (double) ggml_cycles_per_ms() / (double) node->perf_runs,
17865
17924
  (double) node->perf_time_us / 1000.0,
@@ -17952,7 +18011,7 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
17952
18011
  continue;
17953
18012
  }
17954
18013
 
17955
- if (node->is_param) {
18014
+ if (node->flags & GGML_TENSOR_FLAG_PARAM) {
17956
18015
  snprintf(color, sizeof(color), "yellow");
17957
18016
  } else if (node->grad) {
17958
18017
  if (ggml_graph_find(gf, node)) {
@@ -18126,7 +18185,7 @@ static enum ggml_opt_result ggml_opt_adam(
18126
18185
  int np = 0;
18127
18186
  int64_t nx = 0;
18128
18187
  for (int i = 0; i < gf->n_nodes; ++i) {
18129
- if (gf->nodes[i]->is_param) {
18188
+ if (gf->nodes[i]->flags & GGML_TENSOR_FLAG_PARAM) {
18130
18189
  GGML_PRINT_DEBUG("found param %d: grad->op = %d\n", np, gf->nodes[i]->grad->op);
18131
18190
 
18132
18191
  GGML_ASSERT(np < GGML_MAX_PARAMS);
@@ -18379,7 +18438,7 @@ static enum ggml_opt_result linesearch_backtracking(
18379
18438
  }
18380
18439
 
18381
18440
  // compute the initial gradient in the search direction
18382
- ggml_vec_dot_f32(nx, &dginit, g, d);
18441
+ ggml_vec_dot_f32(nx, &dginit, 0, g, 0, d, 0, 1);
18383
18442
 
18384
18443
  // make sure that d points to a descent direction
18385
18444
  if (0 < dginit) {
@@ -18429,7 +18488,7 @@ static enum ggml_opt_result linesearch_backtracking(
18429
18488
  return count;
18430
18489
  }
18431
18490
 
18432
- ggml_vec_dot_f32(nx, &dg, g, d);
18491
+ ggml_vec_dot_f32(nx, &dg, 0, g, 0, d, 0, 1);
18433
18492
 
18434
18493
  // check the Wolfe condition
18435
18494
  if (dg < params->lbfgs.wolfe * dginit) {
@@ -18489,7 +18548,7 @@ static enum ggml_opt_result ggml_opt_lbfgs(
18489
18548
  int np = 0;
18490
18549
  int nx = 0;
18491
18550
  for (int i = 0; i < gf->n_nodes; ++i) {
18492
- if (gf->nodes[i]->is_param) {
18551
+ if (gf->nodes[i]->flags & GGML_TENSOR_FLAG_PARAM) {
18493
18552
  GGML_PRINT_DEBUG("found param %d: grad->op = %d\n", np, gf->nodes[i]->grad->op);
18494
18553
 
18495
18554
  GGML_ASSERT(np < GGML_MAX_PARAMS);
@@ -18690,8 +18749,8 @@ static enum ggml_opt_result ggml_opt_lbfgs(
18690
18749
  // ys = y^t \cdot s -> 1 / \rho.
18691
18750
  // yy = y^t \cdot y.
18692
18751
  //
18693
- ggml_vec_dot_f32(nx, &ys, &lm_y[end[0]*nx], &lm_s[end[0]*nx]);
18694
- ggml_vec_dot_f32(nx, &yy, &lm_y[end[0]*nx], &lm_y[end[0]*nx]);
18752
+ ggml_vec_dot_f32(nx, &ys, 0, &lm_y[end[0]*nx], 0, &lm_s[end[0]*nx], 0, 1);
18753
+ ggml_vec_dot_f32(nx, &yy, 0, &lm_y[end[0]*nx], 0, &lm_y[end[0]*nx], 0, 1);
18695
18754
 
18696
18755
  lm_ys[end[0]] = ys;
18697
18756
 
@@ -18710,7 +18769,7 @@ static enum ggml_opt_result ggml_opt_lbfgs(
18710
18769
  for (int i = 0; i < bound; ++i) {
18711
18770
  j[0] = (j[0] + m - 1) % m;
18712
18771
  // \alpha_{j} = \rho_{j} s^{t}_{j} \cdot q_{k+1}
18713
- ggml_vec_dot_f32(nx, &lm_alpha[j[0]], &lm_s[j[0]*nx], d);
18772
+ ggml_vec_dot_f32(nx, &lm_alpha[j[0]], 0, &lm_s[j[0]*nx], 0, d, 0, 1);
18714
18773
  lm_alpha[j[0]] /= lm_ys[j[0]];
18715
18774
  // q_{i} = q_{i+1} - \alpha_{i} y_{i}
18716
18775
  ggml_vec_mad_f32(nx, d, &lm_y[j[0]*nx], -lm_alpha[j[0]]);
@@ -18720,7 +18779,7 @@ static enum ggml_opt_result ggml_opt_lbfgs(
18720
18779
 
18721
18780
  for (int i = 0; i < bound; ++i) {
18722
18781
  // \beta_{j} = \rho_{j} y^t_{j} \cdot \gamma_{i}
18723
- ggml_vec_dot_f32(nx, &beta, &lm_y[j[0]*nx], d);
18782
+ ggml_vec_dot_f32(nx, &beta, 0, &lm_y[j[0]*nx], 0, d, 0, 1);
18724
18783
  beta /= lm_ys[j[0]];
18725
18784
  // \gamma_{i+1} = \gamma_{i} + (\alpha_{j} - \beta_{j}) s_{j}
18726
18785
  ggml_vec_mad_f32(nx, d, &lm_s[j[0]*nx], lm_alpha[j[0]] - beta);
@@ -18964,6 +19023,16 @@ enum ggml_opt_result ggml_opt_resume_g(
18964
19023
 
18965
19024
  ////////////////////////////////////////////////////////////////////////////////
18966
19025
 
19026
+ void ggml_set_input(struct ggml_tensor * tensor) {
19027
+ tensor->flags |= GGML_TENSOR_FLAG_INPUT;
19028
+ }
19029
+
19030
+ void ggml_set_output(struct ggml_tensor * tensor) {
19031
+ tensor->flags |= GGML_TENSOR_FLAG_OUTPUT;
19032
+ }
19033
+
19034
+ ////////////////////////////////////////////////////////////////////////////////
19035
+
18967
19036
  void ggml_quantize_init(enum ggml_type type) {
18968
19037
  ggml_critical_section_start();
18969
19038
 
@@ -20608,4 +20677,12 @@ int ggml_cpu_has_vsx(void) {
20608
20677
  #endif
20609
20678
  }
20610
20679
 
20680
+ int ggml_cpu_has_matmul_int8(void) {
20681
+ #if defined(__ARM_FEATURE_MATMUL_INT8)
20682
+ return 1;
20683
+ #else
20684
+ return 0;
20685
+ #endif
20686
+ }
20687
+
20611
20688
  ////////////////////////////////////////////////////////////////////////////////