llama_cpp 0.12.5 → 0.12.6
- checksums.yaml +4 -4
- data/CHANGELOG.md +7 -0
- data/ext/llama_cpp/llama_cpp.cpp +46 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +7 -0
- data/vendor/tmp/llama.cpp/Makefile +9 -1
- data/vendor/tmp/llama.cpp/ggml-alloc.c +563 -490
- data/vendor/tmp/llama.cpp/ggml-alloc.h +39 -65
- data/vendor/tmp/llama.cpp/ggml-backend.c +250 -262
- data/vendor/tmp/llama.cpp/ggml-backend.h +8 -12
- data/vendor/tmp/llama.cpp/ggml-metal.m +2 -0
- data/vendor/tmp/llama.cpp/ggml-quants.c +347 -40
- data/vendor/tmp/llama.cpp/ggml-quants.h +14 -14
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +14 -61
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +89 -6
- data/vendor/tmp/llama.cpp/ggml.c +134 -60
- data/vendor/tmp/llama.cpp/ggml.h +26 -6
- data/vendor/tmp/llama.cpp/llama.cpp +654 -130
- data/vendor/tmp/llama.cpp/llama.h +6 -0
- data/vendor/tmp/llama.cpp/unicode.h +42 -30
- metadata +2 -2
data/vendor/tmp/llama.cpp/ggml.c
CHANGED
@@ -428,8 +428,8 @@ int64_t ggml_cycles_per_ms(void) {
 
 static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float);
 
-static void ggml_vec_dot_f32(
-static void ggml_vec_dot_f16(
+static void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float * restrict x, size_t bx, const float * restrict y, size_t by, int nrc);
+static void ggml_vec_dot_f16(int n, float * restrict s, size_t bs, ggml_fp16_t * restrict x, size_t bx, ggml_fp16_t * restrict y, size_t by, int nrc);
 
 static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
     [GGML_TYPE_I8] = {
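The two forward declarations above pick up three byte-stride parameters (`bs`, `bx`, `by`) and a row count (`nrc`). As a point of reference only, here is a minimal stand-alone sketch of that calling convention in plain C; `toy_vec_dot_f32` is an illustrative name, not part of ggml:

```c
#include <assert.h>
#include <stddef.h>
#include <stdio.h>

// Mirrors the widened signature for the single-row case (nrc == 1):
// the stride arguments are accepted but unused, as in the hunk above.
static void toy_vec_dot_f32(int n, float * s, size_t bs,
                            const float * x, size_t bx,
                            const float * y, size_t by, int nrc) {
    assert(nrc == 1);
    (void) bs; (void) bx; (void) by;
    float sumf = 0.0f;
    for (int i = 0; i < n; ++i) {
        sumf += x[i]*y[i];
    }
    *s = sumf;
}

int main(void) {
    const float x[4] = {1, 2, 3, 4};
    const float y[4] = {5, 6, 7, 8};
    float s = 0.0f;
    // Single-row callers pass 0 for every stride and 1 for nrc, which is
    // exactly what the updated call sites later in this diff do.
    toy_vec_dot_f32(4, &s, 0, x, 0, y, 0, 1);
    printf("%f\n", s); // 70.000000
    return 0;
}
```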
@@ -457,6 +457,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .is_quantized = false,
         .vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f32,
         .vec_dot_type = GGML_TYPE_F32,
+        .nrows = 1,
     },
     [GGML_TYPE_F16] = {
         .type_name = "f16",
@@ -468,6 +469,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .from_float_reference = (ggml_from_float_t) ggml_fp32_to_fp16_row,
         .vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f16,
         .vec_dot_type = GGML_TYPE_F16,
+        .nrows = 1,
     },
     [GGML_TYPE_Q4_0] = {
         .type_name = "q4_0",
@@ -479,6 +481,11 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .from_float_reference = (ggml_from_float_t) quantize_row_q4_0_reference,
         .vec_dot = ggml_vec_dot_q4_0_q8_0,
         .vec_dot_type = GGML_TYPE_Q8_0,
+#if defined (__ARM_FEATURE_MATMUL_INT8)
+        .nrows = 2,
+#else
+        .nrows = 1,
+#endif
     },
     [GGML_TYPE_Q4_1] = {
         .type_name = "q4_1",
@@ -490,6 +497,11 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .from_float_reference = (ggml_from_float_t) quantize_row_q4_1_reference,
         .vec_dot = ggml_vec_dot_q4_1_q8_1,
         .vec_dot_type = GGML_TYPE_Q8_1,
+#if defined (__ARM_FEATURE_MATMUL_INT8)
+        .nrows = 2,
+#else
+        .nrows = 1,
+#endif
     },
     [4] = { // GGML_TYPE_Q4_2
         .type_name = "DEPRECATED",
@@ -501,6 +513,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .from_float_reference = NULL,
         .vec_dot = NULL,
         .vec_dot_type = GGML_TYPE_COUNT,
+        .nrows = 1,
     },
     [5] = { // GGML_TYPE_Q4_3
         .type_name = "DEPRECATED",
@@ -512,6 +525,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .from_float_reference = NULL,
         .vec_dot = NULL,
         .vec_dot_type = GGML_TYPE_COUNT,
+        .nrows = 1,
     },
     [GGML_TYPE_Q5_0] = {
         .type_name = "q5_0",
@@ -523,6 +537,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .from_float_reference = (ggml_from_float_t) quantize_row_q5_0_reference,
         .vec_dot = ggml_vec_dot_q5_0_q8_0,
         .vec_dot_type = GGML_TYPE_Q8_0,
+        .nrows = 1,
     },
     [GGML_TYPE_Q5_1] = {
         .type_name = "q5_1",
@@ -534,6 +549,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .from_float_reference = (ggml_from_float_t) quantize_row_q5_1_reference,
         .vec_dot = ggml_vec_dot_q5_1_q8_1,
         .vec_dot_type = GGML_TYPE_Q8_1,
+        .nrows = 1,
     },
     [GGML_TYPE_Q8_0] = {
         .type_name = "q8_0",
@@ -545,6 +561,11 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .from_float_reference = (ggml_from_float_t) quantize_row_q8_0_reference,
         .vec_dot = ggml_vec_dot_q8_0_q8_0,
         .vec_dot_type = GGML_TYPE_Q8_0,
+#if defined (__ARM_FEATURE_MATMUL_INT8)
+        .nrows = 2,
+#else
+        .nrows = 1,
+#endif
     },
     [GGML_TYPE_Q8_1] = {
         .type_name = "q8_1",
@@ -554,6 +575,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .from_float = quantize_row_q8_1,
         .from_float_reference = (ggml_from_float_t) quantize_row_q8_1_reference,
         .vec_dot_type = GGML_TYPE_Q8_1,
+        .nrows = 1,
     },
     [GGML_TYPE_Q2_K] = {
         .type_name = "q2_K",
@@ -565,6 +587,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .from_float_reference = (ggml_from_float_t) quantize_row_q2_K_reference,
         .vec_dot = ggml_vec_dot_q2_K_q8_K,
         .vec_dot_type = GGML_TYPE_Q8_K,
+        .nrows = 1,
     },
     [GGML_TYPE_Q3_K] = {
         .type_name = "q3_K",
@@ -576,6 +599,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .from_float_reference = (ggml_from_float_t) quantize_row_q3_K_reference,
         .vec_dot = ggml_vec_dot_q3_K_q8_K,
         .vec_dot_type = GGML_TYPE_Q8_K,
+        .nrows = 1,
     },
     [GGML_TYPE_Q4_K] = {
         .type_name = "q4_K",
@@ -587,6 +611,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .from_float_reference = (ggml_from_float_t) quantize_row_q4_K_reference,
         .vec_dot = ggml_vec_dot_q4_K_q8_K,
         .vec_dot_type = GGML_TYPE_Q8_K,
+        .nrows = 1,
     },
     [GGML_TYPE_Q5_K] = {
         .type_name = "q5_K",
@@ -598,6 +623,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .from_float_reference = (ggml_from_float_t) quantize_row_q5_K_reference,
         .vec_dot = ggml_vec_dot_q5_K_q8_K,
         .vec_dot_type = GGML_TYPE_Q8_K,
+        .nrows = 1,
     },
     [GGML_TYPE_Q6_K] = {
         .type_name = "q6_K",
@@ -609,6 +635,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .from_float_reference = (ggml_from_float_t) quantize_row_q6_K_reference,
         .vec_dot = ggml_vec_dot_q6_K_q8_K,
         .vec_dot_type = GGML_TYPE_Q8_K,
+        .nrows = 1,
     },
     [GGML_TYPE_IQ2_XXS] = {
         .type_name = "iq2_xxs",
@@ -620,6 +647,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .from_float_reference = NULL,
         .vec_dot = ggml_vec_dot_iq2_xxs_q8_K,
         .vec_dot_type = GGML_TYPE_Q8_K,
+        .nrows = 1,
     },
     [GGML_TYPE_IQ2_XS] = {
         .type_name = "iq2_xs",
@@ -631,6 +659,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .from_float_reference = NULL,
         .vec_dot = ggml_vec_dot_iq2_xs_q8_K,
         .vec_dot_type = GGML_TYPE_Q8_K,
+        .nrows = 1,
     },
     [GGML_TYPE_IQ3_XXS] = {
         .type_name = "iq3_xxs",
@@ -642,6 +671,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .from_float_reference = (ggml_from_float_t)quantize_row_iq3_xxs_reference,
         .vec_dot = ggml_vec_dot_iq3_xxs_q8_K,
         .vec_dot_type = GGML_TYPE_Q8_K,
+        .nrows = 1,
     },
     [GGML_TYPE_Q8_K] = {
         .type_name = "q8_K",
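A note on the `.nrows` field added throughout the table above: it records how many output rows the type's dot kernel produces per call, and only the Q4_0, Q4_1 and Q8_0 entries select 2, and only when the compiler defines `__ARM_FEATURE_MATMUL_INT8`. The same compile-time selection pattern in isolation (the constant name is illustrative):

```c
#include <stdint.h>
#include <stdio.h>

// 2 rows per vec_dot call when the ARM int8 matmul (i8mm) extension is
// available at compile time, otherwise the scalar default of 1.
#if defined(__ARM_FEATURE_MATMUL_INT8)
static const int64_t rows_per_dot = 2;
#else
static const int64_t rows_per_dot = 1;
#endif

int main(void) {
    printf("rows per vec_dot call: %lld\n", (long long) rows_per_dot);
    return 0;
}
```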
@@ -1212,7 +1242,13 @@ inline static void ggml_vec_neg_f32 (const int n, float * y, const float * x)
 inline static void ggml_vec_mul_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]*y[i]; }
 inline static void ggml_vec_div_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]/y[i]; }
 
-static void ggml_vec_dot_f32(
+static void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float * restrict x, size_t bx, const float * restrict y, size_t by, int nrc) {
+    assert(nrc == 1);
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
+
 #ifdef GGML_SIMD
     float sumf = 0.0f;
     const int np = (n & ~(GGML_F32_STEP - 1));
@@ -1249,7 +1285,13 @@ static void ggml_vec_dot_f32(const int n, float * restrict s, const float * rest
     *s = sumf;
 }
 
-static void ggml_vec_dot_f16(
+static void ggml_vec_dot_f16(int n, float * restrict s, size_t bs, ggml_fp16_t * restrict x, size_t bx, ggml_fp16_t * restrict y, size_t by, int nrc) {
+    assert(nrc == 1);
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
+
     ggml_float sumf = 0.0;
 
 #if defined(GGML_SIMD)
@@ -1455,7 +1497,7 @@ inline static void ggml_vec_scale_f32(const int n, float * y, const float v) {
 #endif
 }
 
-inline static void ggml_vec_norm_f32 (const int n, float * s, const float * x) { ggml_vec_dot_f32(n, s, x, x); *s = sqrtf(*s); }
+inline static void ggml_vec_norm_f32 (const int n, float * s, const float * x) { ggml_vec_dot_f32(n, s, 0, x, 0, x, 0, 1); *s = sqrtf(*s); }
 inline static void ggml_vec_sqr_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i]*x[i]; }
 inline static void ggml_vec_sqrt_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = sqrtf(x[i]); }
 inline static void ggml_vec_log_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = logf(x[i]); }
@@ -2607,7 +2649,7 @@ static struct ggml_tensor * ggml_new_tensor_impl(
         /*.nb =*/ { 0, 0, 0, 0 },
         /*.op =*/ GGML_OP_NONE,
         /*.op_params =*/ { 0 },
-        /*.
+        /*.flags =*/ 0,
         /*.grad =*/ NULL,
         /*.src =*/ { NULL },
         /*.perf_runs =*/ 0,
@@ -6509,7 +6551,7 @@ struct ggml_tensor * ggml_cross_entropy_loss_back(
 void ggml_set_param(
         struct ggml_context * ctx,
         struct ggml_tensor * tensor) {
-    tensor->
+    tensor->flags |= GGML_TENSOR_FLAG_PARAM;
 
     GGML_ASSERT(tensor->grad == NULL);
     tensor->grad = ggml_dup_tensor(ctx, tensor);
@@ -9992,6 +10034,7 @@ static void ggml_compute_forward_mul_mat(
     ggml_vec_dot_t const vec_dot = type_traits[type].vec_dot;
     enum ggml_type const vec_dot_type = type_traits[type].vec_dot_type;
     ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float;
+    int64_t const vec_dot_num_rows = type_traits[type].nrows;
 
     GGML_ASSERT(ne0 == ne01);
     GGML_ASSERT(ne1 == ne11);
@@ -10159,12 +10202,23 @@ static void ggml_compute_forward_mul_mat(
     const int64_t blck_0 = 16;
     const int64_t blck_1 = 16;
 
+    // dot kernels can handle 1 row and col at a time, but mmla kernels can process 2 rows and cols
+    int64_t nrc = vec_dot_num_rows;
+    // TODO: currently the mmla kernels support only even numbered rows/cols.
+    // this check can be removed once they are extended to support odd numbered rows/cols too
+    if ((nr0 % 2 != 0) || (ne11 % 2 != 0)) {
+        nrc = 1;
+    }
+
+    const size_t src1_col_stride = src1_cont || src1->type != vec_dot_type ? row_size : nb11;
+
     // attempt to reduce false-sharing (does not seem to make a difference)
-
+    // 16 * 2, accounting for mmla kernels
+    float tmp[32];
 
     for (int64_t iir1 = ir110; iir1 < ir111; iir1 += blck_1) {
         for (int64_t iir0 = ir010; iir0 < ir011; iir0 += blck_0) {
-            for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir111;
+            for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir111; ir1 += nrc) {
                 const int64_t i13 = (ir1/(ne12*ne1));
                 const int64_t i12 = (ir1 - i13*ne12*ne1)/ne1;
                 const int64_t i11 = (ir1 - i13*ne12*ne1 - i12*ne1);
@@ -10187,17 +10241,19 @@ static void ggml_compute_forward_mul_mat(
                     (src1_cont || src1->type != vec_dot_type
                      ? (i11 + i12*ne11 + i13*ne12*ne11)*row_size
                      : (i11*nb11 + i12*nb12 + i13*nb13));
-
                 float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3));
 
                 //for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) {
                 //    vec_dot(ne00, &dst_col[ir0], src0_row + ir0*nb01, src1_col);
                 //}
 
-                for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011;
-                    vec_dot(ne00, &tmp[ir0 - iir0], src0_row + ir0*nb01, src1_col);
+                for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ir0 += nrc) {
+                    vec_dot(ne00, &tmp[ir0 - iir0], (nrc>1 ? 16 : 0), src0_row + ir0*nb01, (nrc>1 ? nb01 : 0), src1_col, (nrc>1 ? src1_col_stride : 0), nrc);
+                }
+
+                for (int cn = 0; cn < nrc; ++cn) {
+                    memcpy(&dst_col[iir0 + cn*nb1/nb0], tmp + (cn*16), (MIN(iir0 + blck_0, ir011) - iir0)*sizeof(float));
                 }
-                memcpy(&dst_col[iir0], tmp, (MIN(iir0 + blck_0, ir011) - iir0)*sizeof(float));
             }
         }
     }
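To see why `tmp` grows from 16 to 32 floats, here is a self-contained sketch (plain C, not ggml code) of a 2x2 tile kernel in the spirit of the mmla path: one call covers two source rows and two destination columns, the second column's results land at offset 16 in the scratch buffer, and the copy-back then writes one destination column per pass, much like the new `for (int cn = 0; ...)` loop above.

```c
#include <stdio.h>
#include <string.h>

// Toy 2x2 tile kernel: dot products of nrc consecutive A rows against
// nrc consecutive B columns.  The result for B column c and A row r goes to
// s[c*bs + r], matching the bs = 16 output stride used by the real call site.
static void tile_dot_f32(int n, float * s, size_t bs,
                         const float * a_rows, size_t a_stride,
                         const float * b_cols, size_t b_stride, int nrc) {
    for (int c = 0; c < nrc; ++c) {
        for (int r = 0; r < nrc; ++r) {
            float sum = 0.0f;
            for (int i = 0; i < n; ++i) {
                sum += a_rows[r*a_stride + i] * b_cols[c*b_stride + i];
            }
            s[c*bs + r] = sum;
        }
    }
}

int main(void) {
    enum { N = 4 };
    const float a[2][N] = { {1, 2, 3, 4}, {5, 6, 7, 8} };  // two A rows
    const float b[2][N] = { {1, 0, 1, 0}, {0, 1, 0, 1} };  // two B columns
    float tmp[32] = {0};                                    // 16 * 2

    tile_dot_f32(N, tmp, 16, &a[0][0], N, &b[0][0], N, 2);

    // Copy-back: one destination column per pass, reading from tmp + cn*16.
    float dst[2][2] = {{0}};
    for (int cn = 0; cn < 2; ++cn) {
        memcpy(dst[cn], tmp + cn*16, 2*sizeof(float));
    }
    printf("col0: %g %g, col1: %g %g\n", dst[0][0], dst[0][1], dst[1][0], dst[1][1]);
    // prints: col0: 4 12, col1: 6 14
    return 0;
}
```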
@@ -10386,7 +10442,7 @@ static void ggml_compute_forward_mul_mat_id(
                 //}
 
                 for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) {
-                    vec_dot(ne00, &tmp[ir0 - iir0], src0_row + ir0*nb01, src1_col);
+                    vec_dot(ne00, &tmp[ir0 - iir0], 0, src0_row + ir0*nb01, 0, src1_col, 0, 1);
                 }
                 memcpy(&dst_col[iir0], tmp, (MIN(iir0 + blck_0, ir011) - iir0)*sizeof(float));
             }
@@ -11568,7 +11624,7 @@ static void ggml_compute_forward_soft_max_back_f32(
 
         // linear runtime, no additional memory
         float dot_y_dy = 0;
-        ggml_vec_dot_f32 (nc, &dot_y_dy, y, dy);
+        ggml_vec_dot_f32 (nc, &dot_y_dy, 0, y, 0, dy, 0, 1);
         ggml_vec_cpy_f32 (nc, dx, dy);
         ggml_vec_acc1_f32(nc, dx, -dot_y_dy);
         ggml_vec_mul_f32 (nc, dx, dx, y);
@@ -12369,9 +12425,9 @@ static void ggml_compute_forward_conv_transpose_1d_f16_f32(
             const int i1n = i10*ne11;
             for (int i00 = 0; i00 < ne00; i00++) {
                 float v = 0;
-                ggml_vec_dot_f16(ne02, &v,
-                        (ggml_fp16_t *) wdata_src + i1n,
-                        (ggml_fp16_t *) wdata_kernel + i00*ne02);
+                ggml_vec_dot_f16(ne02, &v, 0,
+                        (ggml_fp16_t *) wdata_src + i1n, 0,
+                        (ggml_fp16_t *) wdata_kernel + i00*ne02, 0, 1);
                 dst_data[i10*s0 + i00] += v;
             }
         }
@@ -12466,9 +12522,9 @@ static void ggml_compute_forward_conv_transpose_1d_f32(
             const int i1n = i10*ne11;
             for (int i00 = 0; i00 < ne00; i00++) {
                 float v = 0;
-                ggml_vec_dot_f32(ne02, &v,
-                        wdata_src + i1n,
-                        wdata_kernel + i00*ne02);
+                ggml_vec_dot_f32(ne02, &v, 0,
+                        wdata_src + i1n, 0,
+                        wdata_kernel + i00*ne02, 0, 1);
                 dst_data[i10*s0 + i00] += v;
             }
         }
@@ -12783,9 +12839,9 @@ static void ggml_compute_forward_conv_transpose_2d(
                 for (int i01 = 0; i01 < ne01; i01++) {
                     for (int i00 = 0; i00 < ne00; i00++) {
                         float v = 0;
-                        ggml_vec_dot_f16(ne03, &v,
-                                wdata_src + i1n,
-                                wdata_kernel + i01*ne00*ne03 + i00*ne03);
+                        ggml_vec_dot_f16(ne03, &v, 0,
+                                wdata_src + i1n, 0,
+                                wdata_kernel + i01*ne00*ne03 + i00*ne03, 0, 1);
                         dst_data[(i11*stride + i01)*ne0 + i10*stride + i00] += v;
                     }
                 }
@@ -13214,9 +13270,9 @@ static void ggml_compute_forward_flash_attn_f32(
                 const int i1 = ik1;
 
                 ggml_vec_dot_f32(neq0,
-                        S + i1,
-                        (float *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)),
-                        (float *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)));
+                        S + i1, 0,
+                        (float *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)), 0,
+                        (float *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)), 0, 1);
             }
 
             // scale
@@ -13299,9 +13355,9 @@ static void ggml_compute_forward_flash_attn_f32(
                 const int iv3 = iq3;
 
                 ggml_vec_dot_f32(masked_begin,
-                        (float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)),
-                        (float *) ((char *) v->data + ( ic*nbv1 + iv2*nbv2 + iv3*nbv3)),
-                        S);
+                        (float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)), 0,
+                        (float *) ((char *) v->data + ( ic*nbv1 + iv2*nbv2 + iv3*nbv3)), 0,
+                        S, 0, 1);
             }
         }
     }
@@ -13404,9 +13460,9 @@ static void ggml_compute_forward_flash_attn_f16(
                 const int i1 = ik1;
 
                 ggml_vec_dot_f16(neq0,
-                        S + i1,
-                        (ggml_fp16_t *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)),
-                        (ggml_fp16_t *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)));
+                        S + i1, 0,
+                        (ggml_fp16_t *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)), 0,
+                        (ggml_fp16_t *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)), 0, 1);
             }
         } else {
             for (int64_t ic = 0; ic < nek1; ic += GGML_VEC_DOT_UNROLL) {
@@ -13508,9 +13564,9 @@ static void ggml_compute_forward_flash_attn_f16(
                 const int iv3 = iq3;
 
                 ggml_vec_dot_f16(nev0,
-                        (float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)),
-                        (ggml_fp16_t *) ((char *) v->data + ( ic*nbv1 + iv2*nbv2 + iv3*nbv3)),
-                        S16);
+                        (float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)), 0,
+                        (ggml_fp16_t *) ((char *) v->data + ( ic*nbv1 + iv2*nbv2 + iv3*nbv3)), 0,
+                        S16, 0, 1);
             }
         } else {
             for (int64_t ic = 0; ic < nev1; ic += GGML_VEC_DOT_UNROLL) {
@@ -13652,9 +13708,9 @@ static void ggml_compute_forward_flash_ff_f16(
             const int i1 = ib01;
 
             ggml_vec_dot_f16(nea0,
-                    S + i1,
-                    (ggml_fp16_t *) ((char *) b0->data + (ib01*nbb01 + ib02*nbb02 + ib03*nbb03)),
-                    (ggml_fp16_t *) ((char *) a->data + ( ia1*nba1 + ia2*nba2 + ia3*nba3)));
+                    S + i1, 0,
+                    (ggml_fp16_t *) ((char *) b0->data + (ib01*nbb01 + ib02*nbb02 + ib03*nbb03)), 0,
+                    (ggml_fp16_t *) ((char *) a->data + ( ia1*nba1 + ia2*nba2 + ia3*nba3)), 0, 1);
         }
 
         ggml_vec_add_f32(neb01, S, S, (float *) b1->data);
@@ -13677,9 +13733,9 @@ static void ggml_compute_forward_flash_ff_f16(
         for (int64_t ic = 0; ic < nec01; ++ic) {
 
             ggml_vec_dot_f16(neb01,
-                    (float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)),
-                    (ggml_fp16_t *) ((char *) c0->data + ( ic*nbc01 + i2*nbc02 + i3*nbc03)),
-                    S16);
+                    (float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)), 0,
+                    (ggml_fp16_t *) ((char *) c0->data + ( ic*nbc01 + i2*nbc02 + i3*nbc03)), 0,
+                    S16, 0, 1);
         }
 
         ggml_vec_add_f32(nec01,
@@ -13866,9 +13922,9 @@ static void ggml_compute_forward_flash_attn_back_f32(
                     const int i1 = ik1;
 
                     ggml_vec_dot_f32(neq0,
-                            S + i1,
-                            (float *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)),
-                            (float *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)));
+                            S + i1, 0,
+                            (float *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)), 0,
+                            (float *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)), 0, 1);
                 }
 
                 // scale
@@ -14013,7 +14069,7 @@ static void ggml_compute_forward_flash_attn_back_f32(
 
                     // S = SM * (S - dot(SM, S))
                     float dot_SM_gradSM = 0;
-                    ggml_vec_dot_f32 (masked_begin, &dot_SM_gradSM, SM, S);
+                    ggml_vec_dot_f32 (masked_begin, &dot_SM_gradSM, 0, SM, 0, S, 0, 1);
                     ggml_vec_acc1_f32(M, S, -dot_SM_gradSM);
                     ggml_vec_mul_f32 (masked_begin, S, S, SM);
 
@@ -15311,7 +15367,7 @@ static struct ggml_tensor * ggml_recompute_graph_node(
         return NULL;
     }
 
-    if (node->
+    if (node->flags & GGML_TENSOR_FLAG_PARAM) {
         return node;
     }
 
@@ -15345,7 +15401,7 @@ static struct ggml_tensor * ggml_recompute_graph_node(
 
     clone->op = node->op;
     clone->grad = node->grad;
-    clone->
+    clone->flags = node->flags;
     clone->extra = node->extra;
     for (int k = 0; k < GGML_MAX_DIMS; ++k) {
         clone->nb[k] = node->nb[k];
@@ -16377,7 +16433,7 @@ void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph *
     for (int i = 0; i < gf->n_nodes; i++) {
         struct ggml_tensor * node = gf->nodes[i];
 
-        if (node->
+        if (node->flags & GGML_TENSOR_FLAG_PARAM) {
            GGML_PRINT_DEBUG("%s: found root node %p\n", __func__, (void *) node);
            ggml_build_forward_expand(gb, node->grad);
        }
@@ -16649,7 +16705,7 @@ struct ggml_compute_state_shared {
     atomic_int node_n; // active graph node
     atomic_int node_task; // active graph node task phase
 
-
+    ggml_abort_callback abort_callback; // abort ggml_graph_compute when true
     void * abort_callback_data;
 };
 
@@ -17862,7 +17918,7 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) {
         GGML_PRINT(" - %3d: [ %5" PRId64 ", %5" PRId64 ", %5" PRId64 "] %16s %s (%3d) cpu = %7.3f / %7.3f ms, wall = %7.3f / %7.3f ms\n",
                 i,
                 node->ne[0], node->ne[1], node->ne[2],
-                ggml_op_name(node->op), node->
+                ggml_op_name(node->op), (node->flags & GGML_TENSOR_FLAG_PARAM) ? "x" : node->grad ? "g" : " ", node->perf_runs,
                 (double) node->perf_cycles / (double) ggml_cycles_per_ms(),
                 (double) node->perf_cycles / (double) ggml_cycles_per_ms() / (double) node->perf_runs,
                 (double) node->perf_time_us / 1000.0,
@@ -17955,7 +18011,7 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
             continue;
         }
 
-        if (node->
+        if (node->flags & GGML_TENSOR_FLAG_PARAM) {
             snprintf(color, sizeof(color), "yellow");
         } else if (node->grad) {
             if (ggml_graph_find(gf, node)) {
@@ -18129,7 +18185,7 @@ static enum ggml_opt_result ggml_opt_adam(
     int np = 0;
    int64_t nx = 0;
    for (int i = 0; i < gf->n_nodes; ++i) {
-        if (gf->nodes[i]->
+        if (gf->nodes[i]->flags & GGML_TENSOR_FLAG_PARAM) {
            GGML_PRINT_DEBUG("found param %d: grad->op = %d\n", np, gf->nodes[i]->grad->op);
 
            GGML_ASSERT(np < GGML_MAX_PARAMS);
@@ -18382,7 +18438,7 @@ static enum ggml_opt_result linesearch_backtracking(
     }
 
     // compute the initial gradient in the search direction
-    ggml_vec_dot_f32(nx, &dginit, g, d);
+    ggml_vec_dot_f32(nx, &dginit, 0, g, 0, d, 0, 1);
 
     // make sure that d points to a descent direction
     if (0 < dginit) {
@@ -18432,7 +18488,7 @@ static enum ggml_opt_result linesearch_backtracking(
             return count;
         }
 
-        ggml_vec_dot_f32(nx, &dg, g, d);
+        ggml_vec_dot_f32(nx, &dg, 0, g, 0, d, 0, 1);
 
         // check the Wolfe condition
         if (dg < params->lbfgs.wolfe * dginit) {
@@ -18492,7 +18548,7 @@ static enum ggml_opt_result ggml_opt_lbfgs(
     int np = 0;
     int nx = 0;
     for (int i = 0; i < gf->n_nodes; ++i) {
-        if (gf->nodes[i]->
+        if (gf->nodes[i]->flags & GGML_TENSOR_FLAG_PARAM) {
            GGML_PRINT_DEBUG("found param %d: grad->op = %d\n", np, gf->nodes[i]->grad->op);
 
            GGML_ASSERT(np < GGML_MAX_PARAMS);
@@ -18693,8 +18749,8 @@ static enum ggml_opt_result ggml_opt_lbfgs(
         // ys = y^t \cdot s -> 1 / \rho.
         // yy = y^t \cdot y.
         //
-        ggml_vec_dot_f32(nx, &ys, &lm_y[end[0]*nx], &lm_s[end[0]*nx]);
-        ggml_vec_dot_f32(nx, &yy, &lm_y[end[0]*nx], &lm_y[end[0]*nx]);
+        ggml_vec_dot_f32(nx, &ys, 0, &lm_y[end[0]*nx], 0, &lm_s[end[0]*nx], 0, 1);
+        ggml_vec_dot_f32(nx, &yy, 0, &lm_y[end[0]*nx], 0, &lm_y[end[0]*nx], 0, 1);
 
         lm_ys[end[0]] = ys;
 
@@ -18713,7 +18769,7 @@ static enum ggml_opt_result ggml_opt_lbfgs(
         for (int i = 0; i < bound; ++i) {
             j[0] = (j[0] + m - 1) % m;
             // \alpha_{j} = \rho_{j} s^{t}_{j} \cdot q_{k+1}
-            ggml_vec_dot_f32(nx, &lm_alpha[j[0]], &lm_s[j[0]*nx], d);
+            ggml_vec_dot_f32(nx, &lm_alpha[j[0]], 0, &lm_s[j[0]*nx], 0, d, 0, 1);
             lm_alpha[j[0]] /= lm_ys[j[0]];
             // q_{i} = q_{i+1} - \alpha_{i} y_{i}
             ggml_vec_mad_f32(nx, d, &lm_y[j[0]*nx], -lm_alpha[j[0]]);
@@ -18723,7 +18779,7 @@ static enum ggml_opt_result ggml_opt_lbfgs(
 
         for (int i = 0; i < bound; ++i) {
             // \beta_{j} = \rho_{j} y^t_{j} \cdot \gamma_{i}
-            ggml_vec_dot_f32(nx, &beta, &lm_y[j[0]*nx], d);
+            ggml_vec_dot_f32(nx, &beta, 0, &lm_y[j[0]*nx], 0, d, 0, 1);
             beta /= lm_ys[j[0]];
             // \gamma_{i+1} = \gamma_{i} + (\alpha_{j} - \beta_{j}) s_{j}
             ggml_vec_mad_f32(nx, d, &lm_s[j[0]*nx], lm_alpha[j[0]] - beta);
@@ -18967,6 +19023,16 @@ enum ggml_opt_result ggml_opt_resume_g(
 
 ////////////////////////////////////////////////////////////////////////////////
 
+void ggml_set_input(struct ggml_tensor * tensor) {
+    tensor->flags |= GGML_TENSOR_FLAG_INPUT;
+}
+
+void ggml_set_output(struct ggml_tensor * tensor) {
+    tensor->flags |= GGML_TENSOR_FLAG_OUTPUT;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
 void ggml_quantize_init(enum ggml_type type) {
     ggml_critical_section_start();
 
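The two new setters simply OR a flag into `tensor->flags`. A hedged usage sketch, assuming the usual ggml context setup (the buffer size and the `ggml_sqr` op are arbitrary choices for the example):

```c
#include "ggml.h"

int main(void) {
    struct ggml_init_params params = {
        /*.mem_size   =*/ 16*1024*1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    struct ggml_tensor * inp = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 8);
    struct ggml_tensor * out = ggml_sqr(ctx, inp);

    // Mark the graph boundaries; downstream code can then test
    // (t->flags & GGML_TENSOR_FLAG_INPUT) and (t->flags & GGML_TENSOR_FLAG_OUTPUT).
    ggml_set_input(inp);
    ggml_set_output(out);

    ggml_free(ctx);
    return 0;
}
```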
@@ -20611,4 +20677,12 @@ int ggml_cpu_has_vsx(void) {
 #endif
 }
 
+int ggml_cpu_has_matmul_int8(void) {
+#if defined(__ARM_FEATURE_MATMUL_INT8)
+    return 1;
+#else
+    return 0;
+#endif
+}
+
 ////////////////////////////////////////////////////////////////////////////////
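The new probe follows the pattern of the other `ggml_cpu_has_*` helpers, reporting whether the build enabled the ARM int8 matrix-multiply (i8mm) extension:

```c
#include <stdio.h>
#include "ggml.h"

int main(void) {
    // 1 if compiled with __ARM_FEATURE_MATMUL_INT8, 0 otherwise.
    printf("ARM int8 matmul: %d\n", ggml_cpu_has_matmul_int8());
    return 0;
}
```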
data/vendor/tmp/llama.cpp/ggml.h
CHANGED
@@ -505,11 +505,17 @@ extern "C" {
 
     enum ggml_log_level {
         GGML_LOG_LEVEL_ERROR = 2,
-        GGML_LOG_LEVEL_WARN
-        GGML_LOG_LEVEL_INFO
+        GGML_LOG_LEVEL_WARN = 3,
+        GGML_LOG_LEVEL_INFO = 4,
         GGML_LOG_LEVEL_DEBUG = 5
     };
 
+    enum ggml_tensor_flag {
+        GGML_TENSOR_FLAG_INPUT = 1,
+        GGML_TENSOR_FLAG_OUTPUT = 2,
+        GGML_TENSOR_FLAG_PARAM = 4,
+    };
+
     // ggml object
     struct ggml_object {
         size_t offs;
@@ -543,7 +549,7 @@ extern "C" {
         // op params - allocated as int32_t for alignment
         int32_t op_params[GGML_MAX_OP_PARAMS / sizeof(int32_t)];
 
-
+        int32_t flags;
 
         struct ggml_tensor * grad;
         struct ggml_tensor * src[GGML_MAX_SRC];
@@ -567,6 +573,11 @@ extern "C" {
 
     static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
 
+    // Abort callback
+    // If not NULL, called before ggml computation
+    // If it returns true, the computation is aborted
+    typedef bool (*ggml_abort_callback)(void * data);
+
     // the compute plan that needs to be prepared for ggml_graph_compute()
     // since https://github.com/ggerganov/ggml/issues/287
     struct ggml_cplan {
@@ -576,8 +587,8 @@ extern "C" {
         int n_threads;
 
         // abort ggml_graph_compute when true
-
-        void *
+        ggml_abort_callback abort_callback;
+        void * abort_callback_data;
     };
 
     enum ggml_cgraph_eval_order {
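With the raw function-pointer pair in `ggml_cplan` replaced by the named `ggml_abort_callback` typedef, a caller can cancel a long `ggml_graph_compute` run. A sketch of wiring it up, assuming a cplan obtained in the usual way; the `should_abort` and `attach_abort` names are illustrative, not part of the API:

```c
#include <stdbool.h>

#include "ggml.h"

// Matches the new ggml_abort_callback typedef: return true to stop early.
static bool should_abort(void * data) {
    const volatile int * stop_requested = data;
    return *stop_requested != 0;
}

// Attach the callback and its user data to an existing compute plan.
static void attach_abort(struct ggml_cplan * cplan, volatile int * flag) {
    cplan->abort_callback      = should_abort;
    cplan->abort_callback_data = (void *) flag;
}
```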
@@ -2087,6 +2098,12 @@ extern "C" {
             ggml_opt_callback callback,
             void * callback_data);
 
+    //
+    // tensor flags
+    //
+    GGML_API void ggml_set_input(struct ggml_tensor * tensor);
+    GGML_API void ggml_set_output(struct ggml_tensor * tensor);
+
     //
     // quantization
     //
@@ -2273,6 +2290,7 @@ extern "C" {
     GGML_API int ggml_cpu_has_ssse3 (void);
     GGML_API int ggml_cpu_has_sycl (void);
     GGML_API int ggml_cpu_has_vsx (void);
+    GGML_API int ggml_cpu_has_matmul_int8(void);
 
     //
     // Internal types and functions exposed for tests and benchmarks
@@ -2286,7 +2304,8 @@ extern "C" {
 #endif
     typedef void (*ggml_to_float_t) (const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
     typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
-    typedef void (*ggml_vec_dot_t) (
+    typedef void (*ggml_vec_dot_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx,
+                                    const void * GGML_RESTRICT y, size_t by, int nrc);
 
     typedef struct {
         const char * type_name;
@@ -2298,6 +2317,7 @@ extern "C" {
         ggml_from_float_t from_float_reference;
         ggml_vec_dot_t vec_dot;
         enum ggml_type vec_dot_type;
+        int64_t nrows; // number of rows to process simultaneously;
     } ggml_type_traits_t;
 
     GGML_API ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type);
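Since the traits struct is exposed for tests and benchmarks, the new field can be inspected directly; on an i8mm-capable build the Q4_0/Q4_1/Q8_0 entries report 2, otherwise 1:

```c
#include <stdio.h>
#include "ggml.h"

int main(void) {
    // Read the per-type traits and print how many rows its dot kernel
    // handles per call.
    ggml_type_traits_t traits = ggml_internal_get_type_traits(GGML_TYPE_Q4_0);
    printf("q4_0 vec_dot rows per call: %lld\n", (long long) traits.nrows);
    return 0;
}
```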