@fugood/llama.node 1.0.0-beta.7 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +2 -0
- package/lib/binding.ts +10 -0
- package/lib/index.js +8 -0
- package/lib/index.ts +14 -0
- package/package.json +14 -14
- package/src/LlamaContext.cpp +37 -0
- package/src/LlamaContext.h +1 -0
- package/src/RerankWorker.h +26 -0
- package/src/llama.cpp/CMakeLists.txt +1 -1
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +3 -46
- package/src/llama.cpp/ggml/CMakeLists.txt +1 -0
- package/src/llama.cpp/ggml/include/ggml-cpu.h +1 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +8 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp +10 -9
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +109 -108
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +13 -12
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +53 -52
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +56 -55
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +42 -41
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp +24 -23
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +29 -28
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/wasm/quants.c +30 -29
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +83 -82
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +20 -19
- package/src/llama.cpp/ggml/src/ggml-cpu/common.h +3 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +9 -3
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +59 -16
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +3 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +3 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +48 -48
- package/src/llama.cpp/ggml/src/ggml-cpu/quants.c +25 -24
- package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +15 -14
- package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +211 -33
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +2 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +45 -45
- package/src/llama.cpp/include/llama.h +6 -3
- package/src/llama.cpp/src/llama-arch.cpp +54 -0
- package/src/llama.cpp/src/llama-arch.h +17 -0
- package/src/llama.cpp/src/llama-batch.cpp +20 -7
- package/src/llama.cpp/src/llama-chat.cpp +11 -6
- package/src/llama.cpp/src/llama-context.cpp +0 -1
- package/src/llama.cpp/src/llama-graph.cpp +19 -4
- package/src/llama.cpp/src/llama-graph.h +14 -2
- package/src/llama.cpp/src/llama-hparams.h +6 -0
- package/src/llama.cpp/src/llama-kv-cache-unified.cpp +28 -2
- package/src/llama.cpp/src/llama-kv-cells.h +33 -9
- package/src/llama.cpp/src/llama-model.cpp +518 -1
- package/src/llama.cpp/src/llama-model.h +22 -0
- package/src/llama.cpp/src/llama-quant.cpp +87 -5
|
@@ -3,6 +3,7 @@
|
|
|
3
3
|
#include "ggml-quants.h"
|
|
4
4
|
#include "ggml-impl.h"
|
|
5
5
|
#include "ggml-cpu.h"
|
|
6
|
+
#include "simd-mappings.h"
|
|
6
7
|
|
|
7
8
|
#include "../../quants.h"
|
|
8
9
|
#include "../../ggml-cpu-impl.h"
|
|
@@ -62,7 +63,7 @@ void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
|
|
|
62
63
|
const float d = amax / ((1 << 7) - 1);
|
|
63
64
|
const float id = d ? 1.0f/d : 0.0f;
|
|
64
65
|
|
|
65
|
-
y[i].d =
|
|
66
|
+
y[i].d = GGML_CPU_FP32_TO_FP16(d);
|
|
66
67
|
|
|
67
68
|
for (int j = 0; j < 8; j++) {
|
|
68
69
|
const float32x4_t v = vmulq_n_f32(srcv[j], id);
|
|
@@ -104,7 +105,7 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
|
|
|
104
105
|
const float d = amax / ((1 << 7) - 1);
|
|
105
106
|
const float id = d ? 1.0f/d : 0.0f;
|
|
106
107
|
|
|
107
|
-
y[i].d =
|
|
108
|
+
y[i].d = GGML_CPU_FP32_TO_FP16(d);
|
|
108
109
|
|
|
109
110
|
int32x4_t accv = vdupq_n_s32(0);
|
|
110
111
|
|
|
@@ -120,7 +121,7 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
|
|
|
120
121
|
accv = vaddq_s32(accv, vi);
|
|
121
122
|
}
|
|
122
123
|
|
|
123
|
-
y[i].s =
|
|
124
|
+
y[i].s = GGML_CPU_FP32_TO_FP16(d * vaddvq_s32(accv));
|
|
124
125
|
}
|
|
125
126
|
#else
|
|
126
127
|
GGML_UNUSED(nb);
|
|
@@ -194,10 +195,10 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
194
195
|
const int8x16_t y1_h = vld1q_s8(b_y1->qs + 16);
|
|
195
196
|
|
|
196
197
|
float32_t _scale[4] = {
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
198
|
+
GGML_CPU_FP16_TO_FP32(b_x0->d)*GGML_CPU_FP16_TO_FP32(b_y0->d),
|
|
199
|
+
GGML_CPU_FP16_TO_FP32(b_x0->d)*GGML_CPU_FP16_TO_FP32(b_y1->d),
|
|
200
|
+
GGML_CPU_FP16_TO_FP32(b_x1->d)*GGML_CPU_FP16_TO_FP32(b_y0->d),
|
|
201
|
+
GGML_CPU_FP16_TO_FP32(b_x1->d)*GGML_CPU_FP16_TO_FP32(b_y1->d)
|
|
201
202
|
};
|
|
202
203
|
float32x4_t scale = vld1q_f32(_scale);
|
|
203
204
|
|
|
@@ -274,10 +275,10 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
274
275
|
// dot product
|
|
275
276
|
sumv0 = svmla_n_f32_x(ph4, sumv0, svcvt_f32_s32_x(ph4, svadd_x(ph4,
|
|
276
277
|
svdot_s32(svdup_n_s32(0), qx0ls, qy0l),
|
|
277
|
-
svdot_s32(svdup_n_s32(0), qx0hs, qy0h))),
|
|
278
|
+
svdot_s32(svdup_n_s32(0), qx0hs, qy0h))), GGML_CPU_FP16_TO_FP32(x0->d)*GGML_CPU_FP16_TO_FP32(y0->d));
|
|
278
279
|
sumv1 = svmla_n_f32_x(ph4, sumv1, svcvt_f32_s32_x(ph4, svadd_x(ph4,
|
|
279
280
|
svdot_s32(svdup_n_s32(0), qx1ls, qy1l),
|
|
280
|
-
svdot_s32(svdup_n_s32(0), qx1hs, qy1h))),
|
|
281
|
+
svdot_s32(svdup_n_s32(0), qx1hs, qy1h))), GGML_CPU_FP16_TO_FP32(x1->d)*GGML_CPU_FP16_TO_FP32(y1->d));
|
|
281
282
|
}
|
|
282
283
|
|
|
283
284
|
sumf = svaddv_f32(svptrue_b32(), svadd_f32_x(svptrue_b32(), sumv0, sumv1));
|
|
@@ -313,9 +314,9 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
313
314
|
|
|
314
315
|
// dot product
|
|
315
316
|
sumv0 = svmla_n_f32_x(svptrue_b32(), sumv0, svcvt_f32_s32_x(svptrue_b32(),
|
|
316
|
-
svdot_s32(svdup_n_s32(0), qx0s, qy0)),
|
|
317
|
+
svdot_s32(svdup_n_s32(0), qx0s, qy0)), GGML_CPU_FP16_TO_FP32(x0->d)*GGML_CPU_FP16_TO_FP32(y0->d));
|
|
317
318
|
sumv1 = svmla_n_f32_x(svptrue_b32(), sumv1, svcvt_f32_s32_x(svptrue_b32(),
|
|
318
|
-
svdot_s32(svdup_n_s32(0), qx1s, qy1)),
|
|
319
|
+
svdot_s32(svdup_n_s32(0), qx1s, qy1)), GGML_CPU_FP16_TO_FP32(x1->d)*GGML_CPU_FP16_TO_FP32(y1->d));
|
|
319
320
|
}
|
|
320
321
|
|
|
321
322
|
sumf = svaddv_f32(svptrue_b32(), svadd_f32_x(svptrue_b32(), sumv0, sumv1));
|
|
@@ -354,9 +355,9 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
354
355
|
|
|
355
356
|
// dot product
|
|
356
357
|
sumv0 = svmla_n_f32_x(ph32, sumv0, svcvt_f32_s32_x(ph32,
|
|
357
|
-
svdot_s32(svdup_n_s32(0), qx0s, qy0)),
|
|
358
|
+
svdot_s32(svdup_n_s32(0), qx0s, qy0)), GGML_CPU_FP16_TO_FP32(x0->d)*GGML_CPU_FP16_TO_FP32(y0->d));
|
|
358
359
|
sumv1 = svmla_n_f32_x(ph32, sumv1, svcvt_f32_s32_x(ph32,
|
|
359
|
-
svdot_s32(svdup_n_s32(0), qx1s, qy1)),
|
|
360
|
+
svdot_s32(svdup_n_s32(0), qx1s, qy1)), GGML_CPU_FP16_TO_FP32(x1->d)*GGML_CPU_FP16_TO_FP32(y1->d));
|
|
360
361
|
}
|
|
361
362
|
|
|
362
363
|
sumf = svaddv_f32(ph32, svadd_f32_x(ph32, sumv0, sumv1));
|
|
@@ -404,8 +405,8 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
404
405
|
const int32x4_t p_0 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), v0_0ls, v1_0l), v0_0hs, v1_0h);
|
|
405
406
|
const int32x4_t p_1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), v0_1ls, v1_1l), v0_1hs, v1_1h);
|
|
406
407
|
|
|
407
|
-
sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0),
|
|
408
|
-
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1),
|
|
408
|
+
sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), GGML_CPU_FP16_TO_FP32(x0->d)*GGML_CPU_FP16_TO_FP32(y0->d));
|
|
409
|
+
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), GGML_CPU_FP16_TO_FP32(x1->d)*GGML_CPU_FP16_TO_FP32(y1->d));
|
|
409
410
|
}
|
|
410
411
|
|
|
411
412
|
sumf = vaddvq_f32(sumv0) + vaddvq_f32(sumv1);
|
|
@@ -423,7 +424,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
423
424
|
}
|
|
424
425
|
|
|
425
426
|
int sumi = sumi0 + sumi1;
|
|
426
|
-
sumf += sumi*
|
|
427
|
+
sumf += sumi*GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d);
|
|
427
428
|
}
|
|
428
429
|
|
|
429
430
|
*s = sumf;
|
|
@@ -464,10 +465,10 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
464
465
|
const block_q8_1 * GGML_RESTRICT b_y1 = &vy1[i];
|
|
465
466
|
|
|
466
467
|
float32_t summs_t[4] = {
|
|
467
|
-
|
|
468
|
-
|
|
469
|
-
|
|
470
|
-
|
|
468
|
+
GGML_CPU_FP16_TO_FP32(b_x0->m) * GGML_CPU_FP16_TO_FP32(b_y0->s),
|
|
469
|
+
GGML_CPU_FP16_TO_FP32(b_x1->m) * GGML_CPU_FP16_TO_FP32(b_y0->s),
|
|
470
|
+
GGML_CPU_FP16_TO_FP32(b_x0->m) * GGML_CPU_FP16_TO_FP32(b_y1->s),
|
|
471
|
+
GGML_CPU_FP16_TO_FP32(b_x1->m) * GGML_CPU_FP16_TO_FP32(b_y1->s)
|
|
471
472
|
};
|
|
472
473
|
summs0 = vaddq_f32(summs0, vld1q_f32(summs_t));
|
|
473
474
|
|
|
@@ -490,10 +491,10 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
490
491
|
|
|
491
492
|
// mmla into int32x4_t
|
|
492
493
|
float32_t _scale[4] = {
|
|
493
|
-
|
|
494
|
-
|
|
495
|
-
|
|
496
|
-
|
|
494
|
+
GGML_CPU_FP16_TO_FP32(b_x0->d)*GGML_CPU_FP16_TO_FP32(b_y0->d),
|
|
495
|
+
GGML_CPU_FP16_TO_FP32(b_x0->d)*GGML_CPU_FP16_TO_FP32(b_y1->d),
|
|
496
|
+
GGML_CPU_FP16_TO_FP32(b_x1->d)*GGML_CPU_FP16_TO_FP32(b_y0->d),
|
|
497
|
+
GGML_CPU_FP16_TO_FP32(b_x1->d)*GGML_CPU_FP16_TO_FP32(b_y1->d)
|
|
497
498
|
};
|
|
498
499
|
float32x4_t scale = vld1q_f32(_scale);
|
|
499
500
|
|
|
@@ -539,7 +540,7 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
539
540
|
const block_q8_1 * GGML_RESTRICT y0 = &y[ib + 0];
|
|
540
541
|
const block_q8_1 * GGML_RESTRICT y1 = &y[ib + 1];
|
|
541
542
|
|
|
542
|
-
summs +=
|
|
543
|
+
summs += GGML_CPU_FP16_TO_FP32(x0->m) * GGML_CPU_FP16_TO_FP32(y0->s) + GGML_CPU_FP16_TO_FP32(x1->m) * GGML_CPU_FP16_TO_FP32(y1->s);
|
|
543
544
|
|
|
544
545
|
const uint8x16_t m4b = vdupq_n_u8(0x0F);
|
|
545
546
|
|
|
@@ -562,8 +563,8 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
562
563
|
const int32x4_t p_0 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), v0_0l, v1_0l), v0_0h, v1_0h);
|
|
563
564
|
const int32x4_t p_1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), v0_1l, v1_1l), v0_1h, v1_1h);
|
|
564
565
|
|
|
565
|
-
sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0),
|
|
566
|
-
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1),
|
|
566
|
+
sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), GGML_CPU_FP16_TO_FP32(x0->d)*GGML_CPU_FP16_TO_FP32(y0->d));
|
|
567
|
+
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), GGML_CPU_FP16_TO_FP32(x1->d)*GGML_CPU_FP16_TO_FP32(y1->d));
|
|
567
568
|
}
|
|
568
569
|
|
|
569
570
|
sumf = vaddvq_f32(sumv0) + vaddvq_f32(sumv1) + summs;
|
|
@@ -582,7 +583,7 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
582
583
|
}
|
|
583
584
|
|
|
584
585
|
int sumi = sumi0 + sumi1;
|
|
585
|
-
sumf += (
|
|
586
|
+
sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
|
|
586
587
|
}
|
|
587
588
|
|
|
588
589
|
*s = sumf;
|
|
@@ -666,10 +667,10 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
666
667
|
|
|
667
668
|
sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(
|
|
668
669
|
ggml_vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l),
|
|
669
|
-
ggml_vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))),
|
|
670
|
+
ggml_vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), GGML_CPU_FP16_TO_FP32(x0->d)*GGML_CPU_FP16_TO_FP32(y0->d));
|
|
670
671
|
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(
|
|
671
672
|
ggml_vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l),
|
|
672
|
-
ggml_vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))),
|
|
673
|
+
ggml_vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), GGML_CPU_FP16_TO_FP32(x1->d)*GGML_CPU_FP16_TO_FP32(y1->d));
|
|
673
674
|
}
|
|
674
675
|
|
|
675
676
|
sumf = vaddvq_f32(sumv0) + vaddvq_f32(sumv1);
|
|
@@ -694,7 +695,7 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
694
695
|
}
|
|
695
696
|
|
|
696
697
|
int sumi = sumi0 + sumi1;
|
|
697
|
-
sumf += (
|
|
698
|
+
sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d)) * sumi;
|
|
698
699
|
}
|
|
699
700
|
|
|
700
701
|
*s = sumf;
|
|
@@ -739,8 +740,8 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
739
740
|
|
|
740
741
|
const uint8x16_t m4b = vdupq_n_u8(0x0F);
|
|
741
742
|
|
|
742
|
-
summs0 +=
|
|
743
|
-
summs1 +=
|
|
743
|
+
summs0 += GGML_CPU_FP16_TO_FP32(x0->m) * GGML_CPU_FP16_TO_FP32(y0->s);
|
|
744
|
+
summs1 += GGML_CPU_FP16_TO_FP32(x1->m) * GGML_CPU_FP16_TO_FP32(y1->s);
|
|
744
745
|
|
|
745
746
|
// extract the 5th bit via lookup table ((b) << 4)
|
|
746
747
|
memcpy(&qh0, x0->qh, sizeof(qh0));
|
|
@@ -784,10 +785,10 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
784
785
|
|
|
785
786
|
sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(
|
|
786
787
|
ggml_vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l),
|
|
787
|
-
ggml_vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))),
|
|
788
|
+
ggml_vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), GGML_CPU_FP16_TO_FP32(x0->d)*GGML_CPU_FP16_TO_FP32(y0->d));
|
|
788
789
|
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(
|
|
789
790
|
ggml_vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l),
|
|
790
|
-
ggml_vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))),
|
|
791
|
+
ggml_vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), GGML_CPU_FP16_TO_FP32(x1->d)*GGML_CPU_FP16_TO_FP32(y1->d));
|
|
791
792
|
}
|
|
792
793
|
|
|
793
794
|
sumf = vaddvq_f32(sumv0) + vaddvq_f32(sumv1) + summs0 + summs1;
|
|
@@ -812,7 +813,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
812
813
|
}
|
|
813
814
|
|
|
814
815
|
int sumi = sumi0 + sumi1;
|
|
815
|
-
sumf += (
|
|
816
|
+
sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
|
|
816
817
|
}
|
|
817
818
|
|
|
818
819
|
*s = sumf;
|
|
@@ -864,10 +865,10 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
864
865
|
const int8x16_t y1_h = vld1q_s8(b_y1->qs + 16);
|
|
865
866
|
|
|
866
867
|
float32_t _scale[4] = {
|
|
867
|
-
|
|
868
|
-
|
|
869
|
-
|
|
870
|
-
|
|
868
|
+
GGML_CPU_FP16_TO_FP32(b_x0->d)*GGML_CPU_FP16_TO_FP32(b_y0->d),
|
|
869
|
+
GGML_CPU_FP16_TO_FP32(b_x0->d)*GGML_CPU_FP16_TO_FP32(b_y1->d),
|
|
870
|
+
GGML_CPU_FP16_TO_FP32(b_x1->d)*GGML_CPU_FP16_TO_FP32(b_y0->d),
|
|
871
|
+
GGML_CPU_FP16_TO_FP32(b_x1->d)*GGML_CPU_FP16_TO_FP32(b_y1->d)
|
|
871
872
|
};
|
|
872
873
|
float32x4_t scale = vld1q_f32(_scale);
|
|
873
874
|
|
|
@@ -934,10 +935,10 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
934
935
|
|
|
935
936
|
sumv0 = svmla_n_f32_x(pl16, sumv0, svcvt_f32_s32_x(pl16, svadd_x(pl16,
|
|
936
937
|
svdot_s32(svdup_n_s32(0), qx0_0, qy0_0),
|
|
937
|
-
svdot_s32(svdup_n_s32(0), qx0_1, qy0_1))),
|
|
938
|
+
svdot_s32(svdup_n_s32(0), qx0_1, qy0_1))), GGML_CPU_FP16_TO_FP32(x0->d)*GGML_CPU_FP16_TO_FP32(y0->d));
|
|
938
939
|
sumv1 = svmla_n_f32_x(pl16, sumv1, svcvt_f32_s32_x(pl16, svadd_x(pl16,
|
|
939
940
|
svdot_s32(svdup_n_s32(0), qx1_0, qy1_0),
|
|
940
|
-
svdot_s32(svdup_n_s32(0), qx1_1, qy1_1))),
|
|
941
|
+
svdot_s32(svdup_n_s32(0), qx1_1, qy1_1))), GGML_CPU_FP16_TO_FP32(x1->d)*GGML_CPU_FP16_TO_FP32(y1->d));
|
|
941
942
|
}
|
|
942
943
|
|
|
943
944
|
sumf = svaddv_f32(pl16, svadd_f32_x(pl16, sumv0, sumv1));
|
|
@@ -960,9 +961,9 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
960
961
|
const svint8_t qy1 = svld1_s8(svptrue_b8(), y1->qs);
|
|
961
962
|
|
|
962
963
|
sumv0 = svmla_n_f32_x(svptrue_b32(), sumv0, svcvt_f32_s32_x(svptrue_b32(),
|
|
963
|
-
svdot_s32(svdup_n_s32(0), qx0, qy0)),
|
|
964
|
+
svdot_s32(svdup_n_s32(0), qx0, qy0)), GGML_CPU_FP16_TO_FP32(x0->d)*GGML_CPU_FP16_TO_FP32(y0->d));
|
|
964
965
|
sumv1 = svmla_n_f32_x(svptrue_b32(), sumv1, svcvt_f32_s32_x(svptrue_b32(),
|
|
965
|
-
svdot_s32(svdup_n_s32(0), qx1, qy1)),
|
|
966
|
+
svdot_s32(svdup_n_s32(0), qx1, qy1)), GGML_CPU_FP16_TO_FP32(x1->d)*GGML_CPU_FP16_TO_FP32(y1->d));
|
|
966
967
|
}
|
|
967
968
|
|
|
968
969
|
sumf = svaddv_f32(svptrue_b32(), svadd_f32_x(svptrue_b32(), sumv0, sumv1));
|
|
@@ -1002,8 +1003,8 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
1002
1003
|
qy_64 = svadd_s8_x(svptrue_b8(), qy_32, qy_64);
|
|
1003
1004
|
|
|
1004
1005
|
// scale creation
|
|
1005
|
-
const float32_t deq1 =
|
|
1006
|
-
const float32_t deq2 =
|
|
1006
|
+
const float32_t deq1 = GGML_CPU_FP16_TO_FP32(x0->d)*GGML_CPU_FP16_TO_FP32(y0->d);
|
|
1007
|
+
const float32_t deq2 = GGML_CPU_FP16_TO_FP32(x1->d)*GGML_CPU_FP16_TO_FP32(y1->d);
|
|
1007
1008
|
|
|
1008
1009
|
// duplicate deq1 in first half of vector and deq2 in second half of vector
|
|
1009
1010
|
const svfloat32_t temp = svdup_f32_m(svdup_f32_z(ph8, deq1), pl8, deq2);
|
|
@@ -1043,11 +1044,11 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
1043
1044
|
|
|
1044
1045
|
sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(
|
|
1045
1046
|
ggml_vdotq_s32(vdupq_n_s32(0), x0_0, y0_0),
|
|
1046
|
-
ggml_vdotq_s32(vdupq_n_s32(0), x0_1, y0_1))),
|
|
1047
|
+
ggml_vdotq_s32(vdupq_n_s32(0), x0_1, y0_1))), GGML_CPU_FP16_TO_FP32(x0->d)*GGML_CPU_FP16_TO_FP32(y0->d));
|
|
1047
1048
|
|
|
1048
1049
|
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(
|
|
1049
1050
|
ggml_vdotq_s32(vdupq_n_s32(0), x1_0, y1_0),
|
|
1050
|
-
ggml_vdotq_s32(vdupq_n_s32(0), x1_1, y1_1))),
|
|
1051
|
+
ggml_vdotq_s32(vdupq_n_s32(0), x1_1, y1_1))), GGML_CPU_FP16_TO_FP32(x1->d)*GGML_CPU_FP16_TO_FP32(y1->d));
|
|
1051
1052
|
}
|
|
1052
1053
|
|
|
1053
1054
|
sumf = vaddvq_f32(sumv0) + vaddvq_f32(sumv1);
|
|
@@ -1059,7 +1060,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
1059
1060
|
sumi += x[ib].qs[j]*y[ib].qs[j];
|
|
1060
1061
|
}
|
|
1061
1062
|
|
|
1062
|
-
sumf += sumi*(
|
|
1063
|
+
sumf += sumi*(GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d));
|
|
1063
1064
|
}
|
|
1064
1065
|
|
|
1065
1066
|
*s = sumf;
|
|
@@ -1217,7 +1218,7 @@ void ggml_vec_dot_tq1_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
|
|
1217
1218
|
const int16x8_t ysum0 = vld1q_s16(y[i].bsums);
|
|
1218
1219
|
const int16x8_t ysum1 = vld1q_s16(y[i].bsums + 8);
|
|
1219
1220
|
|
|
1220
|
-
const float d =
|
|
1221
|
+
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
|
1221
1222
|
|
|
1222
1223
|
#if defined(__ARM_FEATURE_DOTPROD)
|
|
1223
1224
|
sumi0 = vaddq_s32(sumi0, sumi1);
|
|
@@ -1269,7 +1270,7 @@ void ggml_vec_dot_tq1_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
|
|
1269
1270
|
}
|
|
1270
1271
|
}
|
|
1271
1272
|
|
|
1272
|
-
sumf += (float) sum * (
|
|
1273
|
+
sumf += (float) sum * (GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d);
|
|
1273
1274
|
}
|
|
1274
1275
|
|
|
1275
1276
|
*s = sumf;
|
|
@@ -1362,7 +1363,7 @@ void ggml_vec_dot_tq2_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
|
|
1362
1363
|
const int16x8_t ysum0 = vld1q_s16(y[i].bsums);
|
|
1363
1364
|
const int16x8_t ysum1 = vld1q_s16(y[i].bsums + 8);
|
|
1364
1365
|
|
|
1365
|
-
const float d =
|
|
1366
|
+
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
|
1366
1367
|
|
|
1367
1368
|
#if defined(__ARM_FEATURE_DOTPROD)
|
|
1368
1369
|
sumi0 = vaddq_s32(sumi0, sumi1);
|
|
@@ -1393,7 +1394,7 @@ void ggml_vec_dot_tq2_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
|
|
1393
1394
|
}
|
|
1394
1395
|
}
|
|
1395
1396
|
|
|
1396
|
-
const float d = y[i].d *
|
|
1397
|
+
const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
|
|
1397
1398
|
|
|
1398
1399
|
sumf += (float) sumi * d;
|
|
1399
1400
|
}
|
|
@@ -1425,9 +1426,9 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
1425
1426
|
switch (vector_length) {
|
|
1426
1427
|
case 128:
|
|
1427
1428
|
for (int i = 0; i < nb; ++i) {
|
|
1428
|
-
const float d = y[i].d *
|
|
1429
|
+
const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
|
|
1429
1430
|
svfloat32_t d_broad = svdup_n_f32((float32_t)d);
|
|
1430
|
-
const float dmin = -y[i].d *
|
|
1431
|
+
const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
|
|
1431
1432
|
svfloat32_t dmin_broad = svdup_n_f32((float32_t)dmin);
|
|
1432
1433
|
|
|
1433
1434
|
const uint8_t * GGML_RESTRICT q2 = x[i].qs;
|
|
@@ -1570,9 +1571,9 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
1570
1571
|
case 256:
|
|
1571
1572
|
case 512:
|
|
1572
1573
|
for (int i = 0; i < nb; ++i) {
|
|
1573
|
-
const float d = y[i].d *
|
|
1574
|
+
const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
|
|
1574
1575
|
svfloat32_t d_broad = svdup_n_f32((float32_t)d);
|
|
1575
|
-
const float dmin = -y[i].d *
|
|
1576
|
+
const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
|
|
1576
1577
|
svfloat32_t dmin_broad = svdup_n_f32((float32_t)dmin);
|
|
1577
1578
|
|
|
1578
1579
|
const uint8_t * GGML_RESTRICT q2 = x[i].qs;
|
|
@@ -1671,8 +1672,8 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
1671
1672
|
float sum = 0;
|
|
1672
1673
|
|
|
1673
1674
|
for (int i = 0; i < nb; ++i) {
|
|
1674
|
-
const float d = y[i].d *
|
|
1675
|
-
const float dmin = -y[i].d *
|
|
1675
|
+
const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
|
|
1676
|
+
const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
|
|
1676
1677
|
|
|
1677
1678
|
const uint8_t * GGML_RESTRICT q2 = x[i].qs;
|
|
1678
1679
|
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
|
@@ -1742,8 +1743,8 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
1742
1743
|
summs += y[i].bsums[j] * (sc[j] >> 4);
|
|
1743
1744
|
}
|
|
1744
1745
|
|
|
1745
|
-
const float dall = y[i].d *
|
|
1746
|
-
const float dmin = y[i].d *
|
|
1746
|
+
const float dall = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
|
|
1747
|
+
const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
|
|
1747
1748
|
|
|
1748
1749
|
int isum = 0;
|
|
1749
1750
|
int is = 0;
|
|
@@ -1805,7 +1806,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
1805
1806
|
|
|
1806
1807
|
for (int i = 0; i < nb; ++i) {
|
|
1807
1808
|
|
|
1808
|
-
const float d = y[i].d *
|
|
1809
|
+
const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
|
|
1809
1810
|
|
|
1810
1811
|
const uint8_t * GGML_RESTRICT q3_sv = x[i].qs;
|
|
1811
1812
|
const uint8_t * GGML_RESTRICT qh_sv = x[i].hmask;
|
|
@@ -1981,7 +1982,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
1981
1982
|
|
|
1982
1983
|
for (int i = 0; i < nb; ++i) {
|
|
1983
1984
|
|
|
1984
|
-
const float d = y[i].d *
|
|
1985
|
+
const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
|
|
1985
1986
|
|
|
1986
1987
|
const uint8_t * GGML_RESTRICT q3 = x[i].qs;
|
|
1987
1988
|
const uint8_t * GGML_RESTRICT qh = x[i].hmask;
|
|
@@ -2112,7 +2113,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
2112
2113
|
for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l];
|
|
2113
2114
|
q8 += 8; a += 8;
|
|
2114
2115
|
}
|
|
2115
|
-
const float d =
|
|
2116
|
+
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
|
2116
2117
|
for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
|
|
2117
2118
|
}
|
|
2118
2119
|
for (int l = 0; l < 8; ++l) sumf += sums[l];
|
|
@@ -2258,18 +2259,18 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
2258
2259
|
bias[3] = vaddvq_s32(vaddq_s32(vmull_s16(vget_low_s16(y1_sums), vget_low_s16(x1_mins)),
|
|
2259
2260
|
vmull_s16(vget_high_s16(y1_sums), vget_high_s16(x1_mins))));
|
|
2260
2261
|
const float32x4_t dmins = {
|
|
2261
|
-
|
|
2262
|
-
|
|
2263
|
-
|
|
2264
|
-
|
|
2262
|
+
GGML_CPU_FP16_TO_FP32(x0->dmin) * y0->d,
|
|
2263
|
+
GGML_CPU_FP16_TO_FP32(x0->dmin) * y1->d,
|
|
2264
|
+
GGML_CPU_FP16_TO_FP32(x1->dmin) * y0->d,
|
|
2265
|
+
GGML_CPU_FP16_TO_FP32(x1->dmin) * y1->d,
|
|
2265
2266
|
};
|
|
2266
2267
|
vfsum = vmlsq_f32(vfsum, vcvtq_f32_s32(vld1q_s32(bias)), dmins);
|
|
2267
2268
|
|
|
2268
2269
|
const float32x4_t superblock_scale = {
|
|
2269
|
-
|
|
2270
|
-
|
|
2271
|
-
|
|
2272
|
-
|
|
2270
|
+
GGML_CPU_FP16_TO_FP32(x0->d) * y0->d,
|
|
2271
|
+
GGML_CPU_FP16_TO_FP32(x0->d) * y1->d,
|
|
2272
|
+
GGML_CPU_FP16_TO_FP32(x1->d) * y0->d,
|
|
2273
|
+
GGML_CPU_FP16_TO_FP32(x1->d) * y1->d,
|
|
2273
2274
|
};
|
|
2274
2275
|
vfsum = vmlaq_f32(vfsum, vcvtq_f32_s32(visum), superblock_scale);
|
|
2275
2276
|
}
|
|
@@ -2289,8 +2290,8 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
2289
2290
|
float sumf = 0;
|
|
2290
2291
|
for (int i = 0; i < nb; ++i) {
|
|
2291
2292
|
|
|
2292
|
-
const float d = y[i].d *
|
|
2293
|
-
const float dmin = y[i].d *
|
|
2293
|
+
const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
|
|
2294
|
+
const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
|
|
2294
2295
|
|
|
2295
2296
|
const int16x8_t q8sums = vpaddq_s16(vld1q_s16(y[i].bsums), vld1q_s16(y[i].bsums + 8));
|
|
2296
2297
|
|
|
@@ -2377,8 +2378,8 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
2377
2378
|
|
|
2378
2379
|
for (int i = 0; i < nb; ++i) {
|
|
2379
2380
|
|
|
2380
|
-
const float d = y[i].d *
|
|
2381
|
-
const float dmin = y[i].d *
|
|
2381
|
+
const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
|
|
2382
|
+
const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
|
|
2382
2383
|
|
|
2383
2384
|
const int16x8_t q8sums = vpaddq_s16(vld1q_s16(y[i].bsums), vld1q_s16(y[i].bsums + 8));
|
|
2384
2385
|
|
|
@@ -2478,9 +2479,9 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
2478
2479
|
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
|
2479
2480
|
q8 += 8; a += 8;
|
|
2480
2481
|
}
|
|
2481
|
-
const float d =
|
|
2482
|
+
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
|
2482
2483
|
for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
|
|
2483
|
-
const float dmin =
|
|
2484
|
+
const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
|
|
2484
2485
|
sumf -= dmin * sumi;
|
|
2485
2486
|
}
|
|
2486
2487
|
for (int l = 0; l < 8; ++l) sumf += sums[l];
|
|
@@ -2520,8 +2521,8 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
2520
2521
|
|
|
2521
2522
|
for (int i = 0; i < nb; ++i) {
|
|
2522
2523
|
|
|
2523
|
-
const float d = y[i].d *
|
|
2524
|
-
const float dmin = y[i].d *
|
|
2524
|
+
const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
|
|
2525
|
+
const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
|
|
2525
2526
|
|
|
2526
2527
|
const int16x8_t q8sums = vpaddq_s16(vld1q_s16(y[i].bsums), vld1q_s16(y[i].bsums + 8));
|
|
2527
2528
|
|
|
@@ -2630,9 +2631,9 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
2630
2631
|
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
|
2631
2632
|
q8 += 8; a += 8;
|
|
2632
2633
|
}
|
|
2633
|
-
const float d =
|
|
2634
|
+
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
|
2634
2635
|
for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
|
|
2635
|
-
const float dmin =
|
|
2636
|
+
const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
|
|
2636
2637
|
sumf -= dmin * sumi;
|
|
2637
2638
|
}
|
|
2638
2639
|
for (int l = 0; l < 8; ++l) sumf += sums[l];
|
|
@@ -2827,10 +2828,10 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
2827
2828
|
const int32x4_t vibias = vmulq_n_s32(vld1q_s32(bias), 32);
|
|
2828
2829
|
|
|
2829
2830
|
const float32x4_t superblock_scale = {
|
|
2830
|
-
|
|
2831
|
-
|
|
2832
|
-
|
|
2833
|
-
|
|
2831
|
+
GGML_CPU_FP16_TO_FP32(x0->d) * y0->d,
|
|
2832
|
+
GGML_CPU_FP16_TO_FP32(x0->d) * y1->d,
|
|
2833
|
+
GGML_CPU_FP16_TO_FP32(x1->d) * y0->d,
|
|
2834
|
+
GGML_CPU_FP16_TO_FP32(x1->d) * y1->d,
|
|
2834
2835
|
};
|
|
2835
2836
|
|
|
2836
2837
|
visum = vsubq_s32(visum, vibias);
|
|
@@ -2858,7 +2859,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
2858
2859
|
svuint8_t q6h_1, q6h_2, q6h_3, q6h_4;
|
|
2859
2860
|
|
|
2860
2861
|
for (int i = 0; i < nb; ++i) {
|
|
2861
|
-
const float d_all =
|
|
2862
|
+
const float d_all = GGML_CPU_FP16_TO_FP32(x[i].d);
|
|
2862
2863
|
|
|
2863
2864
|
const uint8_t * GGML_RESTRICT q6 = x[i].ql;
|
|
2864
2865
|
const uint8_t * GGML_RESTRICT qh = x[i].qh;
|
|
@@ -3011,7 +3012,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
3011
3012
|
|
|
3012
3013
|
for (int i = 0; i < nb; ++i) {
|
|
3013
3014
|
|
|
3014
|
-
const float d_all =
|
|
3015
|
+
const float d_all = GGML_CPU_FP16_TO_FP32(x[i].d);
|
|
3015
3016
|
|
|
3016
3017
|
const uint8_t * GGML_RESTRICT q6 = x[i].ql;
|
|
3017
3018
|
const uint8_t * GGML_RESTRICT qh = x[i].qh;
|
|
@@ -3128,7 +3129,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
3128
3129
|
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
|
3129
3130
|
q8 += 8; a += 8;
|
|
3130
3131
|
}
|
|
3131
|
-
const float d =
|
|
3132
|
+
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
|
3132
3133
|
for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
|
|
3133
3134
|
}
|
|
3134
3135
|
for (int l = 0; l < 8; ++l) sumf += sums[l];
|
|
@@ -3199,7 +3200,7 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const
|
|
|
3199
3200
|
|
|
3200
3201
|
float sumf = 0;
|
|
3201
3202
|
for (int i = 0; i < nb; ++i) {
|
|
3202
|
-
const float d =
|
|
3203
|
+
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
|
3203
3204
|
const uint16_t * GGML_RESTRICT q2 = x[i].qs;
|
|
3204
3205
|
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
|
3205
3206
|
float sumf1 = 0, sumf2 = 0;
|
|
@@ -3234,7 +3235,7 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const
|
|
|
3234
3235
|
|
|
3235
3236
|
float sumf = 0.f;
|
|
3236
3237
|
for (int i = 0; i < nb; ++i) {
|
|
3237
|
-
const float d =
|
|
3238
|
+
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
|
3238
3239
|
const uint16_t * GGML_RESTRICT q2 = x[i].qs;
|
|
3239
3240
|
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
|
3240
3241
|
int32_t bsum = 0;
|
|
@@ -3284,7 +3285,7 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
|
|
|
3284
3285
|
|
|
3285
3286
|
float sumf = 0;
|
|
3286
3287
|
for (int i = 0; i < nb; ++i) {
|
|
3287
|
-
const float d =
|
|
3288
|
+
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
|
3288
3289
|
const uint16_t * GGML_RESTRICT q2 = x[i].qs;
|
|
3289
3290
|
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
|
3290
3291
|
const uint8x8_t scales8 = vld1_u8(x[i].scales);
|
|
@@ -3329,7 +3330,7 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
|
|
|
3329
3330
|
|
|
3330
3331
|
float sumf = 0.f;
|
|
3331
3332
|
for (int i = 0; i < nb; ++i) {
|
|
3332
|
-
const float d =
|
|
3333
|
+
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
|
3333
3334
|
const uint16_t * GGML_RESTRICT q2 = x[i].qs;
|
|
3334
3335
|
const uint8_t * GGML_RESTRICT sc = x[i].scales;
|
|
3335
3336
|
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
|
@@ -3398,7 +3399,7 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
|
|
3398
3399
|
float sumf = 0;
|
|
3399
3400
|
for (int i = 0; i < nb; ++i) {
|
|
3400
3401
|
|
|
3401
|
-
const float d =
|
|
3402
|
+
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
|
3402
3403
|
|
|
3403
3404
|
const uint8_t * GGML_RESTRICT qs = x[i].qs;
|
|
3404
3405
|
const uint8_t * GGML_RESTRICT qh = x[i].qh;
|
|
@@ -3458,7 +3459,7 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
|
|
3458
3459
|
float sumf = 0;
|
|
3459
3460
|
for (int i = 0; i < nb; i++) {
|
|
3460
3461
|
|
|
3461
|
-
const float d =
|
|
3462
|
+
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
|
3462
3463
|
const int8_t * q8 = y[i].qs;
|
|
3463
3464
|
const uint8_t * qs = x[i].qs;
|
|
3464
3465
|
const uint8_t * qh = x[i].qh;
|
|
@@ -3521,7 +3522,7 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const
|
|
|
3521
3522
|
|
|
3522
3523
|
float sumf = 0;
|
|
3523
3524
|
for (int i = 0; i < nb; ++i) {
|
|
3524
|
-
const float d =
|
|
3525
|
+
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
|
3525
3526
|
const uint8_t * GGML_RESTRICT q3 = x[i].qs;
|
|
3526
3527
|
const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4;
|
|
3527
3528
|
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
|
@@ -3557,7 +3558,7 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const
|
|
|
3557
3558
|
|
|
3558
3559
|
float sumf = 0.f;
|
|
3559
3560
|
for (int i = 0; i < nb; ++i) {
|
|
3560
|
-
const float d =
|
|
3561
|
+
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
|
3561
3562
|
const uint8_t * GGML_RESTRICT q3 = x[i].qs;
|
|
3562
3563
|
const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4;
|
|
3563
3564
|
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
|
@@ -3630,7 +3631,7 @@ void ggml_vec_dot_iq3_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
|
|
3630
3631
|
|
|
3631
3632
|
float sumf = 0;
|
|
3632
3633
|
for (int i = 0; i < nb; ++i) {
|
|
3633
|
-
const float d =
|
|
3634
|
+
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
|
3634
3635
|
const uint8_t * GGML_RESTRICT qs = x[i].qs;
|
|
3635
3636
|
const uint8_t * GGML_RESTRICT qh = x[i].qh;
|
|
3636
3637
|
const uint16_t * GGML_RESTRICT signs = (const uint16_t *)x[i].signs;
|
|
@@ -3691,7 +3692,7 @@ void ggml_vec_dot_iq3_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
|
|
3691
3692
|
|
|
3692
3693
|
float sumf = 0.f;
|
|
3693
3694
|
for (int i = 0; i < nb; ++i) {
|
|
3694
|
-
const float d =
|
|
3695
|
+
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
|
3695
3696
|
const uint8_t * GGML_RESTRICT qs = x[i].qs;
|
|
3696
3697
|
const uint8_t * GGML_RESTRICT qh = x[i].qh;
|
|
3697
3698
|
const uint8_t * GGML_RESTRICT signs = x[i].signs;
|
|
@@ -3786,7 +3787,7 @@ void ggml_vec_dot_iq1_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
|
|
3786
3787
|
|
|
3787
3788
|
}
|
|
3788
3789
|
|
|
3789
|
-
sumf += y[i].d *
|
|
3790
|
+
sumf += y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d) * (sumi1 + sumi2 + IQ1S_DELTA * sumi3);
|
|
3790
3791
|
}
|
|
3791
3792
|
|
|
3792
3793
|
*s = sumf;
|
|
@@ -3817,7 +3818,7 @@ void ggml_vec_dot_iq1_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
|
|
3817
3818
|
qs += 4;
|
|
3818
3819
|
}
|
|
3819
3820
|
|
|
3820
|
-
sumf +=
|
|
3821
|
+
sumf += GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d * (sumi + IQ1S_DELTA * sumi1);
|
|
3821
3822
|
}
|
|
3822
3823
|
|
|
3823
3824
|
*s = sumf;
|
|
@@ -3905,7 +3906,7 @@ void ggml_vec_dot_iq1_m_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
|
|
3905
3906
|
|
|
3906
3907
|
}
|
|
3907
3908
|
|
|
3908
|
-
sumf += y[i].d *
|
|
3909
|
+
sumf += y[i].d * GGML_CPU_FP16_TO_FP32(scale.f16) * (vaddvq_s32(sumi1) + IQ1M_DELTA * vaddvq_s32(sumi2));
|
|
3909
3910
|
}
|
|
3910
3911
|
|
|
3911
3912
|
*s = sumf;
|
|
@@ -3952,7 +3953,7 @@ void ggml_vec_dot_iq1_m_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
|
|
3952
3953
|
qh += 2;
|
|
3953
3954
|
}
|
|
3954
3955
|
|
|
3955
|
-
sumf +=
|
|
3956
|
+
sumf += GGML_CPU_FP16_TO_FP32(scale.f16) * y[i].d * (sumi1 + IQ1M_DELTA * sumi2);
|
|
3956
3957
|
}
|
|
3957
3958
|
|
|
3958
3959
|
*s = sumf;
|
|
@@ -4003,13 +4004,13 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const v
|
|
|
4003
4004
|
prod_2 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q4b.val[2], q8b.val[2]), q4b.val[3], q8b.val[3]);
|
|
4004
4005
|
|
|
4005
4006
|
sumf +=
|
|
4006
|
-
|
|
4007
|
-
|
|
4007
|
+
GGML_CPU_FP16_TO_FP32(x[ib+0].d) * GGML_CPU_FP16_TO_FP32(y[ib + 0].d) * vaddvq_s32(prod_1) +
|
|
4008
|
+
GGML_CPU_FP16_TO_FP32(x[ib+1].d) * GGML_CPU_FP16_TO_FP32(y[ib + 1].d) * vaddvq_s32(prod_2);
|
|
4008
4009
|
}
|
|
4009
4010
|
|
|
4010
4011
|
#endif
|
|
4011
4012
|
for (; ib < nb; ++ib) {
|
|
4012
|
-
const float d =
|
|
4013
|
+
const float d = GGML_CPU_FP16_TO_FP32(y[ib].d)*GGML_CPU_FP16_TO_FP32(x[ib].d);
|
|
4013
4014
|
int sumi1 = 0, sumi2 = 0;
|
|
4014
4015
|
for (int j = 0; j < QK4_NL/2; ++j) {
|
|
4015
4016
|
sumi1 += y[ib].qs[j+ 0] * kvalues_iq4nl[x[ib].qs[j] & 0xf];
|
|
@@ -4071,7 +4072,7 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
|
|
|
4071
4072
|
|
|
4072
4073
|
}
|
|
4073
4074
|
|
|
4074
|
-
sumf +=
|
|
4075
|
+
sumf += GGML_CPU_FP16_TO_FP32(x[ibl].d) * y[ibl].d * (sumi1 + sumi2);
|
|
4075
4076
|
}
|
|
4076
4077
|
|
|
4077
4078
|
*s = sumf;
|
|
@@ -4079,7 +4080,7 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
|
|
|
4079
4080
|
#else
|
|
4080
4081
|
float sumf = 0;
|
|
4081
4082
|
for (int ibl = 0; ibl < nb; ++ibl) {
|
|
4082
|
-
const float d4d8 =
|
|
4083
|
+
const float d4d8 = GGML_CPU_FP16_TO_FP32(x[ibl].d) * y[ibl].d;
|
|
4083
4084
|
uint16_t h = x[ibl].scales_h;
|
|
4084
4085
|
const uint8_t * qs = x[ibl].qs;
|
|
4085
4086
|
const int8_t * q8 = y[ibl].qs;
|