@fugood/llama.node 1.0.0-beta.6 → 1.0.0
This diff compares the publicly released contents of the two package versions as they appear in their respective public registries. It is provided for informational purposes only.
- package/CMakeLists.txt +2 -0
- package/lib/binding.ts +12 -0
- package/lib/index.js +10 -0
- package/lib/index.ts +17 -1
- package/package.json +14 -14
- package/src/EmbeddingWorker.cpp +1 -1
- package/src/LlamaCompletionWorker.cpp +7 -3
- package/src/LlamaCompletionWorker.h +2 -0
- package/src/LlamaContext.cpp +49 -6
- package/src/LlamaContext.h +1 -0
- package/src/RerankWorker.h +26 -0
- package/src/common.hpp +1 -1
- package/src/llama.cpp/CMakeLists.txt +1 -1
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +3 -46
- package/src/llama.cpp/ggml/CMakeLists.txt +1 -0
- package/src/llama.cpp/ggml/include/ggml-cpu.h +1 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +8 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp +10 -9
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +109 -108
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +13 -12
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +53 -52
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +56 -55
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +42 -41
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp +24 -23
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +29 -28
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/wasm/quants.c +30 -29
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +83 -82
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +20 -19
- package/src/llama.cpp/ggml/src/ggml-cpu/common.h +3 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +9 -3
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +59 -16
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +3 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +3 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +48 -48
- package/src/llama.cpp/ggml/src/ggml-cpu/quants.c +25 -24
- package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +15 -14
- package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +211 -33
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +2 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +45 -45
- package/src/llama.cpp/include/llama.h +6 -3
- package/src/llama.cpp/src/llama-arch.cpp +54 -0
- package/src/llama.cpp/src/llama-arch.h +17 -0
- package/src/llama.cpp/src/llama-batch.cpp +20 -7
- package/src/llama.cpp/src/llama-chat.cpp +11 -6
- package/src/llama.cpp/src/llama-context.cpp +0 -1
- package/src/llama.cpp/src/llama-graph.cpp +19 -4
- package/src/llama.cpp/src/llama-graph.h +14 -2
- package/src/llama.cpp/src/llama-hparams.h +6 -0
- package/src/llama.cpp/src/llama-kv-cache-unified.cpp +28 -2
- package/src/llama.cpp/src/llama-kv-cells.h +33 -9
- package/src/llama.cpp/src/llama-model.cpp +518 -1
- package/src/llama.cpp/src/llama-model.h +22 -0
- package/src/llama.cpp/src/llama-quant.cpp +87 -5
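Judging by the matching +56/−55 line counts and the VSX intrinsics, the diff reproduced below is the one for `package/src/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c`. Apart from the added `#include "simd-mappings.h"`, every hunk makes the same mechanical change: the per-block fp16 scale conversions are routed through the CPU-backend helpers `GGML_CPU_FP16_TO_FP32`/`GGML_CPU_FP32_TO_FP16` provided by simd-mappings.h. The standalone sketch below illustrates that scale-conversion pattern for a q8_0-style block; it is not code from the package — `fp32_to_fp16`/`fp16_to_fp32` are simplified stand-ins for the ggml helpers (normal values only), and `block_q8_0_like` is an illustrative struct, not ggml's actual `block_q8_0`.

```c
#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define QK 32  // block size, mirrors QK8_0

// Simplified fp32 <-> fp16 conversions standing in for the ggml CPU helpers.
// Normal values only: subnormals flush to zero, Inf/NaN are not handled.
static uint16_t fp32_to_fp16(float f) {
    uint32_t bits;
    memcpy(&bits, &f, sizeof(bits));
    uint32_t sign = (bits >> 16) & 0x8000u;
    int32_t  e    = (int32_t)((bits >> 23) & 0xffu) - 127 + 15;
    uint32_t m    = bits & 0x7fffffu;
    if (e <= 0)  return (uint16_t)sign;              // too small: flush to zero
    if (e >= 31) return (uint16_t)(sign | 0x7c00u);  // too large: clamp to infinity
    return (uint16_t)(sign | ((uint32_t)e << 10) | (m >> 13));
}

static float fp16_to_fp32(uint16_t h) {
    uint32_t sign = (uint32_t)(h & 0x8000u) << 16;
    uint32_t e    = (h >> 10) & 0x1fu;
    uint32_t m    = h & 0x3ffu;
    uint32_t bits = (e == 0) ? sign : sign | ((e + 112u) << 23) | (m << 13);
    float f;
    memcpy(&f, &bits, sizeof(f));
    return f;
}

// Illustrative q8_0-style block: 32 int8 weights plus one fp16 scale.
typedef struct {
    uint16_t d;       // scale, stored in half precision
    int8_t   qs[QK];  // quantized values
} block_q8_0_like;

// Quantize one block: pick d = max|x| / 127, store it as fp16, scale the values.
static void quantize_block(const float *x, block_q8_0_like *y) {
    float amax = 0.0f;
    for (int j = 0; j < QK; j++) {
        const float v = fabsf(x[j]);
        if (v > amax) amax = v;
    }
    const float d  = amax / 127.0f;
    const float id = d ? 1.0f / d : 0.0f;
    y->d = fp32_to_fp16(d);  // the FP32 -> FP16 store the diff performs with GGML_CPU_FP32_TO_FP16
    for (int j = 0; j < QK; j++) {
        y->qs[j] = (int8_t)roundf(x[j] * id);
    }
}

// Dot product of two quantized blocks: integer accumulate, then rescale with
// both fp16 scales converted back to fp32 (GGML_CPU_FP16_TO_FP32 in the diff).
static float dot_block(const block_q8_0_like *a, const block_q8_0_like *b) {
    int sumi = 0;
    for (int j = 0; j < QK; j++) {
        sumi += a->qs[j] * b->qs[j];
    }
    return sumi * (fp16_to_fp32(a->d) * fp16_to_fp32(b->d));
}

int main(void) {
    float x[QK];
    for (int j = 0; j < QK; j++) {
        x[j] = sinf(0.37f * (float)j);
    }
    block_q8_0_like q;
    quantize_block(x, &q);

    float ref = 0.0f;
    for (int j = 0; j < QK; j++) ref += x[j] * x[j];

    printf("full-precision dot: %.6f  quantized dot: %.6f\n", ref, dot_block(&q, &q));
    return 0;
}
```

The rename only changes which conversion helpers the CPU backend calls; the block layout itself (int8 weights plus an fp16 scale) is unchanged.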
@@ -3,6 +3,7 @@
 #include "ggml-quants.h"
 #include "ggml-impl.h"
 #include "ggml-cpu.h"
+#include "simd-mappings.h"

 #include "../../quants.h"
 #include "../../ggml-cpu-impl.h"
@@ -67,7 +68,7 @@ void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
 const float id = d ? 1.0f/d : 0.0f;
 const vector float vid = vec_splats(id);

-y[i].d = GGML_FP32_TO_FP16(d);
+y[i].d = GGML_CPU_FP32_TO_FP16(d);

 for (int j = 0; j < 8; j++) {
 const vector float v = vec_round(vec_mul(srcv[j], vid));
@@ -112,7 +113,7 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
 const float id = d ? 1.0f/d : 0.0f;
 const vector float vid = vec_splats(id);

-y[i].d = GGML_FP32_TO_FP16(d);
+y[i].d = GGML_CPU_FP32_TO_FP16(d);

 vector int accv = vec_splats(0);

@@ -127,7 +128,7 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i

 accv = vec_add(accv, vec_sld(accv, accv, 4));
 accv = vec_add(accv, vec_sld(accv, accv, 8));
-y[i].s = GGML_FP32_TO_FP16(d * vec_extract(accv, 0));
+y[i].s = GGML_CPU_FP32_TO_FP16(d * vec_extract(accv, 0));
 }

 #else
@@ -170,8 +171,8 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
 __builtin_prefetch(x[ib].qs, 0, 1);
 __builtin_prefetch(y[ib].qs, 0, 1);

-vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[ib].d));
-vector float vyd = vec_splats(GGML_FP16_TO_FP32(y[ib].d));
+vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[ib].d));
+vector float vyd = vec_splats(GGML_CPU_FP16_TO_FP32(y[ib].d));
 vector float vd = vec_mul(vxd, vyd);

 vector signed char qxs = (vector signed char)vec_xl( 0, x[ib].qs);
@@ -214,7 +215,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
 }

 int sumi = sumi0 + sumi1;
-sumf += sumi*GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d);
+sumf += sumi*GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d);
 }

 *s = sumf;
@@ -249,12 +250,12 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
 __builtin_prefetch(x[ib].qs, 0, 1);
 __builtin_prefetch(y[ib].qs, 0, 1);

-vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[ib].d));
-vector float vyd = vec_splats(GGML_FP16_TO_FP32(y[ib].d));
+vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[ib].d));
+vector float vyd = vec_splats(GGML_CPU_FP16_TO_FP32(y[ib].d));
 vector float vd = vec_mul(vxd, vyd);

-vector float vxmin = vec_splats(GGML_FP16_TO_FP32(x[ib].m));
-vector float vys = {GGML_FP16_TO_FP32(y[ib].s), 0.0f, 0.0f, 0.0f};
+vector float vxmin = vec_splats(GGML_CPU_FP16_TO_FP32(x[ib].m));
+vector float vys = {GGML_CPU_FP16_TO_FP32(y[ib].s), 0.0f, 0.0f, 0.0f};
 vsumf0 = vec_madd(vxmin, vys, vsumf0);

 vector signed char qxs = (vector signed char)vec_xl( 0, x[ib].qs);
@@ -291,7 +292,7 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
 }

 int sumi = sumi0 + sumi1;
-sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d))*sumi + GGML_FP16_TO_FP32(x[ib].m)*GGML_FP16_TO_FP32(y[ib].s);
+sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
 }

 *s = sumf;
@@ -326,8 +327,8 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
 __builtin_prefetch(x[ib].qs, 0, 1);
 __builtin_prefetch(y[ib].qs, 0, 1);

-vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[ib].d));
-vector float vyd = vec_splats(GGML_FP16_TO_FP32(y[ib].d));
+vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[ib].d));
+vector float vyd = vec_splats(GGML_CPU_FP16_TO_FP32(y[ib].d));
 vector float vd = vec_mul(vxd, vyd);

 vector signed long long aux64x2_0 = {(uint64_t)(table_b2b_1[x[ib].qh[0]]), (uint64_t)(table_b2b_1[x[ib].qh[1]])};
@@ -379,7 +380,7 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
 }

 int sumi = sumi0 + sumi1;
-sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d)) * sumi;
+sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d)) * sumi;
 }

 *s = sumf;
@@ -415,12 +416,12 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
 __builtin_prefetch(x[ib].qs, 0, 1);
 __builtin_prefetch(y[ib].qs, 0, 1);

-vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[ib].d));
-vector float vyd = vec_splats(GGML_FP16_TO_FP32(y[ib].d));
+vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[ib].d));
+vector float vyd = vec_splats(GGML_CPU_FP16_TO_FP32(y[ib].d));
 vector float vd = vec_mul(vxd, vyd);

-vector float vxmin = vec_splats(GGML_FP16_TO_FP32(x[ib].m));
-vector float vys = {GGML_FP16_TO_FP32(y[ib].s), 0.f, 0.f, 0.f};
+vector float vxmin = vec_splats(GGML_CPU_FP16_TO_FP32(x[ib].m));
+vector float vys = {GGML_CPU_FP16_TO_FP32(y[ib].s), 0.f, 0.f, 0.f};
 vsumf0 = vec_madd(vxmin, vys, vsumf0);

 vector unsigned long long aux64x2_0 = {(uint64_t)(table_b2b_0[x[ib].qh[0]]), (uint64_t)(table_b2b_0[x[ib].qh[1]])};
@@ -470,7 +471,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
 }

 int sumi = sumi0 + sumi1;
-sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d))*sumi + GGML_FP16_TO_FP32(x[ib].m)*GGML_FP16_TO_FP32(y[ib].s);
+sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
 }

 *s = sumf;
@@ -502,8 +503,8 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
 __builtin_prefetch(x[ib].qs, 0, 1);
 __builtin_prefetch(y[ib].qs, 0, 1);

-vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[ib].d));
-vector float vyd = vec_splats(GGML_FP16_TO_FP32(y[ib].d));
+vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[ib].d));
+vector float vyd = vec_splats(GGML_CPU_FP16_TO_FP32(y[ib].d));
 vector float vd = vec_mul(vxd, vyd);

 vector signed char q8x0 = vec_xl( 0, x[ib].qs);
@@ -542,7 +543,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
 sumi += x[ib].qs[j]*y[ib].qs[j];
 }

-sumf += sumi*(GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d));
+sumf += sumi*(GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d));
 }

 *s = sumf;
@@ -574,11 +575,11 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
 vector float vsumf3 = vec_splats(0.0f);

 for (int i = 0; i < nb; ++i) {
-vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
+vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[i].d));
 vector float vyd = vec_splats(y[i].d);
 vector float vd = vec_mul(vxd, vyd);

-vector float vxmin = vec_splats(GGML_FP16_TO_FP32(x[i].dmin));
+vector float vxmin = vec_splats(GGML_CPU_FP16_TO_FP32(x[i].dmin));
 vector float vdmin = vec_mul(vxmin, vyd);

 vector signed short q8ysums0 = vec_xl( 0, y[i].bsums);
@@ -708,8 +709,8 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
 summs += y[i].bsums[j] * (sc[j] >> 4);
 }

-const float dall = y[i].d * GGML_FP16_TO_FP32(x[i].d);
-const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
+const float dall = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
+const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);

 int isum = 0;
 int is = 0;
@@ -770,7 +771,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
 vector float vsumf3 = vec_splats(0.0f);

 for (int i = 0; i < nb; ++i) {
-vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
+vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[i].d));
 vector float vyd = vec_splats(y[i].d);
 vector float vd = vec_mul(vxd, vyd);

@@ -962,7 +963,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
 for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l];
 q8 += 8; a += 8;
 }
-const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
 for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
 }
 for (int l = 0; l < 8; ++l) sumf += sums[l];
@@ -1005,11 +1006,11 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
 vector float vsumf3 = vec_splats(0.0f);

 for (int i = 0; i < nb; ++i) {
-vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
+vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[i].d));
 vector float vyd = vec_splats(y[i].d);
 vector float vd = vec_mul(vxd, vyd);

-vector float vxmin = vec_splats(GGML_FP16_TO_FP32(x[i].dmin));
+vector float vxmin = vec_splats(GGML_CPU_FP16_TO_FP32(x[i].dmin));
 vector float vdmin = vec_mul(vxmin, vyd);

 vector signed short q8ysums0 = vec_xl( 0, y[i].bsums);
@@ -1177,9 +1178,9 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
 for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
 q8 += 8; a += 8;
 }
-const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
 for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
-const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d;
+const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
 sumf -= dmin * sumi;
 }
 for (int l = 0; l < 8; ++l) sumf += sums[l];
@@ -1222,11 +1223,11 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
 vector float vsumf3 = vec_splats(0.0f);

 for (int i = 0; i < nb; ++i) {
-vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
+vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[i].d));
 vector float vyd = vec_splats(y[i].d);
 vector float vd = vec_mul(vxd, vyd);

-vector float vxmin = vec_splats(GGML_FP16_TO_FP32(x[i].dmin));
+vector float vxmin = vec_splats(GGML_CPU_FP16_TO_FP32(x[i].dmin));
 vector float vdmin = vec_mul(vxmin, vyd);

 UNUSED(kmask1);
@@ -1394,9 +1395,9 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
 for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
 q8 += 8; a += 8;
 }
-const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
 for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
-const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d;
+const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
 sumf -= dmin * sumi;
 }
 for (int l = 0; l < 8; ++l) sumf += sums[l];
@@ -1432,7 +1433,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
 vector float vsumf3 = vec_splats(0.0f);

 for (int i = 0; i < nb; ++i) {
-vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
+vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[i].d));
 vector float vyd = vec_splats(y[i].d);
 vector float vd = vec_mul(vxd, vyd);

@@ -1591,7 +1592,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
 for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
 q8 += 8; a += 8;
 }
-const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
 for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
 }
 for (int l = 0; l < 8; ++l) sumf += sums[l];
@@ -1659,7 +1660,7 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const
 const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;

 for (int i = 0; i < nb; ++i) {
-vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
+vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[i].d));
 vector float vyd = vec_splats(y[i].d);
 vector float vd = vec_mul(vxd, vyd);

@@ -1742,7 +1743,7 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const

 float sumf = 0.f;
 for (int i = 0; i < nb; ++i) {
-const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
 const uint16_t * GGML_RESTRICT q2 = x[i].qs;
 const int8_t * GGML_RESTRICT q8 = y[i].qs;
 int32_t bsum = 0;
@@ -1790,7 +1791,7 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
 const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;

 for (int i = 0; i < nb; ++i) {
-vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
+vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[i].d));
 vector float vyd = vec_splats(y[i].d);
 vector float vd = vec_mul(vxd, vyd);

@@ -1871,7 +1872,7 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v

 float sumf = 0.f;
 for (int i = 0; i < nb; ++i) {
-const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
 const uint16_t * GGML_RESTRICT q2 = x[i].qs;
 const uint8_t * GGML_RESTRICT sc = x[i].scales;
 const int8_t * GGML_RESTRICT q8 = y[i].qs;
@@ -1939,7 +1940,7 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
 const vector signed char mask2 = (vector signed char)vec_xl( 0, k_mask2);

 for (int i = 0; i < nb; ++i) {
-vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
+vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[i].d));
 vector float vyd = vec_splats(y[i].d);
 vector float vd = vec_mul(vxd, vyd);

@@ -2033,7 +2034,7 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
 float sumf = 0;
 for (int i = 0; i < nb; i++) {

-const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
 const int8_t * q8 = y[i].qs;
 const uint8_t * qs = x[i].qs;
 const uint8_t * qh = x[i].qh;
@@ -2096,7 +2097,7 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const
 vector float vsumf3 = vec_splats(0.0f);

 for (int i = 0; i < nb; ++i) {
-vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
+vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[i].d));
 vector float vyd = vec_splats(y[i].d);
 vector float vd = vec_mul(vxd, vyd);

@@ -2176,7 +2177,7 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const

 float sumf = 0.f;
 for (int i = 0; i < nb; ++i) {
-const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
 const uint8_t * GGML_RESTRICT q3 = x[i].qs;
 const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4;
 const int8_t * GGML_RESTRICT q8 = y[i].qs;
@@ -2236,7 +2237,7 @@ void ggml_vec_dot_iq3_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
 const vector signed char mask2 = (vector signed char)vec_xl( 0, k_mask2);

 for (int i = 0; i < nb; ++i) {
-vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
+vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[i].d));
 vector float vyd = vec_splats(y[i].d);
 vector float vd = vec_mul(vxd, vyd);

@@ -2329,7 +2330,7 @@ void ggml_vec_dot_iq3_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo

 float sumf = 0.f;
 for (int i = 0; i < nb; ++i) {
-const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
 const uint8_t * GGML_RESTRICT qs = x[i].qs;
 const uint8_t * GGML_RESTRICT qh = x[i].qh;
 const uint8_t * GGML_RESTRICT signs = x[i].signs;
@@ -2394,7 +2395,7 @@ void ggml_vec_dot_iq1_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
 vector float vsumf3 = vec_splats(0.0f);

 for (int i = 0; i < nb; ++i) {
-vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
+vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[i].d));
 vector float vyd = vec_splats(y[i].d);
 vector float vd = vec_mul(vxd, vyd);

@@ -2505,7 +2506,7 @@ void ggml_vec_dot_iq1_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
 qs += 4;
 }

-sumf += GGML_FP16_TO_FP32(x[i].d) * y[i].d * (sumi + IQ1S_DELTA * sumi1);
+sumf += GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d * (sumi + IQ1S_DELTA * sumi1);
 }

 *s = sumf;
@@ -2546,8 +2547,8 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const v
 __builtin_prefetch(y[ib].qs, 0, 1);


-vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[ib].d));
-vector float vyd = vec_splats(GGML_FP16_TO_FP32(y[ib].d));
+vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[ib].d));
+vector float vyd = vec_splats(GGML_CPU_FP16_TO_FP32(y[ib].d));
 vector float vd = vec_mul(vxd, vyd);

 vector signed char qxs = (vector signed char)vec_xl( 0, x[ib].qs);
@@ -2582,7 +2583,7 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const v

 #endif
 for (; ib < nb; ++ib) {
-const float d = GGML_FP16_TO_FP32(y[ib].d)*GGML_FP16_TO_FP32(x[ib].d);
+const float d = GGML_CPU_FP16_TO_FP32(y[ib].d)*GGML_CPU_FP16_TO_FP32(x[ib].d);
 int sumi1 = 0, sumi2 = 0;
 for (int j = 0; j < QK4_NL/2; ++j) {
 sumi1 += y[ib].qs[j+ 0] * kvalues_iq4nl[x[ib].qs[j] & 0xf];
@@ -2620,7 +2621,7 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v

 for (int ibl = 0; ibl < nb; ++ibl) {

-vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[ibl].d));
+vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[ibl].d));
 vector float vyd = vec_splats(y[ibl].d);
 vector float vd = vec_mul(vxd, vyd);

@@ -2697,7 +2698,7 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
 #else
 float sumf = 0;
 for (int ibl = 0; ibl < nb; ++ibl) {
-const float d4d8 = GGML_FP16_TO_FP32(x[ibl].d) * y[ibl].d;
+const float d4d8 = GGML_CPU_FP16_TO_FP32(x[ibl].d) * y[ibl].d;
 uint16_t h = x[ibl].scales_h;
 const uint8_t * qs = x[ibl].qs;
 const int8_t * q8 = y[ibl].qs;