@fugood/llama.node 1.0.0-beta.7 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +2 -0
- package/lib/binding.ts +10 -0
- package/lib/index.js +8 -0
- package/lib/index.ts +14 -0
- package/package.json +14 -14
- package/src/LlamaContext.cpp +37 -0
- package/src/LlamaContext.h +1 -0
- package/src/RerankWorker.h +26 -0
- package/src/llama.cpp/CMakeLists.txt +1 -1
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +3 -46
- package/src/llama.cpp/ggml/CMakeLists.txt +1 -0
- package/src/llama.cpp/ggml/include/ggml-cpu.h +1 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +8 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp +10 -9
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +109 -108
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +13 -12
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +53 -52
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +56 -55
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +42 -41
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp +24 -23
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +29 -28
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/wasm/quants.c +30 -29
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +83 -82
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +20 -19
- package/src/llama.cpp/ggml/src/ggml-cpu/common.h +3 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +9 -3
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +59 -16
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +3 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +3 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +48 -48
- package/src/llama.cpp/ggml/src/ggml-cpu/quants.c +25 -24
- package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +15 -14
- package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +211 -33
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +2 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +45 -45
- package/src/llama.cpp/include/llama.h +6 -3
- package/src/llama.cpp/src/llama-arch.cpp +54 -0
- package/src/llama.cpp/src/llama-arch.h +17 -0
- package/src/llama.cpp/src/llama-batch.cpp +20 -7
- package/src/llama.cpp/src/llama-chat.cpp +11 -6
- package/src/llama.cpp/src/llama-context.cpp +0 -1
- package/src/llama.cpp/src/llama-graph.cpp +19 -4
- package/src/llama.cpp/src/llama-graph.h +14 -2
- package/src/llama.cpp/src/llama-hparams.h +6 -0
- package/src/llama.cpp/src/llama-kv-cache-unified.cpp +28 -2
- package/src/llama.cpp/src/llama-kv-cells.h +33 -9
- package/src/llama.cpp/src/llama-model.cpp +518 -1
- package/src/llama.cpp/src/llama-model.h +22 -0
- package/src/llama.cpp/src/llama-quant.cpp +87 -5
|
@@ -3,6 +3,7 @@
|
|
|
3
3
|
#include "ggml-quants.h"
|
|
4
4
|
#include "ggml-impl.h"
|
|
5
5
|
#include "ggml-cpu.h"
|
|
6
|
+
#include "simd-mappings.h"
|
|
6
7
|
|
|
7
8
|
#include "../../quants.h"
|
|
8
9
|
#include "../../ggml-cpu-impl.h"
|
|
@@ -45,7 +46,7 @@ void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
|
|
|
45
46
|
const float d = amax / ((1 << 7) - 1);
|
|
46
47
|
const float id = d ? 1.0f/d : 0.0f;
|
|
47
48
|
|
|
48
|
-
y[i].d =
|
|
49
|
+
y[i].d = GGML_CPU_FP32_TO_FP16(d);
|
|
49
50
|
|
|
50
51
|
vfloat32m8_t x0 = __riscv_vfmul_vf_f32m8(v_x, id, vl);
|
|
51
52
|
|
|
@@ -85,7 +86,7 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
|
|
|
85
86
|
const float d = amax / ((1 << 7) - 1);
|
|
86
87
|
const float id = d ? 1.0f/d : 0.0f;
|
|
87
88
|
|
|
88
|
-
y[i].d =
|
|
89
|
+
y[i].d = GGML_CPU_FP32_TO_FP16(d);
|
|
89
90
|
|
|
90
91
|
vfloat32m8_t x0 = __riscv_vfmul_vf_f32m8(v_x, id, vl);
|
|
91
92
|
|
|
@@ -102,7 +103,7 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
|
|
|
102
103
|
|
|
103
104
|
// set y[i].s
|
|
104
105
|
int sum = __riscv_vmv_x_s_i16m1_i16(vwrs);
|
|
105
|
-
y[i].s =
|
|
106
|
+
y[i].s = GGML_CPU_FP32_TO_FP16(sum*d);
|
|
106
107
|
}
|
|
107
108
|
|
|
108
109
|
#else
|
|
@@ -160,7 +161,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
160
161
|
|
|
161
162
|
int sumi = __riscv_vmv_x_s_i32m1_i32(vs2);
|
|
162
163
|
|
|
163
|
-
sumf += sumi*
|
|
164
|
+
sumf += sumi*GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d);
|
|
164
165
|
}
|
|
165
166
|
|
|
166
167
|
#endif
|
|
@@ -177,7 +178,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
177
178
|
}
|
|
178
179
|
|
|
179
180
|
int sumi = sumi0 + sumi1;
|
|
180
|
-
sumf += sumi*
|
|
181
|
+
sumf += sumi*GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d);
|
|
181
182
|
}
|
|
182
183
|
|
|
183
184
|
*s = sumf;
|
|
@@ -225,7 +226,7 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
225
226
|
|
|
226
227
|
int sumi = __riscv_vmv_x_s_i32m1_i32(vs2);
|
|
227
228
|
|
|
228
|
-
sumf += (
|
|
229
|
+
sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
|
|
229
230
|
}
|
|
230
231
|
|
|
231
232
|
#endif
|
|
@@ -242,7 +243,7 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
242
243
|
}
|
|
243
244
|
|
|
244
245
|
int sumi = sumi0 + sumi1;
|
|
245
|
-
sumf += (
|
|
246
|
+
sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
|
|
246
247
|
}
|
|
247
248
|
|
|
248
249
|
*s = sumf;
|
|
@@ -293,7 +294,7 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
293
294
|
vint32m1_t sum = __riscv_vwredsum_vs_i16m4_i32m1(mul, zero, vl);
|
|
294
295
|
int32_t sumi = __riscv_vmv_x_s_i32m1_i32(sum);
|
|
295
296
|
|
|
296
|
-
sumf += (
|
|
297
|
+
sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d)) * sumi;
|
|
297
298
|
}
|
|
298
299
|
|
|
299
300
|
#endif
|
|
@@ -316,7 +317,7 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
316
317
|
}
|
|
317
318
|
|
|
318
319
|
int sumi = sumi0 + sumi1;
|
|
319
|
-
sumf += (
|
|
320
|
+
sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d)) * sumi;
|
|
320
321
|
}
|
|
321
322
|
|
|
322
323
|
*s = sumf;
|
|
@@ -366,7 +367,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
366
367
|
vint32m1_t sum = __riscv_vwredsum_vs_i16m4_i32m1(mul, zero, vl);
|
|
367
368
|
int32_t sumi = __riscv_vmv_x_s_i32m1_i32(sum);
|
|
368
369
|
|
|
369
|
-
sumf += (
|
|
370
|
+
sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
|
|
370
371
|
}
|
|
371
372
|
|
|
372
373
|
#endif
|
|
@@ -389,7 +390,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
389
390
|
}
|
|
390
391
|
|
|
391
392
|
int sumi = sumi0 + sumi1;
|
|
392
|
-
sumf += (
|
|
393
|
+
sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
|
|
393
394
|
}
|
|
394
395
|
|
|
395
396
|
*s = sumf;
|
|
@@ -427,7 +428,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
427
428
|
|
|
428
429
|
int sumi = __riscv_vmv_x_s_i32m1_i32(v_sum);
|
|
429
430
|
|
|
430
|
-
sumf += sumi*(
|
|
431
|
+
sumf += sumi*(GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d));
|
|
431
432
|
}
|
|
432
433
|
|
|
433
434
|
#endif
|
|
@@ -438,7 +439,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
438
439
|
sumi += x[ib].qs[j]*y[ib].qs[j];
|
|
439
440
|
}
|
|
440
441
|
|
|
441
|
-
sumf += sumi*(
|
|
442
|
+
sumf += sumi*(GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d));
|
|
442
443
|
}
|
|
443
444
|
|
|
444
445
|
*s = sumf;
|
|
@@ -465,8 +466,8 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
465
466
|
const uint8_t * q2 = x[i].qs;
|
|
466
467
|
const int8_t * q8 = y[i].qs;
|
|
467
468
|
const uint8_t * sc = x[i].scales;
|
|
468
|
-
const float dall = y[i].d *
|
|
469
|
-
const float dmin = -y[i].d *
|
|
469
|
+
const float dall = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
|
|
470
|
+
const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
|
|
470
471
|
uint8_t *patmp = atmp;
|
|
471
472
|
int vsums;
|
|
472
473
|
int tmp;
|
|
@@ -569,8 +570,8 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
569
570
|
const int8_t * q8 = y[i].qs;
|
|
570
571
|
const uint8_t * sc = x[i].scales;
|
|
571
572
|
|
|
572
|
-
const float dall = y[i].d *
|
|
573
|
-
const float dmin = -y[i].d *
|
|
573
|
+
const float dall = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
|
|
574
|
+
const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
|
|
574
575
|
|
|
575
576
|
size_t vl = 16;
|
|
576
577
|
|
|
@@ -644,8 +645,8 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
644
645
|
const uint8_t * q2 = x[i].qs;
|
|
645
646
|
const int8_t * q8 = y[i].qs;
|
|
646
647
|
const uint8_t * sc = x[i].scales;
|
|
647
|
-
const float dall = y[i].d *
|
|
648
|
-
const float dmin = -y[i].d *
|
|
648
|
+
const float dall = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
|
|
649
|
+
const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
|
|
649
650
|
uint8_t *patmp = atmp;
|
|
650
651
|
int vsums;
|
|
651
652
|
int tmp;
|
|
@@ -750,8 +751,8 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
750
751
|
summs += y[i].bsums[j] * (sc[j] >> 4);
|
|
751
752
|
}
|
|
752
753
|
|
|
753
|
-
const float dall = y[i].d *
|
|
754
|
-
const float dmin = y[i].d *
|
|
754
|
+
const float dall = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
|
|
755
|
+
const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
|
|
755
756
|
|
|
756
757
|
int isum = 0;
|
|
757
758
|
int is = 0;
|
|
@@ -916,7 +917,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
916
917
|
q3 += 32; q8 += 128; scale += 8;
|
|
917
918
|
}
|
|
918
919
|
|
|
919
|
-
const float d =
|
|
920
|
+
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
|
920
921
|
sumf += d * isum;
|
|
921
922
|
}
|
|
922
923
|
|
|
@@ -1017,7 +1018,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
1017
1018
|
|
|
1018
1019
|
}
|
|
1019
1020
|
|
|
1020
|
-
const float d =
|
|
1021
|
+
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
|
1021
1022
|
|
|
1022
1023
|
sumf += d*sum_t;
|
|
1023
1024
|
|
|
@@ -1134,7 +1135,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
1134
1135
|
q3 += 32; q8 += 128; scale += 8;
|
|
1135
1136
|
}
|
|
1136
1137
|
|
|
1137
|
-
const float d =
|
|
1138
|
+
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
|
1138
1139
|
sumf += d * isum;
|
|
1139
1140
|
}
|
|
1140
1141
|
break;
|
|
@@ -1202,7 +1203,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
1202
1203
|
for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l];
|
|
1203
1204
|
q8 += 8; a += 8;
|
|
1204
1205
|
}
|
|
1205
|
-
const float d =
|
|
1206
|
+
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
|
1206
1207
|
for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
|
|
1207
1208
|
}
|
|
1208
1209
|
for (int l = 0; l < 8; ++l) sumf += sums[l];
|
|
@@ -1239,8 +1240,8 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
1239
1240
|
float sumf = 0;
|
|
1240
1241
|
|
|
1241
1242
|
for (int i = 0; i < nb; ++i) {
|
|
1242
|
-
const float d = y[i].d *
|
|
1243
|
-
const float dmin = y[i].d *
|
|
1243
|
+
const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
|
|
1244
|
+
const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
|
|
1244
1245
|
|
|
1245
1246
|
int tmp, tmp2, sumi;
|
|
1246
1247
|
__asm__ __volatile__(
|
|
@@ -1361,8 +1362,8 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
1361
1362
|
|
|
1362
1363
|
size_t vl = 8;
|
|
1363
1364
|
|
|
1364
|
-
const float d = y[i].d *
|
|
1365
|
-
const float dmin = y[i].d *
|
|
1365
|
+
const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
|
|
1366
|
+
const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
|
|
1366
1367
|
|
|
1367
1368
|
vint16mf2_t q8sums_0 = __riscv_vlse16_v_i16mf2(y[i].bsums, 4, vl);
|
|
1368
1369
|
vint16mf2_t q8sums_1 = __riscv_vlse16_v_i16mf2(y[i].bsums+1, 4, vl);
|
|
@@ -1422,8 +1423,8 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
1422
1423
|
break;
|
|
1423
1424
|
case 128:
|
|
1424
1425
|
for (int i = 0; i < nb; ++i) {
|
|
1425
|
-
const float d = y[i].d *
|
|
1426
|
-
const float dmin = y[i].d *
|
|
1426
|
+
const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
|
|
1427
|
+
const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
|
|
1427
1428
|
|
|
1428
1429
|
int tmp, tmp2, sumi;
|
|
1429
1430
|
__asm__ __volatile__(
|
|
@@ -1580,9 +1581,9 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
1580
1581
|
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
|
1581
1582
|
q8 += 8; a += 8;
|
|
1582
1583
|
}
|
|
1583
|
-
const float d =
|
|
1584
|
+
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
|
1584
1585
|
for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
|
|
1585
|
-
const float dmin =
|
|
1586
|
+
const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
|
|
1586
1587
|
sumf -= dmin * sumi;
|
|
1587
1588
|
}
|
|
1588
1589
|
for (int l = 0; l < 8; ++l) sumf += sums[l];
|
|
@@ -1627,8 +1628,8 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
1627
1628
|
const uint8_t * GGML_RESTRICT hm = x[i].qh;
|
|
1628
1629
|
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
|
1629
1630
|
|
|
1630
|
-
const float d =
|
|
1631
|
-
const float dmin =
|
|
1631
|
+
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
|
1632
|
+
const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
|
|
1632
1633
|
|
|
1633
1634
|
vint16m1_t q8sums_0 = __riscv_vlse16_v_i16m1(y[i].bsums, 4, vl);
|
|
1634
1635
|
vint16m1_t q8sums_1 = __riscv_vlse16_v_i16m1(y[i].bsums+1, 4, vl);
|
|
@@ -1749,9 +1750,9 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
1749
1750
|
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
|
1750
1751
|
q8 += 8; a += 8;
|
|
1751
1752
|
}
|
|
1752
|
-
const float d =
|
|
1753
|
+
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
|
1753
1754
|
for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
|
|
1754
|
-
const float dmin =
|
|
1755
|
+
const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
|
|
1755
1756
|
sumf -= dmin * sumi;
|
|
1756
1757
|
}
|
|
1757
1758
|
for (int l = 0; l < 8; ++l) sumf += sums[l];
|
|
@@ -1778,7 +1779,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
1778
1779
|
|
|
1779
1780
|
for (int i = 0; i < nb; ++i) {
|
|
1780
1781
|
|
|
1781
|
-
const float d =
|
|
1782
|
+
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
|
1782
1783
|
|
|
1783
1784
|
const uint8_t * restrict q6 = x[i].ql;
|
|
1784
1785
|
const uint8_t * restrict qh = x[i].qh;
|
|
@@ -1862,7 +1863,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
1862
1863
|
case 256:
|
|
1863
1864
|
for (int i = 0; i < nb; ++i) {
|
|
1864
1865
|
|
|
1865
|
-
const float d =
|
|
1866
|
+
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
|
1866
1867
|
|
|
1867
1868
|
const uint8_t * GGML_RESTRICT q6 = x[i].ql;
|
|
1868
1869
|
const uint8_t * GGML_RESTRICT qh = x[i].qh;
|
|
@@ -1943,7 +1944,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
1943
1944
|
case 128:
|
|
1944
1945
|
for (int i = 0; i < nb; ++i) {
|
|
1945
1946
|
|
|
1946
|
-
const float d =
|
|
1947
|
+
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
|
1947
1948
|
|
|
1948
1949
|
const uint8_t * restrict q6 = x[i].ql;
|
|
1949
1950
|
const uint8_t * restrict qh = x[i].qh;
|
|
@@ -2058,7 +2059,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
2058
2059
|
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
|
2059
2060
|
q8 += 8; a += 8;
|
|
2060
2061
|
}
|
|
2061
|
-
const float d =
|
|
2062
|
+
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
|
2062
2063
|
for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
|
|
2063
2064
|
}
|
|
2064
2065
|
for (int l = 0; l < 8; ++l) sumf += sums[l];
|
|
@@ -6,6 +6,7 @@
|
|
|
6
6
|
#include "ggml-impl.h"
|
|
7
7
|
#include "ggml-cpu.h"
|
|
8
8
|
#include "ggml-cpu-impl.h"
|
|
9
|
+
#include "simd-mappings.h"
|
|
9
10
|
#include "traits.h"
|
|
10
11
|
|
|
11
12
|
#include <cmath>
|
|
@@ -90,16 +91,16 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
|
|
90
91
|
const vfloat32m1_t facc = __riscv_vfcvt_f_x_v_f32m1(sumi_h8, vl / 4);
|
|
91
92
|
|
|
92
93
|
// vector version needs Zvfhmin extension
|
|
93
|
-
const float a_scale =
|
|
94
|
+
const float a_scale = GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
|
|
94
95
|
const float b_scales[8] = {
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
96
|
+
GGML_CPU_FP16_TO_FP32(b_ptr[l].d[0]),
|
|
97
|
+
GGML_CPU_FP16_TO_FP32(b_ptr[l].d[1]),
|
|
98
|
+
GGML_CPU_FP16_TO_FP32(b_ptr[l].d[2]),
|
|
99
|
+
GGML_CPU_FP16_TO_FP32(b_ptr[l].d[3]),
|
|
100
|
+
GGML_CPU_FP16_TO_FP32(b_ptr[l].d[4]),
|
|
101
|
+
GGML_CPU_FP16_TO_FP32(b_ptr[l].d[5]),
|
|
102
|
+
GGML_CPU_FP16_TO_FP32(b_ptr[l].d[6]),
|
|
103
|
+
GGML_CPU_FP16_TO_FP32(b_ptr[l].d[7])
|
|
103
104
|
};
|
|
104
105
|
const vfloat32m1_t b_scales_vec = __riscv_vle32_v_f32m1(b_scales, vl / 4);
|
|
105
106
|
const vfloat32m1_t tmp1 = __riscv_vfmul_vf_f32m1(facc, a_scale, vl / 4);
|
|
@@ -129,7 +130,7 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
|
|
129
130
|
const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
|
|
130
131
|
sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
|
|
131
132
|
}
|
|
132
|
-
sumf[j] += sumi *
|
|
133
|
+
sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
|
|
133
134
|
}
|
|
134
135
|
}
|
|
135
136
|
}
|
|
@@ -181,20 +182,20 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
|
|
181
182
|
|
|
182
183
|
// vector version needs Zvfhmin extension
|
|
183
184
|
const float a_scales[4] = {
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
185
|
+
GGML_CPU_FP16_TO_FP32(a_ptr[l].d[0]),
|
|
186
|
+
GGML_CPU_FP16_TO_FP32(a_ptr[l].d[1]),
|
|
187
|
+
GGML_CPU_FP16_TO_FP32(a_ptr[l].d[2]),
|
|
188
|
+
GGML_CPU_FP16_TO_FP32(a_ptr[l].d[3])
|
|
188
189
|
};
|
|
189
190
|
const float b_scales[8] = {
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
191
|
+
GGML_CPU_FP16_TO_FP32(b_ptr[l].d[0]),
|
|
192
|
+
GGML_CPU_FP16_TO_FP32(b_ptr[l].d[1]),
|
|
193
|
+
GGML_CPU_FP16_TO_FP32(b_ptr[l].d[2]),
|
|
194
|
+
GGML_CPU_FP16_TO_FP32(b_ptr[l].d[3]),
|
|
195
|
+
GGML_CPU_FP16_TO_FP32(b_ptr[l].d[4]),
|
|
196
|
+
GGML_CPU_FP16_TO_FP32(b_ptr[l].d[5]),
|
|
197
|
+
GGML_CPU_FP16_TO_FP32(b_ptr[l].d[6]),
|
|
198
|
+
GGML_CPU_FP16_TO_FP32(b_ptr[l].d[7])
|
|
198
199
|
};
|
|
199
200
|
const vfloat32m1_t b_scales_vec = __riscv_vle32_v_f32m1(b_scales, vl / 4);
|
|
200
201
|
|
|
@@ -382,7 +383,7 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
|
|
382
383
|
sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
|
|
383
384
|
(v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4;
|
|
384
385
|
}
|
|
385
|
-
sumf[m][j] += sumi *
|
|
386
|
+
sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
|
|
386
387
|
}
|
|
387
388
|
}
|
|
388
389
|
}
|
|
@@ -3,6 +3,7 @@
|
|
|
3
3
|
#include "ggml-quants.h"
|
|
4
4
|
#include "ggml-impl.h"
|
|
5
5
|
#include "ggml-cpu.h"
|
|
6
|
+
#include "simd-mappings.h"
|
|
6
7
|
|
|
7
8
|
#include "../../quants.h"
|
|
8
9
|
#include "../../ggml-cpu-impl.h"
|
|
@@ -49,7 +50,7 @@ void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
|
|
|
49
50
|
const float d = amax / ((1 << 7) - 1);
|
|
50
51
|
const float id = d ? 1.0f / d : 0.0f;
|
|
51
52
|
|
|
52
|
-
y[i].d =
|
|
53
|
+
y[i].d = GGML_CPU_FP32_TO_FP16(d);
|
|
53
54
|
|
|
54
55
|
for (int j = 0; j < 8; j++) {
|
|
55
56
|
const __vector float v = vec_mul(srcv[j], vec_splats(id));
|
|
@@ -94,7 +95,7 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
|
|
|
94
95
|
const float d = amax / ((1 << 7) - 1);
|
|
95
96
|
const float id = d ? 1.0f / d : 0.0f;
|
|
96
97
|
|
|
97
|
-
y[i].d =
|
|
98
|
+
y[i].d = GGML_CPU_FP32_TO_FP16(d);
|
|
98
99
|
|
|
99
100
|
__vector int32_t acc = vec_splats(0);
|
|
100
101
|
|
|
@@ -110,7 +111,7 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
|
|
|
110
111
|
acc = vec_add(acc, vi);
|
|
111
112
|
}
|
|
112
113
|
|
|
113
|
-
y[i].s =
|
|
114
|
+
y[i].s = GGML_CPU_FP32_TO_FP16(d * (acc[0] + acc[1] + acc[2] + acc[3]));
|
|
114
115
|
}
|
|
115
116
|
#else
|
|
116
117
|
GGML_UNUSED(nb);
|
|
@@ -164,7 +165,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
164
165
|
__vector int16_t v_xy_ = v_xylso + v_xylse + v_xyhso + v_xyhse; v_xy_ += vec_reve(v_xy_);
|
|
165
166
|
|
|
166
167
|
const __vector float v_xy = vec_float(vec_unpackh(v_xy_));
|
|
167
|
-
const __vector float v_d = vec_splats(
|
|
168
|
+
const __vector float v_d = vec_splats(GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d));
|
|
168
169
|
|
|
169
170
|
acc = vec_madd(v_xy, v_d, acc);
|
|
170
171
|
}
|
|
@@ -185,7 +186,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
185
186
|
}
|
|
186
187
|
|
|
187
188
|
int sumi = sumi0 + sumi1;
|
|
188
|
-
sumf += sumi*
|
|
189
|
+
sumf += sumi*GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d);
|
|
189
190
|
}
|
|
190
191
|
|
|
191
192
|
*s = sumf;
|
|
@@ -219,7 +220,7 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
219
220
|
__builtin_prefetch(x[ib].qs, 0, 1);
|
|
220
221
|
__builtin_prefetch(y[ib].qs, 0, 1);
|
|
221
222
|
|
|
222
|
-
summs +=
|
|
223
|
+
summs += GGML_CPU_FP16_TO_FP32(x[ib].m) * GGML_CPU_FP16_TO_FP32(y[ib].s);
|
|
223
224
|
|
|
224
225
|
const uint8x16_t v_x = vec_xl(0, x[ib].qs);
|
|
225
226
|
const int8x16_t v_xl = (const int8x16_t)(v_x & v_m);
|
|
@@ -231,7 +232,7 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
231
232
|
const int32x4_t v_xy_ = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_xl, v_yl), v_xh, v_yh);
|
|
232
233
|
const float32x4_t v_xy = vec_float(v_xy_);
|
|
233
234
|
|
|
234
|
-
const float32x4_t v_d = vec_splats(
|
|
235
|
+
const float32x4_t v_d = vec_splats(GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d));
|
|
235
236
|
|
|
236
237
|
acc = vec_madd(v_xy, v_d, acc);
|
|
237
238
|
}
|
|
@@ -252,7 +253,7 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
252
253
|
}
|
|
253
254
|
|
|
254
255
|
int sumi = sumi0 + sumi1;
|
|
255
|
-
sumf += (
|
|
256
|
+
sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
|
|
256
257
|
}
|
|
257
258
|
|
|
258
259
|
*s = sumf;
|
|
@@ -290,7 +291,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
290
291
|
|
|
291
292
|
const int32x4_t v_xy_ = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_xl, v_yl), v_xh, v_yh);
|
|
292
293
|
const float32x4_t v_xy = vec_float(v_xy_);
|
|
293
|
-
const float32x4_t v_d = vec_splats(
|
|
294
|
+
const float32x4_t v_d = vec_splats(GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d));
|
|
294
295
|
|
|
295
296
|
acc = vec_madd(v_xy, v_d, acc);
|
|
296
297
|
}
|
|
@@ -305,7 +306,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
305
306
|
sumi += x[ib].qs[j]*y[ib].qs[j];
|
|
306
307
|
}
|
|
307
308
|
|
|
308
|
-
sumf += sumi*(
|
|
309
|
+
sumf += sumi*(GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d));
|
|
309
310
|
}
|
|
310
311
|
|
|
311
312
|
*s = sumf;
|
|
@@ -348,7 +349,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
348
349
|
float sum = 0;
|
|
349
350
|
|
|
350
351
|
for (int i = 0; i < nb; ++i) {
|
|
351
|
-
const float d = y[i].d *
|
|
352
|
+
const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
|
|
352
353
|
|
|
353
354
|
const uint8_t * restrict x0l = x[i].qs;
|
|
354
355
|
const uint8_t * restrict x0h = x[i].hmask;
|
|
@@ -497,7 +498,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
497
498
|
for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l];
|
|
498
499
|
q8 += 8; a += 8;
|
|
499
500
|
}
|
|
500
|
-
const float d =
|
|
501
|
+
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
|
501
502
|
for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
|
|
502
503
|
}
|
|
503
504
|
for (int l = 0; l < 8; ++l) sumf += sums[l];
|
|
@@ -537,8 +538,8 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
537
538
|
float sumf = 0;
|
|
538
539
|
|
|
539
540
|
for (int i = 0; i < nb; ++i) {
|
|
540
|
-
const float d = y[i].d *
|
|
541
|
-
const float dmin = y[i].d *
|
|
541
|
+
const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
|
|
542
|
+
const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
|
|
542
543
|
|
|
543
544
|
const int16x8_t v_ysumsl = vec_xl(0 , y[i].bsums);
|
|
544
545
|
const int16x8_t v_ysumsh = vec_xl(16, y[i].bsums);
|
|
@@ -647,9 +648,9 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
647
648
|
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
|
648
649
|
q8 += 8; a += 8;
|
|
649
650
|
}
|
|
650
|
-
const float d =
|
|
651
|
+
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
|
651
652
|
for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
|
|
652
|
-
const float dmin =
|
|
653
|
+
const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
|
|
653
654
|
sumf -= dmin * sumi;
|
|
654
655
|
}
|
|
655
656
|
for (int l = 0; l < 8; ++l) sumf += sums[l];
|
|
@@ -698,8 +699,8 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
698
699
|
float sumf = 0;
|
|
699
700
|
|
|
700
701
|
for (int i = 0; i < nb; ++i) {
|
|
701
|
-
const float d = y[i].d *
|
|
702
|
-
const float dmin = y[i].d *
|
|
702
|
+
const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
|
|
703
|
+
const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
|
|
703
704
|
|
|
704
705
|
const int16x8_t v_ysumsl = vec_xl(0 , y[i].bsums);
|
|
705
706
|
const int16x8_t v_ysumsh = vec_xl(16, y[i].bsums);
|
|
@@ -819,9 +820,9 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
819
820
|
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
|
820
821
|
q8 += 8; a += 8;
|
|
821
822
|
}
|
|
822
|
-
const float d =
|
|
823
|
+
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
|
823
824
|
for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
|
|
824
|
-
const float dmin =
|
|
825
|
+
const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
|
|
825
826
|
sumf -= dmin * sumi;
|
|
826
827
|
}
|
|
827
828
|
for (int l = 0; l < 8; ++l) sumf += sums[l];
|
|
@@ -859,7 +860,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
859
860
|
int8x16_t v_y[4];
|
|
860
861
|
|
|
861
862
|
for (int i = 0; i < nb; ++i) {
|
|
862
|
-
const float d_all =
|
|
863
|
+
const float d_all = GGML_CPU_FP16_TO_FP32(x[i].d);
|
|
863
864
|
|
|
864
865
|
const uint8_t * GGML_RESTRICT x0l = x[i].ql;
|
|
865
866
|
const uint8_t * GGML_RESTRICT x0h = x[i].qh;
|
|
@@ -1004,7 +1005,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
1004
1005
|
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
|
1005
1006
|
q8 += 8; a += 8;
|
|
1006
1007
|
}
|
|
1007
|
-
const float d =
|
|
1008
|
+
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
|
1008
1009
|
for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
|
|
1009
1010
|
}
|
|
1010
1011
|
for (int l = 0; l < 8; ++l) sumf += sums[l];
|
|
@@ -1071,7 +1072,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
1071
1072
|
// float sumf = 0;
|
|
1072
1073
|
|
|
1073
1074
|
// for (int i = 0; i < nb; ++i) {
|
|
1074
|
-
// const float d =
|
|
1075
|
+
// const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
|
1075
1076
|
// const uint16_t * GGML_RESTRICT q2 = x[i].qs;
|
|
1076
1077
|
// const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
|
1077
1078
|
|
|
@@ -1121,7 +1122,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
1121
1122
|
|
|
1122
1123
|
// float sumf = 0.f;
|
|
1123
1124
|
// for (int i = 0; i < nb; ++i) {
|
|
1124
|
-
// const float d =
|
|
1125
|
+
// const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
|
1125
1126
|
// const uint16_t * GGML_RESTRICT q2 = x[i].qs;
|
|
1126
1127
|
// const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
|
1127
1128
|
// int32_t bsum = 0;
|
|
@@ -1182,12 +1183,12 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const v
|
|
|
1182
1183
|
const int8x16_t v_yh = vec_xl(QK8_0/2, y0->qs);
|
|
1183
1184
|
const int32x4_t v_xy = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_xl, v_yl), v_xh, v_yh);
|
|
1184
1185
|
|
|
1185
|
-
sumf +=
|
|
1186
|
+
sumf += GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d) * (v_xy[0] + v_xy[1] + v_xy[2] + v_xy[3]);
|
|
1186
1187
|
}
|
|
1187
1188
|
|
|
1188
1189
|
#endif
|
|
1189
1190
|
for (; ib < nb; ++ib) {
|
|
1190
|
-
const float d =
|
|
1191
|
+
const float d = GGML_CPU_FP16_TO_FP32(y[ib].d)*GGML_CPU_FP16_TO_FP32(x[ib].d);
|
|
1191
1192
|
int sumi1 = 0, sumi2 = 0;
|
|
1192
1193
|
for (int j = 0; j < QK4_NL/2; ++j) {
|
|
1193
1194
|
sumi1 += y[ib].qs[j+ 0] * kvalues_iq4nl[x[ib].qs[j] & 0xf];
|
|
@@ -1257,7 +1258,7 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
|
|
|
1257
1258
|
sumi2 += (vsumi1[0] + vsumi1[1] + vsumi1[2] + vsumi1[3]) * ls2;
|
|
1258
1259
|
}
|
|
1259
1260
|
|
|
1260
|
-
sumf +=
|
|
1261
|
+
sumf += GGML_CPU_FP16_TO_FP32(x[ibl].d) * y[ibl].d * (sumi1 + sumi2);
|
|
1261
1262
|
}
|
|
1262
1263
|
|
|
1263
1264
|
*s = sumf;
|
|
@@ -1265,7 +1266,7 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
|
|
|
1265
1266
|
#else
|
|
1266
1267
|
float sumf = 0;
|
|
1267
1268
|
for (int ibl = 0; ibl < nb; ++ibl) {
|
|
1268
|
-
const float d4d8 =
|
|
1269
|
+
const float d4d8 = GGML_CPU_FP16_TO_FP32(x[ibl].d) * y[ibl].d;
|
|
1269
1270
|
uint16_t h = x[ibl].scales_h;
|
|
1270
1271
|
const uint8_t * qs = x[ibl].qs;
|
|
1271
1272
|
const int8_t * q8 = y[ibl].qs;
|