@fugood/llama.node 1.0.0-beta.6 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +2 -0
- package/lib/binding.ts +12 -0
- package/lib/index.js +10 -0
- package/lib/index.ts +17 -1
- package/package.json +14 -14
- package/src/EmbeddingWorker.cpp +1 -1
- package/src/LlamaCompletionWorker.cpp +7 -3
- package/src/LlamaCompletionWorker.h +2 -0
- package/src/LlamaContext.cpp +49 -6
- package/src/LlamaContext.h +1 -0
- package/src/RerankWorker.h +26 -0
- package/src/common.hpp +1 -1
- package/src/llama.cpp/CMakeLists.txt +1 -1
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +3 -46
- package/src/llama.cpp/ggml/CMakeLists.txt +1 -0
- package/src/llama.cpp/ggml/include/ggml-cpu.h +1 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +8 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp +10 -9
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +109 -108
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +13 -12
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +53 -52
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +56 -55
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +42 -41
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp +24 -23
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +29 -28
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/wasm/quants.c +30 -29
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +83 -82
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +20 -19
- package/src/llama.cpp/ggml/src/ggml-cpu/common.h +3 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +9 -3
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +59 -16
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +3 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +3 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +48 -48
- package/src/llama.cpp/ggml/src/ggml-cpu/quants.c +25 -24
- package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +15 -14
- package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +211 -33
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +2 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +45 -45
- package/src/llama.cpp/include/llama.h +6 -3
- package/src/llama.cpp/src/llama-arch.cpp +54 -0
- package/src/llama.cpp/src/llama-arch.h +17 -0
- package/src/llama.cpp/src/llama-batch.cpp +20 -7
- package/src/llama.cpp/src/llama-chat.cpp +11 -6
- package/src/llama.cpp/src/llama-context.cpp +0 -1
- package/src/llama.cpp/src/llama-graph.cpp +19 -4
- package/src/llama.cpp/src/llama-graph.h +14 -2
- package/src/llama.cpp/src/llama-hparams.h +6 -0
- package/src/llama.cpp/src/llama-kv-cache-unified.cpp +28 -2
- package/src/llama.cpp/src/llama-kv-cells.h +33 -9
- package/src/llama.cpp/src/llama-model.cpp +518 -1
- package/src/llama.cpp/src/llama-model.h +22 -0
- package/src/llama.cpp/src/llama-quant.cpp +87 -5
package/src/llama.cpp/ggml/src/ggml-cpu/arch/wasm/quants.c

```diff
@@ -3,6 +3,7 @@
 #include "ggml-quants.h"
 #include "ggml-impl.h"
 #include "ggml-cpu.h"
+#include "simd-mappings.h"
 
 #include "../../quants.h"
 #include "../../ggml-cpu-impl.h"
@@ -65,7 +66,7 @@ void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
         const float d = amax / ((1 << 7) - 1);
         const float id = d ? 1.0f/d : 0.0f;
 
-        y[i].d =
+        y[i].d = GGML_CPU_FP32_TO_FP16(d);
 
         for (int j = 0; j < 8; j++) {
             const v128_t v = wasm_f32x4_mul(srcv[j], wasm_f32x4_splat(id));
@@ -110,7 +111,7 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
         const float d = amax / ((1 << 7) - 1);
         const float id = d ? 1.0f/d : 0.0f;
 
-        y[i].d =
+        y[i].d = GGML_CPU_FP32_TO_FP16(d);
 
         v128_t accv = wasm_i32x4_splat(0);
 
@@ -126,7 +127,7 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
             accv = wasm_i32x4_add(accv, vi);
         }
 
-        y[i].s =
+        y[i].s = GGML_CPU_FP32_TO_FP16(
             d * (wasm_i32x4_extract_lane(accv, 0) +
                  wasm_i32x4_extract_lane(accv, 1) +
                  wasm_i32x4_extract_lane(accv, 2) +
@@ -324,8 +325,8 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
         );
 
         // Accumulate results with scaling
-        float scale0 =
-        float scale1 =
+        float scale0 = GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d);
+        float scale1 = GGML_CPU_FP16_TO_FP32(x1->d) * GGML_CPU_FP16_TO_FP32(y1->d);
 
         sumv = wasm_f32x4_add(sumv, wasm_f32x4_mul(wasm_f32x4_convert_i32x4(dp0), wasm_f32x4_splat(scale0)));
         sumv = wasm_f32x4_add(sumv, wasm_f32x4_mul(wasm_f32x4_convert_i32x4(dp1), wasm_f32x4_splat(scale1)));
@@ -348,7 +349,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
         }
 
         int sumi = sumi0 + sumi1;
-        sumf += sumi*
+        sumf += sumi*GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d);
     }
 
     *s = sumf;
@@ -428,7 +429,7 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
                                             wasm_i32x4_dot_i16x8(v0lfh, v1lh)),
                              wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0hfl, v1hl),
                                             wasm_i32x4_dot_i16x8(v0hfh, v1hh)))),
-                    wasm_f32x4_splat(
+                    wasm_f32x4_splat(GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d))));
     }
 
     sumf = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) +
@@ -454,7 +455,7 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
         }
 
         int sumi = sumi0 + sumi1;
-        sumf += (
+        sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d)) * sumi;
     }
 
     *s = sumf;
@@ -491,7 +492,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
         const block_q5_1 * GGML_RESTRICT x0 = &x[ib];
         const block_q8_1 * GGML_RESTRICT y0 = &y[ib];
 
-        summs +=
+        summs += GGML_CPU_FP16_TO_FP32(x0->m) * GGML_CPU_FP16_TO_FP32(y0->s);
 
         const v128_t m4b = wasm_i8x16_splat(0x0F);
 
@@ -538,7 +539,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
                                             wasm_i32x4_dot_i16x8(v0lfh, v1lh)),
                              wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0hfl, v1hl),
                                             wasm_i32x4_dot_i16x8(v0hfh, v1hh)))),
-                    wasm_f32x4_splat(
+                    wasm_f32x4_splat(GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d))));
     }
 
     sumf = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) +
@@ -564,7 +565,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
         }
 
         int sumi = sumi0 + sumi1;
-        sumf += (
+        sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
     }
 
     *s = sumf;
@@ -620,7 +621,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
         const v128_t sum_dots = wasm_i32x4_add(wasm_i32x4_add(dx0_0, dx0_1), wasm_i32x4_add(dx1_0, dx1_1));
 
         // Convert to float and accumulate
-        const float scale =
+        const float scale = GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d);
         sumv = wasm_f32x4_add(sumv, wasm_f32x4_mul(wasm_f32x4_convert_i32x4(sum_dots), wasm_f32x4_splat(scale)));
     }
 
@@ -635,7 +636,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
             sumi += x[ib].qs[j]*y[ib].qs[j];
         }
 
-        sumf += sumi*(
+        sumf += sumi*(GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d));
     }
 
     *s = sumf;
@@ -746,8 +747,8 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
             isum += wasm_i32x4_extract_lane(isum_vec, 0);
         }
 
-        const float dall =
-        const float dmin =
+        const float dall = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
+        const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
         sumf += dall * isum - dmin * summs;
     }
 
@@ -768,8 +769,8 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
             summs += y[i].bsums[j] * (sc[j] >> 4);
         }
 
-        const float dall = y[i].d *
-        const float dmin = y[i].d *
+        const float dall = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
+        const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
 
         int isum = 0;
         int is = 0;
@@ -880,7 +881,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
         }
 
         // Accumulate results
-        const float d =
+        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
         const v128_t v_d = wasm_f32x4_splat(d);
         v128_t v_sum = wasm_f32x4_add(
             wasm_f32x4_mul(wasm_f32x4_convert_i32x4(v_acc0), v_d),
@@ -957,7 +958,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
             for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l];
             q8 += 8; a += 8;
         }
-        const float d =
+        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
         for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
     }
     for (int l = 0; l < 8; ++l) sumf += sums[l];
@@ -991,8 +992,8 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
     float sumf = 0;
 
     for (int i = 0; i < nb; ++i) {
-        const float d = y[i].d *
-        const float dmin = y[i].d *
+        const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
+        const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin); // Corrected sign
 
         const uint8_t * GGML_RESTRICT q4 = x[i].qs;
         const int8_t * GGML_RESTRICT q8 = y[i].qs;
@@ -1136,9 +1137,9 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
            q8 += 8; a += 8;
        }
-        const float d =
+        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
         for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
-        const float dmin =
+        const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
         sumf -= dmin * sumi;
     }
     for (int l = 0; l < 8; ++l) sumf += sums[l];
@@ -1170,8 +1171,8 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
     float sumf = 0;
 
     for (int i = 0; i < nb; ++i) {
-        const float d = y[i].d *
-        const float dmin = y[i].d *
+        const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
+        const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin); // Fixed sign
 
         const uint8_t * GGML_RESTRICT q5 = x[i].qs;
         const uint8_t * GGML_RESTRICT qh = x[i].qh;
@@ -1331,9 +1332,9 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
            q8 += 8; a += 8;
        }
-        const float d =
+        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
         for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
-        const float dmin =
+        const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
         sumf -= dmin * sumi;
     }
     for (int l = 0; l < 8; ++l) sumf += sums[l];
@@ -1420,7 +1421,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
         wasm_v128_store(&aux32[0], acc0);
         wasm_v128_store(&aux32[4], acc1);
 
-        const float d =
+        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
         for (int l = 0; l < 8; ++l) {
             sums[l] += d * aux32[l];
         }
@@ -1470,7 +1471,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
            q8 += 8; a += 8;
        }
-        const float d =
+        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
         for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
     }
     for (int l = 0; l < 8; ++l) sumf += sums[l];
```
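Every hunk above follows the same pattern: the FP16 block scales stored in the quantized blocks (`d`, `dmin`, `m`, `s`) are now widened or narrowed through the CPU backend's `GGML_CPU_FP16_TO_FP32` / `GGML_CPU_FP32_TO_FP16` macros, pulled in by the newly added `#include "simd-mappings.h"`, before they enter the accumulation. As a rough sketch of the scalar fallback this pattern corresponds to (the kind of loop shown in the `ggml_vec_dot_q8_0_q8_0` hunk), here is a minimal self-contained example; the `demo_*` struct, helper, and `DEMO_QK8_0` constant are simplified stand-ins for illustration only, not the library's actual `block_q8_0` layout or macro definitions.

```c
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Simplified stand-ins: the real block_q8_0 lives in ggml-common.h and the
 * real FP16 conversion macros live in the CPU backend's simd-mappings.h. */
#define DEMO_QK8_0 32

typedef struct {
    uint16_t d;               /* per-block scale, stored as an IEEE-754 half */
    int8_t   qs[DEMO_QK8_0];  /* 32 signed 8-bit quantized values            */
} demo_block_q8_0;

/* Portable FP16 -> FP32 widening, standing in for GGML_CPU_FP16_TO_FP32. */
static float demo_fp16_to_fp32(uint16_t h) {
    const uint32_t sign = (uint32_t)(h & 0x8000u) << 16;
    int      exp  = (h >> 10) & 0x1F;
    uint32_t mant = h & 0x3FFu;
    uint32_t bits;
    if (exp == 0x1F) {                       /* inf / NaN */
        bits = sign | 0x7F800000u | (mant << 13);
    } else if (exp == 0) {
        if (mant == 0) {                     /* +/- zero */
            bits = sign;
        } else {                             /* subnormal: renormalize */
            while (!(mant & 0x400u)) { mant <<= 1; exp--; }
            exp += 1;
            mant &= 0x3FFu;
            bits = sign | ((uint32_t)(exp + 112) << 23) | (mant << 13);
        }
    } else {                                 /* normal number */
        bits = sign | ((uint32_t)(exp + 112) << 23) | (mant << 13);
    }
    float f;
    memcpy(&f, &bits, sizeof f);
    return f;
}

/* Scalar q8_0 x q8_0 dot product mirroring the pattern in the diff:
 * integer multiply-accumulate per block, then scale by both FP16 deltas. */
static float demo_vec_dot_q8_0(int n, const demo_block_q8_0 *x, const demo_block_q8_0 *y) {
    const int nb = n / DEMO_QK8_0;
    float sumf = 0.0f;
    for (int ib = 0; ib < nb; ++ib) {
        int sumi = 0;
        for (int j = 0; j < DEMO_QK8_0; ++j) {
            sumi += x[ib].qs[j] * y[ib].qs[j];
        }
        sumf += sumi * (demo_fp16_to_fp32(x[ib].d) * demo_fp16_to_fp32(y[ib].d));
    }
    return sumf;
}

int main(void) {
    demo_block_q8_0 a = { 0x3C00 /* 1.0 as half */, {0} };
    demo_block_q8_0 b = { 0x3C00, {0} };
    for (int j = 0; j < DEMO_QK8_0; ++j) { a.qs[j] = 1; b.qs[j] = 2; }
    printf("dot = %f\n", demo_vec_dot_q8_0(DEMO_QK8_0, &a, &b)); /* 32 * 2 * 1.0 * 1.0 = 64 */
    return 0;
}
```

The SIMD paths in the diff compute the same quantity with `wasm_i32x4_dot_i16x8` accumulators and a `wasm_f32x4_splat` of the converted scales; only the conversion macro name changed in this release, not the arithmetic.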