@fugood/llama.node 1.0.0-beta.7 → 1.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. package/CMakeLists.txt +2 -0
  2. package/lib/binding.ts +10 -0
  3. package/lib/index.js +8 -0
  4. package/lib/index.ts +14 -0
  5. package/package.json +14 -14
  6. package/src/LlamaContext.cpp +58 -8
  7. package/src/LlamaContext.h +1 -0
  8. package/src/RerankWorker.h +26 -0
  9. package/src/llama.cpp/CMakeLists.txt +1 -1
  10. package/src/llama.cpp/common/json-schema-to-grammar.cpp +3 -46
  11. package/src/llama.cpp/ggml/CMakeLists.txt +1 -0
  12. package/src/llama.cpp/ggml/include/ggml-cpu.h +1 -0
  13. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +8 -0
  14. package/src/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp +10 -9
  15. package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +109 -108
  16. package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +13 -12
  17. package/src/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +53 -52
  18. package/src/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +56 -55
  19. package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +42 -41
  20. package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp +24 -23
  21. package/src/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +29 -28
  22. package/src/llama.cpp/ggml/src/ggml-cpu/arch/wasm/quants.c +30 -29
  23. package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +83 -82
  24. package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +20 -19
  25. package/src/llama.cpp/ggml/src/ggml-cpu/common.h +3 -2
  26. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +9 -3
  27. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +59 -16
  28. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +3 -0
  29. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +3 -2
  30. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +48 -48
  31. package/src/llama.cpp/ggml/src/ggml-cpu/quants.c +25 -24
  32. package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +15 -14
  33. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +211 -33
  34. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +2 -2
  35. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +45 -45
  36. package/src/llama.cpp/include/llama.h +6 -3
  37. package/src/llama.cpp/src/llama-arch.cpp +54 -0
  38. package/src/llama.cpp/src/llama-arch.h +17 -0
  39. package/src/llama.cpp/src/llama-batch.cpp +20 -7
  40. package/src/llama.cpp/src/llama-chat.cpp +11 -6
  41. package/src/llama.cpp/src/llama-context.cpp +0 -1
  42. package/src/llama.cpp/src/llama-graph.cpp +19 -4
  43. package/src/llama.cpp/src/llama-graph.h +14 -2
  44. package/src/llama.cpp/src/llama-hparams.h +6 -0
  45. package/src/llama.cpp/src/llama-kv-cache-unified.cpp +28 -2
  46. package/src/llama.cpp/src/llama-kv-cells.h +33 -9
  47. package/src/llama.cpp/src/llama-model.cpp +518 -1
  48. package/src/llama.cpp/src/llama-model.h +22 -0
  49. package/src/llama.cpp/src/llama-quant.cpp +87 -5
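
Only one of the 49 files has its hunks reproduced below: package/src/llama.cpp/ggml/src/ggml-cpu/arch/wasm/quants.c (entry 22, +30 -29). Apart from the new #include "simd-mappings.h", every hunk makes the same mechanical change: the generic FP16 conversion macros GGML_FP32_TO_FP16 / GGML_FP16_TO_FP32 are replaced by the CPU-backend variants GGML_CPU_FP32_TO_FP16 / GGML_CPU_FP16_TO_FP32 provided via simd-mappings.h. As a rough illustration of that indirection (the macro names come from the diff; the decoder body and the fallback wiring below are assumptions made only so the sketch compiles, not ggml's actual implementation):

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    typedef uint16_t ggml_fp16_t;

    /* Reference binary16 -> binary32 decoder standing in for the generic macro.
       Subnormal halves are flushed to zero to keep the sketch short. */
    static inline float fp16_to_fp32_ref(ggml_fp16_t h) {
        const uint32_t sign = (uint32_t)(h & 0x8000) << 16;
        const uint32_t exp  = (h >> 10) & 0x1F;
        const uint32_t man  = (uint32_t)(h & 0x3FF) << 13;
        uint32_t bits;
        if (exp == 0x1F)   bits = sign | 0x7F800000u | man;            /* inf / NaN      */
        else if (exp == 0) bits = sign;                                /* zero/subnormal */
        else               bits = sign | ((exp + 112u) << 23) | man;   /* normal         */
        float f;
        memcpy(&f, &bits, sizeof f);
        return f;
    }

    #define GGML_FP16_TO_FP32(x) fp16_to_fp32_ref(x)

    /* The simd-mappings.h idea: default to the generic conversion, but let a
       backend define a faster GGML_CPU_* variant first (assumed wiring). */
    #ifndef GGML_CPU_FP16_TO_FP32
    #define GGML_CPU_FP16_TO_FP32(x) GGML_FP16_TO_FP32(x)
    #endif

    int main(void) {
        const ggml_fp16_t one_half_f16 = 0x3800;   /* binary16 encoding of 0.5f */
        /* before the diff: GGML_FP16_TO_FP32(one_half_f16)
           after the diff:  GGML_CPU_FP16_TO_FP32(one_half_f16) */
        printf("%f\n", GGML_CPU_FP16_TO_FP32(one_half_f16));
        return 0;
    }
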
@@ -3,6 +3,7 @@
 #include "ggml-quants.h"
 #include "ggml-impl.h"
 #include "ggml-cpu.h"
+#include "simd-mappings.h"
 
 #include "../../quants.h"
 #include "../../ggml-cpu-impl.h"
@@ -65,7 +66,7 @@ void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
         const float d = amax / ((1 << 7) - 1);
         const float id = d ? 1.0f/d : 0.0f;
 
-        y[i].d = GGML_FP32_TO_FP16(d);
+        y[i].d = GGML_CPU_FP32_TO_FP16(d);
 
         for (int j = 0; j < 8; j++) {
             const v128_t v = wasm_f32x4_mul(srcv[j], wasm_f32x4_splat(id));
@@ -110,7 +111,7 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
         const float d = amax / ((1 << 7) - 1);
         const float id = d ? 1.0f/d : 0.0f;
 
-        y[i].d = GGML_FP32_TO_FP16(d);
+        y[i].d = GGML_CPU_FP32_TO_FP16(d);
 
         v128_t accv = wasm_i32x4_splat(0);
 
@@ -126,7 +127,7 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
             accv = wasm_i32x4_add(accv, vi);
         }
 
-        y[i].s = GGML_FP32_TO_FP16(
+        y[i].s = GGML_CPU_FP32_TO_FP16(
             d * (wasm_i32x4_extract_lane(accv, 0) +
                  wasm_i32x4_extract_lane(accv, 1) +
                  wasm_i32x4_extract_lane(accv, 2) +
@@ -324,8 +325,8 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
         );
 
         // Accumulate results with scaling
-        float scale0 = GGML_FP16_TO_FP32(x0->d) * GGML_FP16_TO_FP32(y0->d);
-        float scale1 = GGML_FP16_TO_FP32(x1->d) * GGML_FP16_TO_FP32(y1->d);
+        float scale0 = GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d);
+        float scale1 = GGML_CPU_FP16_TO_FP32(x1->d) * GGML_CPU_FP16_TO_FP32(y1->d);
 
         sumv = wasm_f32x4_add(sumv, wasm_f32x4_mul(wasm_f32x4_convert_i32x4(dp0), wasm_f32x4_splat(scale0)));
         sumv = wasm_f32x4_add(sumv, wasm_f32x4_mul(wasm_f32x4_convert_i32x4(dp1), wasm_f32x4_splat(scale1)));
@@ -348,7 +349,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
         }
 
         int sumi = sumi0 + sumi1;
-        sumf += sumi*GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d);
+        sumf += sumi*GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d);
     }
 
     *s = sumf;
@@ -428,7 +429,7 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
                                 wasm_i32x4_dot_i16x8(v0lfh, v1lh)),
                 wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0hfl, v1hl),
                                 wasm_i32x4_dot_i16x8(v0hfh, v1hh)))),
-            wasm_f32x4_splat(GGML_FP16_TO_FP32(x0->d) * GGML_FP16_TO_FP32(y0->d))));
+            wasm_f32x4_splat(GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d))));
     }
 
     sumf = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) +
@@ -454,7 +455,7 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
         }
 
         int sumi = sumi0 + sumi1;
-        sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d)) * sumi;
+        sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d)) * sumi;
     }
 
     *s = sumf;
@@ -491,7 +492,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
         const block_q5_1 * GGML_RESTRICT x0 = &x[ib];
         const block_q8_1 * GGML_RESTRICT y0 = &y[ib];
 
-        summs += GGML_FP16_TO_FP32(x0->m) * GGML_FP16_TO_FP32(y0->s);
+        summs += GGML_CPU_FP16_TO_FP32(x0->m) * GGML_CPU_FP16_TO_FP32(y0->s);
 
         const v128_t m4b = wasm_i8x16_splat(0x0F);
 
@@ -538,7 +539,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
                                 wasm_i32x4_dot_i16x8(v0lfh, v1lh)),
                 wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0hfl, v1hl),
                                 wasm_i32x4_dot_i16x8(v0hfh, v1hh)))),
-            wasm_f32x4_splat(GGML_FP16_TO_FP32(x0->d) * GGML_FP16_TO_FP32(y0->d))));
+            wasm_f32x4_splat(GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d))));
     }
 
     sumf = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) +
@@ -564,7 +565,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
         }
 
         int sumi = sumi0 + sumi1;
-        sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d))*sumi + GGML_FP16_TO_FP32(x[ib].m)*GGML_FP16_TO_FP32(y[ib].s);
+        sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
     }
 
     *s = sumf;
@@ -620,7 +621,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
         const v128_t sum_dots = wasm_i32x4_add(wasm_i32x4_add(dx0_0, dx0_1), wasm_i32x4_add(dx1_0, dx1_1));
 
         // Convert to float and accumulate
-        const float scale = GGML_FP16_TO_FP32(x0->d) * GGML_FP16_TO_FP32(y0->d);
+        const float scale = GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d);
         sumv = wasm_f32x4_add(sumv, wasm_f32x4_mul(wasm_f32x4_convert_i32x4(sum_dots), wasm_f32x4_splat(scale)));
     }
 
@@ -635,7 +636,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
             sumi += x[ib].qs[j]*y[ib].qs[j];
         }
 
-        sumf += sumi*(GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d));
+        sumf += sumi*(GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d));
     }
 
     *s = sumf;
@@ -746,8 +747,8 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
             isum += wasm_i32x4_extract_lane(isum_vec, 0);
         }
 
-        const float dall = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
-        const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d;
+        const float dall = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
+        const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
         sumf += dall * isum - dmin * summs;
     }
 
@@ -768,8 +769,8 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
             summs += y[i].bsums[j] * (sc[j] >> 4);
         }
 
-        const float dall = y[i].d * GGML_FP16_TO_FP32(x[i].d);
-        const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
+        const float dall = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
+        const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
 
         int isum = 0;
         int is = 0;
@@ -880,7 +881,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
         }
 
         // Accumulate results
-        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
         const v128_t v_d = wasm_f32x4_splat(d);
         v128_t v_sum = wasm_f32x4_add(
             wasm_f32x4_mul(wasm_f32x4_convert_i32x4(v_acc0), v_d),
@@ -957,7 +958,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
             for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l];
             q8 += 8; a += 8;
         }
-        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
         for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
     }
     for (int l = 0; l < 8; ++l) sumf += sums[l];
@@ -991,8 +992,8 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
     float sumf = 0;
 
     for (int i = 0; i < nb; ++i) {
-        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
-        const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin); // Corrected sign
+        const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
+        const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin); // Corrected sign
 
         const uint8_t * GGML_RESTRICT q4 = x[i].qs;
         const int8_t * GGML_RESTRICT q8 = y[i].qs;
@@ -1136,9 +1137,9 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
             for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
            q8 += 8; a += 8;
        }
-        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
         for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
-        const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d;
+        const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
         sumf -= dmin * sumi;
     }
     for (int l = 0; l < 8; ++l) sumf += sums[l];
@@ -1170,8 +1171,8 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
     float sumf = 0;
 
     for (int i = 0; i < nb; ++i) {
-        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
-        const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin); // Fixed sign
+        const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
+        const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin); // Fixed sign
 
         const uint8_t * GGML_RESTRICT q5 = x[i].qs;
         const uint8_t * GGML_RESTRICT qh = x[i].qh;
@@ -1331,9 +1332,9 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
             for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
             q8 += 8; a += 8;
         }
-        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
         for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
-        const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d;
+        const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
         sumf -= dmin * sumi;
     }
     for (int l = 0; l < 8; ++l) sumf += sums[l];
@@ -1420,7 +1421,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
         wasm_v128_store(&aux32[0], acc0);
         wasm_v128_store(&aux32[4], acc1);
 
-        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
         for (int l = 0; l < 8; ++l) {
             sums[l] += d * aux32[l];
         }
@@ -1470,7 +1471,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
            q8 += 8; a += 8;
        }
-        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
        for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
    }
    for (int l = 0; l < 8; ++l) sumf += sums[l];
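
For reference, the lines being renamed in ggml_vec_dot_q8_0_q8_0's scalar fallback compute the usual block-quantized dot product: multiply the int8 quants pairwise, then scale the integer sum by the two per-block FP16 scales. A standalone sketch of that arithmetic (the block layout mirrors ggml's block_q8_0, but the scale is kept as a plain float here, an assumption made to keep the sketch self-contained):

    #include <stdint.h>
    #include <stdio.h>

    #define QK8_0 32

    /* Mirrors ggml's block_q8_0, except the per-block scale is a plain float
       here instead of fp16 (simplification for this sketch). */
    typedef struct {
        float  d;            /* per-block scale   */
        int8_t qs[QK8_0];    /* quantized values  */
    } block_q8_0_sketch;

    /* dot(x, y) over n values, n a multiple of QK8_0 */
    static float vec_dot_q8_0_sketch(int n, const block_q8_0_sketch * x,
                                            const block_q8_0_sketch * y) {
        const int nb = n / QK8_0;
        float sumf = 0.0f;
        for (int ib = 0; ib < nb; ++ib) {
            int sumi = 0;
            for (int j = 0; j < QK8_0; ++j) {
                sumi += x[ib].qs[j] * y[ib].qs[j];
            }
            /* the line the diff rewrites:
               sumf += sumi*(GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d)); */
            sumf += sumi * (x[ib].d * y[ib].d);
        }
        return sumf;
    }

    int main(void) {
        block_q8_0_sketch x = { 0.5f, {0} };
        block_q8_0_sketch y = { 2.0f, {0} };
        for (int j = 0; j < QK8_0; ++j) { x.qs[j] = 3; y.qs[j] = 4; }
        /* 32 * (3*4) * (0.5*2.0) = 384 */
        printf("%f\n", vec_dot_q8_0_sketch(QK8_0, &x, &y));
        return 0;
    }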