@fugood/llama.node 1.1.10 → 1.2.0-rc.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (77)
  1. package/CMakeLists.txt +5 -8
  2. package/lib/binding.ts +20 -2
  3. package/lib/index.js +2 -2
  4. package/lib/index.ts +2 -2
  5. package/package.json +20 -16
  6. package/src/DecodeAudioTokenWorker.cpp +23 -26
  7. package/src/DecodeAudioTokenWorker.h +6 -8
  8. package/src/DetokenizeWorker.cpp +5 -8
  9. package/src/DetokenizeWorker.h +6 -5
  10. package/src/DisposeWorker.cpp +23 -3
  11. package/src/DisposeWorker.h +4 -2
  12. package/src/EmbeddingWorker.cpp +9 -35
  13. package/src/EmbeddingWorker.h +3 -2
  14. package/src/LlamaCompletionWorker.cpp +217 -315
  15. package/src/LlamaCompletionWorker.h +6 -12
  16. package/src/LlamaContext.cpp +174 -388
  17. package/src/LlamaContext.h +8 -13
  18. package/src/LoadSessionWorker.cpp +22 -19
  19. package/src/LoadSessionWorker.h +3 -2
  20. package/src/RerankWorker.h +3 -2
  21. package/src/SaveSessionWorker.cpp +22 -19
  22. package/src/SaveSessionWorker.h +3 -2
  23. package/src/TokenizeWorker.cpp +38 -35
  24. package/src/TokenizeWorker.h +12 -3
  25. package/src/common.hpp +0 -458
  26. package/src/llama.cpp/common/arg.cpp +67 -37
  27. package/src/llama.cpp/common/chat.cpp +263 -2
  28. package/src/llama.cpp/common/chat.h +4 -0
  29. package/src/llama.cpp/common/common.cpp +10 -3
  30. package/src/llama.cpp/common/common.h +5 -2
  31. package/src/llama.cpp/common/log.cpp +53 -2
  32. package/src/llama.cpp/common/log.h +10 -4
  33. package/src/llama.cpp/common/sampling.cpp +23 -2
  34. package/src/llama.cpp/common/sampling.h +3 -1
  35. package/src/llama.cpp/common/speculative.cpp +1 -1
  36. package/src/llama.cpp/ggml/CMakeLists.txt +4 -3
  37. package/src/llama.cpp/ggml/include/ggml-backend.h +3 -0
  38. package/src/llama.cpp/ggml/include/ggml-cpu.h +0 -1
  39. package/src/llama.cpp/ggml/include/ggml.h +50 -1
  40. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +19 -16
  41. package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +210 -96
  42. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +1 -7
  43. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +11 -37
  44. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +3 -4
  45. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +43 -6
  46. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +4 -1
  47. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +18 -18
  48. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +232 -123
  49. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +234 -16
  50. package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +1 -0
  51. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +80 -51
  52. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +161 -20
  53. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +399 -50
  54. package/src/llama.cpp/include/llama.h +32 -7
  55. package/src/llama.cpp/src/llama-adapter.cpp +101 -4
  56. package/src/llama.cpp/src/llama-adapter.h +6 -0
  57. package/src/llama.cpp/src/llama-arch.cpp +69 -2
  58. package/src/llama.cpp/src/llama-arch.h +6 -0
  59. package/src/llama.cpp/src/llama-context.cpp +92 -45
  60. package/src/llama.cpp/src/llama-context.h +1 -5
  61. package/src/llama.cpp/src/llama-graph.cpp +74 -19
  62. package/src/llama.cpp/src/llama-graph.h +10 -1
  63. package/src/llama.cpp/src/llama-hparams.cpp +37 -0
  64. package/src/llama.cpp/src/llama-hparams.h +9 -3
  65. package/src/llama.cpp/src/llama-impl.h +2 -0
  66. package/src/llama.cpp/src/llama-kv-cache.cpp +33 -120
  67. package/src/llama.cpp/src/llama-kv-cache.h +4 -13
  68. package/src/llama.cpp/src/llama-model-loader.cpp +1 -0
  69. package/src/llama.cpp/src/llama-model.cpp +434 -21
  70. package/src/llama.cpp/src/llama-model.h +1 -1
  71. package/src/llama.cpp/src/llama-sampling.cpp +226 -126
  72. package/src/llama.cpp/src/llama-vocab.cpp +1 -1
  73. package/src/llama.cpp/src/llama.cpp +12 -0
  74. package/src/anyascii.c +0 -22223
  75. package/src/anyascii.h +0 -42
  76. package/src/tts_utils.cpp +0 -371
  77. package/src/tts_utils.h +0 -103
@@ -84,6 +84,22 @@ void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const float * G
84
84
  }
85
85
  // reduce sum1,sum2 to sum1
86
86
  GGML_F32_VEC_REDUCE(sumf, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8);
87
+ #elif defined(__riscv_v_intrinsic)
88
+ int vl = __riscv_vsetvlmax_e32m8();
89
+ vfloat32m1_t vs = __riscv_vfmv_v_f_f32m1(0.0f, 1);
90
+ vfloat32m8_t vsum;
91
+ vfloat32m8_t ax;
92
+ vfloat32m8_t ay;
93
+ vsum = __riscv_vfmv_v_f_f32m8_tu(vsum, 0.0f, vl);
94
+ for (int i = 0; i < n; i += vl) {
95
+ vl = __riscv_vsetvl_e32m8(n - i);
96
+ ax = __riscv_vle32_v_f32m8_tu(ax, &x[i], vl);
97
+ ay = __riscv_vle32_v_f32m8_tu(ay, &y[i], vl);
98
+ vsum = __riscv_vfmacc_vv_f32m8_tu(vsum, ax, ay, vl);
99
+ }
100
+ vl = __riscv_vsetvlmax_e32m8();
101
+ vs = __riscv_vfredusum_vs_f32m8_f32m1(vsum, vs, vl);
102
+ sumf += __riscv_vfmv_f_s_f32m1_f32(vs);
87
103
  #else
88
104
  const int np = (n & ~(GGML_F32_STEP - 1));
89
105
 
@@ -197,38 +213,125 @@ void ggml_vec_dot_f16(int n, float * GGML_RESTRICT s, size_t bs, ggml_fp16_t * G
197
213
 
198
214
  ggml_float sumf = 0.0;
199
215
 
200
- #if defined(GGML_SIMD)
201
- const int np = (n & ~(GGML_F16_STEP - 1));
202
216
 
203
- GGML_F16_VEC sum[GGML_F16_ARR] = { GGML_F16_VEC_ZERO };
217
+ #if defined(GGML_SIMD)
218
+ #if defined(__ARM_FEATURE_SVE)
219
+ const int sve_register_length = svcntb() * 8; //get vector length
220
+ const int ggml_f16_epr = sve_register_length / 16; // running when 16
221
+ const int ggml_f16_step = 8 * ggml_f16_epr; // choose 8 SVE registers
222
+
223
+ const int np= (n & ~(ggml_f16_step - 1));
224
+ svfloat16_t sum1 = svdup_n_f16(0.0f);
225
+ svfloat16_t sum2 = svdup_n_f16(0.0f);
226
+ svfloat16_t sum3 = svdup_n_f16(0.0f);
227
+ svfloat16_t sum4 = svdup_n_f16(0.0f);
228
+
229
+ svfloat16_t ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8;
230
+ svfloat16_t ay1, ay2, ay3, ay4, ay5, ay6, ay7, ay8;
231
+ for (int i = 0; i < np; i += ggml_f16_step) {
232
+ ax1 = GGML_F16x_VEC_LOAD(x + i + 0 * ggml_f16_epr, 0);
233
+ ay1 = GGML_F16x_VEC_LOAD(y + i + 0 * ggml_f16_epr, 0);
234
+ sum1 = GGML_F16x_VEC_FMA(sum1, ax1, ay1);
235
+
236
+ ax2 = GGML_F16x_VEC_LOAD(x + i + 1 * ggml_f16_epr, 1);
237
+ ay2 = GGML_F16x_VEC_LOAD(y + i + 1 * ggml_f16_epr, 1);
238
+ sum2 = GGML_F16x_VEC_FMA(sum2, ax2, ay2);
239
+
240
+ ax3 = GGML_F16x_VEC_LOAD(x + i + 2 * ggml_f16_epr, 2);
241
+ ay3 = GGML_F16x_VEC_LOAD(y + i + 2 * ggml_f16_epr, 2);
242
+ sum3 = GGML_F16x_VEC_FMA(sum3, ax3, ay3);
243
+
244
+ ax4 = GGML_F16x_VEC_LOAD(x + i + 3 * ggml_f16_epr, 3);
245
+ ay4 = GGML_F16x_VEC_LOAD(y + i + 3 * ggml_f16_epr, 3);
246
+ sum4 = GGML_F16x_VEC_FMA(sum4, ax4, ay4);
247
+
248
+ ax5 = GGML_F16x_VEC_LOAD(x + i + 4 * ggml_f16_epr, 4);
249
+ ay5 = GGML_F16x_VEC_LOAD(y + i + 4 * ggml_f16_epr, 4);
250
+ sum1 = GGML_F16x_VEC_FMA(sum1, ax5, ay5);
251
+
252
+ ax6 = GGML_F16x_VEC_LOAD(x + i + 5 * ggml_f16_epr, 5);
253
+ ay6 = GGML_F16x_VEC_LOAD(y + i + 5 * ggml_f16_epr, 5);
254
+ sum2 = GGML_F16x_VEC_FMA(sum2, ax6, ay6);
255
+
256
+ ax7 = GGML_F16x_VEC_LOAD(x + i + 6 * ggml_f16_epr, 6);
257
+ ay7 = GGML_F16x_VEC_LOAD(y + i + 6 * ggml_f16_epr, 6);
258
+ sum3 = GGML_F16x_VEC_FMA(sum3, ax7, ay7);
259
+
260
+ ax8 = GGML_F16x_VEC_LOAD(x + i + 7 * ggml_f16_epr, 7);
261
+ ay8 = GGML_F16x_VEC_LOAD(y + i + 7 * ggml_f16_epr, 7);
262
+ sum4 = GGML_F16x_VEC_FMA(sum4, ax8, ay8);
263
+ }
204
264
 
205
- GGML_F16_VEC ax[GGML_F16_ARR];
206
- GGML_F16_VEC ay[GGML_F16_ARR];
265
+ const int np2 = (n & ~(ggml_f16_epr - 1)); // round down to multiple of 8
266
+ for (int k = np; k < np2; k += ggml_f16_epr) {
267
+ svfloat16_t rx = GGML_F16x_VEC_LOAD(x + k, 0);
268
+ svfloat16_t ry = GGML_F16x_VEC_LOAD(y + k, 0);
269
+ sum1 = GGML_F16x_VEC_FMA(sum1, rx, ry);
270
+ }
207
271
 
208
- for (int i = 0; i < np; i += GGML_F16_STEP) {
209
- for (int j = 0; j < GGML_F16_ARR; j++) {
210
- ax[j] = GGML_F16_VEC_LOAD(x + i + j*GGML_F16_EPR, j);
211
- ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
272
+ if (np2 < n) {
273
+ svbool_t pg = svwhilelt_b16(np2, n);
274
+ svfloat16_t hx = svld1_f16(pg, (const __fp16 *)(x + np2));
275
+ svfloat16_t hy = svld1_f16(pg, (const __fp16 *)(y + np2));
212
276
 
213
- sum[j] = GGML_F16_VEC_FMA(sum[j], ax[j], ay[j]);
277
+ sum1 = svmad_f16_x(pg, hx, hy, sum1);
214
278
  }
215
- }
279
+ GGML_F16x_VEC_REDUCE(sumf, sum1, sum2, sum3, sum4);
280
+ #elif defined(__riscv_v_intrinsic)
281
+ #if defined(__riscv_zvfh)
282
+ int vl = __riscv_vsetvlmax_e32m2();
283
+ vfloat32m1_t vs = __riscv_vfmv_v_f_f32m1(0.0f, 1);
284
+ vfloat32m2_t vsum;
285
+ vfloat16m1_t ax;
286
+ vfloat16m1_t ay;
287
+ vsum = __riscv_vreinterpret_v_u32m2_f32m2(__riscv_vmv_v_x_u32m2(0, vl));
288
+ for (int i = 0; i < n; i += vl) {
289
+ vl = __riscv_vsetvl_e16m1(n - i);
290
+ ax = __riscv_vle16_v_f16m1_tu(ax, (const _Float16 *)&x[i], vl);
291
+ ay = __riscv_vle16_v_f16m1_tu(ay, (const _Float16 *)&y[i], vl);
292
+ vsum = __riscv_vfwmacc_vv_f32m2_tu(vsum, ax, ay, vl);
293
+ }
294
+ vl = __riscv_vsetvlmax_e32m1();
295
+ vfloat32m1_t ac0 = __riscv_vfadd_vv_f32m1(__riscv_vget_v_f32m2_f32m1(vsum, 0), __riscv_vget_v_f32m2_f32m1(vsum, 1), vl);
296
+ vs = __riscv_vfredusum_vs_f32m1_f32m1(ac0, vs, vl);
297
+ sumf += __riscv_vfmv_f_s_f32m1_f32(vs);
298
+ #else
299
+ for (int i = 0; i < n; ++i) {
300
+ sumf += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[i])*GGML_CPU_FP16_TO_FP32(y[i]));
301
+ }
302
+ #endif // __riscv_zvfh
303
+ #else
304
+ const int np = (n & ~(GGML_F16_STEP - 1));
216
305
 
217
- // reduce sum0..sum3 to sum0
218
- GGML_F16_VEC_REDUCE(sumf, sum);
306
+ GGML_F16_VEC sum[GGML_F16_ARR] = { GGML_F16_VEC_ZERO };
219
307
 
220
- // leftovers
221
- for (int i = np; i < n; ++i) {
222
- sumf += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[i])*GGML_CPU_FP16_TO_FP32(y[i]));
223
- }
308
+ GGML_F16_VEC ax[GGML_F16_ARR];
309
+ GGML_F16_VEC ay[GGML_F16_ARR];
310
+
311
+ for (int i = 0; i < np; i += GGML_F16_STEP) {
312
+ for (int j = 0; j < GGML_F16_ARR; j++) {
313
+ ax[j] = GGML_F16_VEC_LOAD(x + i + j*GGML_F16_EPR, j);
314
+ ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
224
315
 
225
- // if you hit this, you are likely running outside the FP range
226
- assert(!isnan(sumf) && !isinf(sumf));
316
+ sum[j] = GGML_F16_VEC_FMA(sum[j], ax[j], ay[j]);
317
+ }
318
+ }
319
+
320
+ // reduce sum0..sum3 to sum0
321
+ GGML_F16_VEC_REDUCE(sumf, sum);
322
+
323
+ // leftovers
324
+ for (int i = np; i < n; ++i) {
325
+ sumf += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[i])*GGML_CPU_FP16_TO_FP32(y[i]));
326
+ }
327
+ // if you hit this, you are likely running outside the FP range
328
+ assert(!isnan(sumf) && !isinf(sumf));
329
+ #endif
227
330
  #else
228
331
  for (int i = 0; i < n; ++i) {
229
332
  sumf += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[i])*GGML_CPU_FP16_TO_FP32(y[i]));
230
333
  }
231
- #endif
334
+ #endif // GGML_SIMD
232
335
 
233
336
  *s = sumf;
234
337
  }
@@ -247,6 +350,12 @@ void ggml_vec_silu_f32(const int n, float * y, const float * x) {
247
350
  for (; i + 3 < n; i += 4) {
248
351
  _mm_storeu_ps(y + i, ggml_v_silu(_mm_loadu_ps(x + i)));
249
352
  }
353
+ #elif defined(__ARM_FEATURE_SVE) && defined(__aarch64__)
354
+ const int vlen = svcntw();
355
+ for (; i < n; i += vlen) {
356
+ const svbool_t pg = svwhilelt_b32_s32(i, n);
357
+ svst1_f32(pg, y + i, ggml_v_silu(pg, svld1_f32(pg, x + i)));
358
+ }
250
359
  #elif defined(__ARM_NEON) && defined(__aarch64__)
251
360
  for (; i + 3 < n; i += 4) {
252
361
  vst1q_f32(y + i, ggml_v_silu(vld1q_f32(x + i)));
@@ -271,10 +380,24 @@ void ggml_vec_swiglu_f32(const int n, float * y, const float * x, const float *
271
380
  for (; i + 3 < n; i += 4) {
272
381
  _mm_storeu_ps(y + i, _mm_mul_ps(ggml_v_silu(_mm_loadu_ps(x + i)), _mm_loadu_ps(g + i)));
273
382
  }
383
+ #elif defined(__ARM_FEATURE_SVE) && defined(__aarch64__)
384
+ const int vlen = svcntw();
385
+ for (; i < n; i += vlen) {
386
+ const svbool_t pg = svwhilelt_b32_s32(i, n);
387
+ svst1_f32(pg, y + i, svmul_f32_x(pg, ggml_v_silu(pg, svld1_f32(pg, x + i)), svld1_f32(pg, g + i)));
388
+ }
274
389
  #elif defined(__ARM_NEON) && defined(__aarch64__)
275
390
  for (; i + 3 < n; i += 4) {
276
391
  vst1q_f32(y + i, vmulq_f32(ggml_v_silu(vld1q_f32(x + i)), vld1q_f32(g + i)));
277
392
  }
393
+ #elif defined(__riscv_v_intrinsic)
394
+ for (int vl; i < n; i += vl) {
395
+ vl = __riscv_vsetvl_e32m2(n - i);
396
+ vfloat32m2_t vx = __riscv_vle32_v_f32m2(&x[i], vl);
397
+ vfloat32m2_t vg = __riscv_vle32_v_f32m2(&g[i], vl);
398
+ vfloat32m2_t vy = __riscv_vfmul_vv_f32m2(ggml_v_silu_m2(vx, vl), vg, vl);
399
+ __riscv_vse32_v_f32m2(&y[i], vy, vl);
400
+ }
278
401
  #endif
279
402
  for (; i < n; ++i) {
280
403
  y[i] = ggml_silu_f32(x[i]) * g[i];
@@ -318,6 +441,15 @@ ggml_float ggml_vec_soft_max_f32(const int n, float * y, const float * x, float
318
441
  #endif
319
442
  sum += (ggml_float)_mm_cvtss_f32(val);
320
443
  }
444
+ #elif defined(__ARM_FEATURE_SVE) && defined(__aarch64__)
445
+ const int vlen = svcntw();
446
+ for (; i < n; i += vlen) {
447
+ const svbool_t pg = svwhilelt_b32_s32(i, n);
448
+ svfloat32_t val = ggml_v_expf(pg, svsub_f32_x(pg, svld1_f32(pg, x + i),
449
+ svdup_n_f32_x(pg, max)));
450
+ svst1_f32(pg, y + i, val);
451
+ sum += (ggml_float)svaddv_f32(pg, val);
452
+ }
321
453
  #elif defined(__ARM_NEON) && defined(__aarch64__)
322
454
  for (; i + 3 < n; i += 4) {
323
455
  float32x4_t val = ggml_v_expf(vsubq_f32(vld1q_f32(x + i),
@@ -325,6 +457,15 @@ ggml_float ggml_vec_soft_max_f32(const int n, float * y, const float * x, float
325
457
  vst1q_f32(y + i, val);
326
458
  sum += (ggml_float)vaddvq_f32(val);
327
459
  }
460
+ #elif defined(__riscv_v_intrinsic)
461
+ vfloat64m1_t vsum = __riscv_vfmv_v_f_f64m1(0, 1);
462
+ for (int avl; i < n; i += avl) {
463
+ avl = __riscv_vsetvl_e32m2(n - i);
464
+ vfloat32m2_t val = ggml_v_expf_m2(__riscv_vfsub_vf_f32m2(__riscv_vle32_v_f32m2(&x[i], avl), max, avl), avl);
465
+ __riscv_vse32_v_f32m2(&y[i], val, avl);
466
+ vsum = __riscv_vfwredusum_vs_f32m2_f64m1(val, vsum, avl);
467
+ }
468
+ return (ggml_float)__riscv_vfmv_f_s_f64m1_f64(vsum);
328
469
  #endif
329
470
  for (; i < n; ++i) {
330
471
  float val = expf(x[i] - max);