@fugood/llama.node 1.1.11 → 1.2.0
- package/CMakeLists.txt +5 -8
- package/lib/binding.ts +18 -1
- package/lib/index.js +2 -2
- package/lib/index.ts +2 -2
- package/package.json +20 -16
- package/src/DecodeAudioTokenWorker.cpp +23 -26
- package/src/DecodeAudioTokenWorker.h +6 -8
- package/src/DetokenizeWorker.cpp +5 -8
- package/src/DetokenizeWorker.h +6 -5
- package/src/DisposeWorker.cpp +23 -3
- package/src/DisposeWorker.h +4 -2
- package/src/EmbeddingWorker.cpp +9 -35
- package/src/EmbeddingWorker.h +3 -2
- package/src/LlamaCompletionWorker.cpp +217 -315
- package/src/LlamaCompletionWorker.h +6 -12
- package/src/LlamaContext.cpp +166 -396
- package/src/LlamaContext.h +8 -13
- package/src/LoadSessionWorker.cpp +22 -19
- package/src/LoadSessionWorker.h +3 -2
- package/src/RerankWorker.h +3 -2
- package/src/SaveSessionWorker.cpp +22 -19
- package/src/SaveSessionWorker.h +3 -2
- package/src/TokenizeWorker.cpp +38 -35
- package/src/TokenizeWorker.h +12 -3
- package/src/common.hpp +0 -458
- package/src/llama.cpp/common/arg.cpp +50 -30
- package/src/llama.cpp/common/chat.cpp +250 -1
- package/src/llama.cpp/common/chat.h +4 -0
- package/src/llama.cpp/common/common.h +1 -1
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +21 -1
- package/src/llama.cpp/common/log.cpp +53 -2
- package/src/llama.cpp/common/log.h +10 -4
- package/src/llama.cpp/common/sampling.cpp +23 -2
- package/src/llama.cpp/common/sampling.h +3 -1
- package/src/llama.cpp/common/speculative.cpp +1 -1
- package/src/llama.cpp/ggml/CMakeLists.txt +3 -2
- package/src/llama.cpp/ggml/include/ggml-backend.h +15 -0
- package/src/llama.cpp/ggml/include/ggml-cpu.h +1 -1
- package/src/llama.cpp/ggml/include/ggml-metal.h +0 -6
- package/src/llama.cpp/ggml/include/ggml.h +56 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +21 -14
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +210 -96
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +57 -59
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +6 -7
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +25 -38
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +4 -4
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +4 -12
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +379 -4
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +1 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +41 -37
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +150 -28
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +320 -73
- package/src/llama.cpp/include/llama.h +5 -6
- package/src/llama.cpp/src/llama-adapter.cpp +33 -0
- package/src/llama.cpp/src/llama-adapter.h +3 -0
- package/src/llama.cpp/src/llama-arch.cpp +28 -4
- package/src/llama.cpp/src/llama-arch.h +3 -0
- package/src/llama.cpp/src/llama-context.cpp +65 -57
- package/src/llama.cpp/src/llama-context.h +1 -1
- package/src/llama.cpp/src/llama-graph.cpp +57 -11
- package/src/llama.cpp/src/llama-graph.h +8 -0
- package/src/llama.cpp/src/llama-hparams.cpp +37 -0
- package/src/llama.cpp/src/llama-hparams.h +10 -3
- package/src/llama.cpp/src/llama-kv-cache.cpp +56 -38
- package/src/llama.cpp/src/llama-kv-cache.h +9 -0
- package/src/llama.cpp/src/llama-model.cpp +217 -97
- package/src/llama.cpp/src/llama-model.h +0 -1
- package/src/llama.cpp/src/llama-quant.cpp +3 -3
- package/src/llama.cpp/src/llama-sampling.cpp +226 -126
- package/src/llama.cpp/src/llama.cpp +53 -10
- package/src/anyascii.c +0 -22223
- package/src/anyascii.h +0 -42
- package/src/tts_utils.cpp +0 -371
- package/src/tts_utils.h +0 -103
package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp:

@@ -85,15 +85,21 @@ void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const float * G
     // reduce sum1,sum2 to sum1
     GGML_F32_VEC_REDUCE(sumf, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8);
 #elif defined(__riscv_v_intrinsic)
-    [7 removed lines; previous RVV implementation not captured in this view]
+    int vl = __riscv_vsetvlmax_e32m8();
+    vfloat32m1_t vs = __riscv_vfmv_v_f_f32m1(0.0f, 1);
+    vfloat32m8_t vsum;
+    vfloat32m8_t ax;
+    vfloat32m8_t ay;
+    vsum = __riscv_vfmv_v_f_f32m8_tu(vsum, 0.0f, vl);
+    for (int i = 0; i < n; i += vl) {
+        vl = __riscv_vsetvl_e32m8(n - i);
+        ax = __riscv_vle32_v_f32m8_tu(ax, &x[i], vl);
+        ay = __riscv_vle32_v_f32m8_tu(ay, &y[i], vl);
+        vsum = __riscv_vfmacc_vv_f32m8_tu(vsum, ax, ay, vl);
     }
-    [1 removed line; not captured in this view]
+    vl = __riscv_vsetvlmax_e32m8();
+    vs = __riscv_vfredusum_vs_f32m8_f32m1(vsum, vs, vl);
+    sumf += __riscv_vfmv_f_s_f32m1_f32(vs);
 #else
     const int np = (n & ~(GGML_F32_STEP - 1));
 
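The new RVV block strip-mines the loop: `__riscv_vsetvl_e32m8(n - i)` returns how many lanes the hardware will process on this pass, shrinking automatically on the final partial pass, and the `_tu` (tail-undisturbed) intrinsics leave inactive accumulator lanes untouched, so no separate scalar tail loop is needed. A minimal portable sketch of the same pattern, with `vl_max` standing in for the hardware vector length (illustrative only, not part of the package):

    /* Strip-mined dot product: the last pass simply runs with a smaller vl
     * instead of falling through to a scalar cleanup loop. */
    static float dot_f32_strip_mined(const float *x, const float *y, int n) {
        const int vl_max = 8;  /* stand-in for __riscv_vsetvlmax_e32m8() */
        float sum = 0.0f;      /* models the vsum accumulator plus final reduce */
        for (int i = 0; i < n; ) {
            int vl = (n - i < vl_max) ? (n - i) : vl_max;  /* vsetvl(n - i) */
            for (int j = 0; j < vl; ++j) {
                sum += x[i + j] * y[i + j];  /* one vfmacc over vl lanes */
            }
            i += vl;
        }
        return sum;
    }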
@@ -207,38 +213,125 @@ void ggml_vec_dot_f16(int n, float * GGML_RESTRICT s, size_t bs, ggml_fp16_t * G
 
     ggml_float sumf = 0.0;
 
-#if defined(GGML_SIMD) && !defined(__riscv_v_intrinsic)
-    const int np = (n & ~(GGML_F16_STEP - 1));
 
-    GGML_F16_VEC sum[GGML_F16_ARR] = { GGML_F16_VEC_ZERO };
+#if defined(GGML_SIMD)
+    #if defined(__ARM_FEATURE_SVE)
+        const int sve_register_length = svcntb() * 8; //get vector length
+        const int ggml_f16_epr = sve_register_length / 16; // running when 16
+        const int ggml_f16_step = 8 * ggml_f16_epr; // choose 8 SVE registers
+
+        const int np= (n & ~(ggml_f16_step - 1));
+        svfloat16_t sum1 = svdup_n_f16(0.0f);
+        svfloat16_t sum2 = svdup_n_f16(0.0f);
+        svfloat16_t sum3 = svdup_n_f16(0.0f);
+        svfloat16_t sum4 = svdup_n_f16(0.0f);
+
+        svfloat16_t ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8;
+        svfloat16_t ay1, ay2, ay3, ay4, ay5, ay6, ay7, ay8;
+        for (int i = 0; i < np; i += ggml_f16_step) {
+            ax1 = GGML_F16x_VEC_LOAD(x + i + 0 * ggml_f16_epr, 0);
+            ay1 = GGML_F16x_VEC_LOAD(y + i + 0 * ggml_f16_epr, 0);
+            sum1 = GGML_F16x_VEC_FMA(sum1, ax1, ay1);
+
+            ax2 = GGML_F16x_VEC_LOAD(x + i + 1 * ggml_f16_epr, 1);
+            ay2 = GGML_F16x_VEC_LOAD(y + i + 1 * ggml_f16_epr, 1);
+            sum2 = GGML_F16x_VEC_FMA(sum2, ax2, ay2);
+
+            ax3 = GGML_F16x_VEC_LOAD(x + i + 2 * ggml_f16_epr, 2);
+            ay3 = GGML_F16x_VEC_LOAD(y + i + 2 * ggml_f16_epr, 2);
+            sum3 = GGML_F16x_VEC_FMA(sum3, ax3, ay3);
+
+            ax4 = GGML_F16x_VEC_LOAD(x + i + 3 * ggml_f16_epr, 3);
+            ay4 = GGML_F16x_VEC_LOAD(y + i + 3 * ggml_f16_epr, 3);
+            sum4 = GGML_F16x_VEC_FMA(sum4, ax4, ay4);
+
+            ax5 = GGML_F16x_VEC_LOAD(x + i + 4 * ggml_f16_epr, 4);
+            ay5 = GGML_F16x_VEC_LOAD(y + i + 4 * ggml_f16_epr, 4);
+            sum1 = GGML_F16x_VEC_FMA(sum1, ax5, ay5);
+
+            ax6 = GGML_F16x_VEC_LOAD(x + i + 5 * ggml_f16_epr, 5);
+            ay6 = GGML_F16x_VEC_LOAD(y + i + 5 * ggml_f16_epr, 5);
+            sum2 = GGML_F16x_VEC_FMA(sum2, ax6, ay6);
+
+            ax7 = GGML_F16x_VEC_LOAD(x + i + 6 * ggml_f16_epr, 6);
+            ay7 = GGML_F16x_VEC_LOAD(y + i + 6 * ggml_f16_epr, 6);
+            sum3 = GGML_F16x_VEC_FMA(sum3, ax7, ay7);
+
+            ax8 = GGML_F16x_VEC_LOAD(x + i + 7 * ggml_f16_epr, 7);
+            ay8 = GGML_F16x_VEC_LOAD(y + i + 7 * ggml_f16_epr, 7);
+            sum4 = GGML_F16x_VEC_FMA(sum4, ax8, ay8);
+        }
 
-    GGML_F16_VEC ax[GGML_F16_ARR];
-    GGML_F16_VEC ay[GGML_F16_ARR];
+        const int np2 = (n & ~(ggml_f16_epr - 1)); // round down to multiple of 8
+        for (int k = np; k < np2; k += ggml_f16_epr) {
+            svfloat16_t rx = GGML_F16x_VEC_LOAD(x + k, 0);
+            svfloat16_t ry = GGML_F16x_VEC_LOAD(y + k, 0);
+            sum1 = GGML_F16x_VEC_FMA(sum1, rx, ry);
+        }
 
-    for (int i = 0; i < np; i += GGML_F16_STEP) {
-        for (int j = 0; j < GGML_F16_ARR; j++) {
-            ax[j] = GGML_F16_VEC_LOAD(x + i + j*GGML_F16_EPR, j);
-            ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
+        if (np2 < n) {
+            svbool_t pg = svwhilelt_b16(np2, n);
+            svfloat16_t hx = svld1_f16(pg, (const __fp16 *)(x + np2));
+            svfloat16_t hy = svld1_f16(pg, (const __fp16 *)(y + np2));
 
-            sum[j] = GGML_F16_VEC_FMA(sum[j], ax[j], ay[j]);
+            sum1 = svmad_f16_x(pg, hx, hy, sum1);
         }
-    }
+        GGML_F16x_VEC_REDUCE(sumf, sum1, sum2, sum3, sum4);
+    #elif defined(__riscv_v_intrinsic)
+        #if defined(__riscv_zvfh)
+            int vl = __riscv_vsetvlmax_e32m2();
+            vfloat32m1_t vs = __riscv_vfmv_v_f_f32m1(0.0f, 1);
+            vfloat32m2_t vsum;
+            vfloat16m1_t ax;
+            vfloat16m1_t ay;
+            vsum = __riscv_vreinterpret_v_u32m2_f32m2(__riscv_vmv_v_x_u32m2(0, vl));
+            for (int i = 0; i < n; i += vl) {
+                vl = __riscv_vsetvl_e16m1(n - i);
+                ax = __riscv_vle16_v_f16m1_tu(ax, (const _Float16 *)&x[i], vl);
+                ay = __riscv_vle16_v_f16m1_tu(ay, (const _Float16 *)&y[i], vl);
+                vsum = __riscv_vfwmacc_vv_f32m2_tu(vsum, ax, ay, vl);
+            }
+            vl = __riscv_vsetvlmax_e32m1();
+            vfloat32m1_t ac0 = __riscv_vfadd_vv_f32m1(__riscv_vget_v_f32m2_f32m1(vsum, 0), __riscv_vget_v_f32m2_f32m1(vsum, 1), vl);
+            vs = __riscv_vfredusum_vs_f32m1_f32m1(ac0, vs, vl);
+            sumf += __riscv_vfmv_f_s_f32m1_f32(vs);
+        #else
+            for (int i = 0; i < n; ++i) {
+                sumf += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[i])*GGML_CPU_FP16_TO_FP32(y[i]));
+            }
+        #endif // __riscv_zvfh
+    #else
+        const int np = (n & ~(GGML_F16_STEP - 1));
 
-    // reduce sum0..sum3 to sum0
-    GGML_F16_VEC_REDUCE(sumf, sum);
+        GGML_F16_VEC sum[GGML_F16_ARR] = { GGML_F16_VEC_ZERO };
 
-    // leftovers
-    for (int i = np; i < n; ++i) {
-        sumf += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[i])*GGML_CPU_FP16_TO_FP32(y[i]));
-    }
+        GGML_F16_VEC ax[GGML_F16_ARR];
+        GGML_F16_VEC ay[GGML_F16_ARR];
+
+        for (int i = 0; i < np; i += GGML_F16_STEP) {
+            for (int j = 0; j < GGML_F16_ARR; j++) {
+                ax[j] = GGML_F16_VEC_LOAD(x + i + j*GGML_F16_EPR, j);
+                ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
+
+                sum[j] = GGML_F16_VEC_FMA(sum[j], ax[j], ay[j]);
+            }
+        }
+
+        // reduce sum0..sum3 to sum0
+        GGML_F16_VEC_REDUCE(sumf, sum);
 
-    // if you hit this, you are likely running outside the FP range
-    assert(!isnan(sumf) && !isinf(sumf));
+        // leftovers
+        for (int i = np; i < n; ++i) {
+            sumf += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[i])*GGML_CPU_FP16_TO_FP32(y[i]));
+        }
+        // if you hit this, you are likely running outside the FP range
+        assert(!isnan(sumf) && !isinf(sumf));
+    #endif
 #else
     for (int i = 0; i < n; ++i) {
         sumf += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[i])*GGML_CPU_FP16_TO_FP32(y[i]));
     }
-#endif
+#endif // GGML_SIMD
 
     *s = sumf;
 }
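The SVE branch unrolls the main loop across eight vector registers but rotates over only four accumulators (sum1..sum4), so consecutive FMAs write independent registers instead of serializing on one dependency chain, and the last partial vector is handled with an svwhilelt predicate rather than scalar code; the RVV zvfh branch keeps fp16 loads but widens each product into an fp32 accumulator via vfwmacc, matching the fp32 arithmetic of the scalar fallback. A scalar sketch of the multi-accumulator idea (illustrative only, not package code):

    /* Four independent partial sums break the FMA dependency chain; they are
     * reduced once at the end, mirroring GGML_F16x_VEC_REDUCE(sumf, sum1..sum4). */
    static float dot_f32_unrolled4(const float *x, const float *y, int n) {
        float s0 = 0.0f, s1 = 0.0f, s2 = 0.0f, s3 = 0.0f;
        int i = 0;
        for (; i + 4 <= n; i += 4) {
            s0 += x[i + 0] * y[i + 0];
            s1 += x[i + 1] * y[i + 1];
            s2 += x[i + 2] * y[i + 2];
            s3 += x[i + 3] * y[i + 3];
        }
        float sum = (s0 + s1) + (s2 + s3);  /* final reduce */
        for (; i < n; ++i) {                /* leftovers */
            sum += x[i] * y[i];
        }
        return sum;
    }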
@@ -257,6 +350,12 @@ void ggml_vec_silu_f32(const int n, float * y, const float * x) {
     for (; i + 3 < n; i += 4) {
         _mm_storeu_ps(y + i, ggml_v_silu(_mm_loadu_ps(x + i)));
     }
+#elif defined(__ARM_FEATURE_SVE) && defined(__aarch64__)
+    const int vlen = svcntw();
+    for (; i < n; i += vlen) {
+        const svbool_t pg = svwhilelt_b32_s32(i, n);
+        svst1_f32(pg, y + i, ggml_v_silu(pg, svld1_f32(pg, x + i)));
+    }
 #elif defined(__ARM_NEON) && defined(__aarch64__)
     for (; i + 3 < n; i += 4) {
         vst1q_f32(y + i, ggml_v_silu(vld1q_f32(x + i)));
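Unlike the SSE and NEON branches, the SVE branch needs no scalar leftover loop: svwhilelt_b32_s32(i, n) builds a predicate that disables the out-of-range lanes on the final pass, so loads, the SiLU evaluation, and the store all stop exactly at n. A scalar model of that predication, with VLEN standing in for svcntw() (illustrative only):

    #include <math.h>
    #include <stdbool.h>

    #define VLEN 4  /* stand-in for svcntw() */

    /* Predicated SiLU: the last pass runs with some lanes switched off instead
     * of delegating the remainder to a scalar loop. */
    static void silu_f32_predicated(float *y, const float *x, int n) {
        for (int i = 0; i < n; i += VLEN) {
            for (int j = 0; j < VLEN; ++j) {
                bool active = (i + j < n);  /* lane j of whilelt(i, n) */
                if (active) {
                    y[i + j] = x[i + j] / (1.0f + expf(-x[i + j]));  /* silu(x) */
                }
            }
        }
    }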
@@ -281,10 +380,24 @@ void ggml_vec_swiglu_f32(const int n, float * y, const float * x, const float *
     for (; i + 3 < n; i += 4) {
         _mm_storeu_ps(y + i, _mm_mul_ps(ggml_v_silu(_mm_loadu_ps(x + i)), _mm_loadu_ps(g + i)));
     }
+#elif defined(__ARM_FEATURE_SVE) && defined(__aarch64__)
+    const int vlen = svcntw();
+    for (; i < n; i += vlen) {
+        const svbool_t pg = svwhilelt_b32_s32(i, n);
+        svst1_f32(pg, y + i, svmul_f32_x(pg, ggml_v_silu(pg, svld1_f32(pg, x + i)), svld1_f32(pg, g + i)));
+    }
 #elif defined(__ARM_NEON) && defined(__aarch64__)
     for (; i + 3 < n; i += 4) {
         vst1q_f32(y + i, vmulq_f32(ggml_v_silu(vld1q_f32(x + i)), vld1q_f32(g + i)));
     }
+#elif defined(__riscv_v_intrinsic)
+    for (int vl; i < n; i += vl) {
+        vl = __riscv_vsetvl_e32m2(n - i);
+        vfloat32m2_t vx = __riscv_vle32_v_f32m2(&x[i], vl);
+        vfloat32m2_t vg = __riscv_vle32_v_f32m2(&g[i], vl);
+        vfloat32m2_t vy = __riscv_vfmul_vv_f32m2(ggml_v_silu_m2(vx, vl), vg, vl);
+        __riscv_vse32_v_f32m2(&y[i], vy, vl);
+    }
 #endif
     for (; i < n; ++i) {
         y[i] = ggml_silu_f32(x[i]) * g[i];
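Both new SwiGLU branches compute the same thing as the scalar loop already present at the end of the function. A standalone scalar reference is handy when validating any of the vector paths against each other (a sketch, not package code):

    #include <math.h>

    /* Reference SwiGLU: y[i] = silu(x[i]) * g[i], matching the cleanup loop the
     * function runs for any elements a vector path leaves over. */
    static void swiglu_f32_ref(int n, float *y, const float *x, const float *g) {
        for (int i = 0; i < n; ++i) {
            const float silu = x[i] / (1.0f + expf(-x[i]));
            y[i] = silu * g[i];
        }
    }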
@@ -328,6 +441,15 @@ ggml_float ggml_vec_soft_max_f32(const int n, float * y, const float * x, float
 #endif
         sum += (ggml_float)_mm_cvtss_f32(val);
     }
+#elif defined(__ARM_FEATURE_SVE) && defined(__aarch64__)
+    const int vlen = svcntw();
+    for (; i < n; i += vlen) {
+        const svbool_t pg = svwhilelt_b32_s32(i, n);
+        svfloat32_t val = ggml_v_expf(pg, svsub_f32_x(pg, svld1_f32(pg, x + i),
+                                                      svdup_n_f32_x(pg, max)));
+        svst1_f32(pg, y + i, val);
+        sum += (ggml_float)svaddv_f32(pg, val);
+    }
 #elif defined(__ARM_NEON) && defined(__aarch64__)
     for (; i + 3 < n; i += 4) {
         float32x4_t val = ggml_v_expf(vsubq_f32(vld1q_f32(x + i),
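Like the SSE and NEON branches around it, the SVE addition computes y[i] = exp(x[i] - max) and folds the per-lane total into sum via svaddv_f32; the caller divides by that sum to finish the softmax. A scalar model of this exp-and-accumulate step (illustrative only):

    #include <math.h>

    /* Exp-and-accumulate step of a numerically stable softmax: subtracting the
     * row max keeps expf() in range; the returned sum is the normalizer the
     * caller divides by. */
    static double soft_max_exp_sum(int n, float *y, const float *x, float max) {
        double sum = 0.0;  /* plays the role of the ggml_float accumulator */
        for (int i = 0; i < n; ++i) {
            y[i] = expf(x[i] - max);
            sum += (double) y[i];
        }
        return sum;
    }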
|