@fugood/llama.node 1.1.11 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74) hide show
  1. package/CMakeLists.txt +5 -8
  2. package/lib/binding.ts +18 -1
  3. package/lib/index.js +2 -2
  4. package/lib/index.ts +2 -2
  5. package/package.json +20 -16
  6. package/src/DecodeAudioTokenWorker.cpp +23 -26
  7. package/src/DecodeAudioTokenWorker.h +6 -8
  8. package/src/DetokenizeWorker.cpp +5 -8
  9. package/src/DetokenizeWorker.h +6 -5
  10. package/src/DisposeWorker.cpp +23 -3
  11. package/src/DisposeWorker.h +4 -2
  12. package/src/EmbeddingWorker.cpp +9 -35
  13. package/src/EmbeddingWorker.h +3 -2
  14. package/src/LlamaCompletionWorker.cpp +217 -315
  15. package/src/LlamaCompletionWorker.h +6 -12
  16. package/src/LlamaContext.cpp +166 -396
  17. package/src/LlamaContext.h +8 -13
  18. package/src/LoadSessionWorker.cpp +22 -19
  19. package/src/LoadSessionWorker.h +3 -2
  20. package/src/RerankWorker.h +3 -2
  21. package/src/SaveSessionWorker.cpp +22 -19
  22. package/src/SaveSessionWorker.h +3 -2
  23. package/src/TokenizeWorker.cpp +38 -35
  24. package/src/TokenizeWorker.h +12 -3
  25. package/src/common.hpp +0 -458
  26. package/src/llama.cpp/common/arg.cpp +50 -30
  27. package/src/llama.cpp/common/chat.cpp +250 -1
  28. package/src/llama.cpp/common/chat.h +4 -0
  29. package/src/llama.cpp/common/common.h +1 -1
  30. package/src/llama.cpp/common/json-schema-to-grammar.cpp +21 -1
  31. package/src/llama.cpp/common/log.cpp +53 -2
  32. package/src/llama.cpp/common/log.h +10 -4
  33. package/src/llama.cpp/common/sampling.cpp +23 -2
  34. package/src/llama.cpp/common/sampling.h +3 -1
  35. package/src/llama.cpp/common/speculative.cpp +1 -1
  36. package/src/llama.cpp/ggml/CMakeLists.txt +3 -2
  37. package/src/llama.cpp/ggml/include/ggml-backend.h +15 -0
  38. package/src/llama.cpp/ggml/include/ggml-cpu.h +1 -1
  39. package/src/llama.cpp/ggml/include/ggml-metal.h +0 -6
  40. package/src/llama.cpp/ggml/include/ggml.h +56 -2
  41. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +21 -14
  42. package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +210 -96
  43. package/src/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +57 -59
  44. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +6 -7
  45. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +25 -38
  46. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +4 -4
  47. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +4 -12
  48. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +379 -4
  49. package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +1 -0
  50. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +41 -37
  51. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +150 -28
  52. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +320 -73
  53. package/src/llama.cpp/include/llama.h +5 -6
  54. package/src/llama.cpp/src/llama-adapter.cpp +33 -0
  55. package/src/llama.cpp/src/llama-adapter.h +3 -0
  56. package/src/llama.cpp/src/llama-arch.cpp +28 -4
  57. package/src/llama.cpp/src/llama-arch.h +3 -0
  58. package/src/llama.cpp/src/llama-context.cpp +65 -57
  59. package/src/llama.cpp/src/llama-context.h +1 -1
  60. package/src/llama.cpp/src/llama-graph.cpp +57 -11
  61. package/src/llama.cpp/src/llama-graph.h +8 -0
  62. package/src/llama.cpp/src/llama-hparams.cpp +37 -0
  63. package/src/llama.cpp/src/llama-hparams.h +10 -3
  64. package/src/llama.cpp/src/llama-kv-cache.cpp +56 -38
  65. package/src/llama.cpp/src/llama-kv-cache.h +9 -0
  66. package/src/llama.cpp/src/llama-model.cpp +217 -97
  67. package/src/llama.cpp/src/llama-model.h +0 -1
  68. package/src/llama.cpp/src/llama-quant.cpp +3 -3
  69. package/src/llama.cpp/src/llama-sampling.cpp +226 -126
  70. package/src/llama.cpp/src/llama.cpp +53 -10
  71. package/src/anyascii.c +0 -22223
  72. package/src/anyascii.h +0 -42
  73. package/src/tts_utils.cpp +0 -371
  74. package/src/tts_utils.h +0 -103
@@ -119,45 +119,149 @@ inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * GG
119
119
  }
120
120
 
121
121
  #if defined(GGML_SIMD)
122
- #if defined(__riscv_v_intrinsic)
123
- // todo: RVV impl
124
- for (int i = 0; i < n; ++i) {
125
- for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) {
126
- sumf[j] += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[j][i])*GGML_CPU_FP16_TO_FP32(y[i]));
122
+ #if defined(__ARM_FEATURE_SVE)
123
+
124
+ const int sve_register_length = svcntb() * 8;
125
+ const int ggml_f16_epr = sve_register_length / 16; // fp16 elements per SVE register (16 bits each)
126
+ const int ggml_f16_step = 8 * ggml_f16_epr; // choose 8 SVE registers
127
+
128
+ const int np = (n & ~(ggml_f16_step - 1));
129
+
130
+ svfloat16_t sum_00 = svdup_n_f16(0.0f);
131
+ svfloat16_t sum_01 = svdup_n_f16(0.0f);
132
+ svfloat16_t sum_02 = svdup_n_f16(0.0f);
133
+ svfloat16_t sum_03 = svdup_n_f16(0.0f);
134
+
135
+ svfloat16_t sum_10 = svdup_n_f16(0.0f);
136
+ svfloat16_t sum_11 = svdup_n_f16(0.0f);
137
+ svfloat16_t sum_12 = svdup_n_f16(0.0f);
138
+ svfloat16_t sum_13 = svdup_n_f16(0.0f);
139
+
140
+ svfloat16_t ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8;
141
+ svfloat16_t ay1, ay2, ay3, ay4, ay5, ay6, ay7, ay8;
142
+
143
+ for (int i = 0; i < np; i += ggml_f16_step) {
144
+ ay1 = GGML_F16x_VEC_LOAD(y + i + 0 * ggml_f16_epr, 0); // 8 elements
145
+
146
+ ax1 = GGML_F16x_VEC_LOAD(x[0] + i + 0*ggml_f16_epr, 0); // 8 elements
147
+ sum_00 = GGML_F16x_VEC_FMA(sum_00, ax1, ay1); // sum_00 = sum_00+ax1*ay1
148
+ ax1 = GGML_F16x_VEC_LOAD(x[1] + i + 0*ggml_f16_epr, 0); // 8 elements
149
+ sum_10 = GGML_F16x_VEC_FMA(sum_10, ax1, ay1);
150
+
151
+ ay2 = GGML_F16x_VEC_LOAD(y + i + 1 * ggml_f16_epr, 1); // next 8 elements
152
+
153
+ ax2 = GGML_F16x_VEC_LOAD(x[0] + i + 1*ggml_f16_epr, 1); // next 8 elements
154
+ sum_01 = GGML_F16x_VEC_FMA(sum_01, ax2, ay2);
155
+ ax2 = GGML_F16x_VEC_LOAD(x[1] + i + 1*ggml_f16_epr, 1);
156
+ sum_11 = GGML_F16x_VEC_FMA(sum_11, ax2, ay2);
157
+
158
+ ay3 = GGML_F16x_VEC_LOAD(y + i + 2 * ggml_f16_epr, 2);
159
+
160
+ ax3 = GGML_F16x_VEC_LOAD(x[0] + i + 2*ggml_f16_epr, 2);
161
+ sum_02 = GGML_F16x_VEC_FMA(sum_02, ax3, ay3);
162
+ ax3 = GGML_F16x_VEC_LOAD(x[1] + i + 2*ggml_f16_epr, 2);
163
+ sum_12 = GGML_F16x_VEC_FMA(sum_12, ax3, ay3);
164
+
165
+ ay4 = GGML_F16x_VEC_LOAD(y + i + 3 * ggml_f16_epr, 3);
166
+
167
+ ax4 = GGML_F16x_VEC_LOAD(x[0] + i + 3*ggml_f16_epr, 3);
168
+ sum_03 = GGML_F16x_VEC_FMA(sum_03, ax4, ay4);
169
+ ax4 = GGML_F16x_VEC_LOAD(x[1] + i + 3*ggml_f16_epr, 3);
170
+ sum_13 = GGML_F16x_VEC_FMA(sum_13, ax4, ay4);
171
+
172
+ ay5 = GGML_F16x_VEC_LOAD(y + i + 4 * ggml_f16_epr, 4);
173
+
174
+ ax5 = GGML_F16x_VEC_LOAD(x[0] + i + 4*ggml_f16_epr, 4);
175
+
176
+ sum_00 = GGML_F16x_VEC_FMA(sum_00, ax5, ay5);
177
+ ax5 = GGML_F16x_VEC_LOAD(x[1] + i + 4*ggml_f16_epr, 4);
178
+ sum_10 = GGML_F16x_VEC_FMA(sum_10, ax5, ay5);
179
+
180
+ ay6 = GGML_F16x_VEC_LOAD(y + i + 5 * ggml_f16_epr, 5);
181
+
182
+ ax6 = GGML_F16x_VEC_LOAD(x[0] + i + 5*ggml_f16_epr, 5);
183
+
184
+ sum_01 = GGML_F16x_VEC_FMA(sum_01, ax6, ay6);
185
+ ax6 = GGML_F16x_VEC_LOAD(x[1] + i + 5*ggml_f16_epr, 5);
186
+ sum_11 = GGML_F16x_VEC_FMA(sum_11, ax6, ay6);
187
+
188
+ ay7 = GGML_F16x_VEC_LOAD(y + i + 6 * ggml_f16_epr, 6);
189
+
190
+ ax7 = GGML_F16x_VEC_LOAD(x[0] + i + 6*ggml_f16_epr, 6);
191
+
192
+ sum_02 = GGML_F16x_VEC_FMA(sum_02, ax7, ay7);
193
+ ax7 = GGML_F16x_VEC_LOAD(x[1] + i + 6*ggml_f16_epr, 6);
194
+ sum_12 = GGML_F16x_VEC_FMA(sum_12, ax7, ay7);
195
+
196
+ ay8 = GGML_F16x_VEC_LOAD(y + i + 7 * ggml_f16_epr, 7);
197
+
198
+ ax8 = GGML_F16x_VEC_LOAD(x[0] + i + 7*ggml_f16_epr, 7);
199
+
200
+ sum_03 = GGML_F16x_VEC_FMA(sum_03, ax8, ay8);
201
+ ax8 = GGML_F16x_VEC_LOAD(x[1] + i + 7*ggml_f16_epr, 7);
202
+ sum_13 = GGML_F16x_VEC_FMA(sum_13, ax8, ay8);
127
203
  }
128
- }
129
- #else
130
- const int np = (n & ~(GGML_F16_STEP - 1));
131
204
 
132
- GGML_F16_VEC sum[GGML_VEC_DOT_UNROLL][GGML_F16_ARR] = { { GGML_F16_VEC_ZERO } };
205
+ const int np2 = (n & ~(ggml_f16_epr - 1));
206
+ for (int k = np; k < np2; k += ggml_f16_epr) {
207
+ svfloat16_t ry = GGML_F16x_VEC_LOAD(y + k, 0);
133
208
 
134
- GGML_F16_VEC ax[GGML_F16_ARR];
135
- GGML_F16_VEC ay[GGML_F16_ARR];
209
+ svfloat16_t rx = GGML_F16x_VEC_LOAD(x[0] + k, 0);
210
+ sum_00 = GGML_F16x_VEC_FMA(sum_00, rx, ry);
211
+ rx = GGML_F16x_VEC_LOAD(x[1] + k, 0);
212
+ sum_10 = GGML_F16x_VEC_FMA(sum_10, rx, ry);
213
+ }
136
214
 
137
- for (int i = 0; i < np; i += GGML_F16_STEP) {
138
- for (int j = 0; j < GGML_F16_ARR; j++) {
139
- ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
215
+ if (np2 < n) {
216
+ svbool_t pg = svwhilelt_b16(np2, n);
217
+ svfloat16_t hx_0 = svld1_f16(pg, (const __fp16 *)(x[0] + np2));
218
+ svfloat16_t hx_1 = svld1_f16(pg, (const __fp16 *)(x[1] + np2));
219
+ svfloat16_t hy = svld1_f16(pg, (const __fp16 *)(y + np2));
140
220
 
141
- for (int k = 0; k < GGML_VEC_DOT_UNROLL; ++k) {
142
- ax[j] = GGML_F16_VEC_LOAD(x[k] + i + j*GGML_F16_EPR, j);
221
+ sum_00 = svmad_f16_x(pg, hx_0, hy, sum_00);
222
+ sum_10 = svmad_f16_x(pg, hx_1, hy, sum_10);
223
+ }
224
+ GGML_F16x_VEC_REDUCE(sumf[0], sum_00, sum_01, sum_02, sum_03);
225
+ GGML_F16x_VEC_REDUCE(sumf[1], sum_10, sum_11, sum_12, sum_13);
226
+ #elif defined(__riscv_v_intrinsic)
227
+ // todo: RVV impl
228
+ for (int i = 0; i < n; ++i) {
229
+ for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) {
230
+ sumf[j] += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[j][i])*GGML_CPU_FP16_TO_FP32(y[i]));
231
+ }
232
+ }
233
+ #else
234
+ const int np = (n & ~(GGML_F16_STEP - 1));
235
+
236
+ GGML_F16_VEC sum[GGML_VEC_DOT_UNROLL][GGML_F16_ARR] = { { GGML_F16_VEC_ZERO } };
143
237
 
144
- sum[k][j] = GGML_F16_VEC_FMA(sum[k][j], ax[j], ay[j]);
238
+ GGML_F16_VEC ax[GGML_F16_ARR];
239
+ GGML_F16_VEC ay[GGML_F16_ARR];
240
+
241
+ for (int i = 0; i < np; i += GGML_F16_STEP) {
242
+ for (int j = 0; j < GGML_F16_ARR; j++) {
243
+ ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
244
+
245
+ for (int k = 0; k < GGML_VEC_DOT_UNROLL; ++k) {
246
+ ax[j] = GGML_F16_VEC_LOAD(x[k] + i + j*GGML_F16_EPR, j);
247
+
248
+ sum[k][j] = GGML_F16_VEC_FMA(sum[k][j], ax[j], ay[j]);
249
+ }
145
250
  }
146
251
  }
147
- }
148
252
 
149
- // reduce sum0..sum3 to sum0
150
- for (int k = 0; k < GGML_VEC_DOT_UNROLL; ++k) {
151
- GGML_F16_VEC_REDUCE(sumf[k], sum[k]);
152
- }
253
+ // reduce sum0..sum3 to sum0
254
+ for (int k = 0; k < GGML_VEC_DOT_UNROLL; ++k) {
255
+ GGML_F16_VEC_REDUCE(sumf[k], sum[k]);
256
+ }
153
257
 
154
- // leftovers
155
- for (int i = np; i < n; ++i) {
156
- for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) {
157
- sumf[j] += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[j][i])*GGML_CPU_FP16_TO_FP32(y[i]));
258
+ // leftovers
259
+ for (int i = np; i < n; ++i) {
260
+ for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) {
261
+ sumf[j] += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[j][i])*GGML_CPU_FP16_TO_FP32(y[i]));
262
+ }
158
263
  }
159
- }
160
- #endif
264
+ #endif
161
265
  #else
162
266
  for (int i = 0; i < n; ++i) {
163
267
  for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) {
@@ -293,35 +397,112 @@ inline static void ggml_vec_mad_f32(const int n, float * GGML_RESTRICT y, const
293
397
 
294
398
  inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * GGML_RESTRICT y, const ggml_fp16_t * GGML_RESTRICT x, const float v) {
295
399
  #if defined(GGML_SIMD)
296
- #if defined(__riscv_v_intrinsic)
297
- // todo: RVV impl
298
- // scalar
299
- for (int i = 0; i < n; ++i) {
300
- y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i]) + GGML_CPU_FP16_TO_FP32(x[i])*v);
301
- }
302
- #else
303
- const int np = (n & ~(GGML_F16_STEP - 1));
400
+ #if defined(__ARM_FEATURE_SVE)
401
+ const int sve_register_length = svcntb() * 8;
402
+ const int ggml_f16_epr = sve_register_length / 16;
403
+ const int ggml_f16_step = 8 * ggml_f16_epr;
404
+
405
+ GGML_F16x_VEC vx = GGML_F16x_VEC_SET1(v);
406
+
407
+ const int np= (n & ~(ggml_f16_step - 1));
408
+
409
+ svfloat16_t ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8;
410
+ svfloat16_t ay1, ay2, ay3, ay4, ay5, ay6, ay7, ay8;
411
+ for (int i = 0; i < np; i += ggml_f16_step) {
412
+ ax1 = GGML_F16x_VEC_LOAD(x + i + 0 * ggml_f16_epr, 0);
413
+ ay1 = GGML_F16x_VEC_LOAD(y + i + 0 * ggml_f16_epr, 0);
414
+ ay1 = GGML_F16x_VEC_FMA(ay1, ax1, vx);
415
+
416
+ GGML_F16x_VEC_STORE(y + i + 0 * ggml_f16_epr, ay1, 0);
417
+
418
+ ax2 = GGML_F16x_VEC_LOAD(x + i + 1 * ggml_f16_epr, 1);
419
+ ay2 = GGML_F16x_VEC_LOAD(y + i + 1 * ggml_f16_epr, 1);
420
+ ay2 = GGML_F16x_VEC_FMA(ay2, ax2, vx);
421
+
422
+ GGML_F16x_VEC_STORE(y + i + 1 * ggml_f16_epr, ay2, 1);
423
+
424
+ ax3 = GGML_F16x_VEC_LOAD(x + i + 2 * ggml_f16_epr, 2);
425
+ ay3 = GGML_F16x_VEC_LOAD(y + i + 2 * ggml_f16_epr, 2);
426
+ ay3 = GGML_F16x_VEC_FMA(ay3, ax3, vx);
304
427
 
305
- GGML_F16_VEC vx = GGML_F16_VEC_SET1(v);
428
+ GGML_F16x_VEC_STORE(y + i + 2 * ggml_f16_epr, ay3, 2);
306
429
 
307
- GGML_F16_VEC ax[GGML_F16_ARR];
308
- GGML_F16_VEC ay[GGML_F16_ARR];
430
+ ax4 = GGML_F16x_VEC_LOAD(x + i + 3 * ggml_f16_epr, 3);
431
+ ay4 = GGML_F16x_VEC_LOAD(y + i + 3 * ggml_f16_epr, 3);
432
+ ay4 = GGML_F16x_VEC_FMA(ay4, ax4, vx);
309
433
 
310
- for (int i = 0; i < np; i += GGML_F16_STEP) {
311
- for (int j = 0; j < GGML_F16_ARR; j++) {
312
- ax[j] = GGML_F16_VEC_LOAD(x + i + j*GGML_F16_EPR, j);
313
- ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
314
- ay[j] = GGML_F16_VEC_FMA(ay[j], ax[j], vx);
434
+ GGML_F16x_VEC_STORE(y + i + 3 * ggml_f16_epr, ay4, 3);
315
435
 
316
- GGML_F16_VEC_STORE(y + i + j*GGML_F16_EPR, ay, j);
436
+ ax5 = GGML_F16x_VEC_LOAD(x + i + 4 * ggml_f16_epr, 4);
437
+ ay5 = GGML_F16x_VEC_LOAD(y + i + 4 * ggml_f16_epr, 4);
438
+ ay5 = GGML_F16x_VEC_FMA(ay5, ax5, vx);
439
+
440
+ GGML_F16x_VEC_STORE(y + i + 4 * ggml_f16_epr, ay5, 4);
441
+
442
+ ax6 = GGML_F16x_VEC_LOAD(x + i + 5 * ggml_f16_epr, 5);
443
+ ay6 = GGML_F16x_VEC_LOAD(y + i + 5 * ggml_f16_epr, 5);
444
+ ay6 = GGML_F16x_VEC_FMA(ay6, ax6, vx);
445
+
446
+ GGML_F16x_VEC_STORE(y + i + 5 * ggml_f16_epr, ay6, 5);
447
+
448
+ ax7 = GGML_F16x_VEC_LOAD(x + i + 6 * ggml_f16_epr, 6);
449
+ ay7 = GGML_F16x_VEC_LOAD(y + i + 6 * ggml_f16_epr, 6);
450
+ ay7 = GGML_F16x_VEC_FMA(ay7, ax7, vx);
451
+
452
+ GGML_F16x_VEC_STORE(y + i + 6 * ggml_f16_epr, ay7, 6);
453
+
454
+ ax8 = GGML_F16x_VEC_LOAD(x + i + 7 * ggml_f16_epr, 7);
455
+ ay8 = GGML_F16x_VEC_LOAD(y + i + 7 * ggml_f16_epr, 7);
456
+ ay8 = GGML_F16x_VEC_FMA(ay8, ax8, vx);
457
+
458
+ GGML_F16x_VEC_STORE(y + i + 7 * ggml_f16_epr, ay8, 7);
317
459
  }
318
- }
460
+ const int np2 = (n & ~(ggml_f16_epr - 1));
461
+ for (int k = np; k < np2; k += ggml_f16_epr) {
462
+ svfloat16_t rx = GGML_F16x_VEC_LOAD(x + k, 0);
463
+ svfloat16_t ry = GGML_F16x_VEC_LOAD(y + k, 0);
464
+ ry = GGML_F16x_VEC_FMA(ry, rx, vx);
319
465
 
320
- // leftovers
321
- for (int i = np; i < n; ++i) {
322
- y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i]) + GGML_CPU_FP16_TO_FP32(x[i])*v);
323
- }
324
- #endif
466
+ GGML_F16x_VEC_STORE(y + k, ry, 0);
467
+ }
468
+
469
+ if (np2 < n) {
470
+ svbool_t pg = svwhilelt_b16(np2, n);
471
+ svfloat16_t hx = svld1_f16(pg, (const __fp16 *)(x + np2));
472
+ svfloat16_t hy = svld1_f16(pg, (const __fp16 *)(y + np2));
473
+ hy = svmad_f16_x(pg, hx, vx, hy);
474
+ svst1_f16(pg, (__fp16 *)(y + np2), hy);
475
+ }
476
+
477
+ #elif defined(__riscv_v_intrinsic)
478
+ // todo: RVV impl
479
+ // scalar
480
+ for (int i = 0; i < n; ++i) {
481
+ y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i]) + GGML_CPU_FP16_TO_FP32(x[i])*v);
482
+ }
483
+ #else
484
+ const int np = (n & ~(GGML_F16_STEP - 1));
485
+
486
+ GGML_F16_VEC vx = GGML_F16_VEC_SET1(v);
487
+
488
+ GGML_F16_VEC ax[GGML_F16_ARR];
489
+ GGML_F16_VEC ay[GGML_F16_ARR];
490
+
491
+ for (int i = 0; i < np; i += GGML_F16_STEP) {
492
+ for (int j = 0; j < GGML_F16_ARR; j++) {
493
+ ax[j] = GGML_F16_VEC_LOAD(x + i + j*GGML_F16_EPR, j);
494
+ ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
495
+ ay[j] = GGML_F16_VEC_FMA(ay[j], ax[j], vx);
496
+
497
+ GGML_F16_VEC_STORE(y + i + j*GGML_F16_EPR, ay, j);
498
+ }
499
+ }
500
+
501
+ // leftovers
502
+ for (int i = np; i < n; ++i) {
503
+ y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i]) + GGML_CPU_FP16_TO_FP32(x[i])*v);
504
+ }
505
+ #endif
325
506
  #else
326
507
  // scalar
327
508
  for (int i = 0; i < n; ++i) {
@@ -517,33 +698,59 @@ inline static void ggml_vec_scale_f32(const int n, float * y, const float v) {
517
698
 
518
699
  inline static void ggml_vec_scale_f16(const int n, ggml_fp16_t * y, const float v) {
519
700
  #if defined(GGML_SIMD)
520
- #if defined(__riscv_v_intrinsic)
521
- // todo: RVV impl
522
- // scalar
523
- for (int i = 0; i < n; ++i) {
524
- y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i])*v);
525
- }
526
- #else
527
- const int np = (n & ~(GGML_F16_STEP - 1));
701
+ #if defined(__ARM_FEATURE_SVE)
702
+ const int sve_register_length = svcntb() * 8;
703
+ const int ggml_f16_epr = sve_register_length / 16;
704
+ const int ggml_f16_step = 2 * ggml_f16_epr;
705
+
706
+ GGML_F16x_VEC vx = GGML_F16x_VEC_SET1(v);
707
+ const int np = (n & ~(ggml_f16_step - 1));
708
+ svfloat16_t ay1, ay2;
709
+
710
+ for (int i = 0; i < np; i += ggml_f16_step) {
711
+ ay1 = GGML_F16x_VEC_LOAD(y + i + 0*ggml_f16_epr, 0);
712
+ ay1 = GGML_F16x_VEC_MUL(ay1, vx);
713
+ GGML_F16x_VEC_STORE(y + i + 0*ggml_f16_epr, ay1, 0);
714
+
715
+ ay2 = GGML_F16x_VEC_LOAD(y + i + 1*ggml_f16_epr, 1);
716
+ ay2 = GGML_F16x_VEC_MUL(ay2, vx);
717
+ GGML_F16x_VEC_STORE(y + i + 1*ggml_f16_epr, ay2, 1);
718
+ }
719
+ // leftovers
720
+ // maximum number of leftover elements will be less than ggml_f16_epr. Apply predicated svmul on available elements only
721
+ if (np < n) {
722
+ svbool_t pg = svwhilelt_b16(np, n);
723
+ svfloat16_t hy = svld1_f16(pg, (__fp16 *)(y + np));
724
+ svfloat16_t out = svmul_f16_m(pg, hy, vx);
725
+ svst1_f16(pg, (__fp16 *)(y + np), out);
726
+ }
727
+ #elif defined(__riscv_v_intrinsic)
728
+ // todo: RVV impl
729
+ // scalar
730
+ for (int i = 0; i < n; ++i) {
731
+ y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i])*v);
732
+ }
733
+ #else
734
+ const int np = (n & ~(GGML_F16_STEP - 1));
528
735
 
529
- GGML_F16_VEC vx = GGML_F16_VEC_SET1(v);
736
+ GGML_F16_VEC vx = GGML_F16_VEC_SET1(v);
530
737
 
531
- GGML_F16_VEC ay[GGML_F16_ARR];
738
+ GGML_F16_VEC ay[GGML_F16_ARR];
532
739
 
533
- for (int i = 0; i < np; i += GGML_F16_STEP) {
534
- for (int j = 0; j < GGML_F16_ARR; j++) {
535
- ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
536
- ay[j] = GGML_F16_VEC_MUL(ay[j], vx);
740
+ for (int i = 0; i < np; i += GGML_F16_STEP) {
741
+ for (int j = 0; j < GGML_F16_ARR; j++) {
742
+ ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
743
+ ay[j] = GGML_F16_VEC_MUL(ay[j], vx);
537
744
 
538
- GGML_F16_VEC_STORE(y + i + j*GGML_F16_EPR, ay, j);
745
+ GGML_F16_VEC_STORE(y + i + j*GGML_F16_EPR, ay, j);
746
+ }
539
747
  }
540
- }
541
748
 
542
- // leftovers
543
- for (int i = np; i < n; ++i) {
544
- y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i])*v);
545
- }
546
- #endif
749
+ // leftovers
750
+ for (int i = np; i < n; ++i) {
751
+ y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i])*v);
752
+ }
753
+ #endif
547
754
  #else
548
755
  // scalar
549
756
  for (int i = 0; i < n; ++i) {
@@ -795,7 +1002,39 @@ https://github.com/openvinotoolkit/openvino/blob/master/src/plugins/intel_cpu/sr
795
1002
  }
796
1003
  #endif
797
1004
 
798
- #if defined(__ARM_NEON) && defined(__aarch64__)
1005
+ #if defined(__ARM_FEATURE_SVE) && defined(__aarch64__)
1006
+
1007
+ inline static svfloat32_t ggml_v_expf(svbool_t pg, svfloat32_t x) {
1008
+ const svfloat32_t r = svdup_n_f32_x(pg, 0x1.8p23f);
1009
+ const svfloat32_t z = svmla_n_f32_x(pg, r, x, 0x1.715476p+0f);
1010
+ const svfloat32_t n = svsub_f32_x(pg, z, r);
1011
+ const svfloat32_t b = svmls_n_f32_x(pg, svmls_n_f32_x(pg, x, n, 0x1.62e4p-1f), n, 0x1.7f7d1cp-20f);
1012
+ const svuint32_t e = svlsl_n_u32_x(pg, svreinterpret_u32_f32(z), 23);
1013
+ const svfloat32_t k = svreinterpret_f32_u32(svadd_u32_x(pg, e, svreinterpret_u32_f32(svdup_n_f32_x(pg, 1))));
1014
+ const svbool_t c = svacgt_n_f32(pg, n, 126);
1015
+ const svfloat32_t u = svmul_f32_x(pg, b, b);
1016
+ const svfloat32_t j = svmla_f32_x(pg,
1017
+ svmul_n_f32_x(pg, b, 0x1.ffffecp-1f),
1018
+ svmla_f32_x(pg, svmla_f32_x(pg, svdup_n_f32_x(pg, 0x1.fffdb6p-2f), svdup_n_f32_x(pg, 0x1.555e66p-3f), b),
1019
+ svmla_f32_x(pg, svdup_n_f32_x(pg, 0x1.573e2ep-5f), svdup_n_f32_x(pg, 0x1.0e4020p-7f), b), u), u);
1020
+ const svuint32_t d = svdup_n_u32_z(svcmple_n_f32(pg, n, 0.0), 0x82000000);
1021
+ const svfloat32_t s1 = svreinterpret_f32_u32(svadd_n_u32_x(pg, d, 0x7f000000));
1022
+ const svfloat32_t s2 = svreinterpret_f32_u32(svsub_u32_x(pg, e, d));
1023
+ return svsel_f32(svacgt_f32(pg, n, svdup_n_f32_x(pg, 192)), svmul_f32_x(pg, s1, s1),
1024
+ svsel_f32(c, svmul_f32_x(pg, svmla_f32_x(pg, s2, s2, j), s1), svmla_f32_x(pg, k, k, j)));
1025
+ }
1026
+
1027
+ // computes silu x/(1+exp(-x)) in single precision vector
1028
+ inline static svfloat32_t ggml_v_silu(svbool_t pg, svfloat32_t x) {
1029
+ const svfloat32_t one = svdup_n_f32_x(pg, 1.0f);
1030
+ const svfloat32_t zero = svdup_n_f32_x(pg, 0.0f);
1031
+ const svfloat32_t neg_x = svsub_f32_x(pg, zero, x);
1032
+ const svfloat32_t exp_neg_x = ggml_v_expf(pg, neg_x);
1033
+ const svfloat32_t one_plus_exp_neg_x = svadd_f32_x(pg, one, exp_neg_x);
1034
+ return svdiv_f32_x(pg, x, one_plus_exp_neg_x);
1035
+ }
1036
+
1037
+ #elif defined(__ARM_NEON) && defined(__aarch64__)
799
1038
 
800
1039
  // adapted from arm limited optimized routine
801
1040
  // the maximum error is 1.45358 plus 0.5 ulps
@@ -1030,6 +1269,14 @@ inline static vfloat32m2_t ggml_v_expf_m2(vfloat32m2_t x, int vl) {
1030
1269
  vl);
1031
1270
  }
1032
1271
 
1272
+ // computes silu x/(1+exp(-x)) in single precision vector
1273
+ inline static vfloat32m2_t ggml_v_silu_m2(vfloat32m2_t x, int vl) {
1274
+ const vfloat32m2_t neg_x = __riscv_vfneg_v_f32m2(x, vl);
1275
+ const vfloat32m2_t exp_neg_x = ggml_v_expf_m2(neg_x, vl);
1276
+ const vfloat32m2_t one_plus_exp_neg_x = __riscv_vfadd_vf_f32m2(exp_neg_x, 1.0f, vl);
1277
+ return __riscv_vfdiv_vv_f32m2(x, one_plus_exp_neg_x, vl);
1278
+ }
1279
+
1033
1280
  #endif // __ARM_NEON / __AVX2__ / __SSE2__ / __riscv_v_intrinsic
1034
1281
 
1035
1282
  inline static void ggml_vec_silu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
@@ -206,7 +206,7 @@ extern "C" {
206
206
  llama_token_data * data;
207
207
  size_t size;
208
208
  int64_t selected; // this is the index in the data array (i.e. not the token id)
209
- bool sorted;
209
+ bool sorted; // note: do not assume the data is sorted - always check this flag
210
210
  } llama_token_data_array;
211
211
 
212
212
  typedef bool (*llama_progress_callback)(float progress, void * user_data);
@@ -583,6 +583,10 @@ extern "C" {
583
583
  // Note: loaded adapters will be free when the associated model is deleted
584
584
  LLAMA_API void llama_adapter_lora_free(struct llama_adapter_lora * adapter);
585
585
 
586
+ // Get the invocation tokens if the current lora is an alora
587
+ LLAMA_API uint64_t llama_adapter_get_alora_n_invocation_tokens(const struct llama_adapter_lora * adapter);
588
+ LLAMA_API const llama_token * llama_adapter_get_alora_invocation_tokens (const struct llama_adapter_lora * adapter);
589
+
586
590
  // The following functions operate on a llama_context, hence the naming: llama_verb_...
587
591
 
588
592
  // Add a loaded LoRA adapter to given context
@@ -1156,11 +1160,6 @@ extern "C" {
1156
1160
  LLAMA_API struct llama_sampler * llama_sampler_init_greedy(void);
1157
1161
  LLAMA_API struct llama_sampler * llama_sampler_init_dist (uint32_t seed);
1158
1162
 
1159
- /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
1160
- /// NOTE: Avoid using on the full vocabulary as the sorting can become slow. For example, apply top-k or top-p sampling first.
1161
- DEPRECATED(LLAMA_API struct llama_sampler * llama_sampler_init_softmax (void),
1162
- "will be removed in the future (see https://github.com/ggml-org/llama.cpp/pull/9896#discussion_r1800920915)");
1163
-
1164
1163
  /// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
1165
1164
  /// Setting k <= 0 makes this a noop
1166
1165
  LLAMA_API struct llama_sampler * llama_sampler_init_top_k (int32_t k);
@@ -6,6 +6,7 @@
6
6
 
7
7
  #include <map>
8
8
  #include <cassert>
9
+ #include <sstream>
9
10
  #include <stdexcept>
10
11
 
11
12
  // vec
@@ -215,6 +216,26 @@ static void llama_adapter_lora_init_impl(llama_model & model, const char * path_
215
216
  }
216
217
 
217
218
  adapter.alpha = get_kv_f32(llm_kv(LLM_KV_ADAPTER_LORA_ALPHA));
219
+
220
+ // parse alora invocation sequence vector
221
+ const auto & key = llm_kv(LLM_KV_ADAPTER_ALORA_INVOCATION_TOKENS);
222
+ const int kid = gguf_find_key(ctx_gguf.get(), key.c_str());
223
+ if (kid >= 0) {
224
+ if (gguf_get_kv_type(ctx_gguf.get(), kid) != GGUF_TYPE_ARRAY) {
225
+ throw std::runtime_error("invalid gguf type for " + key);
226
+ }
227
+ const auto arr_type = gguf_get_arr_type(ctx_gguf.get(), kid);
228
+ if (arr_type != GGUF_TYPE_UINT32) {
229
+ throw std::runtime_error("invalid gguf element type for " + key);
230
+ }
231
+ const size_t seq_len = gguf_get_arr_n(ctx_gguf.get(), kid);
232
+ const void * data = gguf_get_arr_data(ctx_gguf.get(), kid);
233
+ adapter.alora_invocation_tokens.resize(seq_len);
234
+ std::copy(
235
+ (const llama_token *)data,
236
+ (const llama_token *)data + seq_len,
237
+ adapter.alora_invocation_tokens.begin());
238
+ }
218
239
  }
219
240
 
220
241
  int n_tensors = gguf_get_n_tensors(ctx_gguf.get());
@@ -450,3 +471,15 @@ int32_t llama_adapter_meta_val_str_by_index(const llama_adapter_lora * adapter,
450
471
  void llama_adapter_lora_free(llama_adapter_lora * adapter) {
451
472
  delete adapter;
452
473
  }
474
+
475
+ uint64_t llama_adapter_get_alora_n_invocation_tokens(const struct llama_adapter_lora * adapter) {
476
+ if (!adapter) {
477
+ return 0;
478
+ }
479
+ return adapter->alora_invocation_tokens.size();
480
+ }
481
+
482
+ const llama_token * llama_adapter_get_alora_invocation_tokens(const llama_adapter_lora * adapter) {
483
+ GGML_ASSERT(adapter);
484
+ return adapter->alora_invocation_tokens.data();
485
+ }
@@ -70,6 +70,9 @@ struct llama_adapter_lora {
70
70
  // gguf metadata
71
71
  std::unordered_map<std::string, std::string> gguf_kv;
72
72
 
73
+ // activated lora (aLoRA)
74
+ std::vector<llama_token> alora_invocation_tokens;
75
+
73
76
  llama_adapter_lora() = default;
74
77
  ~llama_adapter_lora() = default;
75
78
 
@@ -45,6 +45,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
45
45
  { LLM_ARCH_GEMMA2, "gemma2" },
46
46
  { LLM_ARCH_GEMMA3, "gemma3" },
47
47
  { LLM_ARCH_GEMMA3N, "gemma3n" },
48
+ { LLM_ARCH_GEMMA_EMBEDDING, "gemma-embedding" },
48
49
  { LLM_ARCH_STARCODER2, "starcoder2" },
49
50
  { LLM_ARCH_MAMBA, "mamba" },
50
51
  { LLM_ARCH_MAMBA2, "mamba2" },
@@ -136,6 +137,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
136
137
  { LLM_KV_POOLING_TYPE, "%s.pooling_type" },
137
138
  { LLM_KV_LOGIT_SCALE, "%s.logit_scale" },
138
139
  { LLM_KV_DECODER_START_TOKEN_ID, "%s.decoder_start_token_id" },
140
+ { LLM_KV_DECODER_BLOCK_COUNT, "%s.decoder_block_count" },
139
141
  { LLM_KV_ATTN_LOGIT_SOFTCAPPING, "%s.attn_logit_softcapping" },
140
142
  { LLM_KV_FINAL_LOGIT_SOFTCAPPING, "%s.final_logit_softcapping" },
141
143
  { LLM_KV_SWIN_NORM, "%s.swin_norm" },
@@ -236,10 +238,11 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
236
238
  { LLM_KV_TOKENIZER_FIM_REP_ID, "tokenizer.ggml.fim_rep_token_id" },
237
239
  { LLM_KV_TOKENIZER_FIM_SEP_ID, "tokenizer.ggml.fim_sep_token_id" },
238
240
 
239
- { LLM_KV_ADAPTER_TYPE, "adapter.type" },
240
- { LLM_KV_ADAPTER_LORA_ALPHA, "adapter.lora.alpha" },
241
- { LLM_KV_ADAPTER_LORA_TASK_NAME, "adapter.lora.task_name" },
242
- { LLM_KV_ADAPTER_LORA_PROMPT_PREFIX, "adapter.lora.prompt_prefix" },
241
+ { LLM_KV_ADAPTER_TYPE, "adapter.type" },
242
+ { LLM_KV_ADAPTER_LORA_ALPHA, "adapter.lora.alpha" },
243
+ { LLM_KV_ADAPTER_LORA_TASK_NAME, "adapter.lora.task_name" },
244
+ { LLM_KV_ADAPTER_LORA_PROMPT_PREFIX, "adapter.lora.prompt_prefix" },
245
+ { LLM_KV_ADAPTER_ALORA_INVOCATION_TOKENS, "adapter.alora.invocation_tokens" },
243
246
 
244
247
  // deprecated
245
248
  { LLM_KV_TOKENIZER_PREFIX_ID, "tokenizer.ggml.prefix_token_id" },
@@ -1038,6 +1041,27 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
1038
1041
  { LLM_TENSOR_LAUREL_POST_NORM, "blk.%d.laurel_post_norm" },
1039
1042
  },
1040
1043
  },
1044
+ {
1045
+ LLM_ARCH_GEMMA_EMBEDDING,
1046
+ {
1047
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
1048
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
1049
+ { LLM_TENSOR_OUTPUT, "output" },
1050
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
1051
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
1052
+ { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
1053
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
1054
+ { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
1055
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
1056
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
1057
+ { LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" },
1058
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
1059
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
1060
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
1061
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
1062
+ { LLM_TENSOR_FFN_POST_NORM, "blk.%d.post_ffw_norm" },
1063
+ },
1064
+ },
1041
1065
  {
1042
1066
  LLM_ARCH_STARCODER2,
1043
1067
  {
@@ -49,6 +49,7 @@ enum llm_arch {
49
49
  LLM_ARCH_GEMMA2,
50
50
  LLM_ARCH_GEMMA3,
51
51
  LLM_ARCH_GEMMA3N,
52
+ LLM_ARCH_GEMMA_EMBEDDING,
52
53
  LLM_ARCH_STARCODER2,
53
54
  LLM_ARCH_MAMBA,
54
55
  LLM_ARCH_MAMBA2,
@@ -140,6 +141,7 @@ enum llm_kv {
140
141
  LLM_KV_POOLING_TYPE,
141
142
  LLM_KV_LOGIT_SCALE,
142
143
  LLM_KV_DECODER_START_TOKEN_ID,
144
+ LLM_KV_DECODER_BLOCK_COUNT,
143
145
  LLM_KV_ATTN_LOGIT_SOFTCAPPING,
144
146
  LLM_KV_FINAL_LOGIT_SOFTCAPPING,
145
147
  LLM_KV_SWIN_NORM,
@@ -234,6 +236,7 @@ enum llm_kv {
234
236
  LLM_KV_ADAPTER_LORA_ALPHA,
235
237
  LLM_KV_ADAPTER_LORA_TASK_NAME,
236
238
  LLM_KV_ADAPTER_LORA_PROMPT_PREFIX,
239
+ LLM_KV_ADAPTER_ALORA_INVOCATION_TOKENS,
237
240
 
238
241
  LLM_KV_POSNET_EMBEDDING_LENGTH,
239
242
  LLM_KV_POSNET_BLOCK_COUNT,