@fugood/llama.node 1.1.9 → 1.1.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48)
  1. package/lib/binding.ts +7 -1
  2. package/package.json +14 -14
  3. package/scripts/llama.cpp.patch +15 -5
  4. package/src/LlamaCompletionWorker.cpp +12 -3
  5. package/src/LlamaCompletionWorker.h +3 -1
  6. package/src/LlamaContext.cpp +20 -2
  7. package/src/llama.cpp/common/arg.cpp +29 -19
  8. package/src/llama.cpp/common/chat.cpp +153 -3
  9. package/src/llama.cpp/common/chat.h +1 -0
  10. package/src/llama.cpp/common/common.cpp +10 -3
  11. package/src/llama.cpp/common/common.h +4 -1
  12. package/src/llama.cpp/ggml/CMakeLists.txt +1 -1
  13. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +6 -4
  14. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +1 -1
  15. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +43 -6
  16. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +4 -1
  17. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +14 -9
  18. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +232 -123
  19. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +16 -12
  20. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +39 -14
  21. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +20 -1
  22. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +103 -1
  23. package/src/llama.cpp/include/llama.h +27 -1
  24. package/src/llama.cpp/src/llama-adapter.cpp +68 -4
  25. package/src/llama.cpp/src/llama-adapter.h +3 -0
  26. package/src/llama.cpp/src/llama-arch.cpp +46 -2
  27. package/src/llama.cpp/src/llama-arch.h +4 -0
  28. package/src/llama.cpp/src/llama-context.cpp +80 -39
  29. package/src/llama.cpp/src/llama-context.h +0 -4
  30. package/src/llama.cpp/src/llama-graph.cpp +20 -10
  31. package/src/llama.cpp/src/llama-graph.h +2 -1
  32. package/src/llama.cpp/src/llama-hparams.cpp +25 -0
  33. package/src/llama.cpp/src/llama-hparams.h +6 -0
  34. package/src/llama.cpp/src/llama-impl.h +2 -0
  35. package/src/llama.cpp/src/llama-kv-cache-iswa.cpp +24 -7
  36. package/src/llama.cpp/src/llama-kv-cache-iswa.h +4 -2
  37. package/src/llama.cpp/src/llama-kv-cache.cpp +67 -130
  38. package/src/llama.cpp/src/llama-kv-cache.h +16 -28
  39. package/src/llama.cpp/src/llama-memory-hybrid.cpp +29 -28
  40. package/src/llama.cpp/src/llama-memory-hybrid.h +18 -22
  41. package/src/llama.cpp/src/llama-memory-recurrent.cpp +7 -7
  42. package/src/llama.cpp/src/llama-memory-recurrent.h +7 -11
  43. package/src/llama.cpp/src/llama-memory.h +8 -0
  44. package/src/llama.cpp/src/llama-model-loader.cpp +1 -0
  45. package/src/llama.cpp/src/llama-model.cpp +302 -31
  46. package/src/llama.cpp/src/llama-model.h +1 -0
  47. package/src/llama.cpp/src/llama-vocab.cpp +1 -1
  48. package/src/llama.cpp/src/llama.cpp +12 -0
package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h

@@ -18,6 +18,10 @@
  #include <immintrin.h>
  #endif

+ #if defined(__riscv_v_intrinsic)
+ #include <riscv_vector.h>
+ #endif
+
  #ifdef __cplusplus
  extern "C" {
  #endif
@@ -94,24 +98,15 @@ extern "C" {
  }
  #elif defined(__riscv) && defined(__riscv_zfhmin)
  static inline float riscv_compute_fp16_to_fp32(ggml_fp16_t h) {
- float f;
- __asm__(
- "fmv.h.x %[f], %[h]\n\t"
- "fcvt.s.h %[f], %[f]"
- : [f] "=&f" (f)
- : [h] "r" (h)
- );
- return f;
+ _Float16 hf;
+ memcpy(&hf, &h, sizeof(ggml_fp16_t));
+ return hf;
  }

  static inline ggml_fp16_t riscv_compute_fp32_to_fp16(float f) {
  ggml_fp16_t res;
- __asm__(
- "fcvt.h.s %[f], %[f]\n\t"
- "fmv.x.h %[h], %[f]"
- : [h] "=&r" (res)
- : [f] "f" (f)
- );
+ _Float16 hf = (_Float16)f;
+ memcpy(&res, &hf, sizeof(ggml_fp16_t));
  return res;
  }

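Note: the Zfhmin conversion helpers drop hand-written inline assembly in favor of _Float16, so the compiler schedules the fmv/fcvt instructions itself. A minimal standalone sketch of the same round-trip, assuming a GCC/Clang RISC-V toolchain where _Float16 is available (the names fp16_bits, fp16_to_fp32, fp32_to_fp16 are illustrative, not from the package):

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    using fp16_bits = uint16_t;  // same storage as ggml_fp16_t

    static inline float fp16_to_fp32(fp16_bits h) {
        _Float16 hf;
        memcpy(&hf, &h, sizeof(h));   // reinterpret the stored 16 bits as _Float16
        return (float) hf;            // compiler emits fcvt.s.h under zfhmin
    }

    static inline fp16_bits fp32_to_fp16(float f) {
        _Float16 hf = (_Float16) f;   // compiler emits fcvt.h.s
        fp16_bits res;
        memcpy(&res, &hf, sizeof(res));
        return res;
    }

    int main() {
        const float x = 3.140625f;    // exactly representable in fp16
        const fp16_bits h = fp32_to_fp16(x);
        printf("%g -> 0x%04x -> %g\n", x, h, fp16_to_fp32(h));
    }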
@@ -1170,6 +1165,36 @@ static inline void __lzs_f16cx4_store(ggml_fp16_t * x, float32x4_t v_y) {
  #define GGML_F16_VEC_MUL GGML_F32x4_MUL
  #define GGML_F16_VEC_REDUCE GGML_F32x4_REDUCE

+ #elif defined(__riscv_v_intrinsic)
+
+ // compatible with vlen >= 128
+
+ #define GGML_SIMD
+
+ // F32
+
+ #define GGML_F32_STEP 16
+ #define GGML_F32_EPR 4
+
+ #define GGML_F32x4 vfloat32m1_t
+ #define GGML_F32x4_ZERO __riscv_vfmv_v_f_f32m1(0.0f, GGML_F32_EPR)
+ #define GGML_F32x4_SET1(x) __riscv_vfmv_v_f_f32m1(x, GGML_F32_EPR)
+ #define GGML_F32x4_LOAD(x) __riscv_vle32_v_f32m1(x, GGML_F32_EPR)
+ #define GGML_F32x4_STORE(b, v) __riscv_vse32_v_f32m1(b, v, GGML_F32_EPR)
+ #define GGML_F32x4_FMA(a, b, c) __riscv_vfmacc_vv_f32m1(a, b, c, GGML_F32_EPR)
+ #define GGML_F32x4_ADD(a, b) __riscv_vfadd_vv_f32m1(a, b, GGML_F32_EPR)
+ #define GGML_F32x4_MUL(a, b) __riscv_vfmul_vv_f32m1(a, b, GGML_F32_EPR)
+
+ #define GGML_F32_VEC GGML_F32x4
+ #define GGML_F32_VEC_ZERO GGML_F32x4_ZERO
+ #define GGML_F32_VEC_SET1 GGML_F32x4_SET1
+ #define GGML_F32_VEC_LOAD GGML_F32x4_LOAD
+ #define GGML_F32_VEC_STORE GGML_F32x4_STORE
+ #define GGML_F32_VEC_FMA GGML_F32x4_FMA
+ #define GGML_F32_VEC_ADD GGML_F32x4_ADD
+ #define GGML_F32_VEC_MUL GGML_F32x4_MUL
+ #define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE
+
  #endif

  // GGML_F32_ARR / GGML_F16_ARR
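Note: these macros plug RVV into ggml's generic fixed-stride SIMD loops: every operation runs at a fixed AVL of GGML_F32_EPR = 4 lanes, i.e. one vfloat32m1_t group on any core with VLEN >= 128. A sketch of how the generic loop pattern expands under them, assuming the definitions above are in scope (axpy_f32 is an illustrative name):

    // y[i] += v*x[i], in the shape ggml's generic SIMD loops take:
    // GGML_F32_STEP (16) floats per iteration as four 4-lane vectors.
    static void axpy_f32(int n, float * y, const float * x, float v) {
        const int np = (n & ~(GGML_F32_STEP - 1));  // largest multiple of 16
        GGML_F32_VEC vx = GGML_F32_VEC_SET1(v);
        for (int i = 0; i < np; i += GGML_F32_STEP) {
            for (int j = 0; j < GGML_F32_STEP/GGML_F32_EPR; j++) {
                GGML_F32_VEC ax = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR);
                GGML_F32_VEC ay = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
                ay = GGML_F32_VEC_FMA(ay, ax, vx);  // ay += ax*vx via vfmacc
                GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay);
            }
        }
        for (int i = np; i < n; ++i) {  // scalar tail
            y[i] += x[i]*v;
        }
    }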
package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp

@@ -84,6 +84,16 @@ void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const float * G
  }
  // reduce sum1,sum2 to sum1
  GGML_F32_VEC_REDUCE(sumf, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8);
+ #elif defined(__riscv_v_intrinsic)
+ vfloat32m1_t vsum = __riscv_vfmv_v_f_f32m1(0.0f, 1);
+ for (int i = 0, avl; i < n; i += avl) {
+ avl = __riscv_vsetvl_e32m8(n - i);
+ vfloat32m8_t ax = __riscv_vle32_v_f32m8(&x[i], avl);
+ vfloat32m8_t ay = __riscv_vle32_v_f32m8(&y[i], avl);
+ vfloat32m8_t prod = __riscv_vfmul_vv_f32m8(ax, ay, avl);
+ vsum = __riscv_vfredusum_vs_f32m8_f32m1(prod, vsum, avl);
+ }
+ sumf += __riscv_vfmv_f_s_f32m1_f32(vsum);
  #else
  const int np = (n & ~(GGML_F32_STEP - 1));

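Note: unlike the fixed-stride paths, the RVV kernels are vector-length agnostic: each pass asks vsetvl how many e32 elements the hardware grants at LMUL=8, and the final, shorter pass falls out of the same loop, so no scalar tail is needed. The same strip-mining idiom in isolation, built only from intrinsics used above (sum_f32 is an illustrative name):

    #include <riscv_vector.h>

    static float sum_f32(int n, const float * x) {
        // single-element f32 accumulator kept in a vector register
        vfloat32m1_t acc = __riscv_vfmv_v_f_f32m1(0.0f, 1);
        for (int i = 0, avl; i < n; i += avl) {
            avl = __riscv_vsetvl_e32m8(n - i);               // lanes granted this pass
            vfloat32m8_t ax = __riscv_vle32_v_f32m8(&x[i], avl);
            acc = __riscv_vfredusum_vs_f32m8_f32m1(ax, acc, avl);  // acc[0] += sum(ax)
        }
        return __riscv_vfmv_f_s_f32m1_f32(acc);
    }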
@@ -197,7 +207,7 @@ void ggml_vec_dot_f16(int n, float * GGML_RESTRICT s, size_t bs, ggml_fp16_t * G

  ggml_float sumf = 0.0;

- #if defined(GGML_SIMD)
+ #if defined(GGML_SIMD) && !defined(__riscv_v_intrinsic)
  const int np = (n & ~(GGML_F16_STEP - 1));

  GGML_F16_VEC sum[GGML_F16_ARR] = { GGML_F16_VEC_ZERO };
@@ -325,6 +335,15 @@ ggml_float ggml_vec_soft_max_f32(const int n, float * y, const float * x, float
  vst1q_f32(y + i, val);
  sum += (ggml_float)vaddvq_f32(val);
  }
+ #elif defined(__riscv_v_intrinsic)
+ vfloat64m1_t vsum = __riscv_vfmv_v_f_f64m1(0, 1);
+ for (int avl; i < n; i += avl) {
+ avl = __riscv_vsetvl_e32m2(n - i);
+ vfloat32m2_t val = ggml_v_expf_m2(__riscv_vfsub_vf_f32m2(__riscv_vle32_v_f32m2(&x[i], avl), max, avl), avl);
+ __riscv_vse32_v_f32m2(&y[i], val, avl);
+ vsum = __riscv_vfwredusum_vs_f32m2_f64m1(val, vsum, avl);
+ }
+ return (ggml_float)__riscv_vfmv_f_s_f64m1_f64(vsum);
  #endif
  for (; i < n; ++i) {
  float val = expf(x[i] - max);
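Note: the softmax kernel keeps its running sum in double precision; vfwredusum widens each f32 lane to f64 during the reduction, avoiding a separate conversion step. The idiom on its own, under the same assumptions as the sketch above (sum_f32_as_f64 is an illustrative name):

    #include <riscv_vector.h>

    static double sum_f32_as_f64(int n, const float * x) {
        vfloat64m1_t vsum = __riscv_vfmv_v_f_f64m1(0.0, 1);
        for (int i = 0, avl; i < n; i += avl) {
            avl = __riscv_vsetvl_e32m2(n - i);   // e32/m2 mirrors the kernel above
            vfloat32m2_t ax = __riscv_vle32_v_f32m2(&x[i], avl);
            vsum = __riscv_vfwredusum_vs_f32m2_f64m1(ax, vsum, avl);  // widen + reduce
        }
        return __riscv_vfmv_f_s_f64m1_f64(vsum);
    }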
package/src/llama.cpp/ggml/src/ggml-cpu/vec.h

@@ -119,6 +119,14 @@ inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * GG
  }

  #if defined(GGML_SIMD)
+ #if defined(__riscv_v_intrinsic)
+ // todo: RVV impl
+ for (int i = 0; i < n; ++i) {
+ for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) {
+ sumf[j] += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[j][i])*GGML_CPU_FP16_TO_FP32(y[i]));
+ }
+ }
+ #else
  const int np = (n & ~(GGML_F16_STEP - 1));

  GGML_F16_VEC sum[GGML_VEC_DOT_UNROLL][GGML_F16_ARR] = { { GGML_F16_VEC_ZERO } };
@@ -149,6 +157,7 @@ inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * GG
  sumf[j] += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[j][i])*GGML_CPU_FP16_TO_FP32(y[i]));
  }
  }
+ #endif
  #else
  for (int i = 0; i < n; ++i) {
  for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) {
@@ -243,6 +252,14 @@ inline static void ggml_vec_mad_f32(const int n, float * GGML_RESTRICT y, const

  svst1_f32(pg, y + np2, ay1);
  }
+ #elif defined(__riscv_v_intrinsic)
+ for (int i = 0, avl; i < n; i += avl) {
+ avl = __riscv_vsetvl_e32m8(n - i);
+ vfloat32m8_t ax = __riscv_vle32_v_f32m8(&x[i], avl);
+ vfloat32m8_t ay = __riscv_vle32_v_f32m8(&y[i], avl);
+ vfloat32m8_t ny = __riscv_vfmadd_vf_f32m8(ax, v, ay, avl);
+ __riscv_vse32_v_f32m8(&y[i], ny, avl);
+ }
  #else
  const int np = (n & ~(GGML_F32_STEP - 1));

@@ -276,6 +293,13 @@

  inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * GGML_RESTRICT y, const ggml_fp16_t * GGML_RESTRICT x, const float v) {
  #if defined(GGML_SIMD)
+ #if defined(__riscv_v_intrinsic)
+ // todo: RVV impl
+ // scalar
+ for (int i = 0; i < n; ++i) {
+ y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i]) + GGML_CPU_FP16_TO_FP32(x[i])*v);
+ }
+ #else
  const int np = (n & ~(GGML_F16_STEP - 1));

  GGML_F16_VEC vx = GGML_F16_VEC_SET1(v);
@@ -297,6 +321,7 @@ inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * GGML_RESTRICT y,
  for (int i = np; i < n; ++i) {
  y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i]) + GGML_CPU_FP16_TO_FP32(x[i])*v);
  }
+ #endif
  #else
  // scalar
  for (int i = 0; i < n; ++i) {
@@ -324,6 +349,16 @@ inline static void ggml_vec_mad_f32_unroll(const int n, const int xs, const int
  y[i] += x[k][i]*v[k][0];
  }
  }
+ #elif defined(__riscv_v_intrinsic)
+ for (int i = 0, avl; i < n; i += avl) {
+ avl = __riscv_vsetvl_e32m8(n - i);
+ vfloat32m8_t ay = __riscv_vle32_v_f32m8(&y[i], avl);
+ for (int k = 0; k < GGML_VEC_MAD_UNROLL; k++) {
+ vfloat32m8_t ax = __riscv_vle32_v_f32m8(&x[k][i], avl);
+ ay = __riscv_vfmadd_vf_f32m8(ax, v[k][0], ay, avl);
+ }
+ __riscv_vse32_v_f32m8(&y[i], ay, avl);
+ }
  #else
  const int np = (n & ~(GGML_F32_STEP - 1));

@@ -375,6 +410,14 @@ inline static void ggml_vec_mad1_f32(const int n, float * y, const float * x, co
  for (int i = 0; i < n; ++i) {
  y[i] = x[i]*s + b;
  }
+ #elif defined(__riscv_v_intrinsic)
+ for (int i = 0, avl; i < n; i += avl) {
+ avl = __riscv_vsetvl_e32m8(n - i);
+ vfloat32m8_t ax = __riscv_vle32_v_f32m8(&x[i], avl);
+ vfloat32m8_t vb = __riscv_vfmv_v_f_f32m8(b, avl);
+ vfloat32m8_t ny = __riscv_vfmadd_vf_f32m8(ax, s, vb, avl);
+ __riscv_vse32_v_f32m8(&y[i], ny, avl);
+ }
  #else
  const int np = (n & ~(GGML_F32_STEP - 1));

@@ -436,6 +479,13 @@ inline static void ggml_vec_scale_f32(const int n, float * y, const float v) {
  ay1 = svmul_f32_m(pg, ay1, vx);
  svst1_f32(pg, y + np, ay1);
  }
+ #elif defined(__riscv_v_intrinsic)
+ for (int i = 0, avl; i < n; i += avl) {
+ avl = __riscv_vsetvl_e32m8(n - i);
+ vfloat32m8_t ay = __riscv_vle32_v_f32m8(&y[i], avl);
+ vfloat32m8_t ny = __riscv_vfmul_vf_f32m8(ay, v, avl);
+ __riscv_vse32_v_f32m8(&y[i], ny, avl);
+ }
  #else
  const int np = (n & ~(GGML_F32_STEP - 1));

@@ -467,6 +517,13 @@ inline static void ggml_vec_scale_f32(const int n, float * y, const float v) {

  inline static void ggml_vec_scale_f16(const int n, ggml_fp16_t * y, const float v) {
  #if defined(GGML_SIMD)
+ #if defined(__riscv_v_intrinsic)
+ // todo: RVV impl
+ // scalar
+ for (int i = 0; i < n; ++i) {
+ y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i])*v);
+ }
+ #else
  const int np = (n & ~(GGML_F16_STEP - 1));

  GGML_F16_VEC vx = GGML_F16_VEC_SET1(v);
@@ -486,6 +543,7 @@ inline static void ggml_vec_scale_f16(const int n, ggml_fp16_t * y, const float
  for (int i = np; i < n; ++i) {
  y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i])*v);
  }
+ #endif
  #else
  // scalar
  for (int i = 0; i < n; ++i) {
@@ -928,7 +986,51 @@ inline static __m128 ggml_v_silu(__m128 x) {
  return _mm_div_ps(x, one_plus_exp_neg_x);
  }

- #endif // __ARM_NEON / __AVX2__ / __SSE2__
+ #elif defined(__riscv_v_intrinsic)
+
+ // adapted from arm limited optimized routine
+ // the maximum error is 1.45358 plus 0.5 ulps
+ // numbers above 88.38 will flush to infinity
+ // numbers beneath -103.97 will flush to zero
+ inline static vfloat32m2_t ggml_v_expf_m2(vfloat32m2_t x, int vl) {
+ const vfloat32m2_t r = __riscv_vfmv_v_f_f32m2(0x1.8p23f, vl);
+ #ifdef __riscv_xtheadvector
+ // workaround for compiler bug (gcc 14.3.0: Error: unrecognized opcode `th.vmv1r.v v2,v4')
+ vfloat32m2_t z = __riscv_vfadd_vf_f32m2(r, 0.0f, vl);
+ z = __riscv_vfmacc_vf_f32m2(z, 0x1.715476p+0f, x, vl);
+ #else
+ const vfloat32m2_t z = __riscv_vfmacc_vf_f32m2(r, 0x1.715476p+0f, x, vl);
+ #endif
+ const vfloat32m2_t n = __riscv_vfsub_vv_f32m2(z, r, vl);
+ const vfloat32m2_t b = __riscv_vfnmsac_vf_f32m2(__riscv_vfnmsac_vf_f32m2(x, 0x1.62e4p-1f, n, vl),
+ 0x1.7f7d1cp-20f, n, vl);
+ const vuint32m2_t e = __riscv_vsll_vx_u32m2(__riscv_vreinterpret_v_f32m2_u32m2(z), 23, vl);
+ const vfloat32m2_t k = __riscv_vreinterpret_v_u32m2_f32m2(__riscv_vadd_vx_u32m2(e, 0x3f800000, vl)); // 1.0f
+ const vbool16_t c = __riscv_vmfgt_vf_f32m2_b16(__riscv_vfabs_v_f32m2(n, vl), 126.0f, vl);
+ const vfloat32m2_t u = __riscv_vfmul_vv_f32m2(b, b, vl);
+ const vfloat32m2_t j = __riscv_vfmacc_vv_f32m2(
+ __riscv_vfmul_vf_f32m2(b, 0x1.ffffecp-1f, vl),
+ __riscv_vfmacc_vv_f32m2(
+ __riscv_vfmacc_vf_f32m2(__riscv_vfmv_v_f_f32m2(0x1.fffdb6p-2f, vl), 0x1.555e66p-3f, b, vl),
+ __riscv_vfmacc_vf_f32m2(__riscv_vfmv_v_f_f32m2(0x1.573e2ep-5f, vl), 0x1.0e4020p-7f, b, vl),
+ u, vl), u, vl);
+ if (!__riscv_vcpop_m_b16(c, vl))
+ return __riscv_vfmacc_vv_f32m2(k, j, k, vl);
+ const vbool16_t dm = __riscv_vmfle_vf_f32m2_b16(n, 0.0f, vl);
+ const vuint32m2_t d = __riscv_vmerge_vxm_u32m2(__riscv_vmv_v_x_u32m2(0, vl), 0x82000000, dm, vl);
+ const vfloat32m2_t s1 = __riscv_vreinterpret_v_u32m2_f32m2(__riscv_vadd_vx_u32m2(d, 0x7f000000, vl));
+ const vfloat32m2_t s2 = __riscv_vreinterpret_v_u32m2_f32m2(__riscv_vsub_vv_u32m2(e, d, vl));
+ const vfloat32m2_t r1 = __riscv_vmerge_vvm_f32m2(
+ __riscv_vfmacc_vv_f32m2(k, k, j, vl),
+ __riscv_vfmul_vv_f32m2(__riscv_vfmacc_vv_f32m2(s2, s2, j, vl), s1, vl),
+ c, vl);
+ return __riscv_vmerge_vvm_f32m2(
+ r1, __riscv_vfmul_vv_f32m2(s1, s1, vl),
+ __riscv_vmfgt_vf_f32m2_b16(__riscv_vfabs_v_f32m2(n, vl), 192.0f, vl),
+ vl);
+ }
+
+ #endif // __ARM_NEON / __AVX2__ / __SSE2__ / __riscv_v_intrinsic

  inline static void ggml_vec_silu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
  for (int i = 0; i < n; ++i) {
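Note: ggml_v_expf_m2 is the RVV counterpart of the NEON/AVX ggml_v_expf helpers: a vectorized base-2 range reduction plus polynomial, valid roughly over (-103.97, 88.38) per the comments above. A usage sketch, assuming the function above is in scope (vec_expf is an illustrative wrapper, not from the package):

    #include <riscv_vector.h>

    // y[i] = expf(x[i]) using the vectorized approximation above
    static void vec_expf(int n, float * y, const float * x) {
        for (int i = 0, avl; i < n; i += avl) {
            avl = __riscv_vsetvl_e32m2(n - i);
            vfloat32m2_t vx = __riscv_vle32_v_f32m2(&x[i], avl);
            __riscv_vse32_v_f32m2(&y[i], ggml_v_expf_m2(vx, avl), avl);
        }
    }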
package/src/llama.cpp/include/llama.h

@@ -179,6 +179,14 @@ extern "C" {
  LLAMA_ATTENTION_TYPE_NON_CAUSAL = 1,
  };

+ enum llama_flash_attn_type {
+ LLAMA_FLASH_ATTN_TYPE_AUTO = -1,
+ LLAMA_FLASH_ATTN_TYPE_DISABLED = 0,
+ LLAMA_FLASH_ATTN_TYPE_ENABLED = 1,
+ };
+
+ LLAMA_API const char * llama_flash_attn_type_name(enum llama_flash_attn_type flash_attn_type);
+
  enum llama_split_mode {
  LLAMA_SPLIT_MODE_NONE = 0, // single GPU
  LLAMA_SPLIT_MODE_LAYER = 1, // split layers and KV across GPUs
@@ -303,6 +311,7 @@ extern "C" {
  enum llama_rope_scaling_type rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
  enum llama_pooling_type pooling_type; // whether to pool (sum) embedding results by sequence id
  enum llama_attention_type attention_type; // attention type to use for embeddings
+ enum llama_flash_attn_type flash_attn_type; // when to enable Flash Attention

  // ref: https://github.com/ggml-org/llama.cpp/pull/2054
  float rope_freq_base; // RoPE base frequency, 0 = from model
@@ -329,7 +338,6 @@ extern "C" {
  // Keep the booleans together and at the end of the struct to avoid misalignment during copy-by-value.
  bool embeddings; // if true, extract embeddings (together with logits)
  bool offload_kqv; // offload the KQV ops (including the KV cache) to GPU
- bool flash_attn; // use flash attention [EXPERIMENTAL]
  bool no_perf; // measure performance timings
  bool op_offload; // offload host tensor operations to device
  bool swa_full; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
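Note: this is a breaking change to llama_context_params: the `flash_attn` boolean is removed and callers select a tri-state `flash_attn_type` instead, with AUTO deferring the decision to the backend. A migration sketch, assuming a loaded `llama_model *` and that `llama_init_from_model` is the context constructor in the bundled llama.cpp revision:

    #include <cstdio>
    #include "llama.h"

    llama_context * make_ctx(llama_model * model) {
        llama_context_params params = llama_context_default_params();
        // previously: params.flash_attn = true;
        params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_ENABLED;  // or _AUTO / _DISABLED
        fprintf(stderr, "flash attention: %s\n",
                llama_flash_attn_type_name(params.flash_attn_type));
        return llama_init_from_model(model, params);
    }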
@@ -553,6 +561,24 @@ extern "C" {
  struct llama_model * model,
  const char * path_lora);

+ // Functions to access the adapter's GGUF metadata scalar values
+ // - The functions return the length of the string on success, or -1 on failure
+ // - The output string is always null-terminated and cleared on failure
+ // - When retrieving a string, an extra byte must be allocated to account for the null terminator
+ // - GGUF array values are not supported by these functions
+
+ // Get metadata value as a string by key name
+ LLAMA_API int32_t llama_adapter_meta_val_str(const struct llama_adapter_lora * adapter, const char * key, char * buf, size_t buf_size);
+
+ // Get the number of metadata key/value pairs
+ LLAMA_API int32_t llama_adapter_meta_count(const struct llama_adapter_lora * adapter);
+
+ // Get metadata key name by index
+ LLAMA_API int32_t llama_adapter_meta_key_by_index(const struct llama_adapter_lora * adapter, int32_t i, char * buf, size_t buf_size);
+
+ // Get metadata value as a string by index
+ LLAMA_API int32_t llama_adapter_meta_val_str_by_index(const struct llama_adapter_lora * adapter, int32_t i, char * buf, size_t buf_size);
+
  // Manually free a LoRA adapter
  // Note: loaded adapters will be free when the associated model is deleted
  LLAMA_API void llama_adapter_lora_free(struct llama_adapter_lora * adapter);
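Note: these accessors mirror the existing llama_model_meta_* family. Each string getter behaves like snprintf (which the implementation below uses): it returns the full value length even when the output is truncated to buf_size - 1 characters, and writes an empty string and returns -1 on failure. A sketch that dumps all adapter metadata (dump_adapter_meta is an illustrative name):

    #include <cstdio>
    #include "llama.h"

    void dump_adapter_meta(const llama_adapter_lora * adapter) {
        char key[256];
        char val[512];
        const int32_t n = llama_adapter_meta_count(adapter);
        for (int32_t i = 0; i < n; i++) {
            // both calls clear the buffer and return -1 on failure
            if (llama_adapter_meta_key_by_index(adapter, i, key, sizeof(key)) < 0) continue;
            if (llama_adapter_meta_val_str_by_index(adapter, i, val, sizeof(val)) < 0) continue;
            printf("%s = %s\n", key, val);  // values longer than 511 bytes are truncated
        }
    }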
package/src/llama.cpp/src/llama-adapter.cpp

@@ -163,13 +163,38 @@ static void llama_adapter_lora_init_impl(llama_model & model, const char * path_

  // check metadata
  {
+ const gguf_context * gguf_ctx = ctx_gguf.get();
+
+ LLAMA_LOG_INFO("%s: Dumping metadata keys/values.\n", __func__);
+
+ // get metadata as string
+ for (int i = 0; i < gguf_get_n_kv(gguf_ctx); i++) {
+ gguf_type type = gguf_get_kv_type(gguf_ctx, i);
+ const std::string type_name =
+ type == GGUF_TYPE_ARRAY
+ ? format("%s[%s,%zu]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(gguf_ctx, i)), gguf_get_arr_n(gguf_ctx, i))
+ : gguf_type_name(type);
+ const char * name = gguf_get_key(gguf_ctx, i);
+ const std::string value = gguf_kv_to_str(gguf_ctx, i);
+
+ if (type != GGUF_TYPE_ARRAY) {
+ adapter.gguf_kv.emplace(name, value);
+ }
+
+ const size_t MAX_VALUE_LEN = 40;
+ std::string print_value = value.size() > MAX_VALUE_LEN ? format("%s...", value.substr(0, MAX_VALUE_LEN - 3).c_str()) : value;
+ replace_all(print_value, "\n", "\\n");
+
+ LLAMA_LOG_INFO("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), print_value.c_str());
+ }
+
  auto get_kv_str = [&](const std::string & key) -> std::string {
- int id = gguf_find_key(ctx_gguf.get(), key.c_str());
- return id < 0 ? "" : std::string(gguf_get_val_str(ctx_gguf.get(), id));
+ int id = gguf_find_key(gguf_ctx, key.c_str());
+ return id < 0 ? "" : std::string(gguf_get_val_str(gguf_ctx, id));
  };
  auto get_kv_f32 = [&](const std::string & key) -> float {
- int id = gguf_find_key(ctx_gguf.get(), key.c_str());
- return id < 0 ? 0.0f : gguf_get_val_f32(ctx_gguf.get(), id);
+ int id = gguf_find_key(gguf_ctx, key.c_str());
+ return id < 0 ? 0.0f : gguf_get_val_f32(gguf_ctx, id);
  };
  LLM_KV llm_kv = LLM_KV(LLM_ARCH_UNKNOWN);

@@ -383,6 +408,45 @@ llama_adapter_lora * llama_adapter_lora_init(llama_model * model, const char * p
  return nullptr;
  }

+ int32_t llama_adapter_meta_val_str(const llama_adapter_lora * adapter, const char * key, char * buf, size_t buf_size) {
+ const auto & it = adapter->gguf_kv.find(key);
+ if (it == adapter->gguf_kv.end()) {
+ if (buf_size > 0) {
+ buf[0] = '\0';
+ }
+ return -1;
+ }
+ return snprintf(buf, buf_size, "%s", it->second.c_str());
+ }
+
+ int32_t llama_adapter_meta_count(const llama_adapter_lora * adapter) {
+ return (int)adapter->gguf_kv.size();
+ }
+
+ int32_t llama_adapter_meta_key_by_index(const llama_adapter_lora * adapter, int i, char * buf, size_t buf_size) {
+ if (i < 0 || i >= (int)adapter->gguf_kv.size()) {
+ if (buf_size > 0) {
+ buf[0] = '\0';
+ }
+ return -1;
+ }
+ auto it = adapter->gguf_kv.begin();
+ std::advance(it, i);
+ return snprintf(buf, buf_size, "%s", it->first.c_str());
+ }
+
+ int32_t llama_adapter_meta_val_str_by_index(const llama_adapter_lora * adapter, int32_t i, char * buf, size_t buf_size) {
+ if (i < 0 || i >= (int)adapter->gguf_kv.size()) {
+ if (buf_size > 0) {
+ buf[0] = '\0';
+ }
+ return -1;
+ }
+ auto it = adapter->gguf_kv.begin();
+ std::advance(it, i);
+ return snprintf(buf, buf_size, "%s", it->second.c_str());
+ }
+
  void llama_adapter_lora_free(llama_adapter_lora * adapter) {
  delete adapter;
  }
package/src/llama.cpp/src/llama-adapter.h

@@ -67,6 +67,9 @@ struct llama_adapter_lora {

  float alpha;

+ // gguf metadata
+ std::unordered_map<std::string, std::string> gguf_kv;
+
  llama_adapter_lora() = default;
  ~llama_adapter_lora() = default;

package/src/llama.cpp/src/llama-arch.cpp

@@ -22,6 +22,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
  { LLM_ARCH_NOMIC_BERT_MOE, "nomic-bert-moe" },
  { LLM_ARCH_NEO_BERT, "neo-bert" },
  { LLM_ARCH_JINA_BERT_V2, "jina-bert-v2" },
+ { LLM_ARCH_JINA_BERT_V3, "jina-bert-v3" },
  { LLM_ARCH_BLOOM, "bloom" },
  { LLM_ARCH_STABLELM, "stablelm" },
  { LLM_ARCH_QWEN, "qwen" },
@@ -68,6 +69,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
  { LLM_ARCH_T5ENCODER, "t5encoder" },
  { LLM_ARCH_JAIS, "jais" },
  { LLM_ARCH_NEMOTRON, "nemotron" },
+ { LLM_ARCH_NEMOTRON_H, "nemotron_h" },
  { LLM_ARCH_EXAONE, "exaone" },
  { LLM_ARCH_EXAONE4, "exaone4" },
  { LLM_ARCH_RWKV6, "rwkv6" },
@@ -234,8 +236,10 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
  { LLM_KV_TOKENIZER_FIM_REP_ID, "tokenizer.ggml.fim_rep_token_id" },
  { LLM_KV_TOKENIZER_FIM_SEP_ID, "tokenizer.ggml.fim_sep_token_id" },

- { LLM_KV_ADAPTER_TYPE, "adapter.type" },
- { LLM_KV_ADAPTER_LORA_ALPHA, "adapter.lora.alpha" },
+ { LLM_KV_ADAPTER_TYPE, "adapter.type" },
+ { LLM_KV_ADAPTER_LORA_ALPHA, "adapter.lora.alpha" },
+ { LLM_KV_ADAPTER_LORA_TASK_NAME, "adapter.lora.task_name" },
+ { LLM_KV_ADAPTER_LORA_PROMPT_PREFIX, "adapter.lora.prompt_prefix" },

  // deprecated
  { LLM_KV_TOKENIZER_PREFIX_ID, "tokenizer.ggml.prefix_token_id" },
@@ -575,6 +579,20 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
  { LLM_TENSOR_CLS, "cls" },
  },
  },
+ {
+ LLM_ARCH_JINA_BERT_V3,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
+ { LLM_TENSOR_TOKEN_TYPES, "token_types" },
+ { LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" },
+ { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ { LLM_TENSOR_LAYER_OUT_NORM, "blk.%d.layer_output_norm" },
+ },
+ },
  {
  LLM_ARCH_BLOOM,
  {
@@ -1533,6 +1551,31 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
  },
  },
+ {
+ LLM_ARCH_NEMOTRON_H,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_OUTPUT, "output" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ // mamba(2) ssm layers
+ { LLM_TENSOR_SSM_IN, "blk.%d.ssm_in" },
+ { LLM_TENSOR_SSM_CONV1D, "blk.%d.ssm_conv1d" },
+ { LLM_TENSOR_SSM_DT, "blk.%d.ssm_dt" },
+ { LLM_TENSOR_SSM_A, "blk.%d.ssm_a" },
+ { LLM_TENSOR_SSM_D, "blk.%d.ssm_d" },
+ { LLM_TENSOR_SSM_NORM, "blk.%d.ssm_norm" },
+ { LLM_TENSOR_SSM_OUT, "blk.%d.ssm_out" },
+ // attention layers
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ // dense FFN
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ },
+ },
  {
  LLM_ARCH_EXAONE,
  {
@@ -2338,6 +2381,7 @@ bool llm_arch_is_hybrid(const llm_arch & arch) {
  case LLM_ARCH_PLAMO2:
  case LLM_ARCH_GRANITE_HYBRID:
  case LLM_ARCH_LFM2:
+ case LLM_ARCH_NEMOTRON_H:
  return true;
  default:
  return false;
package/src/llama.cpp/src/llama-arch.h

@@ -26,6 +26,7 @@ enum llm_arch {
  LLM_ARCH_NOMIC_BERT_MOE,
  LLM_ARCH_NEO_BERT,
  LLM_ARCH_JINA_BERT_V2,
+ LLM_ARCH_JINA_BERT_V3,
  LLM_ARCH_BLOOM,
  LLM_ARCH_STABLELM,
  LLM_ARCH_QWEN,
@@ -72,6 +73,7 @@ enum llm_arch {
  LLM_ARCH_T5ENCODER,
  LLM_ARCH_JAIS,
  LLM_ARCH_NEMOTRON,
+ LLM_ARCH_NEMOTRON_H,
  LLM_ARCH_EXAONE,
  LLM_ARCH_EXAONE4,
  LLM_ARCH_RWKV6,
@@ -230,6 +232,8 @@ enum llm_kv {

  LLM_KV_ADAPTER_TYPE,
  LLM_KV_ADAPTER_LORA_ALPHA,
+ LLM_KV_ADAPTER_LORA_TASK_NAME,
+ LLM_KV_ADAPTER_LORA_PROMPT_PREFIX,

  LLM_KV_POSNET_EMBEDDING_LENGTH,
  LLM_KV_POSNET_BLOCK_COUNT,