@fugood/llama.node 1.2.3 → 1.2.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59)
  1. package/package.json +14 -14
  2. package/scripts/llama.cpp.patch +33 -11
  3. package/src/llama.cpp/CMakeLists.txt +1 -0
  4. package/src/llama.cpp/common/CMakeLists.txt +46 -2
  5. package/src/llama.cpp/common/arg.cpp +484 -204
  6. package/src/llama.cpp/common/arg.h +0 -1
  7. package/src/llama.cpp/common/chat-parser.cpp +156 -15
  8. package/src/llama.cpp/common/chat-parser.h +3 -0
  9. package/src/llama.cpp/common/chat.cpp +217 -6
  10. package/src/llama.cpp/common/chat.h +5 -3
  11. package/src/llama.cpp/common/common.cpp +22 -6
  12. package/src/llama.cpp/common/common.h +6 -4
  13. package/src/llama.cpp/common/http.h +73 -0
  14. package/src/llama.cpp/common/json-partial.cpp +51 -0
  15. package/src/llama.cpp/ggml/CMakeLists.txt +7 -6
  16. package/src/llama.cpp/ggml/include/ggml-backend.h +2 -0
  17. package/src/llama.cpp/ggml/include/ggml-rpc.h +8 -9
  18. package/src/llama.cpp/ggml/include/ggml.h +22 -0
  19. package/src/llama.cpp/ggml/src/CMakeLists.txt +3 -0
  20. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +12 -2
  21. package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +1 -0
  22. package/src/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +12 -12
  23. package/src/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +100 -3
  24. package/src/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +0 -1
  25. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1 -0
  26. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +10 -0
  27. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +209 -96
  28. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +32 -44
  29. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +107 -83
  30. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +17 -17
  31. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +8 -8
  32. package/src/llama.cpp/ggml/src/ggml-cpu/spacemit/ime.cpp +1024 -0
  33. package/src/llama.cpp/ggml/src/ggml-cpu/spacemit/ime.h +13 -0
  34. package/src/llama.cpp/ggml/src/ggml-cpu/spacemit/ime1_kernels.cpp +3196 -0
  35. package/src/llama.cpp/ggml/src/ggml-cpu/spacemit/ime_kernels.h +26 -0
  36. package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.cpp +103 -0
  37. package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.h +1 -0
  38. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +66 -0
  39. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +11 -9
  40. package/src/llama.cpp/include/llama.h +8 -0
  41. package/src/llama.cpp/src/llama-arch.cpp +93 -0
  42. package/src/llama.cpp/src/llama-arch.h +22 -0
  43. package/src/llama.cpp/src/llama-chat.cpp +1 -1
  44. package/src/llama.cpp/src/llama-context.cpp +6 -0
  45. package/src/llama.cpp/src/llama-graph.cpp +57 -22
  46. package/src/llama.cpp/src/llama-graph.h +10 -1
  47. package/src/llama.cpp/src/llama-hparams.cpp +5 -1
  48. package/src/llama.cpp/src/llama-hparams.h +17 -2
  49. package/src/llama.cpp/src/llama-kv-cache-iswa.cpp +2 -2
  50. package/src/llama.cpp/src/llama-kv-cache.cpp +2 -5
  51. package/src/llama.cpp/src/llama-memory-hybrid.cpp +11 -9
  52. package/src/llama.cpp/src/llama-memory-recurrent.cpp +11 -3
  53. package/src/llama.cpp/src/llama-model-loader.cpp +2 -0
  54. package/src/llama.cpp/src/llama-model.cpp +572 -45
  55. package/src/llama.cpp/src/llama-model.h +18 -0
  56. package/src/llama.cpp/src/llama-sampling.cpp +5 -0
  57. package/src/llama.cpp/src/llama-vocab.cpp +7 -1
  58. package/src/llama.cpp/src/llama-vocab.h +41 -40
  59. package/src/llama.cpp/src/unicode.h +43 -0
@@ -0,0 +1,26 @@
+ #pragma once
+
+ #include <cstddef>
+
+ namespace sqnbitgemm_spacemit_ime {
+ namespace ime1 {
+ size_t gemm_kernel_i8i4(size_t blk_len,
+                         const std::byte * quant_a_ptr,
+                         const std::byte * quant_b_data,
+                         const float * quant_b_scale,
+                         const std::byte * quant_b_zp,
+                         float * c_ptr,
+                         size_t count_m,
+                         size_t count_n,
+                         size_t count_k,
+                         size_t block_count_k,
+                         size_t ldc,
+                         const float * bias,
+                         const size_t scale_stride);
+
+ void quantize_a_row_i8(size_t blk_len, const float * a_ptr, size_t count_k, std::byte * quant_a_ptr);
+
+ void quantize_a_4row_i8(size_t blk_len, const float * a_ptr, size_t count_k, std::byte * quant_a_ptr);
+
+ } // namespace ime1
+ } // namespace sqnbitgemm_spacemit_ime
@@ -52,6 +52,15 @@ static inline float op_sqrt(float x) {
      return sqrtf(x);
  }
 
+ static inline float op_xielu(float x, float alpha_n, float alpha_p, float beta, float eps) {
+     if (x > 0.0f) {
+         return alpha_p * x * x + beta * x;
+     } else {
+         const float min_x_eps = fminf(x, eps);
+         return (expm1f(min_x_eps) - x) * alpha_n + beta * x;
+     }
+ }
+
  static inline float op_sin(float x) {
      return sinf(x);
  }
@@ -121,6 +130,86 @@ static void unary_op(const ggml_compute_params * params, ggml_tensor * dst) {
      }
  }
 
+ template <float (*op)(float, ggml_tensor *)>
+ static void unary_op_params(const ggml_compute_params * params, ggml_tensor * dst) {
+     const ggml_tensor * src0 = dst->src[0];
+
+     /* */ if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { // all f32
+         apply_unary_op<op, float, float>(params, dst);
+     } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) { // all f16
+         apply_unary_op<op, ggml_fp16_t, ggml_fp16_t>(params, dst);
+     } else if (src0->type == GGML_TYPE_BF16 && dst->type == GGML_TYPE_BF16) { // all bf16
+         apply_unary_op<op, ggml_bf16_t, ggml_bf16_t>(params, dst);
+     } else if (src0->type == GGML_TYPE_BF16 && dst->type == GGML_TYPE_F32) {
+         apply_unary_op<op, ggml_bf16_t, float>(params, dst);
+     } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F32) {
+         apply_unary_op<op, ggml_fp16_t, float>(params, dst);
+     } else {
+         fprintf(stderr, "%s: unsupported types: dst: %s, src0: %s\n", __func__,
+             ggml_type_name(dst->type), ggml_type_name(src0->type));
+         GGML_ABORT("fatal error");
+     }
+ }
+
+ // Extend vec_unary_op to support functors
+ template <typename Op, typename src0_t, typename dst_t>
+ static inline void vec_unary_op_functor(int64_t n, dst_t * y, const src0_t * x, Op op) {
+     constexpr auto src0_to_f32 = type_conversion_table<src0_t>::to_f32;
+     constexpr auto f32_to_dst = type_conversion_table<dst_t >::from_f32;
+
+     for (int i = 0; i < n; i++) {
+         y[i] = f32_to_dst(op(src0_to_f32(x[i])));
+     }
+ }
+
+ // Extend apply_unary_op to support functors
+ template <typename Op, typename src0_t, typename dst_t>
+ static void apply_unary_op_functor(const ggml_compute_params * params, ggml_tensor * dst, Op op) {
+     const ggml_tensor * src0 = dst->src[0];
+
+     GGML_ASSERT(ggml_is_contiguous_1(src0) && ggml_is_contiguous_1(dst) && ggml_are_same_shape(src0, dst));
+
+     GGML_TENSOR_UNARY_OP_LOCALS
+
+     GGML_ASSERT( nb0 == sizeof(dst_t));
+     GGML_ASSERT(nb00 == sizeof(src0_t));
+
+     const auto [ir0, ir1] = get_thread_range(params, src0);
+
+     for (int64_t ir = ir0; ir < ir1; ++ir) {
+         const int64_t i03 = ir/(ne02*ne01);
+         const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
+         const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
+
+         dst_t * dst_ptr = (dst_t *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 );
+         const src0_t * src0_ptr = (const src0_t *) ((const char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
+
+         vec_unary_op_functor(ne0, dst_ptr, src0_ptr, op);
+     }
+ }
+
+ // Generic dispatcher for functors
+ template <typename Op>
+ static void unary_op_functor(const ggml_compute_params * params, ggml_tensor * dst, Op op) {
+     const ggml_tensor * src0 = dst->src[0];
+
+     /* */ if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { // all f32
+         apply_unary_op_functor<Op, float, float>(params, dst, op);
+     } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) { // all f16
+         apply_unary_op_functor<Op, ggml_fp16_t, ggml_fp16_t>(params, dst, op);
+     } else if (src0->type == GGML_TYPE_BF16 && dst->type == GGML_TYPE_BF16) { // all bf16
+         apply_unary_op_functor<Op, ggml_bf16_t, ggml_bf16_t>(params, dst, op);
+     } else if (src0->type == GGML_TYPE_BF16 && dst->type == GGML_TYPE_F32) {
+         apply_unary_op_functor<Op, ggml_bf16_t, float>(params, dst, op);
+     } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F32) {
+         apply_unary_op_functor<Op, ggml_fp16_t, float>(params, dst, op);
+     } else {
+         fprintf(stderr, "%s: unsupported types: dst: %s, src0: %s\n", __func__,
+             ggml_type_name(dst->type), ggml_type_name(src0->type));
+         GGML_ABORT("fatal error");
+     }
+ }
+
  void ggml_compute_forward_abs(const ggml_compute_params * params, ggml_tensor * dst) {
      unary_op<op_abs>(params, dst);
  }
@@ -184,3 +273,17 @@ void ggml_compute_forward_cos(const ggml_compute_params * params, ggml_tensor *
  void ggml_compute_forward_log(const ggml_compute_params * params, ggml_tensor * dst) {
      unary_op<op_log>(params, dst);
  }
+
+ void ggml_compute_forward_xielu(const ggml_compute_params * params, ggml_tensor * dst) {
+     const float alpha_n = ggml_get_op_params_f32(dst, 1);
+     const float alpha_p = ggml_get_op_params_f32(dst, 2);
+     const float beta = ggml_get_op_params_f32(dst, 3);
+     const float eps = ggml_get_op_params_f32(dst, 4);
+
+     const auto xielu_op_params = [alpha_n, alpha_p, beta, eps](float f) {
+         return op_xielu(f, alpha_n, alpha_p, beta, eps);
+     };
+
+     unary_op_functor(params, dst, xielu_op_params);
+ }
+
@@ -22,6 +22,7 @@ void ggml_compute_forward_sqrt(const struct ggml_compute_params * params, struct
  void ggml_compute_forward_sin(const struct ggml_compute_params * params, struct ggml_tensor * dst);
  void ggml_compute_forward_cos(const struct ggml_compute_params * params, struct ggml_tensor * dst);
  void ggml_compute_forward_log(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+ void ggml_compute_forward_xielu(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 
  #ifdef __cplusplus
  }
@@ -404,6 +404,72 @@ void ggml_vec_swiglu_f32(const int n, float * y, const float * x, const float *
      }
  }
 
+ ggml_float ggml_vec_cvar_f32(const int n, float * y, const float * x, const float mean) {
+     int i = 0;
+     ggml_float sum = 0;
+     // TODO: optimize to process the remaining elements in groups using the smaller vector sizes from AVX2 and SSE
+     // ref: https://github.com/ggml-org/llama.cpp/pull/15953#pullrequestreview-3310928344
+ #if defined(__AVX512F__) && defined(__AVX512DQ__)
+     for (; i + 15 < n; i += 16) {
+         __m512 val = _mm512_sub_ps(_mm512_loadu_ps(x + i),
+                                    _mm512_set1_ps(mean));
+         _mm512_storeu_ps(y + i, val);
+         sum += (ggml_float)_mm512_reduce_add_ps(_mm512_mul_ps(val, val));
+     }
+ #elif defined(__AVX2__) && defined(__FMA__)
+     for (; i + 7 < n; i += 8) {
+         __m256 val = _mm256_sub_ps(_mm256_loadu_ps(x + i),
+                                    _mm256_set1_ps(mean));
+         _mm256_storeu_ps(y + i, val);
+         val = _mm256_mul_ps(val,val);
+         __m128 val2 = _mm_add_ps(_mm256_extractf128_ps(val, 1),
+                                  _mm256_castps256_ps128(val));
+         val2 = _mm_add_ps(val2, _mm_movehl_ps(val2, val2));
+         val2 = _mm_add_ss(val2, _mm_movehdup_ps(val2));
+         sum += (ggml_float)_mm_cvtss_f32(val2);
+     }
+ #elif defined(__SSE2__)
+     for (; i + 3 < n; i += 4) {
+         __m128 val = _mm_sub_ps(_mm_loadu_ps(x + i),
+                                 _mm_set1_ps(mean));
+         _mm_storeu_ps(y + i, val);
+         val = _mm_mul_ps(val, val);
+ #if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
+         val = _mm_add_ps(val, _mm_movehl_ps(val, val));
+         val = _mm_add_ss(val, _mm_movehdup_ps(val));
+ #else
+         __m128 tmp = _mm_shuffle_ps(val, val, _MM_SHUFFLE(2, 3, 0, 1));
+         val = _mm_add_ps(val, tmp);
+         tmp = _mm_movehl_ps(tmp, val);
+         val = _mm_add_ss(val, tmp);
+ #endif // __AVX__ || __AVX2__ || __AVX512F__
+         sum += (ggml_float)_mm_cvtss_f32(val);
+     }
+ #elif defined(__ARM_NEON) && defined(__aarch64__)
+     for (; i + 3 < n; i += 4) {
+         float32x4_t val = vsubq_f32(vld1q_f32(x + i),
+                                     vdupq_n_f32(mean));
+         vst1q_f32(y + i, val);
+         val = vmulq_f32(val, val);
+         sum += (ggml_float)vaddvq_f32(val);
+     }
+ #elif defined(__VXE__) || defined(__VXE2__)
+     for (; i + 3 < n; i += 4) {
+         float32x4_t val = vec_sub(vec_xl(0, x + i), vec_splats(mean));
+         vec_xst(val, 0, y + i);
+         val = vec_mul(val, val);
+         sum += (ggml_float)vec_hsum_f32x4(val);
+     }
+ #endif
+     for (; i < n; ++i) {
+         float val = x[i] - mean;
+         val *= val;
+         sum += (ggml_float)val;
+         y[i] = val;
+     }
+     return sum/n;
+ }
+
  ggml_float ggml_vec_soft_max_f32(const int n, float * y, const float * x, float max) {
      int i = 0;
      ggml_float sum = 0;
@@ -44,6 +44,7 @@ void ggml_vec_dot_bf16(int n, float * GGML_RESTRICT s, size_t bs, ggml_bf16_t *
  void ggml_vec_dot_f16(int n, float * GGML_RESTRICT s, size_t bs, ggml_fp16_t * GGML_RESTRICT x, size_t bx, ggml_fp16_t * GGML_RESTRICT y, size_t by, int nrc);
 
  void ggml_vec_silu_f32(const int n, float * y, const float * x);
+ ggml_float ggml_vec_cvar_f32(const int n, float * y, const float * x, const float mean); //it will also center y ( y = y - mean )
  ggml_float ggml_vec_soft_max_f32(const int n, float * y, const float * x, float max);
  ggml_float ggml_vec_log_soft_max_f32(const int n, float * y, const float * x, float max);
 
@@ -143,14 +144,14 @@ inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * GG
      for (int i = 0; i < np; i += ggml_f16_step) {
          ay1 = GGML_F16x_VEC_LOAD(y + i + 0 * ggml_f16_epr, 0); // 8 elements
 
-         ax1 = GGML_F16x_VEC_LOAD(x[0] + i + 0*ggml_f16_epr, 0); // 8 elemnst
+         ax1 = GGML_F16x_VEC_LOAD(x[0] + i + 0*ggml_f16_epr, 0); // 8 elements
          sum_00 = GGML_F16x_VEC_FMA(sum_00, ax1, ay1); // sum_00 = sum_00+ax1*ay1
          ax1 = GGML_F16x_VEC_LOAD(x[1] + i + 0*ggml_f16_epr, 0); // 8 elements
          sum_10 = GGML_F16x_VEC_FMA(sum_10, ax1, ay1);
 
          ay2 = GGML_F16x_VEC_LOAD(y + i + 1 * ggml_f16_epr, 1); // next 8 elements
 
-         ax2 = GGML_F16x_VEC_LOAD(x[0] + i + 1*ggml_f16_epr, 1); // next 8 ekements
+         ax2 = GGML_F16x_VEC_LOAD(x[0] + i + 1*ggml_f16_epr, 1); // next 8 elements
          sum_01 = GGML_F16x_VEC_FMA(sum_01, ax2, ay2);
          ax2 = GGML_F16x_VEC_LOAD(x[1] + i + 1*ggml_f16_epr, 1);
          sum_11 = GGML_F16x_VEC_FMA(sum_11, ax2, ay2);
@@ -159,7 +160,7 @@ inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * GG
 
          ax3 = GGML_F16x_VEC_LOAD(x[0] + i + 2*ggml_f16_epr, 2);
          sum_02 = GGML_F16x_VEC_FMA(sum_02, ax3, ay3);
-         ax1 = GGML_F16x_VEC_LOAD(x[1] + i + 2*ggml_f16_epr, 2);
+         ax3 = GGML_F16x_VEC_LOAD(x[1] + i + 2*ggml_f16_epr, 2);
          sum_12 = GGML_F16x_VEC_FMA(sum_12, ax3, ay3);
 
          ay4 = GGML_F16x_VEC_LOAD(y + i + 3 * ggml_f16_epr, 3);
@@ -610,7 +611,7 @@ inline static void ggml_vec_mad1_f32(const int n, float * y, const float * x, co
      for (int i = 0; i < np; i += GGML_F32_STEP) {
          for (int j = 0; j < GGML_F32_ARR; j++) {
              ay[j] = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR);
-             ay[j] = GGML_F32_VEC_FMA(ay[j], vs, vb);
+             ay[j] = GGML_F32_VEC_FMA(vb, ay[j], vs);
 
              GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
          }
@@ -654,11 +655,11 @@ inline static void ggml_vec_scale_f32(const int n, float * y, const float v) {
      }
      // leftovers
      // maximum number of leftover elements will be less that ggml_f32_epr. Apply predicated svmad on available elements only
-     if (np < n) {
-         svbool_t pg = svwhilelt_b32(np, n);
-         ay1 = svld1_f32(pg, y + np);
+     for (int i = np; i < n; i += ggml_f32_epr) {
+         svbool_t pg = svwhilelt_b32(i, n);
+         ay1 = svld1_f32(pg, y + i);
          ay1 = svmul_f32_m(pg, ay1, vx);
-         svst1_f32(pg, y + np, ay1);
+         svst1_f32(pg, y + i, ay1);
      }
  #elif defined(__riscv_v_intrinsic)
      for (int i = 0, avl; i < n; i += avl) {
@@ -819,7 +820,8 @@ inline static void ggml_vec_tanh_f16 (const int n, ggml_fp16_t * y, const ggml_f
  inline static void ggml_vec_elu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expm1f(x[i]); }
  inline static void ggml_vec_elu_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
      for (int i = 0; i < n; ++i) {
-         y[i] = GGML_CPU_FP32_TO_FP16(expm1f(GGML_CPU_FP16_TO_FP32(x[i])));
+         const float v = GGML_CPU_FP16_TO_FP32(x[i]);
+         y[i] = GGML_CPU_FP32_TO_FP16((v > 0.f) ? v : expm1f(v));
      }
  }
  inline static void ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; }
@@ -296,6 +296,7 @@ extern "C" {
      bool use_mlock; // force system to keep model in RAM
      bool check_tensors; // validate model tensor data
      bool use_extra_bufts; // use extra buffer types (used for weight repacking)
+     bool no_host; // bypass host buffer allowing extra buffers to be used
  };
 
  // NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations
@@ -543,6 +544,9 @@
      // Returns true if the model is recurrent (like Mamba, RWKV, etc.)
      LLAMA_API bool llama_model_is_recurrent(const struct llama_model * model);
 
+     // Returns true if the model is hybrid (like Jamba, Granite, etc.)
+     LLAMA_API bool llama_model_is_hybrid(const struct llama_model * model);
+
      // Returns true if the model is diffusion-based (like LLaDA, Dream, etc.)
      LLAMA_API bool llama_model_is_diffusion(const struct llama_model * model);
 
@@ -791,8 +795,12 @@
              size_t n_token_capacity,
              size_t * n_token_count_out);
 
+ // for backwards-compat
  #define LLAMA_STATE_SEQ_FLAGS_SWA_ONLY 1
 
+ // work only with partial states, such as SWA KV cache or recurrent cache (e.g. Mamba)
+ #define LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY 1
+
      typedef uint32_t llama_state_seq_flags;
 
      LLAMA_API size_t llama_state_seq_get_size_ext(
@@ -93,11 +93,14 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
      { LLM_ARCH_SMOLLM3, "smollm3" },
      { LLM_ARCH_OPENAI_MOE, "gpt-oss" },
      { LLM_ARCH_LFM2, "lfm2" },
+     { LLM_ARCH_LFM2MOE, "lfm2moe" },
      { LLM_ARCH_DREAM, "dream" },
      { LLM_ARCH_SMALLTHINKER, "smallthinker" },
      { LLM_ARCH_LLADA, "llada" },
      { LLM_ARCH_LLADA_MOE, "llada-moe" },
      { LLM_ARCH_SEED_OSS, "seed_oss" },
+     { LLM_ARCH_GROVEMOE, "grovemoe" },
+     { LLM_ARCH_APERTUS, "apertus" },
      { LLM_ARCH_UNKNOWN, "(unknown)" },
  };
 
@@ -125,6 +128,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
      { LLM_KV_FEED_FORWARD_LENGTH, "%s.feed_forward_length" },
      { LLM_KV_EXPERT_FEED_FORWARD_LENGTH, "%s.expert_feed_forward_length" },
      { LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, "%s.expert_shared_feed_forward_length" },
+     { LLM_KV_EXPERT_CHUNK_FEED_FORWARD_LENGTH, "%s.expert_chunk_feed_forward_length" },
      { LLM_KV_USE_PARALLEL_RESIDUAL, "%s.use_parallel_residual" },
      { LLM_KV_TENSOR_DATA_LAYOUT, "%s.tensor_data_layout" },
      { LLM_KV_EXPERT_COUNT, "%s.expert_count" },
@@ -133,6 +137,8 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
      { LLM_KV_EXPERT_WEIGHTS_SCALE, "%s.expert_weights_scale" },
      { LLM_KV_EXPERT_WEIGHTS_NORM, "%s.expert_weights_norm" },
      { LLM_KV_EXPERT_GATING_FUNC, "%s.expert_gating_func" },
+     { LLM_KV_EXPERT_GROUP_SCALE, "%s.expert_group_scale" },
+     { LLM_KV_EXPERTS_PER_GROUP, "%s.experts_per_group" },
      { LLM_KV_MOE_EVERY_N_LAYERS, "%s.moe_every_n_layers" },
      { LLM_KV_NEXTN_PREDICT_LAYERS, "%s.nextn_predict_layers" },
      { LLM_KV_POOLING_TYPE, "%s.pooling_type" },
@@ -213,6 +219,11 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
      { LLM_KV_CLASSIFIER_OUTPUT_LABELS, "%s.classifier.output_labels" },
 
      { LLM_KV_SHORTCONV_L_CACHE, "%s.shortconv.l_cache" },
+     // sentence-transformers dense modules feature dims
+     { LLM_KV_DENSE_2_FEAT_IN, "%s.dense_2_feat_in" },
+     { LLM_KV_DENSE_2_FEAT_OUT, "%s.dense_2_feat_out" },
+     { LLM_KV_DENSE_3_FEAT_IN, "%s.dense_3_feat_in" },
+     { LLM_KV_DENSE_3_FEAT_OUT, "%s.dense_3_feat_out" },
 
      { LLM_KV_TOKENIZER_MODEL, "tokenizer.ggml.model" },
      { LLM_KV_TOKENIZER_PRE, "tokenizer.ggml.pre" },
@@ -252,6 +263,11 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
      { LLM_KV_ADAPTER_LORA_PROMPT_PREFIX, "adapter.lora.prompt_prefix" },
      { LLM_KV_ADAPTER_ALORA_INVOCATION_TOKENS, "adapter.alora.invocation_tokens" },
 
+     { LLM_KV_XIELU_ALPHA_N, "xielu.alpha_n" },
+     { LLM_KV_XIELU_ALPHA_P, "xielu.alpha_p" },
+     { LLM_KV_XIELU_BETA, "xielu.beta" },
+     { LLM_KV_XIELU_EPS, "xielu.eps" },
+
      // deprecated
      { LLM_KV_TOKENIZER_PREFIX_ID, "tokenizer.ggml.prefix_token_id" },
      { LLM_KV_TOKENIZER_SUFFIX_ID, "tokenizer.ggml.suffix_token_id" },
@@ -721,6 +737,7 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
          { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
          { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
          { LLM_TENSOR_OUTPUT, "output" },
+         { LLM_TENSOR_CLS_OUT, "cls.output" },
          { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
          { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
          { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
@@ -1059,6 +1076,8 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
          { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
          { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
          { LLM_TENSOR_OUTPUT, "output" },
+         { LLM_TENSOR_DENSE_2_OUT, "dense_2" },
+         { LLM_TENSOR_DENSE_3_OUT, "dense_3" },
          { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
          { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
          { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
@@ -2093,6 +2112,32 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
          { LLM_TENSOR_OUTPUT, "output" },
      }
  },
+ {
+     LLM_ARCH_LFM2MOE,
+     {
+         { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+         { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+         { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+         { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+         { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+         { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
+         { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
+         { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+         { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+         { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+         { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+         { LLM_TENSOR_SHORTCONV_CONV, "blk.%d.shortconv.conv" },
+         { LLM_TENSOR_SHORTCONV_INPROJ, "blk.%d.shortconv.in_proj" },
+         { LLM_TENSOR_SHORTCONV_OUTPROJ, "blk.%d.shortconv.out_proj" },
+         { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+         { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
+         { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
+         { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+         { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+         { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
+         { LLM_TENSOR_FFN_EXP_PROBS_B, "blk.%d.exp_probs_b" },
+     }
+ },
  {
      LLM_ARCH_SMALLTHINKER,
      {
@@ -2114,6 +2159,25 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
          { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" }
      },
  },
+ {
+     LLM_ARCH_APERTUS,
+     {
+         { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+         { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+         { LLM_TENSOR_OUTPUT, "output" },
+         { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
+         { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+         { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+         { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+         { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+         { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+         { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
+         { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
+         { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+         { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+         { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+     },
+ },
  {
      LLM_ARCH_DREAM,
      {
@@ -2185,6 +2249,29 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
          { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
      },
  },
+ {
+     LLM_ARCH_GROVEMOE,
+     {
+         { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+         { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+         { LLM_TENSOR_OUTPUT, "output" },
+         { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+         { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+         { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
+         { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+         { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
+         { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+         { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+         { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+         { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
+         { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+         { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+         { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
+         { LLM_TENSOR_FFN_GATE_CHEXPS, "blk.%d.ffn_gate_chexps" },
+         { LLM_TENSOR_FFN_DOWN_CHEXPS, "blk.%d.ffn_down_chexps" },
+         { LLM_TENSOR_FFN_UP_CHEXPS, "blk.%d.ffn_up_chexps" },
+     },
+ },
  {
      LLM_ARCH_UNKNOWN,
      {
@@ -2201,6 +2288,8 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
      {LLM_TENSOR_OUTPUT, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}},
      {LLM_TENSOR_CLS, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}},
      {LLM_TENSOR_CLS_OUT, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}},
+     {LLM_TENSOR_DENSE_2_OUT, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}}, // Dense layer output
+     {LLM_TENSOR_DENSE_3_OUT, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}}, // Dense layer output
      {LLM_TENSOR_OUTPUT_NORM, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL}},
      {LLM_TENSOR_DEC_OUTPUT_NORM, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL}},
      {LLM_TENSOR_ENC_OUTPUT_NORM, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL}},
@@ -2317,6 +2406,9 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
      {LLM_TENSOR_FFN_DOWN_EXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
      {LLM_TENSOR_FFN_GATE_EXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
      {LLM_TENSOR_FFN_UP_EXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
+     {LLM_TENSOR_FFN_DOWN_CHEXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
+     {LLM_TENSOR_FFN_GATE_CHEXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
+     {LLM_TENSOR_FFN_UP_CHEXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
      {LLM_TENSOR_FFN_EXP_PROBS_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
      // altup / laurel (gemma 3n)
      {LLM_TENSOR_PER_LAYER_TOKEN_EMBD, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_GET_ROWS}},
@@ -2437,6 +2529,7 @@ bool llm_arch_is_hybrid(const llm_arch & arch) {
          case LLM_ARCH_PLAMO2:
          case LLM_ARCH_GRANITE_HYBRID:
          case LLM_ARCH_LFM2:
+         case LLM_ARCH_LFM2MOE:
          case LLM_ARCH_NEMOTRON_H:
              return true;
          default:
@@ -97,11 +97,14 @@ enum llm_arch {
      LLM_ARCH_SMOLLM3,
      LLM_ARCH_OPENAI_MOE,
      LLM_ARCH_LFM2,
+     LLM_ARCH_LFM2MOE,
      LLM_ARCH_DREAM,
      LLM_ARCH_SMALLTHINKER,
      LLM_ARCH_LLADA,
      LLM_ARCH_LLADA_MOE,
      LLM_ARCH_SEED_OSS,
+     LLM_ARCH_GROVEMOE,
+     LLM_ARCH_APERTUS,
      LLM_ARCH_UNKNOWN,
  };
 
@@ -129,6 +132,7 @@ enum llm_kv {
      LLM_KV_FEED_FORWARD_LENGTH,
      LLM_KV_EXPERT_FEED_FORWARD_LENGTH,
      LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH,
+     LLM_KV_EXPERT_CHUNK_FEED_FORWARD_LENGTH,
      LLM_KV_USE_PARALLEL_RESIDUAL,
      LLM_KV_TENSOR_DATA_LAYOUT,
      LLM_KV_EXPERT_COUNT,
@@ -137,6 +141,8 @@ enum llm_kv {
      LLM_KV_EXPERT_WEIGHTS_SCALE,
      LLM_KV_EXPERT_WEIGHTS_NORM,
      LLM_KV_EXPERT_GATING_FUNC,
+     LLM_KV_EXPERT_GROUP_SCALE,
+     LLM_KV_EXPERTS_PER_GROUP,
      LLM_KV_MOE_EVERY_N_LAYERS,
      LLM_KV_NEXTN_PREDICT_LAYERS,
      LLM_KV_POOLING_TYPE,
@@ -256,10 +262,21 @@ enum llm_kv {
 
      LLM_KV_SHORTCONV_L_CACHE,
 
+     LLM_KV_XIELU_ALPHA_N,
+     LLM_KV_XIELU_ALPHA_P,
+     LLM_KV_XIELU_BETA,
+     LLM_KV_XIELU_EPS,
+
      // deprecated:
      LLM_KV_TOKENIZER_PREFIX_ID,
      LLM_KV_TOKENIZER_SUFFIX_ID,
      LLM_KV_TOKENIZER_MIDDLE_ID,
+
+     // sentence-transformers dense layers in and out features
+     LLM_KV_DENSE_2_FEAT_IN,
+     LLM_KV_DENSE_2_FEAT_OUT,
+     LLM_KV_DENSE_3_FEAT_IN,
+     LLM_KV_DENSE_3_FEAT_OUT,
  };
 
  enum llm_tensor {
@@ -267,6 +284,8 @@ enum llm_tensor {
      LLM_TENSOR_TOKEN_EMBD_NORM,
      LLM_TENSOR_TOKEN_TYPES,
      LLM_TENSOR_POS_EMBD,
+     LLM_TENSOR_DENSE_2_OUT,
+     LLM_TENSOR_DENSE_3_OUT,
      LLM_TENSOR_OUTPUT,
      LLM_TENSOR_OUTPUT_NORM,
      LLM_TENSOR_ROPE_FREQS,
@@ -301,6 +320,9 @@ enum llm_tensor {
      LLM_TENSOR_FFN_DOWN_SHEXP,
      LLM_TENSOR_FFN_GATE_SHEXP,
      LLM_TENSOR_FFN_UP_SHEXP,
+     LLM_TENSOR_FFN_DOWN_CHEXPS,
+     LLM_TENSOR_FFN_GATE_CHEXPS,
+     LLM_TENSOR_FFN_UP_CHEXPS,
      LLM_TENSOR_FFN_EXP_PROBS_B,
      LLM_TENSOR_ATTN_Q_NORM,
      LLM_TENSOR_ATTN_K_NORM,
@@ -590,7 +590,7 @@ int32_t llm_chat_apply_template(
              ss << message->content << "<|end_of_text|>\n";
          }
          if (add_ass) {
-             ss << "<|start_of_role|>assistant<|end_of_role|>\n";
+             ss << "<|start_of_role|>assistant<|end_of_role|>";
          }
      } else if (tmpl == LLM_CHAT_TEMPLATE_GIGACHAT) {
          // GigaChat template
@@ -2346,6 +2346,12 @@ llama_context * llama_init_from_model(
          return nullptr;
      }
 
+     if (params.pooling_type != model->hparams.pooling_type) {
+         //user-specified pooling-type is different from the model default
+         LLAMA_LOG_WARN("%s: model default pooling_type is [%d], but [%d] was specified\n", __func__,
+             model->hparams.pooling_type, params.pooling_type);
+     }
+
      try {
          auto * ctx = new llama_context(*model, params);
          return ctx;