@fugood/llama.node 1.0.1 → 1.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. package/package.json +14 -14
  2. package/scripts/llama.cpp.patch +12 -12
  3. package/src/llama.cpp/CMakeLists.txt +0 -1
  4. package/src/llama.cpp/common/arg.cpp +17 -0
  5. package/src/llama.cpp/common/chat.cpp +37 -20
  6. package/src/llama.cpp/common/chat.h +2 -0
  7. package/src/llama.cpp/common/common.h +4 -0
  8. package/src/llama.cpp/ggml/CMakeLists.txt +7 -2
  9. package/src/llama.cpp/ggml/include/ggml-backend.h +1 -1
  10. package/src/llama.cpp/ggml/include/ggml-cpu.h +1 -0
  11. package/src/llama.cpp/ggml/include/ggml.h +181 -10
  12. package/src/llama.cpp/ggml/src/CMakeLists.txt +0 -1
  13. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +6 -1
  14. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +38 -1
  15. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +1 -0
  16. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +1297 -211
  17. package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +7 -0
  18. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +1 -1
  19. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +33 -9
  20. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +103 -9
  21. package/src/llama.cpp/include/llama.h +1 -0
  22. package/src/llama.cpp/src/llama-arch.cpp +108 -2
  23. package/src/llama.cpp/src/llama-arch.h +7 -0
  24. package/src/llama.cpp/src/llama-batch.cpp +27 -1
  25. package/src/llama.cpp/src/llama-batch.h +8 -1
  26. package/src/llama.cpp/src/llama-chat.cpp +15 -0
  27. package/src/llama.cpp/src/llama-chat.h +1 -0
  28. package/src/llama.cpp/src/llama-graph.cpp +95 -81
  29. package/src/llama.cpp/src/llama-graph.h +43 -16
  30. package/src/llama.cpp/src/llama-hparams.cpp +2 -1
  31. package/src/llama.cpp/src/llama-hparams.h +1 -0
  32. package/src/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +28 -18
  33. package/src/llama.cpp/src/llama-kv-cache-unified-iswa.h +4 -2
  34. package/src/llama.cpp/src/llama-kv-cache-unified.cpp +214 -65
  35. package/src/llama.cpp/src/llama-kv-cache-unified.h +62 -24
  36. package/src/llama.cpp/src/llama-kv-cells.h +62 -10
  37. package/src/llama.cpp/src/llama-memory-hybrid.cpp +9 -4
  38. package/src/llama.cpp/src/llama-memory-hybrid.h +3 -1
  39. package/src/llama.cpp/src/llama-memory-recurrent.cpp +34 -16
  40. package/src/llama.cpp/src/llama-memory.cpp +17 -0
  41. package/src/llama.cpp/src/llama-memory.h +3 -0
  42. package/src/llama.cpp/src/llama-model.cpp +1374 -210
  43. package/src/llama.cpp/src/llama-model.h +3 -0
  44. package/src/llama.cpp/src/llama-vocab.cpp +8 -1
  45. package/src/llama.cpp/ggml/include/ggml-kompute.h +0 -50
@@ -20,6 +20,9 @@
 
 static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float);
 
+// Work buffer size for im2col operations in CONV2D
+#define GGML_IM2COL_WORK_SIZE (16 * 1024 * 1024)
+
 #ifdef __cplusplus
 extern "C" {
 #endif
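
Note: the new GGML_IM2COL_WORK_SIZE constant caps the scratch buffer used for im2col in the new CONV2D path (presumably consumed in ggml-cpu/ops.cpp, which grows by ~1300 lines in this release). The diff does not show how it is used, so the following is only a sketch of the chunking arithmetic such a cap implies; the helper name and parameters are illustrative, not taken from the package.

    #include <cstdint>
    #include <cstdio>

    #define GGML_IM2COL_WORK_SIZE (16 * 1024 * 1024)

    // Hypothetical helper: how many im2col patches (columns of ic*kh*kw floats)
    // fit into the fixed work buffer. Illustrative only - not the diff's code.
    static int64_t patches_per_chunk(int64_t ic, int64_t kh, int64_t kw) {
        const int64_t bytes_per_patch = ic * kh * kw * (int64_t) sizeof(float);
        return GGML_IM2COL_WORK_SIZE / bytes_per_patch;
    }

    int main() {
        // e.g. a 3x3 kernel over 256 input channels -> ~1820 patches per pass
        std::printf("%lld\n", (long long) patches_per_chunk(256, 3, 3));
        return 0;
    }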
@@ -53,6 +56,7 @@ void ggml_compute_forward_permute(const struct ggml_compute_params * params, str
 void ggml_compute_forward_transpose(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_get_rows(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_get_rows_back(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_set_rows(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_diag(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_diag_mask_inf(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_diag_mask_zero(const struct ggml_compute_params * params, struct ggml_tensor * dst);
@@ -64,6 +68,7 @@ void ggml_compute_forward_clamp(const struct ggml_compute_params * params, struc
 void ggml_compute_forward_conv_transpose_1d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_im2col(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_im2col_back_f32(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_conv_2d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_conv_transpose_2d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_conv_2d_dw(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_pool_1d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
@@ -93,6 +98,7 @@ void ggml_compute_forward_ssm_scan(const struct ggml_compute_params * params, st
 void ggml_compute_forward_win_part(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_win_unpart(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_unary(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_glu(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_get_rel_pos(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_add_rel_pos(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_rwkv_wkv6(const struct ggml_compute_params * params, struct ggml_tensor * dst);
@@ -105,6 +111,7 @@ void ggml_compute_forward_custom(const struct ggml_compute_params * params, stru
 void ggml_compute_forward_cross_entropy_loss(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_cross_entropy_loss_back(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_opt_step_adamw(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_mul_mat(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 
 #ifdef __cplusplus
 }
@@ -189,7 +189,7 @@ inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
 #define GGML_F32xt_LOAD(...) GGML_F32xt_LOAD_IMPL(DEFAULT_PG, __VA_ARGS__)
 #define GGML_F32xt_STORE_IMPL(pg,a,b) svst1_f32(pg, a, b)
 #define GGML_F32xt_STORE(...) GGML_F32xt_STORE_IMPL(DEFAULT_PG, __VA_ARGS__)
-#define GGML_F32xt_FMA_IMPL(pg, a, b, c) svmad_f32_m(pg, a, b, c)
+#define GGML_F32xt_FMA_IMPL(pg, a, b, c) svmad_f32_m(pg, b, c, a)
 #define GGML_F32xt_FMA(...) GGML_F32xt_FMA_IMPL(DEFAULT_PG, __VA_ARGS__)
 #define GGML_F32xt_ADD_IMPL(pg, a, b) svadd_f32_m(pg, a, b)
 #define GGML_F32xt_ADD(...) GGML_F32xt_ADD_IMPL(DEFAULT_PG, __VA_ARGS__)
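
Note: the SVE change above reorders the operands so GGML_F32xt_FMA_IMPL(pg, a, b, c) now expands to svmad_f32_m(pg, b, c, a), i.e. b*c + a, and every call site in vec.cpp and vec.h below is updated to pass the accumulator first. A scalar model of the new convention, with an illustrative macro name:

    #include <cstdio>

    // Scalar stand-in for the updated convention: FMA(acc, x, y) == acc + x*y.
    // On SVE this is now expressed as svmad_f32_m(pg, b, c, a); the macro name
    // here is illustrative, not the library's.
    #define F32_FMA(acc, x, y) ((acc) + (x) * (y))

    int main() {
        const float x[4] = {1.0f, 2.0f, 3.0f, 4.0f};
        const float y[4] = {5.0f, 6.0f, 7.0f, 8.0f};
        float sum = 0.0f;
        for (int i = 0; i < 4; ++i) {
            sum = F32_FMA(sum, x[i], y[i]); // mirrors sum1 = GGML_F32_VEC_FMA(sum1, ax1, ay1)
        }
        std::printf("%f\n", sum); // 70.000000
        return 0;
    }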
@@ -37,35 +37,35 @@ void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const float * G
     for (int i = 0; i < np; i += ggml_f32_step) {
         ax1 = GGML_F32_VEC_LOAD(x + i);
         ay1 = GGML_F32_VEC_LOAD(y + i);
-        sum1 = GGML_F32_VEC_FMA(ax1, ay1, sum1);
+        sum1 = GGML_F32_VEC_FMA(sum1, ax1, ay1);
 
         ax2 = GGML_F32_VEC_LOAD(x + i + 1*ggml_f32_epr);
         ay2 = GGML_F32_VEC_LOAD(y + i + 1*ggml_f32_epr);
-        sum2 = GGML_F32_VEC_FMA(ax2, ay2, sum2);
+        sum2 = GGML_F32_VEC_FMA(sum2, ax2, ay2);
 
         ax3 = GGML_F32_VEC_LOAD(x + i + 2*ggml_f32_epr);
         ay3 = GGML_F32_VEC_LOAD(y + i + 2*ggml_f32_epr);
-        sum3 = GGML_F32_VEC_FMA(ax3, ay3, sum3);
+        sum3 = GGML_F32_VEC_FMA(sum3, ax3, ay3);
 
         ax4 = GGML_F32_VEC_LOAD(x + i + 3*ggml_f32_epr);
         ay4 = GGML_F32_VEC_LOAD(y + i + 3*ggml_f32_epr);
-        sum4 = GGML_F32_VEC_FMA(ax4, ay4, sum4);
+        sum4 = GGML_F32_VEC_FMA(sum4, ax4, ay4);
 
         ax5 = GGML_F32_VEC_LOAD(x + i + 4*ggml_f32_epr);
         ay5 = GGML_F32_VEC_LOAD(y + i + 4*ggml_f32_epr);
-        sum5 = GGML_F32_VEC_FMA(ax5, ay5, sum5);
+        sum5 = GGML_F32_VEC_FMA(sum5, ax5, ay5);
 
         ax6 = GGML_F32_VEC_LOAD(x + i + 5*ggml_f32_epr);
         ay6 = GGML_F32_VEC_LOAD(y + i + 5*ggml_f32_epr);
-        sum6 = GGML_F32_VEC_FMA(ax6, ay6, sum6);
+        sum6 = GGML_F32_VEC_FMA(sum6, ax6, ay6);
 
         ax7 = GGML_F32_VEC_LOAD(x + i + 6*ggml_f32_epr);
         ay7 = GGML_F32_VEC_LOAD(y + i + 6*ggml_f32_epr);
-        sum7 = GGML_F32_VEC_FMA(ax7, ay7, sum7);
+        sum7 = GGML_F32_VEC_FMA(sum7, ax7, ay7);
 
         ax8 = GGML_F32_VEC_LOAD(x + i + 7*ggml_f32_epr);
         ay8 = GGML_F32_VEC_LOAD(y + i + 7*ggml_f32_epr);
-        sum8 = GGML_F32_VEC_FMA(ax8, ay8, sum8);
+        sum8 = GGML_F32_VEC_FMA(sum8, ax8, ay8);
     }
     // leftovers
     // Since 8 unrolls are done in above loop, leftovers lie in range [0, ggml_f32_step] which is handled in below loop
@@ -73,7 +73,7 @@ void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const float * G
     for (int i = np; i < np2; i += ggml_f32_epr) {
         ax1 = GGML_F32_VEC_LOAD(x + i);
         ay1 = GGML_F32_VEC_LOAD(y + i);
-        sum1 = GGML_F32_VEC_FMA(ax1, ay1, sum1);
+        sum1 = GGML_F32_VEC_FMA(sum1, ax1, ay1);
     }
     // maximum number of leftover elements will be less that ggml_f32_epr. Apply predicated svmad on available elements only
     if (np2 < n) {
@@ -254,6 +254,30 @@ void ggml_vec_silu_f32(const int n, float * y, const float * x) {
     }
 }
 
+void ggml_vec_swiglu_f32(const int n, float * y, const float * x, const float * g) {
+    int i = 0;
+#if defined(__AVX512F__) && defined(__AVX512DQ__)
+    for (; i + 15 < n; i += 16) {
+        _mm512_storeu_ps(y + i, _mm512_mul_ps(ggml_v_silu(_mm512_loadu_ps(x + i)), _mm512_loadu_ps(g + i)));
+    }
+#elif defined(__AVX2__) && defined(__FMA__)
+    for (; i + 7 < n; i += 8) {
+        _mm256_storeu_ps(y + i, _mm256_mul_ps(ggml_v_silu(_mm256_loadu_ps(x + i)), _mm256_loadu_ps(g + i)));
+    }
+#elif defined(__SSE2__)
+    for (; i + 3 < n; i += 4) {
+        _mm_storeu_ps(y + i, _mm_mul_ps(ggml_v_silu(_mm_loadu_ps(x + i)), _mm_loadu_ps(g + i)));
+    }
+#elif defined(__ARM_NEON) && defined(__aarch64__)
+    for (; i + 3 < n; i += 4) {
+        vst1q_f32(y + i, vmulq_f32(ggml_v_silu(vld1q_f32(x + i)), vld1q_f32(g + i)));
+    }
+#endif
+    for (; i < n; ++i) {
+        y[i] = ggml_silu_f32(x[i]) * g[i];
+    }
+}
+
 ggml_float ggml_vec_soft_max_f32(const int n, float * y, const float * x, float max) {
     int i = 0;
     ggml_float sum = 0;
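
Note: the scalar tail of the new ggml_vec_swiglu_f32 computes y[i] = silu(x[i]) * g[i], which the f16 variant in vec.h spells out as v/(1+exp(-v)) * w; the SIMD branches above are vectorized forms of the same math. A standalone scalar reference:

    #include <cmath>
    #include <cstdio>

    // Scalar reference for SwiGLU as used by ggml_vec_swiglu_f32: y = silu(x) * g,
    // with silu(x) = x * sigmoid(x).
    static void vec_swiglu_ref(int n, float * y, const float * x, const float * g) {
        for (int i = 0; i < n; ++i) {
            const float silu = x[i] / (1.0f + std::exp(-x[i]));
            y[i] = silu * g[i];
        }
    }

    int main() {
        const float x[3] = {-1.0f, 0.0f, 2.0f};
        const float g[3] = { 2.0f, 3.0f, 0.5f};
        float y[3];
        vec_swiglu_ref(3, y, x, g);
        for (float v : y) {
            std::printf("%f\n", v);
        }
        return 0;
    }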
@@ -163,49 +163,49 @@ inline static void ggml_vec_mad_f32(const int n, float * GGML_RESTRICT y, const
 
         ax1 = GGML_F32_VEC_LOAD(x + i);
         ay1 = GGML_F32_VEC_LOAD(y + i);
-        ay1 = GGML_F32_VEC_FMA(ax1, vx, ay1);
+        ay1 = GGML_F32_VEC_FMA(ay1, ax1, vx);
 
         GGML_F32_VEC_STORE(y + i, ay1);
 
         ax2 = GGML_F32_VEC_LOAD(x + i + 1*ggml_f32_epr);
         ay2 = GGML_F32_VEC_LOAD(y + i + 1*ggml_f32_epr);
-        ay2 = GGML_F32_VEC_FMA(ax2, vx, ay2);
+        ay2 = GGML_F32_VEC_FMA(ay2, ax2, vx);
 
         GGML_F32_VEC_STORE(y + i + 1*ggml_f32_epr, ay2);
 
         ax3 = GGML_F32_VEC_LOAD(x + i + 2*ggml_f32_epr);
         ay3 = GGML_F32_VEC_LOAD(y + i + 2*ggml_f32_epr);
-        ay3 = GGML_F32_VEC_FMA(ax3, vx, ay3);
+        ay3 = GGML_F32_VEC_FMA(ay3, ax3, vx);
 
         GGML_F32_VEC_STORE(y + i + 2*ggml_f32_epr, ay3);
 
         ax4 = GGML_F32_VEC_LOAD(x + i + 3*ggml_f32_epr);
         ay4 = GGML_F32_VEC_LOAD(y + i + 3*ggml_f32_epr);
-        ay4 = GGML_F32_VEC_FMA(ax4, vx, ay4);
+        ay4 = GGML_F32_VEC_FMA(ay4, ax4, vx);
 
         GGML_F32_VEC_STORE(y + i + 3*ggml_f32_epr, ay4);
 
         ax5 = GGML_F32_VEC_LOAD(x + i + 4*ggml_f32_epr);
         ay5 = GGML_F32_VEC_LOAD(y + i + 4*ggml_f32_epr);
-        ay5 = GGML_F32_VEC_FMA(ax5, vx, ay5);
+        ay5 = GGML_F32_VEC_FMA(ay5, ax5, vx);
 
         GGML_F32_VEC_STORE(y + i + 4*ggml_f32_epr, ay5);
 
         ax6 = GGML_F32_VEC_LOAD(x + i + 5*ggml_f32_epr);
         ay6 = GGML_F32_VEC_LOAD(y + i + 5*ggml_f32_epr);
-        ay6 = GGML_F32_VEC_FMA(ax6, vx, ay6);
+        ay6 = GGML_F32_VEC_FMA(ay6, ax6, vx);
 
         GGML_F32_VEC_STORE(y + i + 5*ggml_f32_epr, ay6);
 
         ax7 = GGML_F32_VEC_LOAD(x + i + 6*ggml_f32_epr);
         ay7 = GGML_F32_VEC_LOAD(y + i + 6*ggml_f32_epr);
-        ay7 = GGML_F32_VEC_FMA(ax7, vx, ay7);
+        ay7 = GGML_F32_VEC_FMA(ay7, ax7, vx);
 
         GGML_F32_VEC_STORE(y + i + 6*ggml_f32_epr, ay7);
 
         ax8 = GGML_F32_VEC_LOAD(x + i + 7*ggml_f32_epr);
         ay8 = GGML_F32_VEC_LOAD(y + i + 7*ggml_f32_epr);
-        ay8 = GGML_F32_VEC_FMA(ax8, vx, ay8);
+        ay8 = GGML_F32_VEC_FMA(ay8, ax8, vx);
 
         GGML_F32_VEC_STORE(y + i + 7*ggml_f32_epr, ay8);
     }
@@ -215,7 +215,7 @@ inline static void ggml_vec_mad_f32(const int n, float * GGML_RESTRICT y, const
     for (int i = np; i < np2; i += ggml_f32_epr) {
         ax1 = GGML_F32_VEC_LOAD(x + i);
         ay1 = GGML_F32_VEC_LOAD(y + i);
-        ay1 = GGML_F32_VEC_FMA(ax1, vx, ay1);
+        ay1 = GGML_F32_VEC_FMA(ay1, ax1, vx);
 
         GGML_F32_VEC_STORE(y + i, ay1);
     }
@@ -905,6 +905,100 @@ inline static void ggml_vec_silu_backward_f16(const int n, ggml_fp16_t * dx, con
     }
 }
 
+inline static void ggml_vec_reglu_f32 (const int n, float * y, const float * x, const float * g) {
+    for (int i = 0; i < n; ++i) {
+        y[i] = (x[i] > 0.f) ? x[i] * g[i] : 0.f;
+    }
+}
+
+inline static void ggml_vec_reglu_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const ggml_fp16_t * g) {
+    for (int i = 0; i < n; ++i) {
+        float v = GGML_CPU_FP16_TO_FP32(x[i]);
+        y[i] = GGML_CPU_FP32_TO_FP16((v > 0.f) ? v * GGML_CPU_FP16_TO_FP32(g[i]) : 0.f);
+    }
+}
+
+#ifdef GGML_GELU_FP16
+inline static void ggml_vec_geglu_f32(const int n, float * y, const float * x, const float * g) {
+    uint16_t t;
+    for (int i = 0; i < n; ++i) {
+        if (x[i] <= -10.0f) {
+            y[i] = 0.0f;
+        } else if (x[i] >= 10.0f) {
+            y[i] = x[i] * g[i];
+        } else {
+            ggml_fp16_t fp16 = GGML_CPU_FP32_TO_FP16(x[i]);
+            memcpy(&t, &fp16, sizeof(uint16_t));
+            y[i] = GGML_CPU_FP16_TO_FP32(ggml_table_gelu_f16[t]) * g[i];
+        }
+    }
+}
+#else
+inline static void ggml_vec_geglu_f32(const int n, float * y, const float * x, const float * g) {
+    for (int i = 0; i < n; ++i) {
+        y[i] = ggml_gelu_f32(x[i]) * g[i];
+    }
+}
+#endif
+
+inline static void ggml_vec_geglu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const ggml_fp16_t * g) {
+    const uint16_t * i16 = (const uint16_t *) x;
+    for (int i = 0; i < n; ++i) {
+        float v = GGML_CPU_FP16_TO_FP32(g[i]);
+        y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(ggml_table_gelu_f16[i16[i]]) * v);
+    }
+}
+
+void ggml_vec_swiglu_f32(const int n, float * y, const float * x, const float * g);
+
+inline static void ggml_vec_swiglu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const ggml_fp16_t * g) {
+    for (int i = 0; i < n; ++i) {
+        float v = GGML_CPU_FP16_TO_FP32(x[i]);
+        float w = GGML_CPU_FP16_TO_FP32(g[i]);
+        y[i] = GGML_CPU_FP32_TO_FP16((v/(1.0f + expf(-v))) * w);
+    }
+}
+
+inline static void ggml_vec_geglu_erf_f32(const int n, float * y, const float * x, const float * g) {
+    for (int i = 0; i < n; ++i) {
+        float xi = x[i];
+        y[i] = 0.5f * xi * (1.0f + erff(xi*SQRT_2_INV)) * g[i];
+    }
+}
+
+inline static void ggml_vec_geglu_erf_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const ggml_fp16_t * g) {
+    for (int i = 0; i < n; ++i) {
+        float xi = GGML_CPU_FP16_TO_FP32(x[i]);
+        float gi = GGML_CPU_FP16_TO_FP32(g[i]);
+        y[i] = GGML_CPU_FP32_TO_FP16(0.5f * xi * (1.0f + erff(xi*SQRT_2_INV)) * gi);
+    }
+}
+
+#ifdef GGML_GELU_QUICK_FP16
+inline static void ggml_vec_geglu_quick_f32(const int n, float * y, const float * x, const float * g) {
+    uint16_t t;
+    for (int i = 0; i < n; ++i) {
+        ggml_fp16_t fp16 = GGML_CPU_FP32_TO_FP16(x[i]);
+        memcpy(&t, &fp16, sizeof(uint16_t));
+        y[i] = GGML_CPU_FP16_TO_FP32(ggml_table_gelu_quick_f16[t]) * g[i];
+    }
+}
+#else
+inline static void ggml_vec_geglu_quick_f32(const int n, float * y, const float * x, const float * g) {
+    for (int i = 0; i < n; ++i) {
+        y[i] = ggml_gelu_quick_f32(x[i]) * g[i];
+    }
+}
+#endif
+
+inline static void ggml_vec_geglu_quick_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const ggml_fp16_t * g) {
+    const uint16_t * i16 = (const uint16_t *) x;
+    for (int i = 0; i < n; ++i) {
+        float v = GGML_CPU_FP16_TO_FP32(g[i]);
+        y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(ggml_table_gelu_quick_f16[i16[i]]) * v);
+    }
+}
+
 inline static void ggml_vec_sum_f32(const int n, float * s, const float * x) {
 #ifndef GGML_USE_ACCELERATE
     ggml_float sum = 0.0;
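
Note: these new vec.h helpers appear to be the per-element kernels behind the GLU operator declared in ops.h; each gated variant multiplies an activation of x by the gate g. A scalar model of two of them, assuming SQRT_2_INV is the usual 1/sqrt(2) constant used elsewhere in ggml:

    #include <cmath>
    #include <cstdio>

    // Assumed to match ggml's constant; not copied from this diff.
    #define SQRT_2_INV 0.70710678118654752440f

    // Scalar models of two of the gated activations added above (f32 variants):
    // ReLU gate and exact (erf-based) GELU gate.
    static float reglu(float x, float g) {
        return (x > 0.0f) ? x * g : 0.0f;
    }

    static float geglu_erf(float x, float g) {
        return 0.5f * x * (1.0f + std::erf(x * SQRT_2_INV)) * g;
    }

    int main() {
        std::printf("reglu(1.5, 2)     = %f\n", reglu(1.5f, 2.0f));
        std::printf("geglu_erf(1.5, 2) = %f\n", geglu_erf(1.5f, 2.0f));
        return 0;
    }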
@@ -117,6 +117,7 @@ extern "C" {
         LLAMA_VOCAB_PRE_TYPE_LLAMA4 = 33,
         LLAMA_VOCAB_PRE_TYPE_PIXTRAL = 34,
         LLAMA_VOCAB_PRE_TYPE_SEED_CODER = 35,
+        LLAMA_VOCAB_PRE_TYPE_HUNYUAN = 36,
     };
 
     enum llama_rope_type {
@@ -45,6 +45,8 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_GEMMA3N, "gemma3n" },
     { LLM_ARCH_STARCODER2, "starcoder2" },
     { LLM_ARCH_MAMBA, "mamba" },
+    { LLM_ARCH_MAMBA2, "mamba2" },
+    { LLM_ARCH_FALCON_H1, "falcon-h1" },
     { LLM_ARCH_XVERSE, "xverse" },
     { LLM_ARCH_COMMAND_R, "command-r" },
     { LLM_ARCH_COHERE2, "cohere2" },
@@ -76,6 +78,9 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_BAILINGMOE, "bailingmoe" },
     { LLM_ARCH_DOTS1, "dots1" },
     { LLM_ARCH_ARCEE, "arcee" },
+    { LLM_ARCH_ERNIE4_5, "ernie4_5" },
+    { LLM_ARCH_HUNYUAN_MOE, "hunyuan-moe" },
+    { LLM_ARCH_SMOLLM3, "smollm3" },
     { LLM_ARCH_UNKNOWN, "(unknown)" },
 };
 
@@ -169,6 +174,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_SSM_INNER_SIZE, "%s.ssm.inner_size" },
     { LLM_KV_SSM_STATE_SIZE, "%s.ssm.state_size" },
     { LLM_KV_SSM_TIME_STEP_RANK, "%s.ssm.time_step_rank" },
+    { LLM_KV_SSM_GROUP_COUNT, "%s.ssm.group_count" },
     { LLM_KV_SSM_DT_B_C_RMS, "%s.ssm.dt_b_c_rms" },
 
     { LLM_KV_WKV_HEAD_SIZE, "%s.wkv.head_size" },
@@ -1003,6 +1009,46 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_SSM_OUT, "blk.%d.ssm_out" },
         },
     },
+    {
+        LLM_ARCH_MAMBA2,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_SSM_IN, "blk.%d.ssm_in" },
+            { LLM_TENSOR_SSM_CONV1D, "blk.%d.ssm_conv1d" },
+            { LLM_TENSOR_SSM_DT, "blk.%d.ssm_dt" },
+            { LLM_TENSOR_SSM_A, "blk.%d.ssm_a" },
+            { LLM_TENSOR_SSM_D, "blk.%d.ssm_d" },
+            { LLM_TENSOR_SSM_NORM, "blk.%d.ssm_norm" },
+            { LLM_TENSOR_SSM_OUT, "blk.%d.ssm_out" },
+        },
+    },
+    {
+        LLM_ARCH_FALCON_H1,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_SSM_IN, "blk.%d.ssm_in" },
+            { LLM_TENSOR_SSM_CONV1D, "blk.%d.ssm_conv1d" },
+            { LLM_TENSOR_SSM_DT, "blk.%d.ssm_dt" },
+            { LLM_TENSOR_SSM_A, "blk.%d.ssm_a" },
+            { LLM_TENSOR_SSM_D, "blk.%d.ssm_d" },
+            { LLM_TENSOR_SSM_NORM, "blk.%d.ssm_norm" },
+            { LLM_TENSOR_SSM_OUT, "blk.%d.ssm_out" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+        },
+    },
     {
         LLM_ARCH_XVERSE,
         {
@@ -1658,12 +1704,69 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_FFN_EXP_PROBS_B, "blk.%d.exp_probs_b" },
         }
     },
+    {
+        LLM_ARCH_ERNIE4_5,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+        },
+    },
+    {
+        LLM_ARCH_HUNYUAN_MOE,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" },
+            { LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" },
+            { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
+            { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+            { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+            { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
+        },
+    },
     {
         LLM_ARCH_UNKNOWN,
         {
             { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
         },
     },
+    {
+        LLM_ARCH_SMOLLM3,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+        },
+    },
 };
 
 static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
@@ -1743,6 +1846,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
     {LLM_TENSOR_SSM_CONV1D, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_SSM_CONV}},
     {LLM_TENSOR_SSM_A, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_SSM_SCAN}},
     {LLM_TENSOR_SSM_D, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+    {LLM_TENSOR_SSM_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
     {LLM_TENSOR_TIME_MIX_LERP_X, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
     {LLM_TENSOR_TIME_MIX_LN, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
     {LLM_TENSOR_CHANNEL_MIX_LERP_K, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
@@ -1876,6 +1980,7 @@ const llm_tensor_info & llm_tensor_info_for(llm_tensor tensor) {
 bool llm_arch_is_recurrent(const llm_arch & arch) {
     switch (arch) {
         case LLM_ARCH_MAMBA:
+        case LLM_ARCH_MAMBA2:
         case LLM_ARCH_RWKV6:
         case LLM_ARCH_RWKV6QWEN2:
         case LLM_ARCH_RWKV7:
@@ -1887,9 +1992,10 @@ bool llm_arch_is_recurrent(const llm_arch & arch) {
 }
 
 bool llm_arch_is_hybrid(const llm_arch & arch) {
-    // TODO: There are currently no hybrid models! Once there are, this will be
-    // the place to identify them
+    // List all mamba-attention hybrid models here
     switch (arch) {
+        case LLM_ARCH_FALCON_H1:
+            return true;
         default:
             return false;
     }
@@ -49,6 +49,8 @@ enum llm_arch {
     LLM_ARCH_GEMMA3N,
     LLM_ARCH_STARCODER2,
     LLM_ARCH_MAMBA,
+    LLM_ARCH_MAMBA2,
+    LLM_ARCH_FALCON_H1,
     LLM_ARCH_XVERSE,
     LLM_ARCH_COMMAND_R,
     LLM_ARCH_COHERE2,
@@ -80,6 +82,9 @@ enum llm_arch {
     LLM_ARCH_BAILINGMOE,
     LLM_ARCH_DOTS1,
     LLM_ARCH_ARCEE,
+    LLM_ARCH_ERNIE4_5,
+    LLM_ARCH_HUNYUAN_MOE,
+    LLM_ARCH_SMOLLM3,
     LLM_ARCH_UNKNOWN,
 };
 
@@ -173,6 +178,7 @@ enum llm_kv {
    LLM_KV_SSM_CONV_KERNEL,
    LLM_KV_SSM_STATE_SIZE,
    LLM_KV_SSM_TIME_STEP_RANK,
+   LLM_KV_SSM_GROUP_COUNT,
    LLM_KV_SSM_DT_B_C_RMS,
 
    LLM_KV_WKV_HEAD_SIZE,
@@ -292,6 +298,7 @@ enum llm_tensor {
     LLM_TENSOR_SSM_DT,
     LLM_TENSOR_SSM_A,
     LLM_TENSOR_SSM_D,
+    LLM_TENSOR_SSM_NORM,
     LLM_TENSOR_SSM_OUT,
     LLM_TENSOR_TIME_MIX_W0,
     LLM_TENSOR_TIME_MIX_W1,
@@ -166,6 +166,8 @@ bool llama_batch_allocr::init(
 
                 // note: tracking the other way around is not necessary for now
                 //seq_cpl[s0][s1] = true;
+
+                has_cpl = true;
             }
         }
     }
@@ -405,6 +407,10 @@ uint32_t llama_batch_allocr::get_n_outputs() const {
     return n_outputs;
 }
 
+uint32_t llama_batch_allocr::get_n_used() const {
+    return n_used;
+}
+
 std::vector<int32_t> & llama_batch_allocr::get_out_ids() {
     return out_ids;
 }
@@ -420,6 +426,8 @@ llama_pos llama_batch_allocr::seq_pos_max(llama_seq_id seq_id) const {
 void llama_batch_allocr::split_reset() {
     out_ids.clear();
 
+    n_used = 0;
+
     used.clear();
     used.resize(get_n_tokens(), false);
 
@@ -444,6 +452,7 @@ llama_ubatch llama_batch_allocr::split_simple(uint32_t n_ubatch) {
         idxs.push_back(cur_idx);
 
         used[cur_idx] = true;
+        ++n_used;
 
         ++cur_idx;
 
@@ -459,9 +468,17 @@ llama_ubatch llama_batch_allocr::split_simple(uint32_t n_ubatch) {
     return ubatch_add(idxs, idxs.size(), false);
 }
 
-llama_ubatch llama_batch_allocr::split_equal(uint32_t n_ubatch) {
+llama_ubatch llama_batch_allocr::split_equal(uint32_t n_ubatch, bool sequential) {
+    if (sequential && has_cpl) {
+        LLAMA_LOG_ERROR("%s: sequential split is not supported when there are coupled sequences in the input batch\n", __func__);
+
+        return {};
+    }
+
     std::vector<seq_set_t> cur_seq_set;
 
+    llama_seq_id last_seq_id = -1;
+
     // determine the non-overlapping sequence sets participating in this ubatch
     for (int32_t i = 0; i < batch.n_tokens; ++i) {
         if (used[i]) {
@@ -478,9 +495,16 @@ llama_ubatch llama_batch_allocr::split_equal(uint32_t n_ubatch) {
             }
         }
 
+        // accept only increasing sequence ids
+        if (sequential) {
+            add = add && (cur_seq_set.empty() || batch.seq_id[i][0] == last_seq_id + 1);
+        }
+
         if (add) {
             cur_seq_set.push_back(seq_set[i]);
 
+            last_seq_id = batch.seq_id[i][0];
+
             if (cur_seq_set.size() > n_ubatch) {
                 break;
             }
@@ -529,6 +553,7 @@ llama_ubatch llama_batch_allocr::split_equal(uint32_t n_ubatch) {
             idxs_per_seq[s].push_back(idx);
 
             used[idx] = true;
+            ++n_used;
 
             ++cur_idx[s];
         }
@@ -570,6 +595,7 @@ llama_ubatch llama_batch_allocr::split_seq(uint32_t n_ubatch) {
         idxs.push_back(cur_idx);
 
         used[cur_idx] = true;
+        ++n_used;
 
         if (idxs.size() >= n_ubatch) {
            break;
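
Note: with sequential == true, split_equal accepts a new sequence set only when its primary sequence id is exactly last_seq_id + 1 (or it is the first set), so the resulting ubatch spans consecutive, increasing sequence ids; when the batch contains coupled sequences it refuses and returns an empty ubatch. A simplified standalone model of that acceptance rule (it ignores the overlap checks the real code also performs):

    #include <cstdint>
    #include <iostream>
    #include <vector>

    int main() {
        // Primary sequence ids of candidate sequence sets, in batch order (example data).
        const std::vector<int32_t> seq_ids = {3, 4, 5, 7, 8};

        std::vector<int32_t> accepted;
        int32_t last_seq_id = -1;

        for (const int32_t id : seq_ids) {
            // accept only increasing, consecutive sequence ids - mirrors
            // add = add && (cur_seq_set.empty() || batch.seq_id[i][0] == last_seq_id + 1);
            const bool add = accepted.empty() || id == last_seq_id + 1;
            if (add) {
                accepted.push_back(id);
                last_seq_id = id;
            }
        }

        for (const int32_t id : accepted) {
            std::cout << id << ' '; // prints: 3 4 5
        }
        std::cout << '\n';
        return 0;
    }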
@@ -54,6 +54,7 @@ public:
 
     uint32_t get_n_tokens() const;
     uint32_t get_n_outputs() const;
+    uint32_t get_n_used() const;
 
     // the array of output indices in the order they were encountered during the ubatch splitting
     std::vector<int32_t> & get_out_ids();
@@ -69,7 +70,8 @@ public:
     llama_ubatch split_simple(uint32_t n_ubatch);
 
     // make ubatches of equal-length sequences sets
-    llama_ubatch split_equal(uint32_t n_ubatch);
+    // if sequential == true, the tokens in the ubatch will have increasing sequential sequence ids
+    llama_ubatch split_equal(uint32_t n_ubatch, bool sequential);
 
     // sequence-set-wise split - each ubatch contains a single sequence-set
     llama_ubatch split_seq(uint32_t n_ubatch);
@@ -112,6 +114,9 @@ private:
     using pos_set_t = std::set<llama_pos>;
     using seq_cpl_t = std::vector<bool>;
 
+    // helper flag to quickly determine if there are any coupled sequences in the batch
+    bool has_cpl;
+
     std::vector<pos_set_t> seq_pos; // seq_pos[s]: the set of positions in sequence s
     std::vector<seq_cpl_t> seq_cpl; // seq_cpl[s0][s1]: if sequence s0 is coupled to sequence s1
 
@@ -125,6 +130,8 @@ private:
     // batch indices of the output
     std::vector<int32_t> out_ids;
 
+    uint32_t n_used;
+
     // used[i] indicates if token i has already been used in a previous ubatch
     std::vector<bool> used;
 
@@ -64,6 +64,7 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
     { "bailing", LLM_CHAT_TEMPLATE_BAILING },
     { "llama4", LLM_CHAT_TEMPLATE_LLAMA4 },
     { "smolvlm", LLM_CHAT_TEMPLATE_SMOLVLM },
+    { "hunyuan-moe", LLM_CHAT_TEMPLATE_HUNYUAN_MOE },
 };
 
 llm_chat_template llm_chat_template_from_str(const std::string & name) {
@@ -185,6 +186,8 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
         return LLM_CHAT_TEMPLATE_LLAMA4;
     } else if (tmpl_contains("<|endofuserprompt|>")) {
         return LLM_CHAT_TEMPLATE_DOTS1;
+    } else if (tmpl_contains("<|startoftext|>") && tmpl_contains("<|extra_4|>")) {
+        return LLM_CHAT_TEMPLATE_HUNYUAN_MOE;
     }
     return LLM_CHAT_TEMPLATE_UNKNOWN;
 }
@@ -665,6 +668,18 @@ int32_t llm_chat_apply_template(
         if (add_ass) {
             ss << "<|response|>";
         }
+    } else if (tmpl == LLM_CHAT_TEMPLATE_HUNYUAN_MOE) {
+        // tencent/Hunyuan-A13B-Instruct
+        for (auto message : chat) {
+            std::string role(message->role);
+            if (role == "system") {
+                ss << "<|startoftext|>" << message->content << "<|extra_4|>";
+            } else if (role == "assistant") {
+                ss << "<|startoftext|>" << message->content << "<|eos|>";
+            } else {
+                ss << "<|startoftext|>" << message->content << "<|extra_0|>";
+            }
+        }
     } else {
         // template not supported
         return -1;
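
Note: the rendering produced by the new LLM_CHAT_TEMPLATE_HUNYUAN_MOE branch can be reproduced standalone; the snippet below mirrors the ss << lines added above for a small conversation (any add_ass handling is not shown in this hunk, so it is omitted here).

    #include <iostream>
    #include <sstream>
    #include <string>
    #include <utility>
    #include <vector>

    // Standalone rendering of the Hunyuan-A13B-Instruct style template branch;
    // role names and special tokens are taken directly from the diff above.
    int main() {
        const std::vector<std::pair<std::string, std::string>> chat = {
            {"system",    "You are a helpful assistant."},
            {"user",      "Hello"},
            {"assistant", "Hi there!"},
        };

        std::ostringstream ss;
        for (const auto & [role, content] : chat) {
            if (role == "system") {
                ss << "<|startoftext|>" << content << "<|extra_4|>";
            } else if (role == "assistant") {
                ss << "<|startoftext|>" << content << "<|eos|>";
            } else {
                ss << "<|startoftext|>" << content << "<|extra_0|>";
            }
        }
        std::cout << ss.str() << "\n";
        return 0;
    }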