@fugood/llama.node 1.0.2 → 1.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. package/package.json +14 -14
  2. package/src/llama.cpp/CMakeLists.txt +0 -1
  3. package/src/llama.cpp/common/arg.cpp +7 -0
  4. package/src/llama.cpp/common/common.h +1 -0
  5. package/src/llama.cpp/ggml/CMakeLists.txt +7 -2
  6. package/src/llama.cpp/ggml/include/ggml.h +91 -10
  7. package/src/llama.cpp/ggml/src/CMakeLists.txt +0 -1
  8. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +6 -1
  9. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +12 -1
  10. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +726 -155
  11. package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +5 -0
  12. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +1 -1
  13. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +9 -9
  14. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +49 -9
  15. package/src/llama.cpp/include/llama.h +1 -0
  16. package/src/llama.cpp/src/llama-arch.cpp +90 -2
  17. package/src/llama.cpp/src/llama-arch.h +6 -0
  18. package/src/llama.cpp/src/llama-batch.cpp +27 -1
  19. package/src/llama.cpp/src/llama-batch.h +8 -1
  20. package/src/llama.cpp/src/llama-chat.cpp +15 -0
  21. package/src/llama.cpp/src/llama-chat.h +1 -0
  22. package/src/llama.cpp/src/llama-graph.cpp +64 -50
  23. package/src/llama.cpp/src/llama-graph.h +41 -16
  24. package/src/llama.cpp/src/llama-hparams.cpp +2 -1
  25. package/src/llama.cpp/src/llama-hparams.h +1 -0
  26. package/src/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +28 -18
  27. package/src/llama.cpp/src/llama-kv-cache-unified-iswa.h +4 -2
  28. package/src/llama.cpp/src/llama-kv-cache-unified.cpp +214 -65
  29. package/src/llama.cpp/src/llama-kv-cache-unified.h +62 -24
  30. package/src/llama.cpp/src/llama-kv-cells.h +62 -10
  31. package/src/llama.cpp/src/llama-memory-hybrid.cpp +9 -4
  32. package/src/llama.cpp/src/llama-memory-hybrid.h +3 -1
  33. package/src/llama.cpp/src/llama-memory-recurrent.cpp +15 -2
  34. package/src/llama.cpp/src/llama-memory.cpp +17 -0
  35. package/src/llama.cpp/src/llama-memory.h +3 -0
  36. package/src/llama.cpp/src/llama-model.cpp +1234 -248
  37. package/src/llama.cpp/src/llama-model.h +2 -0
  38. package/src/llama.cpp/src/llama-vocab.cpp +8 -1
  39. package/src/llama.cpp/ggml/include/ggml-kompute.h +0 -50
package/src/llama.cpp/ggml/src/ggml-cpu/ops.h
@@ -20,6 +20,9 @@

  static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float);

+ // Work buffer size for im2col operations in CONV2D
+ #define GGML_IM2COL_WORK_SIZE (16 * 1024 * 1024)
+
  #ifdef __cplusplus
  extern "C" {
  #endif
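Note: 16 * 1024 * 1024 bytes = 16 MiB, i.e. scratch room for 4,194,304 f32 values per im2col chunk. A minimal sketch of how such a fixed cap translates into a per-chunk row budget (hypothetical helper in C++, not code from this package; the constant is presumably consumed by the new ggml_compute_forward_conv_2d path in ggml-cpu/ops.cpp, whose hunks are not excerpted here):

    #include <cstddef>

    // mirrors GGML_IM2COL_WORK_SIZE from the hunk above
    constexpr std::size_t kIm2colWorkSize = 16 * 1024 * 1024;

    // given the byte size of one im2col output row, how many rows fit in one work-buffer chunk
    std::size_t rows_per_chunk(std::size_t row_bytes) {
        const std::size_t n = kIm2colWorkSize / row_bytes;
        return n > 0 ? n : 1; // always make progress, even if a single row exceeds the cap
    }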
@@ -65,6 +68,7 @@ void ggml_compute_forward_clamp(const struct ggml_compute_params * params, struc
  void ggml_compute_forward_conv_transpose_1d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
  void ggml_compute_forward_im2col(const struct ggml_compute_params * params, struct ggml_tensor * dst);
  void ggml_compute_forward_im2col_back_f32(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+ void ggml_compute_forward_conv_2d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
  void ggml_compute_forward_conv_transpose_2d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
  void ggml_compute_forward_conv_2d_dw(const struct ggml_compute_params * params, struct ggml_tensor * dst);
  void ggml_compute_forward_pool_1d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
@@ -107,6 +111,7 @@ void ggml_compute_forward_custom(const struct ggml_compute_params * params, stru
  void ggml_compute_forward_cross_entropy_loss(const struct ggml_compute_params * params, struct ggml_tensor * dst);
  void ggml_compute_forward_cross_entropy_loss_back(const struct ggml_compute_params * params, struct ggml_tensor * dst);
  void ggml_compute_forward_opt_step_adamw(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+ void ggml_compute_forward_mul_mat(const struct ggml_compute_params * params, struct ggml_tensor * dst);

  #ifdef __cplusplus
  }
package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h
@@ -189,7 +189,7 @@ inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
  #define GGML_F32xt_LOAD(...) GGML_F32xt_LOAD_IMPL(DEFAULT_PG, __VA_ARGS__)
  #define GGML_F32xt_STORE_IMPL(pg,a,b) svst1_f32(pg, a, b)
  #define GGML_F32xt_STORE(...) GGML_F32xt_STORE_IMPL(DEFAULT_PG, __VA_ARGS__)
- #define GGML_F32xt_FMA_IMPL(pg, a, b, c) svmad_f32_m(pg, a, b, c)
+ #define GGML_F32xt_FMA_IMPL(pg, a, b, c) svmad_f32_m(pg, b, c, a)
  #define GGML_F32xt_FMA(...) GGML_F32xt_FMA_IMPL(DEFAULT_PG, __VA_ARGS__)
  #define GGML_F32xt_ADD_IMPL(pg, a, b) svadd_f32_m(pg, a, b)
  #define GGML_F32xt_ADD(...) GGML_F32xt_ADD_IMPL(DEFAULT_PG, __VA_ARGS__)
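The SVE FMA macro now takes the accumulator as its first argument, matching the accumulator-first GGML_F32_VEC_FMA(a, b, c) = a + b*c convention of the other ggml SIMD back ends; the call sites in vec.cpp and vec.h below are updated to match. A reference expansion, assuming the Arm ACLE semantics svmad_f32_m(pg, op1, op2, op3) = op1*op2 + op3 for active lanes:

    // reference expansion, not code from this package
    // old: GGML_F32xt_FMA(ax, ay, sum) -> svmad_f32_m(pg, ax, ay, sum)  // ax*ay + sum, accumulator passed last
    // new: GGML_F32xt_FMA(sum, ax, ay) -> svmad_f32_m(pg, ax, ay, sum)  // same value, accumulator passed first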
package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp
@@ -37,35 +37,35 @@ void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const float * G
  for (int i = 0; i < np; i += ggml_f32_step) {
  ax1 = GGML_F32_VEC_LOAD(x + i);
  ay1 = GGML_F32_VEC_LOAD(y + i);
- sum1 = GGML_F32_VEC_FMA(ax1, ay1, sum1);
+ sum1 = GGML_F32_VEC_FMA(sum1, ax1, ay1);

  ax2 = GGML_F32_VEC_LOAD(x + i + 1*ggml_f32_epr);
  ay2 = GGML_F32_VEC_LOAD(y + i + 1*ggml_f32_epr);
- sum2 = GGML_F32_VEC_FMA(ax2, ay2, sum2);
+ sum2 = GGML_F32_VEC_FMA(sum2, ax2, ay2);

  ax3 = GGML_F32_VEC_LOAD(x + i + 2*ggml_f32_epr);
  ay3 = GGML_F32_VEC_LOAD(y + i + 2*ggml_f32_epr);
- sum3 = GGML_F32_VEC_FMA(ax3, ay3, sum3);
+ sum3 = GGML_F32_VEC_FMA(sum3, ax3, ay3);

  ax4 = GGML_F32_VEC_LOAD(x + i + 3*ggml_f32_epr);
  ay4 = GGML_F32_VEC_LOAD(y + i + 3*ggml_f32_epr);
- sum4 = GGML_F32_VEC_FMA(ax4, ay4, sum4);
+ sum4 = GGML_F32_VEC_FMA(sum4, ax4, ay4);

  ax5 = GGML_F32_VEC_LOAD(x + i + 4*ggml_f32_epr);
  ay5 = GGML_F32_VEC_LOAD(y + i + 4*ggml_f32_epr);
- sum5 = GGML_F32_VEC_FMA(ax5, ay5, sum5);
+ sum5 = GGML_F32_VEC_FMA(sum5, ax5, ay5);

  ax6 = GGML_F32_VEC_LOAD(x + i + 5*ggml_f32_epr);
  ay6 = GGML_F32_VEC_LOAD(y + i + 5*ggml_f32_epr);
- sum6 = GGML_F32_VEC_FMA(ax6, ay6, sum6);
+ sum6 = GGML_F32_VEC_FMA(sum6, ax6, ay6);

  ax7 = GGML_F32_VEC_LOAD(x + i + 6*ggml_f32_epr);
  ay7 = GGML_F32_VEC_LOAD(y + i + 6*ggml_f32_epr);
- sum7 = GGML_F32_VEC_FMA(ax7, ay7, sum7);
+ sum7 = GGML_F32_VEC_FMA(sum7, ax7, ay7);

  ax8 = GGML_F32_VEC_LOAD(x + i + 7*ggml_f32_epr);
  ay8 = GGML_F32_VEC_LOAD(y + i + 7*ggml_f32_epr);
- sum8 = GGML_F32_VEC_FMA(ax8, ay8, sum8);
+ sum8 = GGML_F32_VEC_FMA(sum8, ax8, ay8);
  }
  // leftovers
  // Since 8 unrolls are done in above loop, leftovers lie in range [0, ggml_f32_step] which is handled in below loop
@@ -73,7 +73,7 @@ void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const float * G
  for (int i = np; i < np2; i += ggml_f32_epr) {
  ax1 = GGML_F32_VEC_LOAD(x + i);
  ay1 = GGML_F32_VEC_LOAD(y + i);
- sum1 = GGML_F32_VEC_FMA(ax1, ay1, sum1);
+ sum1 = GGML_F32_VEC_FMA(sum1, ax1, ay1);
  }
  // maximum number of leftover elements will be less that ggml_f32_epr. Apply predicated svmad on available elements only
  if (np2 < n) {
package/src/llama.cpp/ggml/src/ggml-cpu/vec.h
@@ -163,49 +163,49 @@ inline static void ggml_vec_mad_f32(const int n, float * GGML_RESTRICT y, const

  ax1 = GGML_F32_VEC_LOAD(x + i);
  ay1 = GGML_F32_VEC_LOAD(y + i);
- ay1 = GGML_F32_VEC_FMA(ax1, vx, ay1);
+ ay1 = GGML_F32_VEC_FMA(ay1, ax1, vx);

  GGML_F32_VEC_STORE(y + i, ay1);

  ax2 = GGML_F32_VEC_LOAD(x + i + 1*ggml_f32_epr);
  ay2 = GGML_F32_VEC_LOAD(y + i + 1*ggml_f32_epr);
- ay2 = GGML_F32_VEC_FMA(ax2, vx, ay2);
+ ay2 = GGML_F32_VEC_FMA(ay2, ax2, vx);

  GGML_F32_VEC_STORE(y + i + 1*ggml_f32_epr, ay2);

  ax3 = GGML_F32_VEC_LOAD(x + i + 2*ggml_f32_epr);
  ay3 = GGML_F32_VEC_LOAD(y + i + 2*ggml_f32_epr);
- ay3 = GGML_F32_VEC_FMA(ax3, vx, ay3);
+ ay3 = GGML_F32_VEC_FMA(ay3, ax3, vx);

  GGML_F32_VEC_STORE(y + i + 2*ggml_f32_epr, ay3);

  ax4 = GGML_F32_VEC_LOAD(x + i + 3*ggml_f32_epr);
  ay4 = GGML_F32_VEC_LOAD(y + i + 3*ggml_f32_epr);
- ay4 = GGML_F32_VEC_FMA(ax4, vx, ay4);
+ ay4 = GGML_F32_VEC_FMA(ay4, ax4, vx);

  GGML_F32_VEC_STORE(y + i + 3*ggml_f32_epr, ay4);

  ax5 = GGML_F32_VEC_LOAD(x + i + 4*ggml_f32_epr);
  ay5 = GGML_F32_VEC_LOAD(y + i + 4*ggml_f32_epr);
- ay5 = GGML_F32_VEC_FMA(ax5, vx, ay5);
+ ay5 = GGML_F32_VEC_FMA(ay5, ax5, vx);

  GGML_F32_VEC_STORE(y + i + 4*ggml_f32_epr, ay5);

  ax6 = GGML_F32_VEC_LOAD(x + i + 5*ggml_f32_epr);
  ay6 = GGML_F32_VEC_LOAD(y + i + 5*ggml_f32_epr);
- ay6 = GGML_F32_VEC_FMA(ax6, vx, ay6);
+ ay6 = GGML_F32_VEC_FMA(ay6, ax6, vx);

  GGML_F32_VEC_STORE(y + i + 5*ggml_f32_epr, ay6);

  ax7 = GGML_F32_VEC_LOAD(x + i + 6*ggml_f32_epr);
  ay7 = GGML_F32_VEC_LOAD(y + i + 6*ggml_f32_epr);
- ay7 = GGML_F32_VEC_FMA(ax7, vx, ay7);
+ ay7 = GGML_F32_VEC_FMA(ay7, ax7, vx);

  GGML_F32_VEC_STORE(y + i + 6*ggml_f32_epr, ay7);

  ax8 = GGML_F32_VEC_LOAD(x + i + 7*ggml_f32_epr);
  ay8 = GGML_F32_VEC_LOAD(y + i + 7*ggml_f32_epr);
- ay8 = GGML_F32_VEC_FMA(ax8, vx, ay8);
+ ay8 = GGML_F32_VEC_FMA(ay8, ax8, vx);

  GGML_F32_VEC_STORE(y + i + 7*ggml_f32_epr, ay8);
  }
@@ -215,7 +215,7 @@ inline static void ggml_vec_mad_f32(const int n, float * GGML_RESTRICT y, const
  for (int i = np; i < np2; i += ggml_f32_epr) {
  ax1 = GGML_F32_VEC_LOAD(x + i);
  ay1 = GGML_F32_VEC_LOAD(y + i);
- ay1 = GGML_F32_VEC_FMA(ax1, vx, ay1);
+ ay1 = GGML_F32_VEC_FMA(ay1, ax1, vx);

  GGML_F32_VEC_STORE(y + i, ay1);
  }
@@ -959,6 +959,46 @@ inline static void ggml_vec_swiglu_f16(const int n, ggml_fp16_t * y, const ggml_
  }
  }

+ inline static void ggml_vec_geglu_erf_f32(const int n, float * y, const float * x, const float * g) {
+ for (int i = 0; i < n; ++i) {
+ float xi = x[i];
+ y[i] = 0.5f * xi * (1.0f + erff(xi*SQRT_2_INV)) * g[i];
+ }
+ }
+
+ inline static void ggml_vec_geglu_erf_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const ggml_fp16_t * g) {
+ for (int i = 0; i < n; ++i) {
+ float xi = GGML_CPU_FP16_TO_FP32(x[i]);
+ float gi = GGML_CPU_FP16_TO_FP32(g[i]);
+ y[i] = GGML_CPU_FP32_TO_FP16(0.5f * xi * (1.0f + erff(xi*SQRT_2_INV)) * gi);
+ }
+ }
+
+ #ifdef GGML_GELU_QUICK_FP16
+ inline static void ggml_vec_geglu_quick_f32(const int n, float * y, const float * x, const float * g) {
+ uint16_t t;
+ for (int i = 0; i < n; ++i) {
+ ggml_fp16_t fp16 = GGML_CPU_FP32_TO_FP16(x[i]);
+ memcpy(&t, &fp16, sizeof(uint16_t));
+ y[i] = GGML_CPU_FP16_TO_FP32(ggml_table_gelu_quick_f16[t]) * g[i];
+ }
+ }
+ #else
+ inline static void ggml_vec_geglu_quick_f32(const int n, float * y, const float * x, const float * g) {
+ for (int i = 0; i < n; ++i) {
+ y[i] = ggml_gelu_quick_f32(x[i]) * g[i];
+ }
+ }
+ #endif
+
+ inline static void ggml_vec_geglu_quick_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const ggml_fp16_t * g) {
+ const uint16_t * i16 = (const uint16_t *) x;
+ for (int i = 0; i < n; ++i) {
+ float v = GGML_CPU_FP16_TO_FP32(g[i]);
+ y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(ggml_table_gelu_quick_f16[i16[i]]) * v);
+ }
+ }
+
  inline static void ggml_vec_sum_f32(const int n, float * s, const float * x) {
  #ifndef GGML_USE_ACCELERATE
  ggml_float sum = 0.0;
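The new vec.h helpers compute gated activations of the form y[i] = act(x[i]) * g[i]: the geglu_erf variants use the exact erf-based GELU, 0.5*x*(1 + erf(x/sqrt(2))) (SQRT_2_INV presumably being the 1/sqrt(2) constant defined elsewhere in this header), while the geglu_quick variants reuse the gelu_quick approximation, with an fp16 lookup-table fast path when GGML_GELU_QUICK_FP16 is defined. A hypothetical standalone call of the f32 variant with made-up values, assuming this header is included:

    float x[4] = {-1.0f, 0.0f, 0.5f, 2.0f}; // gate input
    float g[4] = { 1.0f, 1.0f, 2.0f, 0.5f}; // values being gated
    float y[4];
    ggml_vec_geglu_erf_f32(4, y, x, g);     // y[i] = 0.5f*x[i]*(1.0f + erff(x[i]*SQRT_2_INV)) * g[i]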
package/src/llama.cpp/include/llama.h
@@ -117,6 +117,7 @@ extern "C" {
  LLAMA_VOCAB_PRE_TYPE_LLAMA4 = 33,
  LLAMA_VOCAB_PRE_TYPE_PIXTRAL = 34,
  LLAMA_VOCAB_PRE_TYPE_SEED_CODER = 35,
+ LLAMA_VOCAB_PRE_TYPE_HUNYUAN = 36,
  };

  enum llama_rope_type {
package/src/llama.cpp/src/llama-arch.cpp
@@ -45,6 +45,8 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
  { LLM_ARCH_GEMMA3N, "gemma3n" },
  { LLM_ARCH_STARCODER2, "starcoder2" },
  { LLM_ARCH_MAMBA, "mamba" },
+ { LLM_ARCH_MAMBA2, "mamba2" },
+ { LLM_ARCH_FALCON_H1, "falcon-h1" },
  { LLM_ARCH_XVERSE, "xverse" },
  { LLM_ARCH_COMMAND_R, "command-r" },
  { LLM_ARCH_COHERE2, "cohere2" },
@@ -77,6 +79,8 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
  { LLM_ARCH_DOTS1, "dots1" },
  { LLM_ARCH_ARCEE, "arcee" },
  { LLM_ARCH_ERNIE4_5, "ernie4_5" },
+ { LLM_ARCH_HUNYUAN_MOE, "hunyuan-moe" },
+ { LLM_ARCH_SMOLLM3, "smollm3" },
  { LLM_ARCH_UNKNOWN, "(unknown)" },
  };

@@ -170,6 +174,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
  { LLM_KV_SSM_INNER_SIZE, "%s.ssm.inner_size" },
  { LLM_KV_SSM_STATE_SIZE, "%s.ssm.state_size" },
  { LLM_KV_SSM_TIME_STEP_RANK, "%s.ssm.time_step_rank" },
+ { LLM_KV_SSM_GROUP_COUNT, "%s.ssm.group_count" },
  { LLM_KV_SSM_DT_B_C_RMS, "%s.ssm.dt_b_c_rms" },

  { LLM_KV_WKV_HEAD_SIZE, "%s.wkv.head_size" },
@@ -1004,6 +1009,46 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
  { LLM_TENSOR_SSM_OUT, "blk.%d.ssm_out" },
  },
  },
+ {
+ LLM_ARCH_MAMBA2,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_OUTPUT, "output" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_SSM_IN, "blk.%d.ssm_in" },
+ { LLM_TENSOR_SSM_CONV1D, "blk.%d.ssm_conv1d" },
+ { LLM_TENSOR_SSM_DT, "blk.%d.ssm_dt" },
+ { LLM_TENSOR_SSM_A, "blk.%d.ssm_a" },
+ { LLM_TENSOR_SSM_D, "blk.%d.ssm_d" },
+ { LLM_TENSOR_SSM_NORM, "blk.%d.ssm_norm" },
+ { LLM_TENSOR_SSM_OUT, "blk.%d.ssm_out" },
+ },
+ },
+ {
+ LLM_ARCH_FALCON_H1,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT, "output" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_SSM_IN, "blk.%d.ssm_in" },
+ { LLM_TENSOR_SSM_CONV1D, "blk.%d.ssm_conv1d" },
+ { LLM_TENSOR_SSM_DT, "blk.%d.ssm_dt" },
+ { LLM_TENSOR_SSM_A, "blk.%d.ssm_a" },
+ { LLM_TENSOR_SSM_D, "blk.%d.ssm_d" },
+ { LLM_TENSOR_SSM_NORM, "blk.%d.ssm_norm" },
+ { LLM_TENSOR_SSM_OUT, "blk.%d.ssm_out" },
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ },
+ },
  {
  LLM_ARCH_XVERSE,
  {
@@ -1676,12 +1721,52 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
  },
  },
+ {
+ LLM_ARCH_HUNYUAN_MOE,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_OUTPUT, "output" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+ { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+ { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+ { LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" },
+ { LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" },
+ { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
+ { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+ { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+ { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
+ },
+ },
  {
  LLM_ARCH_UNKNOWN,
  {
  { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
  },
  },
+ {
+ LLM_ARCH_SMOLLM3,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_OUTPUT, "output" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ },
+ },
  };

  static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
@@ -1761,6 +1846,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
  {LLM_TENSOR_SSM_CONV1D, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_SSM_CONV}},
  {LLM_TENSOR_SSM_A, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_SSM_SCAN}},
  {LLM_TENSOR_SSM_D, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+ {LLM_TENSOR_SSM_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
  {LLM_TENSOR_TIME_MIX_LERP_X, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
  {LLM_TENSOR_TIME_MIX_LN, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
  {LLM_TENSOR_CHANNEL_MIX_LERP_K, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
@@ -1894,6 +1980,7 @@ const llm_tensor_info & llm_tensor_info_for(llm_tensor tensor) {
  bool llm_arch_is_recurrent(const llm_arch & arch) {
  switch (arch) {
  case LLM_ARCH_MAMBA:
+ case LLM_ARCH_MAMBA2:
  case LLM_ARCH_RWKV6:
  case LLM_ARCH_RWKV6QWEN2:
  case LLM_ARCH_RWKV7:
@@ -1905,9 +1992,10 @@ bool llm_arch_is_recurrent(const llm_arch & arch) {
  }

  bool llm_arch_is_hybrid(const llm_arch & arch) {
- // TODO: There are currently no hybrid models! Once there are, this will be
- // the place to identify them
+ // List all mamba-attention hybrid models here
  switch (arch) {
+ case LLM_ARCH_FALCON_H1:
+ return true;
  default:
  return false;
  }
package/src/llama.cpp/src/llama-arch.h
@@ -49,6 +49,8 @@ enum llm_arch {
  LLM_ARCH_GEMMA3N,
  LLM_ARCH_STARCODER2,
  LLM_ARCH_MAMBA,
+ LLM_ARCH_MAMBA2,
+ LLM_ARCH_FALCON_H1,
  LLM_ARCH_XVERSE,
  LLM_ARCH_COMMAND_R,
  LLM_ARCH_COHERE2,
@@ -81,6 +83,8 @@ enum llm_arch {
  LLM_ARCH_DOTS1,
  LLM_ARCH_ARCEE,
  LLM_ARCH_ERNIE4_5,
+ LLM_ARCH_HUNYUAN_MOE,
+ LLM_ARCH_SMOLLM3,
  LLM_ARCH_UNKNOWN,
  };

@@ -174,6 +178,7 @@ enum llm_kv {
  LLM_KV_SSM_CONV_KERNEL,
  LLM_KV_SSM_STATE_SIZE,
  LLM_KV_SSM_TIME_STEP_RANK,
+ LLM_KV_SSM_GROUP_COUNT,
  LLM_KV_SSM_DT_B_C_RMS,

  LLM_KV_WKV_HEAD_SIZE,
@@ -293,6 +298,7 @@ enum llm_tensor {
  LLM_TENSOR_SSM_DT,
  LLM_TENSOR_SSM_A,
  LLM_TENSOR_SSM_D,
+ LLM_TENSOR_SSM_NORM,
  LLM_TENSOR_SSM_OUT,
  LLM_TENSOR_TIME_MIX_W0,
  LLM_TENSOR_TIME_MIX_W1,
package/src/llama.cpp/src/llama-batch.cpp
@@ -166,6 +166,8 @@ bool llama_batch_allocr::init(

  // note: tracking the other way around is not necessary for now
  //seq_cpl[s0][s1] = true;
+
+ has_cpl = true;
  }
  }
  }
@@ -405,6 +407,10 @@ uint32_t llama_batch_allocr::get_n_outputs() const {
  return n_outputs;
  }

+ uint32_t llama_batch_allocr::get_n_used() const {
+ return n_used;
+ }
+
  std::vector<int32_t> & llama_batch_allocr::get_out_ids() {
  return out_ids;
  }
@@ -420,6 +426,8 @@ llama_pos llama_batch_allocr::seq_pos_max(llama_seq_id seq_id) const {
  void llama_batch_allocr::split_reset() {
  out_ids.clear();

+ n_used = 0;
+
  used.clear();
  used.resize(get_n_tokens(), false);

@@ -444,6 +452,7 @@ llama_ubatch llama_batch_allocr::split_simple(uint32_t n_ubatch) {
  idxs.push_back(cur_idx);

  used[cur_idx] = true;
+ ++n_used;

  ++cur_idx;

@@ -459,9 +468,17 @@ llama_ubatch llama_batch_allocr::split_simple(uint32_t n_ubatch) {
  return ubatch_add(idxs, idxs.size(), false);
  }

- llama_ubatch llama_batch_allocr::split_equal(uint32_t n_ubatch) {
+ llama_ubatch llama_batch_allocr::split_equal(uint32_t n_ubatch, bool sequential) {
+ if (sequential && has_cpl) {
+ LLAMA_LOG_ERROR("%s: sequential split is not supported when there are coupled sequences in the input batch\n", __func__);
+
+ return {};
+ }
+
  std::vector<seq_set_t> cur_seq_set;

+ llama_seq_id last_seq_id = -1;
+
  // determine the non-overlapping sequence sets participating in this ubatch
  for (int32_t i = 0; i < batch.n_tokens; ++i) {
  if (used[i]) {
@@ -478,9 +495,16 @@ llama_ubatch llama_batch_allocr::split_equal(uint32_t n_ubatch) {
  }
  }

+ // accept only increasing sequence ids
+ if (sequential) {
+ add = add && (cur_seq_set.empty() || batch.seq_id[i][0] == last_seq_id + 1);
+ }
+
  if (add) {
  cur_seq_set.push_back(seq_set[i]);

+ last_seq_id = batch.seq_id[i][0];
+
  if (cur_seq_set.size() > n_ubatch) {
  break;
  }
@@ -529,6 +553,7 @@ llama_ubatch llama_batch_allocr::split_equal(uint32_t n_ubatch) {
  idxs_per_seq[s].push_back(idx);

  used[idx] = true;
+ ++n_used;

  ++cur_idx[s];
  }
@@ -570,6 +595,7 @@ llama_ubatch llama_batch_allocr::split_seq(uint32_t n_ubatch) {
  idxs.push_back(cur_idx);

  used[cur_idx] = true;
+ ++n_used;

  if (idxs.size() >= n_ubatch) {
  break;
package/src/llama.cpp/src/llama-batch.h
@@ -54,6 +54,7 @@ public:

  uint32_t get_n_tokens() const;
  uint32_t get_n_outputs() const;
+ uint32_t get_n_used() const;

  // the array of output indices in the order they were encountered during the ubatch splitting
  std::vector<int32_t> & get_out_ids();
@@ -69,7 +70,8 @@ public:
  llama_ubatch split_simple(uint32_t n_ubatch);

  // make ubatches of equal-length sequences sets
- llama_ubatch split_equal(uint32_t n_ubatch);
+ // if sequential == true, the tokens in the ubatch will have increasing sequential sequence ids
+ llama_ubatch split_equal(uint32_t n_ubatch, bool sequential);

  // sequence-set-wise split - each ubatch contains a single sequence-set
  llama_ubatch split_seq(uint32_t n_ubatch);
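A hypothetical call site for the changed split_equal API (balloc is an illustrative llama_batch_allocr instance, not code from this package):

    llama_ubatch ub = balloc.split_equal(n_ubatch, /*sequential=*/true);
    if (ub.n_tokens == 0) {
        // the sequential mode is refused (an empty ubatch is returned) when the batch
        // contains coupled sequences -- see the has_cpl check in llama-batch.cpp above
    }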
@@ -112,6 +114,9 @@ private:
  using pos_set_t = std::set<llama_pos>;
  using seq_cpl_t = std::vector<bool>;

+ // helper flag to quickly determine if there are any coupled sequences in the batch
+ bool has_cpl;
+
  std::vector<pos_set_t> seq_pos; // seq_pos[s]: the set of positions in sequence s
  std::vector<seq_cpl_t> seq_cpl; // seq_cpl[s0][s1]: if sequence s0 is coupled to sequence s1

@@ -125,6 +130,8 @@ private:
  // batch indices of the output
  std::vector<int32_t> out_ids;

+ uint32_t n_used;
+
  // used[i] indicates if token i has already been used in a previous ubatch
  std::vector<bool> used;

package/src/llama.cpp/src/llama-chat.cpp
@@ -64,6 +64,7 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
  { "bailing", LLM_CHAT_TEMPLATE_BAILING },
  { "llama4", LLM_CHAT_TEMPLATE_LLAMA4 },
  { "smolvlm", LLM_CHAT_TEMPLATE_SMOLVLM },
+ { "hunyuan-moe", LLM_CHAT_TEMPLATE_HUNYUAN_MOE },
  };

  llm_chat_template llm_chat_template_from_str(const std::string & name) {
@@ -185,6 +186,8 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
  return LLM_CHAT_TEMPLATE_LLAMA4;
  } else if (tmpl_contains("<|endofuserprompt|>")) {
  return LLM_CHAT_TEMPLATE_DOTS1;
+ } else if (tmpl_contains("<|startoftext|>") && tmpl_contains("<|extra_4|>")) {
+ return LLM_CHAT_TEMPLATE_HUNYUAN_MOE;
  }
  return LLM_CHAT_TEMPLATE_UNKNOWN;
  }
@@ -665,6 +668,18 @@ int32_t llm_chat_apply_template(
  if (add_ass) {
  ss << "<|response|>";
  }
+ } else if (tmpl == LLM_CHAT_TEMPLATE_HUNYUAN_MOE) {
+ // tencent/Hunyuan-A13B-Instruct
+ for (auto message : chat) {
+ std::string role(message->role);
+ if (role == "system") {
+ ss << "<|startoftext|>" << message->content << "<|extra_4|>";
+ } else if (role == "assistant") {
+ ss << "<|startoftext|>" << message->content << "<|eos|>";
+ } else {
+ ss << "<|startoftext|>" << message->content << "<|extra_0|>";
+ }
+ }
  } else {
  // template not supported
  return -1;
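Based on the branch above, a minimal system + user exchange would render roughly as follows (illustrative message content; BOS handling and the assistant-generation suffix for add_ass are outside the lines shown here):

    <|startoftext|>You are a helpful assistant.<|extra_4|><|startoftext|>Hello!<|extra_0|>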
package/src/llama.cpp/src/llama-chat.h
@@ -44,6 +44,7 @@ enum llm_chat_template {
  LLM_CHAT_TEMPLATE_LLAMA4,
  LLM_CHAT_TEMPLATE_SMOLVLM,
  LLM_CHAT_TEMPLATE_DOTS1,
+ LLM_CHAT_TEMPLATE_HUNYUAN_MOE,
  LLM_CHAT_TEMPLATE_UNKNOWN,
  };