@fugood/llama.node 1.1.5 → 1.1.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35)
  1. package/package.json +14 -14
  2. package/scripts/llama.cpp.patch +17 -13
  3. package/src/LlamaCompletionWorker.cpp +2 -0
  4. package/src/llama.cpp/common/arg.cpp +28 -11
  5. package/src/llama.cpp/common/chat.cpp +46 -2
  6. package/src/llama.cpp/common/chat.h +7 -2
  7. package/src/llama.cpp/common/common.h +3 -2
  8. package/src/llama.cpp/ggml/CMakeLists.txt +3 -2
  9. package/src/llama.cpp/ggml/include/ggml.h +37 -1
  10. package/src/llama.cpp/ggml/src/CMakeLists.txt +12 -1
  11. package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +61 -0
  12. package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +96 -8
  13. package/src/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +6 -0
  14. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +14 -1
  15. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +207 -9
  16. package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +2 -7
  17. package/src/llama.cpp/ggml/src/ggml-cpu/quants.c +35 -0
  18. package/src/llama.cpp/ggml/src/ggml-cpu/quants.h +8 -0
  19. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +19 -4
  20. package/src/llama.cpp/include/llama.h +1 -0
  21. package/src/llama.cpp/src/llama-arch.cpp +65 -0
  22. package/src/llama.cpp/src/llama-arch.h +10 -0
  23. package/src/llama.cpp/src/llama-chat.cpp +13 -0
  24. package/src/llama.cpp/src/llama-chat.h +1 -0
  25. package/src/llama.cpp/src/llama-context.cpp +8 -8
  26. package/src/llama.cpp/src/llama-graph.cpp +118 -9
  27. package/src/llama.cpp/src/llama-graph.h +38 -0
  28. package/src/llama.cpp/src/llama-hparams.h +5 -3
  29. package/src/llama.cpp/src/llama-kv-cache-unified.cpp +4 -0
  30. package/src/llama.cpp/src/llama-model-loader.cpp +1 -0
  31. package/src/llama.cpp/src/llama-model-loader.h +3 -2
  32. package/src/llama.cpp/src/llama-model.cpp +499 -4
  33. package/src/llama.cpp/src/llama-model.h +24 -4
  34. package/src/llama.cpp/src/llama-quant.cpp +37 -1
  35. package/src/llama.cpp/src/llama-vocab.cpp +42 -0
package/src/llama.cpp/ggml/src/ggml-cpu/vec.h
@@ -55,7 +55,22 @@ inline static void ggml_vec_cpy_i32(const int n, int32_t * y, const int32_t * x)
 
 inline static void ggml_vec_set_f16(const int n, ggml_fp16_t * x, const ggml_fp16_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
 inline static void ggml_vec_set_bf16(const int n, ggml_bf16_t * x, const ggml_bf16_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
-inline static void ggml_vec_add_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i] + y[i]; }
+
+inline static void ggml_vec_add_f32 (const int n, float * z, const float * x, const float * y) {
+    int i = 0;
+#if defined(__AVX2__)
+    for (; i + 7 < n; i += 8) {
+        __m256 vx = _mm256_loadu_ps(x + i);
+        __m256 vy = _mm256_loadu_ps(y + i);
+        __m256 vz = _mm256_add_ps(vx, vy);
+        _mm256_storeu_ps(z + i, vz);
+    }
+#endif
+    for (; i < n; ++i) {
+        z[i] = x[i] + y[i];
+    }
+}
+
 inline static void ggml_vec_add_f16 (const int n, ggml_fp16_t * z, const ggml_fp16_t * x, const ggml_fp16_t * y) {
     for (int i = 0; i < n; ++i) {
         z[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(x[i]) + GGML_CPU_FP16_TO_FP32(y[i]));
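For reference, the rewritten ggml_vec_add_f32 follows a common SIMD pattern: process eight floats per 256-bit register on AVX2 builds, then finish the remainder with a scalar tail. A minimal standalone sketch of the same pattern (the vec_add_f32 helper and the test values below are illustrative, not part of the package):

    // Illustrative AVX2 + scalar-tail vector add; mirrors the pattern of the patched ggml_vec_add_f32.
    #include <cstdio>
    #include <vector>
    #if defined(__AVX2__)
    #include <immintrin.h>
    #endif

    static void vec_add_f32(int n, float * z, const float * x, const float * y) {
        int i = 0;
    #if defined(__AVX2__)
        for (; i + 7 < n; i += 8) {   // 8 floats per iteration
            _mm256_storeu_ps(z + i, _mm256_add_ps(_mm256_loadu_ps(x + i), _mm256_loadu_ps(y + i)));
        }
    #endif
        for (; i < n; ++i) {          // scalar tail when n is not a multiple of 8
            z[i] = x[i] + y[i];
        }
    }

    int main() {
        const int n = 13;             // deliberately not a multiple of 8 to exercise the tail
        std::vector<float> x(n), y(n), z(n);
        for (int i = 0; i < n; ++i) { x[i] = (float) i; y[i] = 2.0f * i; }
        vec_add_f32(n, z.data(), x.data(), y.data());
        for (int i = 0; i < n; ++i) { printf("%g ", z[i]); }  // 0 3 6 ... 36
        printf("\n");
    }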
@@ -992,9 +1007,9 @@ void ggml_vec_swiglu_f32(const int n, float * y, const float * x, const float *
 
 inline static void ggml_vec_swiglu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const ggml_fp16_t * g) {
     for (int i = 0; i < n; ++i) {
-        float v = GGML_CPU_FP16_TO_FP32(x[i]);
-        float w = GGML_CPU_FP16_TO_FP32(g[i]);
-        y[i] = GGML_CPU_FP32_TO_FP16((v/(1.0f + expf(-v))) * w);
+        float xi = GGML_CPU_FP16_TO_FP32(x[i]);
+        float gi = GGML_CPU_FP16_TO_FP32(g[i]);
+        y[i] = GGML_CPU_FP32_TO_FP16((xi/(1.0f + expf(-xi))) * gi);
     }
 }
 
package/src/llama.cpp/include/llama.h
@@ -152,6 +152,7 @@ extern "C" {
         //LLAMA_FTYPE_MOSTLY_Q4_0_8_8 = 35, // removed from gguf files, use Q4_0 and runtime repack
         LLAMA_FTYPE_MOSTLY_TQ1_0 = 36, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_TQ2_0 = 37, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_MXFP4_MOE = 38, // except 1d tensors
 
         LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
     };
package/src/llama.cpp/src/llama-arch.cpp
@@ -62,6 +62,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_DEEPSEEK2, "deepseek2" },
     { LLM_ARCH_CHATGLM, "chatglm" },
     { LLM_ARCH_GLM4, "glm4" },
+    { LLM_ARCH_GLM4_MOE, "glm4moe" },
     { LLM_ARCH_BITNET, "bitnet" },
     { LLM_ARCH_T5, "t5" },
     { LLM_ARCH_T5ENCODER, "t5encoder" },
@@ -87,6 +88,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_HUNYUAN_MOE, "hunyuan-moe" },
     { LLM_ARCH_HUNYUAN_DENSE, "hunyuan-dense" },
     { LLM_ARCH_SMOLLM3, "smollm3" },
+    { LLM_ARCH_OPENAI_MOE, "gpt-oss" },
     { LLM_ARCH_LFM2, "lfm2" },
     { LLM_ARCH_DREAM, "dream" },
     { LLM_ARCH_SMALLTHINKER, "smallthinker" },
@@ -127,6 +129,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_EXPERT_WEIGHTS_NORM, "%s.expert_weights_norm" },
     { LLM_KV_EXPERT_GATING_FUNC, "%s.expert_gating_func" },
     { LLM_KV_MOE_EVERY_N_LAYERS, "%s.moe_every_n_layers" },
+    { LLM_KV_NEXTN_PREDICT_LAYERS, "%s.nextn_predict_layers" },
     { LLM_KV_POOLING_TYPE, "%s.pooling_type" },
     { LLM_KV_LOGIT_SCALE, "%s.logit_scale" },
     { LLM_KV_DECODER_START_TOKEN_ID, "%s.decoder_start_token_id" },
@@ -1391,6 +1394,40 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
         { LLM_TENSOR_FFN_POST_NORM, "blk.%d.post_ffw_norm" },
     },
 },
+{
+    LLM_ARCH_GLM4_MOE,
+    {
+        { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+        { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+        { LLM_TENSOR_OUTPUT, "output" },
+        { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+        { LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" },
+        { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+        { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+        { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+        { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+        { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
+        { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
+        { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+        { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+        { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+        { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
+        { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+        { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+        { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
+        { LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" },
+        { LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" },
+        { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
+        { LLM_TENSOR_FFN_EXP_PROBS_B, "blk.%d.exp_probs_b" },
+        // NextN/MTP tensors - preserved but unused (in final layer, dynamic layer number)
+        { LLM_TENSOR_NEXTN_EH_PROJ, "blk.%d.nextn.eh_proj" },
+        { LLM_TENSOR_NEXTN_EMBED_TOKENS, "blk.%d.nextn.embed_tokens" },
+        { LLM_TENSOR_NEXTN_ENORM, "blk.%d.nextn.enorm" },
+        { LLM_TENSOR_NEXTN_HNORM, "blk.%d.nextn.hnorm" },
+        { LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "blk.%d.nextn.shared_head_head" },
+        { LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "blk.%d.nextn.shared_head_norm" },
+    },
+},
 {
     LLM_ARCH_BITNET,
     {
@@ -1935,6 +1972,25 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
         { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
     },
 },
+{
+    LLM_ARCH_OPENAI_MOE,
+    {
+        { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+        { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+        { LLM_TENSOR_OUTPUT, "output" },
+        { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+        { LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" },
+        { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+        { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+        { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+        { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+        { LLM_TENSOR_ATTN_SINKS, "blk.%d.attn_sinks" },
+        { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
+        { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+        { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+        { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
+    },
+},
 {
     LLM_ARCH_LFM2,
     {
@@ -2050,6 +2106,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
     {LLM_TENSOR_ATTN_KV_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_ATTN_K_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_ATTN_V_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_ATTN_SINKS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_SCALE}},
     {LLM_TENSOR_DEC_ATTN_Q, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_DEC_ATTN_K, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_DEC_ATTN_V, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
@@ -2181,6 +2238,14 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
     {LLM_TENSOR_SHORTCONV_CONV, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_SSM_CONV}},
     {LLM_TENSOR_SHORTCONV_INPROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_SHORTCONV_OUTPROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    // NextN/MTP tensors are currently ignored (reserved for future MTP support)
+    // These tensors only exist in the last layer(s) and are treated as output tensors
+    {LLM_TENSOR_NEXTN_EH_PROJ, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_NEXTN_EMBED_TOKENS, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_GET_ROWS}},
+    {LLM_TENSOR_NEXTN_ENORM, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_GET_ROWS}},
+    {LLM_TENSOR_NEXTN_HNORM, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL}},
+    {LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL}},
 };
 
 LLM_KV::LLM_KV(llm_arch arch, const char * suffix) : arch(arch), suffix(suffix) {}
package/src/llama.cpp/src/llama-arch.h
@@ -66,6 +66,7 @@ enum llm_arch {
     LLM_ARCH_DEEPSEEK2,
     LLM_ARCH_CHATGLM,
     LLM_ARCH_GLM4,
+    LLM_ARCH_GLM4_MOE,
     LLM_ARCH_BITNET,
     LLM_ARCH_T5,
     LLM_ARCH_T5ENCODER,
@@ -91,6 +92,7 @@ enum llm_arch {
     LLM_ARCH_HUNYUAN_MOE,
     LLM_ARCH_HUNYUAN_DENSE,
     LLM_ARCH_SMOLLM3,
+    LLM_ARCH_OPENAI_MOE,
     LLM_ARCH_LFM2,
     LLM_ARCH_DREAM,
     LLM_ARCH_SMALLTHINKER,
@@ -131,6 +133,7 @@ enum llm_kv {
     LLM_KV_EXPERT_WEIGHTS_NORM,
     LLM_KV_EXPERT_GATING_FUNC,
     LLM_KV_MOE_EVERY_N_LAYERS,
+    LLM_KV_NEXTN_PREDICT_LAYERS,
     LLM_KV_POOLING_TYPE,
     LLM_KV_LOGIT_SCALE,
     LLM_KV_DECODER_START_TOKEN_ID,
@@ -263,6 +266,7 @@ enum llm_tensor {
     LLM_TENSOR_ATTN_OUT_NORM,
     LLM_TENSOR_ATTN_POST_NORM,
     LLM_TENSOR_ATTN_ROT_EMBD,
+    LLM_TENSOR_ATTN_SINKS,
     LLM_TENSOR_FFN_GATE_INP,
     LLM_TENSOR_FFN_GATE_INP_SHEXP,
     LLM_TENSOR_FFN_NORM,
@@ -409,6 +413,12 @@ enum llm_tensor {
     LLM_TENSOR_SHORTCONV_CONV,
     LLM_TENSOR_SHORTCONV_INPROJ,
     LLM_TENSOR_SHORTCONV_OUTPROJ,
+    LLM_TENSOR_NEXTN_EH_PROJ,
+    LLM_TENSOR_NEXTN_EMBED_TOKENS,
+    LLM_TENSOR_NEXTN_ENORM,
+    LLM_TENSOR_NEXTN_HNORM,
+    LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD,
+    LLM_TENSOR_NEXTN_SHARED_HEAD_NORM,
 };
 
 enum llm_tensor_layer {
package/src/llama.cpp/src/llama-chat.cpp
@@ -66,6 +66,7 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
     { "llama4", LLM_CHAT_TEMPLATE_LLAMA4 },
     { "smolvlm", LLM_CHAT_TEMPLATE_SMOLVLM },
     { "hunyuan-moe", LLM_CHAT_TEMPLATE_HUNYUAN_MOE },
+    { "gpt-oss", LLM_CHAT_TEMPLATE_OPENAI_MOE },
     { "hunyuan-dense", LLM_CHAT_TEMPLATE_HUNYUAN_DENSE },
     { "kimi-k2", LLM_CHAT_TEMPLATE_KIMI_K2 },
 };
@@ -194,6 +195,8 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
         return LLM_CHAT_TEMPLATE_DOTS1;
     } else if (tmpl_contains("<|startoftext|>") && tmpl_contains("<|extra_4|>")) {
         return LLM_CHAT_TEMPLATE_HUNYUAN_MOE;
+    } else if (tmpl_contains("<|start|>") && tmpl_contains("<|channel|>")) {
+        return LLM_CHAT_TEMPLATE_OPENAI_MOE;
     } else if (tmpl_contains("<|hy_place▁holder▁no▁2|>") && tmpl_contains("<|hy_place▁holder▁no▁3|>")) {
         return LLM_CHAT_TEMPLATE_HUNYUAN_DENSE;
     } else if (tmpl_contains("<|im_assistant|>assistant<|im_middle|>")) {
@@ -706,6 +709,16 @@ int32_t llm_chat_apply_template(
                 ss << "<|startoftext|>" << message->content << "<|extra_0|>";
             }
         }
+    } else if (tmpl == LLM_CHAT_TEMPLATE_OPENAI_MOE) {
+        // OpenAI MoE (based on Harmony chat template)
+        for (auto message : chat) {
+            std::string role(message->role);
+            ss << "<|start|>" << role << "<|message|>" << message->content;
+            ss << (role == "assistant" ? "<|return|>" : "<|end|>");
+        }
+        if (add_ass) {
+            ss << "<|start|>assistant";
+        }
     } else if (tmpl == LLM_CHAT_TEMPLATE_HUNYUAN_DENSE) {
         // tencent/Hunyuan-4B-Instruct
         for (size_t i = 0; i < chat.size(); i++) {
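To make the new template concrete, the following standalone sketch mirrors the loop above and prints the rendered prompt for a short conversation. The Message struct is a stand-in for llama_chat_message and exists only for this illustration:

    #include <cstdio>
    #include <sstream>
    #include <string>
    #include <vector>

    struct Message { std::string role, content; };  // stand-in for llama_chat_message

    int main() {
        const std::vector<Message> chat = {
            { "user",      "Hello"        },
            { "assistant", "Hi there!"    },
            { "user",      "What is 2+2?" },
        };
        const bool add_ass = true;

        std::ostringstream ss;
        for (const auto & m : chat) {
            ss << "<|start|>" << m.role << "<|message|>" << m.content;
            ss << (m.role == "assistant" ? "<|return|>" : "<|end|>");
        }
        if (add_ass) {
            ss << "<|start|>assistant";  // leave the prompt open for the model's reply
        }
        printf("%s\n", ss.str().c_str());
        // <|start|>user<|message|>Hello<|end|><|start|>assistant<|message|>Hi there!<|return|><|start|>user<|message|>What is 2+2?<|end|><|start|>assistant
    }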
package/src/llama.cpp/src/llama-chat.h
@@ -46,6 +46,7 @@ enum llm_chat_template {
     LLM_CHAT_TEMPLATE_SMOLVLM,
     LLM_CHAT_TEMPLATE_DOTS1,
     LLM_CHAT_TEMPLATE_HUNYUAN_MOE,
+    LLM_CHAT_TEMPLATE_OPENAI_MOE,
     LLM_CHAT_TEMPLATE_HUNYUAN_DENSE,
     LLM_CHAT_TEMPLATE_KIMI_K2,
     LLM_CHAT_TEMPLATE_UNKNOWN,
package/src/llama.cpp/src/llama-context.cpp
@@ -786,7 +786,7 @@ int llama_context::encode(const llama_batch & batch_inp) {
     const auto & hparams = model.hparams;
 
     const int64_t n_embd = hparams.n_embd;
-    const int32_t n_vocab = model.vocab.n_tokens();
+    const int64_t n_vocab = model.vocab.n_tokens();
 
     // note: during encode, we always pass the full sequence starting from pos = 0
     if (!balloc->init(batch_inp, model.vocab, nullptr, n_embd, cparams.kv_unified ? LLAMA_MAX_SEQ : cparams.n_seq_max, true)) {
@@ -959,7 +959,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
     const auto & vocab = model.vocab;
     const auto & hparams = model.hparams;
 
-    const int32_t n_vocab = vocab.n_tokens();
+    const int64_t n_vocab = vocab.n_tokens();
     const int64_t n_embd = hparams.n_embd;
 
     // when computing embeddings, all tokens are output
@@ -1328,21 +1328,21 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) {
 }
 
 void llama_context::output_reorder() {
-    const uint32_t n_vocab = model.vocab.n_tokens();
+    const uint64_t n_vocab = model.vocab.n_tokens();
     const uint64_t n_embd = model.hparams.n_embd;
 
-    for (uint32_t s = 0; s < output_swaps.size(); ++s) {
-        const uint32_t i0 = output_swaps[s].i0;
-        const uint32_t i1 = output_swaps[s].i1;
+    for (size_t s = 0; s < output_swaps.size(); ++s) {
+        const uint64_t i0 = output_swaps[s].i0;
+        const uint64_t i1 = output_swaps[s].i1;
 
         if (logits_size > 0) {
-            for (uint32_t k = 0; k < n_vocab; k++) {
+            for (uint64_t k = 0; k < n_vocab; k++) {
                 std::swap(logits[i0*n_vocab + k], logits[i1*n_vocab + k]);
             }
         }
 
         if (embd_size > 0) {
-            for (uint32_t k = 0; k < n_embd; k++) {
+            for (uint64_t k = 0; k < n_embd; k++) {
                 std::swap(embd[i0*n_embd + k], embd[i1*n_embd + k]);
            }
        }
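The int32-to-int64 widening above matters when indexing the flat logits buffer: the offset i0*n_vocab can exceed INT32_MAX once the vocabulary and the number of output rows are large. A small sketch with hypothetical sizes:

    #include <cstdint>
    #include <cstdio>

    int main() {
        // Hypothetical sizes: a 150000-token vocabulary and output row 20000.
        const int64_t n_vocab = 150000;  // widened type, as in the patch
        const int64_t i0      = 20000;

        const int64_t offset = i0 * n_vocab;  // 3,000,000,000, which does not fit in a 32-bit int
        printf("logits offset = %lld, INT32_MAX = %d\n", (long long) offset, INT32_MAX);
    }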
package/src/llama.cpp/src/llama-graph.cpp
@@ -740,6 +740,8 @@ ggml_tensor * llm_graph_context::build_ffn(
                 cur = ggml_reglu(ctx0, cur);
                 cb(cur, "ffn_reglu", il);
             } break;
+        default:
+            GGML_ABORT("fatal error");
     }
 
     if (gate && type_gate == LLM_FFN_PAR) {
@@ -749,8 +751,8 @@ ggml_tensor * llm_graph_context::build_ffn(
 
     if (down) {
         cur = build_lora_mm(down, cur);
-        if (arch == LLM_ARCH_GLM4) {
-            // GLM4 seems to have numerical issues with half-precision accumulators
+        if (arch == LLM_ARCH_GLM4 || arch == LLM_ARCH_GLM4_MOE) {
+            // GLM4 and GLM4_MOE seem to have numerical issues with half-precision accumulators
             ggml_mul_mat_set_prec(cur, GGML_PREC_F32);
         }
     }
@@ -787,6 +789,45 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
         llama_expert_gating_func_type gating_op,
         int il,
         ggml_tensor * probs_in) const {
+    return build_moe_ffn(
+            cur,
+            gate_inp, /* gate_inp_b */ nullptr,
+            up_exps, /* up_exps_b */ nullptr,
+            gate_exps, /* gate_exps_b */ nullptr,
+            down_exps, /* down_exps_b */ nullptr,
+            exp_probs_b,
+            n_expert,
+            n_expert_used,
+            type_op,
+            norm_w,
+            scale_w,
+            w_scale,
+            gating_op,
+            il,
+            probs_in
+        );
+}
+
+ggml_tensor * llm_graph_context::build_moe_ffn(
+        ggml_tensor * cur,
+        ggml_tensor * gate_inp,
+        ggml_tensor * gate_inp_b,
+        ggml_tensor * up_exps,
+        ggml_tensor * up_exps_b,
+        ggml_tensor * gate_exps,
+        ggml_tensor * gate_exps_b,
+        ggml_tensor * down_exps,
+        ggml_tensor * down_exps_b,
+        ggml_tensor * exp_probs_b,
+        int64_t n_expert,
+        int64_t n_expert_used,
+        llm_ffn_op_type type_op,
+        bool norm_w,
+        bool scale_w,
+        float w_scale,
+        llama_expert_gating_func_type gating_op,
+        int il,
+        ggml_tensor * probs_in) const {
     const int64_t n_embd = cur->ne[0];
     const int64_t n_tokens = cur->ne[1];
     const bool weight_before_ffn = arch == LLM_ARCH_LLAMA4; // for llama4, we apply the sigmoid-ed weights before the FFN
@@ -800,6 +841,11 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
         logits = probs_in;
     }
 
+    if (gate_inp_b) {
+        logits = ggml_add(ctx0, logits, gate_inp_b);
+        cb(logits, "ffn_moe_logits_biased", il);
+    }
+
     ggml_tensor * probs = nullptr;
     switch (gating_op) {
         case LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX:
@@ -810,6 +856,10 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
             {
                 probs = ggml_sigmoid(ctx0, logits); // [n_expert, n_tokens]
             } break;
+        case LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX_WEIGHT:
+            {
+                probs = logits; // [n_expert, n_tokens]
+            } break;
         default:
            GGML_ABORT("fatal error");
     }
@@ -838,6 +888,13 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
             ggml_reshape_3d(ctx0, probs, 1, n_expert, n_tokens), selected_experts); // [1, n_expert_used, n_tokens]
     cb(weights, "ffn_moe_weights", il);
 
+    if (gating_op == LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX_WEIGHT) {
+        weights = ggml_reshape_2d(ctx0, weights, n_expert_used, n_tokens);
+        weights = ggml_soft_max(ctx0, weights); // [n_expert_used, n_tokens]
+        weights = ggml_reshape_3d(ctx0, weights, 1, n_expert_used, n_tokens);
+        cb(weights, "ffn_moe_weights_softmax", il);
+    }
+
     if (norm_w) {
         weights = ggml_reshape_2d(ctx0, weights, n_expert_used, n_tokens);
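With the new LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX_WEIGHT mode, the raw router logits are used for top-k expert selection and the softmax is applied only to the selected weights afterwards, as the two hunks above show. A scalar sketch of that ordering for a single token (illustrative only, not the graph code):

    #include <algorithm>
    #include <cmath>
    #include <cstdio>
    #include <vector>

    int main() {
        const std::vector<float> logits = { 1.0f, 3.0f, 2.0f, 0.5f };  // one token, 4 experts
        const int n_expert_used = 2;

        // top-k selection on the raw logits (no softmax/sigmoid beforehand)
        std::vector<int> idx = { 0, 1, 2, 3 };
        std::partial_sort(idx.begin(), idx.begin() + n_expert_used, idx.end(),
                          [&](int a, int b) { return logits[a] > logits[b]; });

        // softmax over the selected weights only
        const float max_l = logits[idx[0]];
        float sum = 0.0f;
        std::vector<float> w(n_expert_used);
        for (int i = 0; i < n_expert_used; ++i) { w[i] = std::exp(logits[idx[i]] - max_l); sum += w[i]; }
        for (int i = 0; i < n_expert_used; ++i) { w[i] /= sum; }

        for (int i = 0; i < n_expert_used; ++i) {
            printf("expert %d -> weight %.3f\n", idx[i], w[i]);  // expert 1 -> 0.731, expert 2 -> 0.269
        }
    }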
 
@@ -866,6 +923,11 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
     ggml_tensor * up = build_lora_mm_id(up_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
     cb(up, "ffn_moe_up", il);
 
+    if (up_exps_b) {
+        up = ggml_add_id(ctx0, up, up_exps_b, selected_experts);
+        cb(up, "ffn_moe_up_biased", il);
+    }
+
     ggml_tensor * experts = nullptr;
     if (gate_exps) {
         cur = build_lora_mm_id(gate_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
@@ -874,6 +936,11 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
         cur = up;
     }
 
+    if (gate_exps_b) {
+        cur = ggml_add_id(ctx0, cur, gate_exps_b, selected_experts);
+        cb(cur, "ffn_moe_gate_biased", il);
+    }
+
     switch (type_op) {
         case LLM_FFN_SILU:
             if (gate_exps) {
@@ -891,6 +958,14 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
                 cur = ggml_gelu(ctx0, cur);
                 cb(cur, "ffn_moe_gelu", il);
             } break;
+        case LLM_FFN_SWIGLU_OAI_MOE:
+            {
+                // TODO: move to hparams?
+                constexpr float alpha = 1.702f;
+                constexpr float limit = 7.0f;
+                cur = ggml_swiglu_oai(ctx0, cur, up, alpha, limit);
+                cb(cur, "ffn_moe_swiglu_oai", il);
+            } break;
         case LLM_FFN_RELU:
             if (gate_exps) {
                 cur = ggml_reglu_split(ctx0, cur, up);
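The new LLM_FFN_SWIGLU_OAI_MOE branch calls ggml_swiglu_oai with alpha = 1.702 and limit = 7.0. As a rough scalar sketch only: in the gpt-oss reference formulation the gate is clamped from above at limit, the linear branch is clamped to [-limit, limit], and the output is gate * sigmoid(alpha * gate) * (linear + 1). The exact operand handling inside ggml_swiglu_oai should be checked against the ggml sources rather than taken from this sketch:

    #include <algorithm>
    #include <cmath>
    #include <cstdio>

    // Rough scalar sketch of a clamped, gated SwiGLU in the gpt-oss style.
    // The clamping/operand details are an assumption; verify against ggml_swiglu_oai before relying on them.
    static float swiglu_oai(float gate, float linear, float alpha, float limit) {
        const float g = std::min(gate, limit);                      // clamp the gate from above
        const float l = std::max(std::min(linear, limit), -limit);  // clamp the linear branch to [-limit, limit]
        const float glu = g / (1.0f + std::exp(-alpha * g));        // g * sigmoid(alpha * g)
        return glu * (l + 1.0f);
    }

    int main() {
        printf("%.4f\n", swiglu_oai(2.0f, 0.5f, 1.702f, 7.0f));  // ~2.90
    }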
@@ -906,6 +981,11 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
     experts = build_lora_mm_id(down_exps, cur, selected_experts); // [n_embd, n_expert_used, n_tokens]
     cb(experts, "ffn_moe_down", il);
 
+    if (down_exps_b) {
+        experts = ggml_add_id(ctx0, experts, down_exps_b, selected_experts);
+        cb(experts, "ffn_moe_down_biased", il);
+    }
+
     if (!weight_before_ffn) {
         experts = ggml_mul(ctx0, experts, weights);
         cb(cur, "ffn_moe_weighted", il);
@@ -1144,6 +1224,7 @@ ggml_tensor * llm_graph_context::build_attn_mha(
         ggml_tensor * kq_b,
         ggml_tensor * kq_mask,
         ggml_tensor * v_mla,
+        ggml_tensor * sinks,
         float kq_scale) const {
     const bool v_trans = v->nb[1] > v->nb[2];
 
@@ -1180,7 +1261,8 @@ ggml_tensor * llm_graph_context::build_attn_mha(
         cur = ggml_flash_attn_ext(ctx0, q, k, v, kq_mask, kq_scale, hparams.f_max_alibi_bias,
                                   hparams.attn_soft_cap ? hparams.f_attn_logit_softcapping : 0.0f);
 
-        ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32);
+        ggml_flash_attn_ext_add_sinks(cur, sinks);
+        ggml_flash_attn_ext_set_prec (cur, GGML_PREC_F32);
 
         if (v_mla) {
 #if 0
@@ -1228,6 +1310,7 @@ ggml_tensor * llm_graph_context::build_attn_mha(
         }
 
         kq = ggml_soft_max_ext(ctx0, kq, kq_mask, kq_scale, hparams.f_max_alibi_bias);
+        ggml_soft_max_add_sinks(kq, sinks);
 
         if (!v_trans) {
             // note: avoid this branch
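The sinks tensor threaded through build_attn_mha holds one learned logit per attention head. Assuming it acts as an extra softmax logit whose probability mass is then discarded (the attention-sink formulation used by gpt-oss), a scalar sketch for one query row looks like the following; the exact semantics of ggml_soft_max_add_sinks are defined by ggml, not by this sketch:

    #include <algorithm>
    #include <cmath>
    #include <cstdio>
    #include <vector>

    int main() {
        const std::vector<float> scores = { 2.0f, 1.0f, 0.0f };  // kq row for one query
        const float sink = 1.5f;                                 // assumed per-head sink logit

        float max_l = sink;
        for (float s : scores) max_l = std::max(max_l, s);

        float denom = std::exp(sink - max_l);                    // the sink joins the normalization
        for (float s : scores) denom += std::exp(s - max_l);

        float total = 0.0f;
        for (float s : scores) {
            const float w = std::exp(s - max_l) / denom;         // weights over the real keys only
            total += w;
            printf("%.3f ", w);
        }
        printf("\nsum = %.3f (the remaining mass went to the sink)\n", total);
    }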
@@ -1298,7 +1381,7 @@ ggml_tensor * llm_graph_context::build_attn(
     ggml_tensor * k = k_cur;
     ggml_tensor * v = v_cur;
 
-    ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, v_mla, kq_scale);
+    ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, v_mla, nullptr, kq_scale);
     cb(cur, "kqv_out", il);
 
     if (wo) {
@@ -1386,13 +1469,13 @@ ggml_tensor * llm_graph_context::build_attn(
     ggml_tensor * k = mctx_cur->get_k(ctx0, il);
     ggml_tensor * v = mctx_cur->get_v(ctx0, il);
 
-    ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, v_mla, kq_scale);
+    ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, v_mla, nullptr, kq_scale);
     cb(cur, "kqv_out", il);
 
     if (wo) {
         cur = build_lora_mm(wo, cur);
-        if (arch == LLM_ARCH_GLM4) {
-            // GLM4 seems to have numerical issues with half-precision accumulators
+        if (arch == LLM_ARCH_GLM4 || arch == LLM_ARCH_GLM4_MOE) {
+            // GLM4 and GLM4_MOE seem to have numerical issues with half-precision accumulators
             ggml_mul_mat_set_prec(cur, GGML_PREC_F32);
         }
     }
@@ -1415,6 +1498,32 @@ ggml_tensor * llm_graph_context::build_attn(
         ggml_tensor * v_mla,
         float kq_scale,
         int il) const {
+    return build_attn_with_sinks(
+            inp,
+            wo,
+            wo_b,
+            q_cur,
+            k_cur,
+            v_cur,
+            kq_b,
+            v_mla,
+            nullptr,
+            kq_scale,
+            il);
+}
+
+ggml_tensor * llm_graph_context::build_attn_with_sinks(
+        llm_graph_input_attn_kv_unified_iswa * inp,
+        ggml_tensor * wo,
+        ggml_tensor * wo_b,
+        ggml_tensor * q_cur,
+        ggml_tensor * k_cur,
+        ggml_tensor * v_cur,
+        ggml_tensor * kq_b,
+        ggml_tensor * v_mla,
+        ggml_tensor * sinks,
+        float kq_scale,
+        int il) const {
     // these nodes are added to the graph together so that they are not reordered
     // by doing so, the number of splits in the graph is reduced
     ggml_build_forward_expand(gf, q_cur);
@@ -1452,7 +1561,7 @@ ggml_tensor * llm_graph_context::build_attn(
     ggml_tensor * k = mctx_cur->get_k(ctx0, il);
     ggml_tensor * v = mctx_cur->get_v(ctx0, il);
 
-    ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, v_mla, kq_scale);
+    ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, v_mla, sinks, kq_scale);
     cb(cur, "kqv_out", il);
 
     if (wo) {
@@ -1506,7 +1615,7 @@ ggml_tensor * llm_graph_context::build_attn(
     ggml_tensor * k = k_cur;
     ggml_tensor * v = v_cur;
 
-    ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, v_mla, kq_scale);
+    ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, v_mla, nullptr, kq_scale);
     cb(cur, "kqv_out", il);
 
     if (wo) {
package/src/llama.cpp/src/llama-graph.h
@@ -39,6 +39,7 @@ enum llm_ffn_op_type {
     LLM_FFN_SWIGLU,
     LLM_FFN_GEGLU,
     LLM_FFN_REGLU,
+    LLM_FFN_SWIGLU_OAI_MOE,
 };
 
 enum llm_ffn_gate_type {
@@ -619,6 +620,7 @@ struct llm_graph_context {
             llm_ffn_gate_type type_gate,
             int il) const;
 
+    // build MoE FFN without bias tensors
     ggml_tensor * build_moe_ffn(
             ggml_tensor * cur,
             ggml_tensor * gate_inp,
@@ -636,6 +638,27 @@ struct llm_graph_context {
             int il,
             ggml_tensor * probs_in = nullptr) const;
 
+    ggml_tensor * build_moe_ffn(
+            ggml_tensor * cur,
+            ggml_tensor * gate_inp,
+            ggml_tensor * gate_inp_b,
+            ggml_tensor * up_exps,
+            ggml_tensor * up_exps_b,
+            ggml_tensor * gate_exps,
+            ggml_tensor * gate_exps_b,
+            ggml_tensor * down_exps,
+            ggml_tensor * down_exps_b,
+            ggml_tensor * exp_probs_b,
+            int64_t n_expert,
+            int64_t n_expert_used,
+            llm_ffn_op_type type_op,
+            bool norm_w,
+            bool scale_w,
+            float w_scale,
+            llama_expert_gating_func_type gating_op,
+            int il,
+            ggml_tensor * probs_in = nullptr) const;
+
     //
     // inputs
     //
@@ -662,6 +685,7 @@ struct llm_graph_context {
             ggml_tensor * v, // [n_embd_head_v, n_head_v, n_tokens] (v_trans == false)
             ggml_tensor * kq_b,
             ggml_tensor * kq_mask,
+            ggml_tensor * sinks,
             ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
             float kq_scale) const;
 
@@ -708,6 +732,20 @@ struct llm_graph_context {
             float kq_scale,
             int il) const;
 
+    // TODO: temporary to keep the diff small. after the code is public will refactor to simplify this
+    ggml_tensor * build_attn_with_sinks(
+            llm_graph_input_attn_kv_unified_iswa * inp,
+            ggml_tensor * wo,
+            ggml_tensor * wo_b,
+            ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens]
+            ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens] optional
+            ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens] optional
+            ggml_tensor * kq_b,
+            ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
+            ggml_tensor * sinks, // [n_head_q]
+            float kq_scale,
+            int il) const;
+
     llm_graph_input_attn_cross * build_attn_inp_cross() const;
 
     ggml_tensor * build_attn(
package/src/llama.cpp/src/llama-hparams.h
@@ -9,9 +9,10 @@
 #define LLAMA_MAX_EXPERTS 384 // Kimi-K2
 
 enum llama_expert_gating_func_type {
-    LLAMA_EXPERT_GATING_FUNC_TYPE_NONE    = 0,
-    LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX = 1,
-    LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID = 2,
+    LLAMA_EXPERT_GATING_FUNC_TYPE_NONE           = 0,
+    LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX        = 1,
+    LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID        = 2,
+    LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX_WEIGHT = 3, // applied to the router weights instead of the logits
 };
 
 enum llama_swa_type {
@@ -73,6 +74,7 @@ struct llama_hparams {
     bool expert_weights_norm = false;
     uint32_t expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_NONE;
     uint32_t moe_every_n_layers = 0;
+    uint32_t nextn_predict_layers = 0;
 
     float f_norm_eps;
     float f_norm_rms_eps;
package/src/llama.cpp/src/llama-kv-cache-unified.cpp
@@ -39,6 +39,10 @@ llama_kv_cache_unified::llama_kv_cache_unified(
     if (model.arch == LLM_ARCH_GEMMA3N) {
         n_layer_cache = 20;
     }
+    if (model.arch == LLM_ARCH_GLM4_MOE) {
+        // GLM-4.5: Only process up to last layer, skip final NextN layer
+        n_layer_cache = hparams.n_layer - hparams.nextn_predict_layers;
+    }
 
     // create a context for each buffer type
     std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
package/src/llama.cpp/src/llama-model-loader.cpp
@@ -35,6 +35,7 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
         case LLAMA_FTYPE_MOSTLY_Q5_0: return "Q5_0";
         case LLAMA_FTYPE_MOSTLY_Q5_1: return "Q5_1";
        case LLAMA_FTYPE_MOSTLY_Q8_0: return "Q8_0";
+        case LLAMA_FTYPE_MOSTLY_MXFP4_MOE: return "MXFP4 MoE";
         case LLAMA_FTYPE_MOSTLY_Q2_K: return "Q2_K - Medium";
         case LLAMA_FTYPE_MOSTLY_Q2_K_S: return "Q2_K - Small";
         case LLAMA_FTYPE_MOSTLY_Q3_K_S: return "Q3_K - Small";
package/src/llama.cpp/src/llama-model-loader.h
@@ -58,8 +58,9 @@ struct llama_model_loader {
         }
     };
 
-    static const int TENSOR_NOT_REQUIRED = 1;
-    static const int TENSOR_DUPLICATED   = 2;
+    static const int TENSOR_NOT_REQUIRED = 1 << 0;
+    static const int TENSOR_DUPLICATED   = 1 << 1;
+    static const int TENSOR_SKIP         = 1 << 2;
 
     int n_kv = 0;
     int n_tensors = 0;
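Writing the loader constants as shifts makes it explicit that they are independent bit flags, and the new TENSOR_SKIP takes the next free bit so it can be combined with the existing ones. A small illustration of how such flags compose:

    #include <cstdio>

    static const int TENSOR_NOT_REQUIRED = 1 << 0;
    static const int TENSOR_DUPLICATED   = 1 << 1;
    static const int TENSOR_SKIP         = 1 << 2;

    int main() {
        const int flags = TENSOR_NOT_REQUIRED | TENSOR_SKIP;  // 0b101
        printf("not required: %d\n", (flags & TENSOR_NOT_REQUIRED) != 0);  // 1
        printf("duplicated:   %d\n", (flags & TENSOR_DUPLICATED)   != 0);  // 0
        printf("skip:         %d\n", (flags & TENSOR_SKIP)         != 0);  // 1
    }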