@fugood/llama.node 1.1.5 → 1.1.7

This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two package versions as they appear in the public registry.
Files changed (64)
  1. package/lib/binding.ts +4 -0
  2. package/lib/index.js +6 -1
  3. package/lib/index.ts +6 -0
  4. package/lib/version.js +5 -0
  5. package/lib/version.ts +2 -0
  6. package/package.json +14 -14
  7. package/scripts/llama.cpp.patch +19 -15
  8. package/src/LlamaCompletionWorker.cpp +73 -18
  9. package/src/LlamaCompletionWorker.h +8 -0
  10. package/src/llama.cpp/CMakeLists.txt +2 -0
  11. package/src/llama.cpp/common/arg.cpp +147 -46
  12. package/src/llama.cpp/common/chat-parser.cpp +9 -1
  13. package/src/llama.cpp/common/chat.cpp +350 -3
  14. package/src/llama.cpp/common/chat.h +11 -3
  15. package/src/llama.cpp/common/common.cpp +54 -0
  16. package/src/llama.cpp/common/common.h +44 -9
  17. package/src/llama.cpp/ggml/CMakeLists.txt +5 -2
  18. package/src/llama.cpp/ggml/include/ggml-opt.h +25 -6
  19. package/src/llama.cpp/ggml/include/ggml-zdnn.h +16 -0
  20. package/src/llama.cpp/ggml/include/ggml.h +65 -3
  21. package/src/llama.cpp/ggml/src/CMakeLists.txt +13 -1
  22. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +1 -1
  23. package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +61 -0
  24. package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +96 -8
  25. package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +1136 -1077
  26. package/src/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +20 -0
  27. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +20 -1
  28. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +21 -24
  29. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +16 -7
  30. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +270 -11
  31. package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +3 -8
  32. package/src/llama.cpp/ggml/src/ggml-cpu/quants.c +35 -0
  33. package/src/llama.cpp/ggml/src/ggml-cpu/quants.h +8 -0
  34. package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +200 -51
  35. package/src/llama.cpp/ggml/src/ggml-cpu/repack.h +11 -0
  36. package/src/llama.cpp/ggml/src/ggml-cpu/traits.cpp +2 -2
  37. package/src/llama.cpp/ggml/src/ggml-cpu/traits.h +1 -1
  38. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +19 -4
  39. package/src/llama.cpp/include/llama.h +26 -0
  40. package/src/llama.cpp/src/llama-arch.cpp +65 -0
  41. package/src/llama.cpp/src/llama-arch.h +10 -0
  42. package/src/llama.cpp/src/llama-batch.cpp +1 -1
  43. package/src/llama.cpp/src/llama-chat.cpp +15 -4
  44. package/src/llama.cpp/src/llama-chat.h +1 -0
  45. package/src/llama.cpp/src/llama-context.cpp +37 -25
  46. package/src/llama.cpp/src/llama-context.h +6 -5
  47. package/src/llama.cpp/src/llama-graph.cpp +118 -9
  48. package/src/llama.cpp/src/llama-graph.h +38 -0
  49. package/src/llama.cpp/src/llama-hparams.h +5 -3
  50. package/src/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +12 -6
  51. package/src/llama.cpp/src/llama-kv-cache-unified-iswa.h +2 -2
  52. package/src/llama.cpp/src/llama-kv-cache-unified.cpp +93 -69
  53. package/src/llama.cpp/src/llama-kv-cache-unified.h +2 -2
  54. package/src/llama.cpp/src/llama-memory-hybrid.cpp +6 -2
  55. package/src/llama.cpp/src/llama-memory-hybrid.h +2 -2
  56. package/src/llama.cpp/src/llama-memory-recurrent.cpp +6 -2
  57. package/src/llama.cpp/src/llama-memory-recurrent.h +2 -2
  58. package/src/llama.cpp/src/llama-memory.h +2 -2
  59. package/src/llama.cpp/src/llama-model-loader.cpp +1 -0
  60. package/src/llama.cpp/src/llama-model-loader.h +3 -2
  61. package/src/llama.cpp/src/llama-model.cpp +500 -4
  62. package/src/llama.cpp/src/llama-model.h +25 -4
  63. package/src/llama.cpp/src/llama-quant.cpp +37 -1
  64. package/src/llama.cpp/src/llama-vocab.cpp +43 -0
package/src/llama.cpp/src/llama-model.cpp
@@ -109,8 +109,10 @@ const char * llm_type_name(llm_type type) {
         case LLM_TYPE_A13B: return "A13B";
         case LLM_TYPE_21B_A3B: return "21B.A3B";
         case LLM_TYPE_30B_A3B: return "30B.A3B";
+        case LLM_TYPE_106B_A12B: return "106B.A12B";
         case LLM_TYPE_235B_A22B: return "235B.A22B";
         case LLM_TYPE_300B_A47B: return "300B.A47B";
+        case LLM_TYPE_355B_A32B: return "355B.A32B";
         case LLM_TYPE_E2B: return "E2B";
         case LLM_TYPE_E4B: return "E4B";
         default: return "?B";
@@ -190,6 +192,13 @@ static bool weight_buft_supported(const llama_hparams & hparams, ggml_tensor * w
                 ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], w->ne[1], w->ne[2], w->ne[3]);
                 op_tensor = ggml_add(ctx, a, w);
             } break;
+        case GGML_OP_ADD_ID:
+            {
+                int n_expert_used = hparams.n_expert_used;
+                ggml_tensor * a = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, w->ne[0], n_expert_used, 512);
+                ggml_tensor * c = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_expert_used, 512);
+                op_tensor = ggml_add_id(ctx, a, w, c);
+            } break;
         case GGML_OP_MUL:
             {
                 ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], w->ne[1], w->ne[2], w->ne[3]);
@@ -258,6 +267,10 @@ static bool weight_buft_supported(const llama_hparams & hparams, ggml_tensor * w
                 ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, n_embd, w->ne[1], 1, 1);
                 op_tensor = ggml_im2col(ctx, w, b, 1, 0, 0, 0, 1, 0, false, GGML_TYPE_F16);
             } break;
+        case GGML_OP_SCALE:
+            {
+                op_tensor = ggml_scale(ctx, w, 1.0f);
+            } break;
         default:
             GGML_ABORT("%s: missing test for op %s for tensor %s", __func__, ggml_op_name(op), w->name);
     }
@@ -1082,6 +1095,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
 
                 switch (hparams.n_layer) {
+                    case 18: type = LLM_TYPE_537M; break;
                     case 26: type = LLM_TYPE_1B; break;
                     case 34: type = LLM_TYPE_4B; break;
                     case 48: type = LLM_TYPE_12B; break;
@@ -1434,6 +1448,34 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     default: type = LLM_TYPE_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_GLM4_MOE:
+            {
+                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+                // MoE parameters
+                ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert);
+                ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used);
+                ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
+                ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead, false);
+                ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
+                ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
+
+                // Expert gating function (GLM-4.5 uses sigmoid)
+                ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
+                if (hparams.expert_gating_func == LLAMA_EXPERT_GATING_FUNC_TYPE_NONE) {
+                    hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID;
+                }
+
+                // NextN/MTP parameters
+                ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false);
+
+                switch (hparams.n_layer) {
+                    case 47: type = LLM_TYPE_106B_A12B; break; // GLM-4.5-Air (46 layers + 1 NextN layer)
+                    case 93: type = LLM_TYPE_355B_A32B; break; // GLM-4.5 (92 layers + 1 NextN layer)
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+            } break;
         case LLM_ARCH_BITNET:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -1783,6 +1825,17 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     default: type = LLM_TYPE_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_OPENAI_MOE:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
+                ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
+
+                hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
+                hparams.set_swa_pattern(2);
+
+                // TODO: switch (hparams.n_layer)
+            } break;
         case LLM_ARCH_LFM2:
             {
                 ml.get_key(LLM_KV_SHORTCONV_L_CACHE, hparams.n_shortconv_l_cache);
@@ -1949,6 +2002,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
 
     const auto TENSOR_DUPLICATED = llama_model_loader::TENSOR_DUPLICATED;
     const auto TENSOR_NOT_REQUIRED = llama_model_loader::TENSOR_NOT_REQUIRED;
+    const auto TENSOR_SKIP = llama_model_loader::TENSOR_SKIP;
 
     // create tensors for the weights
     {
@@ -2004,7 +2058,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
         }
 
         // skip unused tensors
-        if (info.op == GGML_OP_NONE) {
+        if (info.op == GGML_OP_NONE || flags & TENSOR_SKIP) {
             const size_t nbytes = ggml_nbytes(t_meta);
             LLAMA_LOG_WARN("model has unused tensor %s (size = %zu bytes) -- ignoring\n", tn.str().c_str(), nbytes);
 
@@ -2014,11 +2068,15 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
             return nullptr;
         }
 
-        // tensors with "bias" suffix are always used with GGML_OP_ADD
+        // tensors with "bias" suffix are always used with GGML_OP_ADD or GGML_OP_ADD_ID
         ggml_op op;
         bool bias = tn.suffix != nullptr && strcmp(tn.suffix, "bias") == 0;
         if (bias) {
-            op = GGML_OP_ADD;
+            if (info.op == GGML_OP_MUL_MAT_ID) {
+                op = GGML_OP_ADD_ID;
+            } else {
+                op = GGML_OP_ADD;
+            }
         } else {
             op = info.op;
         }
@@ -4427,6 +4485,105 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                     layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
                 }
             } break;
+        case LLM_ARCH_GLM4_MOE:
+            {
+                const int64_t n_expert = hparams.n_expert;
+                const int64_t n_expert_used = hparams.n_expert_used;
+                const int64_t n_expert_shared = hparams.n_expert_shared;
+
+                GGML_ASSERT(hparams.n_expert > 0 && "n_expert must be > 0 for GLM4_MOE MoE layers");
+                GGML_ASSERT(hparams.n_expert_used > 0 && "n_expert_used must be > 0 for GLM4_MOE MoE layers");
+
+                tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
+
+                // output
+                output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
+                output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED);
+                // if output is NULL, init from the input tok embed
+                if (output == NULL) {
+                    output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, TENSOR_DUPLICATED);
+                }
+
+                // Load ALL tensors including NextN layer to satisfy total tensor count
+                // but only PROCESS up to last layer (skipping final NextN layer) in forward pass
+                for (int i = 0; i < n_layer; ++i) {
+                    int flags = 0;
+                    if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
+                        // skip all tensors in the NextN layers
+                        flags |= TENSOR_SKIP;
+                    }
+
+                    auto & layer = layers[i];
+
+                    layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, flags);
+
+                    // GLM-style attention with bias terms
+                    layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head }, flags);
+                    layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_k_gqa }, flags);
+                    layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_v_gqa }, flags);
+                    layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), { n_embd_head_k * n_head }, flags);
+                    layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), { n_embd_k_gqa }, flags);
+                    layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), { n_embd_v_gqa }, flags);
+
+                    layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, flags);
+
+                    // K/Q norm tensors (optional for GLM-4.5 355B variant)
+                    layer.attn_q_norm = create_tensor(
+                        tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), { n_embd_head_k }, TENSOR_NOT_REQUIRED | flags);
+                    layer.attn_k_norm = create_tensor(
+                        tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), { n_embd_head_k }, TENSOR_NOT_REQUIRED | flags);
+
+                    layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), { n_embd }, flags);
+
+                    // Check if this layer uses MoE or dense FFN based on n_layer_dense_lead
+                    // GLM 4.5 uses hybrid architecture: layer 0 is dense, layers 1+ are MoE
+                    const bool use_moe = (static_cast<uint32_t>(i) >= hparams.n_layer_dense_lead);
+
+                    if (use_moe) {
+                        // MoE layers
+                        layer.ffn_gate_inp =
+                            create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert }, flags);
+                        layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), { n_expert }, flags);
+
+                        // MoE branch
+                        const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
+
+                        layer.ffn_gate_exps = create_tensor(
+                            tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, flags);
+                        layer.ffn_down_exps = create_tensor(
+                            tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert }, flags);
+                        layer.ffn_up_exps = create_tensor(
+                            tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, flags);
+
+                        // Shared expert
+                        if (n_expert_shared > 0) {
+                            const int64_t n_ff_shexp = n_ff_exp * n_expert_shared;
+                            layer.ffn_gate_shexp = create_tensor(
+                                tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), { n_embd, n_ff_shexp }, flags);
+                            layer.ffn_down_shexp = create_tensor(
+                                tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_shexp, n_embd }, flags);
+                            layer.ffn_up_shexp = create_tensor(
+                                tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), { n_embd, n_ff_shexp }, flags);
+                        }
+                    } else {
+                        // Dense layers (first k layers) - GLM uses separate gate/up projections
+                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), { n_embd, n_ff }, flags);
+                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, flags);
+                        layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, n_ff }, flags);
+                    }
+
+                    // NextN/MTP tensors (preserved but unused) - conditionally load for last nextn_predict_layers
+                    if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
+                        layer.nextn.eh_proj = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i), { 2 * n_embd, n_embd }, flags);
+                        layer.nextn.embed_tokens = create_tensor(tn(LLM_TENSOR_NEXTN_EMBED_TOKENS, "weight", i), { n_embd, n_vocab }, flags);
+                        layer.nextn.enorm = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM, "weight", i), { n_embd }, flags);
+                        layer.nextn.hnorm = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM, "weight", i), { n_embd }, flags);
+                        layer.nextn.shared_head_head = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "weight", i), { n_embd, n_vocab }, flags);
+                        layer.nextn.shared_head_norm = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "weight", i), { n_embd }, flags);
+                    }
+                }
+            }
+            break;
         case LLM_ARCH_NEMOTRON:
             {
                 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -5270,6 +5427,46 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                     layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
                 }
             } break;
+        case LLM_ARCH_OPENAI_MOE:
+            {
+                const int64_t n_ff_exp = hparams.n_ff_exp;
+
+                tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+                // output
+                output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
+
+                for (int i = 0; i < n_layer; ++i) {
+                    auto & layer = layers[i];
+
+                    layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+                    layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
+
+                    layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_head * n_rot}, 0);
+                    layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_head_kv * n_rot}, 0);
+                    layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_head_kv * n_rot}, 0);
+                    layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head * n_rot, n_embd}, 0);
+
+                    layer.attn_sinks = create_tensor(tn(LLM_TENSOR_ATTN_SINKS, "weight", i), {n_head}, 0);
+
+                    layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
+                    layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, 0);
+                    layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
+                    layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, 0);
+
+                    // bias
+                    layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_head * n_rot}, 0);
+                    layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_head_kv * n_rot}, 0);
+                    layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_head_kv * n_rot}, 0);
+                    layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
+
+                    layer.ffn_gate_inp_b = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "bias", i), {n_expert}, 0);
+                    layer.ffn_gate_exps_b = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "bias", i), {n_ff_exp, n_expert}, 0);
+                    layer.ffn_down_exps_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "bias", i), {n_embd, n_expert}, 0);
+                    layer.ffn_up_exps_b = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "bias", i), {n_ff_exp, n_expert}, 0);
+                }
+            } break;
         case LLM_ARCH_LFM2:
             {
                 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -5642,7 +5839,7 @@ void llama_model::print_info() const {
         LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
     }
 
-    if (arch == LLM_ARCH_QWEN3MOE) {
+    if (arch == LLM_ARCH_QWEN3MOE || arch == LLM_ARCH_OPENAI_MOE) {
         LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
     }
 
@@ -13564,6 +13761,165 @@ struct llm_build_glm4 : public llm_graph_context {
     }
 };
 
+struct llm_build_glm4_moe : public llm_graph_context {
+    llm_build_glm4_moe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
+        ggml_tensor * cur;
+        ggml_tensor * inpL;
+
+        inpL = build_inp_embd(model.tok_embd);
+
+        // inp_pos - contains the positions
+        ggml_tensor * inp_pos = build_inp_pos();
+
+        auto * inp_attn = build_attn_inp_kv_unified();
+
+        ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+        // Only process up to last layer (skip final NextN layer)
+        // Final layer tensors are loaded but not processed in forward pass
+        const int n_transformer_layers = n_layer - hparams.nextn_predict_layers;
+        for (int il = 0; il < n_transformer_layers; ++il) {
+            ggml_tensor * inpSA = inpL;
+
+            // Pre-attention norm
+            cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
+            cb(cur, "attn_norm", il);
+
+            // self-attention
+            {
+                ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+                if (model.layers[il].bq) {
+                    Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+                }
+                cb(Qcur, "Qcur", il);
+
+                ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+                if (model.layers[il].bk) {
+                    Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+                }
+                cb(Kcur, "Kcur", il);
+
+                ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+                if (model.layers[il].bv) {
+                    Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+                }
+                cb(Vcur, "Vcur", il);
+
+                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+                Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+                // Apply Q/K norm if available (GLM-4.5 355B variant)
+                if (model.layers[il].attn_q_norm) {
+                    Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
+                    cb(Qcur, "Qcur_normed", il);
+                }
+                if (model.layers[il].attn_k_norm) {
+                    Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
+                    cb(Kcur, "Kcur_normed", il);
+                }
+
+                Qcur = ggml_rope_ext(
+                    ctx0, Qcur, inp_pos, nullptr,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+
+                Kcur = ggml_rope_ext(
+                    ctx0, Kcur, inp_pos, nullptr,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+
+                cb(Qcur, "Qcur", il);
+                cb(Kcur, "Kcur", il);
+                cb(Vcur, "Vcur", il);
+
+                cur = build_attn(inp_attn,
+                    model.layers[il].wo, NULL,
+                    Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+            }
+
+            if (il == n_transformer_layers - 1 && inp_out_ids) {
+                cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            }
+
+            ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+            cb(ffn_inp, "ffn_inp", il);
+
+            // Post-attention norm
+            cur = build_norm(ffn_inp, model.layers[il].attn_post_norm, NULL, LLM_NORM_RMS, il);
+            cb(cur, "post_attn_norm", il);
+
+            // Check if this is a dense layer (n_layer_dense_lead=1, so layer 0 is dense)
+            if (static_cast<uint32_t>(il) < hparams.n_layer_dense_lead) {
+                // Dense FFN layer
+                cur = build_ffn(cur,
+                    model.layers[il].ffn_up, NULL, NULL,
+                    model.layers[il].ffn_gate, NULL, NULL,
+                    model.layers[il].ffn_down, NULL, NULL,
+                    NULL,
+                    LLM_FFN_SILU, LLM_FFN_PAR, il);
+                cb(cur, "ffn_out", il);
+            } else {
+                // Process routed experts using existing MoE infrastructure
+                ggml_tensor * routed_out = build_moe_ffn(cur,
+                    model.layers[il].ffn_gate_inp,
+                    model.layers[il].ffn_up_exps,
+                    model.layers[il].ffn_gate_exps,
+                    model.layers[il].ffn_down_exps,
+                    model.layers[il].ffn_exp_probs_b,
+                    n_expert, n_expert_used,
+                    LLM_FFN_SILU, hparams.expert_weights_norm,
+                    true, hparams.expert_weights_scale,
+                    (llama_expert_gating_func_type) hparams.expert_gating_func,
+                    il);
+                cb(routed_out, "ffn_moe_out", il);
+
+                // Process shared expert on original input
+                ggml_tensor * shared_out = build_ffn(cur,
+                    model.layers[il].ffn_up_shexp, NULL, NULL,
+                    model.layers[il].ffn_gate_shexp, NULL, NULL,
+                    model.layers[il].ffn_down_shexp, NULL, NULL,
+                    NULL,
+                    LLM_FFN_SILU, LLM_FFN_PAR, il);
+                cb(shared_out, "ffn_shexp_out", il);
+
+                // Final output: routed_output + shared_output
+                cur = ggml_add(ctx0, routed_out, shared_out);
+                cb(cur, "ffn_out", il);
+            }
+
+            cur = ggml_add(ctx0, cur, ffn_inp);
+
+            cur = build_cvec(cur, il);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+        cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
+
+        cb(cur, "result_norm", -1);
+        res->t_embd = cur;
+
+        // lm_head
+        cur = build_lora_mm(model.output, cur);
+
+        cb(cur, "result_output", -1);
+        res->t_logits = cur;
+
+        ggml_build_forward_expand(gf, cur);
+    }
+};
+
 struct llm_build_nemotron : public llm_graph_context {
     llm_build_nemotron(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
         const int64_t n_embd_head = hparams.n_embd_head_v;
@@ -17251,6 +17607,136 @@ struct llm_build_smollm3 : public llm_graph_context {
     }
 };
 
+struct llm_build_openai_moe_iswa : public llm_graph_context {
+    llm_build_openai_moe_iswa(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+        ggml_tensor * cur;
+        ggml_tensor * inpL;
+
+        inpL = build_inp_embd(model.tok_embd);
+
+        // inp_pos - contains the positions
+        ggml_tensor * inp_pos = build_inp_pos();
+
+        auto * inp_attn = build_attn_inp_kv_unified_iswa();
+
+        for (int il = 0; il < n_layer; ++il) {
+            ggml_tensor * inpSA = inpL;
+
+            // norm
+            cur = build_norm(inpL,
+                    model.layers[il].attn_norm, nullptr,
+                    LLM_NORM_RMS, il);
+            cb(cur, "attn_norm", il);
+
+            // self-attention
+            {
+                // compute Q and K and RoPE them
+                ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
+                if (model.layers[il].bq) {
+                    Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+                    cb(Qcur, "Qcur", il);
+                }
+
+                ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
+                if (model.layers[il].bk) {
+                    Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+                    cb(Kcur, "Kcur", il);
+                }
+
+                ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
+                if (model.layers[il].bv) {
+                    Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+                    cb(Vcur, "Vcur", il);
+                }
+
+                Qcur = ggml_reshape_3d(ctx0, Qcur, n_rot, n_head, n_tokens);
+                Kcur = ggml_reshape_3d(ctx0, Kcur, n_rot, n_head_kv, n_tokens);
+                Vcur = ggml_reshape_3d(ctx0, Vcur, n_rot, n_head_kv, n_tokens);
+
+                Qcur = ggml_rope_ext(
+                    ctx0, Qcur, inp_pos, nullptr,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+
+                Kcur = ggml_rope_ext(
+                    ctx0, Kcur, inp_pos, nullptr,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+
+                cb(Qcur, "Qcur", il);
+                cb(Kcur, "Kcur", il);
+                cb(Vcur, "Vcur", il);
+
+                cur = build_attn_with_sinks(inp_attn,
+                    model.layers[il].wo, model.layers[il].bo,
+                    Qcur, Kcur, Vcur, nullptr, nullptr, model.layers[il].attn_sinks, 1.0f/sqrtf(float(n_rot)), il);
+
+                cb(cur, "attn_out", il);
+            }
+
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                ggml_tensor * inp_out_ids = build_inp_out_ids();
+                cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            }
+
+            ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+            cb(ffn_inp, "ffn_inp", il);
+
+            cur = ffn_inp;
+            cur = build_norm(cur,
+                    model.layers[il].attn_post_norm, nullptr,
+                    LLM_NORM_RMS, il);
+            cb(cur, "attn_post_norm", il);
+
+            // MoE branch
+            cur = build_moe_ffn(cur,
+                    model.layers[il].ffn_gate_inp, model.layers[il].ffn_gate_inp_b,
+                    model.layers[il].ffn_up_exps, model.layers[il].ffn_up_exps_b,
+                    model.layers[il].ffn_gate_exps, model.layers[il].ffn_gate_exps_b,
+                    model.layers[il].ffn_down_exps, model.layers[il].ffn_down_exps_b,
+                    nullptr,
+                    n_expert, n_expert_used,
+                    LLM_FFN_SWIGLU_OAI_MOE, false,
+                    false, 0.0,
+                    LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX_WEIGHT,
+                    il);
+            cb(cur, "ffn_moe_out", il);
+
+            cur = ggml_add(ctx0, cur, ffn_inp);
+
+            cur = build_cvec(cur, il);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = build_norm(cur,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, -1);
+
+        cb(cur, "result_norm", -1);
+        res->t_embd = cur;
+
+        // lm_head
+        cur = build_lora_mm(model.output, cur);
+
+        cb(cur, "result_output", -1);
+        res->t_logits = cur;
+
+        ggml_build_forward_expand(gf, cur);
+    }
+};
+
 struct llm_build_lfm2 : public llm_graph_context {
     const llama_model & model;
 
@@ -17877,6 +18363,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
             {
                 llm = std::make_unique<llm_build_glm4>(*this, params);
             } break;
+        case LLM_ARCH_GLM4_MOE:
+            {
+                llm = std::make_unique<llm_build_glm4_moe>(*this, params);
+            } break;
         case LLM_ARCH_BITNET:
             {
                 llm = std::make_unique<llm_build_bitnet>(*this, params);
@@ -17990,6 +18480,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
             {
                 llm = std::make_unique<llm_build_smollm3>(*this, params);
            } break;
+        case LLM_ARCH_OPENAI_MOE:
+            {
+                llm = std::make_unique<llm_build_openai_moe_iswa>(*this, params);
+            } break;
         case LLM_ARCH_FALCON_H1:
             {
                 llm = std::make_unique<llm_build_falcon_h1>(*this, params);
@@ -18205,9 +18699,11 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
         case LLM_ARCH_MINICPM3:
         case LLM_ARCH_DOTS1:
         case LLM_ARCH_HUNYUAN_MOE:
+        case LLM_ARCH_OPENAI_MOE:
         case LLM_ARCH_HUNYUAN_DENSE:
         case LLM_ARCH_LFM2:
         case LLM_ARCH_SMALLTHINKER:
+        case LLM_ARCH_GLM4_MOE:
             return LLAMA_ROPE_TYPE_NEOX;
 
         case LLM_ARCH_QWEN2VL: