@fugood/llama.node 1.1.5 → 1.1.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35)
  1. package/package.json +14 -14
  2. package/scripts/llama.cpp.patch +17 -13
  3. package/src/LlamaCompletionWorker.cpp +2 -0
  4. package/src/llama.cpp/common/arg.cpp +28 -11
  5. package/src/llama.cpp/common/chat.cpp +46 -2
  6. package/src/llama.cpp/common/chat.h +7 -2
  7. package/src/llama.cpp/common/common.h +3 -2
  8. package/src/llama.cpp/ggml/CMakeLists.txt +3 -2
  9. package/src/llama.cpp/ggml/include/ggml.h +37 -1
  10. package/src/llama.cpp/ggml/src/CMakeLists.txt +12 -1
  11. package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +61 -0
  12. package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +96 -8
  13. package/src/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +6 -0
  14. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +14 -1
  15. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +207 -9
  16. package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +2 -7
  17. package/src/llama.cpp/ggml/src/ggml-cpu/quants.c +35 -0
  18. package/src/llama.cpp/ggml/src/ggml-cpu/quants.h +8 -0
  19. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +19 -4
  20. package/src/llama.cpp/include/llama.h +1 -0
  21. package/src/llama.cpp/src/llama-arch.cpp +65 -0
  22. package/src/llama.cpp/src/llama-arch.h +10 -0
  23. package/src/llama.cpp/src/llama-chat.cpp +13 -0
  24. package/src/llama.cpp/src/llama-chat.h +1 -0
  25. package/src/llama.cpp/src/llama-context.cpp +8 -8
  26. package/src/llama.cpp/src/llama-graph.cpp +118 -9
  27. package/src/llama.cpp/src/llama-graph.h +38 -0
  28. package/src/llama.cpp/src/llama-hparams.h +5 -3
  29. package/src/llama.cpp/src/llama-kv-cache-unified.cpp +4 -0
  30. package/src/llama.cpp/src/llama-model-loader.cpp +1 -0
  31. package/src/llama.cpp/src/llama-model-loader.h +3 -2
  32. package/src/llama.cpp/src/llama-model.cpp +499 -4
  33. package/src/llama.cpp/src/llama-model.h +24 -4
  34. package/src/llama.cpp/src/llama-quant.cpp +37 -1
  35. package/src/llama.cpp/src/llama-vocab.cpp +42 -0
package/src/llama.cpp/src/llama-model.cpp

@@ -109,8 +109,10 @@ const char * llm_type_name(llm_type type) {
  case LLM_TYPE_A13B: return "A13B";
  case LLM_TYPE_21B_A3B: return "21B.A3B";
  case LLM_TYPE_30B_A3B: return "30B.A3B";
+ case LLM_TYPE_106B_A12B: return "106B.A12B";
  case LLM_TYPE_235B_A22B: return "235B.A22B";
  case LLM_TYPE_300B_A47B: return "300B.A47B";
+ case LLM_TYPE_355B_A32B: return "355B.A32B";
  case LLM_TYPE_E2B: return "E2B";
  case LLM_TYPE_E4B: return "E4B";
  default: return "?B";
@@ -190,6 +192,13 @@ static bool weight_buft_supported(const llama_hparams & hparams, ggml_tensor * w
  ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], w->ne[1], w->ne[2], w->ne[3]);
  op_tensor = ggml_add(ctx, a, w);
  } break;
+ case GGML_OP_ADD_ID:
+ {
+ int n_expert_used = hparams.n_expert_used;
+ ggml_tensor * a = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, w->ne[0], n_expert_used, 512);
+ ggml_tensor * c = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_expert_used, 512);
+ op_tensor = ggml_add_id(ctx, a, w, c);
+ } break;
  case GGML_OP_MUL:
  {
  ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], w->ne[1], w->ne[2], w->ne[3]);
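Note on GGML_OP_ADD_ID above: this release also adds ggml_add_id() to ggml/include/ggml.h (see the file list), and the bias-mapping change later in this diff routes "bias" tensors of MUL_MAT_ID weights through it. Below is a minimal sketch of the expected call pattern; the names add_expert_bias and mm_out are illustrative and not part of the diff, and the shapes mirror the test tensors above.

// Sketch only: apply a per-expert bias to the output of an indexed matmul,
// reusing the ids tensor that selected the experts per token.
#include "ggml.h"

static struct ggml_tensor * add_expert_bias(
        struct ggml_context * ctx,
        struct ggml_tensor  * mm_out,  // F32 [n_embd, n_expert_used, n_tokens]
        struct ggml_tensor  * bias,    // F32 [n_embd, n_expert]
        struct ggml_tensor  * ids) {   // I32 [n_expert_used, n_tokens]
    // out[:, s, t] = mm_out[:, s, t] + bias[:, ids[s, t]]
    return ggml_add_id(ctx, mm_out, bias, ids);
}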
@@ -258,6 +267,10 @@ static bool weight_buft_supported(const llama_hparams & hparams, ggml_tensor * w
  ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, n_embd, w->ne[1], 1, 1);
  op_tensor = ggml_im2col(ctx, w, b, 1, 0, 0, 0, 1, 0, false, GGML_TYPE_F16);
  } break;
+ case GGML_OP_SCALE:
+ {
+ op_tensor = ggml_scale(ctx, w, 1.0f);
+ } break;
  default:
  GGML_ABORT("%s: missing test for op %s for tensor %s", __func__, ggml_op_name(op), w->name);
  }
@@ -1434,6 +1447,34 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  default: type = LLM_TYPE_UNKNOWN;
  }
  } break;
+ case LLM_ARCH_GLM4_MOE:
+ {
+ ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+ // MoE parameters
+ ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert);
+ ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used);
+ ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
+ ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead, false);
+ ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
+ ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
+
+ // Expert gating function (GLM-4.5 uses sigmoid)
+ ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
+ if (hparams.expert_gating_func == LLAMA_EXPERT_GATING_FUNC_TYPE_NONE) {
+ hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID;
+ }
+
+ // NextN/MTP parameters
+ ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false);
+
+ switch (hparams.n_layer) {
+ case 47: type = LLM_TYPE_106B_A12B; break; // GLM-4.5-Air (46 layers + 1 NextN layer)
+ case 93: type = LLM_TYPE_355B_A32B; break; // GLM-4.5 (92 layers + 1 NextN layer)
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
  case LLM_ARCH_BITNET:
  {
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -1783,6 +1824,17 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  default: type = LLM_TYPE_UNKNOWN;
  }
  } break;
+ case LLM_ARCH_OPENAI_MOE:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
+ ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
+
+ hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
+ hparams.set_swa_pattern(2);
+
+ // TODO: switch (hparams.n_layer)
+ } break;
  case LLM_ARCH_LFM2:
  {
  ml.get_key(LLM_KV_SHORTCONV_L_CACHE, hparams.n_shortconv_l_cache);
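For orientation: set_swa_pattern(2) (implemented in llama-hparams, not shown in this diff) configures an interleaved schedule in which sliding-window attention over hparams.n_swa tokens and full attention alternate from layer to layer; the llm_build_openai_moe_iswa graph builder added later in this diff consumes that schedule via build_attn_inp_kv_unified_iswa. The toy loop below only illustrates what a period-2 schedule means; the parity choice and the helper are assumptions, not the library implementation.

// Illustration only: a period-2 layer schedule alternating sliding-window
// and full attention. The real mapping is computed by
// llama_hparams::set_swa_pattern(); the parity used here is assumed.
#include <cstdint>
#include <cstdio>

int main() {
    const uint32_t n_layer    = 8; // toy value
    const uint32_t swa_period = 2; // value passed to set_swa_pattern() above
    for (uint32_t il = 0; il < n_layer; ++il) {
        const bool swa = (il % swa_period) == 0; // assumed parity
        std::printf("layer %u: %s\n", il, swa ? "sliding-window" : "full attention");
    }
    return 0;
}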
@@ -1949,6 +2001,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {

  const auto TENSOR_DUPLICATED = llama_model_loader::TENSOR_DUPLICATED;
  const auto TENSOR_NOT_REQUIRED = llama_model_loader::TENSOR_NOT_REQUIRED;
+ const auto TENSOR_SKIP = llama_model_loader::TENSOR_SKIP;

  // create tensors for the weights
  {
@@ -2004,7 +2057,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  }

  // skip unused tensors
- if (info.op == GGML_OP_NONE) {
+ if (info.op == GGML_OP_NONE || flags & TENSOR_SKIP) {
  const size_t nbytes = ggml_nbytes(t_meta);
  LLAMA_LOG_WARN("model has unused tensor %s (size = %zu bytes) -- ignoring\n", tn.str().c_str(), nbytes);

@@ -2014,11 +2067,15 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  return nullptr;
  }

- // tensors with "bias" suffix are always used with GGML_OP_ADD
+ // tensors with "bias" suffix are always used with GGML_OP_ADD or GGML_OP_ADD_ID
  ggml_op op;
  bool bias = tn.suffix != nullptr && strcmp(tn.suffix, "bias") == 0;
  if (bias) {
- op = GGML_OP_ADD;
+ if (info.op == GGML_OP_MUL_MAT_ID) {
+ op = GGML_OP_ADD_ID;
+ } else {
+ op = GGML_OP_ADD;
+ }
  } else {
  op = info.op;
  }
@@ -4427,6 +4484,105 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
  }
  } break;
+ case LLM_ARCH_GLM4_MOE:
+ {
+ const int64_t n_expert = hparams.n_expert;
+ const int64_t n_expert_used = hparams.n_expert_used;
+ const int64_t n_expert_shared = hparams.n_expert_shared;
+
+ GGML_ASSERT(hparams.n_expert > 0 && "n_expert must be > 0 for GLM4_MOE MoE layers");
+ GGML_ASSERT(hparams.n_expert_used > 0 && "n_expert_used must be > 0 for GLM4_MOE MoE layers");
+
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED);
+ // if output is NULL, init from the input tok embed
+ if (output == NULL) {
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, TENSOR_DUPLICATED);
+ }
+
+ // Load ALL tensors including NextN layer to satisfy total tensor count
+ // but only PROCESS up to last layer (skipping final NextN layer) in forward pass
+ for (int i = 0; i < n_layer; ++i) {
+ int flags = 0;
+ if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
+ // skip all tensors in the NextN layers
+ flags |= TENSOR_SKIP;
+ }
+
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, flags);
+
+ // GLM-style attention with bias terms
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head }, flags);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_k_gqa }, flags);
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_v_gqa }, flags);
+ layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), { n_embd_head_k * n_head }, flags);
+ layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), { n_embd_k_gqa }, flags);
+ layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), { n_embd_v_gqa }, flags);
+
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, flags);
+
+ // K/Q norm tensors (optional for GLM-4.5 355B variant)
+ layer.attn_q_norm = create_tensor(
+ tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), { n_embd_head_k }, TENSOR_NOT_REQUIRED | flags);
+ layer.attn_k_norm = create_tensor(
+ tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), { n_embd_head_k }, TENSOR_NOT_REQUIRED | flags);
+
+ layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), { n_embd }, flags);
+
+ // Check if this layer uses MoE or dense FFN based on n_layer_dense_lead
+ // GLM 4.5 uses hybrid architecture: layer 0 is dense, layers 1+ are MoE
+ const bool use_moe = (static_cast<uint32_t>(i) >= hparams.n_layer_dense_lead);
+
+ if (use_moe) {
+ // MoE layers
+ layer.ffn_gate_inp =
+ create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert }, flags);
+ layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), { n_expert }, flags);
+
+ // MoE branch
+ const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
+
+ layer.ffn_gate_exps = create_tensor(
+ tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, flags);
+ layer.ffn_down_exps = create_tensor(
+ tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert }, flags);
+ layer.ffn_up_exps = create_tensor(
+ tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, flags);
+
+ // Shared expert
+ if (n_expert_shared > 0) {
+ const int64_t n_ff_shexp = n_ff_exp * n_expert_shared;
+ layer.ffn_gate_shexp = create_tensor(
+ tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), { n_embd, n_ff_shexp }, flags);
+ layer.ffn_down_shexp = create_tensor(
+ tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_shexp, n_embd }, flags);
+ layer.ffn_up_shexp = create_tensor(
+ tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), { n_embd, n_ff_shexp }, flags);
+ }
+ } else {
+ // Dense layers (first k layers) - GLM uses separate gate/up projections
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), { n_embd, n_ff }, flags);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, flags);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, n_ff }, flags);
+ }
+
+ // NextN/MTP tensors (preserved but unused) - conditionally load for last nextn_predict_layers
+ if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
+ layer.nextn.eh_proj = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i), { 2 * n_embd, n_embd }, flags);
+ layer.nextn.embed_tokens = create_tensor(tn(LLM_TENSOR_NEXTN_EMBED_TOKENS, "weight", i), { n_embd, n_vocab }, flags);
+ layer.nextn.enorm = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM, "weight", i), { n_embd }, flags);
+ layer.nextn.hnorm = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM, "weight", i), { n_embd }, flags);
+ layer.nextn.shared_head_head = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "weight", i), { n_embd, n_vocab }, flags);
+ layer.nextn.shared_head_norm = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "weight", i), { n_embd }, flags);
+ }
+ }
+ }
+ break;
  case LLM_ARCH_NEMOTRON:
  {
  tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
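The flags logic above is the core of the NextN/MTP handling: the trailing hparams.nextn_predict_layers layers are created with TENSOR_SKIP, so their weights count toward the file's tensor total without ever entering the compute graph. A small standalone sketch of that index check, using the GLM-4.5-Air numbers from load_hparams earlier in this diff; the helper name is illustrative.

// Mirrors the skip condition used in load_tensors(): only the trailing
// nextn_predict_layers layers carry NextN/MTP weights and get TENSOR_SKIP.
#include <cstdint>
#include <cstdio>

static bool is_nextn_layer(uint32_t il, uint32_t n_layer, uint32_t nextn_predict_layers) {
    return nextn_predict_layers > 0 && il >= n_layer - nextn_predict_layers;
}

int main() {
    const uint32_t n_layer = 47, nextn = 1; // GLM-4.5-Air per this diff
    for (uint32_t il = 0; il < n_layer; ++il) {
        if (is_nextn_layer(il, n_layer, nextn)) {
            std::printf("layer %u -> TENSOR_SKIP\n", il); // prints only: layer 46 -> TENSOR_SKIP
        }
    }
    return 0;
}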
@@ -5270,6 +5426,46 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  }
  } break;
+ case LLM_ARCH_OPENAI_MOE:
+ {
+ const int64_t n_ff_exp = hparams.n_ff_exp;
+
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+ layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
+
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_head * n_rot}, 0);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_head_kv * n_rot}, 0);
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_head_kv * n_rot}, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head * n_rot, n_embd}, 0);
+
+ layer.attn_sinks = create_tensor(tn(LLM_TENSOR_ATTN_SINKS, "weight", i), {n_head}, 0);
+
+ layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert}, 0);
+ layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
+ layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
+ layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
+
+ // bias
+ layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_head * n_rot}, 0);
+ layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_head_kv * n_rot}, 0);
+ layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_head_kv * n_rot}, 0);
+ layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
+
+ layer.ffn_gate_inp_b = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "bias", i), {n_expert}, 0);
+ layer.ffn_gate_exps_b = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "bias", i), {n_ff_exp, n_expert}, 0);
+ layer.ffn_down_exps_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "bias", i), { n_embd, n_expert}, 0);
+ layer.ffn_up_exps_b = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "bias", i), {n_ff_exp, n_expert}, 0);
+ }
+ } break;
  case LLM_ARCH_LFM2:
  {
  tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
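Among the OPENAI_MOE tensors above, attn_sinks ({n_head} per layer) is the novel one; the graph builder added later in this diff passes it to build_attn_with_sinks. The diff itself only shows the tensor being wired through. A common formulation of an attention sink, assumed here purely for illustration, is an extra learned per-head logit that joins the softmax normalization but contributes no value vector, letting a head place probability mass "nowhere". A toy sketch of that arithmetic, under that assumption:

// Toy illustration under the assumption stated above: the sink logit only
// enlarges the softmax denominator, shrinking the weights of real positions.
#include <cmath>
#include <cstdio>
#include <vector>

int main() {
    const std::vector<float> scores = {2.0f, 1.0f, 0.5f}; // q*k / sqrt(d) for 3 positions (toy)
    const float sink = 1.5f;                              // learned per-head sink logit (toy)

    float denom = std::exp(sink);
    for (float s : scores) denom += std::exp(s);

    for (size_t i = 0; i < scores.size(); ++i) {
        std::printf("p[%zu] = %.4f\n", i, std::exp(scores[i]) / denom);
    }
    std::printf("sink mass = %.4f\n", std::exp(sink) / denom);
    return 0;
}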
@@ -5642,7 +5838,7 @@ void llama_model::print_info() const {
  LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
  }

- if (arch == LLM_ARCH_QWEN3MOE) {
+ if (arch == LLM_ARCH_QWEN3MOE || arch == LLM_ARCH_OPENAI_MOE) {
  LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
  }

@@ -13564,6 +13760,165 @@ struct llm_build_glm4 : public llm_graph_context {
  }
  };

+ struct llm_build_glm4_moe : public llm_graph_context {
+ llm_build_glm4_moe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+
+ inpL = build_inp_embd(model.tok_embd);
+
+ // inp_pos - contains the positions
+ ggml_tensor * inp_pos = build_inp_pos();
+
+ auto * inp_attn = build_attn_inp_kv_unified();
+
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+ // Only process up to last layer (skip final NextN layer)
+ // Final layer tensors are loaded but not processed in forward pass
+ const int n_transformer_layers = n_layer - hparams.nextn_predict_layers;
+ for (int il = 0; il < n_transformer_layers; ++il) {
+ ggml_tensor * inpSA = inpL;
+
+ // Pre-attention norm
+ cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
+ cb(cur, "attn_norm", il);
+
+ // self-attention
+ {
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+ if (model.layers[il].bq) {
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+ }
+ cb(Qcur, "Qcur", il);
+
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+ if (model.layers[il].bk) {
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+ }
+ cb(Kcur, "Kcur", il);
+
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+ if (model.layers[il].bv) {
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+ }
+ cb(Vcur, "Vcur", il);
+
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+ // Apply Q/K norm if available (GLM-4.5 355B variant)
+ if (model.layers[il].attn_q_norm) {
+ Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
+ cb(Qcur, "Qcur_normed", il);
+ }
+ if (model.layers[il].attn_k_norm) {
+ Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
+ cb(Kcur, "Kcur_normed", il);
+ }
+
+ Qcur = ggml_rope_ext(
+ ctx0, Qcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ Kcur = ggml_rope_ext(
+ ctx0, Kcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ cur = build_attn(inp_attn,
+ model.layers[il].wo, NULL,
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ }
+
+ if (il == n_transformer_layers - 1 && inp_out_ids) {
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }
+
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il);
+
+ // Post-attention norm
+ cur = build_norm(ffn_inp, model.layers[il].attn_post_norm, NULL, LLM_NORM_RMS, il);
+ cb(cur, "post_attn_norm", il);
+
+ // Check if this is a dense layer (n_layer_dense_lead=1, so layer 0 is dense)
+ if (static_cast<uint32_t>(il) < hparams.n_layer_dense_lead) {
+ // Dense FFN layer
+ cur = build_ffn(cur,
+ model.layers[il].ffn_up, NULL, NULL,
+ model.layers[il].ffn_gate, NULL, NULL,
+ model.layers[il].ffn_down, NULL, NULL,
+ NULL,
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
+ cb(cur, "ffn_out", il);
+ } else {
+ // Process routed experts using existing MoE infrastructure
+ ggml_tensor * routed_out = build_moe_ffn(cur,
+ model.layers[il].ffn_gate_inp,
+ model.layers[il].ffn_up_exps,
+ model.layers[il].ffn_gate_exps,
+ model.layers[il].ffn_down_exps,
+ model.layers[il].ffn_exp_probs_b,
+ n_expert, n_expert_used,
+ LLM_FFN_SILU, hparams.expert_weights_norm,
+ true, hparams.expert_weights_scale,
+ (llama_expert_gating_func_type) hparams.expert_gating_func,
+ il);
+ cb(routed_out, "ffn_moe_out", il);
+
+ // Process shared expert on original input
+ ggml_tensor * shared_out = build_ffn(cur,
+ model.layers[il].ffn_up_shexp, NULL, NULL,
+ model.layers[il].ffn_gate_shexp, NULL, NULL,
+ model.layers[il].ffn_down_shexp, NULL, NULL,
+ NULL,
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
+ cb(shared_out, "ffn_shexp_out", il);
+
+ // Final output: routed_output + shared_output
+ cur = ggml_add(ctx0, routed_out, shared_out);
+ cb(cur, "ffn_out", il);
+ }
+
+ cur = ggml_add(ctx0, cur, ffn_inp);
+
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+
+ cur = inpL;
+ cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
+
+ // lm_head
+ cur = build_lora_mm(model.output, cur);
+
+ cb(cur, "result_output", -1);
+ res->t_logits = cur;
+
+ ggml_build_forward_expand(gf, cur);
+ }
+ };
+
  struct llm_build_nemotron : public llm_graph_context {
  llm_build_nemotron(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  const int64_t n_embd_head = hparams.n_embd_head_v;
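In the MoE branch of the builder above, build_moe_ffn is configured with sigmoid gating (the default installed by load_hparams when the GGUF supplies none), optional weight normalization (hparams.expert_weights_norm) and a routed-weight scale (hparams.expert_weights_scale); the routed output is then added to the shared-expert output. The sketch below works that gating arithmetic through on plain floats as an illustration of the configuration, not of the build_moe_ffn implementation (the ffn_exp_probs_b selection bias is omitted and all numbers are toy values).

// Illustration only: sigmoid-gated top-k routing with optional weight
// normalization and scaling, as configured for GLM4_MOE above.
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <numeric>
#include <vector>

int main() {
    const std::vector<float> logits = {0.2f, -1.3f, 0.9f, 0.1f}; // router output, n_expert = 4 (toy)
    const int   n_expert_used        = 2;
    const bool  norm_weights         = true;  // hparams.expert_weights_norm
    const float expert_weights_scale = 2.5f;  // hparams.expert_weights_scale (toy value)

    // sigmoid gating (LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID)
    std::vector<float> probs(logits.size());
    for (size_t e = 0; e < logits.size(); ++e) {
        probs[e] = 1.0f / (1.0f + std::exp(-logits[e]));
    }

    // pick the top-k experts by gate probability
    std::vector<int> idx(probs.size());
    std::iota(idx.begin(), idx.end(), 0);
    std::partial_sort(idx.begin(), idx.begin() + n_expert_used, idx.end(),
                      [&](int a, int b) { return probs[a] > probs[b]; });

    // normalize the selected weights, then apply the routed scale
    float sum = 0.0f;
    for (int k = 0; k < n_expert_used; ++k) sum += probs[idx[k]];
    for (int k = 0; k < n_expert_used; ++k) {
        float w = norm_weights ? probs[idx[k]] / sum : probs[idx[k]];
        w *= expert_weights_scale;
        std::printf("expert %d weight %.4f\n", idx[k], w);
    }
    return 0;
}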
@@ -17251,6 +17606,136 @@ struct llm_build_smollm3 : public llm_graph_context {
  }
  };

+ struct llm_build_openai_moe_iswa : public llm_graph_context {
+ llm_build_openai_moe_iswa(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+
+ inpL = build_inp_embd(model.tok_embd);
+
+ // inp_pos - contains the positions
+ ggml_tensor * inp_pos = build_inp_pos();
+
+ auto * inp_attn = build_attn_inp_kv_unified_iswa();
+
+ for (int il = 0; il < n_layer; ++il) {
+ ggml_tensor * inpSA = inpL;
+
+ // norm
+ cur = build_norm(inpL,
+ model.layers[il].attn_norm, nullptr,
+ LLM_NORM_RMS, il);
+ cb(cur, "attn_norm", il);
+
+ // self-attention
+ {
+ // compute Q and K and RoPE them
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);
+ if (model.layers[il].bq) {
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+ cb(Qcur, "Qcur", il);
+ }
+
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);
+ if (model.layers[il].bk) {
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+ cb(Kcur, "Kcur", il);
+ }
+
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
+ if (model.layers[il].bv) {
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+ cb(Vcur, "Vcur", il);
+ }
+
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_rot, n_head, n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_rot, n_head_kv, n_tokens);
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_rot, n_head_kv, n_tokens);
+
+ Qcur = ggml_rope_ext(
+ ctx0, Qcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ Kcur = ggml_rope_ext(
+ ctx0, Kcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ cur = build_attn_with_sinks(inp_attn,
+ model.layers[il].wo, model.layers[il].bo,
+ Qcur, Kcur, Vcur, nullptr, nullptr, model.layers[il].attn_sinks, 1.0f/sqrtf(float(n_rot)), il);
+
+ cb(cur, "attn_out", il);
+ }
+
+ if (il == n_layer - 1) {
+ // skip computing output for unused tokens
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }
+
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il);
+
+ cur = ffn_inp;
+ cur = build_norm(cur,
+ model.layers[il].attn_post_norm, nullptr,
+ LLM_NORM_RMS, il);
+ cb(cur, "attn_post_norm", il);
+
+ // MoE branch
+ cur = build_moe_ffn(cur,
+ model.layers[il].ffn_gate_inp, model.layers[il].ffn_gate_inp_b,
+ model.layers[il].ffn_up_exps, model.layers[il].ffn_up_exps_b,
+ model.layers[il].ffn_gate_exps, model.layers[il].ffn_gate_exps_b,
+ model.layers[il].ffn_down_exps, model.layers[il].ffn_down_exps_b,
+ nullptr,
+ n_expert, n_expert_used,
+ LLM_FFN_SWIGLU_OAI_MOE, false,
+ false, 0.0,
+ LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX_WEIGHT,
+ il);
+ cb(cur, "ffn_moe_out", il);
+
+ cur = ggml_add(ctx0, cur, ffn_inp);
+
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+
+ cur = inpL;
+
+ cur = build_norm(cur,
+ model.output_norm, NULL,
+ LLM_NORM_RMS, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
+
+ // lm_head
+ cur = build_lora_mm(model.output, cur);
+
+ cb(cur, "result_output", -1);
+ res->t_logits = cur;
+
+ ggml_build_forward_expand(gf, cur);
+ }
+ };
+
  struct llm_build_lfm2 : public llm_graph_context {
  const llama_model & model;

@@ -17877,6 +18362,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
  {
  llm = std::make_unique<llm_build_glm4>(*this, params);
  } break;
+ case LLM_ARCH_GLM4_MOE:
+ {
+ llm = std::make_unique<llm_build_glm4_moe>(*this, params);
+ } break;
  case LLM_ARCH_BITNET:
  {
  llm = std::make_unique<llm_build_bitnet>(*this, params);
@@ -17990,6 +18479,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
  {
  llm = std::make_unique<llm_build_smollm3>(*this, params);
  } break;
+ case LLM_ARCH_OPENAI_MOE:
+ {
+ llm = std::make_unique<llm_build_openai_moe_iswa>(*this, params);
+ } break;
  case LLM_ARCH_FALCON_H1:
  {
  llm = std::make_unique<llm_build_falcon_h1>(*this, params);
@@ -18205,9 +18698,11 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
  case LLM_ARCH_MINICPM3:
  case LLM_ARCH_DOTS1:
  case LLM_ARCH_HUNYUAN_MOE:
+ case LLM_ARCH_OPENAI_MOE:
  case LLM_ARCH_HUNYUAN_DENSE:
  case LLM_ARCH_LFM2:
  case LLM_ARCH_SMALLTHINKER:
+ case LLM_ARCH_GLM4_MOE:
  return LLAMA_ROPE_TYPE_NEOX;

  case LLM_ARCH_QWEN2VL:
package/src/llama.cpp/src/llama-model.h

@@ -101,8 +101,10 @@ enum llm_type {
  LLM_TYPE_A13B,
  LLM_TYPE_21B_A3B, // Ernie MoE small
  LLM_TYPE_30B_A3B,
+ LLM_TYPE_106B_A12B, // GLM-4.5-Air
  LLM_TYPE_235B_A22B,
  LLM_TYPE_300B_A47B, // Ernie MoE big
+ LLM_TYPE_355B_A32B, // GLM-4.5
  LLM_TYPE_E2B,
  LLM_TYPE_E4B,
  };
@@ -166,6 +168,15 @@ struct llama_layer_shortconv {
  struct ggml_tensor * out_proj = nullptr;
  };

+ struct llama_layer_nextn {
+ struct ggml_tensor * eh_proj = nullptr;
+ struct ggml_tensor * embed_tokens = nullptr;
+ struct ggml_tensor * enorm = nullptr;
+ struct ggml_tensor * hnorm = nullptr;
+ struct ggml_tensor * shared_head_head = nullptr;
+ struct ggml_tensor * shared_head_norm = nullptr;
+ };
+
  struct llama_layer {
  // normalization
  struct ggml_tensor * attn_norm = nullptr;
@@ -241,10 +252,14 @@ struct llama_layer {
  struct ggml_tensor * ffn_up_enc = nullptr;

  // ff MoE
- struct ggml_tensor * ffn_gate_inp = nullptr;
- struct ggml_tensor * ffn_gate_exps = nullptr;
- struct ggml_tensor * ffn_down_exps = nullptr;
- struct ggml_tensor * ffn_up_exps = nullptr;
+ struct ggml_tensor * ffn_gate_inp = nullptr;
+ struct ggml_tensor * ffn_gate_exps = nullptr;
+ struct ggml_tensor * ffn_down_exps = nullptr;
+ struct ggml_tensor * ffn_up_exps = nullptr;
+ struct ggml_tensor * ffn_gate_inp_b = nullptr;
+ struct ggml_tensor * ffn_gate_exps_b = nullptr;
+ struct ggml_tensor * ffn_down_exps_b = nullptr;
+ struct ggml_tensor * ffn_up_exps_b = nullptr;

  // ff shared expert (shexp)
  struct ggml_tensor * ffn_gate_inp_shexp = nullptr;
@@ -349,11 +364,16 @@ struct llama_layer {
  struct ggml_tensor * laurel_r = nullptr;
  struct ggml_tensor * laurel_post_norm = nullptr;

+ // openai-moe
+ struct ggml_tensor * attn_sinks = nullptr;
+
  struct llama_layer_posnet posnet;

  struct llama_layer_convnext convnext;

  struct llama_layer_shortconv shortconv;
+
+ struct llama_layer_nextn nextn;
  };

  struct llama_model {