@fugood/llama.node 1.1.4 → 1.1.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. package/lib/binding.ts +8 -0
  2. package/package.json +14 -14
  3. package/scripts/llama.cpp.patch +17 -13
  4. package/src/LlamaCompletionWorker.cpp +2 -0
  5. package/src/LlamaContext.cpp +3 -0
  6. package/src/llama.cpp/common/arg.cpp +80 -10
  7. package/src/llama.cpp/common/chat.cpp +52 -8
  8. package/src/llama.cpp/common/chat.h +7 -2
  9. package/src/llama.cpp/common/common.cpp +1 -0
  10. package/src/llama.cpp/common/common.h +16 -6
  11. package/src/llama.cpp/common/speculative.cpp +135 -54
  12. package/src/llama.cpp/common/speculative.h +8 -1
  13. package/src/llama.cpp/ggml/CMakeLists.txt +4 -2
  14. package/src/llama.cpp/ggml/include/ggml.h +37 -1
  15. package/src/llama.cpp/ggml/src/CMakeLists.txt +12 -1
  16. package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +61 -0
  17. package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +96 -8
  18. package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +3196 -0
  19. package/src/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +20 -0
  20. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +14 -1
  21. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +207 -9
  22. package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +2 -7
  23. package/src/llama.cpp/ggml/src/ggml-cpu/quants.c +35 -0
  24. package/src/llama.cpp/ggml/src/ggml-cpu/quants.h +8 -0
  25. package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +263 -0
  26. package/src/llama.cpp/ggml/src/ggml-cpu/repack.h +11 -0
  27. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +19 -4
  28. package/src/llama.cpp/include/llama.h +9 -4
  29. package/src/llama.cpp/src/llama-arch.cpp +105 -0
  30. package/src/llama.cpp/src/llama-arch.h +12 -0
  31. package/src/llama.cpp/src/llama-batch.cpp +1 -1
  32. package/src/llama.cpp/src/llama-chat.cpp +33 -1
  33. package/src/llama.cpp/src/llama-chat.h +2 -0
  34. package/src/llama.cpp/src/llama-context.cpp +19 -10
  35. package/src/llama.cpp/src/llama-context.h +4 -1
  36. package/src/llama.cpp/src/llama-graph.cpp +175 -148
  37. package/src/llama.cpp/src/llama-graph.h +60 -23
  38. package/src/llama.cpp/src/llama-hparams.h +5 -3
  39. package/src/llama.cpp/src/llama-kv-cache-unified.cpp +6 -2
  40. package/src/llama.cpp/src/llama-kv-cache-unified.h +1 -1
  41. package/src/llama.cpp/src/llama-memory-hybrid.cpp +2 -1
  42. package/src/llama.cpp/src/llama-memory-hybrid.h +1 -0
  43. package/src/llama.cpp/src/llama-model-loader.cpp +1 -0
  44. package/src/llama.cpp/src/llama-model-loader.h +3 -2
  45. package/src/llama.cpp/src/llama-model.cpp +949 -75
  46. package/src/llama.cpp/src/llama-model.h +24 -4
  47. package/src/llama.cpp/src/llama-quant.cpp +40 -4
  48. package/src/llama.cpp/src/llama-vocab.cpp +49 -1
  49. package/src/llama.cpp/src/llama-vocab.h +1 -0
@@ -109,8 +109,10 @@ const char * llm_type_name(llm_type type) {
         case LLM_TYPE_A13B: return "A13B";
         case LLM_TYPE_21B_A3B: return "21B.A3B";
         case LLM_TYPE_30B_A3B: return "30B.A3B";
+        case LLM_TYPE_106B_A12B: return "106B.A12B";
         case LLM_TYPE_235B_A22B: return "235B.A22B";
         case LLM_TYPE_300B_A47B: return "300B.A47B";
+        case LLM_TYPE_355B_A32B: return "355B.A32B";
         case LLM_TYPE_E2B: return "E2B";
         case LLM_TYPE_E4B: return "E4B";
         default: return "?B";
@@ -190,6 +192,13 @@ static bool weight_buft_supported(const llama_hparams & hparams, ggml_tensor * w
                 ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], w->ne[1], w->ne[2], w->ne[3]);
                 op_tensor = ggml_add(ctx, a, w);
             } break;
+        case GGML_OP_ADD_ID:
+            {
+                int n_expert_used = hparams.n_expert_used;
+                ggml_tensor * a = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, w->ne[0], n_expert_used, 512);
+                ggml_tensor * c = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_expert_used, 512);
+                op_tensor = ggml_add_id(ctx, a, w, c);
+            } break;
         case GGML_OP_MUL:
             {
                 ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], w->ne[1], w->ne[2], w->ne[3]);
@@ -258,6 +267,10 @@ static bool weight_buft_supported(const llama_hparams & hparams, ggml_tensor * w
                 ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, n_embd, w->ne[1], 1, 1);
                 op_tensor = ggml_im2col(ctx, w, b, 1, 0, 0, 0, 1, 0, false, GGML_TYPE_F16);
             } break;
+        case GGML_OP_SCALE:
+            {
+                op_tensor = ggml_scale(ctx, w, 1.0f);
+            } break;
         default:
             GGML_ABORT("%s: missing test for op %s for tensor %s", __func__, ggml_op_name(op), w->name);
     }
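The GGML_OP_ADD_ID case above exercises the new per-expert bias path added for MoE bias tensors. Below is a minimal standalone sketch of the same construction against the ggml API; it assumes, based on this usage, that ggml_add_id(ctx, a, b, ids) adds the rows of b selected by ids to a, and all sizes are illustrative only (the 512 mirrors the batch size used in the test above).

    #include "ggml.h"

    // Sketch only: build the op metadata the same way weight_buft_supported() does.
    static void add_id_shape_check(void) {
        struct ggml_init_params ip = {
            /*.mem_size   =*/ 16*1024*1024,
            /*.mem_buffer =*/ NULL,
            /*.no_alloc   =*/ true,   // metadata only, no tensor data
        };
        struct ggml_context * ctx = ggml_init(ip);

        const int64_t n_embd        = 64;   // illustrative
        const int64_t n_expert      = 8;    // illustrative
        const int64_t n_expert_used = 2;    // illustrative
        const int64_t n_tokens      = 512;

        // activations: one row per (expert slot, token)
        struct ggml_tensor * a   = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_embd, n_expert_used, n_tokens);
        // per-expert bias table, e.g. an ffn_*_exps bias tensor
        struct ggml_tensor * b   = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_expert);
        // selected expert ids per token
        struct ggml_tensor * ids = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_expert_used, n_tokens);

        struct ggml_tensor * out = ggml_add_id(ctx, a, b, ids);
        GGML_ASSERT(ggml_are_same_shape(out, a));  // result keeps the activation shape

        ggml_free(ctx);
    }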
@@ -290,7 +303,7 @@ static ggml_backend_buffer_type_t select_weight_buft(const llama_hparams & hpara
 }
 
 // CPU: ACCEL -> GPU host -> CPU extra -> CPU
-static buft_list_t make_cpu_buft_list(const std::vector<ggml_backend_dev_t> & devices) {
+static buft_list_t make_cpu_buft_list(const std::vector<ggml_backend_dev_t> & devices, bool use_extra_bufts) {
     buft_list_t buft_list;
 
     // add ACCEL buffer types
@@ -319,21 +332,22 @@ static buft_list_t make_cpu_buft_list(const std::vector<ggml_backend_dev_t> & de
         }
     }
 
-    // add extra buffer types, only if no GPU device is present
-    // ref: https://github.com/ggml-org/llama.cpp/issues/12481#issuecomment-2743136094
-    auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
-    if (cpu_dev == nullptr) {
-        throw std::runtime_error(format("%s: no CPU backend found", __func__));
-    }
+    // add extra buffer types
+    if (use_extra_bufts) {
+        auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+        if (cpu_dev == nullptr) {
+            throw std::runtime_error(format("%s: no CPU backend found", __func__));
+        }
 
-    auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);
-    auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
-        ggml_backend_reg_get_proc_address(cpu_reg, "ggml_backend_dev_get_extra_bufts");
-    if (ggml_backend_dev_get_extra_bufts_fn) {
-        ggml_backend_buffer_type_t * extra_bufts = ggml_backend_dev_get_extra_bufts_fn(cpu_dev);
-        while (extra_bufts && *extra_bufts) {
-            buft_list.emplace_back(cpu_dev, *extra_bufts);
-            ++extra_bufts;
+        auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);
+        auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
+            ggml_backend_reg_get_proc_address(cpu_reg, "ggml_backend_dev_get_extra_bufts");
+        if (ggml_backend_dev_get_extra_bufts_fn) {
+            ggml_backend_buffer_type_t * extra_bufts = ggml_backend_dev_get_extra_bufts_fn(cpu_dev);
+            while (extra_bufts && *extra_bufts) {
+                buft_list.emplace_back(cpu_dev, *extra_bufts);
+                ++extra_bufts;
+            }
         }
     }
 
@@ -869,6 +883,21 @@ void llama_model::load_hparams(llama_model_loader & ml) {
869
883
  hparams.causal_attn = false;
870
884
  }
871
885
  break;
886
+ case LLM_ARCH_LLADA:
887
+ {
888
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
889
+ // LLaDA-8B has 32 layers, similar to LLaMA but for diffusion
890
+ switch (hparams.n_layer) {
891
+ case 32:
892
+ type = LLM_TYPE_8B;
893
+ break;
894
+ default:
895
+ type = LLM_TYPE_UNKNOWN;
896
+ }
897
+ // Set non-causal attention for diffusion models
898
+ hparams.causal_attn = false;
899
+ }
900
+ break;
872
901
  case LLM_ARCH_QWEN2MOE:
873
902
  {
874
903
  ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
@@ -883,6 +912,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
883
912
  } break;
884
913
  case LLM_ARCH_QWEN3:
885
914
  {
915
+ ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
886
916
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
887
917
  switch (hparams.n_layer) {
888
918
  case 28: type = hparams.n_embd == 1024 ? LLM_TYPE_0_6B : LLM_TYPE_1_7B; break;
@@ -1417,6 +1447,34 @@ void llama_model::load_hparams(llama_model_loader & ml) {
1417
1447
  default: type = LLM_TYPE_UNKNOWN;
1418
1448
  }
1419
1449
  } break;
1450
+ case LLM_ARCH_GLM4_MOE:
1451
+ {
1452
+ ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
1453
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1454
+
1455
+ // MoE parameters
1456
+ ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert);
1457
+ ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used);
1458
+ ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
1459
+ ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead, false);
1460
+ ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
1461
+ ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
1462
+
1463
+ // Expert gating function (GLM-4.5 uses sigmoid)
1464
+ ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
1465
+ if (hparams.expert_gating_func == LLAMA_EXPERT_GATING_FUNC_TYPE_NONE) {
1466
+ hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID;
1467
+ }
1468
+
1469
+ // NextN/MTP parameters
1470
+ ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false);
1471
+
1472
+ switch (hparams.n_layer) {
1473
+ case 47: type = LLM_TYPE_106B_A12B; break; // GLM-4.5-Air (46 layers + 1 NextN layer)
1474
+ case 93: type = LLM_TYPE_355B_A32B; break; // GLM-4.5 (92 layers + 1 NextN layer)
1475
+ default: type = LLM_TYPE_UNKNOWN;
1476
+ }
1477
+ } break;
1420
1478
  case LLM_ARCH_BITNET:
1421
1479
  {
1422
1480
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -1744,6 +1802,18 @@ void llama_model::load_hparams(llama_model_loader & ml) {
1744
1802
  default: type = LLM_TYPE_UNKNOWN;
1745
1803
  }
1746
1804
  } break;
1805
+ case LLM_ARCH_HUNYUAN_DENSE:
1806
+ {
1807
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1808
+
1809
+ switch (hparams.n_embd) {
1810
+ case 1024: type = LLM_TYPE_0_5B; break;
1811
+ case 2048: type = LLM_TYPE_1_8B; break;
1812
+ case 3072: type = LLM_TYPE_4B; break;
1813
+ case 4096: type = LLM_TYPE_7B; break;
1814
+ default: type = LLM_TYPE_UNKNOWN;
1815
+ }
1816
+ } break;
1747
1817
  case LLM_ARCH_SMOLLM3:
1748
1818
  {
1749
1819
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -1754,6 +1824,17 @@ void llama_model::load_hparams(llama_model_loader & ml) {
1754
1824
  default: type = LLM_TYPE_UNKNOWN;
1755
1825
  }
1756
1826
  } break;
1827
+ case LLM_ARCH_OPENAI_MOE:
1828
+ {
1829
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1830
+ ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
1831
+ ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
1832
+
1833
+ hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
1834
+ hparams.set_swa_pattern(2);
1835
+
1836
+ // TODO: switch (hparams.n_layer)
1837
+ } break;
1757
1838
  case LLM_ARCH_LFM2:
1758
1839
  {
1759
1840
  ml.get_key(LLM_KV_SHORTCONV_L_CACHE, hparams.n_shortconv_l_cache);
@@ -1824,7 +1905,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
     LLAMA_LOG_INFO("%s: loading model tensors, this can take a while... (mmap = %s)\n", __func__, ml.use_mmap ? "true" : "false");
 
     // build a list of buffer types for the CPU and GPU devices
-    pimpl->cpu_buft_list = make_cpu_buft_list(devices);
+    pimpl->cpu_buft_list = make_cpu_buft_list(devices, params.use_extra_bufts);
     for (auto * dev : devices) {
         buft_list_t buft_list = make_gpu_buft_list(dev, split_mode, tensor_split);
         // add CPU buffer types as a fallback
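The params.use_extra_bufts flag threaded through here controls whether the repacked "extra" CPU buffer types are considered at all when weights land on the CPU. For reference, a minimal sketch of the same enumeration from application code, using only the registry calls that the -319 hunk above already relies on (the header that declares ggml_backend_dev_get_extra_bufts_t is assumed to be ggml-cpu.h and may differ between ggml versions):

    #include <cstdio>
    #include "ggml-backend.h"
    #include "ggml-cpu.h"   // assumed location of ggml_backend_dev_get_extra_bufts_t

    static void list_extra_cpu_bufts(void) {
        ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
        if (cpu_dev == nullptr) {
            return;
        }
        ggml_backend_reg_t cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);
        auto get_extra_bufts = (ggml_backend_dev_get_extra_bufts_t)
            ggml_backend_reg_get_proc_address(cpu_reg, "ggml_backend_dev_get_extra_bufts");
        if (!get_extra_bufts) {
            return;  // this CPU backend build exposes no extra buffer types
        }
        for (ggml_backend_buffer_type_t * bufts = get_extra_bufts(cpu_dev); bufts && *bufts; ++bufts) {
            // e.g. repacked buffer types used for faster CPU matmuls
            printf("extra CPU buffer type: %s\n", ggml_backend_buft_name(*bufts));
        }
    }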
@@ -1920,6 +2001,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
 
     const auto TENSOR_DUPLICATED = llama_model_loader::TENSOR_DUPLICATED;
     const auto TENSOR_NOT_REQUIRED = llama_model_loader::TENSOR_NOT_REQUIRED;
+    const auto TENSOR_SKIP = llama_model_loader::TENSOR_SKIP;
 
     // create tensors for the weights
     {
@@ -1975,7 +2057,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
         }
 
         // skip unused tensors
-        if (info.op == GGML_OP_NONE) {
+        if (info.op == GGML_OP_NONE || flags & TENSOR_SKIP) {
            const size_t nbytes = ggml_nbytes(t_meta);
            LLAMA_LOG_WARN("model has unused tensor %s (size = %zu bytes) -- ignoring\n", tn.str().c_str(), nbytes);
 
@@ -1985,11 +2067,15 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
            return nullptr;
        }
 
-        // tensors with "bias" suffix are always used with GGML_OP_ADD
+        // tensors with "bias" suffix are always used with GGML_OP_ADD or GGML_OP_ADD_ID
        ggml_op op;
        bool bias = tn.suffix != nullptr && strcmp(tn.suffix, "bias") == 0;
        if (bias) {
-            op = GGML_OP_ADD;
+            if (info.op == GGML_OP_MUL_MAT_ID) {
+                op = GGML_OP_ADD_ID;
+            } else {
+                op = GGML_OP_ADD;
+            }
        } else {
            op = info.op;
        }
@@ -2029,7 +2115,13 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
        for (const auto * overrides = ml.tensor_buft_overrides; overrides->pattern != nullptr; ++overrides) {
            std::regex pattern(overrides->pattern);
            if (std::regex_search(tensor_name, pattern)) {
-                buft = overrides->buft;
+                if (overrides->buft == ggml_backend_cpu_buffer_type()) {
+                    // when overriding to a CPU buffer, consider the extra buffer types
+                    buft = select_weight_buft(hparams, t_meta, op, pimpl->cpu_buft_list);
+                } else {
+                    buft = overrides->buft;
+                }
+
                LLAMA_LOG_DEBUG("tensor %s (%zu MiB %s) buffer type overridden to %s\n",
                    tensor_name.c_str(),
                    ggml_nbytes(t_meta) / 1024 / 1024, ggml_type_name(t_meta->type),
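With this change, an override that resolves to the CPU buffer type no longer bypasses the repacked CPU buffer types: the loader re-runs select_weight_buft over the CPU list instead of taking the override verbatim. As a usage sketch (the model path is a placeholder), the llama.cpp CLI tools expose these overrides through the --override-tensor / -ot flag, so MoE expert weights can be pinned to host memory while the rest of the model is offloaded:

    llama-cli -m model.gguf -ngl 99 -ot "ffn_.*_exps=CPU"

The pattern is matched against each tensor name with std::regex_search, exactly as the hunk above shows.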
@@ -2149,6 +2241,53 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
2149
2241
  }
2150
2242
  }
2151
2243
  } break;
2244
+ case LLM_ARCH_LLADA:
2245
+ {
2246
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
2247
+
2248
+ // output
2249
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
2250
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED);
2251
+
2252
+ // if output is NULL, init from the input tok embed
2253
+ if (output == NULL) {
2254
+ output =
2255
+ create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, TENSOR_DUPLICATED);
2256
+ }
2257
+
2258
+ for (int i = 0; i < n_layer; ++i) {
2259
+ auto & layer = layers[i];
2260
+
2261
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
2262
+
2263
+ // Use separate Q, K, V projections without bias, matching LLaDALlamaBlock
2264
+ layer.wq =
2265
+ create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head }, 0);
2266
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_k_gqa }, 0);
2267
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_v_gqa }, 0);
2268
+ // No bias for QKV projections as per config: include_bias=false, include_qkv_bias=false
2269
+ layer.wo =
2270
+ create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0);
2271
+ layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), { n_embd }, TENSOR_NOT_REQUIRED);
2272
+
2273
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd }, 0);
2274
+
2275
+ layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), { n_rot / 2 },
2276
+ TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
2277
+
2278
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), { n_embd, n_ff }, 0);
2279
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, 0);
2280
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, n_ff }, 0);
2281
+
2282
+ // optional MLP bias
2283
+ layer.ffn_gate_b =
2284
+ create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), { n_ff }, TENSOR_NOT_REQUIRED);
2285
+ layer.ffn_down_b =
2286
+ create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), { n_embd }, TENSOR_NOT_REQUIRED);
2287
+ layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), { n_ff }, TENSOR_NOT_REQUIRED);
2288
+ }
2289
+ }
2290
+ break;
2152
2291
  case LLM_ARCH_LLAMA4:
2153
2292
  {
2154
2293
  tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -4345,6 +4484,105 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
4345
4484
  layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
4346
4485
  }
4347
4486
  } break;
4487
+ case LLM_ARCH_GLM4_MOE:
4488
+ {
4489
+ const int64_t n_expert = hparams.n_expert;
4490
+ const int64_t n_expert_used = hparams.n_expert_used;
4491
+ const int64_t n_expert_shared = hparams.n_expert_shared;
4492
+
4493
+ GGML_ASSERT(hparams.n_expert > 0 && "n_expert must be > 0 for GLM4_MOE MoE layers");
4494
+ GGML_ASSERT(hparams.n_expert_used > 0 && "n_expert_used must be > 0 for GLM4_MOE MoE layers");
4495
+
4496
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
4497
+
4498
+ // output
4499
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
4500
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED);
4501
+ // if output is NULL, init from the input tok embed
4502
+ if (output == NULL) {
4503
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, TENSOR_DUPLICATED);
4504
+ }
4505
+
4506
+ // Load ALL tensors including NextN layer to satisfy total tensor count
4507
+ // but only PROCESS up to last layer (skipping final NextN layer) in forward pass
4508
+ for (int i = 0; i < n_layer; ++i) {
4509
+ int flags = 0;
4510
+ if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
4511
+ // skip all tensors in the NextN layers
4512
+ flags |= TENSOR_SKIP;
4513
+ }
4514
+
4515
+ auto & layer = layers[i];
4516
+
4517
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, flags);
4518
+
4519
+ // GLM-style attention with bias terms
4520
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head }, flags);
4521
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_k_gqa }, flags);
4522
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_v_gqa }, flags);
4523
+ layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), { n_embd_head_k * n_head }, flags);
4524
+ layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), { n_embd_k_gqa }, flags);
4525
+ layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), { n_embd_v_gqa }, flags);
4526
+
4527
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, flags);
4528
+
4529
+ // K/Q norm tensors (optional for GLM-4.5 355B variant)
4530
+ layer.attn_q_norm = create_tensor(
4531
+ tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), { n_embd_head_k }, TENSOR_NOT_REQUIRED | flags);
4532
+ layer.attn_k_norm = create_tensor(
4533
+ tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), { n_embd_head_k }, TENSOR_NOT_REQUIRED | flags);
4534
+
4535
+ layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), { n_embd }, flags);
4536
+
4537
+ // Check if this layer uses MoE or dense FFN based on n_layer_dense_lead
4538
+ // GLM 4.5 uses hybrid architecture: layer 0 is dense, layers 1+ are MoE
4539
+ const bool use_moe = (static_cast<uint32_t>(i) >= hparams.n_layer_dense_lead);
4540
+
4541
+ if (use_moe) {
4542
+ // MoE layers
4543
+ layer.ffn_gate_inp =
4544
+ create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert }, flags);
4545
+ layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), { n_expert }, flags);
4546
+
4547
+ // MoE branch
4548
+ const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
4549
+
4550
+ layer.ffn_gate_exps = create_tensor(
4551
+ tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, flags);
4552
+ layer.ffn_down_exps = create_tensor(
4553
+ tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert }, flags);
4554
+ layer.ffn_up_exps = create_tensor(
4555
+ tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, flags);
4556
+
4557
+ // Shared expert
4558
+ if (n_expert_shared > 0) {
4559
+ const int64_t n_ff_shexp = n_ff_exp * n_expert_shared;
4560
+ layer.ffn_gate_shexp = create_tensor(
4561
+ tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), { n_embd, n_ff_shexp }, flags);
4562
+ layer.ffn_down_shexp = create_tensor(
4563
+ tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_shexp, n_embd }, flags);
4564
+ layer.ffn_up_shexp = create_tensor(
4565
+ tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), { n_embd, n_ff_shexp }, flags);
4566
+ }
4567
+ } else {
4568
+ // Dense layers (first k layers) - GLM uses separate gate/up projections
4569
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), { n_embd, n_ff }, flags);
4570
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, flags);
4571
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, n_ff }, flags);
4572
+ }
4573
+
4574
+ // NextN/MTP tensors (preserved but unused) - conditionally load for last nextn_predict_layers
4575
+ if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
4576
+ layer.nextn.eh_proj = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i), { 2 * n_embd, n_embd }, flags);
4577
+ layer.nextn.embed_tokens = create_tensor(tn(LLM_TENSOR_NEXTN_EMBED_TOKENS, "weight", i), { n_embd, n_vocab }, flags);
4578
+ layer.nextn.enorm = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM, "weight", i), { n_embd }, flags);
4579
+ layer.nextn.hnorm = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM, "weight", i), { n_embd }, flags);
4580
+ layer.nextn.shared_head_head = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "weight", i), { n_embd, n_vocab }, flags);
4581
+ layer.nextn.shared_head_norm = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "weight", i), { n_embd }, flags);
4582
+ }
4583
+ }
4584
+ }
4585
+ break;
4348
4586
  case LLM_ARCH_NEMOTRON:
4349
4587
  {
4350
4588
  tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -5126,6 +5364,39 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
5126
5364
  layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {hparams.n_ff_shexp, n_embd}, 0);
5127
5365
  }
5128
5366
  } break;
5367
+ case LLM_ARCH_HUNYUAN_DENSE:
5368
+ {
5369
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
5370
+
5371
+ // output
5372
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
5373
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
5374
+ // if output is NULL, init from the input tok embed
5375
+ if (output == NULL) {
5376
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
5377
+ }
5378
+
5379
+ for (int i = 0; i < n_layer; ++i) {
5380
+ auto & layer = layers[i];
5381
+
5382
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
5383
+
5384
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
5385
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
5386
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
5387
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
5388
+
5389
+ layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
5390
+ layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
5391
+
5392
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
5393
+
5394
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
5395
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
5396
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
5397
+
5398
+ }
5399
+ } break;
5129
5400
  case LLM_ARCH_SMOLLM3:
5130
5401
  {
5131
5402
  tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -5155,6 +5426,46 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
5155
5426
  layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
5156
5427
  }
5157
5428
  } break;
5429
+ case LLM_ARCH_OPENAI_MOE:
5430
+ {
5431
+ const int64_t n_ff_exp = hparams.n_ff_exp;
5432
+
5433
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
5434
+
5435
+ // output
5436
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
5437
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
5438
+
5439
+ for (int i = 0; i < n_layer; ++i) {
5440
+ auto & layer = layers[i];
5441
+
5442
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
5443
+ layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
5444
+
5445
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_head * n_rot}, 0);
5446
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_head_kv * n_rot}, 0);
5447
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_head_kv * n_rot}, 0);
5448
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head * n_rot, n_embd}, 0);
5449
+
5450
+ layer.attn_sinks = create_tensor(tn(LLM_TENSOR_ATTN_SINKS, "weight", i), {n_head}, 0);
5451
+
5452
+ layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert}, 0);
5453
+ layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
5454
+ layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
5455
+ layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
5456
+
5457
+ // bias
5458
+ layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_head * n_rot}, 0);
5459
+ layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_head_kv * n_rot}, 0);
5460
+ layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_head_kv * n_rot}, 0);
5461
+ layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
5462
+
5463
+ layer.ffn_gate_inp_b = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "bias", i), {n_expert}, 0);
5464
+ layer.ffn_gate_exps_b = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "bias", i), {n_ff_exp, n_expert}, 0);
5465
+ layer.ffn_down_exps_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "bias", i), { n_embd, n_expert}, 0);
5466
+ layer.ffn_up_exps_b = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "bias", i), {n_ff_exp, n_expert}, 0);
5467
+ }
5468
+ } break;
5158
5469
  case LLM_ARCH_LFM2:
5159
5470
  {
5160
5471
  tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -5527,7 +5838,7 @@ void llama_model::print_info() const {
        LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
    }
 
-    if (arch == LLM_ARCH_QWEN3MOE) {
+    if (arch == LLM_ARCH_QWEN3MOE || arch == LLM_ARCH_OPENAI_MOE) {
        LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
    }
 
@@ -8042,8 +8353,10 @@ struct llm_build_dream : public llm_graph_context {
8042
8353
  }
8043
8354
  };
8044
8355
 
8045
- struct llm_build_qwen2vl : public llm_graph_context {
8046
- llm_build_qwen2vl(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
8356
+ struct llm_build_llada : public llm_graph_context {
8357
+ llm_build_llada(const llama_model & model, const llm_graph_params & params) :
8358
+ llm_graph_context(params) {
8359
+ // LLaDA is similar to LLaMA but uses non-causal attention for diffusion
8047
8360
  const int64_t n_embd_head = hparams.n_embd_head_v;
8048
8361
 
8049
8362
  GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -8057,10 +8370,8 @@ struct llm_build_qwen2vl : public llm_graph_context {
8057
8370
  // inp_pos - contains the positions
8058
8371
  ggml_tensor * inp_pos = build_inp_pos();
8059
8372
 
8060
- auto * inp_attn = build_attn_inp_kv_unified();
8061
-
8062
- int sections[4];
8063
- std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);
8373
+ // Non-causal attention for diffusion
8374
+ auto * inp_attn = build_attn_inp_no_cache();
8064
8375
 
8065
8376
  ggml_tensor * inp_out_ids = build_inp_out_ids();
8066
8377
 
@@ -8068,53 +8379,40 @@ struct llm_build_qwen2vl : public llm_graph_context {
8068
8379
  ggml_tensor * inpSA = inpL;
8069
8380
 
8070
8381
  // norm
8071
- cur = build_norm(inpL,
8072
- model.layers[il].attn_norm, NULL,
8073
- LLM_NORM_RMS, il);
8382
+ cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
8074
8383
  cb(cur, "attn_norm", il);
8075
8384
 
8076
8385
  // self-attention
8077
8386
  {
8078
- // compute Q and K and RoPE them
8387
+ // compute separate Q, K, V projections without bias, matching LLaDALlamaBlock
8079
8388
  ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
8080
- Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
8081
- cb(Qcur, "Qcur", il);
8082
-
8083
8389
  ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
8084
- Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
8085
- cb(Kcur, "Kcur", il);
8086
-
8087
8390
  ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
8088
- Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
8391
+
8392
+ cb(Qcur, "Qcur", il);
8393
+ cb(Kcur, "Kcur", il);
8089
8394
  cb(Vcur, "Vcur", il);
8090
8395
 
8091
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
8396
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
8092
8397
  Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
8093
8398
  Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
8094
8399
 
8095
- Qcur = ggml_rope_multi(
8096
- ctx0, Qcur, inp_pos, nullptr,
8097
- n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
8098
- ext_factor, attn_factor, beta_fast, beta_slow
8099
- );
8400
+ Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
8401
+ ext_factor, attn_factor, beta_fast, beta_slow);
8100
8402
 
8101
- Kcur = ggml_rope_multi(
8102
- ctx0, Kcur, inp_pos, nullptr,
8103
- n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
8104
- ext_factor, attn_factor, beta_fast, beta_slow
8105
- );
8403
+ Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
8404
+ ext_factor, attn_factor, beta_fast, beta_slow);
8106
8405
 
8107
8406
  cb(Qcur, "Qcur", il);
8108
8407
  cb(Kcur, "Kcur", il);
8109
8408
  cb(Vcur, "Vcur", il);
8110
8409
 
8111
- cur = build_attn(inp_attn,
8112
- model.layers[il].wo, model.layers[il].bo,
8113
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
8410
+ cur = build_attn(inp_attn, model.layers[il].wo, NULL, Qcur, Kcur, Vcur, nullptr, nullptr,
8411
+ 1.0f / sqrtf(float(n_embd_head)), il);
8114
8412
  }
8115
8413
 
8116
8414
  if (il == n_layer - 1 && inp_out_ids) {
8117
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
8415
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
8118
8416
  inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
8119
8417
  }
8120
8418
 
@@ -8122,17 +8420,11 @@ struct llm_build_qwen2vl : public llm_graph_context {
8122
8420
  cb(ffn_inp, "ffn_inp", il);
8123
8421
 
8124
8422
  // feed-forward network
8125
- cur = build_norm(ffn_inp,
8126
- model.layers[il].ffn_norm, NULL,
8127
- LLM_NORM_RMS, il);
8423
+ cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il);
8128
8424
  cb(cur, "ffn_norm", il);
8129
8425
 
8130
- cur = build_ffn(cur,
8131
- model.layers[il].ffn_up, NULL, NULL,
8132
- model.layers[il].ffn_gate, NULL, NULL,
8133
- model.layers[il].ffn_down, NULL, NULL,
8134
- NULL,
8135
- LLM_FFN_SILU, LLM_FFN_PAR, il);
8426
+ cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL,
8427
+ model.layers[il].ffn_down, NULL, NULL, NULL, LLM_FFN_SILU, LLM_FFN_PAR, il);
8136
8428
  cb(cur, "ffn_out", il);
8137
8429
 
8138
8430
  cur = ggml_add(ctx0, cur, ffn_inp);
@@ -8146,9 +8438,7 @@ struct llm_build_qwen2vl : public llm_graph_context {
8146
8438
 
8147
8439
  cur = inpL;
8148
8440
 
8149
- cur = build_norm(cur,
8150
- model.output_norm, NULL,
8151
- LLM_NORM_RMS, -1);
8441
+ cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
8152
8442
 
8153
8443
  cb(cur, "result_norm", -1);
8154
8444
  res->t_embd = cur;
@@ -8163,8 +8453,8 @@ struct llm_build_qwen2vl : public llm_graph_context {
8163
8453
  }
8164
8454
  };
8165
8455
 
8166
- struct llm_build_qwen2moe : public llm_graph_context {
8167
- llm_build_qwen2moe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
8456
+ struct llm_build_qwen2vl : public llm_graph_context {
8457
+ llm_build_qwen2vl(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
8168
8458
  const int64_t n_embd_head = hparams.n_embd_head_v;
8169
8459
 
8170
8460
  GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -8180,6 +8470,9 @@ struct llm_build_qwen2moe : public llm_graph_context {
8180
8470
 
8181
8471
  auto * inp_attn = build_attn_inp_kv_unified();
8182
8472
 
8473
+ int sections[4];
8474
+ std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);
8475
+
8183
8476
  ggml_tensor * inp_out_ids = build_inp_out_ids();
8184
8477
 
8185
8478
  for (int il = 0; il < n_layer; ++il) {
@@ -8191,14 +8484,132 @@ struct llm_build_qwen2moe : public llm_graph_context {
8191
8484
  LLM_NORM_RMS, il);
8192
8485
  cb(cur, "attn_norm", il);
8193
8486
 
8194
- // self_attention
8487
+ // self-attention
8195
8488
  {
8196
8489
  // compute Q and K and RoPE them
8197
8490
  ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
8491
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
8198
8492
  cb(Qcur, "Qcur", il);
8199
- if (model.layers[il].bq) {
8200
- Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
8201
- cb(Qcur, "Qcur", il);
8493
+
8494
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
8495
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
8496
+ cb(Kcur, "Kcur", il);
8497
+
8498
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
8499
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
8500
+ cb(Vcur, "Vcur", il);
8501
+
8502
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
8503
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
8504
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
8505
+
8506
+ Qcur = ggml_rope_multi(
8507
+ ctx0, Qcur, inp_pos, nullptr,
8508
+ n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
8509
+ ext_factor, attn_factor, beta_fast, beta_slow
8510
+ );
8511
+
8512
+ Kcur = ggml_rope_multi(
8513
+ ctx0, Kcur, inp_pos, nullptr,
8514
+ n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
8515
+ ext_factor, attn_factor, beta_fast, beta_slow
8516
+ );
8517
+
8518
+ cb(Qcur, "Qcur", il);
8519
+ cb(Kcur, "Kcur", il);
8520
+ cb(Vcur, "Vcur", il);
8521
+
8522
+ cur = build_attn(inp_attn,
8523
+ model.layers[il].wo, model.layers[il].bo,
8524
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
8525
+ }
8526
+
8527
+ if (il == n_layer - 1 && inp_out_ids) {
8528
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
8529
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
8530
+ }
8531
+
8532
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
8533
+ cb(ffn_inp, "ffn_inp", il);
8534
+
8535
+ // feed-forward network
8536
+ cur = build_norm(ffn_inp,
8537
+ model.layers[il].ffn_norm, NULL,
8538
+ LLM_NORM_RMS, il);
8539
+ cb(cur, "ffn_norm", il);
8540
+
8541
+ cur = build_ffn(cur,
8542
+ model.layers[il].ffn_up, NULL, NULL,
8543
+ model.layers[il].ffn_gate, NULL, NULL,
8544
+ model.layers[il].ffn_down, NULL, NULL,
8545
+ NULL,
8546
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
8547
+ cb(cur, "ffn_out", il);
8548
+
8549
+ cur = ggml_add(ctx0, cur, ffn_inp);
8550
+
8551
+ cur = build_cvec(cur, il);
8552
+ cb(cur, "l_out", il);
8553
+
8554
+ // input for next layer
8555
+ inpL = cur;
8556
+ }
8557
+
8558
+ cur = inpL;
8559
+
8560
+ cur = build_norm(cur,
8561
+ model.output_norm, NULL,
8562
+ LLM_NORM_RMS, -1);
8563
+
8564
+ cb(cur, "result_norm", -1);
8565
+ res->t_embd = cur;
8566
+
8567
+ // lm_head
8568
+ cur = build_lora_mm(model.output, cur);
8569
+
8570
+ cb(cur, "result_output", -1);
8571
+ res->t_logits = cur;
8572
+
8573
+ ggml_build_forward_expand(gf, cur);
8574
+ }
8575
+ };
8576
+
8577
+ struct llm_build_qwen2moe : public llm_graph_context {
8578
+ llm_build_qwen2moe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
8579
+ const int64_t n_embd_head = hparams.n_embd_head_v;
8580
+
8581
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
8582
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
8583
+
8584
+ ggml_tensor * cur;
8585
+ ggml_tensor * inpL;
8586
+
8587
+ inpL = build_inp_embd(model.tok_embd);
8588
+
8589
+ // inp_pos - contains the positions
8590
+ ggml_tensor * inp_pos = build_inp_pos();
8591
+
8592
+ auto * inp_attn = build_attn_inp_kv_unified();
8593
+
8594
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
8595
+
8596
+ for (int il = 0; il < n_layer; ++il) {
8597
+ ggml_tensor * inpSA = inpL;
8598
+
8599
+ // norm
8600
+ cur = build_norm(inpL,
8601
+ model.layers[il].attn_norm, NULL,
8602
+ LLM_NORM_RMS, il);
8603
+ cb(cur, "attn_norm", il);
8604
+
8605
+ // self_attention
8606
+ {
8607
+ // compute Q and K and RoPE them
8608
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
8609
+ cb(Qcur, "Qcur", il);
8610
+ if (model.layers[il].bq) {
8611
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
8612
+ cb(Qcur, "Qcur", il);
8202
8613
  }
8203
8614
 
8204
8615
  ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
@@ -13349,6 +13760,165 @@ struct llm_build_glm4 : public llm_graph_context {
13349
13760
  }
13350
13761
  };
13351
13762
 
13763
+ struct llm_build_glm4_moe : public llm_graph_context {
13764
+ llm_build_glm4_moe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
13765
+ const int64_t n_embd_head = hparams.n_embd_head_v;
13766
+
13767
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
13768
+
13769
+ ggml_tensor * cur;
13770
+ ggml_tensor * inpL;
13771
+
13772
+ inpL = build_inp_embd(model.tok_embd);
13773
+
13774
+ // inp_pos - contains the positions
13775
+ ggml_tensor * inp_pos = build_inp_pos();
13776
+
13777
+ auto * inp_attn = build_attn_inp_kv_unified();
13778
+
13779
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
13780
+
13781
+ // Only process up to last layer (skip final NextN layer)
13782
+ // Final layer tensors are loaded but not processed in forward pass
13783
+ const int n_transformer_layers = n_layer - hparams.nextn_predict_layers;
13784
+ for (int il = 0; il < n_transformer_layers; ++il) {
13785
+ ggml_tensor * inpSA = inpL;
13786
+
13787
+ // Pre-attention norm
13788
+ cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
13789
+ cb(cur, "attn_norm", il);
13790
+
13791
+ // self-attention
13792
+ {
13793
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
13794
+ if (model.layers[il].bq) {
13795
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
13796
+ }
13797
+ cb(Qcur, "Qcur", il);
13798
+
13799
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
13800
+ if (model.layers[il].bk) {
13801
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
13802
+ }
13803
+ cb(Kcur, "Kcur", il);
13804
+
13805
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
13806
+ if (model.layers[il].bv) {
13807
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
13808
+ }
13809
+ cb(Vcur, "Vcur", il);
13810
+
13811
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
13812
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
13813
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
13814
+
13815
+ // Apply Q/K norm if available (GLM-4.5 355B variant)
13816
+ if (model.layers[il].attn_q_norm) {
13817
+ Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
13818
+ cb(Qcur, "Qcur_normed", il);
13819
+ }
13820
+ if (model.layers[il].attn_k_norm) {
13821
+ Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
13822
+ cb(Kcur, "Kcur_normed", il);
13823
+ }
13824
+
13825
+ Qcur = ggml_rope_ext(
13826
+ ctx0, Qcur, inp_pos, nullptr,
13827
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
13828
+ ext_factor, attn_factor, beta_fast, beta_slow
13829
+ );
13830
+
13831
+ Kcur = ggml_rope_ext(
13832
+ ctx0, Kcur, inp_pos, nullptr,
13833
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
13834
+ ext_factor, attn_factor, beta_fast, beta_slow
13835
+ );
13836
+
13837
+ cb(Qcur, "Qcur", il);
13838
+ cb(Kcur, "Kcur", il);
13839
+ cb(Vcur, "Vcur", il);
13840
+
13841
+ cur = build_attn(inp_attn,
13842
+ model.layers[il].wo, NULL,
13843
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
13844
+ }
13845
+
13846
+ if (il == n_transformer_layers - 1 && inp_out_ids) {
13847
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
13848
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
13849
+ }
13850
+
13851
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
13852
+ cb(ffn_inp, "ffn_inp", il);
13853
+
13854
+ // Post-attention norm
13855
+ cur = build_norm(ffn_inp, model.layers[il].attn_post_norm, NULL, LLM_NORM_RMS, il);
13856
+ cb(cur, "post_attn_norm", il);
13857
+
13858
+ // Check if this is a dense layer (n_layer_dense_lead=1, so layer 0 is dense)
13859
+ if (static_cast<uint32_t>(il) < hparams.n_layer_dense_lead) {
13860
+ // Dense FFN layer
13861
+ cur = build_ffn(cur,
13862
+ model.layers[il].ffn_up, NULL, NULL,
13863
+ model.layers[il].ffn_gate, NULL, NULL,
13864
+ model.layers[il].ffn_down, NULL, NULL,
13865
+ NULL,
13866
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
13867
+ cb(cur, "ffn_out", il);
13868
+ } else {
13869
+ // Process routed experts using existing MoE infrastructure
13870
+ ggml_tensor * routed_out = build_moe_ffn(cur,
13871
+ model.layers[il].ffn_gate_inp,
13872
+ model.layers[il].ffn_up_exps,
13873
+ model.layers[il].ffn_gate_exps,
13874
+ model.layers[il].ffn_down_exps,
13875
+ model.layers[il].ffn_exp_probs_b,
13876
+ n_expert, n_expert_used,
13877
+ LLM_FFN_SILU, hparams.expert_weights_norm,
13878
+ true, hparams.expert_weights_scale,
13879
+ (llama_expert_gating_func_type) hparams.expert_gating_func,
13880
+ il);
13881
+ cb(routed_out, "ffn_moe_out", il);
13882
+
13883
+ // Process shared expert on original input
13884
+ ggml_tensor * shared_out = build_ffn(cur,
13885
+ model.layers[il].ffn_up_shexp, NULL, NULL,
13886
+ model.layers[il].ffn_gate_shexp, NULL, NULL,
13887
+ model.layers[il].ffn_down_shexp, NULL, NULL,
13888
+ NULL,
13889
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
13890
+ cb(shared_out, "ffn_shexp_out", il);
13891
+
13892
+ // Final output: routed_output + shared_output
13893
+ cur = ggml_add(ctx0, routed_out, shared_out);
13894
+ cb(cur, "ffn_out", il);
13895
+ }
13896
+
13897
+ cur = ggml_add(ctx0, cur, ffn_inp);
13898
+
13899
+ cur = build_cvec(cur, il);
13900
+ cb(cur, "l_out", il);
13901
+
13902
+ // input for next layer
13903
+ inpL = cur;
13904
+ }
13905
+
13906
+ cur = inpL;
13907
+ cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
13908
+
13909
+ cb(cur, "result_norm", -1);
13910
+ res->t_embd = cur;
13911
+
13912
+ // lm_head
13913
+ cur = build_lora_mm(model.output, cur);
13914
+
13915
+ cb(cur, "result_output", -1);
13916
+ res->t_logits = cur;
13917
+
13918
+ ggml_build_forward_expand(gf, cur);
13919
+ }
13920
+ };
13921
+
13352
13922
  struct llm_build_nemotron : public llm_graph_context {
13353
13923
  llm_build_nemotron(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
13354
13924
  const int64_t n_embd_head = hparams.n_embd_head_v;
@@ -16761,6 +17331,144 @@ struct llm_build_hunyuan_moe : public llm_graph_context {
16761
17331
  }
16762
17332
  };
16763
17333
 
17334
+ struct llm_build_hunyuan_dense : public llm_graph_context {
17335
+ llm_build_hunyuan_dense(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
17336
+ const int64_t n_embd_head = hparams.n_embd_head_v;
17337
+
17338
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
17339
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
17340
+
17341
+ ggml_tensor * cur;
17342
+ ggml_tensor * inpL;
17343
+
17344
+ inpL = build_inp_embd(model.tok_embd);
17345
+
17346
+ // inp_pos - contains the positions
17347
+ ggml_tensor * inp_pos = build_inp_pos();
17348
+
17349
+ auto * inp_attn = build_attn_inp_kv_unified();
17350
+
17351
+ const float kq_scale = 1.0f / sqrtf(float(n_embd_head));
17352
+
17353
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
17354
+
17355
+ for (int il = 0; il < n_layer; ++il) {
17356
+ ggml_tensor * inpSA = inpL;
17357
+
17358
+ // norm
17359
+ cur = build_norm(inpL,
17360
+ model.layers[il].attn_norm, NULL,
17361
+ LLM_NORM_RMS, il);
17362
+ cb(cur, "attn_norm", il);
17363
+ // self-attention
17364
+ {
17365
+ // rope freq factors for llama3; may return nullptr for llama2 and other models
17366
+ ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
17367
+
17368
+ // compute Q and K and RoPE them
17369
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
17370
+ cb(Qcur, "Qcur", il);
17371
+ if (model.layers[il].bq) {
17372
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
17373
+ cb(Qcur, "Qcur", il);
17374
+ }
17375
+
17376
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
17377
+ cb(Kcur, "Kcur", il);
17378
+ if (model.layers[il].bk) {
17379
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
17380
+ cb(Kcur, "Kcur", il);
17381
+ }
17382
+
17383
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
17384
+ cb(Vcur, "Vcur", il);
17385
+ if (model.layers[il].bv) {
17386
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
17387
+ cb(Vcur, "Vcur", il);
17388
+ }
17389
+
17390
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
17391
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
17392
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
17393
+
17394
+ Qcur = ggml_rope_ext(
17395
+ ctx0, Qcur, inp_pos, rope_factors,
17396
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
17397
+ ext_factor, attn_factor, beta_fast, beta_slow
17398
+ );
17399
+
17400
+ cb(Qcur, "Qcur", il);
17401
+ cb(Kcur, "Kcur", il);
17402
+ cb(Vcur, "Vcur", il);
17403
+
17404
+ Kcur = ggml_rope_ext(
17405
+ ctx0, Kcur, inp_pos, rope_factors,
17406
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
17407
+ ext_factor, attn_factor, beta_fast, beta_slow
17408
+ );
17409
+
17410
+ Kcur = build_norm(Kcur,
17411
+ model.layers[il].attn_k_norm, nullptr,
17412
+ LLM_NORM_RMS, il);
17413
+ cb(Kcur, "Kcur_norm", il);
17414
+
17415
+ Qcur = build_norm(Qcur,
17416
+ model.layers[il].attn_q_norm, nullptr,
17417
+ LLM_NORM_RMS, il);
17418
+ cb(Qcur, "Qcur_norm", il);
17419
+
17420
+ cur = build_attn(inp_attn,
17421
+ model.layers[il].wo, model.layers[il].bo,
17422
+ Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
17423
+ cb(cur, "attn_out", il);
17424
+ }
17425
+
17426
+ if (il == n_layer - 1 && inp_out_ids) {
17427
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
17428
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
17429
+ }
17430
+
17431
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
17432
+ cb(ffn_inp, "ffn_inp", il);
17433
+
17434
+ cur = build_norm(ffn_inp,
17435
+ model.layers[il].ffn_norm, NULL,
17436
+ LLM_NORM_RMS, il);
17437
+ cb(cur, "ffn_norm", il);
17438
+ // feed-forward network (non-MoE)
17439
+ ggml_tensor * cur_mlp = build_ffn(cur,
17440
+ model.layers[il].ffn_up, NULL, NULL,
17441
+ model.layers[il].ffn_gate, NULL, NULL,
17442
+ model.layers[il].ffn_down, NULL, NULL,
17443
+ NULL,
17444
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
17445
+ cb(cur_mlp, "ffn_out", il);
17446
+
17447
+ cur = ggml_add(ctx0, cur_mlp, ffn_inp);
17448
+
17449
+ cur = build_cvec(cur, il);
17450
+ cb(cur, "l_out", il);
17451
+
17452
+ // input for next layer
17453
+ inpL = cur;
17454
+ }
17455
+ cur = inpL;
17456
+
17457
+ cur = build_norm(cur,
17458
+ model.output_norm, NULL,
17459
+ LLM_NORM_RMS, -1);
17460
+
17461
+ cb(cur, "result_norm", -1);
17462
+ res->t_embd = cur;
17463
+ // lm_head
17464
+ cur = build_lora_mm(model.output, cur);
17465
+ cb(cur, "result_output", -1);
17466
+ res->t_logits = cur;
17467
+
17468
+ ggml_build_forward_expand(gf, cur);
17469
+ }
17470
+ };
17471
+
16764
17472
  struct llm_build_smollm3 : public llm_graph_context {
16765
17473
  llm_build_smollm3(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
16766
17474
  const int64_t n_embd_head = hparams.n_embd_head_v;
@@ -16898,6 +17606,136 @@ struct llm_build_smollm3 : public llm_graph_context {
16898
17606
  }
16899
17607
  };
16900
17608
 
17609
+ struct llm_build_openai_moe_iswa : public llm_graph_context {
17610
+ llm_build_openai_moe_iswa(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
17611
+ ggml_tensor * cur;
17612
+ ggml_tensor * inpL;
17613
+
17614
+ inpL = build_inp_embd(model.tok_embd);
17615
+
17616
+ // inp_pos - contains the positions
17617
+ ggml_tensor * inp_pos = build_inp_pos();
17618
+
17619
+ auto * inp_attn = build_attn_inp_kv_unified_iswa();
17620
+
17621
+ for (int il = 0; il < n_layer; ++il) {
17622
+ ggml_tensor * inpSA = inpL;
17623
+
17624
+ // norm
17625
+ cur = build_norm(inpL,
17626
+ model.layers[il].attn_norm, nullptr,
17627
+ LLM_NORM_RMS, il);
17628
+ cb(cur, "attn_norm", il);
17629
+
17630
+ // self-attention
17631
+ {
17632
+ // compute Q and K and RoPE them
17633
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
17634
+ cb(Qcur, "Qcur", il);
17635
+ if (model.layers[il].bq) {
17636
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
17637
+ cb(Qcur, "Qcur", il);
17638
+ }
17639
+
17640
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
17641
+ cb(Kcur, "Kcur", il);
17642
+ if (model.layers[il].bk) {
17643
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
17644
+ cb(Kcur, "Kcur", il);
17645
+ }
17646
+
17647
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
17648
+ cb(Vcur, "Vcur", il);
17649
+ if (model.layers[il].bv) {
17650
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
17651
+ cb(Vcur, "Vcur", il);
17652
+ }
17653
+
17654
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_rot, n_head, n_tokens);
17655
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_rot, n_head_kv, n_tokens);
17656
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_rot, n_head_kv, n_tokens);
17657
+
17658
+ Qcur = ggml_rope_ext(
17659
+ ctx0, Qcur, inp_pos, nullptr,
17660
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
17661
+ ext_factor, attn_factor, beta_fast, beta_slow
17662
+ );
17663
+
17664
+ Kcur = ggml_rope_ext(
17665
+ ctx0, Kcur, inp_pos, nullptr,
17666
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
17667
+ ext_factor, attn_factor, beta_fast, beta_slow
17668
+ );
17669
+
17670
+ cb(Qcur, "Qcur", il);
17671
+ cb(Kcur, "Kcur", il);
17672
+ cb(Vcur, "Vcur", il);
17673
+
17674
+ cur = build_attn_with_sinks(inp_attn,
17675
+ model.layers[il].wo, model.layers[il].bo,
17676
+ Qcur, Kcur, Vcur, nullptr, nullptr, model.layers[il].attn_sinks, 1.0f/sqrtf(float(n_rot)), il);
17677
+
17678
+ cb(cur, "attn_out", il);
17679
+ }
17680
+
17681
+ if (il == n_layer - 1) {
17682
+ // skip computing output for unused tokens
17683
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
17684
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
17685
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
17686
+ }
17687
+
17688
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
17689
+ cb(ffn_inp, "ffn_inp", il);
17690
+
17691
+ cur = ffn_inp;
17692
+ cur = build_norm(cur,
17693
+ model.layers[il].attn_post_norm, nullptr,
17694
+ LLM_NORM_RMS, il);
17695
+ cb(cur, "attn_post_norm", il);
17696
+
17697
+ // MoE branch
17698
+ cur = build_moe_ffn(cur,
17699
+ model.layers[il].ffn_gate_inp, model.layers[il].ffn_gate_inp_b,
17700
+ model.layers[il].ffn_up_exps, model.layers[il].ffn_up_exps_b,
17701
+ model.layers[il].ffn_gate_exps, model.layers[il].ffn_gate_exps_b,
17702
+ model.layers[il].ffn_down_exps, model.layers[il].ffn_down_exps_b,
17703
+ nullptr,
17704
+ n_expert, n_expert_used,
17705
+ LLM_FFN_SWIGLU_OAI_MOE, false,
17706
+ false, 0.0,
17707
+ LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX_WEIGHT,
17708
+ il);
17709
+ cb(cur, "ffn_moe_out", il);
17710
+
17711
+ cur = ggml_add(ctx0, cur, ffn_inp);
17712
+
17713
+ cur = build_cvec(cur, il);
17714
+ cb(cur, "l_out", il);
17715
+
17716
+ // input for next layer
17717
+ inpL = cur;
17718
+ }
17719
+
17720
+ cur = inpL;
17721
+
17722
+ cur = build_norm(cur,
17723
+ model.output_norm, NULL,
17724
+ LLM_NORM_RMS, -1);
17725
+
17726
+ cb(cur, "result_norm", -1);
17727
+ res->t_embd = cur;
17728
+
17729
+ // lm_head
17730
+ cur = build_lora_mm(model.output, cur);
17731
+
17732
+ cb(cur, "result_output", -1);
17733
+ res->t_logits = cur;
17734
+
17735
+ ggml_build_forward_expand(gf, cur);
17736
+ }
17737
+ };
17738
+
16901
17739
  struct llm_build_lfm2 : public llm_graph_context {
16902
17740
  const llama_model & model;
16903
17741
 
@@ -17158,10 +17996,18 @@ struct llm_build_smallthinker : public llm_graph_context{
17158
17996
  cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il);
17159
17997
  cb(cur, "ffn_norm", il);
17160
17998
 
17161
- ggml_tensor * ffn_out = build_moe_ffn_from_probs(cur, probs, model.layers[il].ffn_up_exps,
17162
- model.layers[il].ffn_gate_exps, model.layers[il].ffn_down_exps,
17163
- nullptr, n_expert, n_expert_used,
17164
- static_cast<llama_expert_gating_func_type>(hparams.expert_gating_func), il);
17999
+ ggml_tensor * ffn_out =
18000
+ build_moe_ffn(cur,
18001
+ nullptr,
18002
+ model.layers[il].ffn_up_exps,
18003
+ model.layers[il].ffn_gate_exps,
18004
+ model.layers[il].ffn_down_exps,
18005
+ nullptr,
18006
+ n_expert, n_expert_used,
18007
+ LLM_FFN_RELU, true,
18008
+ false, 0.0,
18009
+ static_cast<llama_expert_gating_func_type>(hparams.expert_gating_func),
18010
+ il, probs);
17165
18011
 
17166
18012
  cb(ffn_out, "ffn_out", il);
17167
18013
  cur = ffn_out;
@@ -17201,6 +18047,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
        case LLM_ARCH_NEO_BERT:
        case LLM_ARCH_WAVTOKENIZER_DEC:
        case LLM_ARCH_DREAM:
+        case LLM_ARCH_LLADA:
            {
                res = nullptr;
            } break;
@@ -17236,6 +18083,7 @@
                /* recurrent_kv_size */ std::max((uint32_t) 1, cparams.n_seq_max),
                /* n_seq_max */ cparams.n_seq_max,
                /* offload */ cparams.offload_kqv,
+                /* unified */ cparams.kv_unified,
                /* filter_attn */ (arch == LLM_ARCH_FALCON_H1) ? [&](int32_t) { return true; } : (llama_memory_hybrid::layer_filter_cb)nullptr,
                /* filter_recr */ (arch == LLM_ARCH_FALCON_H1) ? [&](int32_t) { return true; } : (llama_memory_hybrid::layer_filter_cb)nullptr);
        } else {
@@ -17367,6 +18215,11 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
                llm = std::make_unique<llm_build_dream>(*this, params);
            }
            break;
+        case LLM_ARCH_LLADA:
+            {
+                llm = std::make_unique<llm_build_llada>(*this, params);
+            }
+            break;
        case LLM_ARCH_QWEN2VL:
            {
                llm = std::make_unique<llm_build_qwen2vl>(*this, params);
@@ -17509,6 +18362,10 @@
            {
                llm = std::make_unique<llm_build_glm4>(*this, params);
            } break;
+        case LLM_ARCH_GLM4_MOE:
+            {
+                llm = std::make_unique<llm_build_glm4_moe>(*this, params);
+            } break;
        case LLM_ARCH_BITNET:
            {
                llm = std::make_unique<llm_build_bitnet>(*this, params);
@@ -17614,10 +18471,18 @@
            {
                llm = std::make_unique<llm_build_hunyuan_moe>(*this, params);
            } break;
+        case LLM_ARCH_HUNYUAN_DENSE:
+            {
+                llm = std::make_unique<llm_build_hunyuan_dense>(*this, params);
+            } break;
        case LLM_ARCH_SMOLLM3:
            {
                llm = std::make_unique<llm_build_smollm3>(*this, params);
            } break;
+        case LLM_ARCH_OPENAI_MOE:
+            {
+                llm = std::make_unique<llm_build_openai_moe_iswa>(*this, params);
+            } break;
        case LLM_ARCH_FALCON_H1:
            {
                llm = std::make_unique<llm_build_falcon_h1>(*this, params);
@@ -17663,6 +18528,7 @@ llama_model_params llama_model_default_params() {
        /*.use_mmap =*/ true,
        /*.use_mlock =*/ false,
        /*.check_tensors =*/ false,
+        /*.use_extra_bufts =*/ true,
    };
 
 #ifdef GGML_USE_METAL
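The new use_extra_bufts default shown above is a field of llama_model_params, so a client that wants the previous behaviour (plain CPU buffers, no repacked weights) can opt out when loading. A minimal sketch against the public llama.h API follows; the model path is a placeholder and error handling is kept to the minimum:

    #include "llama.h"

    int main() {
        llama_backend_init();

        llama_model_params mparams = llama_model_default_params();
        // defaults to true as of this release; set to false to skip the repacked
        // "extra" CPU buffer types when weights are placed on the CPU
        mparams.use_extra_bufts = false;

        llama_model * model = llama_model_load_from_file("model.gguf" /* placeholder */, mparams);
        if (model == nullptr) {
            llama_backend_free();
            return 1;
        }

        llama_model_free(model);
        llama_backend_free();
        return 0;
    }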
@@ -17765,6 +18631,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
 
        // use what we call a normal RoPE, operating on pairs of consecutive head values
        case LLM_ARCH_LLAMA:
+        case LLM_ARCH_LLADA:
        case LLM_ARCH_LLAMA4:
        case LLM_ARCH_DECI:
        case LLM_ARCH_BAICHUAN:
@@ -17831,8 +18698,11 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
        case LLM_ARCH_MINICPM3:
        case LLM_ARCH_DOTS1:
        case LLM_ARCH_HUNYUAN_MOE:
+        case LLM_ARCH_OPENAI_MOE:
+        case LLM_ARCH_HUNYUAN_DENSE:
        case LLM_ARCH_LFM2:
        case LLM_ARCH_SMALLTHINKER:
+        case LLM_ARCH_GLM4_MOE:
            return LLAMA_ROPE_TYPE_NEOX;
 
        case LLM_ARCH_QWEN2VL:
@@ -17943,6 +18813,10 @@ bool llama_model_is_recurrent(const llama_model * model) {
    return llm_arch_is_recurrent(model->arch);
 }
 
+bool llama_model_is_diffusion(const llama_model * model) {
+    return llm_arch_is_diffusion(model->arch);
+}
+
 const std::vector<std::pair<std::string, ggml_tensor *>> & llama_internal_get_tensor_map(const llama_model * model) {
    return model->tensors_by_name;
 }
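llama_model_is_diffusion() above is exported through llama.h, so callers can branch on it before deciding how to generate. A minimal sketch follows; the comments describe the intended split, and the diffusion loop itself (which llama.cpp demonstrates in its diffusion example) is not reproduced here:

    #include "llama.h"

    // Decide on a generation strategy for a loaded model.
    static void choose_decoder(const llama_model * model) {
        if (llama_model_is_diffusion(model)) {
            // DREAM / LLaDA style models: non-causal attention and no KV-cache memory
            // (create_memory() returns nullptr for these architectures above), so
            // generation is iterative denoising over a masked sequence.
        } else {
            // regular autoregressive decode loop using llama_decode()
        }
    }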