@fugood/llama.node 1.4.6 → 1.4.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (71)
  1. package/lib/binding.ts +8 -0
  2. package/package.json +15 -15
  3. package/scripts/llama.cpp.patch +25 -26
  4. package/src/LlamaContext.cpp +2 -2
  5. package/src/llama.cpp/common/CMakeLists.txt +2 -0
  6. package/src/llama.cpp/common/arg.cpp +364 -193
  7. package/src/llama.cpp/common/arg.h +43 -2
  8. package/src/llama.cpp/common/chat-parser-xml-toolcall.cpp +36 -18
  9. package/src/llama.cpp/common/chat-parser-xml-toolcall.h +1 -1
  10. package/src/llama.cpp/common/chat-parser.cpp +3 -2
  11. package/src/llama.cpp/common/chat-peg-parser.cpp +16 -2
  12. package/src/llama.cpp/common/chat.cpp +272 -0
  13. package/src/llama.cpp/common/common.cpp +130 -67
  14. package/src/llama.cpp/common/common.h +40 -16
  15. package/src/llama.cpp/common/console.cpp +680 -47
  16. package/src/llama.cpp/common/console.h +30 -8
  17. package/src/llama.cpp/common/download.cpp +69 -25
  18. package/src/llama.cpp/common/json-schema-to-grammar.cpp +132 -3
  19. package/src/llama.cpp/common/json-schema-to-grammar.h +20 -0
  20. package/src/llama.cpp/common/log.cpp +5 -0
  21. package/src/llama.cpp/common/log.h +1 -0
  22. package/src/llama.cpp/common/peg-parser.cpp +1 -1
  23. package/src/llama.cpp/common/preset.cpp +206 -0
  24. package/src/llama.cpp/common/preset.h +32 -0
  25. package/src/llama.cpp/common/sampling.cpp +91 -92
  26. package/src/llama.cpp/common/sampling.h +11 -6
  27. package/src/llama.cpp/common/speculative.cpp +1 -1
  28. package/src/llama.cpp/ggml/CMakeLists.txt +5 -0
  29. package/src/llama.cpp/ggml/include/ggml-alloc.h +9 -0
  30. package/src/llama.cpp/ggml/include/ggml-backend.h +1 -0
  31. package/src/llama.cpp/ggml/include/ggml-cpu.h +1 -0
  32. package/src/llama.cpp/ggml/include/ggml.h +7 -8
  33. package/src/llama.cpp/ggml/src/CMakeLists.txt +3 -0
  34. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +3 -0
  35. package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +2 -0
  36. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +69 -39
  37. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +4 -0
  38. package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +2 -1
  39. package/src/llama.cpp/include/llama.h +18 -1
  40. package/src/llama.cpp/src/CMakeLists.txt +2 -1
  41. package/src/llama.cpp/src/llama-arch.cpp +1890 -2248
  42. package/src/llama.cpp/src/llama-arch.h +9 -2
  43. package/src/llama.cpp/src/llama-batch.cpp +12 -2
  44. package/src/llama.cpp/src/llama-batch.h +4 -2
  45. package/src/llama.cpp/src/llama-context.cpp +99 -29
  46. package/src/llama.cpp/src/llama-context.h +9 -3
  47. package/src/llama.cpp/src/llama-grammar.cpp +233 -33
  48. package/src/llama.cpp/src/llama-grammar.h +20 -1
  49. package/src/llama.cpp/src/llama-graph.cpp +85 -17
  50. package/src/llama.cpp/src/llama-graph.h +17 -4
  51. package/src/llama.cpp/src/llama-hparams.cpp +6 -0
  52. package/src/llama.cpp/src/llama-hparams.h +5 -1
  53. package/src/llama.cpp/src/llama-impl.cpp +4 -0
  54. package/src/llama.cpp/src/llama-kv-cache.cpp +90 -42
  55. package/src/llama.cpp/src/llama-kv-cache.h +19 -2
  56. package/src/llama.cpp/src/llama-memory-hybrid.cpp +1 -1
  57. package/src/llama.cpp/src/llama-model-loader.cpp +2 -0
  58. package/src/llama.cpp/src/llama-model-loader.h +2 -0
  59. package/src/llama.cpp/src/llama-model.cpp +123 -52
  60. package/src/llama.cpp/src/llama-model.h +1 -0
  61. package/src/llama.cpp/src/llama-quant.cpp +1 -1
  62. package/src/llama.cpp/src/llama-vocab.cpp +2 -1
  63. package/src/llama.cpp/src/llama.cpp +675 -1
  64. package/src/llama.cpp/src/models/deepseek2.cpp +9 -5
  65. package/src/llama.cpp/src/models/{gemma3-iswa.cpp → gemma3.cpp} +30 -5
  66. package/src/llama.cpp/src/models/glm4-moe.cpp +28 -11
  67. package/src/llama.cpp/src/models/glm4.cpp +27 -4
  68. package/src/llama.cpp/src/models/models.h +8 -7
  69. package/src/llama.cpp/src/models/nemotron-h.cpp +35 -6
  70. package/src/llama.cpp/src/models/qwen2.cpp +12 -3
  71. package/src/llama.cpp/src/models/qwen3next.cpp +81 -266
@@ -120,6 +120,8 @@ const char * llm_type_name(llm_type type) {
  case LLM_TYPE_16B_A1B: return "16B.A1B";
  case LLM_TYPE_21B_A3B: return "21B.A3B";
  case LLM_TYPE_30B_A3B: return "30B.A3B";
+ case LLM_TYPE_31B_A3_5B: return "31B.A3.5B";
+ case LLM_TYPE_80B_A3B: return "80B.A3B";
  case LLM_TYPE_100B_A6B: return "100B.A6B";
  case LLM_TYPE_106B_A12B: return "106B.A12B";
  case LLM_TYPE_230B_A10B: return "230B.A10B";
@@ -667,6 +669,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  hparams.n_swa = 8192;
  hparams.n_attn_temp_floor_scale = 8192;
  hparams.f_attn_temp_scale = 0.1f;
+ hparams.f_attn_temp_offset = 1.0f;
  hparams.set_swa_pattern(4); // pattern: 3 chunked - 1 full
  }

@@ -1264,18 +1267,25 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  } break;
  case LLM_ARCH_GEMMA3:
  {
- hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
- hparams.set_swa_pattern(6);
+ const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
+ if (found_swa && hparams.n_swa > 0) {
+ hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
+ hparams.set_swa_pattern(6);

- hparams.rope_freq_base_train_swa = 10000.0f;
- hparams.rope_freq_scale_train_swa = 1.0f;
+ hparams.rope_freq_base_train_swa = 10000.0f;
+ hparams.rope_freq_scale_train_swa = 1.0f;
+ } else {
+ hparams.swa_type = LLAMA_SWA_TYPE_NONE;
+ }

- ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
+ hparams.f_final_logit_softcapping = 0.0f;
+ ml.get_key(LLM_KV_FINAL_LOGIT_SOFTCAPPING, hparams.f_final_logit_softcapping, false);
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);

  switch (hparams.n_layer) {
  case 18: type = LLM_TYPE_270M; break;
  case 26: type = LLM_TYPE_1B; break;
+ case 32: type = LLM_TYPE_8B; break; // Rnj-1
  case 34: type = LLM_TYPE_4B; break;
  case 48: type = LLM_TYPE_12B; break;
  case 62: type = LLM_TYPE_27B; break;
@@ -1599,8 +1609,9 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
  ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);

- switch (hparams.n_layer) {
- case 28: type = LLM_TYPE_20B; break;
+ switch (hparams.n_ff_exp) {
+ case 1408: type = LLM_TYPE_16B; break;
+ case 1792: type = LLM_TYPE_20B; break;
  default: type = LLM_TYPE_UNKNOWN;
  }
  } break;
@@ -1626,12 +1637,19 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  // that have no expert_gating_func model parameter set
  hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX;
  }
- ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul, false);
+
+ if (ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul, 0.0f)) {
+ // [TAG_DEEPSEEK2_YARN_LOG_MUL_FIX]
+ // cancel the factor from the convert script
+ hparams.rope_yarn_log_mul /= 0.1f;
+ }

  // (optional) temperature tuning - used by mistral-large
  ml.get_key(LLM_KV_ATTENTION_TEMPERATURE_SCALE, hparams.f_attn_temp_scale, false);
  ml.get_key(LLM_KV_ATTENTION_TEMPERATURE_LENGTH, hparams.n_attn_temp_floor_scale, false);

+ hparams.f_attn_temp_offset = 0.0f;
+
  switch (hparams.n_layer) {
  case 27: type = LLM_TYPE_16B; break;
  case 60: type = LLM_TYPE_236B; break;
@@ -1671,7 +1689,8 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  } break;
  case LLM_ARCH_GLM4:
  {
- ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, false);
  switch (hparams.n_layer) {
  case 40: type = LLM_TYPE_9B; break;
  case 61: type = LLM_TYPE_32B; break;
@@ -1680,8 +1699,9 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  } break;
  case LLM_ARCH_GLM4_MOE:
  {
- ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
- ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, false);

  // MoE parameters
  ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert);
@@ -1780,6 +1800,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  }
  } break;
  case LLM_ARCH_NEMOTRON_H:
+ case LLM_ARCH_NEMOTRON_H_MOE:
  {
  ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
  ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);
@@ -1795,7 +1816,14 @@ void llama_model::load_hparams(llama_model_loader & ml) {

  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);

+ ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
+ ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
+ ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared, false);
+ ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
+ ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false);
+
  switch (hparams.n_layer) {
+ case 52: type = LLM_TYPE_31B_A3_5B; break; // Nemotron-H_MOE 31B
  case 56: type = LLM_TYPE_9B; break;
  default: type = LLM_TYPE_UNKNOWN;
  }
@@ -2249,7 +2277,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  }

  switch (hparams.n_layer) {
- case 80: type = LLM_TYPE_80B_A3B; break;
+ case 48: type = LLM_TYPE_80B_A3B; break;
  default: type = LLM_TYPE_UNKNOWN;
  }
  } break;
@@ -2258,9 +2286,11 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  ml.get_key(LLM_KV_ATTENTION_TEMPERATURE_SCALE, hparams.f_attn_temp_scale, false);

- ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_FAST, hparams.yarn_beta_fast, false);
- ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_SLOW, hparams.yarn_beta_slow, false);
- ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul, false);
+ ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_FAST, hparams.yarn_beta_fast, false);
+ ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_SLOW, hparams.yarn_beta_slow, false);
+ ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul, 0.0f);
+
+ hparams.f_attn_temp_offset = 0.0f;

  // TODO: maybe add n_attn_temp_floor_scale as a separate KV?
  if (hparams.f_attn_temp_scale != 0.0f) {
@@ -2270,18 +2300,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  }
  }

- // TODO: this seems to be correct with the case of mscale == mscale_all_dims == 1.0f
- // but may need further verification with other values
- if (hparams.rope_yarn_log_mul != 0.0f) {
- float factor = 1.0f / hparams.rope_freq_scale_train;
- float mscale = 1.0f;
- float mscale_all_dims = hparams.rope_yarn_log_mul;
- static auto get_mscale = [](float scale, float mscale) {
- return scale <= 1.0f ? 1.0f : (0.1f * mscale * logf(scale) + 1.0f);
- };
- hparams.yarn_attn_factor = get_mscale(factor, mscale) / get_mscale(factor, mscale_all_dims);
- }
-
  switch (hparams.n_layer) {
  case 26: type = LLM_TYPE_3B; break;
  case 34: type = LLM_TYPE_8B; break;
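The block removed above and the `/= 0.1f` fix tagged `[TAG_DEEPSEEK2_YARN_LOG_MUL_FIX]` in the earlier DeepSeek2 hunk both revolve around the same YaRN magnitude scale. A minimal sketch of the arithmetic, assuming (as the removed comments suggest) that the GGUF key holds `0.1 * mscale_all_dim`, so dividing by `0.1f` recovers the original value; all numbers below are illustrative, not taken from any model:

```cpp
#include <cmath>
#include <cstdio>

// YaRN magnitude scale as written in the removed block above.
static float yarn_get_mscale(float scale, float mscale) {
    return scale <= 1.0f ? 1.0f : 0.1f * mscale * std::log(scale) + 1.0f;
}

int main() {
    const float rope_yarn_log_mul = 0.1f;  // hypothetical GGUF value (assumed 0.1 * mscale_all_dim)
    const float mscale_all_dim    = rope_yarn_log_mul / 0.1f; // the cancellation applied by the fix
    const float factor            = 40.0f; // illustrative 1 / rope_freq_scale_train
    const float attn_factor       = yarn_get_mscale(factor, 1.0f) / yarn_get_mscale(factor, mscale_all_dim);
    std::printf("mscale_all_dim = %.2f, yarn_attn_factor = %.4f\n", mscale_all_dim, attn_factor);
    return 0;
}
```

With `mscale_all_dim == 1.0f` the two scales cancel and the attention factor stays at 1, which matches the caveat in the removed TODO comment.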
@@ -3381,9 +3399,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);

  // optional bias tensors
- layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
- layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0);
- layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
+ layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
+ layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
+ layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);

  layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);

@@ -5152,6 +5170,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  }
  } break;
  case LLM_ARCH_NEMOTRON_H:
+ case LLM_ARCH_NEMOTRON_H_MOE:
  {
  // mamba2 Mixer SSM params
  // NOTE: int64_t for tensor dimensions
@@ -5162,6 +5181,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  const int64_t n_group = hparams.ssm_n_group;
  const int64_t d_in_proj = 2*d_inner + 2*n_group*d_state + n_ssm_head;

+ const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
+ const int64_t n_ff_shexp = hparams.n_ff_shexp;
+
  // embeddings
  tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

@@ -5211,12 +5233,26 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_k_gqa_i}, TENSOR_NOT_REQUIRED);
  layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_v_gqa_i}, TENSOR_NOT_REQUIRED);
  layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
- } else {
- // mlp layers
- layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { hparams.n_ff(i), n_embd}, 0);
- layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, hparams.n_ff(i)}, 0);
- layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
- layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {hparams.n_ff(i)}, TENSOR_NOT_REQUIRED);
+ } else {
+ if (n_expert != 0) {
+ layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert}, 0);
+ layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert }, 0);
+
+ // MoE branch
+ layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
+ layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
+
+ // Shared expert branch
+ layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd}, 0);
+ layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_shexp}, 0);
+
+ } else {
+ // mlp layers
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { hparams.n_ff(i), n_embd}, 0);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, hparams.n_ff(i)}, 0);
+ layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
+ layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {hparams.n_ff(i)}, TENSOR_NOT_REQUIRED);
+ }
  }
  }
  } break;
@@ -6200,8 +6236,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  {
  tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

- output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
- output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM_LFM2, "weight"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);

  if (output == NULL) {
  output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
@@ -6599,9 +6635,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) {

  std::vector<ggml_backend_buffer_ptr> bufs;
  if (ml.use_mmap && use_mmap_buffer && buffer_from_host_ptr_supported && is_default_buft) {
+ GGML_ASSERT(!ml.no_alloc);
  for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
  // only the mmap region containing the tensors in the model is mapped to the backend buffer
- // this is important for metal with apple silicon: if the entire model could be mapped to a metal buffer, then we could just use metal for all layers
+ // this is important for metal with apple silicon: if the entire model could be mapped to a metal buffer,
+ // then we could just use metal for all layers
  // this allows using partial offloading when the model size exceeds the metal buffer size, but not the RAM size
  void * addr = nullptr;
  size_t first, last; // NOLINT
@@ -6617,9 +6655,16 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  bufs.emplace_back(buf);
  buf_map.emplace(idx, buf);
  }
- }
- else {
- ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
+ } else {
+ ggml_backend_buffer_t buf;
+ if (ml.no_alloc) {
+ buf = ggml_backend_buft_alloc_buffer(buft, /*size =*/ 0); // dummy buffer
+ for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) {
+ t->buffer = buf; // set dummy buffer for weights so that the backend scheduler won't try to allocate them
+ }
+ } else {
+ buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft); // real buffer
+ }
  if (buf == nullptr) {
  throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft)));
  }
@@ -6674,6 +6719,10 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  }
  }

+ if (ml.no_alloc) {
+ return true;
+ }
+
  // load tensor data
  for (auto & [ctx, buf_map] : ctx_buf_maps) {
  if (!ml.load_all_data(ctx, buf_map, use_mlock ? &pimpl->mlock_mmaps : NULL, params.progress_callback, params.progress_callback_user_data)) {
@@ -6716,9 +6765,18 @@ size_t llama_model::n_devices() const {

  std::map<ggml_backend_buffer_type_t, size_t> llama_model::memory_breakdown() const {
  std::map<ggml_backend_buffer_type_t, size_t> ret;
- for (const auto & [_, bufs] : pimpl->ctxs_bufs) {
- for (const auto & buf : bufs) {
- ret[ggml_backend_buffer_get_type(buf.get())] += ggml_backend_buffer_get_size(buf.get());
+ for (const auto & [ctx, bufs] : pimpl->ctxs_bufs) {
+ if (hparams.no_alloc) {
+ GGML_ASSERT(bufs.size() == 1);
+ ggml_backend_buffer_t buf = bufs[0].get();
+ GGML_ASSERT(ggml_backend_buffer_get_base(buf) == nullptr);
+ ggml_backend_buffer_type_t buft = ggml_backend_buffer_get_type(buf);
+ ret[buft] += ggml_backend_alloc_ctx_tensors_from_buft_size(ctx.get(), buft);
+ } else {
+ for (const auto & buf : bufs) {
+ // GGML_ASSERT(ggml_backend_buffer_get_base(buf.get()) != nullptr); // multi_buffer does not have a defined base
+ ret[ggml_backend_buffer_get_type(buf.get())] += ggml_backend_buffer_get_size(buf.get());
+ }
  }
  }
  return ret;
@@ -6763,6 +6821,7 @@ void llama_model::print_info() const {
  // hparams
  LLAMA_LOG_INFO("%s: arch = %s\n", __func__, arch_name().c_str());
  LLAMA_LOG_INFO("%s: vocab_only = %d\n", __func__, hparams.vocab_only);
+ LLAMA_LOG_INFO("%s: no_alloc = %d\n", __func__, hparams.no_alloc);

  if (!hparams.vocab_only) {
  LLAMA_LOG_INFO("%s: n_ctx_train = %u\n", __func__, hparams.n_ctx_train);
@@ -6797,6 +6856,7 @@ void llama_model::print_info() const {
  LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train);
  LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);
  LLAMA_LOG_INFO("%s: n_ctx_orig_yarn = %u\n", __func__, hparams.n_ctx_orig_yarn);
+ LLAMA_LOG_INFO("%s: rope_yarn_log_mul= %.4f\n", __func__, hparams.rope_yarn_log_mul);
  LLAMA_LOG_INFO("%s: rope_finetuned = %s\n", __func__, hparams.rope_finetuned ? "yes" : "unknown");
  // MRoPE (Multi-axis Rotary Position Embedding) sections
  if (const auto & s = hparams.rope_sections; s[0] || s[1] || s[2] || s[3]) {
@@ -6819,7 +6879,8 @@ void llama_model::print_info() const {
  arch == LLM_ARCH_PLAMO2 ||
  arch == LLM_ARCH_GRANITE_HYBRID ||
  arch == LLM_ARCH_QWEN3NEXT ||
- arch == LLM_ARCH_NEMOTRON_H) {
+ arch == LLM_ARCH_NEMOTRON_H ||
+ arch == LLM_ARCH_NEMOTRON_H_MOE) {
  LLAMA_LOG_INFO("%s: ssm_d_conv = %u\n", __func__, hparams.ssm_d_conv);
  LLAMA_LOG_INFO("%s: ssm_d_inner = %u\n", __func__, hparams.ssm_d_inner);
  LLAMA_LOG_INFO("%s: ssm_d_state = %u\n", __func__, hparams.ssm_d_state);
@@ -6860,7 +6921,6 @@ void llama_model::print_info() const {
  LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
  LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm);
  LLAMA_LOG_INFO("%s: expert_gating_func = %s\n", __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func));
- LLAMA_LOG_INFO("%s: rope_yarn_log_mul = %.4f\n", __func__, hparams.rope_yarn_log_mul);
  }

  if (arch == LLM_ARCH_QWEN2MOE) {
@@ -6875,7 +6935,8 @@ void llama_model::print_info() const {
  if (arch == LLM_ARCH_MINICPM ||
  arch == LLM_ARCH_GRANITE ||
  arch == LLM_ARCH_GRANITE_MOE ||
- arch == LLM_ARCH_GRANITE_HYBRID) {
+ arch == LLM_ARCH_GRANITE_HYBRID ||
+ arch == LLM_ARCH_NEMOTRON_H_MOE) {
  LLAMA_LOG_INFO("%s: f_embedding_scale = %f\n", __func__, hparams.f_embedding_scale);
  LLAMA_LOG_INFO("%s: f_residual_scale = %f\n", __func__, hparams.f_residual_scale);
  LLAMA_LOG_INFO("%s: f_attention_scale = %f\n", __func__, hparams.f_attention_scale);
@@ -7056,7 +7117,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
  if (arch == LLM_ARCH_FALCON_H1) {
  filter_attn = [&](int32_t) { return true; };
  filter_recr = [&](int32_t) { return true; };
- } else if (arch == LLM_ARCH_NEMOTRON_H) {
+ } else if (arch == LLM_ARCH_NEMOTRON_H || arch == LLM_ARCH_NEMOTRON_H_MOE) {
  filter_attn = [&](int32_t il) {
  return !hparams.is_recurrent(il) && hparams.n_ff(il) == 0;
  };
@@ -7304,7 +7365,11 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
  } break;
  case LLM_ARCH_GEMMA3:
  {
- llm = std::make_unique<llm_build_gemma3_iswa>(*this, params);
+ if (hparams.swa_type == LLAMA_SWA_TYPE_STANDARD) {
+ llm = std::make_unique<llm_build_gemma3<true>>(*this, params);
+ } else {
+ llm = std::make_unique<llm_build_gemma3<false>>(*this, params);
+ }
  } break;
  case LLM_ARCH_GEMMA3N:
  {
@@ -7423,6 +7488,7 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
  llm = std::make_unique<llm_build_nemotron>(*this, params);
  } break;
  case LLM_ARCH_NEMOTRON_H:
+ case LLM_ARCH_NEMOTRON_H_MOE:
  {
  llm = std::make_unique<llm_build_nemotron_h>(*this, params);
  } break;
@@ -7607,6 +7673,7 @@ llama_model_params llama_model_default_params() {
  /*.check_tensors =*/ false,
  /*.use_extra_bufts =*/ true,
  /*.no_host =*/ false,
+ /*.no_alloc =*/ false,
  };

  return result;
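For context on the field initialized above: a rough usage sketch, assuming `no_alloc` is exposed on `llama_model_params` the way the default-params hunk suggests (the exact public surface in the bundled llama.cpp is not shown in this diff). With the flag set, weights receive dummy zero-size buffers and no tensor data is read, which is what the `memory_breakdown()` change earlier relies on to report estimated sizes:

```cpp
// Hedged sketch, not taken from the package: load a model with no_alloc to
// inspect metadata without allocating or reading weight data.
#include "llama.h"
#include <cstdio>

int main(int argc, char ** argv) {
    if (argc < 2) {
        std::fprintf(stderr, "usage: %s model.gguf\n", argv[0]);
        return 1;
    }

    llama_model_params mparams = llama_model_default_params();
    mparams.no_alloc = true; // assumed field name, mirroring the /*.no_alloc =*/ initializer above

    llama_model * model = llama_model_load_from_file(argv[1], mparams);
    if (model == nullptr) {
        std::fprintf(stderr, "failed to load model metadata\n");
        return 1;
    }

    std::printf("n_params = %lld\n", (long long) llama_model_n_params(model));
    llama_model_free(model);
    return 0;
}
```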
@@ -7706,6 +7773,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
  case LLM_ARCH_ARWKV7:
  case LLM_ARCH_WAVTOKENIZER_DEC:
  case LLM_ARCH_NEMOTRON_H:
+ case LLM_ARCH_NEMOTRON_H_MOE:
  return LLAMA_ROPE_TYPE_NONE;

  // use what we call a normal RoPE, operating on pairs of consecutive head values
@@ -7726,7 +7794,6 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
  case LLM_ARCH_DEEPSEEK2:
  case LLM_ARCH_PLM:
  case LLM_ARCH_CHATGLM:
- case LLM_ARCH_GLM4:
  case LLM_ARCH_GRANITE:
  case LLM_ARCH_GRANITE_MOE:
  case LLM_ARCH_GRANITE_HYBRID:
@@ -7788,7 +7855,6 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
  case LLM_ARCH_LFM2:
  case LLM_ARCH_LFM2MOE:
  case LLM_ARCH_SMALLTHINKER:
- case LLM_ARCH_GLM4_MOE:
  case LLM_ARCH_SEED_OSS:
  case LLM_ARCH_GROVEMOE:
  case LLM_ARCH_APERTUS:
@@ -7805,6 +7871,11 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
  case LLM_ARCH_QWEN3VLMOE:
  return LLAMA_ROPE_TYPE_IMROPE;

+ case LLM_ARCH_GLM4:
+ return model->hparams.use_mrope() ? LLAMA_ROPE_TYPE_MROPE : LLAMA_ROPE_TYPE_NORM;
+ case LLM_ARCH_GLM4_MOE:
+ return model->hparams.use_mrope() ? LLAMA_ROPE_TYPE_MROPE : LLAMA_ROPE_TYPE_NEOX;
+
  // all model arches should be listed explicitly here
  case LLM_ARCH_UNKNOWN:
  GGML_ABORT("unknown architecture");
@@ -113,6 +113,7 @@ enum llm_type {
  LLM_TYPE_16B_A1B,
  LLM_TYPE_21B_A3B, // Ernie MoE small
  LLM_TYPE_30B_A3B,
+ LLM_TYPE_31B_A3_5B,
  LLM_TYPE_80B_A3B, // Qwen3 Next
  LLM_TYPE_100B_A6B,
  LLM_TYPE_106B_A12B, // GLM-4.5-Air
@@ -596,7 +596,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
  }

  std::vector<std::string> splits = {};
- llama_model_loader ml(fname_inp, splits, use_mmap, /*check_tensors*/ true, kv_overrides, nullptr);
+ llama_model_loader ml(fname_inp, splits, use_mmap, /*check_tensors*/ true, /*no_alloc*/ false, kv_overrides, nullptr);
  ml.init_mappings(false); // no prefetching

  llama_model model(llama_model_default_params());
@@ -1895,7 +1895,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
  clean_spaces = false;
  } else if (
  tokenizer_pre == "qwen2" ||
- tokenizer_pre == "deepseek-r1-qwen") {
+ tokenizer_pre == "deepseek-r1-qwen" ||
+ tokenizer_pre == "kormo") {
  pre_type = LLAMA_VOCAB_PRE_TYPE_QWEN2;
  clean_spaces = false;
  } else if (