@fugood/llama.node 1.4.7 → 1.4.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63)
  1. package/lib/binding.ts +8 -0
  2. package/package.json +15 -15
  3. package/scripts/llama.cpp.patch +22 -23
  4. package/src/LlamaContext.cpp +2 -2
  5. package/src/llama.cpp/common/CMakeLists.txt +2 -0
  6. package/src/llama.cpp/common/arg.cpp +364 -193
  7. package/src/llama.cpp/common/arg.h +43 -2
  8. package/src/llama.cpp/common/chat-peg-parser.cpp +16 -2
  9. package/src/llama.cpp/common/chat.cpp +140 -0
  10. package/src/llama.cpp/common/common.cpp +130 -67
  11. package/src/llama.cpp/common/common.h +40 -16
  12. package/src/llama.cpp/common/console.cpp +98 -18
  13. package/src/llama.cpp/common/console.h +30 -8
  14. package/src/llama.cpp/common/download.cpp +69 -25
  15. package/src/llama.cpp/common/json-schema-to-grammar.cpp +132 -3
  16. package/src/llama.cpp/common/json-schema-to-grammar.h +20 -0
  17. package/src/llama.cpp/common/log.cpp +5 -0
  18. package/src/llama.cpp/common/log.h +1 -0
  19. package/src/llama.cpp/common/peg-parser.cpp +1 -1
  20. package/src/llama.cpp/common/preset.cpp +206 -0
  21. package/src/llama.cpp/common/preset.h +32 -0
  22. package/src/llama.cpp/common/sampling.cpp +91 -92
  23. package/src/llama.cpp/common/sampling.h +11 -6
  24. package/src/llama.cpp/common/speculative.cpp +1 -1
  25. package/src/llama.cpp/ggml/CMakeLists.txt +4 -0
  26. package/src/llama.cpp/ggml/include/ggml-alloc.h +9 -0
  27. package/src/llama.cpp/ggml/include/ggml-backend.h +1 -0
  28. package/src/llama.cpp/ggml/include/ggml-cpu.h +1 -0
  29. package/src/llama.cpp/ggml/include/ggml.h +7 -8
  30. package/src/llama.cpp/ggml/src/CMakeLists.txt +3 -0
  31. package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +2 -0
  32. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +60 -39
  33. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +4 -0
  34. package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +2 -1
  35. package/src/llama.cpp/include/llama.h +18 -1
  36. package/src/llama.cpp/src/llama-arch.cpp +1890 -2248
  37. package/src/llama.cpp/src/llama-arch.h +9 -2
  38. package/src/llama.cpp/src/llama-batch.cpp +12 -2
  39. package/src/llama.cpp/src/llama-batch.h +4 -2
  40. package/src/llama.cpp/src/llama-context.cpp +93 -23
  41. package/src/llama.cpp/src/llama-context.h +8 -2
  42. package/src/llama.cpp/src/llama-graph.cpp +84 -16
  43. package/src/llama.cpp/src/llama-graph.h +17 -4
  44. package/src/llama.cpp/src/llama-hparams.cpp +6 -0
  45. package/src/llama.cpp/src/llama-hparams.h +5 -1
  46. package/src/llama.cpp/src/llama-impl.cpp +4 -0
  47. package/src/llama.cpp/src/llama-kv-cache.cpp +90 -42
  48. package/src/llama.cpp/src/llama-kv-cache.h +19 -2
  49. package/src/llama.cpp/src/llama-memory-hybrid.cpp +1 -1
  50. package/src/llama.cpp/src/llama-model-loader.cpp +2 -0
  51. package/src/llama.cpp/src/llama-model-loader.h +2 -0
  52. package/src/llama.cpp/src/llama-model.cpp +103 -44
  53. package/src/llama.cpp/src/llama-model.h +1 -0
  54. package/src/llama.cpp/src/llama-quant.cpp +1 -1
  55. package/src/llama.cpp/src/llama-vocab.cpp +2 -1
  56. package/src/llama.cpp/src/llama.cpp +675 -1
  57. package/src/llama.cpp/src/models/deepseek2.cpp +9 -5
  58. package/src/llama.cpp/src/models/glm4-moe.cpp +28 -11
  59. package/src/llama.cpp/src/models/glm4.cpp +27 -4
  60. package/src/llama.cpp/src/models/models.h +5 -5
  61. package/src/llama.cpp/src/models/nemotron-h.cpp +35 -6
  62. package/src/llama.cpp/src/models/qwen2.cpp +12 -3
  63. package/src/llama.cpp/src/models/qwen3next.cpp +81 -266

package/src/llama.cpp/src/llama-model.cpp
@@ -120,6 +120,8 @@ const char * llm_type_name(llm_type type) {
         case LLM_TYPE_16B_A1B:   return "16B.A1B";
         case LLM_TYPE_21B_A3B:   return "21B.A3B";
         case LLM_TYPE_30B_A3B:   return "30B.A3B";
+        case LLM_TYPE_31B_A3_5B: return "31B.A3.5B";
+        case LLM_TYPE_80B_A3B:   return "80B.A3B";
         case LLM_TYPE_100B_A6B:  return "100B.A6B";
         case LLM_TYPE_106B_A12B: return "106B.A12B";
         case LLM_TYPE_230B_A10B: return "230B.A10B";
@@ -667,6 +669,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                 hparams.n_swa = 8192;
                 hparams.n_attn_temp_floor_scale = 8192;
                 hparams.f_attn_temp_scale = 0.1f;
+                hparams.f_attn_temp_offset = 1.0f;
                 hparams.set_swa_pattern(4); // pattern: 3 chunked - 1 full
             }

@@ -1634,12 +1637,19 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     // that have no expert_gating_func model parameter set
                     hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX;
                 }
-                ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul, false);
+
+                if (ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul, 0.0f)) {
+                    // [TAG_DEEPSEEK2_YARN_LOG_MUL_FIX]
+                    // cancel the factor from the convert script
+                    hparams.rope_yarn_log_mul /= 0.1f;
+                }

                 // (optional) temperature tuning - used by mistral-large
                 ml.get_key(LLM_KV_ATTENTION_TEMPERATURE_SCALE, hparams.f_attn_temp_scale, false);
                 ml.get_key(LLM_KV_ATTENTION_TEMPERATURE_LENGTH, hparams.n_attn_temp_floor_scale, false);

+                hparams.f_attn_temp_offset = 0.0f;
+
                 switch (hparams.n_layer) {
                     case 27: type = LLM_TYPE_16B; break;
                     case 60: type = LLM_TYPE_236B; break;
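
The divide-by-0.1 above undoes a scaling that the GGUF conversion bakes into rope.scaling.yarn_log_multiplier, so after loading, hparams.rope_yarn_log_mul again holds the model's mscale_all_dim-style value (consistent with the `0.1f * mscale * logf(scale) + 1.0f` formula removed further down in this file). A minimal worked sketch, assuming the converter stored 0.1 * mscale_all_dim; values are illustrative, not taken from a specific checkpoint:

    // illustrative only: mirrors the cancellation performed in the hunk above
    float mscale_all_dim = 1.0f;                  // original model parameter (assumed)
    float stored         = 0.1f * mscale_all_dim; // value the convert script is assumed to write into the GGUF
    float recovered      = stored / 0.1f;         // what load_hparams() now keeps in rope_yarn_log_mul -> 1.0f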
@@ -1679,7 +1689,8 @@ void llama_model::load_hparams(llama_model_loader & ml) {
             } break;
         case LLM_ARCH_GLM4:
             {
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+                ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, false);
                 switch (hparams.n_layer) {
                     case 40: type = LLM_TYPE_9B; break;
                     case 61: type = LLM_TYPE_32B; break;
@@ -1688,8 +1699,9 @@ void llama_model::load_hparams(llama_model_loader & ml) {
             } break;
         case LLM_ARCH_GLM4_MOE:
             {
-                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+                ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, false);

                 // MoE parameters
                 ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert);
@@ -1788,6 +1800,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                 }
             } break;
         case LLM_ARCH_NEMOTRON_H:
+        case LLM_ARCH_NEMOTRON_H_MOE:
             {
                 ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
                 ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);
@@ -1803,7 +1816,14 @@ void llama_model::load_hparams(llama_model_loader & ml) {

                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);

+                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
+                ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
+                ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared, false);
+                ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
+                ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false);
+
                 switch (hparams.n_layer) {
+                    case 52: type = LLM_TYPE_31B_A3_5B; break; // Nemotron-H_MOE 31B
                     case 56: type = LLM_TYPE_9B; break;
                     default: type = LLM_TYPE_UNKNOWN;
                 }
@@ -2257,7 +2277,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                 }

                 switch (hparams.n_layer) {
-                    case 80: type = LLM_TYPE_80B_A3B; break;
+                    case 48: type = LLM_TYPE_80B_A3B; break;
                     default: type = LLM_TYPE_UNKNOWN;
                 }
             } break;
@@ -2266,9 +2286,11 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                 ml.get_key(LLM_KV_ATTENTION_TEMPERATURE_SCALE, hparams.f_attn_temp_scale, false);

-                ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_FAST, hparams.yarn_beta_fast, false);
-                ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_SLOW, hparams.yarn_beta_slow, false);
-                ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul, false);
+                ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_FAST, hparams.yarn_beta_fast, false);
+                ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_SLOW, hparams.yarn_beta_slow, false);
+                ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul, 0.0f);
+
+                hparams.f_attn_temp_offset = 0.0f;

                 // TODO: maybe add n_attn_temp_floor_scale as a separate KV?
                 if (hparams.f_attn_temp_scale != 0.0f) {
@@ -2278,18 +2300,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     }
                 }

-                // TODO: this seems to be correct with the case of mscale == mscale_all_dims == 1.0f
-                // but may need further verification with other values
-                if (hparams.rope_yarn_log_mul != 0.0f) {
-                    float factor = 1.0f / hparams.rope_freq_scale_train;
-                    float mscale = 1.0f;
-                    float mscale_all_dims = hparams.rope_yarn_log_mul;
-                    static auto get_mscale = [](float scale, float mscale) {
-                        return scale <= 1.0f ? 1.0f : (0.1f * mscale * logf(scale) + 1.0f);
-                    };
-                    hparams.yarn_attn_factor = get_mscale(factor, mscale) / get_mscale(factor, mscale_all_dims);
-                }
-
                 switch (hparams.n_layer) {
                     case 26: type = LLM_TYPE_3B; break;
                     case 34: type = LLM_TYPE_8B; break;
@@ -3389,9 +3399,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                     layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);

                     // optional bias tensors
-                    layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
-                    layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0);
-                    layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
+                    layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
+                    layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
+                    layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);

                     layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);

@@ -5160,6 +5170,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                     }
                 } break;
             case LLM_ARCH_NEMOTRON_H:
+            case LLM_ARCH_NEMOTRON_H_MOE:
                 {
                     // mamba2 Mixer SSM params
                     // NOTE: int64_t for tensor dimensions
@@ -5170,6 +5181,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                     const int64_t n_group = hparams.ssm_n_group;
                     const int64_t d_in_proj = 2*d_inner + 2*n_group*d_state + n_ssm_head;

+                    const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
+                    const int64_t n_ff_shexp = hparams.n_ff_shexp;
+
                     // embeddings
                     tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

@@ -5219,12 +5233,26 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                             layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_k_gqa_i}, TENSOR_NOT_REQUIRED);
                             layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_v_gqa_i}, TENSOR_NOT_REQUIRED);
                             layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
-                        } else {
-                            // mlp layers
-                            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {hparams.n_ff(i), n_embd}, 0);
-                            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, hparams.n_ff(i)}, 0);
-                            layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
-                            layer.ffn_up_b   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "bias", i), {hparams.n_ff(i)}, TENSOR_NOT_REQUIRED);
+                        } else {
+                            if (n_expert != 0) {
+                                layer.ffn_gate_inp    = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP,    "weight", i), {n_embd, n_expert}, 0);
+                                layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias",   i), {n_expert}, 0);
+
+                                // MoE branch
+                                layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
+                                layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd, n_ff_exp, n_expert}, 0);
+
+                                // Shared expert branch
+                                layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd}, 0);
+                                layer.ffn_up_shexp   = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP,   "weight", i), {n_embd, n_ff_shexp}, 0);
+
+                            } else {
+                                // mlp layers
+                                layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {hparams.n_ff(i), n_embd}, 0);
+                                layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, hparams.n_ff(i)}, 0);
+                                layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
+                                layer.ffn_up_b   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "bias", i), {hparams.n_ff(i)}, TENSOR_NOT_REQUIRED);
+                            }
                         }
                     }
                 } break;
@@ -6208,8 +6236,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                 {
                     tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

-                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM_LFM2, "weight"), {n_embd}, 0);
+                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);

                     if (output == NULL) {
                         output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
@@ -6607,9 +6635,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) {

        std::vector<ggml_backend_buffer_ptr> bufs;
        if (ml.use_mmap && use_mmap_buffer && buffer_from_host_ptr_supported && is_default_buft) {
+            GGML_ASSERT(!ml.no_alloc);
            for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
                // only the mmap region containing the tensors in the model is mapped to the backend buffer
-                // this is important for metal with apple silicon: if the entire model could be mapped to a metal buffer, then we could just use metal for all layers
+                // this is important for metal with apple silicon: if the entire model could be mapped to a metal buffer,
+                // then we could just use metal for all layers
                // this allows using partial offloading when the model size exceeds the metal buffer size, but not the RAM size
                void * addr = nullptr;
                size_t first, last; // NOLINT
@@ -6625,9 +6655,16 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                bufs.emplace_back(buf);
                buf_map.emplace(idx, buf);
            }
-        }
-        else {
-            ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
+        } else {
+            ggml_backend_buffer_t buf;
+            if (ml.no_alloc) {
+                buf = ggml_backend_buft_alloc_buffer(buft, /*size =*/ 0); // dummy buffer
+                for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) {
+                    t->buffer = buf; // set dummy buffer for weights so that the backend scheduler won't try to allocate them
+                }
+            } else {
+                buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft); // real buffer
+            }
            if (buf == nullptr) {
                throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft)));
            }
@@ -6682,6 +6719,10 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
        }
    }

+    if (ml.no_alloc) {
+        return true;
+    }
+
    // load tensor data
    for (auto & [ctx, buf_map] : ctx_buf_maps) {
        if (!ml.load_all_data(ctx, buf_map, use_mlock ? &pimpl->mlock_mmaps : NULL, params.progress_callback, params.progress_callback_user_data)) {
@@ -6724,9 +6765,18 @@ size_t llama_model::n_devices() const {

 std::map<ggml_backend_buffer_type_t, size_t> llama_model::memory_breakdown() const {
     std::map<ggml_backend_buffer_type_t, size_t> ret;
-    for (const auto & [_, bufs] : pimpl->ctxs_bufs) {
-        for (const auto & buf : bufs) {
-            ret[ggml_backend_buffer_get_type(buf.get())] += ggml_backend_buffer_get_size(buf.get());
+    for (const auto & [ctx, bufs] : pimpl->ctxs_bufs) {
+        if (hparams.no_alloc) {
+            GGML_ASSERT(bufs.size() == 1);
+            ggml_backend_buffer_t buf = bufs[0].get();
+            GGML_ASSERT(ggml_backend_buffer_get_base(buf) == nullptr);
+            ggml_backend_buffer_type_t buft = ggml_backend_buffer_get_type(buf);
+            ret[buft] += ggml_backend_alloc_ctx_tensors_from_buft_size(ctx.get(), buft);
+        } else {
+            for (const auto & buf : bufs) {
+                // GGML_ASSERT(ggml_backend_buffer_get_base(buf.get()) != nullptr); // multi_buffer does not have a defined base
+                ret[ggml_backend_buffer_get_type(buf.get())] += ggml_backend_buffer_get_size(buf.get());
+            }
         }
     }
     return ret;
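
With no_alloc, each context holds exactly one zero-size dummy buffer, so memory_breakdown() switches from summing what was actually allocated to reporting what a real allocation would need. A minimal sketch of the estimation primitive used above, assuming a ggml_context that holds only tensor metadata and a target buffer type:

    // 'needed' is the byte count that ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft)
    // would reserve for the tensors in ctx, computed without allocating anything
    size_t needed = ggml_backend_alloc_ctx_tensors_from_buft_size(ctx, buft);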
@@ -6771,6 +6821,7 @@ void llama_model::print_info() const {
    // hparams
    LLAMA_LOG_INFO("%s: arch = %s\n", __func__, arch_name().c_str());
    LLAMA_LOG_INFO("%s: vocab_only = %d\n", __func__, hparams.vocab_only);
+    LLAMA_LOG_INFO("%s: no_alloc = %d\n", __func__, hparams.no_alloc);

    if (!hparams.vocab_only) {
        LLAMA_LOG_INFO("%s: n_ctx_train = %u\n", __func__, hparams.n_ctx_train);
@@ -6805,6 +6856,7 @@ void llama_model::print_info() const {
        LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train);
        LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);
        LLAMA_LOG_INFO("%s: n_ctx_orig_yarn = %u\n", __func__, hparams.n_ctx_orig_yarn);
+        LLAMA_LOG_INFO("%s: rope_yarn_log_mul= %.4f\n", __func__, hparams.rope_yarn_log_mul);
        LLAMA_LOG_INFO("%s: rope_finetuned = %s\n", __func__, hparams.rope_finetuned ? "yes" : "unknown");
        // MRoPE (Multi-axis Rotary Position Embedding) sections
        if (const auto & s = hparams.rope_sections; s[0] || s[1] || s[2] || s[3]) {
@@ -6827,7 +6879,8 @@ void llama_model::print_info() const {
        arch == LLM_ARCH_PLAMO2 ||
        arch == LLM_ARCH_GRANITE_HYBRID ||
        arch == LLM_ARCH_QWEN3NEXT ||
-        arch == LLM_ARCH_NEMOTRON_H) {
+        arch == LLM_ARCH_NEMOTRON_H ||
+        arch == LLM_ARCH_NEMOTRON_H_MOE) {
        LLAMA_LOG_INFO("%s: ssm_d_conv = %u\n", __func__, hparams.ssm_d_conv);
        LLAMA_LOG_INFO("%s: ssm_d_inner = %u\n", __func__, hparams.ssm_d_inner);
        LLAMA_LOG_INFO("%s: ssm_d_state = %u\n", __func__, hparams.ssm_d_state);
@@ -6868,7 +6921,6 @@ void llama_model::print_info() const {
        LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
        LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm);
        LLAMA_LOG_INFO("%s: expert_gating_func = %s\n", __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func));
-        LLAMA_LOG_INFO("%s: rope_yarn_log_mul = %.4f\n", __func__, hparams.rope_yarn_log_mul);
    }

    if (arch == LLM_ARCH_QWEN2MOE) {
@@ -6883,7 +6935,8 @@ void llama_model::print_info() const {
    if (arch == LLM_ARCH_MINICPM ||
        arch == LLM_ARCH_GRANITE ||
        arch == LLM_ARCH_GRANITE_MOE ||
-        arch == LLM_ARCH_GRANITE_HYBRID) {
+        arch == LLM_ARCH_GRANITE_HYBRID ||
+        arch == LLM_ARCH_NEMOTRON_H_MOE) {
        LLAMA_LOG_INFO("%s: f_embedding_scale = %f\n", __func__, hparams.f_embedding_scale);
        LLAMA_LOG_INFO("%s: f_residual_scale = %f\n", __func__, hparams.f_residual_scale);
        LLAMA_LOG_INFO("%s: f_attention_scale = %f\n", __func__, hparams.f_attention_scale);
@@ -7064,7 +7117,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
    if (arch == LLM_ARCH_FALCON_H1) {
        filter_attn = [&](int32_t) { return true; };
        filter_recr = [&](int32_t) { return true; };
-    } else if (arch == LLM_ARCH_NEMOTRON_H) {
+    } else if (arch == LLM_ARCH_NEMOTRON_H || arch == LLM_ARCH_NEMOTRON_H_MOE) {
        filter_attn = [&](int32_t il) {
            return !hparams.is_recurrent(il) && hparams.n_ff(il) == 0;
        };
@@ -7435,6 +7488,7 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
                llm = std::make_unique<llm_build_nemotron>(*this, params);
            } break;
        case LLM_ARCH_NEMOTRON_H:
+        case LLM_ARCH_NEMOTRON_H_MOE:
            {
                llm = std::make_unique<llm_build_nemotron_h>(*this, params);
            } break;
@@ -7619,6 +7673,7 @@ llama_model_params llama_model_default_params() {
        /*.check_tensors =*/ false,
        /*.use_extra_bufts =*/ true,
        /*.no_host =*/ false,
+        /*.no_alloc =*/ false,
    };

    return result;
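
The new no_alloc flag is what the loader changes above key off: when it is set, the weight tensors and their per-buffer-type bookkeeping are created, but buffer allocation and tensor data loading are skipped. A hedged usage sketch, assuming the flag is exposed on llama_model_params in this release's llama.h (only the default initializer is visible in this diff):

    #include "llama.h"

    llama_model_params mparams = llama_model_default_params();
    mparams.no_alloc = true; // assumption: field name matches the default shown above

    // the model comes back "metadata only": suitable for inspecting hparams or
    // estimating memory use, not for running inference
    llama_model * model = llama_model_load_from_file("model.gguf", mparams);
    // ... query model properties ...
    llama_model_free(model);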
@@ -7718,6 +7773,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
        case LLM_ARCH_ARWKV7:
        case LLM_ARCH_WAVTOKENIZER_DEC:
        case LLM_ARCH_NEMOTRON_H:
+        case LLM_ARCH_NEMOTRON_H_MOE:
            return LLAMA_ROPE_TYPE_NONE;

        // use what we call a normal RoPE, operating on pairs of consecutive head values
@@ -7738,7 +7794,6 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
        case LLM_ARCH_DEEPSEEK2:
        case LLM_ARCH_PLM:
        case LLM_ARCH_CHATGLM:
-        case LLM_ARCH_GLM4:
        case LLM_ARCH_GRANITE:
        case LLM_ARCH_GRANITE_MOE:
        case LLM_ARCH_GRANITE_HYBRID:
@@ -7800,7 +7855,6 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
        case LLM_ARCH_LFM2:
        case LLM_ARCH_LFM2MOE:
        case LLM_ARCH_SMALLTHINKER:
-        case LLM_ARCH_GLM4_MOE:
        case LLM_ARCH_SEED_OSS:
        case LLM_ARCH_GROVEMOE:
        case LLM_ARCH_APERTUS:
@@ -7817,6 +7871,11 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
        case LLM_ARCH_QWEN3VLMOE:
            return LLAMA_ROPE_TYPE_IMROPE;

+        case LLM_ARCH_GLM4:
+            return model->hparams.use_mrope() ? LLAMA_ROPE_TYPE_MROPE : LLAMA_ROPE_TYPE_NORM;
+        case LLM_ARCH_GLM4_MOE:
+            return model->hparams.use_mrope() ? LLAMA_ROPE_TYPE_MROPE : LLAMA_ROPE_TYPE_NEOX;
+
        // all model arches should be listed explicitly here
        case LLM_ARCH_UNKNOWN:
            GGML_ABORT("unknown architecture");

package/src/llama.cpp/src/llama-model.h
@@ -113,6 +113,7 @@ enum llm_type {
    LLM_TYPE_16B_A1B,
    LLM_TYPE_21B_A3B, // Ernie MoE small
    LLM_TYPE_30B_A3B,
+    LLM_TYPE_31B_A3_5B,
    LLM_TYPE_80B_A3B, // Qwen3 Next
    LLM_TYPE_100B_A6B,
    LLM_TYPE_106B_A12B, // GLM-4.5-Air

package/src/llama.cpp/src/llama-quant.cpp
@@ -596,7 +596,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
    }

    std::vector<std::string> splits = {};
-    llama_model_loader ml(fname_inp, splits, use_mmap, /*check_tensors*/ true, kv_overrides, nullptr);
+    llama_model_loader ml(fname_inp, splits, use_mmap, /*check_tensors*/ true, /*no_alloc*/ false, kv_overrides, nullptr);
    ml.init_mappings(false); // no prefetching

    llama_model model(llama_model_default_params());

package/src/llama.cpp/src/llama-vocab.cpp
@@ -1895,7 +1895,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
            clean_spaces = false;
        } else if (
            tokenizer_pre == "qwen2" ||
-            tokenizer_pre == "deepseek-r1-qwen") {
+            tokenizer_pre == "deepseek-r1-qwen" ||
+            tokenizer_pre == "kormo") {
            pre_type = LLAMA_VOCAB_PRE_TYPE_QWEN2;
            clean_spaces = false;
        } else if (