@fugood/llama.node 1.4.7 → 1.4.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70)
  1. package/lib/binding.ts +8 -0
  2. package/package.json +15 -15
  3. package/scripts/llama.cpp.patch +23 -24
  4. package/src/LlamaContext.cpp +4 -2
  5. package/src/llama.cpp/common/CMakeLists.txt +2 -0
  6. package/src/llama.cpp/common/arg.cpp +470 -223
  7. package/src/llama.cpp/common/arg.h +43 -2
  8. package/src/llama.cpp/common/chat-peg-parser.cpp +16 -2
  9. package/src/llama.cpp/common/chat.cpp +140 -0
  10. package/src/llama.cpp/common/common.cpp +130 -67
  11. package/src/llama.cpp/common/common.h +44 -17
  12. package/src/llama.cpp/common/console.cpp +98 -18
  13. package/src/llama.cpp/common/console.h +30 -8
  14. package/src/llama.cpp/common/download.cpp +69 -25
  15. package/src/llama.cpp/common/json-schema-to-grammar.cpp +132 -3
  16. package/src/llama.cpp/common/json-schema-to-grammar.h +20 -0
  17. package/src/llama.cpp/common/log.cpp +5 -0
  18. package/src/llama.cpp/common/log.h +1 -0
  19. package/src/llama.cpp/common/peg-parser.cpp +1 -1
  20. package/src/llama.cpp/common/preset.cpp +206 -0
  21. package/src/llama.cpp/common/preset.h +32 -0
  22. package/src/llama.cpp/common/sampling.cpp +67 -54
  23. package/src/llama.cpp/common/sampling.h +8 -0
  24. package/src/llama.cpp/ggml/CMakeLists.txt +4 -0
  25. package/src/llama.cpp/ggml/include/ggml-alloc.h +9 -0
  26. package/src/llama.cpp/ggml/include/ggml-backend.h +1 -0
  27. package/src/llama.cpp/ggml/include/ggml-cpu.h +1 -0
  28. package/src/llama.cpp/ggml/include/ggml.h +7 -8
  29. package/src/llama.cpp/ggml/src/CMakeLists.txt +3 -0
  30. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +4 -0
  31. package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +285 -0
  32. package/src/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +28 -0
  33. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +111 -45
  34. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +4 -0
  35. package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +288 -1
  36. package/src/llama.cpp/ggml/src/ggml-cpu/repack.h +8 -0
  37. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +41 -1
  38. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +125 -22
  39. package/src/llama.cpp/include/llama.h +18 -1
  40. package/src/llama.cpp/src/llama-arch.cpp +1890 -2248
  41. package/src/llama.cpp/src/llama-arch.h +9 -2
  42. package/src/llama.cpp/src/llama-batch.cpp +12 -2
  43. package/src/llama.cpp/src/llama-batch.h +4 -2
  44. package/src/llama.cpp/src/llama-context.cpp +93 -23
  45. package/src/llama.cpp/src/llama-context.h +8 -2
  46. package/src/llama.cpp/src/llama-graph.cpp +84 -16
  47. package/src/llama.cpp/src/llama-graph.h +17 -4
  48. package/src/llama.cpp/src/llama-hparams.cpp +6 -0
  49. package/src/llama.cpp/src/llama-hparams.h +5 -1
  50. package/src/llama.cpp/src/llama-impl.cpp +4 -0
  51. package/src/llama.cpp/src/llama-kv-cache.cpp +90 -42
  52. package/src/llama.cpp/src/llama-kv-cache.h +19 -2
  53. package/src/llama.cpp/src/llama-memory-hybrid.cpp +1 -1
  54. package/src/llama.cpp/src/llama-mmap.cpp +123 -28
  55. package/src/llama.cpp/src/llama-mmap.h +5 -1
  56. package/src/llama.cpp/src/llama-model-loader.cpp +58 -13
  57. package/src/llama.cpp/src/llama-model-loader.h +2 -0
  58. package/src/llama.cpp/src/llama-model.cpp +110 -49
  59. package/src/llama.cpp/src/llama-model.h +1 -0
  60. package/src/llama.cpp/src/llama-quant.cpp +1 -1
  61. package/src/llama.cpp/src/llama-sampling.cpp +16 -0
  62. package/src/llama.cpp/src/llama-vocab.cpp +2 -1
  63. package/src/llama.cpp/src/llama.cpp +665 -1
  64. package/src/llama.cpp/src/models/deepseek2.cpp +9 -5
  65. package/src/llama.cpp/src/models/glm4-moe.cpp +28 -11
  66. package/src/llama.cpp/src/models/glm4.cpp +27 -4
  67. package/src/llama.cpp/src/models/models.h +5 -5
  68. package/src/llama.cpp/src/models/nemotron-h.cpp +35 -6
  69. package/src/llama.cpp/src/models/qwen2.cpp +12 -3
  70. package/src/llama.cpp/src/models/qwen3next.cpp +81 -266

package/src/llama.cpp/src/llama-model-loader.cpp

@@ -473,6 +473,7 @@ llama_model_loader::llama_model_loader(
  std::vector<std::string> & splits,
  bool use_mmap,
  bool check_tensors,
+ bool no_alloc,
  const llama_model_kv_override * param_overrides_p,
  const llama_model_tensor_buft_override * param_tensor_buft_overrides_p) {
  int trace = 0;
@@ -503,7 +504,7 @@ llama_model_loader::llama_model_loader(
  get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
  llm_kv = LLM_KV(llm_arch_from_string(arch_name));

- files.emplace_back(new llama_file(fname.c_str(), "rb"));
+ files.emplace_back(new llama_file(fname.c_str(), "rb", !use_mmap));
  contexts.emplace_back(ctx);

  // Save tensors data offset of the main file.
@@ -571,7 +572,7 @@ llama_model_loader::llama_model_loader(
  }
  }

- files.emplace_back(new llama_file(fname_split, "rb"));
+ files.emplace_back(new llama_file(fname_split, "rb", !use_mmap));
  contexts.emplace_back(ctx);

  // Save tensors data offset info of the shard.
@@ -716,6 +717,7 @@ llama_model_loader::llama_model_loader(

  this->use_mmap = use_mmap;
  this->check_tensors = check_tensors;
+ this->no_alloc = no_alloc;
  }

  std::string llama_model_loader::get_arch_name() const {
@@ -933,7 +935,15 @@ bool llama_model_loader::load_all_data(
  // 4 staging buffers for async uploads, each sized 1MB seems to be a good default for single NVMe drives.
  // NVMe raid configurations might require more / larger buffers.
  constexpr size_t n_buffers = 4;
- constexpr size_t buffer_size = 1 * 1024 * 1024; // 1MB
+
+ size_t alignment = 1;
+ for (const auto & file : files) {
+ alignment = std::max(file->read_alignment(), alignment);
+ }
+
+ // Buffer size: balance between memory usage and I/O efficiency
+ // 64MB works well for NVMe drives
+ const size_t buffer_size = alignment != 1 ? 64 * 1024 * 1024 + 2 * alignment : 1 * 1024 * 1024;

  std::vector<ggml_backend_buffer_t> host_buffers;
  std::vector<ggml_backend_event_t> events;
@@ -983,6 +993,7 @@ bool llama_model_loader::load_all_data(
  // If the backend is supported, create pinned memory buffers and events for synchronisation.
  for (size_t idx = 0; idx < n_buffers; ++idx) {
  auto * buf = ggml_backend_buft_alloc_buffer(host_buft, buffer_size);
+
  if (!buf) {
  LLAMA_LOG_DEBUG("%s: failed to allocate host buffer for async uploads for device %s\n", func,
  ggml_backend_dev_name(dev));
@@ -1064,9 +1075,9 @@ bool llama_model_loader::load_all_data(
  }
  } else {
  const auto & file = files.at(weight->idx);
+
  if (ggml_backend_buffer_is_host(cur->buffer)) {
- file->seek(weight->offs, SEEK_SET);
- file->read_raw(cur->data, n_size);
+ file->read_raw_at(cur->data, n_size, weight->offs);
  if (check_tensors) {
  validation_result.emplace_back(std::async(std::launch::async, [cur, n_size] {
  return std::make_pair(cur, ggml_validate_row_data(cur->type, cur->data, n_size));
@@ -1075,26 +1086,60 @@ bool llama_model_loader::load_all_data(
  } else {
  // If upload_backend is valid load the tensor in chunks to pinned memory and upload the buffers asynchronously to the GPU.
  if (upload_backend) {
- file->seek(weight->offs, SEEK_SET);
+ auto offset = (off_t) weight->offs;
+ alignment = file->read_alignment();
+ off_t aligned_offset = offset & ~(alignment - 1);
+ off_t offset_from_alignment = offset - aligned_offset;
+ file->seek(aligned_offset, SEEK_SET);
+
+ // Calculate aligned read boundaries
+ size_t read_start = aligned_offset;
+ size_t read_end = (offset + n_size + alignment - 1) & ~(alignment - 1);

  size_t bytes_read = 0;
+ size_t data_read = 0; // Actual tensor data copied (excluding padding)
+
+ while (bytes_read < read_end - read_start) {
+ size_t read_size = std::min<size_t>(buffer_size, read_end - read_start - bytes_read);

- while (bytes_read < n_size) {
- size_t read_iteration = std::min<size_t>(buffer_size, n_size - bytes_read);
+ // Align the destination pointer within the pinned buffer
+ uintptr_t ptr_dest_aligned = (reinterpret_cast<uintptr_t>(host_ptrs[buffer_idx]) + alignment - 1) & ~(alignment - 1);

+ // Wait for previous upload to complete before reusing buffer
  ggml_backend_event_synchronize(events[buffer_idx]);
- file->read_raw(host_ptrs[buffer_idx], read_iteration);
- ggml_backend_tensor_set_async(upload_backend, cur, host_ptrs[buffer_idx], bytes_read, read_iteration);
+
+ // Read aligned chunk from file
+ file->read_raw(reinterpret_cast<void *>(ptr_dest_aligned), read_size);
+
+ // Calculate actual data portion (excluding alignment padding)
+ uintptr_t ptr_data = ptr_dest_aligned;
+ size_t data_to_copy = read_size;
+
+ // Skip alignment padding at start of first chunk
+ if (bytes_read == 0) {
+ ptr_data += offset_from_alignment;
+ data_to_copy -= offset_from_alignment;
+ }
+
+ // Trim alignment padding at end of last chunk
+ if (aligned_offset + bytes_read + read_size > offset + n_size) {
+ data_to_copy -= (read_end - (offset + n_size));
+ }
+
+ // Async upload actual data to GPU
+ ggml_backend_tensor_set_async(upload_backend, cur,
+ reinterpret_cast<void *>(ptr_data), data_read, data_to_copy);
  ggml_backend_event_record(events[buffer_idx], upload_backend);

- bytes_read += read_iteration;
+ data_read += data_to_copy;
+ bytes_read += read_size;
+
  ++buffer_idx;
  buffer_idx %= n_buffers;
  }
  } else {
  read_buf.resize(n_size);
- file->seek(weight->offs, SEEK_SET);
- file->read_raw(read_buf.data(), n_size);
+ file->read_raw_at(read_buf.data(), n_size, weight->offs);
  ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size);
  if (check_tensors && !ggml_validate_row_data(cur->type, read_buf.data(), n_size)) {
  throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur)));
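
Note: the aligned-read path above rounds each tensor's file offset down to the file's read alignment and rounds the end of the read up, then skips the head padding of the first chunk and trims the tail padding of the last chunk before handing the bytes to ggml_backend_tensor_set_async. A minimal standalone sketch of that boundary arithmetic; the 4 KiB alignment and the offsets are illustrative values, not taken from the package:

    #include <cstddef>
    #include <cstdio>

    int main() {
        // Illustrative values only: a 4 KiB read alignment (typical for direct I/O)
        // and a tensor that starts at an unaligned offset within the GGUF file.
        const size_t alignment = 4096;
        const size_t offset    = 10000;  // tensor data offset in the file
        const size_t n_size    = 50000;  // tensor size in bytes

        // Round the start down and the end up to the alignment boundary,
        // mirroring read_start / read_end in the hunk above.
        const size_t read_start = offset & ~(alignment - 1);
        const size_t read_end   = (offset + n_size + alignment - 1) & ~(alignment - 1);

        // Padding skipped at the head of the first chunk and trimmed from the
        // tail of the last chunk before the async upload.
        const size_t head_pad = offset - read_start;
        const size_t tail_pad = read_end - (offset + n_size);

        printf("read_start=%zu read_end=%zu head_pad=%zu tail_pad=%zu total_read=%zu\n",
               read_start, read_end, head_pad, tail_pad, read_end - read_start);
        return 0;
    }
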

package/src/llama.cpp/src/llama-model-loader.h

@@ -71,6 +71,7 @@ struct llama_model_loader {

  bool use_mmap = false;
  bool check_tensors;
+ bool no_alloc;

  llama_files files;
  llama_ftype ftype;
@@ -97,6 +98,7 @@ struct llama_model_loader {
  std::vector<std::string> & splits, // optional, only need if the split does not follow naming scheme
  bool use_mmap,
  bool check_tensors,
+ bool no_alloc,
  const llama_model_kv_override * param_overrides_p,
  const llama_model_tensor_buft_override * param_tensor_buft_overrides_p);


package/src/llama.cpp/src/llama-model.cpp

@@ -120,6 +120,8 @@ const char * llm_type_name(llm_type type) {
  case LLM_TYPE_16B_A1B: return "16B.A1B";
  case LLM_TYPE_21B_A3B: return "21B.A3B";
  case LLM_TYPE_30B_A3B: return "30B.A3B";
+ case LLM_TYPE_31B_A3_5B: return "31B.A3.5B";
+ case LLM_TYPE_80B_A3B: return "80B.A3B";
  case LLM_TYPE_100B_A6B: return "100B.A6B";
  case LLM_TYPE_106B_A12B: return "106B.A12B";
  case LLM_TYPE_230B_A10B: return "230B.A10B";
@@ -667,6 +669,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  hparams.n_swa = 8192;
  hparams.n_attn_temp_floor_scale = 8192;
  hparams.f_attn_temp_scale = 0.1f;
+ hparams.f_attn_temp_offset = 1.0f;
  hparams.set_swa_pattern(4); // pattern: 3 chunked - 1 full
  }

@@ -1634,12 +1637,19 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  // that have no expert_gating_func model parameter set
  hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX;
  }
- ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul, false);
+
+ if (ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul, 0.0f)) {
+ // [TAG_DEEPSEEK2_YARN_LOG_MUL_FIX]
+ // cancel the factor from the convert script
+ hparams.rope_yarn_log_mul /= 0.1f;
+ }

  // (optional) temperature tuning - used by mistral-large
  ml.get_key(LLM_KV_ATTENTION_TEMPERATURE_SCALE, hparams.f_attn_temp_scale, false);
  ml.get_key(LLM_KV_ATTENTION_TEMPERATURE_LENGTH, hparams.n_attn_temp_floor_scale, false);

+ hparams.f_attn_temp_offset = 0.0f;
+
  switch (hparams.n_layer) {
  case 27: type = LLM_TYPE_16B; break;
  case 60: type = LLM_TYPE_236B; break;
@@ -1679,7 +1689,8 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  } break;
  case LLM_ARCH_GLM4:
  {
- ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, false);
  switch (hparams.n_layer) {
  case 40: type = LLM_TYPE_9B; break;
  case 61: type = LLM_TYPE_32B; break;
@@ -1688,8 +1699,9 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  } break;
  case LLM_ARCH_GLM4_MOE:
  {
- ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
- ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, false);

  // MoE parameters
  ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert);
@@ -1788,6 +1800,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  }
  } break;
  case LLM_ARCH_NEMOTRON_H:
+ case LLM_ARCH_NEMOTRON_H_MOE:
  {
  ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
  ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);
@@ -1803,7 +1816,14 @@ void llama_model::load_hparams(llama_model_loader & ml) {

  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);

+ ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
+ ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
+ ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared, false);
+ ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
+ ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false);
+
  switch (hparams.n_layer) {
+ case 52: type = LLM_TYPE_31B_A3_5B; break; // Nemotron-H_MOE 31B
  case 56: type = LLM_TYPE_9B; break;
  default: type = LLM_TYPE_UNKNOWN;
  }
@@ -2257,7 +2277,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  }

  switch (hparams.n_layer) {
- case 80: type = LLM_TYPE_80B_A3B; break;
+ case 48: type = LLM_TYPE_80B_A3B; break;
  default: type = LLM_TYPE_UNKNOWN;
  }
  } break;
@@ -2266,9 +2286,11 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  ml.get_key(LLM_KV_ATTENTION_TEMPERATURE_SCALE, hparams.f_attn_temp_scale, false);

- ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_FAST, hparams.yarn_beta_fast, false);
- ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_SLOW, hparams.yarn_beta_slow, false);
- ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul, false);
+ ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_FAST, hparams.yarn_beta_fast, false);
+ ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_SLOW, hparams.yarn_beta_slow, false);
+ ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul, 0.0f);
+
+ hparams.f_attn_temp_offset = 0.0f;

  // TODO: maybe add n_attn_temp_floor_scale as a separate KV?
  if (hparams.f_attn_temp_scale != 0.0f) {
@@ -2278,18 +2300,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  }
  }

- // TODO: this seems to be correct with the case of mscale == mscale_all_dims == 1.0f
- // but may need further verification with other values
- if (hparams.rope_yarn_log_mul != 0.0f) {
- float factor = 1.0f / hparams.rope_freq_scale_train;
- float mscale = 1.0f;
- float mscale_all_dims = hparams.rope_yarn_log_mul;
- static auto get_mscale = [](float scale, float mscale) {
- return scale <= 1.0f ? 1.0f : (0.1f * mscale * logf(scale) + 1.0f);
- };
- hparams.yarn_attn_factor = get_mscale(factor, mscale) / get_mscale(factor, mscale_all_dims);
- }
-
  switch (hparams.n_layer) {
  case 26: type = LLM_TYPE_3B; break;
  case 34: type = LLM_TYPE_8B; break;
@@ -2368,10 +2378,10 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  if (cpu_dev == nullptr) {
  throw std::runtime_error(format("%s: no CPU backend found", __func__));
  }
- const int i_gpu_start = std::max((int) hparams.n_layer - n_gpu_layers, (int) 0);
- const int act_gpu_layers = devices.empty() ? 0 : std::min(n_gpu_layers, (int)n_layer + 1);
+ const int i_gpu_start = std::max(int(hparams.n_layer) + 1 - n_gpu_layers, 0);
+ const int act_gpu_layers = devices.empty() ? 0 : std::min(n_gpu_layers, int(n_layer) + 1);
  auto get_layer_buft_list = [&](int il) -> llama_model::impl::layer_dev {
- const bool is_swa = il < (int) hparams.n_layer && hparams.is_swa(il);
+ const bool is_swa = il < int(hparams.n_layer) && hparams.is_swa(il);
  if (il < i_gpu_start || (il - i_gpu_start) >= act_gpu_layers) {
  LLAMA_LOG_DEBUG("load_tensors: layer %3d assigned to device %s, is_swa = %d\n", il, ggml_backend_dev_name(cpu_dev), is_swa);
  return {cpu_dev, &pimpl->cpu_buft_list};
@@ -3389,9 +3399,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);

  // optional bias tensors
- layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
- layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0);
- layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
+ layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
+ layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
+ layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);

  layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);

@@ -5160,6 +5170,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  }
  } break;
  case LLM_ARCH_NEMOTRON_H:
+ case LLM_ARCH_NEMOTRON_H_MOE:
  {
  // mamba2 Mixer SSM params
  // NOTE: int64_t for tensor dimensions
@@ -5170,6 +5181,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  const int64_t n_group = hparams.ssm_n_group;
  const int64_t d_in_proj = 2*d_inner + 2*n_group*d_state + n_ssm_head;

+ const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
+ const int64_t n_ff_shexp = hparams.n_ff_shexp;
+
  // embeddings
  tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

@@ -5219,12 +5233,26 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_k_gqa_i}, TENSOR_NOT_REQUIRED);
  layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_v_gqa_i}, TENSOR_NOT_REQUIRED);
  layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
- } else {
- // mlp layers
- layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { hparams.n_ff(i), n_embd}, 0);
- layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, hparams.n_ff(i)}, 0);
- layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
- layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {hparams.n_ff(i)}, TENSOR_NOT_REQUIRED);
+ } else {
+ if (n_expert != 0) {
+ layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert}, 0);
+ layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert }, 0);
+
+ // MoE branch
+ layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
+ layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
+
+ // Shared expert branch
+ layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd}, 0);
+ layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_shexp}, 0);
+
+ } else {
+ // mlp layers
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { hparams.n_ff(i), n_embd}, 0);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, hparams.n_ff(i)}, 0);
+ layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
+ layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {hparams.n_ff(i)}, TENSOR_NOT_REQUIRED);
+ }
  }
  }
  } break;
@@ -6208,8 +6236,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  {
  tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

- output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
- output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM_LFM2, "weight"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);

  if (output == NULL) {
  output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
@@ -6607,9 +6635,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) {

  std::vector<ggml_backend_buffer_ptr> bufs;
  if (ml.use_mmap && use_mmap_buffer && buffer_from_host_ptr_supported && is_default_buft) {
+ GGML_ASSERT(!ml.no_alloc);
  for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
  // only the mmap region containing the tensors in the model is mapped to the backend buffer
- // this is important for metal with apple silicon: if the entire model could be mapped to a metal buffer, then we could just use metal for all layers
+ // this is important for metal with apple silicon: if the entire model could be mapped to a metal buffer,
+ // then we could just use metal for all layers
  // this allows using partial offloading when the model size exceeds the metal buffer size, but not the RAM size
  void * addr = nullptr;
  size_t first, last; // NOLINT
@@ -6625,9 +6655,16 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  bufs.emplace_back(buf);
  buf_map.emplace(idx, buf);
  }
- }
- else {
- ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
+ } else {
+ ggml_backend_buffer_t buf;
+ if (ml.no_alloc) {
+ buf = ggml_backend_buft_alloc_buffer(buft, /*size =*/ 0); // dummy buffer
+ for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) {
+ t->buffer = buf; // set dummy buffer for weights so that the backend scheduler won't try to allocate them
+ }
+ } else {
+ buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft); // real buffer
+ }
  if (buf == nullptr) {
  throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft)));
  }
@@ -6656,10 +6693,12 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  if (llama_supports_gpu_offload()) {
  const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));

- LLAMA_LOG_INFO("%s: offloading %d repeating layers to GPU\n", __func__, n_gpu);
- if (n_gpu_layers > (int) hparams.n_layer) {
+ int n_repeating = n_gpu;
+ if (n_repeating > 0) {
  LLAMA_LOG_INFO("%s: offloading output layer to GPU\n", __func__);
+ n_repeating--;
  }
+ LLAMA_LOG_INFO("%s: offloading %d repeating layers to GPU\n", __func__, n_repeating);

  const int max_backend_supported_layers = hparams.n_layer + 1;
  const int max_offloadable_layers = hparams.n_layer + 1;
@@ -6682,6 +6721,10 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  }
  }

+ if (ml.no_alloc) {
+ return true;
+ }
+
  // load tensor data
  for (auto & [ctx, buf_map] : ctx_buf_maps) {
  if (!ml.load_all_data(ctx, buf_map, use_mlock ? &pimpl->mlock_mmaps : NULL, params.progress_callback, params.progress_callback_user_data)) {
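
Note: with no_alloc set, load_tensors() returns right after the buffers are planned, so the data-loading loop above never runs and the weights are never read. A hedged usage sketch, assuming the no_alloc flag initialized in the llama_model_default_params() hunk further down is also exposed on the public llama_model_params struct (treat that field access as an assumption):

    // Minimal sketch: load metadata and plan buffers without allocating weights.
    #include "llama.h"
    #include <cstdio>

    int main(int argc, char ** argv) {
        if (argc < 2) {
            fprintf(stderr, "usage: %s <model.gguf>\n", argv[0]);
            return 1;
        }

        llama_model_params mparams = llama_model_default_params();
        mparams.no_alloc = true;  // assumed field: plan buffers only, skip weight data

        // With the early return shown above, loading should stop before load_all_data(),
        // which makes this useful for cheap metadata / memory-requirement inspection.
        llama_model * model = llama_model_load_from_file(argv[1], mparams);
        if (model == nullptr) {
            fprintf(stderr, "failed to load model\n");
            return 1;
        }

        llama_model_free(model);
        return 0;
    }
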
@@ -6724,9 +6767,18 @@ size_t llama_model::n_devices() const {

  std::map<ggml_backend_buffer_type_t, size_t> llama_model::memory_breakdown() const {
  std::map<ggml_backend_buffer_type_t, size_t> ret;
- for (const auto & [_, bufs] : pimpl->ctxs_bufs) {
- for (const auto & buf : bufs) {
- ret[ggml_backend_buffer_get_type(buf.get())] += ggml_backend_buffer_get_size(buf.get());
+ for (const auto & [ctx, bufs] : pimpl->ctxs_bufs) {
+ if (hparams.no_alloc) {
+ GGML_ASSERT(bufs.size() == 1);
+ ggml_backend_buffer_t buf = bufs[0].get();
+ GGML_ASSERT(ggml_backend_buffer_get_base(buf) == nullptr);
+ ggml_backend_buffer_type_t buft = ggml_backend_buffer_get_type(buf);
+ ret[buft] += ggml_backend_alloc_ctx_tensors_from_buft_size(ctx.get(), buft);
+ } else {
+ for (const auto & buf : bufs) {
+ // GGML_ASSERT(ggml_backend_buffer_get_base(buf.get()) != nullptr); // multi_buffer does not have a defined base
+ ret[ggml_backend_buffer_get_type(buf.get())] += ggml_backend_buffer_get_size(buf.get());
+ }
  }
  }
  return ret;
@@ -6771,6 +6823,7 @@ void llama_model::print_info() const {
  // hparams
  LLAMA_LOG_INFO("%s: arch = %s\n", __func__, arch_name().c_str());
  LLAMA_LOG_INFO("%s: vocab_only = %d\n", __func__, hparams.vocab_only);
+ LLAMA_LOG_INFO("%s: no_alloc = %d\n", __func__, hparams.no_alloc);

  if (!hparams.vocab_only) {
  LLAMA_LOG_INFO("%s: n_ctx_train = %u\n", __func__, hparams.n_ctx_train);
@@ -6805,6 +6858,7 @@ void llama_model::print_info() const {
  LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train);
  LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);
  LLAMA_LOG_INFO("%s: n_ctx_orig_yarn = %u\n", __func__, hparams.n_ctx_orig_yarn);
+ LLAMA_LOG_INFO("%s: rope_yarn_log_mul= %.4f\n", __func__, hparams.rope_yarn_log_mul);
  LLAMA_LOG_INFO("%s: rope_finetuned = %s\n", __func__, hparams.rope_finetuned ? "yes" : "unknown");
  // MRoPE (Multi-axis Rotary Position Embedding) sections
  if (const auto & s = hparams.rope_sections; s[0] || s[1] || s[2] || s[3]) {
@@ -6827,7 +6881,8 @@ void llama_model::print_info() const {
  arch == LLM_ARCH_PLAMO2 ||
  arch == LLM_ARCH_GRANITE_HYBRID ||
  arch == LLM_ARCH_QWEN3NEXT ||
- arch == LLM_ARCH_NEMOTRON_H) {
+ arch == LLM_ARCH_NEMOTRON_H ||
+ arch == LLM_ARCH_NEMOTRON_H_MOE) {
  LLAMA_LOG_INFO("%s: ssm_d_conv = %u\n", __func__, hparams.ssm_d_conv);
  LLAMA_LOG_INFO("%s: ssm_d_inner = %u\n", __func__, hparams.ssm_d_inner);
  LLAMA_LOG_INFO("%s: ssm_d_state = %u\n", __func__, hparams.ssm_d_state);
@@ -6868,7 +6923,6 @@ void llama_model::print_info() const {
  LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
  LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm);
  LLAMA_LOG_INFO("%s: expert_gating_func = %s\n", __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func));
- LLAMA_LOG_INFO("%s: rope_yarn_log_mul = %.4f\n", __func__, hparams.rope_yarn_log_mul);
  }

  if (arch == LLM_ARCH_QWEN2MOE) {
@@ -6883,7 +6937,8 @@ void llama_model::print_info() const {
  if (arch == LLM_ARCH_MINICPM ||
  arch == LLM_ARCH_GRANITE ||
  arch == LLM_ARCH_GRANITE_MOE ||
- arch == LLM_ARCH_GRANITE_HYBRID) {
+ arch == LLM_ARCH_GRANITE_HYBRID ||
+ arch == LLM_ARCH_NEMOTRON_H_MOE) {
  LLAMA_LOG_INFO("%s: f_embedding_scale = %f\n", __func__, hparams.f_embedding_scale);
  LLAMA_LOG_INFO("%s: f_residual_scale = %f\n", __func__, hparams.f_residual_scale);
  LLAMA_LOG_INFO("%s: f_attention_scale = %f\n", __func__, hparams.f_attention_scale);
@@ -7064,7 +7119,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
  if (arch == LLM_ARCH_FALCON_H1) {
  filter_attn = [&](int32_t) { return true; };
  filter_recr = [&](int32_t) { return true; };
- } else if (arch == LLM_ARCH_NEMOTRON_H) {
+ } else if (arch == LLM_ARCH_NEMOTRON_H || arch == LLM_ARCH_NEMOTRON_H_MOE) {
  filter_attn = [&](int32_t il) {
  return !hparams.is_recurrent(il) && hparams.n_ff(il) == 0;
  };
@@ -7435,6 +7490,7 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
  llm = std::make_unique<llm_build_nemotron>(*this, params);
  } break;
  case LLM_ARCH_NEMOTRON_H:
+ case LLM_ARCH_NEMOTRON_H_MOE:
  {
  llm = std::make_unique<llm_build_nemotron_h>(*this, params);
  } break;
@@ -7619,6 +7675,7 @@ llama_model_params llama_model_default_params() {
  /*.check_tensors =*/ false,
  /*.use_extra_bufts =*/ true,
  /*.no_host =*/ false,
+ /*.no_alloc =*/ false,
  };

  return result;
@@ -7718,6 +7775,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
  case LLM_ARCH_ARWKV7:
  case LLM_ARCH_WAVTOKENIZER_DEC:
  case LLM_ARCH_NEMOTRON_H:
+ case LLM_ARCH_NEMOTRON_H_MOE:
  return LLAMA_ROPE_TYPE_NONE;

  // use what we call a normal RoPE, operating on pairs of consecutive head values
@@ -7738,7 +7796,6 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
  case LLM_ARCH_DEEPSEEK2:
  case LLM_ARCH_PLM:
  case LLM_ARCH_CHATGLM:
- case LLM_ARCH_GLM4:
  case LLM_ARCH_GRANITE:
  case LLM_ARCH_GRANITE_MOE:
  case LLM_ARCH_GRANITE_HYBRID:
@@ -7800,7 +7857,6 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
  case LLM_ARCH_LFM2:
  case LLM_ARCH_LFM2MOE:
  case LLM_ARCH_SMALLTHINKER:
- case LLM_ARCH_GLM4_MOE:
  case LLM_ARCH_SEED_OSS:
  case LLM_ARCH_GROVEMOE:
  case LLM_ARCH_APERTUS:
@@ -7817,6 +7873,11 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
  case LLM_ARCH_QWEN3VLMOE:
  return LLAMA_ROPE_TYPE_IMROPE;

+ case LLM_ARCH_GLM4:
+ return model->hparams.use_mrope() ? LLAMA_ROPE_TYPE_MROPE : LLAMA_ROPE_TYPE_NORM;
+ case LLM_ARCH_GLM4_MOE:
+ return model->hparams.use_mrope() ? LLAMA_ROPE_TYPE_MROPE : LLAMA_ROPE_TYPE_NEOX;
+
  // all model arches should be listed explicitly here
  case LLM_ARCH_UNKNOWN:
  GGML_ABORT("unknown architecture");

package/src/llama.cpp/src/llama-model.h

@@ -113,6 +113,7 @@ enum llm_type {
  LLM_TYPE_16B_A1B,
  LLM_TYPE_21B_A3B, // Ernie MoE small
  LLM_TYPE_30B_A3B,
+ LLM_TYPE_31B_A3_5B,
  LLM_TYPE_80B_A3B, // Qwen3 Next
  LLM_TYPE_100B_A6B,
  LLM_TYPE_106B_A12B, // GLM-4.5-Air

package/src/llama.cpp/src/llama-quant.cpp

@@ -596,7 +596,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
  }

  std::vector<std::string> splits = {};
- llama_model_loader ml(fname_inp, splits, use_mmap, /*check_tensors*/ true, kv_overrides, nullptr);
+ llama_model_loader ml(fname_inp, splits, use_mmap, /*check_tensors*/ true, /*no_alloc*/ false, kv_overrides, nullptr);
  ml.init_mappings(false); // no prefetching

  llama_model model(llama_model_default_params());

package/src/llama.cpp/src/llama-sampling.cpp

@@ -362,23 +362,39 @@ const char * llama_sampler_name(const struct llama_sampler * smpl) {
  }

  void llama_sampler_accept(struct llama_sampler * smpl, llama_token token) {
+ if (!smpl) {
+ return;
+ }
+
  if (smpl->iface->accept) {
  smpl->iface->accept(smpl, token);
  }
  }

  void llama_sampler_apply(struct llama_sampler * smpl, struct llama_token_data_array * cur_p) {
+ if (!smpl) {
+ return;
+ }
+
  GGML_ASSERT(smpl->iface->apply);
  smpl->iface->apply(smpl, cur_p);
  }

  void llama_sampler_reset(struct llama_sampler * smpl) {
+ if (!smpl) {
+ return;
+ }
+
  if (smpl->iface->reset) {
  smpl->iface->reset(smpl);
  }
  }

  struct llama_sampler * llama_sampler_clone(const struct llama_sampler * smpl) {
+ if (!smpl) {
+ return nullptr;
+ }
+
  if (smpl->iface->clone) {
  return smpl->iface->clone(smpl);
  }
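
Note: the added guards make the llama_sampler_* entry points tolerate a null sampler (accept/apply/reset become no-ops, clone returns nullptr). A small caller-side sketch of what that allows; the helper names are illustrative, not part of the package:

    #include "llama.h"

    // With the guards above, an optional sampler can be threaded through without
    // caller-side null checks.
    void on_token(llama_sampler * smpl /* may be nullptr */, llama_token token) {
        llama_sampler_accept(smpl, token);   // safe no-op when smpl == nullptr
    }

    llama_sampler * maybe_clone(const llama_sampler * smpl) {
        return llama_sampler_clone(smpl);    // nullptr in, nullptr out
    }
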

package/src/llama.cpp/src/llama-vocab.cpp

@@ -1895,7 +1895,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
  clean_spaces = false;
  } else if (
  tokenizer_pre == "qwen2" ||
- tokenizer_pre == "deepseek-r1-qwen") {
+ tokenizer_pre == "deepseek-r1-qwen" ||
+ tokenizer_pre == "kormo") {
  pre_type = LLAMA_VOCAB_PRE_TYPE_QWEN2;
  clean_spaces = false;
  } else if (