@fugood/llama.node 1.4.7 → 1.4.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/binding.ts +8 -0
- package/package.json +15 -15
- package/scripts/llama.cpp.patch +22 -23
- package/src/LlamaContext.cpp +2 -2
- package/src/llama.cpp/common/CMakeLists.txt +2 -0
- package/src/llama.cpp/common/arg.cpp +364 -193
- package/src/llama.cpp/common/arg.h +43 -2
- package/src/llama.cpp/common/chat-peg-parser.cpp +16 -2
- package/src/llama.cpp/common/chat.cpp +140 -0
- package/src/llama.cpp/common/common.cpp +130 -67
- package/src/llama.cpp/common/common.h +40 -16
- package/src/llama.cpp/common/console.cpp +98 -18
- package/src/llama.cpp/common/console.h +30 -8
- package/src/llama.cpp/common/download.cpp +69 -25
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +132 -3
- package/src/llama.cpp/common/json-schema-to-grammar.h +20 -0
- package/src/llama.cpp/common/log.cpp +5 -0
- package/src/llama.cpp/common/log.h +1 -0
- package/src/llama.cpp/common/peg-parser.cpp +1 -1
- package/src/llama.cpp/common/preset.cpp +206 -0
- package/src/llama.cpp/common/preset.h +32 -0
- package/src/llama.cpp/common/sampling.cpp +91 -92
- package/src/llama.cpp/common/sampling.h +11 -6
- package/src/llama.cpp/common/speculative.cpp +1 -1
- package/src/llama.cpp/ggml/CMakeLists.txt +4 -0
- package/src/llama.cpp/ggml/include/ggml-alloc.h +9 -0
- package/src/llama.cpp/ggml/include/ggml-backend.h +1 -0
- package/src/llama.cpp/ggml/include/ggml-cpu.h +1 -0
- package/src/llama.cpp/ggml/include/ggml.h +7 -8
- package/src/llama.cpp/ggml/src/CMakeLists.txt +3 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +2 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +60 -39
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +4 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +2 -1
- package/src/llama.cpp/include/llama.h +18 -1
- package/src/llama.cpp/src/llama-arch.cpp +1890 -2248
- package/src/llama.cpp/src/llama-arch.h +9 -2
- package/src/llama.cpp/src/llama-batch.cpp +12 -2
- package/src/llama.cpp/src/llama-batch.h +4 -2
- package/src/llama.cpp/src/llama-context.cpp +93 -23
- package/src/llama.cpp/src/llama-context.h +8 -2
- package/src/llama.cpp/src/llama-graph.cpp +84 -16
- package/src/llama.cpp/src/llama-graph.h +17 -4
- package/src/llama.cpp/src/llama-hparams.cpp +6 -0
- package/src/llama.cpp/src/llama-hparams.h +5 -1
- package/src/llama.cpp/src/llama-impl.cpp +4 -0
- package/src/llama.cpp/src/llama-kv-cache.cpp +90 -42
- package/src/llama.cpp/src/llama-kv-cache.h +19 -2
- package/src/llama.cpp/src/llama-memory-hybrid.cpp +1 -1
- package/src/llama.cpp/src/llama-model-loader.cpp +2 -0
- package/src/llama.cpp/src/llama-model-loader.h +2 -0
- package/src/llama.cpp/src/llama-model.cpp +103 -44
- package/src/llama.cpp/src/llama-model.h +1 -0
- package/src/llama.cpp/src/llama-quant.cpp +1 -1
- package/src/llama.cpp/src/llama-vocab.cpp +2 -1
- package/src/llama.cpp/src/llama.cpp +675 -1
- package/src/llama.cpp/src/models/deepseek2.cpp +9 -5
- package/src/llama.cpp/src/models/glm4-moe.cpp +28 -11
- package/src/llama.cpp/src/models/glm4.cpp +27 -4
- package/src/llama.cpp/src/models/models.h +5 -5
- package/src/llama.cpp/src/models/nemotron-h.cpp +35 -6
- package/src/llama.cpp/src/models/qwen2.cpp +12 -3
- package/src/llama.cpp/src/models/qwen3next.cpp +81 -266
package/src/llama.cpp/src/llama-model.cpp

@@ -120,6 +120,8 @@ const char * llm_type_name(llm_type type) {
         case LLM_TYPE_16B_A1B: return "16B.A1B";
         case LLM_TYPE_21B_A3B: return "21B.A3B";
         case LLM_TYPE_30B_A3B: return "30B.A3B";
+        case LLM_TYPE_31B_A3_5B: return "31B.A3.5B";
+        case LLM_TYPE_80B_A3B: return "80B.A3B";
         case LLM_TYPE_100B_A6B: return "100B.A6B";
         case LLM_TYPE_106B_A12B: return "106B.A12B";
         case LLM_TYPE_230B_A10B: return "230B.A10B";

@@ -667,6 +669,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                 hparams.n_swa = 8192;
                 hparams.n_attn_temp_floor_scale = 8192;
                 hparams.f_attn_temp_scale = 0.1f;
+                hparams.f_attn_temp_offset = 1.0f;
                 hparams.set_swa_pattern(4); // pattern: 3 chunked - 1 full
             }

@@ -1634,12 +1637,19 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     // that have no expert_gating_func model parameter set
                     hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX;
                 }
-
+
+                if (ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul, 0.0f)) {
+                    // [TAG_DEEPSEEK2_YARN_LOG_MUL_FIX]
+                    // cancel the factor from the convert script
+                    hparams.rope_yarn_log_mul /= 0.1f;
+                }

                 // (optional) temperature tuning - used by mistral-large
                 ml.get_key(LLM_KV_ATTENTION_TEMPERATURE_SCALE, hparams.f_attn_temp_scale, false);
                 ml.get_key(LLM_KV_ATTENTION_TEMPERATURE_LENGTH, hparams.n_attn_temp_floor_scale, false);

+                hparams.f_attn_temp_offset = 0.0f;
+
                 switch (hparams.n_layer) {
                     case 27: type = LLM_TYPE_16B; break;
                     case 60: type = LLM_TYPE_236B; break;

@@ -1679,7 +1689,8 @@ void llama_model::load_hparams(llama_model_loader & ml) {
             } break;
         case LLM_ARCH_GLM4:
             {
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+                ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, false);
                 switch (hparams.n_layer) {
                     case 40: type = LLM_TYPE_9B; break;
                     case 61: type = LLM_TYPE_32B; break;

@@ -1688,8 +1699,9 @@ void llama_model::load_hparams(llama_model_loader & ml) {
             } break;
         case LLM_ARCH_GLM4_MOE:
             {
-                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,
+                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+                ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, false);

                 // MoE parameters
                 ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert);

@@ -1788,6 +1800,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                 }
             } break;
         case LLM_ARCH_NEMOTRON_H:
+        case LLM_ARCH_NEMOTRON_H_MOE:
             {
                 ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
                 ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);

@@ -1803,7 +1816,14 @@ void llama_model::load_hparams(llama_model_loader & ml) {

                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);

+                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
+                ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
+                ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared, false);
+                ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
+                ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false);
+
                 switch (hparams.n_layer) {
+                    case 52: type = LLM_TYPE_31B_A3_5B; break; // Nemotron-H_MOE 31B
                     case 56: type = LLM_TYPE_9B; break;
                     default: type = LLM_TYPE_UNKNOWN;
                 }

@@ -2257,7 +2277,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                 }

                 switch (hparams.n_layer) {
-                    case
+                    case 48: type = LLM_TYPE_80B_A3B; break;
                     default: type = LLM_TYPE_UNKNOWN;
                 }
             } break;

@@ -2266,9 +2286,11 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                 ml.get_key(LLM_KV_ATTENTION_TEMPERATURE_SCALE, hparams.f_attn_temp_scale, false);

-                ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_FAST,
-                ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_SLOW,
-                ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL,
+                ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_FAST, hparams.yarn_beta_fast, false);
+                ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_SLOW, hparams.yarn_beta_slow, false);
+                ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul, 0.0f);
+
+                hparams.f_attn_temp_offset = 0.0f;

                 // TODO: maybe add n_attn_temp_floor_scale as a separate KV?
                 if (hparams.f_attn_temp_scale != 0.0f) {

@@ -2278,18 +2300,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     }
                 }

-                // TODO: this seems to be correct with the case of mscale == mscale_all_dims == 1.0f
-                // but may need further verification with other values
-                if (hparams.rope_yarn_log_mul != 0.0f) {
-                    float factor = 1.0f / hparams.rope_freq_scale_train;
-                    float mscale = 1.0f;
-                    float mscale_all_dims = hparams.rope_yarn_log_mul;
-                    static auto get_mscale = [](float scale, float mscale) {
-                        return scale <= 1.0f ? 1.0f : (0.1f * mscale * logf(scale) + 1.0f);
-                    };
-                    hparams.yarn_attn_factor = get_mscale(factor, mscale) / get_mscale(factor, mscale_all_dims);
-                }
-
                 switch (hparams.n_layer) {
                     case 26: type = LLM_TYPE_3B; break;
                     case 34: type = LLM_TYPE_8B; break;
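Note on the removed block above: it derived hparams.yarn_attn_factor from the standard YaRN mscale relation, 0.1 * mscale * ln(scale) + 1 for scale > 1. A standalone sketch of that relation, using the same formula as the removed lambda (the helper names below are illustrative only, not part of llama.cpp):

    #include <cmath>

    // YaRN mscale as used by the removed block: identity for scale <= 1,
    // otherwise 0.1 * mscale * ln(scale) + 1.
    static float yarn_get_mscale(float scale, float mscale) {
        return scale <= 1.0f ? 1.0f : (0.1f * mscale * std::log(scale) + 1.0f);
    }

    // The removed code set the attention factor to the ratio of two mscales,
    // with mscale = 1.0 and mscale_all_dims = rope_yarn_log_mul:
    static float yarn_attn_factor(float freq_scale_train, float rope_yarn_log_mul) {
        const float factor = 1.0f / freq_scale_train;
        return yarn_get_mscale(factor, 1.0f) / yarn_get_mscale(factor, rope_yarn_log_mul);
    }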
@@ -3389,9 +3399,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                     layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);

                     // optional bias tensors
-                    layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd},
-                    layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa},
-                    layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa},
+                    layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
+                    layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
+                    layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);

                     layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);

@@ -5160,6 +5170,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                     }
                 } break;
             case LLM_ARCH_NEMOTRON_H:
+            case LLM_ARCH_NEMOTRON_H_MOE:
                 {
                     // mamba2 Mixer SSM params
                     // NOTE: int64_t for tensor dimensions

@@ -5170,6 +5181,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                     const int64_t n_group = hparams.ssm_n_group;
                     const int64_t d_in_proj = 2*d_inner + 2*n_group*d_state + n_ssm_head;

+                    const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
+                    const int64_t n_ff_shexp = hparams.n_ff_shexp;
+
                     // embeddings
                     tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

@@ -5219,12 +5233,26 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                            layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_k_gqa_i}, TENSOR_NOT_REQUIRED);
                            layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_v_gqa_i}, TENSOR_NOT_REQUIRED);
                            layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
-                        }
-
-
-
-
-
+                        } else {
+                            if (n_expert != 0) {
+                                layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert}, 0);
+                                layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert }, 0);
+
+                                // MoE branch
+                                layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
+                                layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
+
+                                // Shared expert branch
+                                layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd}, 0);
+                                layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_shexp}, 0);
+
+                            } else {
+                                // mlp layers
+                                layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { hparams.n_ff(i), n_embd}, 0);
+                                layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, hparams.n_ff(i)}, 0);
+                                layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
+                                layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {hparams.n_ff(i)}, TENSOR_NOT_REQUIRED);
+                            }
                         }
                     }
                 } break;

@@ -6208,8 +6236,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                 {
                     tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

-                    output_norm = create_tensor(tn(
-                    output = create_tensor(tn(LLM_TENSOR_OUTPUT,
+                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM_LFM2, "weight"), {n_embd}, 0);
+                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);

                     if (output == NULL) {
                         output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);

@@ -6607,9 +6635,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) {

        std::vector<ggml_backend_buffer_ptr> bufs;
        if (ml.use_mmap && use_mmap_buffer && buffer_from_host_ptr_supported && is_default_buft) {
+            GGML_ASSERT(!ml.no_alloc);
            for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
                // only the mmap region containing the tensors in the model is mapped to the backend buffer
-                // this is important for metal with apple silicon: if the entire model could be mapped to a metal buffer,
+                // this is important for metal with apple silicon: if the entire model could be mapped to a metal buffer,
+                // then we could just use metal for all layers
                // this allows using partial offloading when the model size exceeds the metal buffer size, but not the RAM size
                void * addr = nullptr;
                size_t first, last; // NOLINT

@@ -6625,9 +6655,16 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                bufs.emplace_back(buf);
                buf_map.emplace(idx, buf);
            }
-        }
-
-
+        } else {
+            ggml_backend_buffer_t buf;
+            if (ml.no_alloc) {
+                buf = ggml_backend_buft_alloc_buffer(buft, /*size =*/ 0); // dummy buffer
+                for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) {
+                    t->buffer = buf; // set dummy buffer for weights so that the backend scheduler won't try to allocate them
+                }
+            } else {
+                buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft); // real buffer
+            }
            if (buf == nullptr) {
                throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft)));
            }

@@ -6682,6 +6719,10 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
        }
    }

+    if (ml.no_alloc) {
+        return true;
+    }
+
    // load tensor data
    for (auto & [ctx, buf_map] : ctx_buf_maps) {
        if (!ml.load_all_data(ctx, buf_map, use_mlock ? &pimpl->mlock_mmaps : NULL, params.progress_callback, params.progress_callback_user_data)) {

@@ -6724,9 +6765,18 @@ size_t llama_model::n_devices() const {

std::map<ggml_backend_buffer_type_t, size_t> llama_model::memory_breakdown() const {
    std::map<ggml_backend_buffer_type_t, size_t> ret;
-    for (const auto & [
-
-
+    for (const auto & [ctx, bufs] : pimpl->ctxs_bufs) {
+        if (hparams.no_alloc) {
+            GGML_ASSERT(bufs.size() == 1);
+            ggml_backend_buffer_t buf = bufs[0].get();
+            GGML_ASSERT(ggml_backend_buffer_get_base(buf) == nullptr);
+            ggml_backend_buffer_type_t buft = ggml_backend_buffer_get_type(buf);
+            ret[buft] += ggml_backend_alloc_ctx_tensors_from_buft_size(ctx.get(), buft);
+        } else {
+            for (const auto & buf : bufs) {
+                // GGML_ASSERT(ggml_backend_buffer_get_base(buf.get()) != nullptr); // multi_buffer does not have a defined base
+                ret[ggml_backend_buffer_get_type(buf.get())] += ggml_backend_buffer_get_size(buf.get());
+            }
        }
    }
    return ret;
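The hparams.no_alloc branch above sizes weights with ggml_backend_alloc_ctx_tensors_from_buft_size() instead of measuring real buffers. A minimal sketch of that sizing idea, assuming the helper behaves as the hunk suggests (it returns the size ggml_backend_alloc_ctx_tensors_from_buft() would allocate, without allocating); the toy tensor shapes are placeholders:

    #include <cstdio>

    #include "ggml.h"
    #include "ggml-alloc.h"
    #include "ggml-backend.h"

    int main() {
        // metadata-only ggml context: tensor headers, no data buffers
        ggml_init_params ip = {
            /*.mem_size   =*/ ggml_tensor_overhead() * 8,
            /*.mem_buffer =*/ nullptr,
            /*.no_alloc   =*/ true,
        };
        ggml_context * ctx = ggml_init(ip);

        // placeholder weight shapes
        ggml_new_tensor_2d(ctx, GGML_TYPE_F16, 4096, 4096);
        ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4096);

        ggml_backend_buffer_type_t buft = ggml_backend_cpu_buffer_type();

        // assumed new helper from this diff: would-be allocation size, no allocation
        const size_t need = ggml_backend_alloc_ctx_tensors_from_buft_size(ctx, buft);
        printf("tensors would need %zu bytes in %s\n", need, ggml_backend_buft_name(buft));

        ggml_free(ctx);
        return 0;
    }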
@@ -6771,6 +6821,7 @@ void llama_model::print_info() const {
    // hparams
    LLAMA_LOG_INFO("%s: arch = %s\n", __func__, arch_name().c_str());
    LLAMA_LOG_INFO("%s: vocab_only = %d\n", __func__, hparams.vocab_only);
+    LLAMA_LOG_INFO("%s: no_alloc = %d\n", __func__, hparams.no_alloc);

    if (!hparams.vocab_only) {
        LLAMA_LOG_INFO("%s: n_ctx_train = %u\n", __func__, hparams.n_ctx_train);

@@ -6805,6 +6856,7 @@ void llama_model::print_info() const {
        LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train);
        LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);
        LLAMA_LOG_INFO("%s: n_ctx_orig_yarn = %u\n", __func__, hparams.n_ctx_orig_yarn);
+        LLAMA_LOG_INFO("%s: rope_yarn_log_mul= %.4f\n", __func__, hparams.rope_yarn_log_mul);
        LLAMA_LOG_INFO("%s: rope_finetuned = %s\n", __func__, hparams.rope_finetuned ? "yes" : "unknown");
        // MRoPE (Multi-axis Rotary Position Embedding) sections
        if (const auto & s = hparams.rope_sections; s[0] || s[1] || s[2] || s[3]) {

@@ -6827,7 +6879,8 @@ void llama_model::print_info() const {
        arch == LLM_ARCH_PLAMO2 ||
        arch == LLM_ARCH_GRANITE_HYBRID ||
        arch == LLM_ARCH_QWEN3NEXT ||
-        arch == LLM_ARCH_NEMOTRON_H
+        arch == LLM_ARCH_NEMOTRON_H ||
+        arch == LLM_ARCH_NEMOTRON_H_MOE) {
        LLAMA_LOG_INFO("%s: ssm_d_conv = %u\n", __func__, hparams.ssm_d_conv);
        LLAMA_LOG_INFO("%s: ssm_d_inner = %u\n", __func__, hparams.ssm_d_inner);
        LLAMA_LOG_INFO("%s: ssm_d_state = %u\n", __func__, hparams.ssm_d_state);

@@ -6868,7 +6921,6 @@ void llama_model::print_info() const {
        LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
        LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm);
        LLAMA_LOG_INFO("%s: expert_gating_func = %s\n", __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func));
-        LLAMA_LOG_INFO("%s: rope_yarn_log_mul = %.4f\n", __func__, hparams.rope_yarn_log_mul);
    }

    if (arch == LLM_ARCH_QWEN2MOE) {

@@ -6883,7 +6935,8 @@ void llama_model::print_info() const {
    if (arch == LLM_ARCH_MINICPM ||
        arch == LLM_ARCH_GRANITE ||
        arch == LLM_ARCH_GRANITE_MOE ||
-        arch == LLM_ARCH_GRANITE_HYBRID
+        arch == LLM_ARCH_GRANITE_HYBRID ||
+        arch == LLM_ARCH_NEMOTRON_H_MOE) {
        LLAMA_LOG_INFO("%s: f_embedding_scale = %f\n", __func__, hparams.f_embedding_scale);
        LLAMA_LOG_INFO("%s: f_residual_scale = %f\n", __func__, hparams.f_residual_scale);
        LLAMA_LOG_INFO("%s: f_attention_scale = %f\n", __func__, hparams.f_attention_scale);

@@ -7064,7 +7117,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
                if (arch == LLM_ARCH_FALCON_H1) {
                    filter_attn = [&](int32_t) { return true; };
                    filter_recr = [&](int32_t) { return true; };
-                } else if (arch == LLM_ARCH_NEMOTRON_H) {
+                } else if (arch == LLM_ARCH_NEMOTRON_H || arch == LLM_ARCH_NEMOTRON_H_MOE) {
                    filter_attn = [&](int32_t il) {
                        return !hparams.is_recurrent(il) && hparams.n_ff(il) == 0;
                    };

@@ -7435,6 +7488,7 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
                llm = std::make_unique<llm_build_nemotron>(*this, params);
            } break;
        case LLM_ARCH_NEMOTRON_H:
+        case LLM_ARCH_NEMOTRON_H_MOE:
            {
                llm = std::make_unique<llm_build_nemotron_h>(*this, params);
            } break;

@@ -7619,6 +7673,7 @@ llama_model_params llama_model_default_params() {
        /*.check_tensors =*/ false,
        /*.use_extra_bufts =*/ true,
        /*.no_host =*/ false,
+        /*.no_alloc =*/ false,
    };

    return result;
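The new no_alloc default above, together with the load_tensors() and memory_breakdown() changes earlier in this diff, points at loading a model for inspection without allocating weight buffers. A minimal usage sketch, assuming the flag is exposed on llama_model_params exactly as the initializer implies (field name and behaviour inferred from this diff, not from documented API):

    #include <cstdio>

    #include "llama.h"

    int main(int argc, char ** argv) {
        if (argc < 2) {
            fprintf(stderr, "usage: %s model.gguf\n", argv[0]);
            return 1;
        }

        llama_model_params mparams = llama_model_default_params();
        mparams.no_alloc = true; // inferred new field: skip real weight buffers, read metadata only

        llama_model * model = llama_model_load_from_file(argv[1], mparams);
        if (model == nullptr) {
            fprintf(stderr, "failed to load %s\n", argv[1]);
            return 1;
        }

        // With no_alloc set, load_tensors() returns before reading tensor data,
        // so hparams and size estimates are available without the weights.
        printf("n_params = %lld\n", (long long) llama_model_n_params(model));

        llama_model_free(model);
        return 0;
    }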
@@ -7718,6 +7773,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
        case LLM_ARCH_ARWKV7:
        case LLM_ARCH_WAVTOKENIZER_DEC:
        case LLM_ARCH_NEMOTRON_H:
+        case LLM_ARCH_NEMOTRON_H_MOE:
            return LLAMA_ROPE_TYPE_NONE;

        // use what we call a normal RoPE, operating on pairs of consecutive head values

@@ -7738,7 +7794,6 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
        case LLM_ARCH_DEEPSEEK2:
        case LLM_ARCH_PLM:
        case LLM_ARCH_CHATGLM:
-        case LLM_ARCH_GLM4:
        case LLM_ARCH_GRANITE:
        case LLM_ARCH_GRANITE_MOE:
        case LLM_ARCH_GRANITE_HYBRID:

@@ -7800,7 +7855,6 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
        case LLM_ARCH_LFM2:
        case LLM_ARCH_LFM2MOE:
        case LLM_ARCH_SMALLTHINKER:
-        case LLM_ARCH_GLM4_MOE:
        case LLM_ARCH_SEED_OSS:
        case LLM_ARCH_GROVEMOE:
        case LLM_ARCH_APERTUS:

@@ -7817,6 +7871,11 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
        case LLM_ARCH_QWEN3VLMOE:
            return LLAMA_ROPE_TYPE_IMROPE;

+        case LLM_ARCH_GLM4:
+            return model->hparams.use_mrope() ? LLAMA_ROPE_TYPE_MROPE : LLAMA_ROPE_TYPE_NORM;
+        case LLM_ARCH_GLM4_MOE:
+            return model->hparams.use_mrope() ? LLAMA_ROPE_TYPE_MROPE : LLAMA_ROPE_TYPE_NEOX;
+
        // all model arches should be listed explicitly here
        case LLM_ARCH_UNKNOWN:
            GGML_ABORT("unknown architecture");
package/src/llama.cpp/src/llama-quant.cpp

@@ -596,7 +596,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
    }

    std::vector<std::string> splits = {};
-    llama_model_loader ml(fname_inp, splits, use_mmap, /*check_tensors*/ true, kv_overrides, nullptr);
+    llama_model_loader ml(fname_inp, splits, use_mmap, /*check_tensors*/ true, /*no_alloc*/ false, kv_overrides, nullptr);
    ml.init_mappings(false); // no prefetching

    llama_model model(llama_model_default_params());
package/src/llama.cpp/src/llama-vocab.cpp

@@ -1895,7 +1895,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
            clean_spaces = false;
        } else if (
                tokenizer_pre == "qwen2" ||
-                tokenizer_pre == "deepseek-r1-qwen"
+                tokenizer_pre == "deepseek-r1-qwen" ||
+                tokenizer_pre == "kormo") {
            pre_type = LLAMA_VOCAB_PRE_TYPE_QWEN2;
            clean_spaces = false;
        } else if (