@fugood/llama.node 1.4.6 → 1.4.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/binding.ts +8 -0
- package/package.json +15 -15
- package/scripts/llama.cpp.patch +25 -26
- package/src/LlamaContext.cpp +2 -2
- package/src/llama.cpp/common/CMakeLists.txt +2 -0
- package/src/llama.cpp/common/arg.cpp +364 -193
- package/src/llama.cpp/common/arg.h +43 -2
- package/src/llama.cpp/common/chat-parser-xml-toolcall.cpp +36 -18
- package/src/llama.cpp/common/chat-parser-xml-toolcall.h +1 -1
- package/src/llama.cpp/common/chat-parser.cpp +3 -2
- package/src/llama.cpp/common/chat-peg-parser.cpp +16 -2
- package/src/llama.cpp/common/chat.cpp +272 -0
- package/src/llama.cpp/common/common.cpp +130 -67
- package/src/llama.cpp/common/common.h +40 -16
- package/src/llama.cpp/common/console.cpp +680 -47
- package/src/llama.cpp/common/console.h +30 -8
- package/src/llama.cpp/common/download.cpp +69 -25
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +132 -3
- package/src/llama.cpp/common/json-schema-to-grammar.h +20 -0
- package/src/llama.cpp/common/log.cpp +5 -0
- package/src/llama.cpp/common/log.h +1 -0
- package/src/llama.cpp/common/peg-parser.cpp +1 -1
- package/src/llama.cpp/common/preset.cpp +206 -0
- package/src/llama.cpp/common/preset.h +32 -0
- package/src/llama.cpp/common/sampling.cpp +91 -92
- package/src/llama.cpp/common/sampling.h +11 -6
- package/src/llama.cpp/common/speculative.cpp +1 -1
- package/src/llama.cpp/ggml/CMakeLists.txt +5 -0
- package/src/llama.cpp/ggml/include/ggml-alloc.h +9 -0
- package/src/llama.cpp/ggml/include/ggml-backend.h +1 -0
- package/src/llama.cpp/ggml/include/ggml-cpu.h +1 -0
- package/src/llama.cpp/ggml/include/ggml.h +7 -8
- package/src/llama.cpp/ggml/src/CMakeLists.txt +3 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +3 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +2 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +69 -39
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +4 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +2 -1
- package/src/llama.cpp/include/llama.h +18 -1
- package/src/llama.cpp/src/CMakeLists.txt +2 -1
- package/src/llama.cpp/src/llama-arch.cpp +1890 -2248
- package/src/llama.cpp/src/llama-arch.h +9 -2
- package/src/llama.cpp/src/llama-batch.cpp +12 -2
- package/src/llama.cpp/src/llama-batch.h +4 -2
- package/src/llama.cpp/src/llama-context.cpp +99 -29
- package/src/llama.cpp/src/llama-context.h +9 -3
- package/src/llama.cpp/src/llama-grammar.cpp +233 -33
- package/src/llama.cpp/src/llama-grammar.h +20 -1
- package/src/llama.cpp/src/llama-graph.cpp +85 -17
- package/src/llama.cpp/src/llama-graph.h +17 -4
- package/src/llama.cpp/src/llama-hparams.cpp +6 -0
- package/src/llama.cpp/src/llama-hparams.h +5 -1
- package/src/llama.cpp/src/llama-impl.cpp +4 -0
- package/src/llama.cpp/src/llama-kv-cache.cpp +90 -42
- package/src/llama.cpp/src/llama-kv-cache.h +19 -2
- package/src/llama.cpp/src/llama-memory-hybrid.cpp +1 -1
- package/src/llama.cpp/src/llama-model-loader.cpp +2 -0
- package/src/llama.cpp/src/llama-model-loader.h +2 -0
- package/src/llama.cpp/src/llama-model.cpp +123 -52
- package/src/llama.cpp/src/llama-model.h +1 -0
- package/src/llama.cpp/src/llama-quant.cpp +1 -1
- package/src/llama.cpp/src/llama-vocab.cpp +2 -1
- package/src/llama.cpp/src/llama.cpp +675 -1
- package/src/llama.cpp/src/models/deepseek2.cpp +9 -5
- package/src/llama.cpp/src/models/{gemma3-iswa.cpp → gemma3.cpp} +30 -5
- package/src/llama.cpp/src/models/glm4-moe.cpp +28 -11
- package/src/llama.cpp/src/models/glm4.cpp +27 -4
- package/src/llama.cpp/src/models/models.h +8 -7
- package/src/llama.cpp/src/models/nemotron-h.cpp +35 -6
- package/src/llama.cpp/src/models/qwen2.cpp +12 -3
- package/src/llama.cpp/src/models/qwen3next.cpp +81 -266
package/src/llama.cpp/src/llama-model.cpp

@@ -120,6 +120,8 @@ const char * llm_type_name(llm_type type) {
         case LLM_TYPE_16B_A1B: return "16B.A1B";
         case LLM_TYPE_21B_A3B: return "21B.A3B";
         case LLM_TYPE_30B_A3B: return "30B.A3B";
+        case LLM_TYPE_31B_A3_5B: return "31B.A3.5B";
+        case LLM_TYPE_80B_A3B: return "80B.A3B";
         case LLM_TYPE_100B_A6B: return "100B.A6B";
         case LLM_TYPE_106B_A12B: return "106B.A12B";
         case LLM_TYPE_230B_A10B: return "230B.A10B";
@@ -667,6 +669,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                 hparams.n_swa = 8192;
                 hparams.n_attn_temp_floor_scale = 8192;
                 hparams.f_attn_temp_scale = 0.1f;
+                hparams.f_attn_temp_offset = 1.0f;
                 hparams.set_swa_pattern(4); // pattern: 3 chunked - 1 full
             }
 
@@ -1264,18 +1267,25 @@ void llama_model::load_hparams(llama_model_loader & ml) {
             } break;
         case LLM_ARCH_GEMMA3:
             {
-                hparams.
-                hparams.
+                const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
+                if (found_swa && hparams.n_swa > 0) {
+                    hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
+                    hparams.set_swa_pattern(6);
 
-
-
+                    hparams.rope_freq_base_train_swa = 10000.0f;
+                    hparams.rope_freq_scale_train_swa = 1.0f;
+                } else {
+                    hparams.swa_type = LLAMA_SWA_TYPE_NONE;
+                }
 
-
+                hparams.f_final_logit_softcapping = 0.0f;
+                ml.get_key(LLM_KV_FINAL_LOGIT_SOFTCAPPING, hparams.f_final_logit_softcapping, false);
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
 
                 switch (hparams.n_layer) {
                     case 18: type = LLM_TYPE_270M; break;
                     case 26: type = LLM_TYPE_1B; break;
+                    case 32: type = LLM_TYPE_8B; break; // Rnj-1
                     case 34: type = LLM_TYPE_4B; break;
                     case 48: type = LLM_TYPE_12B; break;
                     case 62: type = LLM_TYPE_27B; break;
@@ -1599,8 +1609,9 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                 ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
                 ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
 
-                switch (hparams.
-                    case
+                switch (hparams.n_ff_exp) {
+                    case 1408: type = LLM_TYPE_16B; break;
+                    case 1792: type = LLM_TYPE_20B; break;
                     default: type = LLM_TYPE_UNKNOWN;
                 }
             } break;
@@ -1626,12 +1637,19 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     // that have no expert_gating_func model parameter set
                     hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX;
                 }
-
+
+                if (ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul, 0.0f)) {
+                    // [TAG_DEEPSEEK2_YARN_LOG_MUL_FIX]
+                    // cancel the factor from the convert script
+                    hparams.rope_yarn_log_mul /= 0.1f;
+                }
 
                 // (optional) temperature tuning - used by mistral-large
                 ml.get_key(LLM_KV_ATTENTION_TEMPERATURE_SCALE, hparams.f_attn_temp_scale, false);
                 ml.get_key(LLM_KV_ATTENTION_TEMPERATURE_LENGTH, hparams.n_attn_temp_floor_scale, false);
 
+                hparams.f_attn_temp_offset = 0.0f;
+
                 switch (hparams.n_layer) {
                     case 27: type = LLM_TYPE_16B; break;
                     case 60: type = LLM_TYPE_236B; break;
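Aside (not part of the upstream diff): the `/= 0.1f` step above pairs with the removal of the inline `get_mscale` correction in the `@@ -2270,18 +2300,6 @@` hunk further down. A minimal sketch of the YaRN magnitude-scaling relation both changes revolve around, using hypothetical values; the 0.1 factor is the one the conversion script folds into `rope_yarn_log_mul`:

```cpp
#include <cmath>
#include <cstdio>

// YaRN magnitude scaling as written in the block removed further down:
// m(scale, mscale) = 1 for scale <= 1, otherwise 0.1 * mscale * ln(scale) + 1.
static float get_mscale(float scale, float mscale) {
    return scale <= 1.0f ? 1.0f : 0.1f * mscale * std::log(scale) + 1.0f;
}

int main() {
    const float stored_log_mul = 0.1f;                  // hypothetical GGUF value (0.1 * mscale_all_dim)
    const float mscale_all_dim = stored_log_mul / 0.1f; // "cancel the factor from the convert script"
    const float scale = 4.0f;                           // hypothetical 1 / rope_freq_scale_train
    printf("attn factor ~ %.4f\n", get_mscale(scale, 1.0f) / get_mscale(scale, mscale_all_dim));
    return 0;
}
```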
@@ -1671,7 +1689,8 @@ void llama_model::load_hparams(llama_model_loader & ml) {
             } break;
         case LLM_ARCH_GLM4:
             {
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+                ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, false);
                 switch (hparams.n_layer) {
                     case 40: type = LLM_TYPE_9B; break;
                     case 61: type = LLM_TYPE_32B; break;
@@ -1680,8 +1699,9 @@ void llama_model::load_hparams(llama_model_loader & ml) {
             } break;
         case LLM_ARCH_GLM4_MOE:
             {
-                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,
+                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+                ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, false);
 
                 // MoE parameters
                 ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert);
@@ -1780,6 +1800,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                 }
             } break;
         case LLM_ARCH_NEMOTRON_H:
+        case LLM_ARCH_NEMOTRON_H_MOE:
             {
                 ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
                 ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);
@@ -1795,7 +1816,14 @@ void llama_model::load_hparams(llama_model_loader & ml) {
 
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
 
+                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
+                ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
+                ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared, false);
+                ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
+                ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false);
+
                 switch (hparams.n_layer) {
+                    case 52: type = LLM_TYPE_31B_A3_5B; break; // Nemotron-H_MOE 31B
                     case 56: type = LLM_TYPE_9B; break;
                     default: type = LLM_TYPE_UNKNOWN;
                 }
@@ -2249,7 +2277,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                 }
 
                 switch (hparams.n_layer) {
-                    case
+                    case 48: type = LLM_TYPE_80B_A3B; break;
                     default: type = LLM_TYPE_UNKNOWN;
                 }
             } break;
@@ -2258,9 +2286,11 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                 ml.get_key(LLM_KV_ATTENTION_TEMPERATURE_SCALE, hparams.f_attn_temp_scale, false);
 
-                ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_FAST,
-                ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_SLOW,
-                ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL,
+                ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_FAST, hparams.yarn_beta_fast, false);
+                ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_SLOW, hparams.yarn_beta_slow, false);
+                ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul, 0.0f);
+
+                hparams.f_attn_temp_offset = 0.0f;
 
                 // TODO: maybe add n_attn_temp_floor_scale as a separate KV?
                 if (hparams.f_attn_temp_scale != 0.0f) {
@@ -2270,18 +2300,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     }
                 }
 
-                // TODO: this seems to be correct with the case of mscale == mscale_all_dims == 1.0f
-                // but may need further verification with other values
-                if (hparams.rope_yarn_log_mul != 0.0f) {
-                    float factor = 1.0f / hparams.rope_freq_scale_train;
-                    float mscale = 1.0f;
-                    float mscale_all_dims = hparams.rope_yarn_log_mul;
-                    static auto get_mscale = [](float scale, float mscale) {
-                        return scale <= 1.0f ? 1.0f : (0.1f * mscale * logf(scale) + 1.0f);
-                    };
-                    hparams.yarn_attn_factor = get_mscale(factor, mscale) / get_mscale(factor, mscale_all_dims);
-                }
-
                 switch (hparams.n_layer) {
                     case 26: type = LLM_TYPE_3B; break;
                     case 34: type = LLM_TYPE_8B; break;
@@ -3381,9 +3399,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                     layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
 
                     // optional bias tensors
-                    layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd},
-                    layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa},
-                    layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa},
+                    layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
+                    layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
+                    layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
 
                     layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
 
@@ -5152,6 +5170,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                 }
             } break;
         case LLM_ARCH_NEMOTRON_H:
+        case LLM_ARCH_NEMOTRON_H_MOE:
             {
                 // mamba2 Mixer SSM params
                 // NOTE: int64_t for tensor dimensions
@@ -5162,6 +5181,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                 const int64_t n_group = hparams.ssm_n_group;
                 const int64_t d_in_proj = 2*d_inner + 2*n_group*d_state + n_ssm_head;
 
+                const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
+                const int64_t n_ff_shexp = hparams.n_ff_shexp;
+
                 // embeddings
                 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
 
@@ -5211,12 +5233,26 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                         layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_k_gqa_i}, TENSOR_NOT_REQUIRED);
                         layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_v_gqa_i}, TENSOR_NOT_REQUIRED);
                         layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
-                    }
-
-
-
-
-
+                    } else {
+                        if (n_expert != 0) {
+                            layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert}, 0);
+                            layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert }, 0);
+
+                            // MoE branch
+                            layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
+                            layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
+
+                            // Shared expert branch
+                            layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd}, 0);
+                            layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_shexp}, 0);
+
+                        } else {
+                            // mlp layers
+                            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { hparams.n_ff(i), n_embd}, 0);
+                            layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, hparams.n_ff(i)}, 0);
+                            layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
+                            layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {hparams.n_ff(i)}, TENSOR_NOT_REQUIRED);
+                        }
                     }
                 }
             } break;
@@ -6200,8 +6236,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
             {
                 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
 
-                output_norm = create_tensor(tn(
-                output = create_tensor(tn(LLM_TENSOR_OUTPUT,
+                output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM_LFM2, "weight"), {n_embd}, 0);
+                output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
 
                 if (output == NULL) {
                     output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
@@ -6599,9 +6635,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
 
         std::vector<ggml_backend_buffer_ptr> bufs;
         if (ml.use_mmap && use_mmap_buffer && buffer_from_host_ptr_supported && is_default_buft) {
+            GGML_ASSERT(!ml.no_alloc);
             for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
                 // only the mmap region containing the tensors in the model is mapped to the backend buffer
-                // this is important for metal with apple silicon: if the entire model could be mapped to a metal buffer,
+                // this is important for metal with apple silicon: if the entire model could be mapped to a metal buffer,
+                // then we could just use metal for all layers
                 // this allows using partial offloading when the model size exceeds the metal buffer size, but not the RAM size
                 void * addr = nullptr;
                 size_t first, last; // NOLINT
@@ -6617,9 +6655,16 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                 bufs.emplace_back(buf);
                 buf_map.emplace(idx, buf);
             }
-        }
-
-
+        } else {
+            ggml_backend_buffer_t buf;
+            if (ml.no_alloc) {
+                buf = ggml_backend_buft_alloc_buffer(buft, /*size =*/ 0); // dummy buffer
+                for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) {
+                    t->buffer = buf; // set dummy buffer for weights so that the backend scheduler won't try to allocate them
+                }
+            } else {
+                buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft); // real buffer
+            }
             if (buf == nullptr) {
                 throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft)));
             }
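Aside (not part of the upstream diff): the `no_alloc` branch above gives every weight tensor a zero-size "dummy" buffer, so later stages treat the tensors as placed without any memory being committed. A minimal sketch of the same idea against the public ggml API bundled in this package, assuming a CPU buffer type; illustrative only:

```cpp
// Illustration only: mirror the no_alloc branch above using the ggml API shipped here.
#include "ggml.h"
#include "ggml-backend.h"
#include "ggml-cpu.h"

static void mark_weights_as_placed(ggml_context * ctx, ggml_backend_buffer_type_t buft) {
    // a zero-size buffer allocates nothing, but afterwards every t->buffer is non-null,
    // so a backend scheduler will not try to allocate these tensors itself
    ggml_backend_buffer_t dummy = ggml_backend_buft_alloc_buffer(buft, 0);
    for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) {
        t->buffer = dummy;
    }
}

int main() {
    ggml_init_params ip = { /*mem_size  =*/ ggml_tensor_overhead() * 8,
                            /*mem_buffer=*/ nullptr,
                            /*no_alloc  =*/ true };
    ggml_context * ctx = ggml_init(ip);
    ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4096, 4096); // placeholder "weight"
    mark_weights_as_placed(ctx, ggml_backend_cpu_buffer_type());
    ggml_free(ctx);
    return 0;
}
```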
@@ -6674,6 +6719,10 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
        }
    }
 
+    if (ml.no_alloc) {
+        return true;
+    }
+
    // load tensor data
    for (auto & [ctx, buf_map] : ctx_buf_maps) {
        if (!ml.load_all_data(ctx, buf_map, use_mlock ? &pimpl->mlock_mmaps : NULL, params.progress_callback, params.progress_callback_user_data)) {
@@ -6716,9 +6765,18 @@ size_t llama_model::n_devices() const {
 
 std::map<ggml_backend_buffer_type_t, size_t> llama_model::memory_breakdown() const {
     std::map<ggml_backend_buffer_type_t, size_t> ret;
-    for (const auto & [
-
-
+    for (const auto & [ctx, bufs] : pimpl->ctxs_bufs) {
+        if (hparams.no_alloc) {
+            GGML_ASSERT(bufs.size() == 1);
+            ggml_backend_buffer_t buf = bufs[0].get();
+            GGML_ASSERT(ggml_backend_buffer_get_base(buf) == nullptr);
+            ggml_backend_buffer_type_t buft = ggml_backend_buffer_get_type(buf);
+            ret[buft] += ggml_backend_alloc_ctx_tensors_from_buft_size(ctx.get(), buft);
+        } else {
+            for (const auto & buf : bufs) {
+                // GGML_ASSERT(ggml_backend_buffer_get_base(buf.get()) != nullptr); // multi_buffer does not have a defined base
+                ret[ggml_backend_buffer_get_type(buf.get())] += ggml_backend_buffer_get_size(buf.get());
+            }
+        }
     }
     return ret;
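Aside (not part of the upstream diff): with `no_alloc`, `memory_breakdown()` reports sizes even though nothing was allocated, by asking ggml how large a context's tensors would be for a given buffer type. A small sketch of that estimation idea; the helper's signature is inferred from its use in the hunk above (context plus buffer type), so treat it as an assumption:

```cpp
#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"
#include "ggml-cpu.h"
#include <cstdio>

int main() {
    // build a context with no_alloc so tensors are described but not backed by memory
    ggml_init_params ip = { ggml_tensor_overhead() * 16, nullptr, /*no_alloc =*/ true };
    ggml_context * ctx = ggml_init(ip);
    ggml_new_tensor_2d(ctx, GGML_TYPE_F16, 4096, 4096);
    ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4096);

    // how much memory would these tensors need in a CPU buffer?
    ggml_backend_buffer_type_t buft = ggml_backend_cpu_buffer_type();
    size_t need = ggml_backend_alloc_ctx_tensors_from_buft_size(ctx, buft);
    printf("would need %zu bytes in %s\n", need, ggml_backend_buft_name(buft));

    ggml_free(ctx);
    return 0;
}
```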
@@ -6763,6 +6821,7 @@ void llama_model::print_info() const {
     // hparams
     LLAMA_LOG_INFO("%s: arch = %s\n", __func__, arch_name().c_str());
     LLAMA_LOG_INFO("%s: vocab_only = %d\n", __func__, hparams.vocab_only);
+    LLAMA_LOG_INFO("%s: no_alloc = %d\n", __func__, hparams.no_alloc);
 
     if (!hparams.vocab_only) {
         LLAMA_LOG_INFO("%s: n_ctx_train = %u\n", __func__, hparams.n_ctx_train);
@@ -6797,6 +6856,7 @@ void llama_model::print_info() const {
         LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train);
         LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);
         LLAMA_LOG_INFO("%s: n_ctx_orig_yarn = %u\n", __func__, hparams.n_ctx_orig_yarn);
+        LLAMA_LOG_INFO("%s: rope_yarn_log_mul= %.4f\n", __func__, hparams.rope_yarn_log_mul);
         LLAMA_LOG_INFO("%s: rope_finetuned = %s\n", __func__, hparams.rope_finetuned ? "yes" : "unknown");
         // MRoPE (Multi-axis Rotary Position Embedding) sections
         if (const auto & s = hparams.rope_sections; s[0] || s[1] || s[2] || s[3]) {
@@ -6819,7 +6879,8 @@ void llama_model::print_info() const {
         arch == LLM_ARCH_PLAMO2 ||
         arch == LLM_ARCH_GRANITE_HYBRID ||
         arch == LLM_ARCH_QWEN3NEXT ||
-        arch == LLM_ARCH_NEMOTRON_H
+        arch == LLM_ARCH_NEMOTRON_H ||
+        arch == LLM_ARCH_NEMOTRON_H_MOE) {
         LLAMA_LOG_INFO("%s: ssm_d_conv = %u\n", __func__, hparams.ssm_d_conv);
         LLAMA_LOG_INFO("%s: ssm_d_inner = %u\n", __func__, hparams.ssm_d_inner);
         LLAMA_LOG_INFO("%s: ssm_d_state = %u\n", __func__, hparams.ssm_d_state);
@@ -6860,7 +6921,6 @@ void llama_model::print_info() const {
         LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
         LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm);
         LLAMA_LOG_INFO("%s: expert_gating_func = %s\n", __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func));
-        LLAMA_LOG_INFO("%s: rope_yarn_log_mul = %.4f\n", __func__, hparams.rope_yarn_log_mul);
     }
 
     if (arch == LLM_ARCH_QWEN2MOE) {
@@ -6875,7 +6935,8 @@ void llama_model::print_info() const {
     if (arch == LLM_ARCH_MINICPM ||
         arch == LLM_ARCH_GRANITE ||
         arch == LLM_ARCH_GRANITE_MOE ||
-        arch == LLM_ARCH_GRANITE_HYBRID
+        arch == LLM_ARCH_GRANITE_HYBRID ||
+        arch == LLM_ARCH_NEMOTRON_H_MOE) {
         LLAMA_LOG_INFO("%s: f_embedding_scale = %f\n", __func__, hparams.f_embedding_scale);
         LLAMA_LOG_INFO("%s: f_residual_scale = %f\n", __func__, hparams.f_residual_scale);
         LLAMA_LOG_INFO("%s: f_attention_scale = %f\n", __func__, hparams.f_attention_scale);
@@ -7056,7 +7117,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
     if (arch == LLM_ARCH_FALCON_H1) {
         filter_attn = [&](int32_t) { return true; };
         filter_recr = [&](int32_t) { return true; };
-    } else if (arch == LLM_ARCH_NEMOTRON_H) {
+    } else if (arch == LLM_ARCH_NEMOTRON_H || arch == LLM_ARCH_NEMOTRON_H_MOE) {
         filter_attn = [&](int32_t il) {
             return !hparams.is_recurrent(il) && hparams.n_ff(il) == 0;
         };
@@ -7304,7 +7365,11 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
             } break;
         case LLM_ARCH_GEMMA3:
             {
-
+                if (hparams.swa_type == LLAMA_SWA_TYPE_STANDARD) {
+                    llm = std::make_unique<llm_build_gemma3<true>>(*this, params);
+                } else {
+                    llm = std::make_unique<llm_build_gemma3<false>>(*this, params);
+                }
             } break;
         case LLM_ARCH_GEMMA3N:
             {
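Aside (not part of the upstream diff): the Gemma3 graph builder is now one class selected by a compile-time boolean (note the `gemma3-iswa.cpp → gemma3.cpp` rename in the file list) instead of a separate `*_iswa` variant. A sketch of that dispatch pattern with hypothetical names; the real template lives in `src/models/models.h`, which is not shown here:

```cpp
#include <memory>

struct graph_base {
    virtual ~graph_base() = default;
};

// hypothetical stand-in for llm_build_gemma3<iswa>
template <bool use_swa>
struct gemma3_graph : graph_base {
    gemma3_graph() {
        if constexpr (use_swa) {
            // would wire up the interleaved sliding-window attention layers
        } else {
            // would wire up full attention on every layer
        }
    }
};

// runtime flag -> compile-time specialization, same shape as the build_graph hunk above
std::unique_ptr<graph_base> make_gemma3(bool swa) {
    if (swa) {
        return std::make_unique<gemma3_graph<true>>();
    }
    return std::make_unique<gemma3_graph<false>>();
}

int main() {
    auto g = make_gemma3(/*swa =*/ true);
    (void) g;
    return 0;
}
```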
@@ -7423,6 +7488,7 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
                 llm = std::make_unique<llm_build_nemotron>(*this, params);
             } break;
         case LLM_ARCH_NEMOTRON_H:
+        case LLM_ARCH_NEMOTRON_H_MOE:
             {
                 llm = std::make_unique<llm_build_nemotron_h>(*this, params);
             } break;
@@ -7607,6 +7673,7 @@ llama_model_params llama_model_default_params() {
         /*.check_tensors =*/ false,
         /*.use_extra_bufts =*/ true,
         /*.no_host =*/ false,
+        /*.no_alloc =*/ false,
     };
 
     return result;
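Aside (not part of the upstream diff): `llama_model_params` gains a `no_alloc` field, defaulting to `false` above. A hedged usage sketch, assuming the field is exposed through `llama.h` in this release and that a `no_alloc` load only sizes and describes the weights instead of materializing them; `model.gguf` is a placeholder path:

```cpp
#include "llama.h"
#include <cstdio>

int main() {
    llama_model_params mparams = llama_model_default_params();
    mparams.no_alloc = true; // plan/inspect only; weight buffers stay unallocated

    llama_model * model = llama_model_load_from_file("model.gguf", mparams);
    if (model == nullptr) {
        fprintf(stderr, "failed to load model\n");
        return 1;
    }
    printf("n_embd = %d\n", llama_model_n_embd(model));
    llama_model_free(model);
    return 0;
}
```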
@@ -7706,6 +7773,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
         case LLM_ARCH_ARWKV7:
         case LLM_ARCH_WAVTOKENIZER_DEC:
         case LLM_ARCH_NEMOTRON_H:
+        case LLM_ARCH_NEMOTRON_H_MOE:
             return LLAMA_ROPE_TYPE_NONE;
 
         // use what we call a normal RoPE, operating on pairs of consecutive head values
@@ -7726,7 +7794,6 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
         case LLM_ARCH_DEEPSEEK2:
         case LLM_ARCH_PLM:
         case LLM_ARCH_CHATGLM:
-        case LLM_ARCH_GLM4:
         case LLM_ARCH_GRANITE:
         case LLM_ARCH_GRANITE_MOE:
         case LLM_ARCH_GRANITE_HYBRID:
@@ -7788,7 +7855,6 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
         case LLM_ARCH_LFM2:
         case LLM_ARCH_LFM2MOE:
         case LLM_ARCH_SMALLTHINKER:
-        case LLM_ARCH_GLM4_MOE:
         case LLM_ARCH_SEED_OSS:
         case LLM_ARCH_GROVEMOE:
         case LLM_ARCH_APERTUS:
@@ -7805,6 +7871,11 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
         case LLM_ARCH_QWEN3VLMOE:
             return LLAMA_ROPE_TYPE_IMROPE;
 
+        case LLM_ARCH_GLM4:
+            return model->hparams.use_mrope() ? LLAMA_ROPE_TYPE_MROPE : LLAMA_ROPE_TYPE_NORM;
+        case LLM_ARCH_GLM4_MOE:
+            return model->hparams.use_mrope() ? LLAMA_ROPE_TYPE_MROPE : LLAMA_ROPE_TYPE_NEOX;
+
         // all model arches should be listed explicitly here
         case LLM_ARCH_UNKNOWN:
             GGML_ABORT("unknown architecture");
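Aside (not part of the upstream diff): with this change GLM4 and GLM4_MOE report M-RoPE when the checkpoint defines rope sections, so callers should query the rope type rather than hard-coding one per architecture. A small sketch using the public API from `llama.h`:

```cpp
#include "llama.h"

// map the queried rope type to a human-readable description
const char * describe_rope(const llama_model * model) {
    switch (llama_model_rope_type(model)) {
        case LLAMA_ROPE_TYPE_NONE:  return "no RoPE";
        case LLAMA_ROPE_TYPE_NORM:  return "normal (consecutive pairs)";
        case LLAMA_ROPE_TYPE_NEOX:  return "NeoX style";
        case LLAMA_ROPE_TYPE_MROPE: return "multi-axis (M-RoPE)";
        default:                    return "other";
    }
}
```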
package/src/llama.cpp/src/llama-quant.cpp

@@ -596,7 +596,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
     }
 
     std::vector<std::string> splits = {};
-    llama_model_loader ml(fname_inp, splits, use_mmap, /*check_tensors*/ true, kv_overrides, nullptr);
+    llama_model_loader ml(fname_inp, splits, use_mmap, /*check_tensors*/ true, /*no_alloc*/ false, kv_overrides, nullptr);
     ml.init_mappings(false); // no prefetching
 
     llama_model model(llama_model_default_params());
package/src/llama.cpp/src/llama-vocab.cpp

@@ -1895,7 +1895,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
             clean_spaces = false;
         } else if (
                 tokenizer_pre == "qwen2" ||
-                tokenizer_pre == "deepseek-r1-qwen"
+                tokenizer_pre == "deepseek-r1-qwen" ||
+                tokenizer_pre == "kormo") {
             pre_type = LLAMA_VOCAB_PRE_TYPE_QWEN2;
             clean_spaces = false;
         } else if (