@fugood/llama.node 1.4.14 → 1.5.0-rc.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/binding.ts +13 -6
- package/lib/index.js +2 -2
- package/lib/index.ts +8 -3
- package/package.json +15 -15
- package/scripts/llama.cpp.patch +77 -65
- package/src/LlamaContext.cpp +31 -34
- package/src/llama.cpp/CMakeLists.txt +24 -8
- package/src/llama.cpp/common/CMakeLists.txt +15 -34
- package/src/llama.cpp/common/arg.cpp +59 -10
- package/src/llama.cpp/common/chat-parser.cpp +115 -0
- package/src/llama.cpp/common/chat.cpp +356 -34
- package/src/llama.cpp/common/chat.h +17 -13
- package/src/llama.cpp/common/common.cpp +0 -1
- package/src/llama.cpp/common/common.h +30 -25
- package/src/llama.cpp/common/debug.cpp +165 -0
- package/src/llama.cpp/common/debug.h +43 -0
- package/src/llama.cpp/common/download.cpp +12 -342
- package/src/llama.cpp/common/download.h +6 -0
- package/src/llama.cpp/common/jinja/caps.cpp +237 -0
- package/src/llama.cpp/common/jinja/caps.h +24 -0
- package/src/llama.cpp/common/jinja/lexer.cpp +341 -0
- package/src/llama.cpp/common/jinja/lexer.h +157 -0
- package/src/llama.cpp/common/jinja/parser.cpp +591 -0
- package/src/llama.cpp/common/jinja/parser.h +21 -0
- package/src/llama.cpp/common/jinja/runtime.cpp +865 -0
- package/src/llama.cpp/common/jinja/runtime.h +628 -0
- package/src/llama.cpp/common/jinja/string.cpp +207 -0
- package/src/llama.cpp/common/jinja/string.h +58 -0
- package/src/llama.cpp/common/jinja/utils.h +49 -0
- package/src/llama.cpp/common/jinja/value.cpp +1221 -0
- package/src/llama.cpp/common/jinja/value.h +464 -0
- package/src/llama.cpp/common/preset.cpp +12 -2
- package/src/llama.cpp/common/sampling.cpp +52 -19
- package/src/llama.cpp/ggml/include/ggml.h +39 -7
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +4 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +63 -37
- package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +31 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +18 -0
- package/src/llama.cpp/include/llama-cpp.h +3 -1
- package/src/llama.cpp/include/llama.h +29 -2
- package/src/llama.cpp/src/CMakeLists.txt +1 -0
- package/src/llama.cpp/src/llama-adapter.cpp +7 -13
- package/src/llama.cpp/src/llama-adapter.h +1 -3
- package/src/llama.cpp/src/llama-arch.cpp +35 -0
- package/src/llama.cpp/src/llama-arch.h +1 -0
- package/src/llama.cpp/src/llama-chat.cpp +20 -0
- package/src/llama.cpp/src/llama-chat.h +1 -0
- package/src/llama.cpp/src/llama-context.cpp +232 -144
- package/src/llama.cpp/src/llama-context.h +10 -0
- package/src/llama.cpp/src/llama-cparams.h +2 -0
- package/src/llama.cpp/src/llama-graph.cpp +31 -43
- package/src/llama.cpp/src/llama-hparams.cpp +0 -36
- package/src/llama.cpp/src/llama-hparams.h +38 -1
- package/src/llama.cpp/src/llama-kv-cache.cpp +201 -59
- package/src/llama.cpp/src/llama-kv-cache.h +0 -2
- package/src/llama.cpp/src/llama-mmap.cpp +13 -6
- package/src/llama.cpp/src/llama-model-loader.cpp +21 -7
- package/src/llama.cpp/src/llama-model.cpp +215 -97
- package/src/llama.cpp/src/llama-model.h +3 -2
- package/src/llama.cpp/src/llama-sampling.cpp +170 -13
- package/src/llama.cpp/src/llama-vocab.cpp +37 -24
- package/src/llama.cpp/src/llama-vocab.h +1 -0
- package/src/llama.cpp/src/models/exaone-moe.cpp +146 -0
- package/src/llama.cpp/src/models/gemma3n-iswa.cpp +13 -3
- package/src/llama.cpp/src/models/models.h +13 -2
- package/src/llama.cpp/src/models/qwen3next.cpp +198 -182

package/src/llama.cpp/src/llama-model.cpp

@@ -446,7 +446,7 @@ struct llama_model::impl {
     llama_mlocks mlock_bufs;
     llama_mlocks mlock_mmaps;
 
-    // contexts where the model tensors metadata is stored as well
+    // contexts where the model tensors metadata is stored as well as the corresponding buffers:
     std::vector<std::pair<ggml_context_ptr, std::vector<ggml_backend_buffer_ptr>>> ctxs_bufs;
 
     buft_list_t cpu_buft_list;
@@ -468,7 +468,11 @@ llama_model::llama_model(const llama_model_params & params) : params(params), pi
     pimpl->has_tensor_overrides = params.tensor_buft_overrides && params.tensor_buft_overrides[0].pattern;
 }
 
-llama_model::~llama_model()
+llama_model::~llama_model() {
+    for (auto * lora : loras) {
+        delete lora;
+    }
+}
 
 void llama_model::load_stats(llama_model_loader & ml) {
     pimpl->n_elements = ml.n_elements;
@@ -1933,6 +1937,34 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     default: type = LLM_TYPE_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_EXAONE_MOE:
+            {
+                hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
+                hparams.n_swa = 128;
+                hparams.set_swa_pattern(4);
+                hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
+                hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
+
+                ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
+                ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+                ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared, false);
+                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
+                ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
+                ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func);
+                ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false);
+                ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
+                ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
+
+                ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false);
+
+                switch (hparams.n_layer) {
+                    case 32: type = LLM_TYPE_30B_A3B; break;
+                    case 48:
+                    case 49: type = LLM_TYPE_235B_A22B; break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+            } break;
         case LLM_ARCH_RWKV6:
         case LLM_ARCH_RWKV6QWEN2:
             {
@@ -5516,6 +5548,84 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                     layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
                 }
             } break;
+        case LLM_ARCH_EXAONE_MOE:
+            {
+                const int64_t n_ff_exp = hparams.n_ff_exp;
+                const int64_t n_expert = hparams.n_expert;
+                const int64_t n_expert_used = hparams.n_expert_used;
+                const int64_t n_ff_shexp = hparams.n_ff_shexp;
+                const int64_t head_dim = hparams.n_embd_head_k;
+                const int64_t n_qo_dim = n_head * head_dim;
+                const int64_t n_kv_dim = n_head_kv * head_dim;
+
+                tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+                // output
+                output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
+
+                if (output == NULL) {
+                    output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+                }
+
+                for (int i = 0; i < n_layer; ++i) {
+                    int flags = 0;
+                    if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
+                        // skip all tensors in the NextN layers
+                        flags |= TENSOR_SKIP;
+                    }
+
+                    auto & layer = layers[i];
+                    layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_qo_dim}, flags);
+                    layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_kv_dim}, flags);
+                    layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_kv_dim}, flags);
+                    layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_qo_dim, n_embd}, flags);
+
+                    layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0) | flags);
+
+                    layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, flags);
+                    layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, flags);
+                    layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, flags);
+
+                    layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, flags);
+
+                    // dense layers for first n_layer_dense_lead layers or nextn_predict_layers layers at the end
+                    if (i < (int) hparams.n_layer_dense_lead || (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers)) {
+                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, flags);
+                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, flags);
+                        layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, flags);
+                    } else {
+                        layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, flags);
+                        layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED | flags);
+
+                        if (n_expert == 0) {
+                            throw std::runtime_error("n_expert must be > 0");
+                        }
+                        if (n_expert_used == 0) {
+                            throw std::runtime_error("n_expert_used must be > 0");
+                        }
+
+                        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, flags);
+                        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, flags);
+                        layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, flags);
+
+                        layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_shexp}, flags);
+                        layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd}, flags);
+                        layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_shexp}, flags);
+                    }
+
+                    // NextN/MTP tensors (preserved but unused) - conditionally load for last nextn_predict_layers
+                    if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
+                        layer.nextn.eh_proj = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i), {2 * n_embd, n_embd}, flags);
+                        layer.nextn.enorm = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM, "weight", i), {n_embd}, flags);
+                        layer.nextn.hnorm = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM, "weight", i), {n_embd}, flags);
+
+                        layer.nextn.shared_head_norm = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "weight", i), {n_embd}, flags | TENSOR_NOT_REQUIRED);
+                        layer.nextn.embed_tokens = create_tensor(tn(LLM_TENSOR_NEXTN_EMBED_TOKENS, "weight", i), {n_embd, n_vocab}, flags | TENSOR_NOT_REQUIRED);
+                        layer.nextn.shared_head_head = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "weight", i), {n_embd, n_vocab}, flags | TENSOR_NOT_REQUIRED);
+                    }
+                }
+            } break;
         case LLM_ARCH_RWKV6:
             {
                 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -6763,7 +6873,10 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                     } else {
                         // Linear attention (gated delta net) specific tensors
                         // Create tensors with calculated dimensions
-
+                        // note: ssm_in is used by legacy GGUF
+                        layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), { n_embd, qkvz_dim }, TENSOR_NOT_REQUIRED);
+                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, key_dim * 2 + value_dim }, TENSOR_NOT_REQUIRED);
+                        layer.wqkv_gate = create_tensor(tn(LLM_TENSOR_ATTN_GATE, "weight", i), { n_embd, value_dim }, TENSOR_NOT_REQUIRED);
                         layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), { hparams.ssm_d_conv, conv_dim }, 0);
                         layer.ssm_dt = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), { hparams.ssm_dt_rank }, 0);
                         layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A_NOSCAN, i), { hparams.ssm_dt_rank }, 0);
@@ -7098,59 +7211,59 @@ void llama_model::print_info() const {
     };
 
     // hparams
-    LLAMA_LOG_INFO("%s: arch
-    LLAMA_LOG_INFO("%s: vocab_only
-    LLAMA_LOG_INFO("%s: no_alloc
+    LLAMA_LOG_INFO("%s: arch = %s\n", __func__, arch_name().c_str());
+    LLAMA_LOG_INFO("%s: vocab_only = %d\n", __func__, hparams.vocab_only);
+    LLAMA_LOG_INFO("%s: no_alloc = %d\n", __func__, hparams.no_alloc);
 
     if (!hparams.vocab_only) {
-        LLAMA_LOG_INFO("%s: n_ctx_train
-        LLAMA_LOG_INFO("%s: n_embd
-        LLAMA_LOG_INFO("%s: n_embd_inp
-        LLAMA_LOG_INFO("%s: n_layer
-        LLAMA_LOG_INFO("%s: n_head
-        LLAMA_LOG_INFO("%s: n_head_kv
-        LLAMA_LOG_INFO("%s: n_rot
-        LLAMA_LOG_INFO("%s: n_swa
-        LLAMA_LOG_INFO("%s: is_swa_any
-        LLAMA_LOG_INFO("%s: n_embd_head_k
-        LLAMA_LOG_INFO("%s: n_embd_head_v
-        LLAMA_LOG_INFO("%s: n_gqa
-        LLAMA_LOG_INFO("%s: n_embd_k_gqa
-        LLAMA_LOG_INFO("%s: n_embd_v_gqa
-        LLAMA_LOG_INFO("%s: f_norm_eps
-        LLAMA_LOG_INFO("%s: f_norm_rms_eps
-        LLAMA_LOG_INFO("%s: f_clamp_kqv
-        LLAMA_LOG_INFO("%s: f_max_alibi_bias
-        LLAMA_LOG_INFO("%s: f_logit_scale
-        LLAMA_LOG_INFO("%s: f_attn_scale
-        LLAMA_LOG_INFO("%s: n_ff
-        LLAMA_LOG_INFO("%s: n_expert
-        LLAMA_LOG_INFO("%s: n_expert_used
-        LLAMA_LOG_INFO("%s: n_expert_groups
-        LLAMA_LOG_INFO("%s: n_group_used
-        LLAMA_LOG_INFO("%s: causal attn
-        LLAMA_LOG_INFO("%s: pooling type
-        LLAMA_LOG_INFO("%s: rope type
-        LLAMA_LOG_INFO("%s: rope scaling
-        LLAMA_LOG_INFO("%s: freq_base_train
-        LLAMA_LOG_INFO("%s: freq_scale_train
+        LLAMA_LOG_INFO("%s: n_ctx_train = %u\n", __func__, hparams.n_ctx_train);
+        LLAMA_LOG_INFO("%s: n_embd = %u\n", __func__, hparams.n_embd);
+        LLAMA_LOG_INFO("%s: n_embd_inp = %u\n", __func__, hparams.n_embd_inp());
+        LLAMA_LOG_INFO("%s: n_layer = %u\n", __func__, hparams.n_layer);
+        LLAMA_LOG_INFO("%s: n_head = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_head(il); }, hparams.n_layer).c_str());
+        LLAMA_LOG_INFO("%s: n_head_kv = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_head_kv(il); }, hparams.n_layer).c_str());
+        LLAMA_LOG_INFO("%s: n_rot = %u\n", __func__, hparams.n_rot);
+        LLAMA_LOG_INFO("%s: n_swa = %u\n", __func__, hparams.n_swa);
+        LLAMA_LOG_INFO("%s: is_swa_any = %u\n", __func__, hparams.is_swa_any());
+        LLAMA_LOG_INFO("%s: n_embd_head_k = %u\n", __func__, hparams.n_embd_head_k);
+        LLAMA_LOG_INFO("%s: n_embd_head_v = %u\n", __func__, hparams.n_embd_head_v);
+        LLAMA_LOG_INFO("%s: n_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_gqa(il); }, hparams.n_layer).c_str());
+        LLAMA_LOG_INFO("%s: n_embd_k_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_embd_k_gqa(il); }, hparams.n_layer).c_str());
+        LLAMA_LOG_INFO("%s: n_embd_v_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_embd_v_gqa(il); }, hparams.n_layer).c_str());
+        LLAMA_LOG_INFO("%s: f_norm_eps = %.1e\n", __func__, hparams.f_norm_eps);
+        LLAMA_LOG_INFO("%s: f_norm_rms_eps = %.1e\n", __func__, hparams.f_norm_rms_eps);
+        LLAMA_LOG_INFO("%s: f_clamp_kqv = %.1e\n", __func__, hparams.f_clamp_kqv);
+        LLAMA_LOG_INFO("%s: f_max_alibi_bias = %.1e\n", __func__, hparams.f_max_alibi_bias);
+        LLAMA_LOG_INFO("%s: f_logit_scale = %.1e\n", __func__, hparams.f_logit_scale);
+        LLAMA_LOG_INFO("%s: f_attn_scale = %.1e\n", __func__, hparams.f_attention_scale);
+        LLAMA_LOG_INFO("%s: n_ff = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_ff(il); }, hparams.n_layer).c_str());
+        LLAMA_LOG_INFO("%s: n_expert = %u\n", __func__, hparams.n_expert);
+        LLAMA_LOG_INFO("%s: n_expert_used = %u\n", __func__, hparams.n_expert_used);
+        LLAMA_LOG_INFO("%s: n_expert_groups = %d\n", __func__, hparams.n_expert_groups);
+        LLAMA_LOG_INFO("%s: n_group_used = %d\n", __func__, hparams.n_group_used);
+        LLAMA_LOG_INFO("%s: causal attn = %d\n", __func__, hparams.causal_attn);
+        LLAMA_LOG_INFO("%s: pooling type = %d\n", __func__, hparams.pooling_type);
+        LLAMA_LOG_INFO("%s: rope type = %d\n", __func__, hparams.rope_type);
+        LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type.c_str());
+        LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train);
+        LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);
         if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
-            LLAMA_LOG_INFO("%s: freq_base_swa
-            LLAMA_LOG_INFO("%s: freq_scale_swa
+            LLAMA_LOG_INFO("%s: freq_base_swa = %.1f\n", __func__, hparams.rope_freq_base_train_swa);
+            LLAMA_LOG_INFO("%s: freq_scale_swa = %g\n", __func__, hparams.rope_freq_scale_train_swa);
         }
-        LLAMA_LOG_INFO("%s: n_ctx_orig_yarn
-        LLAMA_LOG_INFO("%s: rope_yarn_log_mul= %.4f\n", __func__, hparams.rope_yarn_log_mul);
-        LLAMA_LOG_INFO("%s: rope_finetuned
+        LLAMA_LOG_INFO("%s: n_ctx_orig_yarn = %u\n", __func__, hparams.n_ctx_orig_yarn);
+        LLAMA_LOG_INFO("%s: rope_yarn_log_mul = %.4f\n", __func__, hparams.rope_yarn_log_mul);
+        LLAMA_LOG_INFO("%s: rope_finetuned = %s\n", __func__, hparams.rope_finetuned ? "yes" : "unknown");
         // MRoPE (Multi-axis Rotary Position Embedding) sections
         if (const auto & s = hparams.rope_sections; s[0] || s[1] || s[2] || s[3]) {
-            LLAMA_LOG_INFO("%s: mrope sections
+            LLAMA_LOG_INFO("%s: mrope sections = [%d, %d, %d, %d]\n", __func__, s[0], s[1], s[2], s[3]);
         }
         if (!classifier_labels.empty()) {
-            LLAMA_LOG_INFO("%s: n_cls_out
+            LLAMA_LOG_INFO("%s: n_cls_out = %u\n", __func__, hparams.n_cls_out);
 
             size_t i = 0;
             for (auto label : classifier_labels) {
-                LLAMA_LOG_INFO("%s: cls_label[%2zu]
+                LLAMA_LOG_INFO("%s: cls_label[%2zu] = %s\n", __func__, i++, label.c_str());
             }
         }
     }
@@ -7164,55 +7277,55 @@ void llama_model::print_info() const {
         arch == LLM_ARCH_QWEN3NEXT ||
         arch == LLM_ARCH_NEMOTRON_H ||
         arch == LLM_ARCH_NEMOTRON_H_MOE) {
-        LLAMA_LOG_INFO("%s: ssm_d_conv
-        LLAMA_LOG_INFO("%s: ssm_d_inner
-        LLAMA_LOG_INFO("%s: ssm_d_state
-        LLAMA_LOG_INFO("%s: ssm_dt_rank
-        LLAMA_LOG_INFO("%s: ssm_n_group
-        LLAMA_LOG_INFO("%s: ssm_dt_b_c_rms
+        LLAMA_LOG_INFO("%s: ssm_d_conv = %u\n", __func__, hparams.ssm_d_conv);
+        LLAMA_LOG_INFO("%s: ssm_d_inner = %u\n", __func__, hparams.ssm_d_inner);
+        LLAMA_LOG_INFO("%s: ssm_d_state = %u\n", __func__, hparams.ssm_d_state);
+        LLAMA_LOG_INFO("%s: ssm_dt_rank = %u\n", __func__, hparams.ssm_dt_rank);
+        LLAMA_LOG_INFO("%s: ssm_n_group = %u\n", __func__, hparams.ssm_n_group);
+        LLAMA_LOG_INFO("%s: ssm_dt_b_c_rms = %d\n", __func__, hparams.ssm_dt_b_c_rms);
     }
 
-    LLAMA_LOG_INFO("%s: model type
+    LLAMA_LOG_INFO("%s: model type = %s\n", __func__, type_name().c_str());
     if (pimpl->n_elements >= 1e12) {
-        LLAMA_LOG_INFO("%s: model params
+        LLAMA_LOG_INFO("%s: model params = %.2f T\n", __func__, pimpl->n_elements*1e-12);
     } else if (pimpl->n_elements >= 1e9) {
-        LLAMA_LOG_INFO("%s: model params
+        LLAMA_LOG_INFO("%s: model params = %.2f B\n", __func__, pimpl->n_elements*1e-9);
     } else if (pimpl->n_elements >= 1e6) {
-        LLAMA_LOG_INFO("%s: model params
+        LLAMA_LOG_INFO("%s: model params = %.2f M\n", __func__, pimpl->n_elements*1e-6);
     } else {
-        LLAMA_LOG_INFO("%s: model params
+        LLAMA_LOG_INFO("%s: model params = %.2f K\n", __func__, pimpl->n_elements*1e-3);
     }
 
     // general kv
-    LLAMA_LOG_INFO("%s: general.name
+    LLAMA_LOG_INFO("%s: general.name = %s\n", __func__, name.c_str());
 
     if (arch == LLM_ARCH_DEEPSEEK) {
-        LLAMA_LOG_INFO("%s: n_layer_dense_lead
-        LLAMA_LOG_INFO("%s: n_ff_exp
-        LLAMA_LOG_INFO("%s: n_expert_shared
-        LLAMA_LOG_INFO("%s: expert_weights_scale
+        LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
+        LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
+        LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
+        LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
     }
 
     if (arch == LLM_ARCH_DEEPSEEK2) {
-        LLAMA_LOG_INFO("%s: n_layer_dense_lead
-        LLAMA_LOG_INFO("%s: n_lora_q
-        LLAMA_LOG_INFO("%s: n_lora_kv
-        LLAMA_LOG_INFO("%s: n_embd_head_k_mla
-        LLAMA_LOG_INFO("%s: n_embd_head_v_mla
-        LLAMA_LOG_INFO("%s: n_ff_exp
-        LLAMA_LOG_INFO("%s: n_expert_shared
-        LLAMA_LOG_INFO("%s: expert_weights_scale
-        LLAMA_LOG_INFO("%s: expert_weights_norm
-        LLAMA_LOG_INFO("%s: expert_gating_func
+        LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
+        LLAMA_LOG_INFO("%s: n_lora_q = %d\n", __func__, hparams.n_lora_q);
+        LLAMA_LOG_INFO("%s: n_lora_kv = %d\n", __func__, hparams.n_lora_kv);
+        LLAMA_LOG_INFO("%s: n_embd_head_k_mla = %d\n", __func__, hparams.n_embd_head_k_mla);
+        LLAMA_LOG_INFO("%s: n_embd_head_v_mla = %d\n", __func__, hparams.n_embd_head_v_mla);
+        LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
+        LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
+        LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
+        LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm);
+        LLAMA_LOG_INFO("%s: expert_gating_func = %s\n", __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func));
     }
 
     if (arch == LLM_ARCH_QWEN2MOE) {
-        LLAMA_LOG_INFO("%s: n_ff_exp
-        LLAMA_LOG_INFO("%s: n_ff_shexp
+        LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
+        LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
     }
 
     if (arch == LLM_ARCH_QWEN3MOE || arch == LLM_ARCH_OPENAI_MOE || arch == LLM_ARCH_QWEN3VLMOE || arch == LLM_ARCH_RND1) {
-        LLAMA_LOG_INFO("%s: n_ff_exp
+        LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
     }
 
     if (arch == LLM_ARCH_MINICPM ||
@@ -7220,41 +7333,41 @@ void llama_model::print_info() const {
         arch == LLM_ARCH_GRANITE_MOE ||
         arch == LLM_ARCH_GRANITE_HYBRID ||
         arch == LLM_ARCH_NEMOTRON_H_MOE) {
-        LLAMA_LOG_INFO("%s: f_embedding_scale
-        LLAMA_LOG_INFO("%s: f_residual_scale
-        LLAMA_LOG_INFO("%s: f_attention_scale
-        LLAMA_LOG_INFO("%s: n_ff_shexp
+        LLAMA_LOG_INFO("%s: f_embedding_scale = %f\n", __func__, hparams.f_embedding_scale);
+        LLAMA_LOG_INFO("%s: f_residual_scale = %f\n", __func__, hparams.f_residual_scale);
+        LLAMA_LOG_INFO("%s: f_attention_scale = %f\n", __func__, hparams.f_attention_scale);
+        LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
     }
 
     if (arch == LLM_ARCH_BAILINGMOE) {
-        LLAMA_LOG_INFO("%s: n_layer_dense_lead
-        LLAMA_LOG_INFO("%s: n_ff_exp
-        LLAMA_LOG_INFO("%s: n_expert_shared
-        LLAMA_LOG_INFO("%s: expert_weights_scale
-        LLAMA_LOG_INFO("%s: expert_weights_norm
+        LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
+        LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
+        LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
+        LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
+        LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm);
     }
 
     if (arch == LLM_ARCH_BAILINGMOE2) {
-        LLAMA_LOG_INFO("%s: n_layer_dense_lead
-        LLAMA_LOG_INFO("%s: n_ff_exp
-        LLAMA_LOG_INFO("%s: n_ff_shexp
-        LLAMA_LOG_INFO("%s: n_expert_shared
-        LLAMA_LOG_INFO("%s: expert_weights_scale
-        LLAMA_LOG_INFO("%s: expert_weights_norm
-        LLAMA_LOG_INFO("%s: expert_gating_func
-        LLAMA_LOG_INFO("%s: nextn_predict_layers
+        LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
+        LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
+        LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
+        LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
+        LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
+        LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm);
+        LLAMA_LOG_INFO("%s: expert_gating_func = %s\n", __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func));
+        LLAMA_LOG_INFO("%s: nextn_predict_layers = %d\n", __func__, hparams.nextn_predict_layers);
     }
 
     if (arch == LLM_ARCH_SMALLTHINKER || arch == LLM_ARCH_LFM2MOE) {
-        LLAMA_LOG_INFO("%s: n_ff_exp
-        LLAMA_LOG_INFO("%s: expert_gating_func
+        LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
+        LLAMA_LOG_INFO("%s: expert_gating_func = %s\n", __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func));
     }
 
     if (arch == LLM_ARCH_GROVEMOE) {
-        LLAMA_LOG_INFO("%s: n_ff_exp
-        LLAMA_LOG_INFO("%s: n_ff_chexp
-        LLAMA_LOG_INFO("%s: n_group_experts
-        LLAMA_LOG_INFO("%s: expert_group_scale
+        LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
+        LLAMA_LOG_INFO("%s: n_ff_chexp = %d\n", __func__, hparams.n_ff_chexp);
+        LLAMA_LOG_INFO("%s: n_group_experts = %d\n", __func__, hparams.n_group_experts);
+        LLAMA_LOG_INFO("%s: expert_group_scale = %.2f\n", __func__, hparams.expert_group_scale);
    }
 
     vocab.print_info();
@@ -7808,6 +7921,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
                     llm = std::make_unique<llm_build_exaone4<false>>(*this, params);
                 }
             } break;
+        case LLM_ARCH_EXAONE_MOE:
+            {
+                llm = std::make_unique<llm_build_exaone_moe>(*this, params);
+            } break;
         case LLM_ARCH_RWKV6:
             {
                 llm = std::make_unique<llm_build_rwkv6>(*this, params);
@@ -8168,6 +8285,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
         case LLM_ARCH_NEMOTRON:
         case LLM_ARCH_EXAONE:
        case LLM_ARCH_EXAONE4:
+        case LLM_ARCH_EXAONE_MOE:
         case LLM_ARCH_MINICPM3:
         case LLM_ARCH_BAILINGMOE2:
         case LLM_ARCH_DOTS1:

package/src/llama.cpp/src/llama-model.h

@@ -11,6 +11,7 @@
 #include <memory>
 #include <string>
 #include <unordered_map>
+#include <unordered_set>
 #include <vector>
 
 struct llama_cparams;
@@ -476,8 +477,8 @@ struct llama_model {
     // for quantize-stats only
     std::vector<std::pair<std::string, struct ggml_tensor *>> tensors_by_name;
 
-    // for keeping track of
-
+    // for keeping track of associated LoRA adapters
+    std::unordered_set<llama_adapter_lora *> loras;
 
     int64_t t_load_us = 0;
     int64_t t_start_us = 0;