@fugood/llama.node 1.4.13 → 1.4.15
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/binding.ts +23 -2
- package/lib/index.js +2 -1
- package/lib/index.ts +8 -1
- package/lib/parallel.ts +2 -2
- package/package.json +15 -15
- package/scripts/llama.cpp.patch +9 -12
- package/src/LlamaContext.cpp +16 -4
- package/src/llama.cpp/CMakeLists.txt +24 -8
- package/src/llama.cpp/common/CMakeLists.txt +3 -34
- package/src/llama.cpp/common/arg.cpp +183 -60
- package/src/llama.cpp/common/arg.h +0 -8
- package/src/llama.cpp/common/chat-parser.cpp +115 -0
- package/src/llama.cpp/common/chat.cpp +67 -0
- package/src/llama.cpp/common/chat.h +1 -0
- package/src/llama.cpp/common/common.cpp +2 -1
- package/src/llama.cpp/common/common.h +12 -7
- package/src/llama.cpp/common/debug.cpp +165 -0
- package/src/llama.cpp/common/debug.h +43 -0
- package/src/llama.cpp/common/download.cpp +88 -369
- package/src/llama.cpp/common/download.h +32 -5
- package/src/llama.cpp/common/preset.cpp +87 -2
- package/src/llama.cpp/common/preset.h +10 -1
- package/src/llama.cpp/ggml/include/ggml.h +5 -0
- package/src/llama.cpp/include/llama.h +5 -2
- package/src/llama.cpp/src/CMakeLists.txt +1 -0
- package/src/llama.cpp/src/llama-arch.cpp +35 -0
- package/src/llama.cpp/src/llama-arch.h +1 -0
- package/src/llama.cpp/src/llama-chat.cpp +20 -0
- package/src/llama.cpp/src/llama-chat.h +1 -0
- package/src/llama.cpp/src/llama-graph.cpp +31 -43
- package/src/llama.cpp/src/llama-mmap.cpp +78 -42
- package/src/llama.cpp/src/llama-mmap.h +5 -4
- package/src/llama.cpp/src/llama-model-loader.cpp +17 -5
- package/src/llama.cpp/src/llama-model-loader.h +2 -0
- package/src/llama.cpp/src/llama-model.cpp +225 -101
- package/src/llama.cpp/src/llama-quant.cpp +1 -1
- package/src/llama.cpp/src/llama-sampling.cpp +1 -1
- package/src/llama.cpp/src/llama-vocab.cpp +37 -24
- package/src/llama.cpp/src/llama-vocab.h +1 -0
- package/src/llama.cpp/src/llama.cpp +63 -27
- package/src/llama.cpp/src/models/exaone-moe.cpp +146 -0
- package/src/llama.cpp/src/models/gemma3n-iswa.cpp +13 -3
- package/src/llama.cpp/src/models/models.h +13 -2
- package/src/llama.cpp/src/models/qwen3next.cpp +198 -182
|
@@ -446,7 +446,7 @@ struct llama_model::impl {
|
|
|
446
446
|
llama_mlocks mlock_bufs;
|
|
447
447
|
llama_mlocks mlock_mmaps;
|
|
448
448
|
|
|
449
|
-
// contexts where the model tensors metadata is stored as well
|
|
449
|
+
// contexts where the model tensors metadata is stored as well as the corresponding buffers:
|
|
450
450
|
std::vector<std::pair<ggml_context_ptr, std::vector<ggml_backend_buffer_ptr>>> ctxs_bufs;
|
|
451
451
|
|
|
452
452
|
buft_list_t cpu_buft_list;
|
|
@@ -1933,6 +1933,34 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|
|
1933
1933
|
default: type = LLM_TYPE_UNKNOWN;
|
|
1934
1934
|
}
|
|
1935
1935
|
} break;
|
|
1936
|
+
case LLM_ARCH_EXAONE_MOE:
|
|
1937
|
+
{
|
|
1938
|
+
hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
|
|
1939
|
+
hparams.n_swa = 128;
|
|
1940
|
+
hparams.set_swa_pattern(4);
|
|
1941
|
+
hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
|
|
1942
|
+
hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
|
|
1943
|
+
|
|
1944
|
+
ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
|
|
1945
|
+
ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
|
|
1946
|
+
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
|
1947
|
+
ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared, false);
|
|
1948
|
+
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
|
|
1949
|
+
ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
|
|
1950
|
+
ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func);
|
|
1951
|
+
ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false);
|
|
1952
|
+
ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
|
|
1953
|
+
ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
|
|
1954
|
+
|
|
1955
|
+
ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false);
|
|
1956
|
+
|
|
1957
|
+
switch (hparams.n_layer) {
|
|
1958
|
+
case 32: type = LLM_TYPE_30B_A3B; break;
|
|
1959
|
+
case 48:
|
|
1960
|
+
case 49: type = LLM_TYPE_235B_A22B; break;
|
|
1961
|
+
default: type = LLM_TYPE_UNKNOWN;
|
|
1962
|
+
}
|
|
1963
|
+
} break;
|
|
1936
1964
|
case LLM_ARCH_RWKV6:
|
|
1937
1965
|
case LLM_ARCH_RWKV6QWEN2:
|
|
1938
1966
|
{
|
|
@@ -2440,7 +2468,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|
|
2440
2468
|
|
|
2441
2469
|
const bool use_mmap_buffer = true;
|
|
2442
2470
|
|
|
2443
|
-
LLAMA_LOG_INFO("%s: loading model tensors, this can take a while... (mmap = %s)\n",
|
|
2471
|
+
LLAMA_LOG_INFO("%s: loading model tensors, this can take a while... (mmap = %s, direct_io = %s)\n",
|
|
2472
|
+
__func__, ml.use_mmap ? "true" : "false", ml.use_direct_io ? "true" : "false");
|
|
2444
2473
|
|
|
2445
2474
|
// build a list of buffer types for the CPU and GPU devices
|
|
2446
2475
|
pimpl->cpu_buft_list = make_cpu_buft_list(devices, params.use_extra_bufts, params.no_host);
|
|
@@ -2451,6 +2480,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|
|
2451
2480
|
pimpl->gpu_buft_list.emplace(dev, std::move(buft_list));
|
|
2452
2481
|
}
|
|
2453
2482
|
|
|
2483
|
+
ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
|
|
2484
|
+
if (cpu_dev == nullptr) {
|
|
2485
|
+
throw std::runtime_error(format("%s: no CPU backend found", __func__));
|
|
2486
|
+
}
|
|
2487
|
+
|
|
2454
2488
|
// calculate the split points
|
|
2455
2489
|
bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + n_devices(), [](float x) { return x == 0.0f; });
|
|
2456
2490
|
std::vector<float> splits(n_devices());
|
|
@@ -2461,6 +2495,13 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|
|
2461
2495
|
size_t total;
|
|
2462
2496
|
size_t free;
|
|
2463
2497
|
ggml_backend_dev_memory(dev, &free, &total);
|
|
2498
|
+
|
|
2499
|
+
// devices can return 0 bytes for free and total memory if they do not
|
|
2500
|
+
// have any to report. in this case, we will use the host memory as a fallback
|
|
2501
|
+
// fixes: https://github.com/ggml-org/llama.cpp/issues/18577
|
|
2502
|
+
if (free == 0 && total == 0) {
|
|
2503
|
+
ggml_backend_dev_memory(cpu_dev, &free, &total);
|
|
2504
|
+
}
|
|
2464
2505
|
splits[i] = free;
|
|
2465
2506
|
}
|
|
2466
2507
|
} else {
|
|
@@ -2477,10 +2518,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|
|
2477
2518
|
splits[i] /= split_sum;
|
|
2478
2519
|
}
|
|
2479
2520
|
|
|
2480
|
-
ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
|
|
2481
|
-
if (cpu_dev == nullptr) {
|
|
2482
|
-
throw std::runtime_error(format("%s: no CPU backend found", __func__));
|
|
2483
|
-
}
|
|
2484
2521
|
const int i_gpu_start = std::max(int(hparams.n_layer) + 1 - n_gpu_layers, 0);
|
|
2485
2522
|
const int act_gpu_layers = devices.empty() ? 0 : std::min(n_gpu_layers, int(n_layer) + 1);
|
|
2486
2523
|
auto get_layer_buft_list = [&](int il) -> llama_model::impl::layer_dev {
|
|
@@ -5507,6 +5544,84 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|
|
5507
5544
|
layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
|
|
5508
5545
|
}
|
|
5509
5546
|
} break;
|
|
5547
|
+
case LLM_ARCH_EXAONE_MOE:
|
|
5548
|
+
{
|
|
5549
|
+
const int64_t n_ff_exp = hparams.n_ff_exp;
|
|
5550
|
+
const int64_t n_expert = hparams.n_expert;
|
|
5551
|
+
const int64_t n_expert_used = hparams.n_expert_used;
|
|
5552
|
+
const int64_t n_ff_shexp = hparams.n_ff_shexp;
|
|
5553
|
+
const int64_t head_dim = hparams.n_embd_head_k;
|
|
5554
|
+
const int64_t n_qo_dim = n_head * head_dim;
|
|
5555
|
+
const int64_t n_kv_dim = n_head_kv * head_dim;
|
|
5556
|
+
|
|
5557
|
+
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
|
5558
|
+
|
|
5559
|
+
// output
|
|
5560
|
+
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
|
|
5561
|
+
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
|
|
5562
|
+
|
|
5563
|
+
if (output == NULL) {
|
|
5564
|
+
output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
|
|
5565
|
+
}
|
|
5566
|
+
|
|
5567
|
+
for (int i = 0; i < n_layer; ++i) {
|
|
5568
|
+
int flags = 0;
|
|
5569
|
+
if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
|
|
5570
|
+
// skip all tensors in the NextN layers
|
|
5571
|
+
flags |= TENSOR_SKIP;
|
|
5572
|
+
}
|
|
5573
|
+
|
|
5574
|
+
auto & layer = layers[i];
|
|
5575
|
+
layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_qo_dim}, flags);
|
|
5576
|
+
layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_kv_dim}, flags);
|
|
5577
|
+
layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_kv_dim}, flags);
|
|
5578
|
+
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_qo_dim, n_embd}, flags);
|
|
5579
|
+
|
|
5580
|
+
layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0) | flags);
|
|
5581
|
+
|
|
5582
|
+
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, flags);
|
|
5583
|
+
layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, flags);
|
|
5584
|
+
layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, flags);
|
|
5585
|
+
|
|
5586
|
+
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, flags);
|
|
5587
|
+
|
|
5588
|
+
// dense layers for first n_layer_dense_lead layers or nextn_predict_layers layers at the end
|
|
5589
|
+
if (i < (int) hparams.n_layer_dense_lead || (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers)) {
|
|
5590
|
+
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, flags);
|
|
5591
|
+
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, flags);
|
|
5592
|
+
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, flags);
|
|
5593
|
+
} else {
|
|
5594
|
+
layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, flags);
|
|
5595
|
+
layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED | flags);
|
|
5596
|
+
|
|
5597
|
+
if (n_expert == 0) {
|
|
5598
|
+
throw std::runtime_error("n_expert must be > 0");
|
|
5599
|
+
}
|
|
5600
|
+
if (n_expert_used == 0) {
|
|
5601
|
+
throw std::runtime_error("n_expert_used must be > 0");
|
|
5602
|
+
}
|
|
5603
|
+
|
|
5604
|
+
layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, flags);
|
|
5605
|
+
layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, flags);
|
|
5606
|
+
layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, flags);
|
|
5607
|
+
|
|
5608
|
+
layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_shexp}, flags);
|
|
5609
|
+
layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd}, flags);
|
|
5610
|
+
layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_shexp}, flags);
|
|
5611
|
+
}
|
|
5612
|
+
|
|
5613
|
+
// NextN/MTP tensors (preserved but unused) - conditionally load for last nextn_predict_layers
|
|
5614
|
+
if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
|
|
5615
|
+
layer.nextn.eh_proj = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i), {2 * n_embd, n_embd}, flags);
|
|
5616
|
+
layer.nextn.enorm = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM, "weight", i), {n_embd}, flags);
|
|
5617
|
+
layer.nextn.hnorm = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM, "weight", i), {n_embd}, flags);
|
|
5618
|
+
|
|
5619
|
+
layer.nextn.shared_head_norm = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "weight", i), {n_embd}, flags | TENSOR_NOT_REQUIRED);
|
|
5620
|
+
layer.nextn.embed_tokens = create_tensor(tn(LLM_TENSOR_NEXTN_EMBED_TOKENS, "weight", i), {n_embd, n_vocab}, flags | TENSOR_NOT_REQUIRED);
|
|
5621
|
+
layer.nextn.shared_head_head = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "weight", i), {n_embd, n_vocab}, flags | TENSOR_NOT_REQUIRED);
|
|
5622
|
+
}
|
|
5623
|
+
}
|
|
5624
|
+
} break;
|
|
5510
5625
|
case LLM_ARCH_RWKV6:
|
|
5511
5626
|
{
|
|
5512
5627
|
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
|
@@ -6754,7 +6869,10 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|
|
6754
6869
|
} else {
|
|
6755
6870
|
// Linear attention (gated delta net) specific tensors
|
|
6756
6871
|
// Create tensors with calculated dimensions
|
|
6757
|
-
|
|
6872
|
+
// note: ssm_in is used by legacy GGUF
|
|
6873
|
+
layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), { n_embd, qkvz_dim }, TENSOR_NOT_REQUIRED);
|
|
6874
|
+
layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, key_dim * 2 + value_dim }, TENSOR_NOT_REQUIRED);
|
|
6875
|
+
layer.wqkv_gate = create_tensor(tn(LLM_TENSOR_ATTN_GATE, "weight", i), { n_embd, value_dim }, TENSOR_NOT_REQUIRED);
|
|
6758
6876
|
layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), { hparams.ssm_d_conv, conv_dim }, 0);
|
|
6759
6877
|
layer.ssm_dt = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), { hparams.ssm_dt_rank }, 0);
|
|
6760
6878
|
layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A_NOSCAN, i), { hparams.ssm_dt_rank }, 0);
|
|
@@ -7089,59 +7207,59 @@ void llama_model::print_info() const {
|
|
|
7089
7207
|
};
|
|
7090
7208
|
|
|
7091
7209
|
// hparams
|
|
7092
|
-
LLAMA_LOG_INFO("%s: arch
|
|
7093
|
-
LLAMA_LOG_INFO("%s: vocab_only
|
|
7094
|
-
LLAMA_LOG_INFO("%s: no_alloc
|
|
7210
|
+
LLAMA_LOG_INFO("%s: arch = %s\n", __func__, arch_name().c_str());
|
|
7211
|
+
LLAMA_LOG_INFO("%s: vocab_only = %d\n", __func__, hparams.vocab_only);
|
|
7212
|
+
LLAMA_LOG_INFO("%s: no_alloc = %d\n", __func__, hparams.no_alloc);
|
|
7095
7213
|
|
|
7096
7214
|
if (!hparams.vocab_only) {
|
|
7097
|
-
LLAMA_LOG_INFO("%s: n_ctx_train
|
|
7098
|
-
LLAMA_LOG_INFO("%s: n_embd
|
|
7099
|
-
LLAMA_LOG_INFO("%s: n_embd_inp
|
|
7100
|
-
LLAMA_LOG_INFO("%s: n_layer
|
|
7101
|
-
LLAMA_LOG_INFO("%s: n_head
|
|
7102
|
-
LLAMA_LOG_INFO("%s: n_head_kv
|
|
7103
|
-
LLAMA_LOG_INFO("%s: n_rot
|
|
7104
|
-
LLAMA_LOG_INFO("%s: n_swa
|
|
7105
|
-
LLAMA_LOG_INFO("%s: is_swa_any
|
|
7106
|
-
LLAMA_LOG_INFO("%s: n_embd_head_k
|
|
7107
|
-
LLAMA_LOG_INFO("%s: n_embd_head_v
|
|
7108
|
-
LLAMA_LOG_INFO("%s: n_gqa
|
|
7109
|
-
LLAMA_LOG_INFO("%s: n_embd_k_gqa
|
|
7110
|
-
LLAMA_LOG_INFO("%s: n_embd_v_gqa
|
|
7111
|
-
LLAMA_LOG_INFO("%s: f_norm_eps
|
|
7112
|
-
LLAMA_LOG_INFO("%s: f_norm_rms_eps
|
|
7113
|
-
LLAMA_LOG_INFO("%s: f_clamp_kqv
|
|
7114
|
-
LLAMA_LOG_INFO("%s: f_max_alibi_bias
|
|
7115
|
-
LLAMA_LOG_INFO("%s: f_logit_scale
|
|
7116
|
-
LLAMA_LOG_INFO("%s: f_attn_scale
|
|
7117
|
-
LLAMA_LOG_INFO("%s: n_ff
|
|
7118
|
-
LLAMA_LOG_INFO("%s: n_expert
|
|
7119
|
-
LLAMA_LOG_INFO("%s: n_expert_used
|
|
7120
|
-
LLAMA_LOG_INFO("%s: n_expert_groups
|
|
7121
|
-
LLAMA_LOG_INFO("%s: n_group_used
|
|
7122
|
-
LLAMA_LOG_INFO("%s: causal attn
|
|
7123
|
-
LLAMA_LOG_INFO("%s: pooling type
|
|
7124
|
-
LLAMA_LOG_INFO("%s: rope type
|
|
7125
|
-
LLAMA_LOG_INFO("%s: rope scaling
|
|
7126
|
-
LLAMA_LOG_INFO("%s: freq_base_train
|
|
7127
|
-
LLAMA_LOG_INFO("%s: freq_scale_train
|
|
7215
|
+
LLAMA_LOG_INFO("%s: n_ctx_train = %u\n", __func__, hparams.n_ctx_train);
|
|
7216
|
+
LLAMA_LOG_INFO("%s: n_embd = %u\n", __func__, hparams.n_embd);
|
|
7217
|
+
LLAMA_LOG_INFO("%s: n_embd_inp = %u\n", __func__, hparams.n_embd_inp());
|
|
7218
|
+
LLAMA_LOG_INFO("%s: n_layer = %u\n", __func__, hparams.n_layer);
|
|
7219
|
+
LLAMA_LOG_INFO("%s: n_head = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_head(il); }, hparams.n_layer).c_str());
|
|
7220
|
+
LLAMA_LOG_INFO("%s: n_head_kv = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_head_kv(il); }, hparams.n_layer).c_str());
|
|
7221
|
+
LLAMA_LOG_INFO("%s: n_rot = %u\n", __func__, hparams.n_rot);
|
|
7222
|
+
LLAMA_LOG_INFO("%s: n_swa = %u\n", __func__, hparams.n_swa);
|
|
7223
|
+
LLAMA_LOG_INFO("%s: is_swa_any = %u\n", __func__, hparams.is_swa_any());
|
|
7224
|
+
LLAMA_LOG_INFO("%s: n_embd_head_k = %u\n", __func__, hparams.n_embd_head_k);
|
|
7225
|
+
LLAMA_LOG_INFO("%s: n_embd_head_v = %u\n", __func__, hparams.n_embd_head_v);
|
|
7226
|
+
LLAMA_LOG_INFO("%s: n_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_gqa(il); }, hparams.n_layer).c_str());
|
|
7227
|
+
LLAMA_LOG_INFO("%s: n_embd_k_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_embd_k_gqa(il); }, hparams.n_layer).c_str());
|
|
7228
|
+
LLAMA_LOG_INFO("%s: n_embd_v_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_embd_v_gqa(il); }, hparams.n_layer).c_str());
|
|
7229
|
+
LLAMA_LOG_INFO("%s: f_norm_eps = %.1e\n", __func__, hparams.f_norm_eps);
|
|
7230
|
+
LLAMA_LOG_INFO("%s: f_norm_rms_eps = %.1e\n", __func__, hparams.f_norm_rms_eps);
|
|
7231
|
+
LLAMA_LOG_INFO("%s: f_clamp_kqv = %.1e\n", __func__, hparams.f_clamp_kqv);
|
|
7232
|
+
LLAMA_LOG_INFO("%s: f_max_alibi_bias = %.1e\n", __func__, hparams.f_max_alibi_bias);
|
|
7233
|
+
LLAMA_LOG_INFO("%s: f_logit_scale = %.1e\n", __func__, hparams.f_logit_scale);
|
|
7234
|
+
LLAMA_LOG_INFO("%s: f_attn_scale = %.1e\n", __func__, hparams.f_attention_scale);
|
|
7235
|
+
LLAMA_LOG_INFO("%s: n_ff = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_ff(il); }, hparams.n_layer).c_str());
|
|
7236
|
+
LLAMA_LOG_INFO("%s: n_expert = %u\n", __func__, hparams.n_expert);
|
|
7237
|
+
LLAMA_LOG_INFO("%s: n_expert_used = %u\n", __func__, hparams.n_expert_used);
|
|
7238
|
+
LLAMA_LOG_INFO("%s: n_expert_groups = %d\n", __func__, hparams.n_expert_groups);
|
|
7239
|
+
LLAMA_LOG_INFO("%s: n_group_used = %d\n", __func__, hparams.n_group_used);
|
|
7240
|
+
LLAMA_LOG_INFO("%s: causal attn = %d\n", __func__, hparams.causal_attn);
|
|
7241
|
+
LLAMA_LOG_INFO("%s: pooling type = %d\n", __func__, hparams.pooling_type);
|
|
7242
|
+
LLAMA_LOG_INFO("%s: rope type = %d\n", __func__, hparams.rope_type);
|
|
7243
|
+
LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type.c_str());
|
|
7244
|
+
LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train);
|
|
7245
|
+
LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);
|
|
7128
7246
|
if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
|
|
7129
|
-
LLAMA_LOG_INFO("%s: freq_base_swa
|
|
7130
|
-
LLAMA_LOG_INFO("%s: freq_scale_swa
|
|
7247
|
+
LLAMA_LOG_INFO("%s: freq_base_swa = %.1f\n", __func__, hparams.rope_freq_base_train_swa);
|
|
7248
|
+
LLAMA_LOG_INFO("%s: freq_scale_swa = %g\n", __func__, hparams.rope_freq_scale_train_swa);
|
|
7131
7249
|
}
|
|
7132
|
-
LLAMA_LOG_INFO("%s: n_ctx_orig_yarn
|
|
7133
|
-
LLAMA_LOG_INFO("%s: rope_yarn_log_mul= %.4f\n", __func__, hparams.rope_yarn_log_mul);
|
|
7134
|
-
LLAMA_LOG_INFO("%s: rope_finetuned
|
|
7250
|
+
LLAMA_LOG_INFO("%s: n_ctx_orig_yarn = %u\n", __func__, hparams.n_ctx_orig_yarn);
|
|
7251
|
+
LLAMA_LOG_INFO("%s: rope_yarn_log_mul = %.4f\n", __func__, hparams.rope_yarn_log_mul);
|
|
7252
|
+
LLAMA_LOG_INFO("%s: rope_finetuned = %s\n", __func__, hparams.rope_finetuned ? "yes" : "unknown");
|
|
7135
7253
|
// MRoPE (Multi-axis Rotary Position Embedding) sections
|
|
7136
7254
|
if (const auto & s = hparams.rope_sections; s[0] || s[1] || s[2] || s[3]) {
|
|
7137
|
-
LLAMA_LOG_INFO("%s: mrope sections
|
|
7255
|
+
LLAMA_LOG_INFO("%s: mrope sections = [%d, %d, %d, %d]\n", __func__, s[0], s[1], s[2], s[3]);
|
|
7138
7256
|
}
|
|
7139
7257
|
if (!classifier_labels.empty()) {
|
|
7140
|
-
LLAMA_LOG_INFO("%s: n_cls_out
|
|
7258
|
+
LLAMA_LOG_INFO("%s: n_cls_out = %u\n", __func__, hparams.n_cls_out);
|
|
7141
7259
|
|
|
7142
7260
|
size_t i = 0;
|
|
7143
7261
|
for (auto label : classifier_labels) {
|
|
7144
|
-
LLAMA_LOG_INFO("%s: cls_label[%2zu]
|
|
7262
|
+
LLAMA_LOG_INFO("%s: cls_label[%2zu] = %s\n", __func__, i++, label.c_str());
|
|
7145
7263
|
}
|
|
7146
7264
|
}
|
|
7147
7265
|
}
|
|
@@ -7155,55 +7273,55 @@ void llama_model::print_info() const {
|
|
|
7155
7273
|
arch == LLM_ARCH_QWEN3NEXT ||
|
|
7156
7274
|
arch == LLM_ARCH_NEMOTRON_H ||
|
|
7157
7275
|
arch == LLM_ARCH_NEMOTRON_H_MOE) {
|
|
7158
|
-
LLAMA_LOG_INFO("%s: ssm_d_conv
|
|
7159
|
-
LLAMA_LOG_INFO("%s: ssm_d_inner
|
|
7160
|
-
LLAMA_LOG_INFO("%s: ssm_d_state
|
|
7161
|
-
LLAMA_LOG_INFO("%s: ssm_dt_rank
|
|
7162
|
-
LLAMA_LOG_INFO("%s: ssm_n_group
|
|
7163
|
-
LLAMA_LOG_INFO("%s: ssm_dt_b_c_rms
|
|
7276
|
+
LLAMA_LOG_INFO("%s: ssm_d_conv = %u\n", __func__, hparams.ssm_d_conv);
|
|
7277
|
+
LLAMA_LOG_INFO("%s: ssm_d_inner = %u\n", __func__, hparams.ssm_d_inner);
|
|
7278
|
+
LLAMA_LOG_INFO("%s: ssm_d_state = %u\n", __func__, hparams.ssm_d_state);
|
|
7279
|
+
LLAMA_LOG_INFO("%s: ssm_dt_rank = %u\n", __func__, hparams.ssm_dt_rank);
|
|
7280
|
+
LLAMA_LOG_INFO("%s: ssm_n_group = %u\n", __func__, hparams.ssm_n_group);
|
|
7281
|
+
LLAMA_LOG_INFO("%s: ssm_dt_b_c_rms = %d\n", __func__, hparams.ssm_dt_b_c_rms);
|
|
7164
7282
|
}
|
|
7165
7283
|
|
|
7166
|
-
LLAMA_LOG_INFO("%s: model type
|
|
7284
|
+
LLAMA_LOG_INFO("%s: model type = %s\n", __func__, type_name().c_str());
|
|
7167
7285
|
if (pimpl->n_elements >= 1e12) {
|
|
7168
|
-
LLAMA_LOG_INFO("%s: model params
|
|
7286
|
+
LLAMA_LOG_INFO("%s: model params = %.2f T\n", __func__, pimpl->n_elements*1e-12);
|
|
7169
7287
|
} else if (pimpl->n_elements >= 1e9) {
|
|
7170
|
-
LLAMA_LOG_INFO("%s: model params
|
|
7288
|
+
LLAMA_LOG_INFO("%s: model params = %.2f B\n", __func__, pimpl->n_elements*1e-9);
|
|
7171
7289
|
} else if (pimpl->n_elements >= 1e6) {
|
|
7172
|
-
LLAMA_LOG_INFO("%s: model params
|
|
7290
|
+
LLAMA_LOG_INFO("%s: model params = %.2f M\n", __func__, pimpl->n_elements*1e-6);
|
|
7173
7291
|
} else {
|
|
7174
|
-
LLAMA_LOG_INFO("%s: model params
|
|
7292
|
+
LLAMA_LOG_INFO("%s: model params = %.2f K\n", __func__, pimpl->n_elements*1e-3);
|
|
7175
7293
|
}
|
|
7176
7294
|
|
|
7177
7295
|
// general kv
|
|
7178
|
-
LLAMA_LOG_INFO("%s: general.name
|
|
7296
|
+
LLAMA_LOG_INFO("%s: general.name = %s\n", __func__, name.c_str());
|
|
7179
7297
|
|
|
7180
7298
|
if (arch == LLM_ARCH_DEEPSEEK) {
|
|
7181
|
-
LLAMA_LOG_INFO("%s: n_layer_dense_lead
|
|
7182
|
-
LLAMA_LOG_INFO("%s: n_ff_exp
|
|
7183
|
-
LLAMA_LOG_INFO("%s: n_expert_shared
|
|
7184
|
-
LLAMA_LOG_INFO("%s: expert_weights_scale
|
|
7299
|
+
LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
|
|
7300
|
+
LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
|
|
7301
|
+
LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
|
|
7302
|
+
LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
|
|
7185
7303
|
}
|
|
7186
7304
|
|
|
7187
7305
|
if (arch == LLM_ARCH_DEEPSEEK2) {
|
|
7188
|
-
LLAMA_LOG_INFO("%s: n_layer_dense_lead
|
|
7189
|
-
LLAMA_LOG_INFO("%s: n_lora_q
|
|
7190
|
-
LLAMA_LOG_INFO("%s: n_lora_kv
|
|
7191
|
-
LLAMA_LOG_INFO("%s: n_embd_head_k_mla
|
|
7192
|
-
LLAMA_LOG_INFO("%s: n_embd_head_v_mla
|
|
7193
|
-
LLAMA_LOG_INFO("%s: n_ff_exp
|
|
7194
|
-
LLAMA_LOG_INFO("%s: n_expert_shared
|
|
7195
|
-
LLAMA_LOG_INFO("%s: expert_weights_scale
|
|
7196
|
-
LLAMA_LOG_INFO("%s: expert_weights_norm
|
|
7197
|
-
LLAMA_LOG_INFO("%s: expert_gating_func
|
|
7306
|
+
LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
|
|
7307
|
+
LLAMA_LOG_INFO("%s: n_lora_q = %d\n", __func__, hparams.n_lora_q);
|
|
7308
|
+
LLAMA_LOG_INFO("%s: n_lora_kv = %d\n", __func__, hparams.n_lora_kv);
|
|
7309
|
+
LLAMA_LOG_INFO("%s: n_embd_head_k_mla = %d\n", __func__, hparams.n_embd_head_k_mla);
|
|
7310
|
+
LLAMA_LOG_INFO("%s: n_embd_head_v_mla = %d\n", __func__, hparams.n_embd_head_v_mla);
|
|
7311
|
+
LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
|
|
7312
|
+
LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
|
|
7313
|
+
LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
|
|
7314
|
+
LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm);
|
|
7315
|
+
LLAMA_LOG_INFO("%s: expert_gating_func = %s\n", __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func));
|
|
7198
7316
|
}
|
|
7199
7317
|
|
|
7200
7318
|
if (arch == LLM_ARCH_QWEN2MOE) {
|
|
7201
|
-
LLAMA_LOG_INFO("%s: n_ff_exp
|
|
7202
|
-
LLAMA_LOG_INFO("%s: n_ff_shexp
|
|
7319
|
+
LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
|
|
7320
|
+
LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
|
|
7203
7321
|
}
|
|
7204
7322
|
|
|
7205
7323
|
if (arch == LLM_ARCH_QWEN3MOE || arch == LLM_ARCH_OPENAI_MOE || arch == LLM_ARCH_QWEN3VLMOE || arch == LLM_ARCH_RND1) {
|
|
7206
|
-
LLAMA_LOG_INFO("%s: n_ff_exp
|
|
7324
|
+
LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
|
|
7207
7325
|
}
|
|
7208
7326
|
|
|
7209
7327
|
if (arch == LLM_ARCH_MINICPM ||
|
|
@@ -7211,41 +7329,41 @@ void llama_model::print_info() const {
|
|
|
7211
7329
|
arch == LLM_ARCH_GRANITE_MOE ||
|
|
7212
7330
|
arch == LLM_ARCH_GRANITE_HYBRID ||
|
|
7213
7331
|
arch == LLM_ARCH_NEMOTRON_H_MOE) {
|
|
7214
|
-
LLAMA_LOG_INFO("%s: f_embedding_scale
|
|
7215
|
-
LLAMA_LOG_INFO("%s: f_residual_scale
|
|
7216
|
-
LLAMA_LOG_INFO("%s: f_attention_scale
|
|
7217
|
-
LLAMA_LOG_INFO("%s: n_ff_shexp
|
|
7332
|
+
LLAMA_LOG_INFO("%s: f_embedding_scale = %f\n", __func__, hparams.f_embedding_scale);
|
|
7333
|
+
LLAMA_LOG_INFO("%s: f_residual_scale = %f\n", __func__, hparams.f_residual_scale);
|
|
7334
|
+
LLAMA_LOG_INFO("%s: f_attention_scale = %f\n", __func__, hparams.f_attention_scale);
|
|
7335
|
+
LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
|
|
7218
7336
|
}
|
|
7219
7337
|
|
|
7220
7338
|
if (arch == LLM_ARCH_BAILINGMOE) {
|
|
7221
|
-
LLAMA_LOG_INFO("%s: n_layer_dense_lead
|
|
7222
|
-
LLAMA_LOG_INFO("%s: n_ff_exp
|
|
7223
|
-
LLAMA_LOG_INFO("%s: n_expert_shared
|
|
7224
|
-
LLAMA_LOG_INFO("%s: expert_weights_scale
|
|
7225
|
-
LLAMA_LOG_INFO("%s: expert_weights_norm
|
|
7339
|
+
LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
|
|
7340
|
+
LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
|
|
7341
|
+
LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
|
|
7342
|
+
LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
|
|
7343
|
+
LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm);
|
|
7226
7344
|
}
|
|
7227
7345
|
|
|
7228
7346
|
if (arch == LLM_ARCH_BAILINGMOE2) {
|
|
7229
|
-
LLAMA_LOG_INFO("%s: n_layer_dense_lead
|
|
7230
|
-
LLAMA_LOG_INFO("%s: n_ff_exp
|
|
7231
|
-
LLAMA_LOG_INFO("%s: n_ff_shexp
|
|
7232
|
-
LLAMA_LOG_INFO("%s: n_expert_shared
|
|
7233
|
-
LLAMA_LOG_INFO("%s: expert_weights_scale
|
|
7234
|
-
LLAMA_LOG_INFO("%s: expert_weights_norm
|
|
7235
|
-
LLAMA_LOG_INFO("%s: expert_gating_func
|
|
7236
|
-
LLAMA_LOG_INFO("%s: nextn_predict_layers
|
|
7347
|
+
LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
|
|
7348
|
+
LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
|
|
7349
|
+
LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
|
|
7350
|
+
LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
|
|
7351
|
+
LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
|
|
7352
|
+
LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm);
|
|
7353
|
+
LLAMA_LOG_INFO("%s: expert_gating_func = %s\n", __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func));
|
|
7354
|
+
LLAMA_LOG_INFO("%s: nextn_predict_layers = %d\n", __func__, hparams.nextn_predict_layers);
|
|
7237
7355
|
}
|
|
7238
7356
|
|
|
7239
7357
|
if (arch == LLM_ARCH_SMALLTHINKER || arch == LLM_ARCH_LFM2MOE) {
|
|
7240
|
-
LLAMA_LOG_INFO("%s: n_ff_exp
|
|
7241
|
-
LLAMA_LOG_INFO("%s: expert_gating_func
|
|
7358
|
+
LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
|
|
7359
|
+
LLAMA_LOG_INFO("%s: expert_gating_func = %s\n", __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func));
|
|
7242
7360
|
}
|
|
7243
7361
|
|
|
7244
7362
|
if (arch == LLM_ARCH_GROVEMOE) {
|
|
7245
|
-
LLAMA_LOG_INFO("%s: n_ff_exp
|
|
7246
|
-
LLAMA_LOG_INFO("%s: n_ff_chexp
|
|
7247
|
-
LLAMA_LOG_INFO("%s: n_group_experts
|
|
7248
|
-
LLAMA_LOG_INFO("%s: expert_group_scale
|
|
7363
|
+
LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
|
|
7364
|
+
LLAMA_LOG_INFO("%s: n_ff_chexp = %d\n", __func__, hparams.n_ff_chexp);
|
|
7365
|
+
LLAMA_LOG_INFO("%s: n_group_experts = %d\n", __func__, hparams.n_group_experts);
|
|
7366
|
+
LLAMA_LOG_INFO("%s: expert_group_scale = %.2f\n", __func__, hparams.expert_group_scale);
|
|
7249
7367
|
}
|
|
7250
7368
|
|
|
7251
7369
|
vocab.print_info();
|
|
@@ -7799,6 +7917,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
|
|
|
7799
7917
|
llm = std::make_unique<llm_build_exaone4<false>>(*this, params);
|
|
7800
7918
|
}
|
|
7801
7919
|
} break;
|
|
7920
|
+
case LLM_ARCH_EXAONE_MOE:
|
|
7921
|
+
{
|
|
7922
|
+
llm = std::make_unique<llm_build_exaone_moe>(*this, params);
|
|
7923
|
+
} break;
|
|
7802
7924
|
case LLM_ARCH_RWKV6:
|
|
7803
7925
|
{
|
|
7804
7926
|
llm = std::make_unique<llm_build_rwkv6>(*this, params);
|
|
@@ -7973,6 +8095,7 @@ llama_model_params llama_model_default_params() {
|
|
|
7973
8095
|
/*.kv_overrides =*/ nullptr,
|
|
7974
8096
|
/*.vocab_only =*/ false,
|
|
7975
8097
|
/*.use_mmap =*/ true,
|
|
8098
|
+
/*.use_direct_io =*/ true,
|
|
7976
8099
|
/*.use_mlock =*/ false,
|
|
7977
8100
|
/*.check_tensors =*/ false,
|
|
7978
8101
|
/*.use_extra_bufts =*/ true,
|
|
@@ -8158,6 +8281,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
|
|
|
8158
8281
|
case LLM_ARCH_NEMOTRON:
|
|
8159
8282
|
case LLM_ARCH_EXAONE:
|
|
8160
8283
|
case LLM_ARCH_EXAONE4:
|
|
8284
|
+
case LLM_ARCH_EXAONE_MOE:
|
|
8161
8285
|
case LLM_ARCH_MINICPM3:
|
|
8162
8286
|
case LLM_ARCH_BAILINGMOE2:
|
|
8163
8287
|
case LLM_ARCH_DOTS1:
|
|
@@ -596,7 +596,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
|
|
|
596
596
|
}
|
|
597
597
|
|
|
598
598
|
std::vector<std::string> splits = {};
|
|
599
|
-
llama_model_loader ml(fname_inp, splits, use_mmap, /*check_tensors*/ true, /*no_alloc*/ false, kv_overrides, nullptr);
|
|
599
|
+
llama_model_loader ml(fname_inp, splits, use_mmap, /*use_direct_io*/ true, /*check_tensors*/ true, /*no_alloc*/ false, kv_overrides, nullptr);
|
|
600
600
|
ml.init_mappings(false); // no prefetching
|
|
601
601
|
|
|
602
602
|
llama_model model(llama_model_default_params());
|
|
@@ -461,6 +461,13 @@ struct llm_tokenizer_bpe : llm_tokenizer {
|
|
|
461
461
|
"[!\"#$%&'()*+,\\-./:;<=>?@\\[\\\\\\]^_`{|}~][A-Za-z]+|[^\\r\\n\\p{L}\\p{P}\\p{S}]?[\\p{L}\\p{M}]+| ?[\\p{P}\\p{S}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
|
|
462
462
|
};
|
|
463
463
|
break;
|
|
464
|
+
case LLAMA_VOCAB_PRE_TYPE_EXAONE_MOE:
|
|
465
|
+
regex_exprs = {
|
|
466
|
+
// original regex from tokenizer.json
|
|
467
|
+
// "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?(?:\\p{L}\\p{M}*(?: \\p{L}\\p{M}*)*)+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]?|\\s*[\\r\\n]|\\s+(?!\\S)|\\s+"
|
|
468
|
+
"(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?(?:\\p{L}\\p{M}*(?: \\p{L}\\p{M}*)*)+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]?|\\s*[\\r\\n]|\\s+(?!\\S)|\\s+",
|
|
469
|
+
};
|
|
470
|
+
break;
|
|
464
471
|
default:
|
|
465
472
|
// default regex for BPE tokenization pre-processing
|
|
466
473
|
regex_exprs = {
|
|
@@ -1965,6 +1972,9 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
|
|
1965
1972
|
} else if (
|
|
1966
1973
|
tokenizer_pre == "exaone4") {
|
|
1967
1974
|
pre_type = LLAMA_VOCAB_PRE_TYPE_GPT2;
|
|
1975
|
+
} else if (
|
|
1976
|
+
tokenizer_pre == "exaone-moe") {
|
|
1977
|
+
pre_type = LLAMA_VOCAB_PRE_TYPE_EXAONE_MOE;
|
|
1968
1978
|
} else if (
|
|
1969
1979
|
tokenizer_pre == "chameleon") {
|
|
1970
1980
|
pre_type = LLAMA_VOCAB_PRE_TYPE_CHAMELEON;
|
|
@@ -2436,7 +2446,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
|
|
2436
2446
|
auto & attr = id_to_token[t.second].attr;
|
|
2437
2447
|
|
|
2438
2448
|
if (t.first == "<|channel|>" || t.first == "<|message|>" || t.first == "<|start|>" || t.first == "<|constrain|>") {
|
|
2439
|
-
|
|
2449
|
+
LLAMA_LOG_WARN("%s: setting token '%s' (%d) attribute to USER_DEFINED (%u), old attributes: %u\n",
|
|
2450
|
+
__func__, t.first.c_str(), t.second, LLAMA_TOKEN_ATTR_USER_DEFINED, attr);
|
|
2451
|
+
|
|
2452
|
+
attr = LLAMA_TOKEN_ATTR_USER_DEFINED;
|
|
2440
2453
|
}
|
|
2441
2454
|
}
|
|
2442
2455
|
|
|
@@ -2489,7 +2502,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
|
|
2489
2502
|
special_eog_ids.erase(end_id);
|
|
2490
2503
|
|
|
2491
2504
|
auto & attr = id_to_token[end_id].attr;
|
|
2492
|
-
attr =
|
|
2505
|
+
attr = LLAMA_TOKEN_ATTR_USER_DEFINED;
|
|
2493
2506
|
|
|
2494
2507
|
LLAMA_LOG_WARN("%s: special_eog_ids contains both '<|return|>' and '<|call|>', or '<|calls|>' and '<|flush|>' tokens, removing '<|end|>' token from EOG list\n", __func__);
|
|
2495
2508
|
}
|
|
@@ -3289,34 +3302,34 @@ int32_t llama_vocab::impl::detokenize(
|
|
|
3289
3302
|
}
|
|
3290
3303
|
|
|
3291
3304
|
void llama_vocab::impl::print_info() const {
|
|
3292
|
-
LLAMA_LOG_INFO("%s: vocab type
|
|
3293
|
-
LLAMA_LOG_INFO("%s: n_vocab
|
|
3294
|
-
LLAMA_LOG_INFO("%s: n_merges
|
|
3305
|
+
LLAMA_LOG_INFO("%s: vocab type = %s\n", __func__, type_name().c_str());
|
|
3306
|
+
LLAMA_LOG_INFO("%s: n_vocab = %u\n", __func__, vocab.n_tokens());
|
|
3307
|
+
LLAMA_LOG_INFO("%s: n_merges = %u\n", __func__, (uint32_t) bpe_ranks.size());
|
|
3295
3308
|
|
|
3296
3309
|
// special tokens
|
|
3297
|
-
if (special_bos_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: BOS token
|
|
3298
|
-
if (special_eos_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOS token
|
|
3299
|
-
if (special_eot_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOT token
|
|
3300
|
-
if (special_eom_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOM token
|
|
3301
|
-
if (special_unk_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: UNK token
|
|
3302
|
-
if (special_sep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: SEP token
|
|
3303
|
-
if (special_pad_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: PAD token
|
|
3304
|
-
if (special_mask_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: MASK token
|
|
3305
|
-
|
|
3306
|
-
if (linefeed_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: LF token
|
|
3307
|
-
|
|
3308
|
-
if (special_fim_pre_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM PRE token
|
|
3309
|
-
if (special_fim_suf_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM SUF token
|
|
3310
|
-
if (special_fim_mid_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM MID token
|
|
3311
|
-
if (special_fim_pad_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM PAD token
|
|
3312
|
-
if (special_fim_rep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM REP token
|
|
3313
|
-
if (special_fim_sep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM SEP token
|
|
3310
|
+
if (special_bos_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, special_bos_id, id_to_token.at(special_bos_id).text.c_str() ); }
|
|
3311
|
+
if (special_eos_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, special_eos_id, id_to_token.at(special_eos_id).text.c_str() ); }
|
|
3312
|
+
if (special_eot_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOT token = %d '%s'\n", __func__, special_eot_id, id_to_token.at(special_eot_id).text.c_str() ); }
|
|
3313
|
+
if (special_eom_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOM token = %d '%s'\n", __func__, special_eom_id, id_to_token.at(special_eom_id).text.c_str() ); }
|
|
3314
|
+
if (special_unk_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, special_unk_id, id_to_token.at(special_unk_id).text.c_str() ); }
|
|
3315
|
+
if (special_sep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, special_sep_id, id_to_token.at(special_sep_id).text.c_str() ); }
|
|
3316
|
+
if (special_pad_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, special_pad_id, id_to_token.at(special_pad_id).text.c_str() ); }
|
|
3317
|
+
if (special_mask_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: MASK token = %d '%s'\n", __func__, special_mask_id, id_to_token.at(special_mask_id).text.c_str() ); }
|
|
3318
|
+
|
|
3319
|
+
if (linefeed_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, linefeed_id, id_to_token.at(linefeed_id).text.c_str() ); }
|
|
3320
|
+
|
|
3321
|
+
if (special_fim_pre_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM PRE token = %d '%s'\n", __func__, special_fim_pre_id, id_to_token.at(special_fim_pre_id).text.c_str() ); }
|
|
3322
|
+
if (special_fim_suf_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM SUF token = %d '%s'\n", __func__, special_fim_suf_id, id_to_token.at(special_fim_suf_id).text.c_str() ); }
|
|
3323
|
+
if (special_fim_mid_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM MID token = %d '%s'\n", __func__, special_fim_mid_id, id_to_token.at(special_fim_mid_id).text.c_str() ); }
|
|
3324
|
+
if (special_fim_pad_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM PAD token = %d '%s'\n", __func__, special_fim_pad_id, id_to_token.at(special_fim_pad_id).text.c_str() ); }
|
|
3325
|
+
if (special_fim_rep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM REP token = %d '%s'\n", __func__, special_fim_rep_id, id_to_token.at(special_fim_rep_id).text.c_str() ); }
|
|
3326
|
+
if (special_fim_sep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM SEP token = %d '%s'\n", __func__, special_fim_sep_id, id_to_token.at(special_fim_sep_id).text.c_str() ); }
|
|
3314
3327
|
|
|
3315
3328
|
for (const auto & id : special_eog_ids) {
|
|
3316
|
-
LLAMA_LOG_INFO( "%s: EOG token
|
|
3329
|
+
LLAMA_LOG_INFO( "%s: EOG token = %d '%s'\n", __func__, id, id_to_token.at(id).text.c_str() );
|
|
3317
3330
|
}
|
|
3318
3331
|
|
|
3319
|
-
LLAMA_LOG_INFO("%s: max token length
|
|
3332
|
+
LLAMA_LOG_INFO("%s: max token length = %d\n", __func__, max_token_len);
|
|
3320
3333
|
}
|
|
3321
3334
|
|
|
3322
3335
|
llama_vocab::llama_vocab() : pimpl(new impl(*this)) {
|