@fugood/llama.node 1.4.13 → 1.4.15

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44):
  1. package/lib/binding.ts +23 -2
  2. package/lib/index.js +2 -1
  3. package/lib/index.ts +8 -1
  4. package/lib/parallel.ts +2 -2
  5. package/package.json +15 -15
  6. package/scripts/llama.cpp.patch +9 -12
  7. package/src/LlamaContext.cpp +16 -4
  8. package/src/llama.cpp/CMakeLists.txt +24 -8
  9. package/src/llama.cpp/common/CMakeLists.txt +3 -34
  10. package/src/llama.cpp/common/arg.cpp +183 -60
  11. package/src/llama.cpp/common/arg.h +0 -8
  12. package/src/llama.cpp/common/chat-parser.cpp +115 -0
  13. package/src/llama.cpp/common/chat.cpp +67 -0
  14. package/src/llama.cpp/common/chat.h +1 -0
  15. package/src/llama.cpp/common/common.cpp +2 -1
  16. package/src/llama.cpp/common/common.h +12 -7
  17. package/src/llama.cpp/common/debug.cpp +165 -0
  18. package/src/llama.cpp/common/debug.h +43 -0
  19. package/src/llama.cpp/common/download.cpp +88 -369
  20. package/src/llama.cpp/common/download.h +32 -5
  21. package/src/llama.cpp/common/preset.cpp +87 -2
  22. package/src/llama.cpp/common/preset.h +10 -1
  23. package/src/llama.cpp/ggml/include/ggml.h +5 -0
  24. package/src/llama.cpp/include/llama.h +5 -2
  25. package/src/llama.cpp/src/CMakeLists.txt +1 -0
  26. package/src/llama.cpp/src/llama-arch.cpp +35 -0
  27. package/src/llama.cpp/src/llama-arch.h +1 -0
  28. package/src/llama.cpp/src/llama-chat.cpp +20 -0
  29. package/src/llama.cpp/src/llama-chat.h +1 -0
  30. package/src/llama.cpp/src/llama-graph.cpp +31 -43
  31. package/src/llama.cpp/src/llama-mmap.cpp +78 -42
  32. package/src/llama.cpp/src/llama-mmap.h +5 -4
  33. package/src/llama.cpp/src/llama-model-loader.cpp +17 -5
  34. package/src/llama.cpp/src/llama-model-loader.h +2 -0
  35. package/src/llama.cpp/src/llama-model.cpp +225 -101
  36. package/src/llama.cpp/src/llama-quant.cpp +1 -1
  37. package/src/llama.cpp/src/llama-sampling.cpp +1 -1
  38. package/src/llama.cpp/src/llama-vocab.cpp +37 -24
  39. package/src/llama.cpp/src/llama-vocab.h +1 -0
  40. package/src/llama.cpp/src/llama.cpp +63 -27
  41. package/src/llama.cpp/src/models/exaone-moe.cpp +146 -0
  42. package/src/llama.cpp/src/models/gemma3n-iswa.cpp +13 -3
  43. package/src/llama.cpp/src/models/models.h +13 -2
  44. package/src/llama.cpp/src/models/qwen3next.cpp +198 -182
@@ -446,7 +446,7 @@ struct llama_model::impl {
446
446
  llama_mlocks mlock_bufs;
447
447
  llama_mlocks mlock_mmaps;
448
448
 
449
- // contexts where the model tensors metadata is stored as well ass the corresponding buffers:
449
+ // contexts where the model tensors metadata is stored as well as the corresponding buffers:
450
450
  std::vector<std::pair<ggml_context_ptr, std::vector<ggml_backend_buffer_ptr>>> ctxs_bufs;
451
451
 
452
452
  buft_list_t cpu_buft_list;
@@ -1933,6 +1933,34 @@ void llama_model::load_hparams(llama_model_loader & ml) {
1933
1933
  default: type = LLM_TYPE_UNKNOWN;
1934
1934
  }
1935
1935
  } break;
1936
+ case LLM_ARCH_EXAONE_MOE:
1937
+ {
1938
+ hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
1939
+ hparams.n_swa = 128;
1940
+ hparams.set_swa_pattern(4);
1941
+ hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
1942
+ hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
1943
+
1944
+ ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
1945
+ ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
1946
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1947
+ ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared, false);
1948
+ ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
1949
+ ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
1950
+ ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func);
1951
+ ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false);
1952
+ ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
1953
+ ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
1954
+
1955
+ ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false);
1956
+
1957
+ switch (hparams.n_layer) {
1958
+ case 32: type = LLM_TYPE_30B_A3B; break;
1959
+ case 48:
1960
+ case 49: type = LLM_TYPE_235B_A22B; break;
1961
+ default: type = LLM_TYPE_UNKNOWN;
1962
+ }
1963
+ } break;
1936
1964
  case LLM_ARCH_RWKV6:
1937
1965
  case LLM_ARCH_RWKV6QWEN2:
1938
1966
  {
@@ -2440,7 +2468,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
2440
2468
 
2441
2469
  const bool use_mmap_buffer = true;
2442
2470
 
2443
- LLAMA_LOG_INFO("%s: loading model tensors, this can take a while... (mmap = %s)\n", __func__, ml.use_mmap ? "true" : "false");
2471
+ LLAMA_LOG_INFO("%s: loading model tensors, this can take a while... (mmap = %s, direct_io = %s)\n",
2472
+ __func__, ml.use_mmap ? "true" : "false", ml.use_direct_io ? "true" : "false");
2444
2473
 
2445
2474
  // build a list of buffer types for the CPU and GPU devices
2446
2475
  pimpl->cpu_buft_list = make_cpu_buft_list(devices, params.use_extra_bufts, params.no_host);
@@ -2451,6 +2480,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
2451
2480
  pimpl->gpu_buft_list.emplace(dev, std::move(buft_list));
2452
2481
  }
2453
2482
 
2483
+ ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
2484
+ if (cpu_dev == nullptr) {
2485
+ throw std::runtime_error(format("%s: no CPU backend found", __func__));
2486
+ }
2487
+
2454
2488
  // calculate the split points
2455
2489
  bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + n_devices(), [](float x) { return x == 0.0f; });
2456
2490
  std::vector<float> splits(n_devices());
@@ -2461,6 +2495,13 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
2461
2495
  size_t total;
2462
2496
  size_t free;
2463
2497
  ggml_backend_dev_memory(dev, &free, &total);
2498
+
2499
+ // devices can return 0 bytes for free and total memory if they do not
2500
+ // have any to report. in this case, we will use the host memory as a fallback
2501
+ // fixes: https://github.com/ggml-org/llama.cpp/issues/18577
2502
+ if (free == 0 && total == 0) {
2503
+ ggml_backend_dev_memory(cpu_dev, &free, &total);
2504
+ }
2464
2505
  splits[i] = free;
2465
2506
  }
2466
2507
  } else {
@@ -2477,10 +2518,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
2477
2518
  splits[i] /= split_sum;
2478
2519
  }
2479
2520
 
2480
- ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
2481
- if (cpu_dev == nullptr) {
2482
- throw std::runtime_error(format("%s: no CPU backend found", __func__));
2483
- }
2484
2521
  const int i_gpu_start = std::max(int(hparams.n_layer) + 1 - n_gpu_layers, 0);
2485
2522
  const int act_gpu_layers = devices.empty() ? 0 : std::min(n_gpu_layers, int(n_layer) + 1);
2486
2523
  auto get_layer_buft_list = [&](int il) -> llama_model::impl::layer_dev {
@@ -5507,6 +5544,84 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
5507
5544
  layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
5508
5545
  }
5509
5546
  } break;
5547
+ case LLM_ARCH_EXAONE_MOE:
5548
+ {
5549
+ const int64_t n_ff_exp = hparams.n_ff_exp;
5550
+ const int64_t n_expert = hparams.n_expert;
5551
+ const int64_t n_expert_used = hparams.n_expert_used;
5552
+ const int64_t n_ff_shexp = hparams.n_ff_shexp;
5553
+ const int64_t head_dim = hparams.n_embd_head_k;
5554
+ const int64_t n_qo_dim = n_head * head_dim;
5555
+ const int64_t n_kv_dim = n_head_kv * head_dim;
5556
+
5557
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
5558
+
5559
+ // output
5560
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
5561
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
5562
+
5563
+ if (output == NULL) {
5564
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
5565
+ }
5566
+
5567
+ for (int i = 0; i < n_layer; ++i) {
5568
+ int flags = 0;
5569
+ if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
5570
+ // skip all tensors in the NextN layers
5571
+ flags |= TENSOR_SKIP;
5572
+ }
5573
+
5574
+ auto & layer = layers[i];
5575
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_qo_dim}, flags);
5576
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_kv_dim}, flags);
5577
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_kv_dim}, flags);
5578
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_qo_dim, n_embd}, flags);
5579
+
5580
+ layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0) | flags);
5581
+
5582
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, flags);
5583
+ layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, flags);
5584
+ layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, flags);
5585
+
5586
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, flags);
5587
+
5588
+ // dense layers for first n_layer_dense_lead layers or nextn_predict_layers layers at the end
5589
+ if (i < (int) hparams.n_layer_dense_lead || (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers)) {
5590
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, flags);
5591
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, flags);
5592
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, flags);
5593
+ } else {
5594
+ layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, flags);
5595
+ layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED | flags);
5596
+
5597
+ if (n_expert == 0) {
5598
+ throw std::runtime_error("n_expert must be > 0");
5599
+ }
5600
+ if (n_expert_used == 0) {
5601
+ throw std::runtime_error("n_expert_used must be > 0");
5602
+ }
5603
+
5604
+ layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, flags);
5605
+ layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, flags);
5606
+ layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, flags);
5607
+
5608
+ layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_shexp}, flags);
5609
+ layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd}, flags);
5610
+ layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_shexp}, flags);
5611
+ }
5612
+
5613
+ // NextN/MTP tensors (preserved but unused) - conditionally load for last nextn_predict_layers
5614
+ if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
5615
+ layer.nextn.eh_proj = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i), {2 * n_embd, n_embd}, flags);
5616
+ layer.nextn.enorm = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM, "weight", i), {n_embd}, flags);
5617
+ layer.nextn.hnorm = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM, "weight", i), {n_embd}, flags);
5618
+
5619
+ layer.nextn.shared_head_norm = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "weight", i), {n_embd}, flags | TENSOR_NOT_REQUIRED);
5620
+ layer.nextn.embed_tokens = create_tensor(tn(LLM_TENSOR_NEXTN_EMBED_TOKENS, "weight", i), {n_embd, n_vocab}, flags | TENSOR_NOT_REQUIRED);
5621
+ layer.nextn.shared_head_head = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "weight", i), {n_embd, n_vocab}, flags | TENSOR_NOT_REQUIRED);
5622
+ }
5623
+ }
5624
+ } break;
5510
5625
  case LLM_ARCH_RWKV6:
5511
5626
  {
5512
5627
  tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -6754,7 +6869,10 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
6754
6869
  } else {
6755
6870
  // Linear attention (gated delta net) specific tensors
6756
6871
  // Create tensors with calculated dimensions
6757
- layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), { n_embd, qkvz_dim }, 0);
6872
+ // note: ssm_in is used by legacy GGUF
6873
+ layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), { n_embd, qkvz_dim }, TENSOR_NOT_REQUIRED);
6874
+ layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, key_dim * 2 + value_dim }, TENSOR_NOT_REQUIRED);
6875
+ layer.wqkv_gate = create_tensor(tn(LLM_TENSOR_ATTN_GATE, "weight", i), { n_embd, value_dim }, TENSOR_NOT_REQUIRED);
6758
6876
  layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), { hparams.ssm_d_conv, conv_dim }, 0);
6759
6877
  layer.ssm_dt = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), { hparams.ssm_dt_rank }, 0);
6760
6878
  layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A_NOSCAN, i), { hparams.ssm_dt_rank }, 0);
@@ -7089,59 +7207,59 @@ void llama_model::print_info() const {
7089
7207
  };
7090
7208
 
7091
7209
  // hparams
7092
- LLAMA_LOG_INFO("%s: arch = %s\n", __func__, arch_name().c_str());
7093
- LLAMA_LOG_INFO("%s: vocab_only = %d\n", __func__, hparams.vocab_only);
7094
- LLAMA_LOG_INFO("%s: no_alloc = %d\n", __func__, hparams.no_alloc);
7210
+ LLAMA_LOG_INFO("%s: arch = %s\n", __func__, arch_name().c_str());
7211
+ LLAMA_LOG_INFO("%s: vocab_only = %d\n", __func__, hparams.vocab_only);
7212
+ LLAMA_LOG_INFO("%s: no_alloc = %d\n", __func__, hparams.no_alloc);
7095
7213
 
7096
7214
  if (!hparams.vocab_only) {
7097
- LLAMA_LOG_INFO("%s: n_ctx_train = %u\n", __func__, hparams.n_ctx_train);
7098
- LLAMA_LOG_INFO("%s: n_embd = %u\n", __func__, hparams.n_embd);
7099
- LLAMA_LOG_INFO("%s: n_embd_inp = %u\n", __func__, hparams.n_embd_inp());
7100
- LLAMA_LOG_INFO("%s: n_layer = %u\n", __func__, hparams.n_layer);
7101
- LLAMA_LOG_INFO("%s: n_head = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_head(il); }, hparams.n_layer).c_str());
7102
- LLAMA_LOG_INFO("%s: n_head_kv = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_head_kv(il); }, hparams.n_layer).c_str());
7103
- LLAMA_LOG_INFO("%s: n_rot = %u\n", __func__, hparams.n_rot);
7104
- LLAMA_LOG_INFO("%s: n_swa = %u\n", __func__, hparams.n_swa);
7105
- LLAMA_LOG_INFO("%s: is_swa_any = %u\n", __func__, hparams.is_swa_any());
7106
- LLAMA_LOG_INFO("%s: n_embd_head_k = %u\n", __func__, hparams.n_embd_head_k);
7107
- LLAMA_LOG_INFO("%s: n_embd_head_v = %u\n", __func__, hparams.n_embd_head_v);
7108
- LLAMA_LOG_INFO("%s: n_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_gqa(il); }, hparams.n_layer).c_str());
7109
- LLAMA_LOG_INFO("%s: n_embd_k_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_embd_k_gqa(il); }, hparams.n_layer).c_str());
7110
- LLAMA_LOG_INFO("%s: n_embd_v_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_embd_v_gqa(il); }, hparams.n_layer).c_str());
7111
- LLAMA_LOG_INFO("%s: f_norm_eps = %.1e\n", __func__, hparams.f_norm_eps);
7112
- LLAMA_LOG_INFO("%s: f_norm_rms_eps = %.1e\n", __func__, hparams.f_norm_rms_eps);
7113
- LLAMA_LOG_INFO("%s: f_clamp_kqv = %.1e\n", __func__, hparams.f_clamp_kqv);
7114
- LLAMA_LOG_INFO("%s: f_max_alibi_bias = %.1e\n", __func__, hparams.f_max_alibi_bias);
7115
- LLAMA_LOG_INFO("%s: f_logit_scale = %.1e\n", __func__, hparams.f_logit_scale);
7116
- LLAMA_LOG_INFO("%s: f_attn_scale = %.1e\n", __func__, hparams.f_attention_scale);
7117
- LLAMA_LOG_INFO("%s: n_ff = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_ff(il); }, hparams.n_layer).c_str());
7118
- LLAMA_LOG_INFO("%s: n_expert = %u\n", __func__, hparams.n_expert);
7119
- LLAMA_LOG_INFO("%s: n_expert_used = %u\n", __func__, hparams.n_expert_used);
7120
- LLAMA_LOG_INFO("%s: n_expert_groups = %d\n", __func__, hparams.n_expert_groups);
7121
- LLAMA_LOG_INFO("%s: n_group_used = %d\n", __func__, hparams.n_group_used);
7122
- LLAMA_LOG_INFO("%s: causal attn = %d\n", __func__, hparams.causal_attn);
7123
- LLAMA_LOG_INFO("%s: pooling type = %d\n", __func__, hparams.pooling_type);
7124
- LLAMA_LOG_INFO("%s: rope type = %d\n", __func__, hparams.rope_type);
7125
- LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type.c_str());
7126
- LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train);
7127
- LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);
7215
+ LLAMA_LOG_INFO("%s: n_ctx_train = %u\n", __func__, hparams.n_ctx_train);
7216
+ LLAMA_LOG_INFO("%s: n_embd = %u\n", __func__, hparams.n_embd);
7217
+ LLAMA_LOG_INFO("%s: n_embd_inp = %u\n", __func__, hparams.n_embd_inp());
7218
+ LLAMA_LOG_INFO("%s: n_layer = %u\n", __func__, hparams.n_layer);
7219
+ LLAMA_LOG_INFO("%s: n_head = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_head(il); }, hparams.n_layer).c_str());
7220
+ LLAMA_LOG_INFO("%s: n_head_kv = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_head_kv(il); }, hparams.n_layer).c_str());
7221
+ LLAMA_LOG_INFO("%s: n_rot = %u\n", __func__, hparams.n_rot);
7222
+ LLAMA_LOG_INFO("%s: n_swa = %u\n", __func__, hparams.n_swa);
7223
+ LLAMA_LOG_INFO("%s: is_swa_any = %u\n", __func__, hparams.is_swa_any());
7224
+ LLAMA_LOG_INFO("%s: n_embd_head_k = %u\n", __func__, hparams.n_embd_head_k);
7225
+ LLAMA_LOG_INFO("%s: n_embd_head_v = %u\n", __func__, hparams.n_embd_head_v);
7226
+ LLAMA_LOG_INFO("%s: n_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_gqa(il); }, hparams.n_layer).c_str());
7227
+ LLAMA_LOG_INFO("%s: n_embd_k_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_embd_k_gqa(il); }, hparams.n_layer).c_str());
7228
+ LLAMA_LOG_INFO("%s: n_embd_v_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_embd_v_gqa(il); }, hparams.n_layer).c_str());
7229
+ LLAMA_LOG_INFO("%s: f_norm_eps = %.1e\n", __func__, hparams.f_norm_eps);
7230
+ LLAMA_LOG_INFO("%s: f_norm_rms_eps = %.1e\n", __func__, hparams.f_norm_rms_eps);
7231
+ LLAMA_LOG_INFO("%s: f_clamp_kqv = %.1e\n", __func__, hparams.f_clamp_kqv);
7232
+ LLAMA_LOG_INFO("%s: f_max_alibi_bias = %.1e\n", __func__, hparams.f_max_alibi_bias);
7233
+ LLAMA_LOG_INFO("%s: f_logit_scale = %.1e\n", __func__, hparams.f_logit_scale);
7234
+ LLAMA_LOG_INFO("%s: f_attn_scale = %.1e\n", __func__, hparams.f_attention_scale);
7235
+ LLAMA_LOG_INFO("%s: n_ff = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_ff(il); }, hparams.n_layer).c_str());
7236
+ LLAMA_LOG_INFO("%s: n_expert = %u\n", __func__, hparams.n_expert);
7237
+ LLAMA_LOG_INFO("%s: n_expert_used = %u\n", __func__, hparams.n_expert_used);
7238
+ LLAMA_LOG_INFO("%s: n_expert_groups = %d\n", __func__, hparams.n_expert_groups);
7239
+ LLAMA_LOG_INFO("%s: n_group_used = %d\n", __func__, hparams.n_group_used);
7240
+ LLAMA_LOG_INFO("%s: causal attn = %d\n", __func__, hparams.causal_attn);
7241
+ LLAMA_LOG_INFO("%s: pooling type = %d\n", __func__, hparams.pooling_type);
7242
+ LLAMA_LOG_INFO("%s: rope type = %d\n", __func__, hparams.rope_type);
7243
+ LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type.c_str());
7244
+ LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train);
7245
+ LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);
7128
7246
  if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
7129
- LLAMA_LOG_INFO("%s: freq_base_swa = %.1f\n", __func__, hparams.rope_freq_base_train_swa);
7130
- LLAMA_LOG_INFO("%s: freq_scale_swa = %g\n", __func__, hparams.rope_freq_scale_train_swa);
7247
+ LLAMA_LOG_INFO("%s: freq_base_swa = %.1f\n", __func__, hparams.rope_freq_base_train_swa);
7248
+ LLAMA_LOG_INFO("%s: freq_scale_swa = %g\n", __func__, hparams.rope_freq_scale_train_swa);
7131
7249
  }
7132
- LLAMA_LOG_INFO("%s: n_ctx_orig_yarn = %u\n", __func__, hparams.n_ctx_orig_yarn);
7133
- LLAMA_LOG_INFO("%s: rope_yarn_log_mul= %.4f\n", __func__, hparams.rope_yarn_log_mul);
7134
- LLAMA_LOG_INFO("%s: rope_finetuned = %s\n", __func__, hparams.rope_finetuned ? "yes" : "unknown");
7250
+ LLAMA_LOG_INFO("%s: n_ctx_orig_yarn = %u\n", __func__, hparams.n_ctx_orig_yarn);
7251
+ LLAMA_LOG_INFO("%s: rope_yarn_log_mul = %.4f\n", __func__, hparams.rope_yarn_log_mul);
7252
+ LLAMA_LOG_INFO("%s: rope_finetuned = %s\n", __func__, hparams.rope_finetuned ? "yes" : "unknown");
7135
7253
  // MRoPE (Multi-axis Rotary Position Embedding) sections
7136
7254
  if (const auto & s = hparams.rope_sections; s[0] || s[1] || s[2] || s[3]) {
7137
- LLAMA_LOG_INFO("%s: mrope sections = [%d, %d, %d, %d]\n", __func__, s[0], s[1], s[2], s[3]);
7255
+ LLAMA_LOG_INFO("%s: mrope sections = [%d, %d, %d, %d]\n", __func__, s[0], s[1], s[2], s[3]);
7138
7256
  }
7139
7257
  if (!classifier_labels.empty()) {
7140
- LLAMA_LOG_INFO("%s: n_cls_out = %u\n", __func__, hparams.n_cls_out);
7258
+ LLAMA_LOG_INFO("%s: n_cls_out = %u\n", __func__, hparams.n_cls_out);
7141
7259
 
7142
7260
  size_t i = 0;
7143
7261
  for (auto label : classifier_labels) {
7144
- LLAMA_LOG_INFO("%s: cls_label[%2zu] = %s\n", __func__, i++, label.c_str());
7262
+ LLAMA_LOG_INFO("%s: cls_label[%2zu] = %s\n", __func__, i++, label.c_str());
7145
7263
  }
7146
7264
  }
7147
7265
  }
@@ -7155,55 +7273,55 @@ void llama_model::print_info() const {
7155
7273
  arch == LLM_ARCH_QWEN3NEXT ||
7156
7274
  arch == LLM_ARCH_NEMOTRON_H ||
7157
7275
  arch == LLM_ARCH_NEMOTRON_H_MOE) {
7158
- LLAMA_LOG_INFO("%s: ssm_d_conv = %u\n", __func__, hparams.ssm_d_conv);
7159
- LLAMA_LOG_INFO("%s: ssm_d_inner = %u\n", __func__, hparams.ssm_d_inner);
7160
- LLAMA_LOG_INFO("%s: ssm_d_state = %u\n", __func__, hparams.ssm_d_state);
7161
- LLAMA_LOG_INFO("%s: ssm_dt_rank = %u\n", __func__, hparams.ssm_dt_rank);
7162
- LLAMA_LOG_INFO("%s: ssm_n_group = %u\n", __func__, hparams.ssm_n_group);
7163
- LLAMA_LOG_INFO("%s: ssm_dt_b_c_rms = %d\n", __func__, hparams.ssm_dt_b_c_rms);
7276
+ LLAMA_LOG_INFO("%s: ssm_d_conv = %u\n", __func__, hparams.ssm_d_conv);
7277
+ LLAMA_LOG_INFO("%s: ssm_d_inner = %u\n", __func__, hparams.ssm_d_inner);
7278
+ LLAMA_LOG_INFO("%s: ssm_d_state = %u\n", __func__, hparams.ssm_d_state);
7279
+ LLAMA_LOG_INFO("%s: ssm_dt_rank = %u\n", __func__, hparams.ssm_dt_rank);
7280
+ LLAMA_LOG_INFO("%s: ssm_n_group = %u\n", __func__, hparams.ssm_n_group);
7281
+ LLAMA_LOG_INFO("%s: ssm_dt_b_c_rms = %d\n", __func__, hparams.ssm_dt_b_c_rms);
7164
7282
  }
7165
7283
 
7166
- LLAMA_LOG_INFO("%s: model type = %s\n", __func__, type_name().c_str());
7284
+ LLAMA_LOG_INFO("%s: model type = %s\n", __func__, type_name().c_str());
7167
7285
  if (pimpl->n_elements >= 1e12) {
7168
- LLAMA_LOG_INFO("%s: model params = %.2f T\n", __func__, pimpl->n_elements*1e-12);
7286
+ LLAMA_LOG_INFO("%s: model params = %.2f T\n", __func__, pimpl->n_elements*1e-12);
7169
7287
  } else if (pimpl->n_elements >= 1e9) {
7170
- LLAMA_LOG_INFO("%s: model params = %.2f B\n", __func__, pimpl->n_elements*1e-9);
7288
+ LLAMA_LOG_INFO("%s: model params = %.2f B\n", __func__, pimpl->n_elements*1e-9);
7171
7289
  } else if (pimpl->n_elements >= 1e6) {
7172
- LLAMA_LOG_INFO("%s: model params = %.2f M\n", __func__, pimpl->n_elements*1e-6);
7290
+ LLAMA_LOG_INFO("%s: model params = %.2f M\n", __func__, pimpl->n_elements*1e-6);
7173
7291
  } else {
7174
- LLAMA_LOG_INFO("%s: model params = %.2f K\n", __func__, pimpl->n_elements*1e-3);
7292
+ LLAMA_LOG_INFO("%s: model params = %.2f K\n", __func__, pimpl->n_elements*1e-3);
7175
7293
  }
7176
7294
 
7177
7295
  // general kv
7178
- LLAMA_LOG_INFO("%s: general.name = %s\n", __func__, name.c_str());
7296
+ LLAMA_LOG_INFO("%s: general.name = %s\n", __func__, name.c_str());
7179
7297
 
7180
7298
  if (arch == LLM_ARCH_DEEPSEEK) {
7181
- LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
7182
- LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
7183
- LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
7184
- LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
7299
+ LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
7300
+ LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
7301
+ LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
7302
+ LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
7185
7303
  }
7186
7304
 
7187
7305
  if (arch == LLM_ARCH_DEEPSEEK2) {
7188
- LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
7189
- LLAMA_LOG_INFO("%s: n_lora_q = %d\n", __func__, hparams.n_lora_q);
7190
- LLAMA_LOG_INFO("%s: n_lora_kv = %d\n", __func__, hparams.n_lora_kv);
7191
- LLAMA_LOG_INFO("%s: n_embd_head_k_mla = %d\n", __func__, hparams.n_embd_head_k_mla);
7192
- LLAMA_LOG_INFO("%s: n_embd_head_v_mla = %d\n", __func__, hparams.n_embd_head_v_mla);
7193
- LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
7194
- LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
7195
- LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
7196
- LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm);
7197
- LLAMA_LOG_INFO("%s: expert_gating_func = %s\n", __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func));
7306
+ LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
7307
+ LLAMA_LOG_INFO("%s: n_lora_q = %d\n", __func__, hparams.n_lora_q);
7308
+ LLAMA_LOG_INFO("%s: n_lora_kv = %d\n", __func__, hparams.n_lora_kv);
7309
+ LLAMA_LOG_INFO("%s: n_embd_head_k_mla = %d\n", __func__, hparams.n_embd_head_k_mla);
7310
+ LLAMA_LOG_INFO("%s: n_embd_head_v_mla = %d\n", __func__, hparams.n_embd_head_v_mla);
7311
+ LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
7312
+ LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
7313
+ LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
7314
+ LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm);
7315
+ LLAMA_LOG_INFO("%s: expert_gating_func = %s\n", __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func));
7198
7316
  }
7199
7317
 
7200
7318
  if (arch == LLM_ARCH_QWEN2MOE) {
7201
- LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
7202
- LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
7319
+ LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
7320
+ LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
7203
7321
  }
7204
7322
 
7205
7323
  if (arch == LLM_ARCH_QWEN3MOE || arch == LLM_ARCH_OPENAI_MOE || arch == LLM_ARCH_QWEN3VLMOE || arch == LLM_ARCH_RND1) {
7206
- LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
7324
+ LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
7207
7325
  }
7208
7326
 
7209
7327
  if (arch == LLM_ARCH_MINICPM ||
@@ -7211,41 +7329,41 @@ void llama_model::print_info() const {
7211
7329
  arch == LLM_ARCH_GRANITE_MOE ||
7212
7330
  arch == LLM_ARCH_GRANITE_HYBRID ||
7213
7331
  arch == LLM_ARCH_NEMOTRON_H_MOE) {
7214
- LLAMA_LOG_INFO("%s: f_embedding_scale = %f\n", __func__, hparams.f_embedding_scale);
7215
- LLAMA_LOG_INFO("%s: f_residual_scale = %f\n", __func__, hparams.f_residual_scale);
7216
- LLAMA_LOG_INFO("%s: f_attention_scale = %f\n", __func__, hparams.f_attention_scale);
7217
- LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
7332
+ LLAMA_LOG_INFO("%s: f_embedding_scale = %f\n", __func__, hparams.f_embedding_scale);
7333
+ LLAMA_LOG_INFO("%s: f_residual_scale = %f\n", __func__, hparams.f_residual_scale);
7334
+ LLAMA_LOG_INFO("%s: f_attention_scale = %f\n", __func__, hparams.f_attention_scale);
7335
+ LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
7218
7336
  }
7219
7337
 
7220
7338
  if (arch == LLM_ARCH_BAILINGMOE) {
7221
- LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
7222
- LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
7223
- LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
7224
- LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
7225
- LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm);
7339
+ LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
7340
+ LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
7341
+ LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
7342
+ LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
7343
+ LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm);
7226
7344
  }
7227
7345
 
7228
7346
  if (arch == LLM_ARCH_BAILINGMOE2) {
7229
- LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
7230
- LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
7231
- LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
7232
- LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
7233
- LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
7234
- LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm);
7235
- LLAMA_LOG_INFO("%s: expert_gating_func = %s\n", __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func));
7236
- LLAMA_LOG_INFO("%s: nextn_predict_layers = %d\n", __func__, hparams.nextn_predict_layers);
7347
+ LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
7348
+ LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
7349
+ LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
7350
+ LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
7351
+ LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
7352
+ LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm);
7353
+ LLAMA_LOG_INFO("%s: expert_gating_func = %s\n", __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func));
7354
+ LLAMA_LOG_INFO("%s: nextn_predict_layers = %d\n", __func__, hparams.nextn_predict_layers);
7237
7355
  }
7238
7356
 
7239
7357
  if (arch == LLM_ARCH_SMALLTHINKER || arch == LLM_ARCH_LFM2MOE) {
7240
- LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
7241
- LLAMA_LOG_INFO("%s: expert_gating_func = %s\n", __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func));
7358
+ LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
7359
+ LLAMA_LOG_INFO("%s: expert_gating_func = %s\n", __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func));
7242
7360
  }
7243
7361
 
7244
7362
  if (arch == LLM_ARCH_GROVEMOE) {
7245
- LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
7246
- LLAMA_LOG_INFO("%s: n_ff_chexp = %d\n", __func__, hparams.n_ff_chexp);
7247
- LLAMA_LOG_INFO("%s: n_group_experts = %d\n", __func__, hparams.n_group_experts);
7248
- LLAMA_LOG_INFO("%s: expert_group_scale = %.2f\n", __func__, hparams.expert_group_scale);
7363
+ LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
7364
+ LLAMA_LOG_INFO("%s: n_ff_chexp = %d\n", __func__, hparams.n_ff_chexp);
7365
+ LLAMA_LOG_INFO("%s: n_group_experts = %d\n", __func__, hparams.n_group_experts);
7366
+ LLAMA_LOG_INFO("%s: expert_group_scale = %.2f\n", __func__, hparams.expert_group_scale);
7249
7367
  }
7250
7368
 
7251
7369
  vocab.print_info();
@@ -7799,6 +7917,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
7799
7917
  llm = std::make_unique<llm_build_exaone4<false>>(*this, params);
7800
7918
  }
7801
7919
  } break;
7920
+ case LLM_ARCH_EXAONE_MOE:
7921
+ {
7922
+ llm = std::make_unique<llm_build_exaone_moe>(*this, params);
7923
+ } break;
7802
7924
  case LLM_ARCH_RWKV6:
7803
7925
  {
7804
7926
  llm = std::make_unique<llm_build_rwkv6>(*this, params);
@@ -7973,6 +8095,7 @@ llama_model_params llama_model_default_params() {
7973
8095
  /*.kv_overrides =*/ nullptr,
7974
8096
  /*.vocab_only =*/ false,
7975
8097
  /*.use_mmap =*/ true,
8098
+ /*.use_direct_io =*/ true,
7976
8099
  /*.use_mlock =*/ false,
7977
8100
  /*.check_tensors =*/ false,
7978
8101
  /*.use_extra_bufts =*/ true,
@@ -8158,6 +8281,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
8158
8281
  case LLM_ARCH_NEMOTRON:
8159
8282
  case LLM_ARCH_EXAONE:
8160
8283
  case LLM_ARCH_EXAONE4:
8284
+ case LLM_ARCH_EXAONE_MOE:
8161
8285
  case LLM_ARCH_MINICPM3:
8162
8286
  case LLM_ARCH_BAILINGMOE2:
8163
8287
  case LLM_ARCH_DOTS1:
@@ -596,7 +596,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
596
596
  }
597
597
 
598
598
  std::vector<std::string> splits = {};
599
- llama_model_loader ml(fname_inp, splits, use_mmap, /*check_tensors*/ true, /*no_alloc*/ false, kv_overrides, nullptr);
599
+ llama_model_loader ml(fname_inp, splits, use_mmap, /*use_direct_io*/ true, /*check_tensors*/ true, /*no_alloc*/ false, kv_overrides, nullptr);
600
600
  ml.init_mappings(false); // no prefetching
601
601
 
602
602
  llama_model model(llama_model_default_params());
@@ -2142,7 +2142,7 @@ struct llama_sampler_xtc {
2142
2142
  const uint32_t seed;
2143
2143
  uint32_t seed_cur;
2144
2144
 
2145
- std::mt19937 rng;
2145
+ std::mt19937 rng;
2146
2146
  };
2147
2147
 
2148
2148
  static const char * llama_sampler_xtc_name(const struct llama_sampler * /*smpl*/) {
@@ -461,6 +461,13 @@ struct llm_tokenizer_bpe : llm_tokenizer {
461
461
  "[!\"#$%&'()*+,\\-./:;<=>?@\\[\\\\\\]^_`{|}~][A-Za-z]+|[^\\r\\n\\p{L}\\p{P}\\p{S}]?[\\p{L}\\p{M}]+| ?[\\p{P}\\p{S}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
462
462
  };
463
463
  break;
464
+ case LLAMA_VOCAB_PRE_TYPE_EXAONE_MOE:
465
+ regex_exprs = {
466
+ // original regex from tokenizer.json
467
+ // "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?(?:\\p{L}\\p{M}*(?: \\p{L}\\p{M}*)*)+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]?|\\s*[\\r\\n]|\\s+(?!\\S)|\\s+"
468
+ "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?(?:\\p{L}\\p{M}*(?: \\p{L}\\p{M}*)*)+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]?|\\s*[\\r\\n]|\\s+(?!\\S)|\\s+",
469
+ };
470
+ break;
464
471
  default:
465
472
  // default regex for BPE tokenization pre-processing
466
473
  regex_exprs = {
@@ -1965,6 +1972,9 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
1965
1972
  } else if (
1966
1973
  tokenizer_pre == "exaone4") {
1967
1974
  pre_type = LLAMA_VOCAB_PRE_TYPE_GPT2;
1975
+ } else if (
1976
+ tokenizer_pre == "exaone-moe") {
1977
+ pre_type = LLAMA_VOCAB_PRE_TYPE_EXAONE_MOE;
1968
1978
  } else if (
1969
1979
  tokenizer_pre == "chameleon") {
1970
1980
  pre_type = LLAMA_VOCAB_PRE_TYPE_CHAMELEON;
@@ -2436,7 +2446,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
2436
2446
  auto & attr = id_to_token[t.second].attr;
2437
2447
 
2438
2448
  if (t.first == "<|channel|>" || t.first == "<|message|>" || t.first == "<|start|>" || t.first == "<|constrain|>") {
2439
- attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_USER_DEFINED);
2449
+ LLAMA_LOG_WARN("%s: setting token '%s' (%d) attribute to USER_DEFINED (%u), old attributes: %u\n",
2450
+ __func__, t.first.c_str(), t.second, LLAMA_TOKEN_ATTR_USER_DEFINED, attr);
2451
+
2452
+ attr = LLAMA_TOKEN_ATTR_USER_DEFINED;
2440
2453
  }
2441
2454
  }
2442
2455
 
@@ -2489,7 +2502,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
2489
2502
  special_eog_ids.erase(end_id);
2490
2503
 
2491
2504
  auto & attr = id_to_token[end_id].attr;
2492
- attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_USER_DEFINED);
2505
+ attr = LLAMA_TOKEN_ATTR_USER_DEFINED;
2493
2506
 
2494
2507
  LLAMA_LOG_WARN("%s: special_eog_ids contains both '<|return|>' and '<|call|>', or '<|calls|>' and '<|flush|>' tokens, removing '<|end|>' token from EOG list\n", __func__);
2495
2508
  }
@@ -3289,34 +3302,34 @@ int32_t llama_vocab::impl::detokenize(
3289
3302
  }
3290
3303
 
3291
3304
  void llama_vocab::impl::print_info() const {
3292
- LLAMA_LOG_INFO("%s: vocab type = %s\n", __func__, type_name().c_str());
3293
- LLAMA_LOG_INFO("%s: n_vocab = %u\n", __func__, vocab.n_tokens());
3294
- LLAMA_LOG_INFO("%s: n_merges = %u\n", __func__, (uint32_t) bpe_ranks.size());
3305
+ LLAMA_LOG_INFO("%s: vocab type = %s\n", __func__, type_name().c_str());
3306
+ LLAMA_LOG_INFO("%s: n_vocab = %u\n", __func__, vocab.n_tokens());
3307
+ LLAMA_LOG_INFO("%s: n_merges = %u\n", __func__, (uint32_t) bpe_ranks.size());
3295
3308
 
3296
3309
  // special tokens
3297
- if (special_bos_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, special_bos_id, id_to_token.at(special_bos_id).text.c_str() ); }
3298
- if (special_eos_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, special_eos_id, id_to_token.at(special_eos_id).text.c_str() ); }
3299
- if (special_eot_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOT token = %d '%s'\n", __func__, special_eot_id, id_to_token.at(special_eot_id).text.c_str() ); }
3300
- if (special_eom_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOM token = %d '%s'\n", __func__, special_eom_id, id_to_token.at(special_eom_id).text.c_str() ); }
3301
- if (special_unk_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, special_unk_id, id_to_token.at(special_unk_id).text.c_str() ); }
3302
- if (special_sep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, special_sep_id, id_to_token.at(special_sep_id).text.c_str() ); }
3303
- if (special_pad_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, special_pad_id, id_to_token.at(special_pad_id).text.c_str() ); }
3304
- if (special_mask_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: MASK token = %d '%s'\n", __func__, special_mask_id, id_to_token.at(special_mask_id).text.c_str() ); }
3305
-
3306
- if (linefeed_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, linefeed_id, id_to_token.at(linefeed_id).text.c_str() ); }
3307
-
3308
- if (special_fim_pre_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM PRE token = %d '%s'\n", __func__, special_fim_pre_id, id_to_token.at(special_fim_pre_id).text.c_str() ); }
3309
- if (special_fim_suf_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM SUF token = %d '%s'\n", __func__, special_fim_suf_id, id_to_token.at(special_fim_suf_id).text.c_str() ); }
3310
- if (special_fim_mid_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM MID token = %d '%s'\n", __func__, special_fim_mid_id, id_to_token.at(special_fim_mid_id).text.c_str() ); }
3311
- if (special_fim_pad_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM PAD token = %d '%s'\n", __func__, special_fim_pad_id, id_to_token.at(special_fim_pad_id).text.c_str() ); }
3312
- if (special_fim_rep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM REP token = %d '%s'\n", __func__, special_fim_rep_id, id_to_token.at(special_fim_rep_id).text.c_str() ); }
3313
- if (special_fim_sep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM SEP token = %d '%s'\n", __func__, special_fim_sep_id, id_to_token.at(special_fim_sep_id).text.c_str() ); }
3310
+ if (special_bos_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, special_bos_id, id_to_token.at(special_bos_id).text.c_str() ); }
3311
+ if (special_eos_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, special_eos_id, id_to_token.at(special_eos_id).text.c_str() ); }
3312
+ if (special_eot_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOT token = %d '%s'\n", __func__, special_eot_id, id_to_token.at(special_eot_id).text.c_str() ); }
3313
+ if (special_eom_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOM token = %d '%s'\n", __func__, special_eom_id, id_to_token.at(special_eom_id).text.c_str() ); }
3314
+ if (special_unk_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, special_unk_id, id_to_token.at(special_unk_id).text.c_str() ); }
3315
+ if (special_sep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, special_sep_id, id_to_token.at(special_sep_id).text.c_str() ); }
3316
+ if (special_pad_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, special_pad_id, id_to_token.at(special_pad_id).text.c_str() ); }
3317
+ if (special_mask_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: MASK token = %d '%s'\n", __func__, special_mask_id, id_to_token.at(special_mask_id).text.c_str() ); }
3318
+
3319
+ if (linefeed_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, linefeed_id, id_to_token.at(linefeed_id).text.c_str() ); }
3320
+
3321
+ if (special_fim_pre_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM PRE token = %d '%s'\n", __func__, special_fim_pre_id, id_to_token.at(special_fim_pre_id).text.c_str() ); }
3322
+ if (special_fim_suf_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM SUF token = %d '%s'\n", __func__, special_fim_suf_id, id_to_token.at(special_fim_suf_id).text.c_str() ); }
3323
+ if (special_fim_mid_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM MID token = %d '%s'\n", __func__, special_fim_mid_id, id_to_token.at(special_fim_mid_id).text.c_str() ); }
3324
+ if (special_fim_pad_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM PAD token = %d '%s'\n", __func__, special_fim_pad_id, id_to_token.at(special_fim_pad_id).text.c_str() ); }
3325
+ if (special_fim_rep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM REP token = %d '%s'\n", __func__, special_fim_rep_id, id_to_token.at(special_fim_rep_id).text.c_str() ); }
3326
+ if (special_fim_sep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM SEP token = %d '%s'\n", __func__, special_fim_sep_id, id_to_token.at(special_fim_sep_id).text.c_str() ); }
3314
3327
 
3315
3328
  for (const auto & id : special_eog_ids) {
3316
- LLAMA_LOG_INFO( "%s: EOG token = %d '%s'\n", __func__, id, id_to_token.at(id).text.c_str() );
3329
+ LLAMA_LOG_INFO( "%s: EOG token = %d '%s'\n", __func__, id, id_to_token.at(id).text.c_str() );
3317
3330
  }
3318
3331
 
3319
- LLAMA_LOG_INFO("%s: max token length = %d\n", __func__, max_token_len);
3332
+ LLAMA_LOG_INFO("%s: max token length = %d\n", __func__, max_token_len);
3320
3333
  }
3321
3334
 
3322
3335
  llama_vocab::llama_vocab() : pimpl(new impl(*this)) {
@@ -53,6 +53,7 @@ enum llama_vocab_pre_type {
53
53
  LLAMA_VOCAB_PRE_TYPE_AFMOE = 42,
54
54
  LLAMA_VOCAB_PRE_TYPE_SOLAR_OPEN = 43,
55
55
  LLAMA_VOCAB_PRE_TYPE_YOUTU = 44,
56
+ LLAMA_VOCAB_PRE_TYPE_EXAONE_MOE = 45,
56
57
  };
57
58
 
58
59
  struct LLM_KV;