@fugood/llama.node 1.4.14 → 1.5.0-rc.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66)
  1. package/lib/binding.ts +13 -6
  2. package/lib/index.js +2 -2
  3. package/lib/index.ts +8 -3
  4. package/package.json +15 -15
  5. package/scripts/llama.cpp.patch +77 -65
  6. package/src/LlamaContext.cpp +31 -34
  7. package/src/llama.cpp/CMakeLists.txt +24 -8
  8. package/src/llama.cpp/common/CMakeLists.txt +15 -34
  9. package/src/llama.cpp/common/arg.cpp +59 -10
  10. package/src/llama.cpp/common/chat-parser.cpp +115 -0
  11. package/src/llama.cpp/common/chat.cpp +356 -34
  12. package/src/llama.cpp/common/chat.h +17 -13
  13. package/src/llama.cpp/common/common.cpp +0 -1
  14. package/src/llama.cpp/common/common.h +30 -25
  15. package/src/llama.cpp/common/debug.cpp +165 -0
  16. package/src/llama.cpp/common/debug.h +43 -0
  17. package/src/llama.cpp/common/download.cpp +12 -342
  18. package/src/llama.cpp/common/download.h +6 -0
  19. package/src/llama.cpp/common/jinja/caps.cpp +237 -0
  20. package/src/llama.cpp/common/jinja/caps.h +24 -0
  21. package/src/llama.cpp/common/jinja/lexer.cpp +341 -0
  22. package/src/llama.cpp/common/jinja/lexer.h +157 -0
  23. package/src/llama.cpp/common/jinja/parser.cpp +591 -0
  24. package/src/llama.cpp/common/jinja/parser.h +21 -0
  25. package/src/llama.cpp/common/jinja/runtime.cpp +865 -0
  26. package/src/llama.cpp/common/jinja/runtime.h +628 -0
  27. package/src/llama.cpp/common/jinja/string.cpp +207 -0
  28. package/src/llama.cpp/common/jinja/string.h +58 -0
  29. package/src/llama.cpp/common/jinja/utils.h +49 -0
  30. package/src/llama.cpp/common/jinja/value.cpp +1221 -0
  31. package/src/llama.cpp/common/jinja/value.h +464 -0
  32. package/src/llama.cpp/common/preset.cpp +12 -2
  33. package/src/llama.cpp/common/sampling.cpp +52 -19
  34. package/src/llama.cpp/ggml/include/ggml.h +39 -7
  35. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +4 -0
  36. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +63 -37
  37. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +31 -0
  38. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +18 -0
  39. package/src/llama.cpp/include/llama-cpp.h +3 -1
  40. package/src/llama.cpp/include/llama.h +29 -2
  41. package/src/llama.cpp/src/CMakeLists.txt +1 -0
  42. package/src/llama.cpp/src/llama-adapter.cpp +7 -13
  43. package/src/llama.cpp/src/llama-adapter.h +1 -3
  44. package/src/llama.cpp/src/llama-arch.cpp +35 -0
  45. package/src/llama.cpp/src/llama-arch.h +1 -0
  46. package/src/llama.cpp/src/llama-chat.cpp +20 -0
  47. package/src/llama.cpp/src/llama-chat.h +1 -0
  48. package/src/llama.cpp/src/llama-context.cpp +232 -144
  49. package/src/llama.cpp/src/llama-context.h +10 -0
  50. package/src/llama.cpp/src/llama-cparams.h +2 -0
  51. package/src/llama.cpp/src/llama-graph.cpp +31 -43
  52. package/src/llama.cpp/src/llama-hparams.cpp +0 -36
  53. package/src/llama.cpp/src/llama-hparams.h +38 -1
  54. package/src/llama.cpp/src/llama-kv-cache.cpp +201 -59
  55. package/src/llama.cpp/src/llama-kv-cache.h +0 -2
  56. package/src/llama.cpp/src/llama-mmap.cpp +13 -6
  57. package/src/llama.cpp/src/llama-model-loader.cpp +21 -7
  58. package/src/llama.cpp/src/llama-model.cpp +215 -97
  59. package/src/llama.cpp/src/llama-model.h +3 -2
  60. package/src/llama.cpp/src/llama-sampling.cpp +170 -13
  61. package/src/llama.cpp/src/llama-vocab.cpp +37 -24
  62. package/src/llama.cpp/src/llama-vocab.h +1 -0
  63. package/src/llama.cpp/src/models/exaone-moe.cpp +146 -0
  64. package/src/llama.cpp/src/models/gemma3n-iswa.cpp +13 -3
  65. package/src/llama.cpp/src/models/models.h +13 -2
  66. package/src/llama.cpp/src/models/qwen3next.cpp +198 -182
@@ -446,7 +446,7 @@ struct llama_model::impl {
446
446
  llama_mlocks mlock_bufs;
447
447
  llama_mlocks mlock_mmaps;
448
448
 
449
- // contexts where the model tensors metadata is stored as well ass the corresponding buffers:
449
+ // contexts where the model tensors metadata is stored as well as the corresponding buffers:
450
450
  std::vector<std::pair<ggml_context_ptr, std::vector<ggml_backend_buffer_ptr>>> ctxs_bufs;
451
451
 
452
452
  buft_list_t cpu_buft_list;
@@ -468,7 +468,11 @@ llama_model::llama_model(const llama_model_params & params) : params(params), pi
468
468
  pimpl->has_tensor_overrides = params.tensor_buft_overrides && params.tensor_buft_overrides[0].pattern;
469
469
  }
470
470
 
471
- llama_model::~llama_model() = default;
471
+ llama_model::~llama_model() {
472
+ for (auto * lora : loras) {
473
+ delete lora;
474
+ }
475
+ }
472
476
 
473
477
  void llama_model::load_stats(llama_model_loader & ml) {
474
478
  pimpl->n_elements = ml.n_elements;
@@ -1933,6 +1937,34 @@ void llama_model::load_hparams(llama_model_loader & ml) {
1933
1937
  default: type = LLM_TYPE_UNKNOWN;
1934
1938
  }
1935
1939
  } break;
1940
+ case LLM_ARCH_EXAONE_MOE:
1941
+ {
1942
+ hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
1943
+ hparams.n_swa = 128;
1944
+ hparams.set_swa_pattern(4);
1945
+ hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
1946
+ hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
1947
+
1948
+ ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
1949
+ ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
1950
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1951
+ ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared, false);
1952
+ ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
1953
+ ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
1954
+ ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func);
1955
+ ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false);
1956
+ ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
1957
+ ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
1958
+
1959
+ ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false);
1960
+
1961
+ switch (hparams.n_layer) {
1962
+ case 32: type = LLM_TYPE_30B_A3B; break;
1963
+ case 48:
1964
+ case 49: type = LLM_TYPE_235B_A22B; break;
1965
+ default: type = LLM_TYPE_UNKNOWN;
1966
+ }
1967
+ } break;
1936
1968
  case LLM_ARCH_RWKV6:
1937
1969
  case LLM_ARCH_RWKV6QWEN2:
1938
1970
  {
@@ -5516,6 +5548,84 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
5516
5548
  layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
5517
5549
  }
5518
5550
  } break;
5551
+ case LLM_ARCH_EXAONE_MOE:
5552
+ {
5553
+ const int64_t n_ff_exp = hparams.n_ff_exp;
5554
+ const int64_t n_expert = hparams.n_expert;
5555
+ const int64_t n_expert_used = hparams.n_expert_used;
5556
+ const int64_t n_ff_shexp = hparams.n_ff_shexp;
5557
+ const int64_t head_dim = hparams.n_embd_head_k;
5558
+ const int64_t n_qo_dim = n_head * head_dim;
5559
+ const int64_t n_kv_dim = n_head_kv * head_dim;
5560
+
5561
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
5562
+
5563
+ // output
5564
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
5565
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
5566
+
5567
+ if (output == NULL) {
5568
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
5569
+ }
5570
+
5571
+ for (int i = 0; i < n_layer; ++i) {
5572
+ int flags = 0;
5573
+ if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
5574
+ // skip all tensors in the NextN layers
5575
+ flags |= TENSOR_SKIP;
5576
+ }
5577
+
5578
+ auto & layer = layers[i];
5579
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_qo_dim}, flags);
5580
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_kv_dim}, flags);
5581
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_kv_dim}, flags);
5582
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_qo_dim, n_embd}, flags);
5583
+
5584
+ layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0) | flags);
5585
+
5586
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, flags);
5587
+ layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, flags);
5588
+ layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, flags);
5589
+
5590
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, flags);
5591
+
5592
+ // dense layers for first n_layer_dense_lead layers or nextn_predict_layers layers at the end
5593
+ if (i < (int) hparams.n_layer_dense_lead || (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers)) {
5594
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, flags);
5595
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, flags);
5596
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, flags);
5597
+ } else {
5598
+ layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, flags);
5599
+ layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED | flags);
5600
+
5601
+ if (n_expert == 0) {
5602
+ throw std::runtime_error("n_expert must be > 0");
5603
+ }
5604
+ if (n_expert_used == 0) {
5605
+ throw std::runtime_error("n_expert_used must be > 0");
5606
+ }
5607
+
5608
+ layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, flags);
5609
+ layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, flags);
5610
+ layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, flags);
5611
+
5612
+ layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_shexp}, flags);
5613
+ layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd}, flags);
5614
+ layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_shexp}, flags);
5615
+ }
5616
+
5617
+ // NextN/MTP tensors (preserved but unused) - conditionally load for last nextn_predict_layers
5618
+ if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
5619
+ layer.nextn.eh_proj = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i), {2 * n_embd, n_embd}, flags);
5620
+ layer.nextn.enorm = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM, "weight", i), {n_embd}, flags);
5621
+ layer.nextn.hnorm = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM, "weight", i), {n_embd}, flags);
5622
+
5623
+ layer.nextn.shared_head_norm = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "weight", i), {n_embd}, flags | TENSOR_NOT_REQUIRED);
5624
+ layer.nextn.embed_tokens = create_tensor(tn(LLM_TENSOR_NEXTN_EMBED_TOKENS, "weight", i), {n_embd, n_vocab}, flags | TENSOR_NOT_REQUIRED);
5625
+ layer.nextn.shared_head_head = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "weight", i), {n_embd, n_vocab}, flags | TENSOR_NOT_REQUIRED);
5626
+ }
5627
+ }
5628
+ } break;
5519
5629
  case LLM_ARCH_RWKV6:
5520
5630
  {
5521
5631
  tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -6763,7 +6873,10 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
6763
6873
  } else {
6764
6874
  // Linear attention (gated delta net) specific tensors
6765
6875
  // Create tensors with calculated dimensions
6766
- layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), { n_embd, qkvz_dim }, 0);
6876
+ // note: ssm_in is used by legacy GGUF
6877
+ layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), { n_embd, qkvz_dim }, TENSOR_NOT_REQUIRED);
6878
+ layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, key_dim * 2 + value_dim }, TENSOR_NOT_REQUIRED);
6879
+ layer.wqkv_gate = create_tensor(tn(LLM_TENSOR_ATTN_GATE, "weight", i), { n_embd, value_dim }, TENSOR_NOT_REQUIRED);
6767
6880
  layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), { hparams.ssm_d_conv, conv_dim }, 0);
6768
6881
  layer.ssm_dt = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), { hparams.ssm_dt_rank }, 0);
6769
6882
  layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A_NOSCAN, i), { hparams.ssm_dt_rank }, 0);
@@ -7098,59 +7211,59 @@ void llama_model::print_info() const {
7098
7211
  };
7099
7212
 
7100
7213
  // hparams
7101
- LLAMA_LOG_INFO("%s: arch = %s\n", __func__, arch_name().c_str());
7102
- LLAMA_LOG_INFO("%s: vocab_only = %d\n", __func__, hparams.vocab_only);
7103
- LLAMA_LOG_INFO("%s: no_alloc = %d\n", __func__, hparams.no_alloc);
7214
+ LLAMA_LOG_INFO("%s: arch = %s\n", __func__, arch_name().c_str());
7215
+ LLAMA_LOG_INFO("%s: vocab_only = %d\n", __func__, hparams.vocab_only);
7216
+ LLAMA_LOG_INFO("%s: no_alloc = %d\n", __func__, hparams.no_alloc);
7104
7217
 
7105
7218
  if (!hparams.vocab_only) {
7106
- LLAMA_LOG_INFO("%s: n_ctx_train = %u\n", __func__, hparams.n_ctx_train);
7107
- LLAMA_LOG_INFO("%s: n_embd = %u\n", __func__, hparams.n_embd);
7108
- LLAMA_LOG_INFO("%s: n_embd_inp = %u\n", __func__, hparams.n_embd_inp());
7109
- LLAMA_LOG_INFO("%s: n_layer = %u\n", __func__, hparams.n_layer);
7110
- LLAMA_LOG_INFO("%s: n_head = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_head(il); }, hparams.n_layer).c_str());
7111
- LLAMA_LOG_INFO("%s: n_head_kv = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_head_kv(il); }, hparams.n_layer).c_str());
7112
- LLAMA_LOG_INFO("%s: n_rot = %u\n", __func__, hparams.n_rot);
7113
- LLAMA_LOG_INFO("%s: n_swa = %u\n", __func__, hparams.n_swa);
7114
- LLAMA_LOG_INFO("%s: is_swa_any = %u\n", __func__, hparams.is_swa_any());
7115
- LLAMA_LOG_INFO("%s: n_embd_head_k = %u\n", __func__, hparams.n_embd_head_k);
7116
- LLAMA_LOG_INFO("%s: n_embd_head_v = %u\n", __func__, hparams.n_embd_head_v);
7117
- LLAMA_LOG_INFO("%s: n_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_gqa(il); }, hparams.n_layer).c_str());
7118
- LLAMA_LOG_INFO("%s: n_embd_k_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_embd_k_gqa(il); }, hparams.n_layer).c_str());
7119
- LLAMA_LOG_INFO("%s: n_embd_v_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_embd_v_gqa(il); }, hparams.n_layer).c_str());
7120
- LLAMA_LOG_INFO("%s: f_norm_eps = %.1e\n", __func__, hparams.f_norm_eps);
7121
- LLAMA_LOG_INFO("%s: f_norm_rms_eps = %.1e\n", __func__, hparams.f_norm_rms_eps);
7122
- LLAMA_LOG_INFO("%s: f_clamp_kqv = %.1e\n", __func__, hparams.f_clamp_kqv);
7123
- LLAMA_LOG_INFO("%s: f_max_alibi_bias = %.1e\n", __func__, hparams.f_max_alibi_bias);
7124
- LLAMA_LOG_INFO("%s: f_logit_scale = %.1e\n", __func__, hparams.f_logit_scale);
7125
- LLAMA_LOG_INFO("%s: f_attn_scale = %.1e\n", __func__, hparams.f_attention_scale);
7126
- LLAMA_LOG_INFO("%s: n_ff = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_ff(il); }, hparams.n_layer).c_str());
7127
- LLAMA_LOG_INFO("%s: n_expert = %u\n", __func__, hparams.n_expert);
7128
- LLAMA_LOG_INFO("%s: n_expert_used = %u\n", __func__, hparams.n_expert_used);
7129
- LLAMA_LOG_INFO("%s: n_expert_groups = %d\n", __func__, hparams.n_expert_groups);
7130
- LLAMA_LOG_INFO("%s: n_group_used = %d\n", __func__, hparams.n_group_used);
7131
- LLAMA_LOG_INFO("%s: causal attn = %d\n", __func__, hparams.causal_attn);
7132
- LLAMA_LOG_INFO("%s: pooling type = %d\n", __func__, hparams.pooling_type);
7133
- LLAMA_LOG_INFO("%s: rope type = %d\n", __func__, hparams.rope_type);
7134
- LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type.c_str());
7135
- LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train);
7136
- LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);
7219
+ LLAMA_LOG_INFO("%s: n_ctx_train = %u\n", __func__, hparams.n_ctx_train);
7220
+ LLAMA_LOG_INFO("%s: n_embd = %u\n", __func__, hparams.n_embd);
7221
+ LLAMA_LOG_INFO("%s: n_embd_inp = %u\n", __func__, hparams.n_embd_inp());
7222
+ LLAMA_LOG_INFO("%s: n_layer = %u\n", __func__, hparams.n_layer);
7223
+ LLAMA_LOG_INFO("%s: n_head = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_head(il); }, hparams.n_layer).c_str());
7224
+ LLAMA_LOG_INFO("%s: n_head_kv = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_head_kv(il); }, hparams.n_layer).c_str());
7225
+ LLAMA_LOG_INFO("%s: n_rot = %u\n", __func__, hparams.n_rot);
7226
+ LLAMA_LOG_INFO("%s: n_swa = %u\n", __func__, hparams.n_swa);
7227
+ LLAMA_LOG_INFO("%s: is_swa_any = %u\n", __func__, hparams.is_swa_any());
7228
+ LLAMA_LOG_INFO("%s: n_embd_head_k = %u\n", __func__, hparams.n_embd_head_k);
7229
+ LLAMA_LOG_INFO("%s: n_embd_head_v = %u\n", __func__, hparams.n_embd_head_v);
7230
+ LLAMA_LOG_INFO("%s: n_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_gqa(il); }, hparams.n_layer).c_str());
7231
+ LLAMA_LOG_INFO("%s: n_embd_k_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_embd_k_gqa(il); }, hparams.n_layer).c_str());
7232
+ LLAMA_LOG_INFO("%s: n_embd_v_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_embd_v_gqa(il); }, hparams.n_layer).c_str());
7233
+ LLAMA_LOG_INFO("%s: f_norm_eps = %.1e\n", __func__, hparams.f_norm_eps);
7234
+ LLAMA_LOG_INFO("%s: f_norm_rms_eps = %.1e\n", __func__, hparams.f_norm_rms_eps);
7235
+ LLAMA_LOG_INFO("%s: f_clamp_kqv = %.1e\n", __func__, hparams.f_clamp_kqv);
7236
+ LLAMA_LOG_INFO("%s: f_max_alibi_bias = %.1e\n", __func__, hparams.f_max_alibi_bias);
7237
+ LLAMA_LOG_INFO("%s: f_logit_scale = %.1e\n", __func__, hparams.f_logit_scale);
7238
+ LLAMA_LOG_INFO("%s: f_attn_scale = %.1e\n", __func__, hparams.f_attention_scale);
7239
+ LLAMA_LOG_INFO("%s: n_ff = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_ff(il); }, hparams.n_layer).c_str());
7240
+ LLAMA_LOG_INFO("%s: n_expert = %u\n", __func__, hparams.n_expert);
7241
+ LLAMA_LOG_INFO("%s: n_expert_used = %u\n", __func__, hparams.n_expert_used);
7242
+ LLAMA_LOG_INFO("%s: n_expert_groups = %d\n", __func__, hparams.n_expert_groups);
7243
+ LLAMA_LOG_INFO("%s: n_group_used = %d\n", __func__, hparams.n_group_used);
7244
+ LLAMA_LOG_INFO("%s: causal attn = %d\n", __func__, hparams.causal_attn);
7245
+ LLAMA_LOG_INFO("%s: pooling type = %d\n", __func__, hparams.pooling_type);
7246
+ LLAMA_LOG_INFO("%s: rope type = %d\n", __func__, hparams.rope_type);
7247
+ LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type.c_str());
7248
+ LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train);
7249
+ LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);
7137
7250
  if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
7138
- LLAMA_LOG_INFO("%s: freq_base_swa = %.1f\n", __func__, hparams.rope_freq_base_train_swa);
7139
- LLAMA_LOG_INFO("%s: freq_scale_swa = %g\n", __func__, hparams.rope_freq_scale_train_swa);
7251
+ LLAMA_LOG_INFO("%s: freq_base_swa = %.1f\n", __func__, hparams.rope_freq_base_train_swa);
7252
+ LLAMA_LOG_INFO("%s: freq_scale_swa = %g\n", __func__, hparams.rope_freq_scale_train_swa);
7140
7253
  }
7141
- LLAMA_LOG_INFO("%s: n_ctx_orig_yarn = %u\n", __func__, hparams.n_ctx_orig_yarn);
7142
- LLAMA_LOG_INFO("%s: rope_yarn_log_mul= %.4f\n", __func__, hparams.rope_yarn_log_mul);
7143
- LLAMA_LOG_INFO("%s: rope_finetuned = %s\n", __func__, hparams.rope_finetuned ? "yes" : "unknown");
7254
+ LLAMA_LOG_INFO("%s: n_ctx_orig_yarn = %u\n", __func__, hparams.n_ctx_orig_yarn);
7255
+ LLAMA_LOG_INFO("%s: rope_yarn_log_mul = %.4f\n", __func__, hparams.rope_yarn_log_mul);
7256
+ LLAMA_LOG_INFO("%s: rope_finetuned = %s\n", __func__, hparams.rope_finetuned ? "yes" : "unknown");
7144
7257
  // MRoPE (Multi-axis Rotary Position Embedding) sections
7145
7258
  if (const auto & s = hparams.rope_sections; s[0] || s[1] || s[2] || s[3]) {
7146
- LLAMA_LOG_INFO("%s: mrope sections = [%d, %d, %d, %d]\n", __func__, s[0], s[1], s[2], s[3]);
7259
+ LLAMA_LOG_INFO("%s: mrope sections = [%d, %d, %d, %d]\n", __func__, s[0], s[1], s[2], s[3]);
7147
7260
  }
7148
7261
  if (!classifier_labels.empty()) {
7149
- LLAMA_LOG_INFO("%s: n_cls_out = %u\n", __func__, hparams.n_cls_out);
7262
+ LLAMA_LOG_INFO("%s: n_cls_out = %u\n", __func__, hparams.n_cls_out);
7150
7263
 
7151
7264
  size_t i = 0;
7152
7265
  for (auto label : classifier_labels) {
7153
- LLAMA_LOG_INFO("%s: cls_label[%2zu] = %s\n", __func__, i++, label.c_str());
7266
+ LLAMA_LOG_INFO("%s: cls_label[%2zu] = %s\n", __func__, i++, label.c_str());
7154
7267
  }
7155
7268
  }
7156
7269
  }
@@ -7164,55 +7277,55 @@ void llama_model::print_info() const {
7164
7277
  arch == LLM_ARCH_QWEN3NEXT ||
7165
7278
  arch == LLM_ARCH_NEMOTRON_H ||
7166
7279
  arch == LLM_ARCH_NEMOTRON_H_MOE) {
7167
- LLAMA_LOG_INFO("%s: ssm_d_conv = %u\n", __func__, hparams.ssm_d_conv);
7168
- LLAMA_LOG_INFO("%s: ssm_d_inner = %u\n", __func__, hparams.ssm_d_inner);
7169
- LLAMA_LOG_INFO("%s: ssm_d_state = %u\n", __func__, hparams.ssm_d_state);
7170
- LLAMA_LOG_INFO("%s: ssm_dt_rank = %u\n", __func__, hparams.ssm_dt_rank);
7171
- LLAMA_LOG_INFO("%s: ssm_n_group = %u\n", __func__, hparams.ssm_n_group);
7172
- LLAMA_LOG_INFO("%s: ssm_dt_b_c_rms = %d\n", __func__, hparams.ssm_dt_b_c_rms);
7280
+ LLAMA_LOG_INFO("%s: ssm_d_conv = %u\n", __func__, hparams.ssm_d_conv);
7281
+ LLAMA_LOG_INFO("%s: ssm_d_inner = %u\n", __func__, hparams.ssm_d_inner);
7282
+ LLAMA_LOG_INFO("%s: ssm_d_state = %u\n", __func__, hparams.ssm_d_state);
7283
+ LLAMA_LOG_INFO("%s: ssm_dt_rank = %u\n", __func__, hparams.ssm_dt_rank);
7284
+ LLAMA_LOG_INFO("%s: ssm_n_group = %u\n", __func__, hparams.ssm_n_group);
7285
+ LLAMA_LOG_INFO("%s: ssm_dt_b_c_rms = %d\n", __func__, hparams.ssm_dt_b_c_rms);
7173
7286
  }
7174
7287
 
7175
- LLAMA_LOG_INFO("%s: model type = %s\n", __func__, type_name().c_str());
7288
+ LLAMA_LOG_INFO("%s: model type = %s\n", __func__, type_name().c_str());
7176
7289
  if (pimpl->n_elements >= 1e12) {
7177
- LLAMA_LOG_INFO("%s: model params = %.2f T\n", __func__, pimpl->n_elements*1e-12);
7290
+ LLAMA_LOG_INFO("%s: model params = %.2f T\n", __func__, pimpl->n_elements*1e-12);
7178
7291
  } else if (pimpl->n_elements >= 1e9) {
7179
- LLAMA_LOG_INFO("%s: model params = %.2f B\n", __func__, pimpl->n_elements*1e-9);
7292
+ LLAMA_LOG_INFO("%s: model params = %.2f B\n", __func__, pimpl->n_elements*1e-9);
7180
7293
  } else if (pimpl->n_elements >= 1e6) {
7181
- LLAMA_LOG_INFO("%s: model params = %.2f M\n", __func__, pimpl->n_elements*1e-6);
7294
+ LLAMA_LOG_INFO("%s: model params = %.2f M\n", __func__, pimpl->n_elements*1e-6);
7182
7295
  } else {
7183
- LLAMA_LOG_INFO("%s: model params = %.2f K\n", __func__, pimpl->n_elements*1e-3);
7296
+ LLAMA_LOG_INFO("%s: model params = %.2f K\n", __func__, pimpl->n_elements*1e-3);
7184
7297
  }
7185
7298
 
7186
7299
  // general kv
7187
- LLAMA_LOG_INFO("%s: general.name = %s\n", __func__, name.c_str());
7300
+ LLAMA_LOG_INFO("%s: general.name = %s\n", __func__, name.c_str());
7188
7301
 
7189
7302
  if (arch == LLM_ARCH_DEEPSEEK) {
7190
- LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
7191
- LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
7192
- LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
7193
- LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
7303
+ LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
7304
+ LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
7305
+ LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
7306
+ LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
7194
7307
  }
7195
7308
 
7196
7309
  if (arch == LLM_ARCH_DEEPSEEK2) {
7197
- LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
7198
- LLAMA_LOG_INFO("%s: n_lora_q = %d\n", __func__, hparams.n_lora_q);
7199
- LLAMA_LOG_INFO("%s: n_lora_kv = %d\n", __func__, hparams.n_lora_kv);
7200
- LLAMA_LOG_INFO("%s: n_embd_head_k_mla = %d\n", __func__, hparams.n_embd_head_k_mla);
7201
- LLAMA_LOG_INFO("%s: n_embd_head_v_mla = %d\n", __func__, hparams.n_embd_head_v_mla);
7202
- LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
7203
- LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
7204
- LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
7205
- LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm);
7206
- LLAMA_LOG_INFO("%s: expert_gating_func = %s\n", __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func));
7310
+ LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
7311
+ LLAMA_LOG_INFO("%s: n_lora_q = %d\n", __func__, hparams.n_lora_q);
7312
+ LLAMA_LOG_INFO("%s: n_lora_kv = %d\n", __func__, hparams.n_lora_kv);
7313
+ LLAMA_LOG_INFO("%s: n_embd_head_k_mla = %d\n", __func__, hparams.n_embd_head_k_mla);
7314
+ LLAMA_LOG_INFO("%s: n_embd_head_v_mla = %d\n", __func__, hparams.n_embd_head_v_mla);
7315
+ LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
7316
+ LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
7317
+ LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
7318
+ LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm);
7319
+ LLAMA_LOG_INFO("%s: expert_gating_func = %s\n", __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func));
7207
7320
  }
7208
7321
 
7209
7322
  if (arch == LLM_ARCH_QWEN2MOE) {
7210
- LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
7211
- LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
7323
+ LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
7324
+ LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
7212
7325
  }
7213
7326
 
7214
7327
  if (arch == LLM_ARCH_QWEN3MOE || arch == LLM_ARCH_OPENAI_MOE || arch == LLM_ARCH_QWEN3VLMOE || arch == LLM_ARCH_RND1) {
7215
- LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
7328
+ LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
7216
7329
  }
7217
7330
 
7218
7331
  if (arch == LLM_ARCH_MINICPM ||
@@ -7220,41 +7333,41 @@ void llama_model::print_info() const {
7220
7333
  arch == LLM_ARCH_GRANITE_MOE ||
7221
7334
  arch == LLM_ARCH_GRANITE_HYBRID ||
7222
7335
  arch == LLM_ARCH_NEMOTRON_H_MOE) {
7223
- LLAMA_LOG_INFO("%s: f_embedding_scale = %f\n", __func__, hparams.f_embedding_scale);
7224
- LLAMA_LOG_INFO("%s: f_residual_scale = %f\n", __func__, hparams.f_residual_scale);
7225
- LLAMA_LOG_INFO("%s: f_attention_scale = %f\n", __func__, hparams.f_attention_scale);
7226
- LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
7336
+ LLAMA_LOG_INFO("%s: f_embedding_scale = %f\n", __func__, hparams.f_embedding_scale);
7337
+ LLAMA_LOG_INFO("%s: f_residual_scale = %f\n", __func__, hparams.f_residual_scale);
7338
+ LLAMA_LOG_INFO("%s: f_attention_scale = %f\n", __func__, hparams.f_attention_scale);
7339
+ LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
7227
7340
  }
7228
7341
 
7229
7342
  if (arch == LLM_ARCH_BAILINGMOE) {
7230
- LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
7231
- LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
7232
- LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
7233
- LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
7234
- LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm);
7343
+ LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
7344
+ LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
7345
+ LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
7346
+ LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
7347
+ LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm);
7235
7348
  }
7236
7349
 
7237
7350
  if (arch == LLM_ARCH_BAILINGMOE2) {
7238
- LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
7239
- LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
7240
- LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
7241
- LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
7242
- LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
7243
- LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm);
7244
- LLAMA_LOG_INFO("%s: expert_gating_func = %s\n", __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func));
7245
- LLAMA_LOG_INFO("%s: nextn_predict_layers = %d\n", __func__, hparams.nextn_predict_layers);
7351
+ LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
7352
+ LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
7353
+ LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
7354
+ LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
7355
+ LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
7356
+ LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm);
7357
+ LLAMA_LOG_INFO("%s: expert_gating_func = %s\n", __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func));
7358
+ LLAMA_LOG_INFO("%s: nextn_predict_layers = %d\n", __func__, hparams.nextn_predict_layers);
7246
7359
  }
7247
7360
 
7248
7361
  if (arch == LLM_ARCH_SMALLTHINKER || arch == LLM_ARCH_LFM2MOE) {
7249
- LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
7250
- LLAMA_LOG_INFO("%s: expert_gating_func = %s\n", __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func));
7362
+ LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
7363
+ LLAMA_LOG_INFO("%s: expert_gating_func = %s\n", __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func));
7251
7364
  }
7252
7365
 
7253
7366
  if (arch == LLM_ARCH_GROVEMOE) {
7254
- LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
7255
- LLAMA_LOG_INFO("%s: n_ff_chexp = %d\n", __func__, hparams.n_ff_chexp);
7256
- LLAMA_LOG_INFO("%s: n_group_experts = %d\n", __func__, hparams.n_group_experts);
7257
- LLAMA_LOG_INFO("%s: expert_group_scale = %.2f\n", __func__, hparams.expert_group_scale);
7367
+ LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
7368
+ LLAMA_LOG_INFO("%s: n_ff_chexp = %d\n", __func__, hparams.n_ff_chexp);
7369
+ LLAMA_LOG_INFO("%s: n_group_experts = %d\n", __func__, hparams.n_group_experts);
7370
+ LLAMA_LOG_INFO("%s: expert_group_scale = %.2f\n", __func__, hparams.expert_group_scale);
7258
7371
  }
7259
7372
 
7260
7373
  vocab.print_info();
@@ -7808,6 +7921,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
7808
7921
  llm = std::make_unique<llm_build_exaone4<false>>(*this, params);
7809
7922
  }
7810
7923
  } break;
7924
+ case LLM_ARCH_EXAONE_MOE:
7925
+ {
7926
+ llm = std::make_unique<llm_build_exaone_moe>(*this, params);
7927
+ } break;
7811
7928
  case LLM_ARCH_RWKV6:
7812
7929
  {
7813
7930
  llm = std::make_unique<llm_build_rwkv6>(*this, params);
@@ -8168,6 +8285,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
8168
8285
  case LLM_ARCH_NEMOTRON:
8169
8286
  case LLM_ARCH_EXAONE:
8170
8287
  case LLM_ARCH_EXAONE4:
8288
+ case LLM_ARCH_EXAONE_MOE:
8171
8289
  case LLM_ARCH_MINICPM3:
8172
8290
  case LLM_ARCH_BAILINGMOE2:
8173
8291
  case LLM_ARCH_DOTS1:
@@ -11,6 +11,7 @@
11
11
  #include <memory>
12
12
  #include <string>
13
13
  #include <unordered_map>
14
+ #include <unordered_set>
14
15
  #include <vector>
15
16
 
16
17
  struct llama_cparams;
@@ -476,8 +477,8 @@ struct llama_model {
476
477
  // for quantize-stats only
477
478
  std::vector<std::pair<std::string, struct ggml_tensor *>> tensors_by_name;
478
479
 
479
- // for keeping track of extra nodes used by lora adapters
480
- uint32_t n_lora_nodes = 0;
480
+ // for keeping track of associated LoRA adapters
481
+ std::unordered_set<llama_adapter_lora *> loras;
481
482
 
482
483
  int64_t t_load_us = 0;
483
484
  int64_t t_start_us = 0;