@fugood/llama.node 1.1.4 → 1.1.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32)
  1. package/lib/binding.ts +8 -0
  2. package/package.json +14 -14
  3. package/src/LlamaContext.cpp +3 -0
  4. package/src/llama.cpp/common/arg.cpp +60 -7
  5. package/src/llama.cpp/common/chat.cpp +6 -6
  6. package/src/llama.cpp/common/common.cpp +1 -0
  7. package/src/llama.cpp/common/common.h +14 -5
  8. package/src/llama.cpp/common/speculative.cpp +135 -54
  9. package/src/llama.cpp/common/speculative.h +8 -1
  10. package/src/llama.cpp/ggml/CMakeLists.txt +1 -0
  11. package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +3196 -0
  12. package/src/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +14 -0
  13. package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +263 -0
  14. package/src/llama.cpp/ggml/src/ggml-cpu/repack.h +11 -0
  15. package/src/llama.cpp/include/llama.h +8 -4
  16. package/src/llama.cpp/src/llama-arch.cpp +40 -0
  17. package/src/llama.cpp/src/llama-arch.h +2 -0
  18. package/src/llama.cpp/src/llama-batch.cpp +1 -1
  19. package/src/llama.cpp/src/llama-chat.cpp +20 -1
  20. package/src/llama.cpp/src/llama-chat.h +1 -0
  21. package/src/llama.cpp/src/llama-context.cpp +11 -2
  22. package/src/llama.cpp/src/llama-context.h +4 -1
  23. package/src/llama.cpp/src/llama-graph.cpp +57 -139
  24. package/src/llama.cpp/src/llama-graph.h +31 -32
  25. package/src/llama.cpp/src/llama-kv-cache-unified.cpp +2 -2
  26. package/src/llama.cpp/src/llama-kv-cache-unified.h +1 -1
  27. package/src/llama.cpp/src/llama-memory-hybrid.cpp +2 -1
  28. package/src/llama.cpp/src/llama-memory-hybrid.h +1 -0
  29. package/src/llama.cpp/src/llama-model.cpp +400 -21
  30. package/src/llama.cpp/src/llama-quant.cpp +3 -3
  31. package/src/llama.cpp/src/llama-vocab.cpp +7 -1
  32. package/src/llama.cpp/src/llama-vocab.h +1 -0
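
The bulk of this release is a llama.cpp sync: new LLM_ARCH_LLADA (diffusion) and LLM_ARCH_HUNYUAN_DENSE architectures, `mellum` and `hunyuan-dense` tokenizer pre-types, x86 repack kernels, and a new `use_extra_bufts` field on `llama_model_params` that controls whether the CPU "extra" (repacked) buffer types are considered when placing weights. A minimal sketch of opting out of the extra buffer types through the C API, assuming the `llama_model_default_params` / `llama_model_load_from_file` / `llama_model_free` entry points from the bundled llama.h and a hypothetical model path:

    // sketch only: disable the new CPU "extra" (repacked) buffer types at load time
    #include "llama.h"

    int main(void) {
        llama_model_params mparams = llama_model_default_params();
        mparams.use_extra_bufts = false;   // field added in this update; defaults to true

        llama_model * model = llama_model_load_from_file("model.gguf", mparams); // hypothetical path
        if (model == NULL) {
            return 1;                      // load failed
        }

        llama_model_free(model);
        return 0;
    }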
package/src/llama.cpp/src/llama-model.cpp

@@ -290,7 +290,7 @@ static ggml_backend_buffer_type_t select_weight_buft(const llama_hparams & hpara
  }

  // CPU: ACCEL -> GPU host -> CPU extra -> CPU
- static buft_list_t make_cpu_buft_list(const std::vector<ggml_backend_dev_t> & devices) {
+ static buft_list_t make_cpu_buft_list(const std::vector<ggml_backend_dev_t> & devices, bool use_extra_bufts) {
  buft_list_t buft_list;

  // add ACCEL buffer types
@@ -319,21 +319,22 @@ static buft_list_t make_cpu_buft_list(const std::vector<ggml_backend_dev_t> & de
  }
  }

- // add extra buffer types, only if no GPU device is present
- // ref: https://github.com/ggml-org/llama.cpp/issues/12481#issuecomment-2743136094
- auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
- if (cpu_dev == nullptr) {
- throw std::runtime_error(format("%s: no CPU backend found", __func__));
- }
+ // add extra buffer types
+ if (use_extra_bufts) {
+ auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+ if (cpu_dev == nullptr) {
+ throw std::runtime_error(format("%s: no CPU backend found", __func__));
+ }

- auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);
- auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
- ggml_backend_reg_get_proc_address(cpu_reg, "ggml_backend_dev_get_extra_bufts");
- if (ggml_backend_dev_get_extra_bufts_fn) {
- ggml_backend_buffer_type_t * extra_bufts = ggml_backend_dev_get_extra_bufts_fn(cpu_dev);
- while (extra_bufts && *extra_bufts) {
- buft_list.emplace_back(cpu_dev, *extra_bufts);
- ++extra_bufts;
+ auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);
+ auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
+ ggml_backend_reg_get_proc_address(cpu_reg, "ggml_backend_dev_get_extra_bufts");
+ if (ggml_backend_dev_get_extra_bufts_fn) {
+ ggml_backend_buffer_type_t * extra_bufts = ggml_backend_dev_get_extra_bufts_fn(cpu_dev);
+ while (extra_bufts && *extra_bufts) {
+ buft_list.emplace_back(cpu_dev, *extra_bufts);
+ ++extra_bufts;
+ }
  }
  }

@@ -869,6 +870,21 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  hparams.causal_attn = false;
  }
  break;
+ case LLM_ARCH_LLADA:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ // LLaDA-8B has 32 layers, similar to LLaMA but for diffusion
+ switch (hparams.n_layer) {
+ case 32:
+ type = LLM_TYPE_8B;
+ break;
+ default:
+ type = LLM_TYPE_UNKNOWN;
+ }
+ // Set non-causal attention for diffusion models
+ hparams.causal_attn = false;
+ }
+ break;
  case LLM_ARCH_QWEN2MOE:
  {
  ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
@@ -883,6 +899,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  } break;
  case LLM_ARCH_QWEN3:
  {
+ ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  switch (hparams.n_layer) {
  case 28: type = hparams.n_embd == 1024 ? LLM_TYPE_0_6B : LLM_TYPE_1_7B; break;
@@ -1744,6 +1761,18 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  default: type = LLM_TYPE_UNKNOWN;
  }
  } break;
+ case LLM_ARCH_HUNYUAN_DENSE:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+ switch (hparams.n_embd) {
+ case 1024: type = LLM_TYPE_0_5B; break;
+ case 2048: type = LLM_TYPE_1_8B; break;
+ case 3072: type = LLM_TYPE_4B; break;
+ case 4096: type = LLM_TYPE_7B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
  case LLM_ARCH_SMOLLM3:
  {
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -1824,7 +1853,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  LLAMA_LOG_INFO("%s: loading model tensors, this can take a while... (mmap = %s)\n", __func__, ml.use_mmap ? "true" : "false");

  // build a list of buffer types for the CPU and GPU devices
- pimpl->cpu_buft_list = make_cpu_buft_list(devices);
+ pimpl->cpu_buft_list = make_cpu_buft_list(devices, params.use_extra_bufts);
  for (auto * dev : devices) {
  buft_list_t buft_list = make_gpu_buft_list(dev, split_mode, tensor_split);
  // add CPU buffer types as a fallback
@@ -2029,7 +2058,13 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  for (const auto * overrides = ml.tensor_buft_overrides; overrides->pattern != nullptr; ++overrides) {
  std::regex pattern(overrides->pattern);
  if (std::regex_search(tensor_name, pattern)) {
- buft = overrides->buft;
+ if (overrides->buft == ggml_backend_cpu_buffer_type()) {
+ // when overriding to a CPU buffer, consider the extra buffer types
+ buft = select_weight_buft(hparams, t_meta, op, pimpl->cpu_buft_list);
+ } else {
+ buft = overrides->buft;
+ }
+
  LLAMA_LOG_DEBUG("tensor %s (%zu MiB %s) buffer type overridden to %s\n",
  tensor_name.c_str(),
  ggml_nbytes(t_meta) / 1024 / 1024, ggml_type_name(t_meta->type),
@@ -2149,6 +2184,53 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  }
  }
  } break;
+ case LLM_ARCH_LLADA:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED);
+
+ // if output is NULL, init from the input tok embed
+ if (output == NULL) {
+ output =
+ create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, TENSOR_DUPLICATED);
+ }
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
+
+ // Use separate Q, K, V projections without bias, matching LLaDALlamaBlock
+ layer.wq =
+ create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head }, 0);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_k_gqa }, 0);
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_v_gqa }, 0);
+ // No bias for QKV projections as per config: include_bias=false, include_qkv_bias=false
+ layer.wo =
+ create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0);
+ layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), { n_embd }, TENSOR_NOT_REQUIRED);
+
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd }, 0);
+
+ layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), { n_rot / 2 },
+ TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), { n_embd, n_ff }, 0);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, 0);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, n_ff }, 0);
+
+ // optional MLP bias
+ layer.ffn_gate_b =
+ create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), { n_ff }, TENSOR_NOT_REQUIRED);
+ layer.ffn_down_b =
+ create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), { n_embd }, TENSOR_NOT_REQUIRED);
+ layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), { n_ff }, TENSOR_NOT_REQUIRED);
+ }
+ }
+ break;
  case LLM_ARCH_LLAMA4:
  {
  tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -5126,6 +5208,39 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {hparams.n_ff_shexp, n_embd}, 0);
  }
  } break;
+ case LLM_ARCH_HUNYUAN_DENSE:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+ // if output is NULL, init from the input tok embed
+ if (output == NULL) {
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+ }
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
+
+ layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
+ layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
+
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+
+ }
+ } break;
  case LLM_ARCH_SMOLLM3:
  {
  tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -8042,6 +8157,106 @@ struct llm_build_dream : public llm_graph_context {
  }
  };

+ struct llm_build_llada : public llm_graph_context {
+ llm_build_llada(const llama_model & model, const llm_graph_params & params) :
+ llm_graph_context(params) {
+ // LLaDA is similar to LLaMA but uses non-causal attention for diffusion
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+
+ inpL = build_inp_embd(model.tok_embd);
+
+ // inp_pos - contains the positions
+ ggml_tensor * inp_pos = build_inp_pos();
+
+ // Non-causal attention for diffusion
+ auto * inp_attn = build_attn_inp_no_cache();
+
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+ for (int il = 0; il < n_layer; ++il) {
+ ggml_tensor * inpSA = inpL;
+
+ // norm
+ cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
+ cb(cur, "attn_norm", il);
+
+ // self-attention
+ {
+ // compute separate Q, K, V projections without bias, matching LLaDALlamaBlock
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+ Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow);
+
+ Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow);
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ cur = build_attn(inp_attn, model.layers[il].wo, NULL, Qcur, Kcur, Vcur, nullptr, nullptr,
+ 1.0f / sqrtf(float(n_embd_head)), il);
+ }
+
+ if (il == n_layer - 1 && inp_out_ids) {
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }
+
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il);
+
+ // feed-forward network
+ cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il);
+ cb(cur, "ffn_norm", il);
+
+ cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL,
+ model.layers[il].ffn_down, NULL, NULL, NULL, LLM_FFN_SILU, LLM_FFN_PAR, il);
+ cb(cur, "ffn_out", il);
+
+ cur = ggml_add(ctx0, cur, ffn_inp);
+
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+
+ cur = inpL;
+
+ cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
+
+ // lm_head
+ cur = build_lora_mm(model.output, cur);
+
+ cb(cur, "result_output", -1);
+ res->t_logits = cur;
+
+ ggml_build_forward_expand(gf, cur);
+ }
+ };
+
  struct llm_build_qwen2vl : public llm_graph_context {
  llm_build_qwen2vl(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  const int64_t n_embd_head = hparams.n_embd_head_v;
@@ -16761,6 +16976,144 @@ struct llm_build_hunyuan_moe : public llm_graph_context {
  }
  };

+ struct llm_build_hunyuan_dense : public llm_graph_context {
+ llm_build_hunyuan_dense(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+
+ inpL = build_inp_embd(model.tok_embd);
+
+ // inp_pos - contains the positions
+ ggml_tensor * inp_pos = build_inp_pos();
+
+ auto * inp_attn = build_attn_inp_kv_unified();
+
+ const float kq_scale = 1.0f / sqrtf(float(n_embd_head));
+
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+ for (int il = 0; il < n_layer; ++il) {
+ ggml_tensor * inpSA = inpL;
+
+ // norm
+ cur = build_norm(inpL,
+ model.layers[il].attn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "attn_norm", il);
+ // self-attention
+ {
+ // rope freq factors for llama3; may return nullptr for llama2 and other models
+ ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
+
+ // compute Q and K and RoPE them
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);
+ if (model.layers[il].bq) {
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+ cb(Qcur, "Qcur", il);
+ }
+
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);
+ if (model.layers[il].bk) {
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+ cb(Kcur, "Kcur", il);
+ }
+
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
+ if (model.layers[il].bv) {
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+ cb(Vcur, "Vcur", il);
+ }
+
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+ Qcur = ggml_rope_ext(
+ ctx0, Qcur, inp_pos, rope_factors,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ Kcur = ggml_rope_ext(
+ ctx0, Kcur, inp_pos, rope_factors,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ Kcur = build_norm(Kcur,
+ model.layers[il].attn_k_norm, nullptr,
+ LLM_NORM_RMS, il);
+ cb(Kcur, "Kcur_norm", il);
+
+ Qcur = build_norm(Qcur,
+ model.layers[il].attn_q_norm, nullptr,
+ LLM_NORM_RMS, il);
+ cb(Qcur, "Qcur_norm", il);
+
+ cur = build_attn(inp_attn,
+ model.layers[il].wo, model.layers[il].bo,
+ Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
+ cb(cur, "attn_out", il);
+ }
+
+ if (il == n_layer - 1 && inp_out_ids) {
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }
+
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il);
+
+ cur = build_norm(ffn_inp,
+ model.layers[il].ffn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "ffn_norm", il);
+ // feed-forward network (non-MoE)
+ ggml_tensor * cur_mlp = build_ffn(cur,
+ model.layers[il].ffn_up, NULL, NULL,
+ model.layers[il].ffn_gate, NULL, NULL,
+ model.layers[il].ffn_down, NULL, NULL,
+ NULL,
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
+ cb(cur_mlp, "ffn_out", il);
+
+ cur = ggml_add(ctx0, cur_mlp, ffn_inp);
+
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+ cur = inpL;
+
+ cur = build_norm(cur,
+ model.output_norm, NULL,
+ LLM_NORM_RMS, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
+ // lm_head
+ cur = build_lora_mm(model.output, cur);
+ cb(cur, "result_output", -1);
+ res->t_logits = cur;
+
+ ggml_build_forward_expand(gf, cur);
+ }
+ };
+
  struct llm_build_smollm3 : public llm_graph_context {
  llm_build_smollm3(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  const int64_t n_embd_head = hparams.n_embd_head_v;
@@ -17158,10 +17511,18 @@ struct llm_build_smallthinker : public llm_graph_context{
  cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il);
  cb(cur, "ffn_norm", il);

- ggml_tensor * ffn_out = build_moe_ffn_from_probs(cur, probs, model.layers[il].ffn_up_exps,
- model.layers[il].ffn_gate_exps, model.layers[il].ffn_down_exps,
- nullptr, n_expert, n_expert_used,
- static_cast<llama_expert_gating_func_type>(hparams.expert_gating_func), il);
+ ggml_tensor * ffn_out =
+ build_moe_ffn(cur,
+ nullptr,
+ model.layers[il].ffn_up_exps,
+ model.layers[il].ffn_gate_exps,
+ model.layers[il].ffn_down_exps,
+ nullptr,
+ n_expert, n_expert_used,
+ LLM_FFN_RELU, true,
+ false, 0.0,
+ static_cast<llama_expert_gating_func_type>(hparams.expert_gating_func),
+ il, probs);

  cb(ffn_out, "ffn_out", il);
  cur = ffn_out;
@@ -17201,6 +17562,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
  case LLM_ARCH_NEO_BERT:
  case LLM_ARCH_WAVTOKENIZER_DEC:
  case LLM_ARCH_DREAM:
+ case LLM_ARCH_LLADA:
  {
  res = nullptr;
  } break;
@@ -17236,6 +17598,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
  /* recurrent_kv_size */ std::max((uint32_t) 1, cparams.n_seq_max),
  /* n_seq_max */ cparams.n_seq_max,
  /* offload */ cparams.offload_kqv,
+ /* unified */ cparams.kv_unified,
  /* filter_attn */ (arch == LLM_ARCH_FALCON_H1) ? [&](int32_t) { return true; } : (llama_memory_hybrid::layer_filter_cb)nullptr,
  /* filter_recr */ (arch == LLM_ARCH_FALCON_H1) ? [&](int32_t) { return true; } : (llama_memory_hybrid::layer_filter_cb)nullptr);
  } else {
@@ -17367,6 +17730,11 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
  llm = std::make_unique<llm_build_dream>(*this, params);
  }
  break;
+ case LLM_ARCH_LLADA:
+ {
+ llm = std::make_unique<llm_build_llada>(*this, params);
+ }
+ break;
  case LLM_ARCH_QWEN2VL:
  {
  llm = std::make_unique<llm_build_qwen2vl>(*this, params);
@@ -17614,6 +17982,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
  {
  llm = std::make_unique<llm_build_hunyuan_moe>(*this, params);
  } break;
+ case LLM_ARCH_HUNYUAN_DENSE:
+ {
+ llm = std::make_unique<llm_build_hunyuan_dense>(*this, params);
+ } break;
  case LLM_ARCH_SMOLLM3:
  {
  llm = std::make_unique<llm_build_smollm3>(*this, params);
@@ -17663,6 +18035,7 @@ llama_model_params llama_model_default_params() {
  /*.use_mmap =*/ true,
  /*.use_mlock =*/ false,
  /*.check_tensors =*/ false,
+ /*.use_extra_bufts =*/ true,
  };

  #ifdef GGML_USE_METAL
@@ -17765,6 +18138,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {

  // use what we call a normal RoPE, operating on pairs of consecutive head values
  case LLM_ARCH_LLAMA:
+ case LLM_ARCH_LLADA:
  case LLM_ARCH_LLAMA4:
  case LLM_ARCH_DECI:
  case LLM_ARCH_BAICHUAN:
@@ -17831,6 +18205,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
  case LLM_ARCH_MINICPM3:
  case LLM_ARCH_DOTS1:
  case LLM_ARCH_HUNYUAN_MOE:
+ case LLM_ARCH_HUNYUAN_DENSE:
  case LLM_ARCH_LFM2:
  case LLM_ARCH_SMALLTHINKER:
  return LLAMA_ROPE_TYPE_NEOX;
@@ -17943,6 +18318,10 @@ bool llama_model_is_recurrent(const llama_model * model) {
  return llm_arch_is_recurrent(model->arch);
  }

+ bool llama_model_is_diffusion(const llama_model * model) {
+ return llm_arch_is_diffusion(model->arch);
+ }
+
  const std::vector<std::pair<std::string, ggml_tensor *>> & llama_internal_get_tensor_map(const llama_model * model) {
  return model->tensors_by_name;
  }
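
The llama-model.cpp changes above also add `llama_model_is_diffusion` (backed by `llm_arch_is_diffusion`), letting callers detect that an architecture such as LLaDA or Dream builds a non-causal graph with no KV-cache memory (`create_memory` returns nullptr for these) and must be driven by a diffusion-style denoising loop rather than ordinary autoregressive decoding. A hedged sketch, assuming the function is declared in the bundled llama.h:

    // sketch only: pick a decoding strategy based on the model architecture
    if (llama_model_is_diffusion(model)) {
        // LLaDA/Dream: no KV cache is created, so run an iterative
        // diffusion-style denoising loop over the whole sequence
    } else {
        // conventional autoregressive decoding via llama_decode()
    }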
package/src/llama.cpp/src/llama-quant.cpp

@@ -875,9 +875,10 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::

  // get more optimal quantization type based on the tensor shape, layer, etc.
  if (!params->pure && ggml_is_quantized(default_type)) {
+ int fallback = qs.n_fallback;
  new_type = llama_tensor_get_type(qs, new_type, tensor, ftype);
- // unless the user specifies a type
- if (params->tensor_types) {
+ // unless the user specifies a type, and the tensor geometry will not require fallback quantisation
+ if (params->tensor_types && qs.n_fallback - fallback == 0) {
  const std::vector<tensor_quantization> & tensor_types = *static_cast<const std::vector<tensor_quantization> *>(params->tensor_types);
  const std::string tensor_name(tensor->name);
  for (const auto & [tname, qtype] : tensor_types) {
@@ -890,7 +891,6 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
  }
  }
  }
-
  if (params->token_embedding_type < GGML_TYPE_COUNT && strcmp(tensor->name, "token_embd.weight") == 0) {
  new_type = params->token_embedding_type;
  }
package/src/llama.cpp/src/llama-vocab.cpp

@@ -307,6 +307,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
  };
  break;
  case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM:
+ case LLAMA_VOCAB_PRE_TYPE_HUNYUAN_DENSE:
  regex_exprs = {
  "\\p{N}{1,3}",
  "[一-龥぀-ゟ゠-ヿ]+",
@@ -1855,7 +1856,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
  tokenizer_pre == "gigachat" ||
  tokenizer_pre == "jina-v2-es" ||
  tokenizer_pre == "jina-v2-de" ||
- tokenizer_pre == "a.x-4.0") {
+ tokenizer_pre == "a.x-4.0" ||
+ tokenizer_pre == "mellum") {
  pre_type = LLAMA_VOCAB_PRE_TYPE_GPT2;
  } else if (
  tokenizer_pre == "jina-v1-en" ||
@@ -1964,6 +1966,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
  tokenizer_pre == "hunyuan") {
  pre_type = LLAMA_VOCAB_PRE_TYPE_HUNYUAN;
  clean_spaces = false;
+ } else if (
+ tokenizer_pre == "hunyuan-dense") {
+ pre_type = LLAMA_VOCAB_PRE_TYPE_HUNYUAN_DENSE;
+ clean_spaces = false;
  } else if (
  tokenizer_pre == "kimi-k2") {
  pre_type = LLAMA_VOCAB_PRE_TYPE_KIMI_K2;
package/src/llama.cpp/src/llama-vocab.h

@@ -46,6 +46,7 @@ enum llama_vocab_pre_type {
  LLAMA_VOCAB_PRE_TYPE_SEED_CODER = 35,
  LLAMA_VOCAB_PRE_TYPE_HUNYUAN = 36,
  LLAMA_VOCAB_PRE_TYPE_KIMI_K2 = 37,
+ LLAMA_VOCAB_PRE_TYPE_HUNYUAN_DENSE = 38,
  };

  struct LLM_KV;