@fugood/llama.node 1.1.3 → 1.1.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +3 -0
- package/lib/binding.ts +8 -0
- package/package.json +14 -14
- package/src/LlamaCompletionWorker.cpp +45 -5
- package/src/LlamaContext.cpp +3 -0
- package/src/llama.cpp/common/arg.cpp +60 -7
- package/src/llama.cpp/common/chat.cpp +6 -6
- package/src/llama.cpp/common/common.cpp +1 -0
- package/src/llama.cpp/common/common.h +14 -5
- package/src/llama.cpp/common/speculative.cpp +135 -54
- package/src/llama.cpp/common/speculative.h +8 -1
- package/src/llama.cpp/ggml/CMakeLists.txt +1 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +3196 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +14 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +263 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/repack.h +11 -0
- package/src/llama.cpp/include/llama.h +8 -4
- package/src/llama.cpp/src/llama-arch.cpp +40 -0
- package/src/llama.cpp/src/llama-arch.h +2 -0
- package/src/llama.cpp/src/llama-batch.cpp +1 -1
- package/src/llama.cpp/src/llama-chat.cpp +20 -1
- package/src/llama.cpp/src/llama-chat.h +1 -0
- package/src/llama.cpp/src/llama-context.cpp +11 -2
- package/src/llama.cpp/src/llama-context.h +4 -1
- package/src/llama.cpp/src/llama-graph.cpp +57 -139
- package/src/llama.cpp/src/llama-graph.h +31 -32
- package/src/llama.cpp/src/llama-kv-cache-unified.cpp +2 -2
- package/src/llama.cpp/src/llama-kv-cache-unified.h +1 -1
- package/src/llama.cpp/src/llama-memory-hybrid.cpp +2 -1
- package/src/llama.cpp/src/llama-memory-hybrid.h +1 -0
- package/src/llama.cpp/src/llama-model.cpp +400 -21
- package/src/llama.cpp/src/llama-quant.cpp +3 -3
- package/src/llama.cpp/src/llama-vocab.cpp +7 -1
- package/src/llama.cpp/src/llama-vocab.h +1 -0
package/src/llama.cpp/src/llama-model.cpp

@@ -290,7 +290,7 @@ static ggml_backend_buffer_type_t select_weight_buft(const llama_hparams & hpara
 }

 // CPU: ACCEL -> GPU host -> CPU extra -> CPU
-static buft_list_t make_cpu_buft_list(const std::vector<ggml_backend_dev_t> & devices) {
+static buft_list_t make_cpu_buft_list(const std::vector<ggml_backend_dev_t> & devices, bool use_extra_bufts) {
     buft_list_t buft_list;

     // add ACCEL buffer types
@@ -319,21 +319,22 @@ static buft_list_t make_cpu_buft_list(const std::vector<ggml_backend_dev_t> & de
         }
     }

-    // add extra buffer types
-
-
-
-
-
+    // add extra buffer types
+    if (use_extra_bufts) {
+        auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+        if (cpu_dev == nullptr) {
+            throw std::runtime_error(format("%s: no CPU backend found", __func__));
+        }

-
-
-
-
-
-
-
-
+        auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);
+        auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
+            ggml_backend_reg_get_proc_address(cpu_reg, "ggml_backend_dev_get_extra_bufts");
+        if (ggml_backend_dev_get_extra_bufts_fn) {
+            ggml_backend_buffer_type_t * extra_bufts = ggml_backend_dev_get_extra_bufts_fn(cpu_dev);
+            while (extra_bufts && *extra_bufts) {
+                buft_list.emplace_back(cpu_dev, *extra_bufts);
+                ++extra_bufts;
+            }
         }
     }

@@ -869,6 +870,21 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                 hparams.causal_attn = false;
             }
             break;
+        case LLM_ARCH_LLADA:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+                // LLaDA-8B has 32 layers, similar to LLaMA but for diffusion
+                switch (hparams.n_layer) {
+                    case 32:
+                        type = LLM_TYPE_8B;
+                        break;
+                    default:
+                        type = LLM_TYPE_UNKNOWN;
+                }
+                // Set non-causal attention for diffusion models
+                hparams.causal_attn = false;
+            }
+            break;
         case LLM_ARCH_QWEN2MOE:
             {
                 ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
@@ -883,6 +899,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
             } break;
         case LLM_ARCH_QWEN3:
            {
+                ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                 switch (hparams.n_layer) {
                     case 28: type = hparams.n_embd == 1024 ? LLM_TYPE_0_6B : LLM_TYPE_1_7B; break;
@@ -1744,6 +1761,18 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     default: type = LLM_TYPE_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_HUNYUAN_DENSE:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+                switch (hparams.n_embd) {
+                    case 1024: type = LLM_TYPE_0_5B; break;
+                    case 2048: type = LLM_TYPE_1_8B; break;
+                    case 3072: type = LLM_TYPE_4B; break;
+                    case 4096: type = LLM_TYPE_7B; break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+            } break;
         case LLM_ARCH_SMOLLM3:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -1824,7 +1853,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
     LLAMA_LOG_INFO("%s: loading model tensors, this can take a while... (mmap = %s)\n", __func__, ml.use_mmap ? "true" : "false");

     // build a list of buffer types for the CPU and GPU devices
-    pimpl->cpu_buft_list = make_cpu_buft_list(devices);
+    pimpl->cpu_buft_list = make_cpu_buft_list(devices, params.use_extra_bufts);
     for (auto * dev : devices) {
         buft_list_t buft_list = make_gpu_buft_list(dev, split_mode, tensor_split);
         // add CPU buffer types as a fallback
@@ -2029,7 +2058,13 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
            for (const auto * overrides = ml.tensor_buft_overrides; overrides->pattern != nullptr; ++overrides) {
                std::regex pattern(overrides->pattern);
                if (std::regex_search(tensor_name, pattern)) {
-
+                    if (overrides->buft == ggml_backend_cpu_buffer_type()) {
+                        // when overriding to a CPU buffer, consider the extra buffer types
+                        buft = select_weight_buft(hparams, t_meta, op, pimpl->cpu_buft_list);
+                    } else {
+                        buft = overrides->buft;
+                    }
+
                    LLAMA_LOG_DEBUG("tensor %s (%zu MiB %s) buffer type overridden to %s\n",
                            tensor_name.c_str(),
                            ggml_nbytes(t_meta) / 1024 / 1024, ggml_type_name(t_meta->type),
@@ -2149,6 +2184,53 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                        }
                    }
                } break;
+            case LLM_ARCH_LLADA:
+                {
+                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
+
+                    // output
+                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
+                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED);
+
+                    // if output is NULL, init from the input tok embed
+                    if (output == NULL) {
+                        output =
+                            create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, TENSOR_DUPLICATED);
+                    }
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        auto & layer = layers[i];
+
+                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
+
+                        // Use separate Q, K, V projections without bias, matching LLaDALlamaBlock
+                        layer.wq =
+                            create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head }, 0);
+                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_k_gqa }, 0);
+                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_v_gqa }, 0);
+                        // No bias for QKV projections as per config: include_bias=false, include_qkv_bias=false
+                        layer.wo =
+                            create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0);
+                        layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), { n_embd }, TENSOR_NOT_REQUIRED);
+
+                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd }, 0);
+
+                        layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), { n_rot / 2 },
+                                                         TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+
+                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), { n_embd, n_ff }, 0);
+                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, 0);
+                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, n_ff }, 0);
+
+                        // optional MLP bias
+                        layer.ffn_gate_b =
+                            create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), { n_ff }, TENSOR_NOT_REQUIRED);
+                        layer.ffn_down_b =
+                            create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), { n_embd }, TENSOR_NOT_REQUIRED);
+                        layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), { n_ff }, TENSOR_NOT_REQUIRED);
+                    }
+                }
+                break;
            case LLM_ARCH_LLAMA4:
                {
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -5126,6 +5208,39 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                        layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {hparams.n_ff_shexp, n_embd}, 0);
                    }
                } break;
+            case LLM_ARCH_HUNYUAN_DENSE:
+                {
+                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+                    // output
+                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+                    // if output is NULL, init from the input tok embed
+                    if (output == NULL) {
+                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+                    }
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        auto & layer = layers[i];
+
+                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
+                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
+                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
+                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
+
+                        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
+                        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
+
+                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+
+                    }
+                } break;
            case LLM_ARCH_SMOLLM3:
                {
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -8042,6 +8157,106 @@ struct llm_build_dream : public llm_graph_context {
    }
 };

+struct llm_build_llada : public llm_graph_context {
+    llm_build_llada(const llama_model & model, const llm_graph_params & params) :
+        llm_graph_context(params) {
+        // LLaDA is similar to LLaMA but uses non-causal attention for diffusion
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+        ggml_tensor * cur;
+        ggml_tensor * inpL;
+
+        inpL = build_inp_embd(model.tok_embd);
+
+        // inp_pos - contains the positions
+        ggml_tensor * inp_pos = build_inp_pos();
+
+        // Non-causal attention for diffusion
+        auto * inp_attn = build_attn_inp_no_cache();
+
+        ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+        for (int il = 0; il < n_layer; ++il) {
+            ggml_tensor * inpSA = inpL;
+
+            // norm
+            cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
+            cb(cur, "attn_norm", il);
+
+            // self-attention
+            {
+                // compute separate Q, K, V projections without bias, matching LLaDALlamaBlock
+                ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+                ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+                ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+
+                cb(Qcur, "Qcur", il);
+                cb(Kcur, "Kcur", il);
+                cb(Vcur, "Vcur", il);
+
+                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+                Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+                Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                                     ext_factor, attn_factor, beta_fast, beta_slow);
+
+                Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                                     ext_factor, attn_factor, beta_fast, beta_slow);
+
+                cb(Qcur, "Qcur", il);
+                cb(Kcur, "Kcur", il);
+                cb(Vcur, "Vcur", il);
+
+                cur = build_attn(inp_attn, model.layers[il].wo, NULL, Qcur, Kcur, Vcur, nullptr, nullptr,
+                                 1.0f / sqrtf(float(n_embd_head)), il);
+            }
+
+            if (il == n_layer - 1 && inp_out_ids) {
+                cur   = ggml_get_rows(ctx0, cur, inp_out_ids);
+                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            }
+
+            ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+            cb(ffn_inp, "ffn_inp", il);
+
+            // feed-forward network
+            cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il);
+            cb(cur, "ffn_norm", il);
+
+            cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL,
+                            model.layers[il].ffn_down, NULL, NULL, NULL, LLM_FFN_SILU, LLM_FFN_PAR, il);
+            cb(cur, "ffn_out", il);
+
+            cur = ggml_add(ctx0, cur, ffn_inp);
+
+            cur = build_cvec(cur, il);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
+
+        cb(cur, "result_norm", -1);
+        res->t_embd = cur;
+
+        // lm_head
+        cur = build_lora_mm(model.output, cur);
+
+        cb(cur, "result_output", -1);
+        res->t_logits = cur;
+
+        ggml_build_forward_expand(gf, cur);
+    }
+};
+
 struct llm_build_qwen2vl : public llm_graph_context {
     llm_build_qwen2vl(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
         const int64_t n_embd_head = hparams.n_embd_head_v;
@@ -16761,6 +16976,144 @@ struct llm_build_hunyuan_moe : public llm_graph_context {
    }
 };

+struct llm_build_hunyuan_dense : public llm_graph_context {
+    llm_build_hunyuan_dense(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+        ggml_tensor * cur;
+        ggml_tensor * inpL;
+
+        inpL = build_inp_embd(model.tok_embd);
+
+        // inp_pos - contains the positions
+        ggml_tensor * inp_pos = build_inp_pos();
+
+        auto * inp_attn = build_attn_inp_kv_unified();
+
+        const float kq_scale = 1.0f / sqrtf(float(n_embd_head));
+
+        ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+        for (int il = 0; il < n_layer; ++il) {
+            ggml_tensor * inpSA = inpL;
+
+            // norm
+            cur = build_norm(inpL,
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM_RMS, il);
+            cb(cur, "attn_norm", il);
+            // self-attention
+            {
+                // rope freq factors for llama3; may return nullptr for llama2 and other models
+                ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
+
+                // compute Q and K and RoPE them
+                ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
+                if (model.layers[il].bq) {
+                    Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+                    cb(Qcur, "Qcur", il);
+                }
+
+                ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
+                if (model.layers[il].bk) {
+                    Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+                    cb(Kcur, "Kcur", il);
+                }
+
+                ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
+                if (model.layers[il].bv) {
+                    Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+                    cb(Vcur, "Vcur", il);
+                }
+
+                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+                Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+                Qcur = ggml_rope_ext(
+                        ctx0, Qcur, inp_pos, rope_factors,
+                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow
+                        );
+
+                cb(Qcur, "Qcur", il);
+                cb(Kcur, "Kcur", il);
+                cb(Vcur, "Vcur", il);
+
+                Kcur = ggml_rope_ext(
+                        ctx0, Kcur, inp_pos, rope_factors,
+                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow
+                        );
+
+                Kcur = build_norm(Kcur,
+                        model.layers[il].attn_k_norm, nullptr,
+                        LLM_NORM_RMS, il);
+                cb(Kcur, "Kcur_norm", il);
+
+                Qcur = build_norm(Qcur,
+                        model.layers[il].attn_q_norm, nullptr,
+                        LLM_NORM_RMS, il);
+                cb(Qcur, "Qcur_norm", il);
+
+                cur = build_attn(inp_attn,
+                        model.layers[il].wo, model.layers[il].bo,
+                        Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
+                cb(cur, "attn_out", il);
+            }
+
+            if (il == n_layer - 1 && inp_out_ids) {
+                cur   = ggml_get_rows(ctx0, cur, inp_out_ids);
+                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            }
+
+            ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+            cb(ffn_inp, "ffn_inp", il);
+
+            cur = build_norm(ffn_inp,
+                    model.layers[il].ffn_norm, NULL,
+                    LLM_NORM_RMS, il);
+            cb(cur, "ffn_norm", il);
+            // feed-forward network (non-MoE)
+            ggml_tensor * cur_mlp = build_ffn(cur,
+                    model.layers[il].ffn_up, NULL, NULL,
+                    model.layers[il].ffn_gate, NULL, NULL,
+                    model.layers[il].ffn_down, NULL, NULL,
+                    NULL,
+                    LLM_FFN_SILU, LLM_FFN_PAR, il);
+            cb(cur_mlp, "ffn_out", il);
+
+            cur = ggml_add(ctx0, cur_mlp, ffn_inp);
+
+            cur = build_cvec(cur, il);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+        cur = inpL;
+
+        cur = build_norm(cur,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, -1);
+
+        cb(cur, "result_norm", -1);
+        res->t_embd = cur;
+        // lm_head
+        cur = build_lora_mm(model.output, cur);
+        cb(cur, "result_output", -1);
+        res->t_logits = cur;
+
+        ggml_build_forward_expand(gf, cur);
+    }
+};
+
 struct llm_build_smollm3 : public llm_graph_context {
     llm_build_smollm3(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
         const int64_t n_embd_head = hparams.n_embd_head_v;
@@ -17158,10 +17511,18 @@ struct llm_build_smallthinker : public llm_graph_context{
            cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il);
            cb(cur, "ffn_norm", il);

-            ggml_tensor * ffn_out =
-
-
-
+            ggml_tensor * ffn_out =
+                build_moe_ffn(cur,
+                    nullptr,
+                    model.layers[il].ffn_up_exps,
+                    model.layers[il].ffn_gate_exps,
+                    model.layers[il].ffn_down_exps,
+                    nullptr,
+                    n_expert, n_expert_used,
+                    LLM_FFN_RELU, true,
+                    false, 0.0,
+                    static_cast<llama_expert_gating_func_type>(hparams.expert_gating_func),
+                    il, probs);

            cb(ffn_out, "ffn_out", il);
            cur = ffn_out;
@@ -17201,6 +17562,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
        case LLM_ARCH_NEO_BERT:
        case LLM_ARCH_WAVTOKENIZER_DEC:
        case LLM_ARCH_DREAM:
+        case LLM_ARCH_LLADA:
            {
                res = nullptr;
            } break;
@@ -17236,6 +17598,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
                    /* recurrent_kv_size */ std::max((uint32_t) 1, cparams.n_seq_max),
                    /* n_seq_max */ cparams.n_seq_max,
                    /* offload */ cparams.offload_kqv,
+                    /* unified */ cparams.kv_unified,
                    /* filter_attn */ (arch == LLM_ARCH_FALCON_H1) ? [&](int32_t) { return true; } : (llama_memory_hybrid::layer_filter_cb)nullptr,
                    /* filter_recr */ (arch == LLM_ARCH_FALCON_H1) ? [&](int32_t) { return true; } : (llama_memory_hybrid::layer_filter_cb)nullptr);
            } else {
@@ -17367,6 +17730,11 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
                llm = std::make_unique<llm_build_dream>(*this, params);
            }
            break;
+        case LLM_ARCH_LLADA:
+            {
+                llm = std::make_unique<llm_build_llada>(*this, params);
+            }
+            break;
        case LLM_ARCH_QWEN2VL:
            {
                llm = std::make_unique<llm_build_qwen2vl>(*this, params);
@@ -17614,6 +17982,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
            {
                llm = std::make_unique<llm_build_hunyuan_moe>(*this, params);
            } break;
+        case LLM_ARCH_HUNYUAN_DENSE:
+            {
+                llm = std::make_unique<llm_build_hunyuan_dense>(*this, params);
+            } break;
        case LLM_ARCH_SMOLLM3:
            {
                llm = std::make_unique<llm_build_smollm3>(*this, params);
@@ -17663,6 +18035,7 @@ llama_model_params llama_model_default_params() {
        /*.use_mmap        =*/ true,
        /*.use_mlock       =*/ false,
        /*.check_tensors   =*/ false,
+        /*.use_extra_bufts =*/ true,
    };

 #ifdef GGML_USE_METAL
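The hunk above adds a use_extra_bufts field to llama_model_default_params(), so repacked CPU "extra" buffer types stay enabled by default and callers can opt out before loading a model. Below is a minimal sketch of opting out through the llama.h C API; the field name comes from the diff above, and the model path is a placeholder.

// Sketch only (not part of the diff): load a model with CPU extra buffer types disabled.
#include "llama.h"

int main(void) {
    llama_model_params mparams = llama_model_default_params();
    mparams.use_extra_bufts = false; // new field in this release; defaults to true

    llama_model * model = llama_model_load_from_file("/path/to/model.gguf", mparams);
    if (model == NULL) {
        return 1;
    }

    // ... create a context and run inference as usual ...

    llama_model_free(model);
    return 0;
}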
@@ -17765,6 +18138,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {

        // use what we call a normal RoPE, operating on pairs of consecutive head values
        case LLM_ARCH_LLAMA:
+        case LLM_ARCH_LLADA:
        case LLM_ARCH_LLAMA4:
        case LLM_ARCH_DECI:
        case LLM_ARCH_BAICHUAN:
@@ -17831,6 +18205,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
        case LLM_ARCH_MINICPM3:
        case LLM_ARCH_DOTS1:
        case LLM_ARCH_HUNYUAN_MOE:
+        case LLM_ARCH_HUNYUAN_DENSE:
        case LLM_ARCH_LFM2:
        case LLM_ARCH_SMALLTHINKER:
            return LLAMA_ROPE_TYPE_NEOX;
@@ -17943,6 +18318,10 @@ bool llama_model_is_recurrent(const llama_model * model) {
    return llm_arch_is_recurrent(model->arch);
 }

+bool llama_model_is_diffusion(const llama_model * model) {
+    return llm_arch_is_diffusion(model->arch);
+}
+
 const std::vector<std::pair<std::string, ggml_tensor *>> & llama_internal_get_tensor_map(const llama_model * model) {
    return model->tensors_by_name;
 }
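The new llama_model_is_diffusion() query lets callers detect diffusion architectures such as Dream and LLaDA, which use non-causal attention and generate text by iteratively re-predicting masked positions rather than decoding one token at a time. A hedged sketch of branching on it follows; the two generation helpers are placeholder names, not functions from this package.

// Sketch only: pick a generation strategy based on the model architecture.
#include "llama.h"

void generate(llama_model * model) {
    if (llama_model_is_diffusion(model)) {
        // Diffusion LM (e.g. Dream, LLaDA): fill a window with mask tokens and
        // iteratively denoise it over a fixed number of steps.
        // run_diffusion_steps(model, ...);       // placeholder
    } else {
        // Standard autoregressive decoding, one token per llama_decode() call.
        // run_autoregressive_decode(model, ...); // placeholder
    }
}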
package/src/llama.cpp/src/llama-quant.cpp

@@ -875,9 +875,10 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::

        // get more optimal quantization type based on the tensor shape, layer, etc.
        if (!params->pure && ggml_is_quantized(default_type)) {
+            int fallback = qs.n_fallback;
            new_type = llama_tensor_get_type(qs, new_type, tensor, ftype);
-            // unless the user specifies a type
-            if (params->tensor_types) {
+            // unless the user specifies a type, and the tensor geometry will not require fallback quantisation
+            if (params->tensor_types && qs.n_fallback - fallback == 0) {
                const std::vector<tensor_quantization> & tensor_types = *static_cast<const std::vector<tensor_quantization> *>(params->tensor_types);
                const std::string tensor_name(tensor->name);
                for (const auto & [tname, qtype] : tensor_types) {
@@ -890,7 +891,6 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
                }
            }
        }
-
        if (params->token_embedding_type < GGML_TYPE_COUNT && strcmp(tensor->name, "token_embd.weight") == 0) {
            new_type = params->token_embedding_type;
        }
package/src/llama.cpp/src/llama-vocab.cpp

@@ -307,6 +307,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
                };
                break;
            case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM:
+            case LLAMA_VOCAB_PRE_TYPE_HUNYUAN_DENSE:
                regex_exprs = {
                    "\\p{N}{1,3}",
                    "[一-龥぀-ゟ゠-ヿ]+",
@@ -1855,7 +1856,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                tokenizer_pre == "gigachat" ||
                tokenizer_pre == "jina-v2-es" ||
                tokenizer_pre == "jina-v2-de" ||
-                tokenizer_pre == "a.x-4.0") {
+                tokenizer_pre == "a.x-4.0" ||
+                tokenizer_pre == "mellum") {
            pre_type = LLAMA_VOCAB_PRE_TYPE_GPT2;
        } else if (
                tokenizer_pre == "jina-v1-en" ||
@@ -1964,6 +1966,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                tokenizer_pre == "hunyuan") {
            pre_type = LLAMA_VOCAB_PRE_TYPE_HUNYUAN;
            clean_spaces = false;
+        } else if (
+                tokenizer_pre == "hunyuan-dense") {
+            pre_type = LLAMA_VOCAB_PRE_TYPE_HUNYUAN_DENSE;
+            clean_spaces = false;
        } else if (
                tokenizer_pre == "kimi-k2") {
            pre_type = LLAMA_VOCAB_PRE_TYPE_KIMI_K2;