@fugood/llama.node 1.1.4 → 1.1.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/binding.ts +8 -0
- package/package.json +14 -14
- package/scripts/llama.cpp.patch +17 -13
- package/src/LlamaCompletionWorker.cpp +2 -0
- package/src/LlamaContext.cpp +3 -0
- package/src/llama.cpp/common/arg.cpp +80 -10
- package/src/llama.cpp/common/chat.cpp +52 -8
- package/src/llama.cpp/common/chat.h +7 -2
- package/src/llama.cpp/common/common.cpp +1 -0
- package/src/llama.cpp/common/common.h +16 -6
- package/src/llama.cpp/common/speculative.cpp +135 -54
- package/src/llama.cpp/common/speculative.h +8 -1
- package/src/llama.cpp/ggml/CMakeLists.txt +4 -2
- package/src/llama.cpp/ggml/include/ggml.h +37 -1
- package/src/llama.cpp/ggml/src/CMakeLists.txt +12 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +61 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +96 -8
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +3196 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +20 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +14 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +207 -9
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +2 -7
- package/src/llama.cpp/ggml/src/ggml-cpu/quants.c +35 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/quants.h +8 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +263 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/repack.h +11 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +19 -4
- package/src/llama.cpp/include/llama.h +9 -4
- package/src/llama.cpp/src/llama-arch.cpp +105 -0
- package/src/llama.cpp/src/llama-arch.h +12 -0
- package/src/llama.cpp/src/llama-batch.cpp +1 -1
- package/src/llama.cpp/src/llama-chat.cpp +33 -1
- package/src/llama.cpp/src/llama-chat.h +2 -0
- package/src/llama.cpp/src/llama-context.cpp +19 -10
- package/src/llama.cpp/src/llama-context.h +4 -1
- package/src/llama.cpp/src/llama-graph.cpp +175 -148
- package/src/llama.cpp/src/llama-graph.h +60 -23
- package/src/llama.cpp/src/llama-hparams.h +5 -3
- package/src/llama.cpp/src/llama-kv-cache-unified.cpp +6 -2
- package/src/llama.cpp/src/llama-kv-cache-unified.h +1 -1
- package/src/llama.cpp/src/llama-memory-hybrid.cpp +2 -1
- package/src/llama.cpp/src/llama-memory-hybrid.h +1 -0
- package/src/llama.cpp/src/llama-model-loader.cpp +1 -0
- package/src/llama.cpp/src/llama-model-loader.h +3 -2
- package/src/llama.cpp/src/llama-model.cpp +949 -75
- package/src/llama.cpp/src/llama-model.h +24 -4
- package/src/llama.cpp/src/llama-quant.cpp +40 -4
- package/src/llama.cpp/src/llama-vocab.cpp +49 -1
- package/src/llama.cpp/src/llama-vocab.h +1 -0
package/src/llama.cpp/src/llama-model.cpp

@@ -109,8 +109,10 @@ const char * llm_type_name(llm_type type) {
         case LLM_TYPE_A13B:      return "A13B";
         case LLM_TYPE_21B_A3B:   return "21B.A3B";
         case LLM_TYPE_30B_A3B:   return "30B.A3B";
+        case LLM_TYPE_106B_A12B: return "106B.A12B";
         case LLM_TYPE_235B_A22B: return "235B.A22B";
         case LLM_TYPE_300B_A47B: return "300B.A47B";
+        case LLM_TYPE_355B_A32B: return "355B.A32B";
         case LLM_TYPE_E2B:       return "E2B";
         case LLM_TYPE_E4B:       return "E4B";
         default:                 return "?B";
@@ -190,6 +192,13 @@ static bool weight_buft_supported(const llama_hparams & hparams, ggml_tensor * w
                 ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], w->ne[1], w->ne[2], w->ne[3]);
                 op_tensor = ggml_add(ctx, a, w);
             } break;
+        case GGML_OP_ADD_ID:
+            {
+                int n_expert_used = hparams.n_expert_used;
+                ggml_tensor * a = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, w->ne[0], n_expert_used, 512);
+                ggml_tensor * c = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_expert_used, 512);
+                op_tensor = ggml_add_id(ctx, a, w, c);
+            } break;
         case GGML_OP_MUL:
             {
                 ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], w->ne[1], w->ne[2], w->ne[3]);
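Note: the new GGML_OP_ADD_ID case probes backend support for ggml_add_id, the op used to add per-expert bias rows after an indirect (MUL_MAT_ID) expert matmul. As a rough reference for its semantics, here is a minimal scalar sketch — plain C++, not the actual ggml kernel, and the row-major packing assumed here is ours, not ggml's:

#include <cstdint>

// Reference semantics for add_id(a, b, ids):
// for each token t and selected expert slot e, add the bias row of the
// expert chosen by ids[e][t] into the activation row a[:, e, t].
// a:   n_tokens x n_expert_used x n_embd activations (modified in place)
// b:   n_expert x n_embd per-expert bias rows
// ids: n_tokens x n_expert_used selected expert indices
static void add_id_ref(float * a, const float * b, const int32_t * ids,
                       int64_t n_embd, int64_t n_expert_used, int64_t n_tokens) {
    for (int64_t t = 0; t < n_tokens; ++t) {
        for (int64_t e = 0; e < n_expert_used; ++e) {
            const int32_t expert = ids[t*n_expert_used + e];
            float       * dst    = a + (t*n_expert_used + e)*n_embd;
            const float * bias   = b + expert*n_embd;
            for (int64_t i = 0; i < n_embd; ++i) {
                dst[i] += bias[i];
            }
        }
    }
}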
@@ -258,6 +267,10 @@ static bool weight_buft_supported(const llama_hparams & hparams, ggml_tensor * w
                 ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, n_embd, w->ne[1], 1, 1);
                 op_tensor = ggml_im2col(ctx, w, b, 1, 0, 0, 0, 1, 0, false, GGML_TYPE_F16);
             } break;
+        case GGML_OP_SCALE:
+            {
+                op_tensor = ggml_scale(ctx, w, 1.0f);
+            } break;
         default:
             GGML_ABORT("%s: missing test for op %s for tensor %s", __func__, ggml_op_name(op), w->name);
     }
@@ -290,7 +303,7 @@ static ggml_backend_buffer_type_t select_weight_buft(const llama_hparams & hpara
 }

 // CPU: ACCEL -> GPU host -> CPU extra -> CPU
-static buft_list_t make_cpu_buft_list(const std::vector<ggml_backend_dev_t> & devices) {
+static buft_list_t make_cpu_buft_list(const std::vector<ggml_backend_dev_t> & devices, bool use_extra_bufts) {
     buft_list_t buft_list;

     // add ACCEL buffer types
@@ -319,21 +332,22 @@ static buft_list_t make_cpu_buft_list(const std::vector<ggml_backend_dev_t> & de
         }
     }

-    // add extra buffer types
-    auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
-    if (cpu_dev == nullptr) {
-        throw std::runtime_error(format("%s: no CPU backend found", __func__));
-    }
+    // add extra buffer types
+    if (use_extra_bufts) {
+        auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+        if (cpu_dev == nullptr) {
+            throw std::runtime_error(format("%s: no CPU backend found", __func__));
+        }

-    auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);
-    auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
-        ggml_backend_reg_get_proc_address(cpu_reg, "ggml_backend_dev_get_extra_bufts");
-    if (ggml_backend_dev_get_extra_bufts_fn) {
-        ggml_backend_buffer_type_t * extra_bufts = ggml_backend_dev_get_extra_bufts_fn(cpu_dev);
-        while (extra_bufts && *extra_bufts) {
-            buft_list.emplace_back(cpu_dev, *extra_bufts);
-            ++extra_bufts;
-        }
-    }
+        auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);
+        auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
+            ggml_backend_reg_get_proc_address(cpu_reg, "ggml_backend_dev_get_extra_bufts");
+        if (ggml_backend_dev_get_extra_bufts_fn) {
+            ggml_backend_buffer_type_t * extra_bufts = ggml_backend_dev_get_extra_bufts_fn(cpu_dev);
+            while (extra_bufts && *extra_bufts) {
+                buft_list.emplace_back(cpu_dev, *extra_bufts);
+                ++extra_bufts;
+            }
+        }
     }
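Note: this change gates the CPU "extra" buffer types (e.g. repacked weight layouts, see the new repack.cpp files in the file list) behind a use_extra_bufts flag. It also shows the general idiom for optional backend features: they are resolved by name through ggml_backend_reg_get_proc_address and returned as a NULL-terminated list. A minimal sketch of that enumeration, using only the calls that appear in the hunk above (error handling elided):

#include <vector>
#include "ggml-backend.h"

// Collect the optional "extra" CPU buffer types, if the backend exposes them.
static std::vector<ggml_backend_buffer_type_t> list_extra_bufts() {
    std::vector<ggml_backend_buffer_type_t> out;
    ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
    if (!cpu_dev) {
        return out;
    }
    ggml_backend_reg_t cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);
    // optional feature: resolved by name, may be absent on older backends
    auto fn = (ggml_backend_dev_get_extra_bufts_t)
        ggml_backend_reg_get_proc_address(cpu_reg, "ggml_backend_dev_get_extra_bufts");
    if (fn) {
        for (ggml_backend_buffer_type_t * p = fn(cpu_dev); p && *p; ++p) {
            out.push_back(*p); // e.g. repack/AMX buffer types
        }
    }
    return out;
}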
@@ -869,6 +883,21 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     hparams.causal_attn = false;
                 }
                 break;
+        case LLM_ARCH_LLADA:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+                // LLaDA-8B has 32 layers, similar to LLaMA but for diffusion
+                switch (hparams.n_layer) {
+                    case 32:
+                        type = LLM_TYPE_8B;
+                        break;
+                    default:
+                        type = LLM_TYPE_UNKNOWN;
+                }
+                // Set non-causal attention for diffusion models
+                hparams.causal_attn = false;
+            }
+            break;
         case LLM_ARCH_QWEN2MOE:
             {
                 ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
@@ -883,6 +912,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
             } break;
         case LLM_ARCH_QWEN3:
             {
+                ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                 switch (hparams.n_layer) {
                     case 28: type = hparams.n_embd == 1024 ? LLM_TYPE_0_6B : LLM_TYPE_1_7B; break;
@@ -1417,6 +1447,34 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     default: type = LLM_TYPE_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_GLM4_MOE:
+            {
+                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+                // MoE parameters
+                ml.get_key(LLM_KV_EXPERT_COUNT,              hparams.n_expert);
+                ml.get_key(LLM_KV_EXPERT_USED_COUNT,         hparams.n_expert_used);
+                ml.get_key(LLM_KV_EXPERT_SHARED_COUNT,       hparams.n_expert_shared);
+                ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead, false);
+                ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE,      hparams.expert_weights_scale);
+                ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM,       hparams.expert_weights_norm, false);
+
+                // Expert gating function (GLM-4.5 uses sigmoid)
+                ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
+                if (hparams.expert_gating_func == LLAMA_EXPERT_GATING_FUNC_TYPE_NONE) {
+                    hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID;
+                }
+
+                // NextN/MTP parameters
+                ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false);
+
+                switch (hparams.n_layer) {
+                    case 47: type = LLM_TYPE_106B_A12B; break; // GLM-4.5-Air (46 layers + 1 NextN layer)
+                    case 93: type = LLM_TYPE_355B_A32B; break; // GLM-4.5 (92 layers + 1 NextN layer)
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+            } break;
         case LLM_ARCH_BITNET:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -1744,6 +1802,18 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     default: type = LLM_TYPE_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_HUNYUAN_DENSE:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+                switch (hparams.n_embd) {
+                    case 1024: type = LLM_TYPE_0_5B; break;
+                    case 2048: type = LLM_TYPE_1_8B; break;
+                    case 3072: type = LLM_TYPE_4B; break;
+                    case 4096: type = LLM_TYPE_7B; break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+            } break;
         case LLM_ARCH_SMOLLM3:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -1754,6 +1824,17 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     default: type = LLM_TYPE_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_OPENAI_MOE:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,  hparams.n_ff_exp);
+                ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW,    hparams.n_swa);
+
+                hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
+                hparams.set_swa_pattern(2);
+
+                // TODO: switch (hparams.n_layer)
+            } break;
         case LLM_ARCH_LFM2:
             {
                 ml.get_key(LLM_KV_SHORTCONV_L_CACHE, hparams.n_shortconv_l_cache);
@@ -1824,7 +1905,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
     LLAMA_LOG_INFO("%s: loading model tensors, this can take a while... (mmap = %s)\n", __func__, ml.use_mmap ? "true" : "false");

     // build a list of buffer types for the CPU and GPU devices
-    pimpl->cpu_buft_list = make_cpu_buft_list(devices);
+    pimpl->cpu_buft_list = make_cpu_buft_list(devices, params.use_extra_bufts);
     for (auto * dev : devices) {
         buft_list_t buft_list = make_gpu_buft_list(dev, split_mode, tensor_split);
         // add CPU buffer types as a fallback
@@ -1920,6 +2001,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {

     const auto TENSOR_DUPLICATED   = llama_model_loader::TENSOR_DUPLICATED;
     const auto TENSOR_NOT_REQUIRED = llama_model_loader::TENSOR_NOT_REQUIRED;
+    const auto TENSOR_SKIP         = llama_model_loader::TENSOR_SKIP;

     // create tensors for the weights
     {
@@ -1975,7 +2057,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
            }

            // skip unused tensors
-           if (info.op == GGML_OP_NONE) {
+           if (info.op == GGML_OP_NONE || flags & TENSOR_SKIP) {
                const size_t nbytes = ggml_nbytes(t_meta);
                LLAMA_LOG_WARN("model has unused tensor %s (size = %zu bytes) -- ignoring\n", tn.str().c_str(), nbytes);
@@ -1985,11 +2067,15 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                return nullptr;
            }

-           // tensors with "bias" suffix are always used with GGML_OP_ADD
+           // tensors with "bias" suffix are always used with GGML_OP_ADD or GGML_OP_ADD_ID
            ggml_op op;
            bool bias = tn.suffix != nullptr && strcmp(tn.suffix, "bias") == 0;
            if (bias) {
-               op = GGML_OP_ADD;
+               if (info.op == GGML_OP_MUL_MAT_ID) {
+                   op = GGML_OP_ADD_ID;
+               } else {
+                   op = GGML_OP_ADD;
+               }
            } else {
                op = info.op;
            }
|
|
|
2029
2115
|
for (const auto * overrides = ml.tensor_buft_overrides; overrides->pattern != nullptr; ++overrides) {
|
|
2030
2116
|
std::regex pattern(overrides->pattern);
|
|
2031
2117
|
if (std::regex_search(tensor_name, pattern)) {
|
|
2032
|
-
|
|
2118
|
+
if (overrides->buft == ggml_backend_cpu_buffer_type()) {
|
|
2119
|
+
// when overriding to a CPU buffer, consider the extra buffer types
|
|
2120
|
+
buft = select_weight_buft(hparams, t_meta, op, pimpl->cpu_buft_list);
|
|
2121
|
+
} else {
|
|
2122
|
+
buft = overrides->buft;
|
|
2123
|
+
}
|
|
2124
|
+
|
|
2033
2125
|
LLAMA_LOG_DEBUG("tensor %s (%zu MiB %s) buffer type overridden to %s\n",
|
|
2034
2126
|
tensor_name.c_str(),
|
|
2035
2127
|
ggml_nbytes(t_meta) / 1024 / 1024, ggml_type_name(t_meta->type),
|
|
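Note: with this change, a tensor buffer-type override that targets the plain CPU buffer still goes through select_weight_buft, so repacked CPU layouts remain eligible. A hedged sketch of building such an override list from the embedding API — the field names follow llama.h's llama_model_tensor_buft_override as we understand it, so verify against the bundled headers before relying on this:

#include "llama.h"

// Pin all routed-expert FFN tensors to CPU memory via a regex override.
static llama_model_params make_params_with_cpu_experts() {
    // array is terminated by a null pattern; matched with std::regex_search
    static const llama_model_tensor_buft_override overrides[] = {
        { "ffn_(up|down|gate)_exps", ggml_backend_cpu_buffer_type() },
        { nullptr,                   nullptr                        },
    };

    llama_model_params params   = llama_model_default_params();
    params.tensor_buft_overrides = overrides;
    return params;
}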
@@ -2149,6 +2241,53 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                        }
                    }
                } break;
+            case LLM_ARCH_LLADA:
+                {
+                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
+
+                    // output
+                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
+                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED);
+
+                    // if output is NULL, init from the input tok embed
+                    if (output == NULL) {
+                        output =
+                            create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, TENSOR_DUPLICATED);
+                    }
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        auto & layer = layers[i];
+
+                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
+
+                        // Use separate Q, K, V projections without bias, matching LLaDALlamaBlock
+                        layer.wq =
+                            create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head }, 0);
+                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_k_gqa }, 0);
+                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_v_gqa }, 0);
+                        // No bias for QKV projections as per config: include_bias=false, include_qkv_bias=false
+                        layer.wo =
+                            create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0);
+                        layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), { n_embd }, TENSOR_NOT_REQUIRED);
+
+                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd }, 0);
+
+                        layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), { n_rot / 2 },
+                                                         TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+
+                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), { n_embd, n_ff }, 0);
+                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, 0);
+                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, n_ff }, 0);
+
+                        // optional MLP bias
+                        layer.ffn_gate_b =
+                            create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), { n_ff }, TENSOR_NOT_REQUIRED);
+                        layer.ffn_down_b =
+                            create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), { n_embd }, TENSOR_NOT_REQUIRED);
+                        layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), { n_ff }, TENSOR_NOT_REQUIRED);
+                    }
+                }
+                break;
            case LLM_ARCH_LLAMA4:
                {
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -4345,6 +4484,105 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                        layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
                    }
                } break;
+            case LLM_ARCH_GLM4_MOE:
+                {
+                    const int64_t n_expert        = hparams.n_expert;
+                    const int64_t n_expert_used   = hparams.n_expert_used;
+                    const int64_t n_expert_shared = hparams.n_expert_shared;
+
+                    GGML_ASSERT(hparams.n_expert > 0 && "n_expert must be > 0 for GLM4_MOE MoE layers");
+                    GGML_ASSERT(hparams.n_expert_used > 0 && "n_expert_used must be > 0 for GLM4_MOE MoE layers");
+
+                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
+
+                    // output
+                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
+                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED);
+                    // if output is NULL, init from the input tok embed
+                    if (output == NULL) {
+                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, TENSOR_DUPLICATED);
+                    }
+
+                    // Load ALL tensors including NextN layer to satisfy total tensor count
+                    // but only PROCESS up to last layer (skipping final NextN layer) in forward pass
+                    for (int i = 0; i < n_layer; ++i) {
+                        int flags = 0;
+                        if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
+                            // skip all tensors in the NextN layers
+                            flags |= TENSOR_SKIP;
+                        }
+
+                        auto & layer = layers[i];
+
+                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, flags);
+
+                        // GLM-style attention with bias terms
+                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head }, flags);
+                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_k_gqa }, flags);
+                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_v_gqa }, flags);
+                        layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), { n_embd_head_k * n_head }, flags);
+                        layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), { n_embd_k_gqa }, flags);
+                        layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), { n_embd_v_gqa }, flags);
+
+                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, flags);
+
+                        // K/Q norm tensors (optional for GLM-4.5 355B variant)
+                        layer.attn_q_norm = create_tensor(
+                            tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), { n_embd_head_k }, TENSOR_NOT_REQUIRED | flags);
+                        layer.attn_k_norm = create_tensor(
+                            tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), { n_embd_head_k }, TENSOR_NOT_REQUIRED | flags);
+
+                        layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), { n_embd }, flags);
+
+                        // Check if this layer uses MoE or dense FFN based on n_layer_dense_lead
+                        // GLM 4.5 uses hybrid architecture: layer 0 is dense, layers 1+ are MoE
+                        const bool use_moe = (static_cast<uint32_t>(i) >= hparams.n_layer_dense_lead);
+
+                        if (use_moe) {
+                            // MoE layers
+                            layer.ffn_gate_inp =
+                                create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert }, flags);
+                            layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), { n_expert }, flags);
+
+                            // MoE branch
+                            const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
+
+                            layer.ffn_gate_exps = create_tensor(
+                                tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, flags);
+                            layer.ffn_down_exps = create_tensor(
+                                tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert }, flags);
+                            layer.ffn_up_exps = create_tensor(
+                                tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, flags);
+
+                            // Shared expert
+                            if (n_expert_shared > 0) {
+                                const int64_t n_ff_shexp = n_ff_exp * n_expert_shared;
+                                layer.ffn_gate_shexp = create_tensor(
+                                    tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), { n_embd, n_ff_shexp }, flags);
+                                layer.ffn_down_shexp = create_tensor(
+                                    tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_shexp, n_embd }, flags);
+                                layer.ffn_up_shexp = create_tensor(
+                                    tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), { n_embd, n_ff_shexp }, flags);
+                            }
+                        } else {
+                            // Dense layers (first k layers) - GLM uses separate gate/up projections
+                            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), { n_embd, n_ff }, flags);
+                            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, flags);
+                            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, n_ff }, flags);
+                        }
+
+                        // NextN/MTP tensors (preserved but unused) - conditionally load for last nextn_predict_layers
+                        if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
+                            layer.nextn.eh_proj          = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i), { 2 * n_embd, n_embd }, flags);
+                            layer.nextn.embed_tokens     = create_tensor(tn(LLM_TENSOR_NEXTN_EMBED_TOKENS, "weight", i), { n_embd, n_vocab }, flags);
+                            layer.nextn.enorm            = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM, "weight", i), { n_embd }, flags);
+                            layer.nextn.hnorm            = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM, "weight", i), { n_embd }, flags);
+                            layer.nextn.shared_head_head = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "weight", i), { n_embd, n_vocab }, flags);
+                            layer.nextn.shared_head_norm = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "weight", i), { n_embd }, flags);
+                        }
+                    }
+                }
+                break;
            case LLM_ARCH_NEMOTRON:
                {
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
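Note: the trailing NextN/MTP layers are loaded (so the total tensor count matches the GGUF) but flagged with TENSOR_SKIP so they are never allocated into a compute graph. A tiny worked check of the skip predicate from the hunk above, written as a standalone, hypothetical helper:

#include <cassert>
#include <cstdint>

// The last nextn_predict_layers layers hold NextN/MTP tensors and are
// excluded from the forward pass.
static bool is_nextn_layer(uint32_t i, uint32_t n_layer, uint32_t nextn_predict_layers) {
    return nextn_predict_layers > 0 && i >= n_layer - nextn_predict_layers;
}

int main() {
    // GLM-4.5-Air: 47 layers total = 46 transformer layers + 1 NextN layer
    assert(!is_nextn_layer(45, 47, 1));
    assert( is_nextn_layer(46, 47, 1));
    // GLM-4.5: 93 layers total = 92 transformer layers + 1 NextN layer
    assert( is_nextn_layer(92, 93, 1));
    return 0;
}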
@@ -5126,6 +5364,39 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                        layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {hparams.n_ff_shexp, n_embd}, 0);
                    }
                } break;
+            case LLM_ARCH_HUNYUAN_DENSE:
+                {
+                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+                    // output
+                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+                    // if output is NULL, init from the input tok embed
+                    if (output == NULL) {
+                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+                    }
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        auto & layer = layers[i];
+
+                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
+                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
+                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
+                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
+
+                        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
+                        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
+
+                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
+                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+                    }
+                } break;
            case LLM_ARCH_SMOLLM3:
                {
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -5155,6 +5426,46 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                        layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
                    }
                } break;
+            case LLM_ARCH_OPENAI_MOE:
+                {
+                    const int64_t n_ff_exp = hparams.n_ff_exp;
+
+                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+                    // output
+                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        auto & layer = layers[i];
+
+                        layer.attn_norm      = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+                        layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
+
+                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_head * n_rot}, 0);
+                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_head_kv * n_rot}, 0);
+                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_head_kv * n_rot}, 0);
+                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head * n_rot, n_embd}, 0);
+
+                        layer.attn_sinks = create_tensor(tn(LLM_TENSOR_ATTN_SINKS, "weight", i), {n_head}, 0);
+
+                        layer.ffn_gate_inp  = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert}, 0);
+                        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
+                        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp,   n_embd, n_expert}, 0);
+                        layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
+
+                        // bias
+                        layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_head * n_rot}, 0);
+                        layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_head_kv * n_rot}, 0);
+                        layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_head_kv * n_rot}, 0);
+                        layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
+
+                        layer.ffn_gate_inp_b  = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "bias", i), {n_expert}, 0);
+                        layer.ffn_gate_exps_b = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "bias", i), {n_ff_exp, n_expert}, 0);
+                        layer.ffn_down_exps_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "bias", i), {  n_embd, n_expert}, 0);
+                        layer.ffn_up_exps_b   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "bias", i), {n_ff_exp, n_expert}, 0);
+                    }
+                } break;
            case LLM_ARCH_LFM2:
                {
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -5527,7 +5838,7 @@ void llama_model::print_info() const {
        LLAMA_LOG_INFO("%s: n_ff_shexp       = %d\n", __func__, hparams.n_ff_shexp);
    }

-   if (arch == LLM_ARCH_QWEN3MOE) {
+   if (arch == LLM_ARCH_QWEN3MOE || arch == LLM_ARCH_OPENAI_MOE) {
        LLAMA_LOG_INFO("%s: n_ff_exp         = %d\n", __func__, hparams.n_ff_exp);
    }
@@ -8042,8 +8353,10 @@ struct llm_build_dream : public llm_graph_context {
    }
 };

-struct llm_build_qwen2vl : public llm_graph_context {
-    llm_build_qwen2vl(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+struct llm_build_llada : public llm_graph_context {
+    llm_build_llada(const llama_model & model, const llm_graph_params & params) :
+        llm_graph_context(params) {
+        // LLaDA is similar to LLaMA but uses non-causal attention for diffusion
        const int64_t n_embd_head = hparams.n_embd_head_v;

        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -8057,10 +8370,8 @@ struct llm_build_qwen2vl : public llm_graph_context {
        // inp_pos - contains the positions
        ggml_tensor * inp_pos = build_inp_pos();

-       auto * inp_attn = build_attn_inp_kv_unified();
-
-       int sections[4];
-       std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);
+       // Non-causal attention for diffusion
+       auto * inp_attn = build_attn_inp_no_cache();

        ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
|
8068
8379
|
ggml_tensor * inpSA = inpL;
|
|
8069
8380
|
|
|
8070
8381
|
// norm
|
|
8071
|
-
cur = build_norm(inpL,
|
|
8072
|
-
model.layers[il].attn_norm, NULL,
|
|
8073
|
-
LLM_NORM_RMS, il);
|
|
8382
|
+
cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
|
|
8074
8383
|
cb(cur, "attn_norm", il);
|
|
8075
8384
|
|
|
8076
8385
|
// self-attention
|
|
8077
8386
|
{
|
|
8078
|
-
// compute Q
|
|
8387
|
+
// compute separate Q, K, V projections without bias, matching LLaDALlamaBlock
|
|
8079
8388
|
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
|
8080
|
-
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
|
|
8081
|
-
cb(Qcur, "Qcur", il);
|
|
8082
|
-
|
|
8083
8389
|
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
|
|
8084
|
-
Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
|
|
8085
|
-
cb(Kcur, "Kcur", il);
|
|
8086
|
-
|
|
8087
8390
|
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
|
|
8088
|
-
|
|
8391
|
+
|
|
8392
|
+
cb(Qcur, "Qcur", il);
|
|
8393
|
+
cb(Kcur, "Kcur", il);
|
|
8089
8394
|
cb(Vcur, "Vcur", il);
|
|
8090
8395
|
|
|
8091
|
-
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,
|
|
8396
|
+
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
|
8092
8397
|
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
|
8093
8398
|
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
8094
8399
|
|
|
8095
|
-
Qcur =
|
|
8096
|
-
|
|
8097
|
-
n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
8098
|
-
ext_factor, attn_factor, beta_fast, beta_slow
|
|
8099
|
-
);
|
|
8400
|
+
Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
8401
|
+
ext_factor, attn_factor, beta_fast, beta_slow);
|
|
8100
8402
|
|
|
8101
|
-
Kcur =
|
|
8102
|
-
|
|
8103
|
-
n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
8104
|
-
ext_factor, attn_factor, beta_fast, beta_slow
|
|
8105
|
-
);
|
|
8403
|
+
Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
8404
|
+
ext_factor, attn_factor, beta_fast, beta_slow);
|
|
8106
8405
|
|
|
8107
8406
|
cb(Qcur, "Qcur", il);
|
|
8108
8407
|
cb(Kcur, "Kcur", il);
|
|
8109
8408
|
cb(Vcur, "Vcur", il);
|
|
8110
8409
|
|
|
8111
|
-
cur = build_attn(inp_attn,
|
|
8112
|
-
|
|
8113
|
-
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
8410
|
+
cur = build_attn(inp_attn, model.layers[il].wo, NULL, Qcur, Kcur, Vcur, nullptr, nullptr,
|
|
8411
|
+
1.0f / sqrtf(float(n_embd_head)), il);
|
|
8114
8412
|
}
|
|
8115
8413
|
|
|
8116
8414
|
if (il == n_layer - 1 && inp_out_ids) {
|
|
8117
|
-
cur = ggml_get_rows(ctx0,
|
|
8415
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
8118
8416
|
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
|
8119
8417
|
}
|
|
8120
8418
|
|
|
@@ -8122,17 +8420,11 @@ struct llm_build_qwen2vl : public llm_graph_context {
            cb(ffn_inp, "ffn_inp", il);

            // feed-forward network
-           cur = build_norm(ffn_inp,
-                   model.layers[il].ffn_norm, NULL,
-                   LLM_NORM_RMS, il);
+           cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il);
            cb(cur, "ffn_norm", il);

-           cur = build_ffn(cur,
-                   model.layers[il].ffn_up,   NULL, NULL,
-                   model.layers[il].ffn_gate, NULL, NULL,
-                   model.layers[il].ffn_down, NULL, NULL,
-                   NULL,
-                   LLM_FFN_SILU, LLM_FFN_PAR, il);
+           cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL,
+                           model.layers[il].ffn_down, NULL, NULL, NULL, LLM_FFN_SILU, LLM_FFN_PAR, il);
            cb(cur, "ffn_out", il);

            cur = ggml_add(ctx0, cur, ffn_inp);
|
|
|
8146
8438
|
|
|
8147
8439
|
cur = inpL;
|
|
8148
8440
|
|
|
8149
|
-
cur = build_norm(cur,
|
|
8150
|
-
model.output_norm, NULL,
|
|
8151
|
-
LLM_NORM_RMS, -1);
|
|
8441
|
+
cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
|
|
8152
8442
|
|
|
8153
8443
|
cb(cur, "result_norm", -1);
|
|
8154
8444
|
res->t_embd = cur;
|
|
@@ -8163,8 +8453,8 @@ struct llm_build_qwen2vl : public llm_graph_context {
    }
 };

-struct llm_build_qwen2moe : public llm_graph_context {
-    llm_build_qwen2moe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+struct llm_build_qwen2vl : public llm_graph_context {
+    llm_build_qwen2vl(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
        const int64_t n_embd_head = hparams.n_embd_head_v;

        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -8180,6 +8470,9 @@ struct llm_build_qwen2moe : public llm_graph_context {

        auto * inp_attn = build_attn_inp_kv_unified();

+       int sections[4];
+       std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);
+
        ggml_tensor * inp_out_ids = build_inp_out_ids();

        for (int il = 0; il < n_layer; ++il) {
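Note: the sections array feeds ggml_rope_multi (M-RoPE), where consecutive blocks of rotary dimension pairs are driven by different position streams (for Qwen2-VL, roughly temporal/height/width channels for vision tokens). A purely conceptual sketch of how the 4-entry sections array partitions the rotary dims — illustrative only, not ggml's kernel:

#include <vector>

// For each rotary dim pair d, report which position stream (section) drives it.
static std::vector<int> mrope_dim_to_section(const int sections[4], int n_dims_half) {
    std::vector<int> sect(n_dims_half, 0);
    int d = 0;
    for (int s = 0; s < 4; ++s) {
        for (int k = 0; k < sections[s] && d < n_dims_half; ++k, ++d) {
            sect[d] = s; // dims in section s rotate by positions[s][token]
        }
    }
    return sect;
}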
@@ -8191,14 +8484,132 @@ struct llm_build_qwen2moe : public llm_graph_context {
                    LLM_NORM_RMS, il);
            cb(cur, "attn_norm", il);

-           // self_attention
+           // self-attention
            {
                // compute Q and K and RoPE them
                ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+               Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
                cb(Qcur, "Qcur", il);
-               if (model.layers[il].bq) {
-                   Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
-                   cb(Qcur, "Qcur", il);
+
+               ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+               Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+               cb(Kcur, "Kcur", il);
+
+               ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+               Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+               cb(Vcur, "Vcur", il);
+
+               Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+               Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+               Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+               Qcur = ggml_rope_multi(
+                       ctx0, Qcur, inp_pos, nullptr,
+                       n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
+                       ext_factor, attn_factor, beta_fast, beta_slow
+               );
+
+               Kcur = ggml_rope_multi(
+                       ctx0, Kcur, inp_pos, nullptr,
+                       n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
+                       ext_factor, attn_factor, beta_fast, beta_slow
+               );
+
+               cb(Qcur, "Qcur", il);
+               cb(Kcur, "Kcur", il);
+               cb(Vcur, "Vcur", il);
+
+               cur = build_attn(inp_attn,
+                       model.layers[il].wo, model.layers[il].bo,
+                       Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+           }
+
+           if (il == n_layer - 1 && inp_out_ids) {
+               cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+               inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+           }
+
+           ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+           cb(ffn_inp, "ffn_inp", il);
+
+           // feed-forward network
+           cur = build_norm(ffn_inp,
+                   model.layers[il].ffn_norm, NULL,
+                   LLM_NORM_RMS, il);
+           cb(cur, "ffn_norm", il);
+
+           cur = build_ffn(cur,
+                   model.layers[il].ffn_up,   NULL, NULL,
+                   model.layers[il].ffn_gate, NULL, NULL,
+                   model.layers[il].ffn_down, NULL, NULL,
+                   NULL,
+                   LLM_FFN_SILU, LLM_FFN_PAR, il);
+           cb(cur, "ffn_out", il);
+
+           cur = ggml_add(ctx0, cur, ffn_inp);
+
+           cur = build_cvec(cur, il);
+           cb(cur, "l_out", il);
+
+           // input for next layer
+           inpL = cur;
+       }
+
+       cur = inpL;
+
+       cur = build_norm(cur,
+               model.output_norm, NULL,
+               LLM_NORM_RMS, -1);
+
+       cb(cur, "result_norm", -1);
+       res->t_embd = cur;
+
+       // lm_head
+       cur = build_lora_mm(model.output, cur);
+
+       cb(cur, "result_output", -1);
+       res->t_logits = cur;
+
+       ggml_build_forward_expand(gf, cur);
+   }
+};
+
+struct llm_build_qwen2moe : public llm_graph_context {
+    llm_build_qwen2moe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+       const int64_t n_embd_head = hparams.n_embd_head_v;
+
+       GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+       GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+       ggml_tensor * cur;
+       ggml_tensor * inpL;
+
+       inpL = build_inp_embd(model.tok_embd);
+
+       // inp_pos - contains the positions
+       ggml_tensor * inp_pos = build_inp_pos();
+
+       auto * inp_attn = build_attn_inp_kv_unified();
+
+       ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+       for (int il = 0; il < n_layer; ++il) {
+           ggml_tensor * inpSA = inpL;
+
+           // norm
+           cur = build_norm(inpL,
+                   model.layers[il].attn_norm, NULL,
+                   LLM_NORM_RMS, il);
+           cb(cur, "attn_norm", il);
+
+           // self_attention
+           {
+               // compute Q and K and RoPE them
+               ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+               cb(Qcur, "Qcur", il);
+               if (model.layers[il].bq) {
+                   Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+                   cb(Qcur, "Qcur", il);
                }

                ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
@@ -13349,6 +13760,165 @@ struct llm_build_glm4 : public llm_graph_context {
    }
 };

+struct llm_build_glm4_moe : public llm_graph_context {
+    llm_build_glm4_moe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
+        ggml_tensor * cur;
+        ggml_tensor * inpL;
+
+        inpL = build_inp_embd(model.tok_embd);
+
+        // inp_pos - contains the positions
+        ggml_tensor * inp_pos = build_inp_pos();
+
+        auto * inp_attn = build_attn_inp_kv_unified();
+
+        ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+        // Only process up to last layer (skip final NextN layer)
+        // Final layer tensors are loaded but not processed in forward pass
+        const int n_transformer_layers = n_layer - hparams.nextn_predict_layers;
+        for (int il = 0; il < n_transformer_layers; ++il) {
+            ggml_tensor * inpSA = inpL;
+
+            // Pre-attention norm
+            cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
+            cb(cur, "attn_norm", il);
+
+            // self-attention
+            {
+                ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+                if (model.layers[il].bq) {
+                    Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+                }
+                cb(Qcur, "Qcur", il);
+
+                ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+                if (model.layers[il].bk) {
+                    Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+                }
+                cb(Kcur, "Kcur", il);
+
+                ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+                if (model.layers[il].bv) {
+                    Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+                }
+                cb(Vcur, "Vcur", il);
+
+                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+                Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+                // Apply Q/K norm if available (GLM-4.5 355B variant)
+                if (model.layers[il].attn_q_norm) {
+                    Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
+                    cb(Qcur, "Qcur_normed", il);
+                }
+                if (model.layers[il].attn_k_norm) {
+                    Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
+                    cb(Kcur, "Kcur_normed", il);
+                }
+
+                Qcur = ggml_rope_ext(
+                        ctx0, Qcur, inp_pos, nullptr,
+                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow
+                );
+
+                Kcur = ggml_rope_ext(
+                        ctx0, Kcur, inp_pos, nullptr,
+                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow
+                );
+
+                cb(Qcur, "Qcur", il);
+                cb(Kcur, "Kcur", il);
+                cb(Vcur, "Vcur", il);
+
+                cur = build_attn(inp_attn,
+                        model.layers[il].wo, NULL,
+                        Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+            }
+
+            if (il == n_transformer_layers - 1 && inp_out_ids) {
+                cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            }
+
+            ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+            cb(ffn_inp, "ffn_inp", il);
+
+            // Post-attention norm
+            cur = build_norm(ffn_inp, model.layers[il].attn_post_norm, NULL, LLM_NORM_RMS, il);
+            cb(cur, "post_attn_norm", il);
+
+            // Check if this is a dense layer (n_layer_dense_lead=1, so layer 0 is dense)
+            if (static_cast<uint32_t>(il) < hparams.n_layer_dense_lead) {
+                // Dense FFN layer
+                cur = build_ffn(cur,
+                        model.layers[il].ffn_up,   NULL, NULL,
+                        model.layers[il].ffn_gate, NULL, NULL,
+                        model.layers[il].ffn_down, NULL, NULL,
+                        NULL,
+                        LLM_FFN_SILU, LLM_FFN_PAR, il);
+                cb(cur, "ffn_out", il);
+            } else {
+                // Process routed experts using existing MoE infrastructure
+                ggml_tensor * routed_out = build_moe_ffn(cur,
+                        model.layers[il].ffn_gate_inp,
+                        model.layers[il].ffn_up_exps,
+                        model.layers[il].ffn_gate_exps,
+                        model.layers[il].ffn_down_exps,
+                        model.layers[il].ffn_exp_probs_b,
+                        n_expert, n_expert_used,
+                        LLM_FFN_SILU, hparams.expert_weights_norm,
+                        true, hparams.expert_weights_scale,
+                        (llama_expert_gating_func_type) hparams.expert_gating_func,
+                        il);
+                cb(routed_out, "ffn_moe_out", il);
+
+                // Process shared expert on original input
+                ggml_tensor * shared_out = build_ffn(cur,
+                        model.layers[il].ffn_up_shexp,   NULL, NULL,
+                        model.layers[il].ffn_gate_shexp, NULL, NULL,
+                        model.layers[il].ffn_down_shexp, NULL, NULL,
+                        NULL,
+                        LLM_FFN_SILU, LLM_FFN_PAR, il);
+                cb(shared_out, "ffn_shexp_out", il);
+
+                // Final output: routed_output + shared_output
+                cur = ggml_add(ctx0, routed_out, shared_out);
+                cb(cur, "ffn_out", il);
+            }
+
+            cur = ggml_add(ctx0, cur, ffn_inp);
+
+            cur = build_cvec(cur, il);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+        cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
+
+        cb(cur, "result_norm", -1);
+        res->t_embd = cur;
+
+        // lm_head
+        cur = build_lora_mm(model.output, cur);
+
+        cb(cur, "result_output", -1);
+        res->t_logits = cur;
+
+        ggml_build_forward_expand(gf, cur);
+    }
+};
+
 struct llm_build_nemotron : public llm_graph_context {
    llm_build_nemotron(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
        const int64_t n_embd_head = hparams.n_embd_head_v;
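Note: per layer, the MoE output here is the sum of the routed-expert branch (sigmoid gating by default for GLM-4.5, with optional weight normalization and scaling) and an always-on shared expert. A single-token reference sketch of that combination — names and layouts are ours, assuming n_expert_used <= n_expert; it mirrors the structure of the build above, not ggml's batched kernels:

#include <algorithm>
#include <cmath>
#include <numeric>
#include <vector>

// Sigmoid-gated top-k routing plus a shared expert, for one token.
// logits: router scores per expert; expert_out[e]/shared_out: FFN outputs.
static std::vector<float> glm4_moe_combine(
        const std::vector<float> & logits,
        const std::vector<std::vector<float>> & expert_out,
        const std::vector<float> & shared_out,
        int n_expert_used, bool norm_w, float scale_w) {
    const size_t n_expert = logits.size();
    const size_t n_embd   = shared_out.size();

    // sigmoid gating (the GLM-4.5 default), then pick the top-k experts
    std::vector<float> score(n_expert);
    for (size_t e = 0; e < n_expert; ++e) score[e] = 1.0f/(1.0f + std::exp(-logits[e]));
    std::vector<size_t> idx(n_expert);
    std::iota(idx.begin(), idx.end(), 0);
    std::partial_sort(idx.begin(), idx.begin() + n_expert_used, idx.end(),
                      [&](size_t a, size_t b) { return score[a] > score[b]; });

    float sum_w = 0.0f;
    for (int k = 0; k < n_expert_used; ++k) sum_w += score[idx[k]];

    std::vector<float> out(shared_out); // shared expert always contributes
    for (int k = 0; k < n_expert_used; ++k) {
        float w = score[idx[k]];
        if (norm_w) w /= sum_w;   // hparams.expert_weights_norm
        w *= scale_w;             // hparams.expert_weights_scale
        for (size_t i = 0; i < n_embd; ++i) out[i] += w * expert_out[idx[k]][i];
    }
    return out;
}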
@@ -16761,6 +17331,144 @@ struct llm_build_hunyuan_moe : public llm_graph_context {
    }
 };

+struct llm_build_hunyuan_dense : public llm_graph_context {
+    llm_build_hunyuan_dense(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+        ggml_tensor * cur;
+        ggml_tensor * inpL;
+
+        inpL = build_inp_embd(model.tok_embd);
+
+        // inp_pos - contains the positions
+        ggml_tensor * inp_pos = build_inp_pos();
+
+        auto * inp_attn = build_attn_inp_kv_unified();
+
+        const float kq_scale = 1.0f / sqrtf(float(n_embd_head));
+
+        ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+        for (int il = 0; il < n_layer; ++il) {
+            ggml_tensor * inpSA = inpL;
+
+            // norm
+            cur = build_norm(inpL,
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM_RMS, il);
+            cb(cur, "attn_norm", il);
+            // self-attention
+            {
+                // rope freq factors for llama3; may return nullptr for llama2 and other models
+                ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
+
+                // compute Q and K and RoPE them
+                ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
+                if (model.layers[il].bq) {
+                    Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+                    cb(Qcur, "Qcur", il);
+                }
+
+                ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
+                if (model.layers[il].bk) {
+                    Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+                    cb(Kcur, "Kcur", il);
+                }
+
+                ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
+                if (model.layers[il].bv) {
+                    Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+                    cb(Vcur, "Vcur", il);
+                }
+
+                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+                Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+                Qcur = ggml_rope_ext(
+                        ctx0, Qcur, inp_pos, rope_factors,
+                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow
+                );
+
+                cb(Qcur, "Qcur", il);
+                cb(Kcur, "Kcur", il);
+                cb(Vcur, "Vcur", il);
+
+                Kcur = ggml_rope_ext(
+                        ctx0, Kcur, inp_pos, rope_factors,
+                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow
+                );
+
+                Kcur = build_norm(Kcur,
+                        model.layers[il].attn_k_norm, nullptr,
+                        LLM_NORM_RMS, il);
+                cb(Kcur, "Kcur_norm", il);
+
+                Qcur = build_norm(Qcur,
+                        model.layers[il].attn_q_norm, nullptr,
+                        LLM_NORM_RMS, il);
+                cb(Qcur, "Qcur_norm", il);
+
+                cur = build_attn(inp_attn,
+                        model.layers[il].wo, model.layers[il].bo,
+                        Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
+                cb(cur, "attn_out", il);
+            }
+
+            if (il == n_layer - 1 && inp_out_ids) {
+                cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            }
+
+            ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+            cb(ffn_inp, "ffn_inp", il);
+
+            cur = build_norm(ffn_inp,
+                    model.layers[il].ffn_norm, NULL,
+                    LLM_NORM_RMS, il);
+            cb(cur, "ffn_norm", il);
+            // feed-forward network (non-MoE)
+            ggml_tensor * cur_mlp = build_ffn(cur,
+                    model.layers[il].ffn_up,   NULL, NULL,
+                    model.layers[il].ffn_gate, NULL, NULL,
+                    model.layers[il].ffn_down, NULL, NULL,
+                    NULL,
+                    LLM_FFN_SILU, LLM_FFN_PAR, il);
+            cb(cur_mlp, "ffn_out", il);
+
+            cur = ggml_add(ctx0, cur_mlp, ffn_inp);
+
+            cur = build_cvec(cur, il);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+        cur = inpL;
+
+        cur = build_norm(cur,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, -1);
+
+        cb(cur, "result_norm", -1);
+        res->t_embd = cur;
+        // lm_head
+        cur = build_lora_mm(model.output, cur);
+        cb(cur, "result_output", -1);
+        res->t_logits = cur;
+
+        ggml_build_forward_expand(gf, cur);
+    }
+};
+
 struct llm_build_smollm3 : public llm_graph_context {
    llm_build_smollm3(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
        const int64_t n_embd_head = hparams.n_embd_head_v;
@@ -16898,6 +17606,136 @@ struct llm_build_smollm3 : public llm_graph_context {
     }
 };
 
+struct llm_build_openai_moe_iswa : public llm_graph_context {
+    llm_build_openai_moe_iswa(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+        ggml_tensor * cur;
+        ggml_tensor * inpL;
+
+        inpL = build_inp_embd(model.tok_embd);
+
+        // inp_pos - contains the positions
+        ggml_tensor * inp_pos = build_inp_pos();
+
+        auto * inp_attn = build_attn_inp_kv_unified_iswa();
+
+        for (int il = 0; il < n_layer; ++il) {
+            ggml_tensor * inpSA = inpL;
+
+            // norm
+            cur = build_norm(inpL,
+                    model.layers[il].attn_norm, nullptr,
+                    LLM_NORM_RMS, il);
+            cb(cur, "attn_norm", il);
+
+            // self-attention
+            {
+                // compute Q and K and RoPE them
+                ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
+                if (model.layers[il].bq) {
+                    Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+                    cb(Qcur, "Qcur", il);
+                }
+
+                ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
+                if (model.layers[il].bk) {
+                    Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+                    cb(Kcur, "Kcur", il);
+                }
+
+                ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
+                if (model.layers[il].bv) {
+                    Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+                    cb(Vcur, "Vcur", il);
+                }
+
+                Qcur = ggml_reshape_3d(ctx0, Qcur, n_rot, n_head,    n_tokens);
+                Kcur = ggml_reshape_3d(ctx0, Kcur, n_rot, n_head_kv, n_tokens);
+                Vcur = ggml_reshape_3d(ctx0, Vcur, n_rot, n_head_kv, n_tokens);
+
+                Qcur = ggml_rope_ext(
+                        ctx0, Qcur, inp_pos, nullptr,
+                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow
+                        );
+
+                Kcur = ggml_rope_ext(
+                        ctx0, Kcur, inp_pos, nullptr,
+                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow
+                        );
+
+                cb(Qcur, "Qcur", il);
+                cb(Kcur, "Kcur", il);
+                cb(Vcur, "Vcur", il);
+
+                cur = build_attn_with_sinks(inp_attn,
+                        model.layers[il].wo, model.layers[il].bo,
+                        Qcur, Kcur, Vcur, nullptr, nullptr, model.layers[il].attn_sinks, 1.0f/sqrtf(float(n_rot)), il);
+
+                cb(cur, "attn_out", il);
+            }
+
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                ggml_tensor * inp_out_ids = build_inp_out_ids();
+                cur   = ggml_get_rows(ctx0,   cur, inp_out_ids);
+                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            }
+
+            ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+            cb(ffn_inp, "ffn_inp", il);
+
+            cur = ffn_inp;
+            cur = build_norm(cur,
+                    model.layers[il].attn_post_norm, nullptr,
+                    LLM_NORM_RMS, il);
+            cb(cur, "attn_post_norm", il);
+
+            // MoE branch
+            cur = build_moe_ffn(cur,
+                    model.layers[il].ffn_gate_inp,  model.layers[il].ffn_gate_inp_b,
+                    model.layers[il].ffn_up_exps,   model.layers[il].ffn_up_exps_b,
+                    model.layers[il].ffn_gate_exps, model.layers[il].ffn_gate_exps_b,
+                    model.layers[il].ffn_down_exps, model.layers[il].ffn_down_exps_b,
+                    nullptr,
+                    n_expert, n_expert_used,
+                    LLM_FFN_SWIGLU_OAI_MOE, false,
+                    false, 0.0,
+                    LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX_WEIGHT,
+                    il);
+            cb(cur, "ffn_moe_out", il);
+
+            cur = ggml_add(ctx0, cur, ffn_inp);
+
+            cur = build_cvec(cur, il);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = build_norm(cur,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, -1);
+
+        cb(cur, "result_norm", -1);
+        res->t_embd = cur;
+
+        // lm_head
+        cur = build_lora_mm(model.output, cur);
+
+        cb(cur, "result_output", -1);
+        res->t_logits = cur;
+
+        ggml_build_forward_expand(gf, cur);
+    }
+};
+
 struct llm_build_lfm2 : public llm_graph_context {
     const llama_model & model;
 
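Editor's note: build_attn_with_sinks above threads the per-layer attn_sinks tensor into the attention call with a KQ scale of 1.0f/sqrtf(float(n_rot)). As I understand the sink mechanism, each head carries an extra position-independent logit that joins the softmax normalizer without contributing a value vector; a scalar sketch under that assumption, with illustrative names only (the real computation happens inside the ggml graph):

    #include <algorithm>
    #include <cmath>
    #include <vector>

    // softmax over one row of attention scores plus a "sink" logit: the sink
    // absorbs probability mass, so the returned weights sum to less than 1
    std::vector<float> softmax_with_sink(std::vector<float> scores, float sink) {
        float mx = sink;
        for (float s : scores) mx = std::max(mx, s);
        float denom = std::exp(sink - mx);              // sink joins the normalizer
        for (float s : scores) denom += std::exp(s - mx);
        for (float & s : scores) s = std::exp(s - mx) / denom;
        return scores;
    }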
@@ -17158,10 +17996,18 @@ struct llm_build_smallthinker : public llm_graph_context{
             cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il);
             cb(cur, "ffn_norm", il);
 
-            ggml_tensor * ffn_out =
-
-
-
+            ggml_tensor * ffn_out =
+                build_moe_ffn(cur,
+                        nullptr,
+                        model.layers[il].ffn_up_exps,
+                        model.layers[il].ffn_gate_exps,
+                        model.layers[il].ffn_down_exps,
+                        nullptr,
+                        n_expert, n_expert_used,
+                        LLM_FFN_RELU, true,
+                        false, 0.0,
+                        static_cast<llama_expert_gating_func_type>(hparams.expert_gating_func),
+                        il, probs);
 
             cb(ffn_out, "ffn_out", il);
             cur = ffn_out;
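Editor's note: the new trailing `probs` argument lets the smallthinker graph pass router probabilities computed earlier into build_moe_ffn instead of recomputing them. (The content of the three removed continuation lines did not survive extraction and is left blank above.) For orientation, a plain-C++ sketch of the top-k expert selection such a router performs, illustrative only; the real version operates on ggml tensors:

    #include <algorithm>
    #include <numeric>
    #include <vector>

    // pick the n_expert_used highest-probability experts for one token
    std::vector<int> top_k_experts(const std::vector<float> & probs, int n_expert_used) {
        std::vector<int> idx(probs.size());
        std::iota(idx.begin(), idx.end(), 0);             // 0, 1, ..., n_expert-1
        std::partial_sort(idx.begin(), idx.begin() + n_expert_used, idx.end(),
                          [&](int a, int b) { return probs[a] > probs[b]; });
        idx.resize(n_expert_used);
        return idx;
    }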
@@ -17201,6 +18047,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
         case LLM_ARCH_NEO_BERT:
         case LLM_ARCH_WAVTOKENIZER_DEC:
         case LLM_ARCH_DREAM:
+        case LLM_ARCH_LLADA:
             {
                 res = nullptr;
             } break;
@@ -17236,6 +18083,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
                 /* recurrent_kv_size */ std::max((uint32_t) 1, cparams.n_seq_max),
                 /* n_seq_max         */ cparams.n_seq_max,
                 /* offload           */ cparams.offload_kqv,
+                /* unified           */ cparams.kv_unified,
                 /* filter_attn       */ (arch == LLM_ARCH_FALCON_H1) ? [&](int32_t) { return true; } : (llama_memory_hybrid::layer_filter_cb)nullptr,
                 /* filter_recr       */ (arch == LLM_ARCH_FALCON_H1) ? [&](int32_t) { return true; } : (llama_memory_hybrid::layer_filter_cb)nullptr);
         } else {
@@ -17367,6 +18215,11 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
                 llm = std::make_unique<llm_build_dream>(*this, params);
             }
             break;
+        case LLM_ARCH_LLADA:
+            {
+                llm = std::make_unique<llm_build_llada>(*this, params);
+            }
+            break;
         case LLM_ARCH_QWEN2VL:
             {
                 llm = std::make_unique<llm_build_qwen2vl>(*this, params);
@@ -17509,6 +18362,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
             {
                 llm = std::make_unique<llm_build_glm4>(*this, params);
             } break;
+        case LLM_ARCH_GLM4_MOE:
+            {
+                llm = std::make_unique<llm_build_glm4_moe>(*this, params);
+            } break;
         case LLM_ARCH_BITNET:
             {
                 llm = std::make_unique<llm_build_bitnet>(*this, params);
@@ -17614,10 +18471,18 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
             {
                 llm = std::make_unique<llm_build_hunyuan_moe>(*this, params);
             } break;
+        case LLM_ARCH_HUNYUAN_DENSE:
+            {
+                llm = std::make_unique<llm_build_hunyuan_dense>(*this, params);
+            } break;
         case LLM_ARCH_SMOLLM3:
             {
                 llm = std::make_unique<llm_build_smollm3>(*this, params);
             } break;
+        case LLM_ARCH_OPENAI_MOE:
+            {
+                llm = std::make_unique<llm_build_openai_moe_iswa>(*this, params);
+            } break;
         case LLM_ARCH_FALCON_H1:
             {
                 llm = std::make_unique<llm_build_falcon_h1>(*this, params);
@@ -17663,6 +18528,7 @@ llama_model_params llama_model_default_params() {
         /*.use_mmap        =*/ true,
         /*.use_mlock       =*/ false,
         /*.check_tensors   =*/ false,
+        /*.use_extra_bufts =*/ true,
     };
 
 #ifdef GGML_USE_METAL
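Editor's note: the new use_extra_bufts field defaults to true, enabling extra backend buffer types such as the CPU weight-repacking buffers touched elsewhere in this release. A caller wanting the previous behavior would presumably clear it after fetching the defaults; a sketch where only the field name comes from this diff and the rest is standard llama.h usage:

    #include "llama.h"

    static llama_model_params make_params_without_extra_bufts() {
        llama_model_params mparams = llama_model_default_params();
        mparams.use_extra_bufts = false; // opt out of extra (e.g. repacked) buffer types
        return mparams;
    }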
@@ -17765,6 +18631,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
 
         // use what we call a normal RoPE, operating on pairs of consecutive head values
         case LLM_ARCH_LLAMA:
+        case LLM_ARCH_LLADA:
         case LLM_ARCH_LLAMA4:
         case LLM_ARCH_DECI:
         case LLM_ARCH_BAICHUAN:
@@ -17831,8 +18698,11 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
         case LLM_ARCH_MINICPM3:
         case LLM_ARCH_DOTS1:
         case LLM_ARCH_HUNYUAN_MOE:
+        case LLM_ARCH_OPENAI_MOE:
+        case LLM_ARCH_HUNYUAN_DENSE:
         case LLM_ARCH_LFM2:
         case LLM_ARCH_SMALLTHINKER:
+        case LLM_ARCH_GLM4_MOE:
             return LLAMA_ROPE_TYPE_NEOX;
 
         case LLM_ARCH_QWEN2VL:
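Editor's note: the architectures listed above select LLAMA_ROPE_TYPE_NEOX, which differs from the "normal" style only in which elements form a rotation pair: normal RoPE rotates consecutive values (x[2i], x[2i+1]), while NEOX pairs each value with its counterpart half a head away (x[i], x[i + n/2]). A scalar sketch under those assumptions (illustrative; ggml_rope_ext implements this over tensors with the extension parameters seen in the builders above):

    #include <cmath>
    #include <vector>

    // apply RoPE to one head of x.size() values at position pos;
    // `neox` selects the pairing scheme described above
    void rope_head(std::vector<float> & x, int pos, float freq_base, bool neox) {
        const int n = (int) x.size();
        for (int i = 0; i < n/2; ++i) {
            const float theta = pos * std::pow(freq_base, -2.0f*i/n);
            const float c = std::cos(theta);
            const float s = std::sin(theta);
            const int i0 = neox ? i       : 2*i;      // first element of the pair
            const int i1 = neox ? i + n/2 : 2*i + 1;  // second element of the pair
            const float x0 = x[i0], x1 = x[i1];
            x[i0] = x0*c - x1*s;
            x[i1] = x0*s + x1*c;
        }
    }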
@@ -17943,6 +18813,10 @@ bool llama_model_is_recurrent(const llama_model * model) {
     return llm_arch_is_recurrent(model->arch);
 }
 
+bool llama_model_is_diffusion(const llama_model * model) {
+    return llm_arch_is_diffusion(model->arch);
+}
+
 const std::vector<std::pair<std::string, ggml_tensor *>> & llama_internal_get_tensor_map(const llama_model * model) {
     return model->tensors_by_name;
 }