@fugood/llama.node 1.1.5 → 1.1.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +14 -14
- package/scripts/llama.cpp.patch +17 -13
- package/src/LlamaCompletionWorker.cpp +2 -0
- package/src/llama.cpp/common/arg.cpp +28 -11
- package/src/llama.cpp/common/chat.cpp +46 -2
- package/src/llama.cpp/common/chat.h +7 -2
- package/src/llama.cpp/common/common.h +3 -2
- package/src/llama.cpp/ggml/CMakeLists.txt +3 -2
- package/src/llama.cpp/ggml/include/ggml.h +37 -1
- package/src/llama.cpp/ggml/src/CMakeLists.txt +12 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +61 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +96 -8
- package/src/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +6 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +14 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +207 -9
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +2 -7
- package/src/llama.cpp/ggml/src/ggml-cpu/quants.c +35 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/quants.h +8 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +19 -4
- package/src/llama.cpp/include/llama.h +1 -0
- package/src/llama.cpp/src/llama-arch.cpp +65 -0
- package/src/llama.cpp/src/llama-arch.h +10 -0
- package/src/llama.cpp/src/llama-chat.cpp +13 -0
- package/src/llama.cpp/src/llama-chat.h +1 -0
- package/src/llama.cpp/src/llama-context.cpp +8 -8
- package/src/llama.cpp/src/llama-graph.cpp +118 -9
- package/src/llama.cpp/src/llama-graph.h +38 -0
- package/src/llama.cpp/src/llama-hparams.h +5 -3
- package/src/llama.cpp/src/llama-kv-cache-unified.cpp +4 -0
- package/src/llama.cpp/src/llama-model-loader.cpp +1 -0
- package/src/llama.cpp/src/llama-model-loader.h +3 -2
- package/src/llama.cpp/src/llama-model.cpp +499 -4
- package/src/llama.cpp/src/llama-model.h +24 -4
- package/src/llama.cpp/src/llama-quant.cpp +37 -1
- package/src/llama.cpp/src/llama-vocab.cpp +42 -0
package/src/llama.cpp/src/llama-model.cpp
@@ -109,8 +109,10 @@ const char * llm_type_name(llm_type type) {
         case LLM_TYPE_A13B:        return "A13B";
         case LLM_TYPE_21B_A3B:     return "21B.A3B";
         case LLM_TYPE_30B_A3B:     return "30B.A3B";
+        case LLM_TYPE_106B_A12B:   return "106B.A12B";
         case LLM_TYPE_235B_A22B:   return "235B.A22B";
         case LLM_TYPE_300B_A47B:   return "300B.A47B";
+        case LLM_TYPE_355B_A32B:   return "355B.A32B";
         case LLM_TYPE_E2B:         return "E2B";
         case LLM_TYPE_E4B:         return "E4B";
         default:                   return "?B";
@@ -190,6 +192,13 @@ static bool weight_buft_supported(const llama_hparams & hparams, ggml_tensor * w
                 ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], w->ne[1], w->ne[2], w->ne[3]);
                 op_tensor = ggml_add(ctx, a, w);
             } break;
+        case GGML_OP_ADD_ID:
+            {
+                int n_expert_used = hparams.n_expert_used;
+                ggml_tensor * a = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, w->ne[0], n_expert_used, 512);
+                ggml_tensor * c = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_expert_used, 512);
+                op_tensor = ggml_add_id(ctx, a, w, c);
+            } break;
         case GGML_OP_MUL:
             {
                 ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], w->ne[1], w->ne[2], w->ne[3]);
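Note on the new GGML_OP_ADD_ID case above: a minimal standalone sketch of building the same op outside the loader, kept to calls that appear in this patch (ggml_init, ggml_new_tensor_2d/3d, ggml_add_id). The dimensions and the metadata-only (no_alloc) context are assumptions of this sketch, and the broadcast described in the comments is my reading of the loader test, not wording from the patch.

    #include "ggml.h"

    int main(void) {
        // metadata-only context: room for a few tensor headers, no data buffers allocated
        struct ggml_init_params ip = { /*mem_size*/ 16*1024*1024, /*mem_buffer*/ NULL, /*no_alloc*/ true };
        struct ggml_context * ctx = ggml_init(ip);

        const int64_t n_embd = 4096, n_expert = 128, n_expert_used = 8, n_tokens = 512;

        // a: activations per (used expert, token); w: one bias row per expert; c: selected expert ids
        struct ggml_tensor * a = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_embd, n_expert_used, n_tokens);
        struct ggml_tensor * w = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_expert);
        struct ggml_tensor * c = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_expert_used, n_tokens);

        // intended use (as read from the loader test above): add row c[e, t] of w to a[:, e, t]
        struct ggml_tensor * out = ggml_add_id(ctx, a, w, c);
        (void) out;

        ggml_free(ctx);
        return 0;
    }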
@@ -258,6 +267,10 @@ static bool weight_buft_supported(const llama_hparams & hparams, ggml_tensor * w
                 ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, n_embd, w->ne[1], 1, 1);
                 op_tensor = ggml_im2col(ctx, w, b, 1, 0, 0, 0, 1, 0, false, GGML_TYPE_F16);
             } break;
+        case GGML_OP_SCALE:
+            {
+                op_tensor = ggml_scale(ctx, w, 1.0f);
+            } break;
         default:
             GGML_ABORT("%s: missing test for op %s for tensor %s", __func__, ggml_op_name(op), w->name);
     }
@@ -1434,6 +1447,34 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                    default: type = LLM_TYPE_UNKNOWN;
                }
            } break;
+        case LLM_ARCH_GLM4_MOE:
+            {
+                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+                // MoE parameters
+                ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert);
+                ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used);
+                ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
+                ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead, false);
+                ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
+                ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
+
+                // Expert gating function (GLM-4.5 uses sigmoid)
+                ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
+                if (hparams.expert_gating_func == LLAMA_EXPERT_GATING_FUNC_TYPE_NONE) {
+                    hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID;
+                }
+
+                // NextN/MTP parameters
+                ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false);
+
+                switch (hparams.n_layer) {
+                    case 47: type = LLM_TYPE_106B_A12B; break; // GLM-4.5-Air (46 layers + 1 NextN layer)
+                    case 93: type = LLM_TYPE_355B_A32B; break; // GLM-4.5 (92 layers + 1 NextN layer)
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+            } break;
        case LLM_ARCH_BITNET:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
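Aside: the sigmoid fallback in the hunk above can be restated as a one-line helper. The code below is a hypothetical illustration, not part of the patch; it assumes the llama.cpp header that defines llama_expert_gating_func_type is available.

    // Hypothetical helper, equivalent to the fallback above: GLM-4.5 models whose
    // GGUF omits the expert-gating-function key default to sigmoid gating.
    static enum llama_expert_gating_func_type glm4_moe_gating(enum llama_expert_gating_func_type v) {
        return v == LLAMA_EXPERT_GATING_FUNC_TYPE_NONE ? LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID : v;
    }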
@@ -1783,6 +1824,17 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                    default: type = LLM_TYPE_UNKNOWN;
                }
            } break;
+        case LLM_ARCH_OPENAI_MOE:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
+                ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
+
+                hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
+                hparams.set_swa_pattern(2);
+
+                // TODO: switch (hparams.n_layer)
+            } break;
        case LLM_ARCH_LFM2:
            {
                ml.get_key(LLM_KV_SHORTCONV_L_CACHE, hparams.n_shortconv_l_cache);
@@ -1949,6 +2001,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {

    const auto TENSOR_DUPLICATED = llama_model_loader::TENSOR_DUPLICATED;
    const auto TENSOR_NOT_REQUIRED = llama_model_loader::TENSOR_NOT_REQUIRED;
+    const auto TENSOR_SKIP = llama_model_loader::TENSOR_SKIP;

    // create tensors for the weights
    {
@@ -2004,7 +2057,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
        }

        // skip unused tensors
-        if (info.op == GGML_OP_NONE) {
+        if (info.op == GGML_OP_NONE || flags & TENSOR_SKIP) {
            const size_t nbytes = ggml_nbytes(t_meta);
            LLAMA_LOG_WARN("model has unused tensor %s (size = %zu bytes) -- ignoring\n", tn.str().c_str(), nbytes);

@@ -2014,11 +2067,15 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
            return nullptr;
        }

-        // tensors with "bias" suffix are always used with GGML_OP_ADD
+        // tensors with "bias" suffix are always used with GGML_OP_ADD or GGML_OP_ADD_ID
        ggml_op op;
        bool bias = tn.suffix != nullptr && strcmp(tn.suffix, "bias") == 0;
        if (bias) {
-            op = GGML_OP_ADD;
+            if (info.op == GGML_OP_MUL_MAT_ID) {
+                op = GGML_OP_ADD_ID;
+            } else {
+                op = GGML_OP_ADD;
+            }
        } else {
            op = info.op;
        }
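The bias-op selection above can be read as a small pure function; the helper below is a hypothetical restatement (not part of the patch) using only the ggml op enumerators referenced in the hunk.

    #include "ggml.h"

    // Bias tensors attached to expert matmuls (weights used with GGML_OP_MUL_MAT_ID)
    // are exercised with GGML_OP_ADD_ID; every other bias tensor with plain GGML_OP_ADD.
    static enum ggml_op bias_test_op(enum ggml_op weight_op) {
        return weight_op == GGML_OP_MUL_MAT_ID ? GGML_OP_ADD_ID : GGML_OP_ADD;
    }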
@@ -4427,6 +4484,105 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                        layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
                    }
                } break;
+            case LLM_ARCH_GLM4_MOE:
+                {
+                    const int64_t n_expert = hparams.n_expert;
+                    const int64_t n_expert_used = hparams.n_expert_used;
+                    const int64_t n_expert_shared = hparams.n_expert_shared;
+
+                    GGML_ASSERT(hparams.n_expert > 0 && "n_expert must be > 0 for GLM4_MOE MoE layers");
+                    GGML_ASSERT(hparams.n_expert_used > 0 && "n_expert_used must be > 0 for GLM4_MOE MoE layers");
+
+                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
+
+                    // output
+                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
+                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED);
+                    // if output is NULL, init from the input tok embed
+                    if (output == NULL) {
+                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, TENSOR_DUPLICATED);
+                    }
+
+                    // Load ALL tensors including NextN layer to satisfy total tensor count
+                    // but only PROCESS up to last layer (skipping final NextN layer) in forward pass
+                    for (int i = 0; i < n_layer; ++i) {
+                        int flags = 0;
+                        if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
+                            // skip all tensors in the NextN layers
+                            flags |= TENSOR_SKIP;
+                        }
+
+                        auto & layer = layers[i];
+
+                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, flags);
+
+                        // GLM-style attention with bias terms
+                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head }, flags);
+                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_k_gqa }, flags);
+                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_v_gqa }, flags);
+                        layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), { n_embd_head_k * n_head }, flags);
+                        layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), { n_embd_k_gqa }, flags);
+                        layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), { n_embd_v_gqa }, flags);
+
+                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, flags);
+
+                        // K/Q norm tensors (optional for GLM-4.5 355B variant)
+                        layer.attn_q_norm = create_tensor(
+                            tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), { n_embd_head_k }, TENSOR_NOT_REQUIRED | flags);
+                        layer.attn_k_norm = create_tensor(
+                            tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), { n_embd_head_k }, TENSOR_NOT_REQUIRED | flags);
+
+                        layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), { n_embd }, flags);
+
+                        // Check if this layer uses MoE or dense FFN based on n_layer_dense_lead
+                        // GLM 4.5 uses hybrid architecture: layer 0 is dense, layers 1+ are MoE
+                        const bool use_moe = (static_cast<uint32_t>(i) >= hparams.n_layer_dense_lead);
+
+                        if (use_moe) {
+                            // MoE layers
+                            layer.ffn_gate_inp =
+                                create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert }, flags);
+                            layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), { n_expert }, flags);
+
+                            // MoE branch
+                            const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
+
+                            layer.ffn_gate_exps = create_tensor(
+                                tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, flags);
+                            layer.ffn_down_exps = create_tensor(
+                                tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert }, flags);
+                            layer.ffn_up_exps = create_tensor(
+                                tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, flags);
+
+                            // Shared expert
+                            if (n_expert_shared > 0) {
+                                const int64_t n_ff_shexp = n_ff_exp * n_expert_shared;
+                                layer.ffn_gate_shexp = create_tensor(
+                                    tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), { n_embd, n_ff_shexp }, flags);
+                                layer.ffn_down_shexp = create_tensor(
+                                    tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_shexp, n_embd }, flags);
+                                layer.ffn_up_shexp = create_tensor(
+                                    tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), { n_embd, n_ff_shexp }, flags);
+                            }
+                        } else {
+                            // Dense layers (first k layers) - GLM uses separate gate/up projections
+                            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), { n_embd, n_ff }, flags);
+                            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, flags);
+                            layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, n_ff }, flags);
+                        }
+
+                        // NextN/MTP tensors (preserved but unused) - conditionally load for last nextn_predict_layers
+                        if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
+                            layer.nextn.eh_proj = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i), { 2 * n_embd, n_embd }, flags);
+                            layer.nextn.embed_tokens = create_tensor(tn(LLM_TENSOR_NEXTN_EMBED_TOKENS, "weight", i), { n_embd, n_vocab }, flags);
+                            layer.nextn.enorm = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM, "weight", i), { n_embd }, flags);
+                            layer.nextn.hnorm = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM, "weight", i), { n_embd }, flags);
+                            layer.nextn.shared_head_head = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "weight", i), { n_embd, n_vocab }, flags);
+                            layer.nextn.shared_head_norm = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "weight", i), { n_embd }, flags);
+                        }
+                    }
+                }
+                break;
            case LLM_ARCH_NEMOTRON:
                {
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
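The skip condition used twice in the GLM4_MOE loop above boils down to one predicate; the helper below is a hypothetical restatement (not in the patch), with the worked numbers taken from the load_hparams hunk earlier.

    #include <stdint.h>
    #include <stdbool.h>

    // A layer index belongs to the trailing NextN/MTP block when it is one of the
    // last nextn_predict_layers layers; those tensors are created with TENSOR_SKIP.
    static bool glm4_moe_is_nextn_layer(uint32_t il, uint32_t n_layer, uint32_t nextn_predict_layers) {
        return nextn_predict_layers > 0 && il >= n_layer - nextn_predict_layers;
    }
    // e.g. GLM-4.5-Air: n_layer = 47, nextn_predict_layers = 1 -> only layer 46 is skipped,
    // leaving 46 transformer layers for the forward pass.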
@@ -5270,6 +5426,46 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                        layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
                    }
                } break;
+            case LLM_ARCH_OPENAI_MOE:
+                {
+                    const int64_t n_ff_exp = hparams.n_ff_exp;
+
+                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+                    // output
+                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        auto & layer = layers[i];
+
+                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+                        layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
+
+                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_head * n_rot}, 0);
+                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_head_kv * n_rot}, 0);
+                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_head_kv * n_rot}, 0);
+                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head * n_rot, n_embd}, 0);
+
+                        layer.attn_sinks = create_tensor(tn(LLM_TENSOR_ATTN_SINKS, "weight", i), {n_head}, 0);
+
+                        layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert}, 0);
+                        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
+                        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
+                        layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
+
+                        // bias
+                        layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_head * n_rot}, 0);
+                        layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_head_kv * n_rot}, 0);
+                        layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_head_kv * n_rot}, 0);
+                        layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
+
+                        layer.ffn_gate_inp_b = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "bias", i), {n_expert}, 0);
+                        layer.ffn_gate_exps_b = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "bias", i), {n_ff_exp, n_expert}, 0);
+                        layer.ffn_down_exps_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "bias", i), { n_embd, n_expert}, 0);
+                        layer.ffn_up_exps_b = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "bias", i), {n_ff_exp, n_expert}, 0);
+                    }
+                } break;
            case LLM_ARCH_LFM2:
                {
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -5642,7 +5838,7 @@ void llama_model::print_info() const {
        LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
    }

-    if (arch == LLM_ARCH_QWEN3MOE) {
+    if (arch == LLM_ARCH_QWEN3MOE || arch == LLM_ARCH_OPENAI_MOE) {
        LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
    }

@@ -13564,6 +13760,165 @@ struct llm_build_glm4 : public llm_graph_context {
     }
 };

+struct llm_build_glm4_moe : public llm_graph_context {
+    llm_build_glm4_moe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
+        ggml_tensor * cur;
+        ggml_tensor * inpL;
+
+        inpL = build_inp_embd(model.tok_embd);
+
+        // inp_pos - contains the positions
+        ggml_tensor * inp_pos = build_inp_pos();
+
+        auto * inp_attn = build_attn_inp_kv_unified();
+
+        ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+        // Only process up to last layer (skip final NextN layer)
+        // Final layer tensors are loaded but not processed in forward pass
+        const int n_transformer_layers = n_layer - hparams.nextn_predict_layers;
+        for (int il = 0; il < n_transformer_layers; ++il) {
+            ggml_tensor * inpSA = inpL;
+
+            // Pre-attention norm
+            cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
+            cb(cur, "attn_norm", il);
+
+            // self-attention
+            {
+                ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+                if (model.layers[il].bq) {
+                    Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+                }
+                cb(Qcur, "Qcur", il);
+
+                ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+                if (model.layers[il].bk) {
+                    Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+                }
+                cb(Kcur, "Kcur", il);
+
+                ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+                if (model.layers[il].bv) {
+                    Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+                }
+                cb(Vcur, "Vcur", il);
+
+                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+                Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+                // Apply Q/K norm if available (GLM-4.5 355B variant)
+                if (model.layers[il].attn_q_norm) {
+                    Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
+                    cb(Qcur, "Qcur_normed", il);
+                }
+                if (model.layers[il].attn_k_norm) {
+                    Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
+                    cb(Kcur, "Kcur_normed", il);
+                }
+
+                Qcur = ggml_rope_ext(
+                    ctx0, Qcur, inp_pos, nullptr,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+
+                Kcur = ggml_rope_ext(
+                    ctx0, Kcur, inp_pos, nullptr,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+
+                cb(Qcur, "Qcur", il);
+                cb(Kcur, "Kcur", il);
+                cb(Vcur, "Vcur", il);
+
+                cur = build_attn(inp_attn,
+                    model.layers[il].wo, NULL,
+                    Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+            }
+
+            if (il == n_transformer_layers - 1 && inp_out_ids) {
+                cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            }
+
+            ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+            cb(ffn_inp, "ffn_inp", il);
+
+            // Post-attention norm
+            cur = build_norm(ffn_inp, model.layers[il].attn_post_norm, NULL, LLM_NORM_RMS, il);
+            cb(cur, "post_attn_norm", il);
+
+            // Check if this is a dense layer (n_layer_dense_lead=1, so layer 0 is dense)
+            if (static_cast<uint32_t>(il) < hparams.n_layer_dense_lead) {
+                // Dense FFN layer
+                cur = build_ffn(cur,
+                    model.layers[il].ffn_up, NULL, NULL,
+                    model.layers[il].ffn_gate, NULL, NULL,
+                    model.layers[il].ffn_down, NULL, NULL,
+                    NULL,
+                    LLM_FFN_SILU, LLM_FFN_PAR, il);
+                cb(cur, "ffn_out", il);
+            } else {
+                // Process routed experts using existing MoE infrastructure
+                ggml_tensor * routed_out = build_moe_ffn(cur,
+                    model.layers[il].ffn_gate_inp,
+                    model.layers[il].ffn_up_exps,
+                    model.layers[il].ffn_gate_exps,
+                    model.layers[il].ffn_down_exps,
+                    model.layers[il].ffn_exp_probs_b,
+                    n_expert, n_expert_used,
+                    LLM_FFN_SILU, hparams.expert_weights_norm,
+                    true, hparams.expert_weights_scale,
+                    (llama_expert_gating_func_type) hparams.expert_gating_func,
+                    il);
+                cb(routed_out, "ffn_moe_out", il);
+
+                // Process shared expert on original input
+                ggml_tensor * shared_out = build_ffn(cur,
+                    model.layers[il].ffn_up_shexp, NULL, NULL,
+                    model.layers[il].ffn_gate_shexp, NULL, NULL,
+                    model.layers[il].ffn_down_shexp, NULL, NULL,
+                    NULL,
+                    LLM_FFN_SILU, LLM_FFN_PAR, il);
+                cb(shared_out, "ffn_shexp_out", il);
+
+                // Final output: routed_output + shared_output
+                cur = ggml_add(ctx0, routed_out, shared_out);
+                cb(cur, "ffn_out", il);
+            }
+
+            cur = ggml_add(ctx0, cur, ffn_inp);
+
+            cur = build_cvec(cur, il);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+        cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
+
+        cb(cur, "result_norm", -1);
+        res->t_embd = cur;
+
+        // lm_head
+        cur = build_lora_mm(model.output, cur);
+
+        cb(cur, "result_output", -1);
+        res->t_logits = cur;
+
+        ggml_build_forward_expand(gf, cur);
+    }
+};
+
 struct llm_build_nemotron : public llm_graph_context {
     llm_build_nemotron(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
         const int64_t n_embd_head = hparams.n_embd_head_v;
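The per-layer FFN choice inside llm_build_glm4_moe above follows a single threshold; the helper below is a hypothetical restatement of that check (not in the patch), with the example value taken from the comment in the builder.

    #include <stdint.h>
    #include <stdbool.h>

    // Layers below n_layer_dense_lead take the dense SwiGLU FFN path; all later layers
    // run the routed experts and the shared expert, and the two outputs are summed.
    static bool glm4_moe_layer_is_dense(uint32_t il, uint32_t n_layer_dense_lead) {
        return il < n_layer_dense_lead; // e.g. n_layer_dense_lead = 1 -> only layer 0 is dense
    }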
@@ -17251,6 +17606,136 @@ struct llm_build_smollm3 : public llm_graph_context {
     }
 };

+struct llm_build_openai_moe_iswa : public llm_graph_context {
+    llm_build_openai_moe_iswa(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+        ggml_tensor * cur;
+        ggml_tensor * inpL;
+
+        inpL = build_inp_embd(model.tok_embd);
+
+        // inp_pos - contains the positions
+        ggml_tensor * inp_pos = build_inp_pos();
+
+        auto * inp_attn = build_attn_inp_kv_unified_iswa();
+
+        for (int il = 0; il < n_layer; ++il) {
+            ggml_tensor * inpSA = inpL;
+
+            // norm
+            cur = build_norm(inpL,
+                    model.layers[il].attn_norm, nullptr,
+                    LLM_NORM_RMS, il);
+            cb(cur, "attn_norm", il);
+
+            // self-attention
+            {
+                // compute Q and K and RoPE them
+                ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
+                if (model.layers[il].bq) {
+                    Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+                    cb(Qcur, "Qcur", il);
+                }
+
+                ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
+                if (model.layers[il].bk) {
+                    Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+                    cb(Kcur, "Kcur", il);
+                }
+
+                ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
+                if (model.layers[il].bv) {
+                    Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+                    cb(Vcur, "Vcur", il);
+                }
+
+                Qcur = ggml_reshape_3d(ctx0, Qcur, n_rot, n_head, n_tokens);
+                Kcur = ggml_reshape_3d(ctx0, Kcur, n_rot, n_head_kv, n_tokens);
+                Vcur = ggml_reshape_3d(ctx0, Vcur, n_rot, n_head_kv, n_tokens);
+
+                Qcur = ggml_rope_ext(
+                    ctx0, Qcur, inp_pos, nullptr,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+
+                Kcur = ggml_rope_ext(
+                    ctx0, Kcur, inp_pos, nullptr,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+
+                cb(Qcur, "Qcur", il);
+                cb(Kcur, "Kcur", il);
+                cb(Vcur, "Vcur", il);
+
+                cur = build_attn_with_sinks(inp_attn,
+                    model.layers[il].wo, model.layers[il].bo,
+                    Qcur, Kcur, Vcur, nullptr, nullptr, model.layers[il].attn_sinks, 1.0f/sqrtf(float(n_rot)), il);
+
+                cb(cur, "attn_out", il);
+            }
+
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                ggml_tensor * inp_out_ids = build_inp_out_ids();
+                cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            }
+
+            ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+            cb(ffn_inp, "ffn_inp", il);
+
+            cur = ffn_inp;
+            cur = build_norm(cur,
+                    model.layers[il].attn_post_norm, nullptr,
+                    LLM_NORM_RMS, il);
+            cb(cur, "attn_post_norm", il);
+
+            // MoE branch
+            cur = build_moe_ffn(cur,
+                    model.layers[il].ffn_gate_inp, model.layers[il].ffn_gate_inp_b,
+                    model.layers[il].ffn_up_exps, model.layers[il].ffn_up_exps_b,
+                    model.layers[il].ffn_gate_exps, model.layers[il].ffn_gate_exps_b,
+                    model.layers[il].ffn_down_exps, model.layers[il].ffn_down_exps_b,
+                    nullptr,
+                    n_expert, n_expert_used,
+                    LLM_FFN_SWIGLU_OAI_MOE, false,
+                    false, 0.0,
+                    LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX_WEIGHT,
+                    il);
+            cb(cur, "ffn_moe_out", il);
+
+            cur = ggml_add(ctx0, cur, ffn_inp);
+
+            cur = build_cvec(cur, il);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = build_norm(cur,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, -1);
+
+        cb(cur, "result_norm", -1);
+        res->t_embd = cur;
+
+        // lm_head
+        cur = build_lora_mm(model.output, cur);
+
+        cb(cur, "result_output", -1);
+        res->t_logits = cur;
+
+        ggml_build_forward_expand(gf, cur);
+    }
+};
+
 struct llm_build_lfm2 : public llm_graph_context {
     const llama_model & model;

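On the _iswa suffix of the builder above: the earlier load_hparams hunk sets hparams.set_swa_pattern(2) for LLM_ARCH_OPENAI_MOE, which this note reads as interleaving sliding-window and full-attention layers with period 2. The sketch below is an assumption of that reading; in particular, which parity carries the sliding window is not stated by the patch.

    #include <stdint.h>
    #include <stdbool.h>

    // Assumed layout for set_swa_pattern(2): the attention kind alternates every layer.
    static bool openai_moe_layer_uses_swa(uint32_t il) {
        return (il % 2) == 0; // assumption: even layers use the sliding window, odd layers attend fully
    }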
@@ -17877,6 +18362,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
            {
                llm = std::make_unique<llm_build_glm4>(*this, params);
            } break;
+        case LLM_ARCH_GLM4_MOE:
+            {
+                llm = std::make_unique<llm_build_glm4_moe>(*this, params);
+            } break;
        case LLM_ARCH_BITNET:
            {
                llm = std::make_unique<llm_build_bitnet>(*this, params);
@@ -17990,6 +18479,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
            {
                llm = std::make_unique<llm_build_smollm3>(*this, params);
            } break;
+        case LLM_ARCH_OPENAI_MOE:
+            {
+                llm = std::make_unique<llm_build_openai_moe_iswa>(*this, params);
+            } break;
        case LLM_ARCH_FALCON_H1:
            {
                llm = std::make_unique<llm_build_falcon_h1>(*this, params);
@@ -18205,9 +18698,11 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
        case LLM_ARCH_MINICPM3:
        case LLM_ARCH_DOTS1:
        case LLM_ARCH_HUNYUAN_MOE:
+        case LLM_ARCH_OPENAI_MOE:
        case LLM_ARCH_HUNYUAN_DENSE:
        case LLM_ARCH_LFM2:
        case LLM_ARCH_SMALLTHINKER:
+        case LLM_ARCH_GLM4_MOE:
            return LLAMA_ROPE_TYPE_NEOX;

        case LLM_ARCH_QWEN2VL:
package/src/llama.cpp/src/llama-model.h

@@ -101,8 +101,10 @@ enum llm_type {
    LLM_TYPE_A13B,
    LLM_TYPE_21B_A3B, // Ernie MoE small
    LLM_TYPE_30B_A3B,
+    LLM_TYPE_106B_A12B, // GLM-4.5-Air
    LLM_TYPE_235B_A22B,
    LLM_TYPE_300B_A47B, // Ernie MoE big
+    LLM_TYPE_355B_A32B, // GLM-4.5
    LLM_TYPE_E2B,
    LLM_TYPE_E4B,
};
@@ -166,6 +168,15 @@ struct llama_layer_shortconv {
    struct ggml_tensor * out_proj = nullptr;
};

+struct llama_layer_nextn {
+    struct ggml_tensor * eh_proj = nullptr;
+    struct ggml_tensor * embed_tokens = nullptr;
+    struct ggml_tensor * enorm = nullptr;
+    struct ggml_tensor * hnorm = nullptr;
+    struct ggml_tensor * shared_head_head = nullptr;
+    struct ggml_tensor * shared_head_norm = nullptr;
+};
+
struct llama_layer {
    // normalization
    struct ggml_tensor * attn_norm = nullptr;
@@ -241,10 +252,14 @@ struct llama_layer {
    struct ggml_tensor * ffn_up_enc = nullptr;

    // ff MoE
-    struct ggml_tensor * ffn_gate_inp  = nullptr;
-    struct ggml_tensor * ffn_gate_exps = nullptr;
-    struct ggml_tensor * ffn_down_exps = nullptr;
-    struct ggml_tensor * ffn_up_exps   = nullptr;
+    struct ggml_tensor * ffn_gate_inp = nullptr;
+    struct ggml_tensor * ffn_gate_exps = nullptr;
+    struct ggml_tensor * ffn_down_exps = nullptr;
+    struct ggml_tensor * ffn_up_exps = nullptr;
+    struct ggml_tensor * ffn_gate_inp_b = nullptr;
+    struct ggml_tensor * ffn_gate_exps_b = nullptr;
+    struct ggml_tensor * ffn_down_exps_b = nullptr;
+    struct ggml_tensor * ffn_up_exps_b = nullptr;

    // ff shared expert (shexp)
    struct ggml_tensor * ffn_gate_inp_shexp = nullptr;
@@ -349,11 +364,16 @@ struct llama_layer {
    struct ggml_tensor * laurel_r = nullptr;
    struct ggml_tensor * laurel_post_norm = nullptr;

+    // openai-moe
+    struct ggml_tensor * attn_sinks = nullptr;
+
    struct llama_layer_posnet posnet;

    struct llama_layer_convnext convnext;

    struct llama_layer_shortconv shortconv;
+
+    struct llama_layer_nextn nextn;
};

struct llama_model {