@fugood/llama.node 1.1.5 → 1.1.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +14 -14
- package/scripts/llama.cpp.patch +17 -13
- package/src/LlamaCompletionWorker.cpp +2 -0
- package/src/llama.cpp/common/arg.cpp +28 -11
- package/src/llama.cpp/common/chat.cpp +46 -2
- package/src/llama.cpp/common/chat.h +7 -2
- package/src/llama.cpp/common/common.h +3 -2
- package/src/llama.cpp/ggml/CMakeLists.txt +3 -2
- package/src/llama.cpp/ggml/include/ggml.h +37 -1
- package/src/llama.cpp/ggml/src/CMakeLists.txt +12 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +61 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +96 -8
- package/src/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +6 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +14 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +207 -9
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +2 -7
- package/src/llama.cpp/ggml/src/ggml-cpu/quants.c +35 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/quants.h +8 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +19 -4
- package/src/llama.cpp/include/llama.h +1 -0
- package/src/llama.cpp/src/llama-arch.cpp +65 -0
- package/src/llama.cpp/src/llama-arch.h +10 -0
- package/src/llama.cpp/src/llama-chat.cpp +13 -0
- package/src/llama.cpp/src/llama-chat.h +1 -0
- package/src/llama.cpp/src/llama-context.cpp +8 -8
- package/src/llama.cpp/src/llama-graph.cpp +118 -9
- package/src/llama.cpp/src/llama-graph.h +38 -0
- package/src/llama.cpp/src/llama-hparams.h +5 -3
- package/src/llama.cpp/src/llama-kv-cache-unified.cpp +4 -0
- package/src/llama.cpp/src/llama-model-loader.cpp +1 -0
- package/src/llama.cpp/src/llama-model-loader.h +3 -2
- package/src/llama.cpp/src/llama-model.cpp +499 -4
- package/src/llama.cpp/src/llama-model.h +24 -4
- package/src/llama.cpp/src/llama-quant.cpp +37 -1
- package/src/llama.cpp/src/llama-vocab.cpp +42 -0
@@ -55,7 +55,22 @@ inline static void ggml_vec_cpy_i32(const int n, int32_t * y, const int32_t * x)
 
 inline static void ggml_vec_set_f16(const int n, ggml_fp16_t * x, const ggml_fp16_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
 inline static void ggml_vec_set_bf16(const int n, ggml_bf16_t * x, const ggml_bf16_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
-
+
+inline static void ggml_vec_add_f32 (const int n, float * z, const float * x, const float * y) {
+    int i = 0;
+#if defined(__AVX2__)
+    for (; i + 7 < n; i += 8) {
+        __m256 vx = _mm256_loadu_ps(x + i);
+        __m256 vy = _mm256_loadu_ps(y + i);
+        __m256 vz = _mm256_add_ps(vx, vy);
+        _mm256_storeu_ps(z + i, vz);
+    }
+#endif
+    for (; i < n; ++i) {
+        z[i] = x[i] + y[i];
+    }
+}
+
 inline static void ggml_vec_add_f16 (const int n, ggml_fp16_t * z, const ggml_fp16_t * x, const ggml_fp16_t * y) {
     for (int i = 0; i < n; ++i) {
         z[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(x[i]) + GGML_CPU_FP16_TO_FP32(y[i]));
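Note: the new ggml_vec_add_f32 walks the inputs eight floats at a time on AVX2 builds and finishes any remainder with the scalar tail loop. A minimal driver sketch (hypothetical, not part of this package):

    // n = 11 exercises both the AVX2 body (8 lanes) and the 3-element scalar tail
    float x[11], y[11], z[11];
    for (int i = 0; i < 11; ++i) { x[i] = (float) i; y[i] = 2.0f * i; }
    ggml_vec_add_f32(11, z, x, y); // expect z[i] == 3.0f * i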
@@ -992,9 +1007,9 @@ void ggml_vec_swiglu_f32(const int n, float * y, const float * x, const float *
 
 inline static void ggml_vec_swiglu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const ggml_fp16_t * g) {
     for (int i = 0; i < n; ++i) {
-        float
-        float
-        y[i] = GGML_CPU_FP32_TO_FP16((
+        float xi = GGML_CPU_FP16_TO_FP32(x[i]);
+        float gi = GGML_CPU_FP16_TO_FP32(g[i]);
+        y[i] = GGML_CPU_FP32_TO_FP16((xi/(1.0f + expf(-xi))) * gi);
     }
 }
 
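Note: the swiglu_f16 rewrite only changes how the intermediates are held (xi and gi are converted from FP16 once); the math is still silu(x) * g. A scalar reference in plain C (illustrative names, not from the diff):

    #include <math.h>
    static float silu_ref(float x)            { return x / (1.0f + expf(-x)); }
    static float swiglu_ref(float x, float g) { return silu_ref(x) * g; }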
@@ -152,6 +152,7 @@ extern "C" {
     //LLAMA_FTYPE_MOSTLY_Q4_0_8_8 = 35, // removed from gguf files, use Q4_0 and runtime repack
     LLAMA_FTYPE_MOSTLY_TQ1_0 = 36, // except 1d tensors
     LLAMA_FTYPE_MOSTLY_TQ2_0 = 37, // except 1d tensors
+    LLAMA_FTYPE_MOSTLY_MXFP4_MOE = 38, // except 1d tensors
 
     LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
 };
@@ -62,6 +62,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_DEEPSEEK2, "deepseek2" },
     { LLM_ARCH_CHATGLM, "chatglm" },
     { LLM_ARCH_GLM4, "glm4" },
+    { LLM_ARCH_GLM4_MOE, "glm4moe" },
     { LLM_ARCH_BITNET, "bitnet" },
     { LLM_ARCH_T5, "t5" },
     { LLM_ARCH_T5ENCODER, "t5encoder" },
@@ -87,6 +88,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_HUNYUAN_MOE, "hunyuan-moe" },
     { LLM_ARCH_HUNYUAN_DENSE, "hunyuan-dense" },
     { LLM_ARCH_SMOLLM3, "smollm3" },
+    { LLM_ARCH_OPENAI_MOE, "gpt-oss" },
     { LLM_ARCH_LFM2, "lfm2" },
     { LLM_ARCH_DREAM, "dream" },
     { LLM_ARCH_SMALLTHINKER, "smallthinker" },
@@ -127,6 +129,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_EXPERT_WEIGHTS_NORM, "%s.expert_weights_norm" },
     { LLM_KV_EXPERT_GATING_FUNC, "%s.expert_gating_func" },
     { LLM_KV_MOE_EVERY_N_LAYERS, "%s.moe_every_n_layers" },
+    { LLM_KV_NEXTN_PREDICT_LAYERS, "%s.nextn_predict_layers" },
     { LLM_KV_POOLING_TYPE, "%s.pooling_type" },
     { LLM_KV_LOGIT_SCALE, "%s.logit_scale" },
     { LLM_KV_DECODER_START_TOKEN_ID, "%s.decoder_start_token_id" },
@@ -1391,6 +1394,40 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
         { LLM_TENSOR_FFN_POST_NORM, "blk.%d.post_ffw_norm" },
     },
 },
+{
+    LLM_ARCH_GLM4_MOE,
+    {
+        { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+        { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+        { LLM_TENSOR_OUTPUT, "output" },
+        { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+        { LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" },
+        { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+        { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+        { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+        { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+        { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
+        { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
+        { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+        { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+        { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+        { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
+        { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+        { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+        { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
+        { LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" },
+        { LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" },
+        { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
+        { LLM_TENSOR_FFN_EXP_PROBS_B, "blk.%d.exp_probs_b" },
+        // NextN/MTP tensors - preserved but unused (in final layer, dynamic layer number)
+        { LLM_TENSOR_NEXTN_EH_PROJ, "blk.%d.nextn.eh_proj" },
+        { LLM_TENSOR_NEXTN_EMBED_TOKENS, "blk.%d.nextn.embed_tokens" },
+        { LLM_TENSOR_NEXTN_ENORM, "blk.%d.nextn.enorm" },
+        { LLM_TENSOR_NEXTN_HNORM, "blk.%d.nextn.hnorm" },
+        { LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "blk.%d.nextn.shared_head_head" },
+        { LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "blk.%d.nextn.shared_head_norm" },
+    },
+},
 {
     LLM_ARCH_BITNET,
     {
@@ -1935,6 +1972,25 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
         { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
     },
 },
+{
+    LLM_ARCH_OPENAI_MOE,
+    {
+        { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+        { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+        { LLM_TENSOR_OUTPUT, "output" },
+        { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+        { LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" },
+        { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+        { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+        { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+        { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+        { LLM_TENSOR_ATTN_SINKS, "blk.%d.attn_sinks" },
+        { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
+        { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+        { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+        { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
+    },
+},
 {
     LLM_ARCH_LFM2,
     {
@@ -2050,6 +2106,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
     {LLM_TENSOR_ATTN_KV_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_ATTN_K_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_ATTN_V_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_ATTN_SINKS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_SCALE}},
     {LLM_TENSOR_DEC_ATTN_Q, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_DEC_ATTN_K, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_DEC_ATTN_V, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
@@ -2181,6 +2238,14 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
     {LLM_TENSOR_SHORTCONV_CONV, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_SSM_CONV}},
     {LLM_TENSOR_SHORTCONV_INPROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_SHORTCONV_OUTPROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    // NextN/MTP tensors are currently ignored (reserved for future MTP support)
+    // These tensors only exist in the last layer(s) and are treated as output tensors
+    {LLM_TENSOR_NEXTN_EH_PROJ, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_NEXTN_EMBED_TOKENS, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_GET_ROWS}},
+    {LLM_TENSOR_NEXTN_ENORM, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_GET_ROWS}},
+    {LLM_TENSOR_NEXTN_HNORM, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL}},
+    {LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL}},
 };
 
 LLM_KV::LLM_KV(llm_arch arch, const char * suffix) : arch(arch), suffix(suffix) {}
@@ -66,6 +66,7 @@ enum llm_arch {
     LLM_ARCH_DEEPSEEK2,
     LLM_ARCH_CHATGLM,
     LLM_ARCH_GLM4,
+    LLM_ARCH_GLM4_MOE,
     LLM_ARCH_BITNET,
     LLM_ARCH_T5,
     LLM_ARCH_T5ENCODER,
@@ -91,6 +92,7 @@ enum llm_arch {
     LLM_ARCH_HUNYUAN_MOE,
     LLM_ARCH_HUNYUAN_DENSE,
     LLM_ARCH_SMOLLM3,
+    LLM_ARCH_OPENAI_MOE,
     LLM_ARCH_LFM2,
     LLM_ARCH_DREAM,
     LLM_ARCH_SMALLTHINKER,
@@ -131,6 +133,7 @@ enum llm_kv {
     LLM_KV_EXPERT_WEIGHTS_NORM,
     LLM_KV_EXPERT_GATING_FUNC,
     LLM_KV_MOE_EVERY_N_LAYERS,
+    LLM_KV_NEXTN_PREDICT_LAYERS,
     LLM_KV_POOLING_TYPE,
     LLM_KV_LOGIT_SCALE,
     LLM_KV_DECODER_START_TOKEN_ID,
@@ -263,6 +266,7 @@ enum llm_tensor {
     LLM_TENSOR_ATTN_OUT_NORM,
     LLM_TENSOR_ATTN_POST_NORM,
     LLM_TENSOR_ATTN_ROT_EMBD,
+    LLM_TENSOR_ATTN_SINKS,
     LLM_TENSOR_FFN_GATE_INP,
     LLM_TENSOR_FFN_GATE_INP_SHEXP,
     LLM_TENSOR_FFN_NORM,
@@ -409,6 +413,12 @@ enum llm_tensor {
     LLM_TENSOR_SHORTCONV_CONV,
     LLM_TENSOR_SHORTCONV_INPROJ,
     LLM_TENSOR_SHORTCONV_OUTPROJ,
+    LLM_TENSOR_NEXTN_EH_PROJ,
+    LLM_TENSOR_NEXTN_EMBED_TOKENS,
+    LLM_TENSOR_NEXTN_ENORM,
+    LLM_TENSOR_NEXTN_HNORM,
+    LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD,
+    LLM_TENSOR_NEXTN_SHARED_HEAD_NORM,
 };
 
 enum llm_tensor_layer {
@@ -66,6 +66,7 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
     { "llama4", LLM_CHAT_TEMPLATE_LLAMA4 },
     { "smolvlm", LLM_CHAT_TEMPLATE_SMOLVLM },
     { "hunyuan-moe", LLM_CHAT_TEMPLATE_HUNYUAN_MOE },
+    { "gpt-oss", LLM_CHAT_TEMPLATE_OPENAI_MOE },
     { "hunyuan-dense", LLM_CHAT_TEMPLATE_HUNYUAN_DENSE },
     { "kimi-k2", LLM_CHAT_TEMPLATE_KIMI_K2 },
 };
@@ -194,6 +195,8 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
         return LLM_CHAT_TEMPLATE_DOTS1;
     } else if (tmpl_contains("<|startoftext|>") && tmpl_contains("<|extra_4|>")) {
         return LLM_CHAT_TEMPLATE_HUNYUAN_MOE;
+    } else if (tmpl_contains("<|start|>") && tmpl_contains("<|channel|>")) {
+        return LLM_CHAT_TEMPLATE_OPENAI_MOE;
     } else if (tmpl_contains("<|hy_place▁holder▁no▁2|>") && tmpl_contains("<|hy_place▁holder▁no▁3|>")) {
         return LLM_CHAT_TEMPLATE_HUNYUAN_DENSE;
     } else if (tmpl_contains("<|im_assistant|>assistant<|im_middle|>")) {
@@ -706,6 +709,16 @@ int32_t llm_chat_apply_template(
                 ss << "<|startoftext|>" << message->content << "<|extra_0|>";
             }
         }
+    } else if (tmpl == LLM_CHAT_TEMPLATE_OPENAI_MOE) {
+        // OpenAI MoE (based on Harmony chat template)
+        for (auto message : chat) {
+            std::string role(message->role);
+            ss << "<|start|>" << role << "<|message|>" << message->content;
+            ss << (role == "assistant" ? "<|return|>" : "<|end|>");
+        }
+        if (add_ass) {
+            ss << "<|start|>assistant";
+        }
     } else if (tmpl == LLM_CHAT_TEMPLATE_HUNYUAN_DENSE) {
         // tencent/Hunyuan-4B-Instruct
         for (size_t i = 0; i < chat.size(); i++) {
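Note: with the branch added above, a two-message exchange renders as shown below (derived from the code; line breaks added here only for readability), and the trailing "<|start|>assistant" is emitted only when add_ass is set:

    <|start|>user<|message|>Hello<|end|>
    <|start|>assistant<|message|>Hi there!<|return|>
    <|start|>assistant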
@@ -786,7 +786,7 @@ int llama_context::encode(const llama_batch & batch_inp) {
     const auto & hparams = model.hparams;
 
     const int64_t n_embd = hparams.n_embd;
-    const
+    const int64_t n_vocab = model.vocab.n_tokens();
 
     // note: during encode, we always pass the full sequence starting from pos = 0
     if (!balloc->init(batch_inp, model.vocab, nullptr, n_embd, cparams.kv_unified ? LLAMA_MAX_SEQ : cparams.n_seq_max, true)) {
@@ -959,7 +959,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
     const auto & vocab = model.vocab;
     const auto & hparams = model.hparams;
 
-    const
+    const int64_t n_vocab = vocab.n_tokens();
     const int64_t n_embd = hparams.n_embd;
 
     // when computing embeddings, all tokens are output
@@ -1328,21 +1328,21 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) {
 }
 
 void llama_context::output_reorder() {
-    const
+    const uint64_t n_vocab = model.vocab.n_tokens();
     const uint64_t n_embd = model.hparams.n_embd;
 
-    for (
-    const
-    const
+    for (size_t s = 0; s < output_swaps.size(); ++s) {
+        const uint64_t i0 = output_swaps[s].i0;
+        const uint64_t i1 = output_swaps[s].i1;
 
         if (logits_size > 0) {
-            for (
+            for (uint64_t k = 0; k < n_vocab; k++) {
                 std::swap(logits[i0*n_vocab + k], logits[i1*n_vocab + k]);
             }
         }
 
         if (embd_size > 0) {
-            for (
+            for (uint64_t k = 0; k < n_embd; k++) {
                 std::swap(embd[i0*n_embd + k], embd[i1*n_embd + k]);
             }
         }
@@ -740,6 +740,8 @@ ggml_tensor * llm_graph_context::build_ffn(
                 cur = ggml_reglu(ctx0, cur);
                 cb(cur, "ffn_reglu", il);
             } break;
+        default:
+            GGML_ABORT("fatal error");
     }
 
     if (gate && type_gate == LLM_FFN_PAR) {
@@ -749,8 +751,8 @@ ggml_tensor * llm_graph_context::build_ffn(
 
     if (down) {
         cur = build_lora_mm(down, cur);
-        if (arch == LLM_ARCH_GLM4) {
-            // GLM4
+        if (arch == LLM_ARCH_GLM4 || arch == LLM_ARCH_GLM4_MOE) {
+            // GLM4 and GLM4_MOE seem to have numerical issues with half-precision accumulators
             ggml_mul_mat_set_prec(cur, GGML_PREC_F32);
         }
     }
@@ -787,6 +789,45 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
         llama_expert_gating_func_type gating_op,
         int il,
         ggml_tensor * probs_in) const {
+    return build_moe_ffn(
+        cur,
+        gate_inp,  /* gate_inp_b  */ nullptr,
+        up_exps,   /* up_exps_b   */ nullptr,
+        gate_exps, /* gate_exps_b */ nullptr,
+        down_exps, /* down_exps_b */ nullptr,
+        exp_probs_b,
+        n_expert,
+        n_expert_used,
+        type_op,
+        norm_w,
+        scale_w,
+        w_scale,
+        gating_op,
+        il,
+        probs_in
+    );
+}
+
+ggml_tensor * llm_graph_context::build_moe_ffn(
+        ggml_tensor * cur,
+        ggml_tensor * gate_inp,
+        ggml_tensor * gate_inp_b,
+        ggml_tensor * up_exps,
+        ggml_tensor * up_exps_b,
+        ggml_tensor * gate_exps,
+        ggml_tensor * gate_exps_b,
+        ggml_tensor * down_exps,
+        ggml_tensor * down_exps_b,
+        ggml_tensor * exp_probs_b,
+        int64_t n_expert,
+        int64_t n_expert_used,
+        llm_ffn_op_type type_op,
+        bool norm_w,
+        bool scale_w,
+        float w_scale,
+        llama_expert_gating_func_type gating_op,
+        int il,
+        ggml_tensor * probs_in) const {
     const int64_t n_embd = cur->ne[0];
     const int64_t n_tokens = cur->ne[1];
     const bool weight_before_ffn = arch == LLM_ARCH_LLAMA4; // for llama4, we apply the sigmoid-ed weights before the FFN
@@ -800,6 +841,11 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
         logits = probs_in;
     }
 
+    if (gate_inp_b) {
+        logits = ggml_add(ctx0, logits, gate_inp_b);
+        cb(logits, "ffn_moe_logits_biased", il);
+    }
+
     ggml_tensor * probs = nullptr;
     switch (gating_op) {
         case LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX:
@@ -810,6 +856,10 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
             {
                 probs = ggml_sigmoid(ctx0, logits); // [n_expert, n_tokens]
             } break;
+        case LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX_WEIGHT:
+            {
+                probs = logits; // [n_expert, n_tokens]
+            } break;
         default:
             GGML_ABORT("fatal error");
     }
@@ -838,6 +888,13 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
             ggml_reshape_3d(ctx0, probs, 1, n_expert, n_tokens), selected_experts); // [1, n_expert_used, n_tokens]
     cb(weights, "ffn_moe_weights", il);
 
+    if (gating_op == LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX_WEIGHT) {
+        weights = ggml_reshape_2d(ctx0, weights, n_expert_used, n_tokens);
+        weights = ggml_soft_max(ctx0, weights); // [n_expert_used, n_tokens]
+        weights = ggml_reshape_3d(ctx0, weights, 1, n_expert_used, n_tokens);
+        cb(weights, "ffn_moe_weights_softmax", il);
+    }
+
     if (norm_w) {
         weights = ggml_reshape_2d(ctx0, weights, n_expert_used, n_tokens);
 
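Note on the new SOFTMAX_WEIGHT gating: SOFTMAX normalizes the router logits over all n_expert entries before top-k selection, while SOFTMAX_WEIGHT (probs = logits above) picks the top-k experts on the raw logits and only then applies a softmax over the k selected scores. A scalar sketch of the two orders in plain C (illustrative helper, not from the diff):

    #include <math.h>
    // softmax over k values, in place
    static void softmax_k(int k, float * v) {
        float m = v[0];
        for (int i = 1; i < k; ++i) if (v[i] > m) m = v[i];
        float sum = 0.0f;
        for (int i = 0; i < k; ++i) { v[i] = expf(v[i] - m); sum += v[i]; }
        for (int i = 0; i < k; ++i) v[i] /= sum;
    }
    // SOFTMAX       : softmax_k(n_expert, logits), then keep the top-k entries
    // SOFTMAX_WEIGHT: keep the top-k logits first, then softmax_k(k, selected)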
@@ -866,6 +923,11 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
     ggml_tensor * up = build_lora_mm_id(up_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
     cb(up, "ffn_moe_up", il);
 
+    if (up_exps_b) {
+        up = ggml_add_id(ctx0, up, up_exps_b, selected_experts);
+        cb(up, "ffn_moe_up_biased", il);
+    }
+
     ggml_tensor * experts = nullptr;
     if (gate_exps) {
         cur = build_lora_mm_id(gate_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
@@ -874,6 +936,11 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
         cur = up;
     }
 
+    if (gate_exps_b) {
+        cur = ggml_add_id(ctx0, cur, gate_exps_b, selected_experts);
+        cb(cur, "ffn_moe_gate_biased", il);
+    }
+
     switch (type_op) {
         case LLM_FFN_SILU:
             if (gate_exps) {
@@ -891,6 +958,14 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
                 cur = ggml_gelu(ctx0, cur);
                 cb(cur, "ffn_moe_gelu", il);
             } break;
+        case LLM_FFN_SWIGLU_OAI_MOE:
+            {
+                // TODO: move to hparams?
+                constexpr float alpha = 1.702f;
+                constexpr float limit = 7.0f;
+                cur = ggml_swiglu_oai(ctx0, cur, up, alpha, limit);
+                cb(cur, "ffn_moe_swiglu_oai", il);
+            } break;
         case LLM_FFN_RELU:
             if (gate_exps) {
                 cur = ggml_reglu_split(ctx0, cur, up);
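Note: in the LLM_FFN_SWIGLU_OAI_MOE case, alpha = 1.702 is the usual constant that makes x * sigmoid(1.702 * x) approximate GELU, and limit = 7.0 clamps the activations. A scalar sketch of what ggml_swiglu_oai is commonly described as computing for gpt-oss; this reading is an assumption here, the ggml implementation is authoritative:

    #include <math.h>
    // gate clamped from above at limit, the linear branch to [-limit, limit],
    // then out = gate * sigmoid(alpha * gate) * (up + 1)
    static float swiglu_oai_ref(float gate, float up, float alpha, float limit) {
        gate = gate < limit ? gate : limit;
        up   = up < -limit ? -limit : (up > limit ? limit : up);
        return gate * (1.0f / (1.0f + expf(-alpha * gate))) * (up + 1.0f);
    }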
@@ -906,6 +981,11 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
     experts = build_lora_mm_id(down_exps, cur, selected_experts); // [n_embd, n_expert_used, n_tokens]
     cb(experts, "ffn_moe_down", il);
 
+    if (down_exps_b) {
+        experts = ggml_add_id(ctx0, experts, down_exps_b, selected_experts);
+        cb(experts, "ffn_moe_down_biased", il);
+    }
+
     if (!weight_before_ffn) {
         experts = ggml_mul(ctx0, experts, weights);
         cb(cur, "ffn_moe_weighted", il);
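Note: the three bias hooks (up_exps_b, gate_exps_b, down_exps_b) all go through ggml_add_id which, as used here, appears to add the bias row of whichever expert was routed to each slot. A plain-array sketch of that reading (an assumption inferred from the call sites, not taken from the ggml source):

    // dst: [n_tokens][n_expert_used][n] activations, bias: [n_expert][n] rows,
    // selected: [n_tokens][n_expert_used] expert indices
    static void add_id_ref(int n, int n_expert_used, int n_tokens,
                           float * dst, const float * bias, const int * selected) {
        for (int t = 0; t < n_tokens; ++t)
            for (int e = 0; e < n_expert_used; ++e)
                for (int i = 0; i < n; ++i)
                    dst[(t*n_expert_used + e)*n + i] += bias[selected[t*n_expert_used + e]*n + i];
    }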
@@ -1144,6 +1224,7 @@ ggml_tensor * llm_graph_context::build_attn_mha(
         ggml_tensor * kq_b,
         ggml_tensor * kq_mask,
         ggml_tensor * v_mla,
+        ggml_tensor * sinks,
         float kq_scale) const {
     const bool v_trans = v->nb[1] > v->nb[2];
 
@@ -1180,7 +1261,8 @@ ggml_tensor * llm_graph_context::build_attn_mha(
         cur = ggml_flash_attn_ext(ctx0, q, k, v, kq_mask, kq_scale, hparams.f_max_alibi_bias,
                 hparams.attn_soft_cap ? hparams.f_attn_logit_softcapping : 0.0f);
 
-
+        ggml_flash_attn_ext_add_sinks(cur, sinks);
+        ggml_flash_attn_ext_set_prec (cur, GGML_PREC_F32);
 
         if (v_mla) {
 #if 0
@@ -1228,6 +1310,7 @@ ggml_tensor * llm_graph_context::build_attn_mha(
         }
 
         kq = ggml_soft_max_ext(ctx0, kq, kq_mask, kq_scale, hparams.f_max_alibi_bias);
+        ggml_soft_max_add_sinks(kq, sinks);
 
         if (!v_trans) {
             // note: avoid this branch
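Note: the sinks tensor threaded into both attention paths holds one learned value per head that joins the softmax as an extra logit, so the weights over the real keys can sum to less than one (the gpt-oss "attention sink"; this reading is an assumption, the ggml ops are the source of truth). A scalar sketch:

    #include <math.h>
    // softmax over n key logits plus one sink logit; w receives the key weights only
    static void softmax_with_sink(int n, const float * logits, float sink, float * w) {
        float m = sink;
        for (int i = 0; i < n; ++i) if (logits[i] > m) m = logits[i];
        float denom = expf(sink - m);
        for (int i = 0; i < n; ++i) denom += expf(logits[i] - m);
        for (int i = 0; i < n; ++i) w[i] = expf(logits[i] - m) / denom;
    }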
@@ -1298,7 +1381,7 @@ ggml_tensor * llm_graph_context::build_attn(
     ggml_tensor * k = k_cur;
     ggml_tensor * v = v_cur;
 
-    ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, v_mla, kq_scale);
+    ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, v_mla, nullptr, kq_scale);
     cb(cur, "kqv_out", il);
 
     if (wo) {
@@ -1386,13 +1469,13 @@ ggml_tensor * llm_graph_context::build_attn(
     ggml_tensor * k = mctx_cur->get_k(ctx0, il);
     ggml_tensor * v = mctx_cur->get_v(ctx0, il);
 
-    ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, v_mla, kq_scale);
+    ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, v_mla, nullptr, kq_scale);
     cb(cur, "kqv_out", il);
 
     if (wo) {
         cur = build_lora_mm(wo, cur);
-        if (arch == LLM_ARCH_GLM4) {
-            // GLM4
+        if (arch == LLM_ARCH_GLM4 || arch == LLM_ARCH_GLM4_MOE) {
+            // GLM4 and GLM4_MOE seem to have numerical issues with half-precision accumulators
             ggml_mul_mat_set_prec(cur, GGML_PREC_F32);
         }
     }
@@ -1415,6 +1498,32 @@ ggml_tensor * llm_graph_context::build_attn(
         ggml_tensor * v_mla,
         float kq_scale,
         int il) const {
+    return build_attn_with_sinks(
+        inp,
+        wo,
+        wo_b,
+        q_cur,
+        k_cur,
+        v_cur,
+        kq_b,
+        v_mla,
+        nullptr,
+        kq_scale,
+        il);
+}
+
+ggml_tensor * llm_graph_context::build_attn_with_sinks(
+        llm_graph_input_attn_kv_unified_iswa * inp,
+        ggml_tensor * wo,
+        ggml_tensor * wo_b,
+        ggml_tensor * q_cur,
+        ggml_tensor * k_cur,
+        ggml_tensor * v_cur,
+        ggml_tensor * kq_b,
+        ggml_tensor * v_mla,
+        ggml_tensor * sinks,
+        float kq_scale,
+        int il) const {
     // these nodes are added to the graph together so that they are not reordered
     // by doing so, the number of splits in the graph is reduced
     ggml_build_forward_expand(gf, q_cur);
@@ -1452,7 +1561,7 @@ ggml_tensor * llm_graph_context::build_attn(
     ggml_tensor * k = mctx_cur->get_k(ctx0, il);
     ggml_tensor * v = mctx_cur->get_v(ctx0, il);
 
-    ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, v_mla, kq_scale);
+    ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, v_mla, sinks, kq_scale);
     cb(cur, "kqv_out", il);
 
     if (wo) {
@@ -1506,7 +1615,7 @@ ggml_tensor * llm_graph_context::build_attn(
     ggml_tensor * k = k_cur;
     ggml_tensor * v = v_cur;
 
-    ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, v_mla, kq_scale);
+    ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, v_mla, nullptr, kq_scale);
     cb(cur, "kqv_out", il);
 
     if (wo) {
@@ -39,6 +39,7 @@ enum llm_ffn_op_type {
     LLM_FFN_SWIGLU,
     LLM_FFN_GEGLU,
     LLM_FFN_REGLU,
+    LLM_FFN_SWIGLU_OAI_MOE,
 };
 
 enum llm_ffn_gate_type {
@@ -619,6 +620,7 @@ struct llm_graph_context {
             llm_ffn_gate_type type_gate,
             int il) const;
 
+    // build MoE FFN without bias tensors
     ggml_tensor * build_moe_ffn(
             ggml_tensor * cur,
             ggml_tensor * gate_inp,
@@ -636,6 +638,27 @@ struct llm_graph_context {
             int il,
             ggml_tensor * probs_in = nullptr) const;
 
+    ggml_tensor * build_moe_ffn(
+            ggml_tensor * cur,
+            ggml_tensor * gate_inp,
+            ggml_tensor * gate_inp_b,
+            ggml_tensor * up_exps,
+            ggml_tensor * up_exps_b,
+            ggml_tensor * gate_exps,
+            ggml_tensor * gate_exps_b,
+            ggml_tensor * down_exps,
+            ggml_tensor * down_exps_b,
+            ggml_tensor * exp_probs_b,
+            int64_t n_expert,
+            int64_t n_expert_used,
+            llm_ffn_op_type type_op,
+            bool norm_w,
+            bool scale_w,
+            float w_scale,
+            llama_expert_gating_func_type gating_op,
+            int il,
+            ggml_tensor * probs_in = nullptr) const;
+
     //
     // inputs
     //
@@ -662,6 +685,7 @@ struct llm_graph_context {
             ggml_tensor * v, // [n_embd_head_v, n_head_v, n_tokens] (v_trans == false)
             ggml_tensor * kq_b,
             ggml_tensor * kq_mask,
+            ggml_tensor * sinks,
             ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
             float kq_scale) const;
 
@@ -708,6 +732,20 @@ struct llm_graph_context {
             float kq_scale,
             int il) const;
 
+    // TODO: temporary to keep the diff small. after the code is public will refactor to simplify this
+    ggml_tensor * build_attn_with_sinks(
+            llm_graph_input_attn_kv_unified_iswa * inp,
+            ggml_tensor * wo,
+            ggml_tensor * wo_b,
+            ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens]
+            ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens] optional
+            ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens] optional
+            ggml_tensor * kq_b,
+            ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
+            ggml_tensor * sinks, // [n_head_q]
+            float kq_scale,
+            int il) const;
+
     llm_graph_input_attn_cross * build_attn_inp_cross() const;
 
     ggml_tensor * build_attn(
@@ -9,9 +9,10 @@
 #define LLAMA_MAX_EXPERTS 384 // Kimi-K2
 
 enum llama_expert_gating_func_type {
-    LLAMA_EXPERT_GATING_FUNC_TYPE_NONE
-    LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX
-    LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID
+    LLAMA_EXPERT_GATING_FUNC_TYPE_NONE = 0,
+    LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX = 1,
+    LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID = 2,
+    LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX_WEIGHT = 3, // applied to the router weights instead of the logits
 };
 
 enum llama_swa_type {
@@ -73,6 +74,7 @@ struct llama_hparams {
     bool expert_weights_norm = false;
     uint32_t expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_NONE;
     uint32_t moe_every_n_layers = 0;
+    uint32_t nextn_predict_layers = 0;
 
     float f_norm_eps;
     float f_norm_rms_eps;
@@ -39,6 +39,10 @@ llama_kv_cache_unified::llama_kv_cache_unified(
     if (model.arch == LLM_ARCH_GEMMA3N) {
         n_layer_cache = 20;
     }
+    if (model.arch == LLM_ARCH_GLM4_MOE) {
+        // GLM-4.5: Only process up to last layer, skip final NextN layer
+        n_layer_cache = hparams.n_layer - hparams.nextn_predict_layers;
+    }
 
     // create a context for each buffer type
     std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
@@ -35,6 +35,7 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
         case LLAMA_FTYPE_MOSTLY_Q5_0: return "Q5_0";
         case LLAMA_FTYPE_MOSTLY_Q5_1: return "Q5_1";
        case LLAMA_FTYPE_MOSTLY_Q8_0: return "Q8_0";
+        case LLAMA_FTYPE_MOSTLY_MXFP4_MOE: return "MXFP4 MoE";
         case LLAMA_FTYPE_MOSTLY_Q2_K: return "Q2_K - Medium";
         case LLAMA_FTYPE_MOSTLY_Q2_K_S: return "Q2_K - Small";
         case LLAMA_FTYPE_MOSTLY_Q3_K_S: return "Q3_K - Small";
@@ -58,8 +58,9 @@ struct llama_model_loader {
         }
     };
 
-    static const int TENSOR_NOT_REQUIRED = 1;
-    static const int TENSOR_DUPLICATED =
+    static const int TENSOR_NOT_REQUIRED = 1 << 0;
+    static const int TENSOR_DUPLICATED   = 1 << 1;
+    static const int TENSOR_SKIP         = 1 << 2;
 
     int n_kv = 0;
     int n_tensors = 0;