@fugood/llama.node 1.2.3 → 1.2.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +14 -14
- package/scripts/llama.cpp.patch +33 -11
- package/src/llama.cpp/CMakeLists.txt +1 -0
- package/src/llama.cpp/common/CMakeLists.txt +46 -2
- package/src/llama.cpp/common/arg.cpp +484 -204
- package/src/llama.cpp/common/arg.h +0 -1
- package/src/llama.cpp/common/chat-parser.cpp +156 -15
- package/src/llama.cpp/common/chat-parser.h +3 -0
- package/src/llama.cpp/common/chat.cpp +217 -6
- package/src/llama.cpp/common/chat.h +5 -3
- package/src/llama.cpp/common/common.cpp +22 -6
- package/src/llama.cpp/common/common.h +6 -4
- package/src/llama.cpp/common/http.h +73 -0
- package/src/llama.cpp/common/json-partial.cpp +51 -0
- package/src/llama.cpp/ggml/CMakeLists.txt +7 -6
- package/src/llama.cpp/ggml/include/ggml-backend.h +2 -0
- package/src/llama.cpp/ggml/include/ggml-rpc.h +8 -9
- package/src/llama.cpp/ggml/include/ggml.h +22 -0
- package/src/llama.cpp/ggml/src/CMakeLists.txt +3 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +12 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +1 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +12 -12
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +100 -3
- package/src/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +0 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +10 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +209 -96
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +32 -44
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +107 -83
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +17 -17
- package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +8 -8
- package/src/llama.cpp/ggml/src/ggml-cpu/spacemit/ime.cpp +1024 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/spacemit/ime.h +13 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/spacemit/ime1_kernels.cpp +3196 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/spacemit/ime_kernels.h +26 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.cpp +103 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.h +1 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +66 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +11 -9
- package/src/llama.cpp/include/llama.h +8 -0
- package/src/llama.cpp/src/llama-arch.cpp +93 -0
- package/src/llama.cpp/src/llama-arch.h +22 -0
- package/src/llama.cpp/src/llama-chat.cpp +1 -1
- package/src/llama.cpp/src/llama-context.cpp +6 -0
- package/src/llama.cpp/src/llama-graph.cpp +57 -22
- package/src/llama.cpp/src/llama-graph.h +10 -1
- package/src/llama.cpp/src/llama-hparams.cpp +5 -1
- package/src/llama.cpp/src/llama-hparams.h +17 -2
- package/src/llama.cpp/src/llama-kv-cache-iswa.cpp +2 -2
- package/src/llama.cpp/src/llama-kv-cache.cpp +2 -5
- package/src/llama.cpp/src/llama-memory-hybrid.cpp +11 -9
- package/src/llama.cpp/src/llama-memory-recurrent.cpp +11 -3
- package/src/llama.cpp/src/llama-model-loader.cpp +2 -0
- package/src/llama.cpp/src/llama-model.cpp +572 -45
- package/src/llama.cpp/src/llama-model.h +18 -0
- package/src/llama.cpp/src/llama-sampling.cpp +5 -0
- package/src/llama.cpp/src/llama-vocab.cpp +7 -1
- package/src/llama.cpp/src/llama-vocab.h +41 -40
- package/src/llama.cpp/src/unicode.h +43 -0
--- package/src/llama.cpp/src/llama-model.cpp
+++ package/src/llama.cpp/src/llama-model.cpp
@@ -114,6 +114,7 @@ const char * llm_type_name(llm_type type) {
         case LLM_TYPE_17B_16E: return "17Bx16E (Scout)";
         case LLM_TYPE_17B_128E: return "17Bx128E (Maverick)";
         case LLM_TYPE_A13B: return "A13B";
+        case LLM_TYPE_8B_A1B: return "8B.A1B";
         case LLM_TYPE_21B_A3B: return "21B.A3B";
         case LLM_TYPE_30B_A3B: return "30B.A3B";
         case LLM_TYPE_106B_A12B: return "106B.A12B";
@@ -310,7 +311,7 @@ static ggml_backend_buffer_type_t select_weight_buft(const llama_hparams & hpara
 }
 
 // CPU: ACCEL -> GPU host -> CPU extra -> CPU
-static buft_list_t make_cpu_buft_list(const std::vector<ggml_backend_dev_t> & devices, bool use_extra_bufts) {
+static buft_list_t make_cpu_buft_list(const std::vector<ggml_backend_dev_t> & devices, bool use_extra_bufts, bool no_host) {
     buft_list_t buft_list;
 
     // add ACCEL buffer types
@@ -331,11 +332,13 @@ static buft_list_t make_cpu_buft_list(const std::vector<ggml_backend_dev_t> & de
     // generally, this will be done using the first device in the list
     // a better approach would be to handle this on a weight-by-weight basis using the offload_op
     // function of the device to determine if it would benefit from being stored in a host buffer
-    for (auto * dev : devices) {
-        ggml_backend_buffer_type_t buft = ggml_backend_dev_host_buffer_type(dev);
-        if (buft) {
-            buft_list.emplace_back(dev, buft);
-            break;
+    if (!no_host) {
+        for (auto * dev : devices) {
+            ggml_backend_buffer_type_t buft = ggml_backend_dev_host_buffer_type(dev);
+            if (buft) {
+                buft_list.emplace_back(dev, buft);
+                break;
+            }
         }
     }
 
@@ -512,9 +515,13 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                                        llm_arch_is_recurrent(ml.get_arch()));
 
    std::fill(hparams.rope_sections.begin(), hparams.rope_sections.end(), 0);
-
    std::fill(hparams.swa_layers.begin(), hparams.swa_layers.end(), 0);
 
+    std::fill(hparams.xielu_alpha_n.begin(), hparams.xielu_alpha_n.end(), 0.0f);
+    std::fill(hparams.xielu_alpha_p.begin(), hparams.xielu_alpha_p.end(), 0.0f);
+    std::fill(hparams.xielu_beta.begin(), hparams.xielu_beta.end(), 0.0f);
+    std::fill(hparams.xielu_eps.begin(), hparams.xielu_eps.end(), 0.0f);
+
    ml.get_key_or_arr(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff_arr, hparams.n_layer, false);
    ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, hparams.n_layer, false);
 
@@ -675,10 +682,17 @@ void llama_model::load_hparams(llama_model_loader & ml) {
            } break;
        case LLM_ARCH_MINICPM:
            {
+                // Backward-compatible defaults for older MiniCPM GGUFs
+                hparams.f_embedding_scale = 12.0f;
+                hparams.f_residual_scale = 1.4f / sqrtf(float(hparams.n_layer));
+                hparams.f_logit_scale = hparams.n_embd ? (256.0f / float(hparams.n_embd)) : 1.0f;
+
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
-                ml.get_key(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale);
-                ml.get_key(LLM_KV_RESIDUAL_SCALE, hparams.f_residual_scale);
-                ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
+
+                // Optional KV reads, override defaults if present in newer GGUF exports
+                ml.get_key(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale, /*required=*/false);
+                ml.get_key(LLM_KV_RESIDUAL_SCALE, hparams.f_residual_scale, /*required=*/false);
+                ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale, /*required=*/false);
 
                // MiniCPM uses rope by default, unlike Granite which uses it as a switch
                hparams.rope_finetuned = true;
@@ -1077,7 +1091,11 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                    }
                    break;
                default: type = LLM_TYPE_UNKNOWN;
-            }
+                }
+
+                // Load attention parameters
+                ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH, hparams.n_embd_head_k, false);
+                ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH, hparams.n_embd_head_v, false);
            } break;
        case LLM_ARCH_GPT2:
            {
@@ -1200,12 +1218,21 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                hparams.set_swa_pattern(6);
 
                hparams.causal_attn = false; // embeddings do not use causal attention
-                hparams.rope_freq_base_train_swa  = 10000.0f;
+                hparams.rope_freq_base_train_swa = 10000.0f;
                hparams.rope_freq_scale_train_swa = 1.0f;
 
-                ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW,    hparams.n_swa);
+                ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
-                ml.get_key(LLM_KV_POOLING_TYPE,                hparams.pooling_type);
+                ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
+
+                //applied only if model converted with --sentence-transformers-dense-modules
+                ml.get_key(LLM_KV_DENSE_2_FEAT_IN, hparams.dense_2_feat_in, false);
+                ml.get_key(LLM_KV_DENSE_2_FEAT_OUT, hparams.dense_2_feat_out, false);
+                ml.get_key(LLM_KV_DENSE_3_FEAT_IN, hparams.dense_3_feat_in, false);
+                ml.get_key(LLM_KV_DENSE_3_FEAT_OUT, hparams.dense_3_feat_out, false);
+
+                GGML_ASSERT((hparams.dense_2_feat_in == 0 || hparams.dense_2_feat_in == hparams.n_embd) && "dense_2_feat_in must be equal to n_embd");
+                GGML_ASSERT((hparams.dense_3_feat_out == 0 || hparams.dense_3_feat_out == hparams.n_embd) && "dense_3_feat_out must be equal to n_embd");
 
                switch (hparams.n_layer) {
                    case 24: type = LLM_TYPE_0_3B; break;
@@ -1978,13 +2005,28 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                for (uint32_t il = 0; il < hparams.n_layer; ++il) {
                    hparams.recurrent_layer_arr[il] = hparams.n_head_kv(il) == 0;
                }
+                hparams.n_layer_dense_lead = hparams.n_layer;
                switch (hparams.n_ff()) {
                    case 4608: type = LLM_TYPE_350M; break;
                    case 6912: type = LLM_TYPE_700M; break;
                    case 8192: type = LLM_TYPE_1_2B; break;
                    case 10752: type = LLM_TYPE_2_6B; break;
-                    default: type = LLM_TYPE_UNKNOWN;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+            } break;
+        case LLM_ARCH_LFM2MOE:
+            {
+                ml.get_key(LLM_KV_SHORTCONV_L_CACHE, hparams.n_shortconv_l_cache);
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+                ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
+                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
+                ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func);
+
+                for (uint32_t il = 0; il < hparams.n_layer; ++il) {
+                    hparams.recurrent_layer_arr[il] = hparams.n_head_kv(il) == 0;
                }
+
+                type = LLM_TYPE_8B_A1B;
            } break;
        case LLM_ARCH_SMALLTHINKER:
            {
@@ -2009,6 +2051,32 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                    default: type = LLM_TYPE_UNKNOWN;
                }
            } break;
+        case LLM_ARCH_GROVEMOE:
+            {
+                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
+                ml.get_key(LLM_KV_EXPERT_CHUNK_FEED_FORWARD_LENGTH, hparams.n_ff_chexp);
+                ml.get_key(LLM_KV_EXPERT_GROUP_SCALE, hparams.expert_group_scale);
+                ml.get_key(LLM_KV_EXPERTS_PER_GROUP, hparams.n_group_experts);
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+                switch (hparams.n_layer) {
+                    case 48: type = LLM_TYPE_30B_A3B; break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+            } break;
+        case LLM_ARCH_APERTUS:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+                ml.get_key_or_arr(LLM_KV_XIELU_ALPHA_N, hparams.xielu_alpha_n, hparams.n_layer);
+                ml.get_key_or_arr(LLM_KV_XIELU_ALPHA_P, hparams.xielu_alpha_p, hparams.n_layer);
+                ml.get_key_or_arr(LLM_KV_XIELU_BETA, hparams.xielu_beta, hparams.n_layer);
+                ml.get_key_or_arr(LLM_KV_XIELU_EPS, hparams.xielu_eps, hparams.n_layer);
+
+                switch (hparams.n_layer) {
+                    case 32: type = LLM_TYPE_8B; break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+            } break;
        default: throw std::runtime_error("unsupported model architecture");
    }
 
@@ -2042,7 +2110,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
    LLAMA_LOG_INFO("%s: loading model tensors, this can take a while... (mmap = %s)\n", __func__, ml.use_mmap ? "true" : "false");
 
    // build a list of buffer types for the CPU and GPU devices
-    pimpl->cpu_buft_list = make_cpu_buft_list(devices, params.use_extra_bufts);
+    pimpl->cpu_buft_list = make_cpu_buft_list(devices, params.use_extra_bufts, params.no_host);
    for (auto * dev : devices) {
        buft_list_t buft_list = make_gpu_buft_list(dev, split_mode, tensor_split);
        // add CPU buffer types as a fallback
@@ -3167,6 +3235,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
                    }
 
+                    // output rerank head
+                    cls_out = create_tensor(tn(LLM_TENSOR_CLS_OUT, "weight"), {n_embd, hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
+
                    for (int i = 0; i < n_layer; ++i) {
                        auto & layer = layers[i];
 
@@ -3369,17 +3440,17 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                } break;
            case LLM_ARCH_PLAMO2:
                {
+                    // mamba parameters
                    const uint32_t d_conv = hparams.ssm_d_conv;
                    const uint32_t d_state = hparams.ssm_d_state;
                    const uint32_t num_heads = hparams.ssm_dt_rank;
                    const uint32_t intermediate_size = hparams.ssm_d_inner;
-                    const uint32_t head_dim = intermediate_size / num_heads;
-                    const uint32_t qk_dim = head_dim;
-                    const uint32_t v_dim = head_dim;
-                    const int64_t num_attention_heads = hparams.n_head();
-                    const int64_t q_num_heads = num_attention_heads;
                    const int64_t dt_dim = std::max(64, int(hparams.n_embd / 16));
 
+                    // attention parameters
+                    const uint32_t qk_dim = hparams.n_embd_head_k;
+                    const uint32_t v_dim = hparams.n_embd_head_v;
+
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
 
                    // output
@@ -3413,6 +3484,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                            layer.ssm_b_norm = create_tensor(tn(LLM_TENSOR_SSM_B_NORM, i), {d_state}, 0);
                            layer.ssm_c_norm = create_tensor(tn(LLM_TENSOR_SSM_C_NORM, i), {d_state}, 0);
                        } else {
+                            const int64_t num_attention_heads = hparams.n_head(i);
+                            const int64_t q_num_heads = num_attention_heads;
                            const int64_t num_key_value_heads = hparams.n_head_kv(i);
                            const int64_t k_num_heads = num_key_value_heads;
                            const int64_t v_num_heads = num_key_value_heads;
@@ -3421,8 +3494,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                            const int64_t v_proj_dim = v_num_heads * v_dim;
 
                            layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, q_proj_dim + k_proj_dim + v_proj_dim}, 0);
-                            layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {
-                            layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {
+                            layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {qk_dim, num_attention_heads}, 0);
+                            layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {qk_dim, k_num_heads}, 0);
                            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {q_num_heads * v_dim, n_embd}, 0);
                        }
 
@@ -3622,6 +3695,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
                    }
 
+                    // Dense linear weights
+                    dense_2_out_layers = create_tensor(tn(LLM_TENSOR_DENSE_2_OUT, "weight"), {n_embd, hparams.dense_2_feat_out}, TENSOR_NOT_REQUIRED);
+                    dense_3_out_layers = create_tensor(tn(LLM_TENSOR_DENSE_3_OUT, "weight"), {hparams.dense_3_feat_in, n_embd}, TENSOR_NOT_REQUIRED);
+
+
                    for (int i = 0; i < n_layer; ++i) {
                        auto & layer = layers[i];
 
@@ -4802,11 +4880,13 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                        // NextN/MTP tensors (preserved but unused) - conditionally load for last nextn_predict_layers
                        if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
                            layer.nextn.eh_proj = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i), { 2 * n_embd, n_embd }, flags);
-                            layer.nextn.embed_tokens = create_tensor(tn(LLM_TENSOR_NEXTN_EMBED_TOKENS, "weight", i), { n_embd, n_vocab }, flags);
                            layer.nextn.enorm = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM, "weight", i), { n_embd }, flags);
                            layer.nextn.hnorm = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM, "weight", i), { n_embd }, flags);
-                            layer.nextn.shared_head_head = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "weight", i), { n_embd, n_vocab }, flags);
-                            layer.nextn.shared_head_norm = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "weight", i), { n_embd }, flags);
+
+                            // Optional tensors
+                            layer.nextn.embed_tokens = create_tensor(tn(LLM_TENSOR_NEXTN_EMBED_TOKENS, "weight", i), { n_embd, n_vocab }, flags | TENSOR_NOT_REQUIRED);
+                            layer.nextn.shared_head_head = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "weight", i), { n_embd, n_vocab }, flags | TENSOR_NOT_REQUIRED);
+                            layer.nextn.shared_head_norm = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "weight", i), { n_embd }, flags | TENSOR_NOT_REQUIRED);
                        }
                    }
                }
@@ -5764,6 +5844,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                    }
                } break;
            case LLM_ARCH_LFM2:
+            case LLM_ARCH_LFM2MOE:
                {
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
                    tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);
@@ -5775,11 +5856,23 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
 
                    for (int i = 0; i < n_layer; ++i) {
                        auto & layer = layers[i];
-
+
+                        const bool is_moe_layer = i >= static_cast<int>(hparams.n_layer_dense_lead);
+
+                        // ffn/moe is same for transformer and conv layers
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
-                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
-                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
-                        layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+                        if (is_moe_layer) {
+                            GGML_ASSERT(n_expert && n_expert_used);
+                            layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
+                            layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, hparams.n_ff_exp, n_expert}, 0);
+                            layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {hparams.n_ff_exp, n_embd, n_expert}, 0);
+                            layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, hparams.n_ff_exp, n_expert}, 0);
+                            layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, 0);
+                        } else { // dense
+                            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+                            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+                            layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+                        }
 
                        // for operator_norm
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
@@ -5837,6 +5930,95 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                        layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, 0);
                    }
                } break;
+            case LLM_ARCH_GROVEMOE:
+                {
+                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+                    // output
+                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+                    // if output is NULL, init from the input tok embed
+                    if (output == NULL) {
+                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+                    }
+
+                    GGML_ASSERT(n_expert > 0 && "n_expert must be > 0 for GROVEMOE");
+                    GGML_ASSERT(n_expert_used > 0 && "n_expert_used must be > 0 for GROVEMOE");
+                    GGML_ASSERT(hparams.n_group_experts > 0 && "n_group_experts must be > 0 for GROVEMOE");
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        auto & layer = layers[i];
+
+                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
+                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
+                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
+                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
+
+                        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
+                        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
+
+                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+                        layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
+
+                        // MoE branch
+                        const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
+                        const int64_t n_ff_chexp = hparams.n_ff_chexp ? hparams.n_ff_chexp : n_embd_head_k;
+                        const int64_t n_chunk_expert = n_expert / hparams.n_group_experts;
+
+                        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
+                        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
+                        layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
+
+                        layer.ffn_gate_chexps = create_tensor(tn(LLM_TENSOR_FFN_GATE_CHEXPS, "weight", i), { n_embd, n_ff_chexp, n_chunk_expert}, 0);
+                        layer.ffn_down_chexps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_CHEXPS, "weight", i), {n_ff_chexp, n_embd, n_chunk_expert}, 0);
+                        layer.ffn_up_chexps = create_tensor(tn(LLM_TENSOR_FFN_UP_CHEXPS, "weight", i), { n_embd, n_ff_chexp, n_chunk_expert}, 0);
+                    }
+                } break;
+            case LLM_ARCH_APERTUS:
+                {
+                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
+
+                    // output
+                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
+                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, 0);
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        auto & layer = layers[i];
+
+                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
+
+                        if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) {
+                            layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), { n_rot/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+                            layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), { n_rot/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+                        } else {
+                            layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), { n_rot/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+                        }
+
+                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head }, 0);
+                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_gqa }, 0);
+                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_gqa }, 0);
+                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0);
+
+                        // optional bias tensors
+                        layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), { n_embd }, TENSOR_NOT_REQUIRED);
+                        layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), { n_embd_gqa }, TENSOR_NOT_REQUIRED);
+                        layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), { n_embd_gqa }, TENSOR_NOT_REQUIRED);
+                        layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), { n_embd }, TENSOR_NOT_REQUIRED);
+
+                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd }, 0);
+                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, 0);
+                        layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, n_ff }, 0);
+
+                        // Q and K layernorms for Apertus
+                        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), { n_embd_head_k }, 0);
+                        layer.attn_q_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), { n_embd_head_k }, TENSOR_NOT_REQUIRED);
+                        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), { n_embd_head_k }, 0);
+                        layer.attn_k_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), { n_embd_head_k }, TENSOR_NOT_REQUIRED);
+                    }
+                } break;
            default:
                throw std::runtime_error("unknown architecture");
        }
@@ -6171,11 +6353,18 @@ void llama_model::print_info() const {
        LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm);
    }
 
-    if (arch == LLM_ARCH_SMALLTHINKER) {
+    if (arch == LLM_ARCH_SMALLTHINKER || arch == LLM_ARCH_LFM2MOE) {
        LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
        LLAMA_LOG_INFO("%s: expert_gating_func = %s\n", __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func));
    }
 
+    if (arch == LLM_ARCH_GROVEMOE) {
+        LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
+        LLAMA_LOG_INFO("%s: n_ff_chexp = %d\n", __func__, hparams.n_ff_chexp);
+        LLAMA_LOG_INFO("%s: n_group_experts = %d\n", __func__, hparams.n_group_experts);
+        LLAMA_LOG_INFO("%s: expert_group_scale = %.2f\n", __func__, hparams.expert_group_scale);
+    }
+
    vocab.print_info();
 }
 
@@ -7699,6 +7888,8 @@ struct llm_build_bert : public llm_graph_context {
                }
 
                if (model.layers[il].attn_q_norm) {
+                    Qcur = ggml_reshape_2d(ctx0, Qcur, n_embd_head*n_head, n_tokens);
+
                    Qcur = build_norm(Qcur,
                            model.layers[il].attn_q_norm,
                            model.layers[il].attn_q_norm_b,
@@ -7708,6 +7899,8 @@ struct llm_build_bert : public llm_graph_context {
                }
 
                if (model.layers[il].attn_k_norm) {
+                    Kcur = ggml_reshape_2d(ctx0, Kcur, n_embd_head*n_head_kv, n_tokens);
+
                    Kcur = build_norm(Kcur,
                            model.layers[il].attn_k_norm,
                            model.layers[il].attn_k_norm_b,
@@ -8090,6 +8283,9 @@ struct llm_build_mpt : public llm_graph_context {
 
                // Q/K Layernorm
                if (model.layers[il].attn_q_norm) {
+                    Qcur = ggml_reshape_2d(ctx0, Qcur, n_embd_head*n_head, n_tokens);
+                    Kcur = ggml_reshape_2d(ctx0, Kcur, n_embd_head*n_head_kv, n_tokens);
+
                    Qcur = build_norm(Qcur,
                            model.layers[il].attn_q_norm,
                            model.layers[il].attn_q_norm_b,
@@ -11674,6 +11870,7 @@ struct llm_graph_context_mamba : public llm_graph_context {
        // TODO: skip computing output earlier for unused tokens
 
        y = ggml_add(ctx0, y, ggml_mul(ctx0, x, model.layers[il].ssm_d));
+        cb(y, "mamba2_y_add_d", il);
        y = ggml_swiglu_split(ctx0, ggml_cont(ctx0, z), y);
 
        // grouped RMS norm
@@ -14628,6 +14825,7 @@ struct llm_build_nemotron_h : public llm_graph_context_mamba {
        ggml_tensor * inpL;
 
        inpL = build_inp_embd(model.tok_embd);
+        ggml_build_forward_expand(gf, inpL);
 
        auto * inp = build_inp_mem_hybrid();
 
@@ -14659,7 +14857,7 @@ struct llm_build_nemotron_h : public llm_graph_context_mamba {
 
            // add residual
            cur = ggml_add(ctx0, cur, inpSA);
-            cb(cur, "
+            cb(cur, "nemotron_h_block_out", il);
 
            // input for next layer
            inpL = cur;
@@ -16115,10 +16313,10 @@ struct llm_build_granite_hybrid : public llm_graph_context_mamba {
    }
 
    ggml_tensor * build_layer_ffn(
-              ggml_tensor * cur,
-              ggml_tensor * inpSA,
-        const llama_model & model,
-                  const int il) {
+        ggml_tensor * cur,
+        ggml_tensor * inpSA,
+        const llama_model & model,
+        const int il) {
 
        // For Granite architectures - scale residual
        if (hparams.f_residual_scale) {
@@ -17530,6 +17728,7 @@ private:
        const int64_t n_embd_head_q = hparams.n_embd_head_k;
        const int64_t n_embd_head_k = hparams.n_embd_head_k;
        const int64_t n_embd_head_v = hparams.n_embd_head_v;
+        int32_t n_head = hparams.n_head(il);
        int32_t n_head_kv = hparams.n_head_kv(il);
 
        const int64_t q_offset = 0;
@@ -18446,6 +18645,8 @@ struct llm_build_lfm2 : public llm_graph_context {
        ggml_tensor * inp_out_ids = build_inp_out_ids();
 
        for (int il = 0; il < n_layer; ++il) {
+            const bool is_moe_layer = il >= static_cast<int>(hparams.n_layer_dense_lead);
+
            auto * prev_cur = cur;
            cur = build_norm(cur, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
            cb(cur, "model.layers.{}.operator_norm", il);
@@ -18460,7 +18661,16 @@ struct llm_build_lfm2 : public llm_graph_context {
            }
 
            cur = ggml_add(ctx0, prev_cur, cur);
-
+
+            auto * ffn_norm_out = build_norm(cur, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il);
+            cb(ffn_norm_out, "model.layers.{}.ffn_norm", il);
+
+            ggml_tensor * ffn_out = is_moe_layer ?
+                build_moe_feed_forward(ffn_norm_out, il) :
+                build_dense_feed_forward(ffn_norm_out, il);
+            cb(ffn_norm_out, "model.layers.{}.ffn_out", il);
+
+            cur = ggml_add(ctx0, cur, ffn_out);
        }
 
        cur = build_norm(cur, model.tok_norm, NULL, LLM_NORM_RMS, -1);
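The `is_moe_layer` test above (also used when loading the LFM2-MoE tensors earlier in this diff) splits layers by `n_layer_dense_lead`: leading layers keep the dense FFN, later layers take the expert branch. A standalone sketch of the resulting layout, with hypothetical layer counts:

```cpp
#include <cstdio>

// Sketch of the layer routing added above: layers before n_layer_dense_lead
// use the dense FFN, the rest use the MoE branch. Sizes here are made up.
int main() {
    const int n_layer = 16;
    const int n_layer_dense_lead = 2;
    for (int il = 0; il < n_layer; ++il) {
        const bool is_moe_layer = il >= n_layer_dense_lead;
        printf("layer %2d: %s\n", il, is_moe_layer ? "moe" : "dense");
    }
    return 0;
}
```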
@@ -18475,23 +18685,32 @@ struct llm_build_lfm2 : public llm_graph_context {
        ggml_build_forward_expand(gf, cur);
    }
 
-    ggml_tensor *
-
-
-
+    ggml_tensor * build_moe_feed_forward(ggml_tensor * cur,
+                                         int il) const {
+        return build_moe_ffn(cur,
+                model.layers[il].ffn_gate_inp,
+                model.layers[il].ffn_up_exps,
+                model.layers[il].ffn_gate_exps,
+                model.layers[il].ffn_down_exps,
+                model.layers[il].ffn_exp_probs_b,
+                n_expert, n_expert_used,
+                LLM_FFN_SILU, true,
+                false, 0.0,
+                static_cast<llama_expert_gating_func_type>(hparams.expert_gating_func),
+                il);
+    }
 
+    ggml_tensor * build_dense_feed_forward(ggml_tensor * cur,
+                                           int il) const {
        GGML_ASSERT(!model.layers[il].ffn_up_b);
        GGML_ASSERT(!model.layers[il].ffn_gate_b);
        GGML_ASSERT(!model.layers[il].ffn_down_b);
-        cur = build_ffn(cur,
+        return build_ffn(cur,
                model.layers[il].ffn_up, NULL, NULL,
                model.layers[il].ffn_gate, NULL, NULL,
                model.layers[il].ffn_down, NULL, NULL,
                NULL,
                LLM_FFN_SILU, LLM_FFN_PAR, il);
-        cb(cur, "model.layers.{}.feed_forward.w2", il);
-
-        return cur;
    }
 
    ggml_tensor * build_attn_block(ggml_tensor * cur,
@@ -18861,6 +19080,291 @@ struct llm_build_smallthinker : public llm_graph_context{
    }
 };
 
+struct llm_build_grovemoe : public llm_graph_context {
+    llm_build_grovemoe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        const int64_t n_chunk_expert = n_expert / hparams.n_group_experts;
+
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+        ggml_tensor * cur;
+        ggml_tensor * inpL;
+
+        inpL = build_inp_embd(model.tok_embd);
+
+        // inp_pos - contains the positions
+        ggml_tensor * inp_pos = build_inp_pos();
+
+        auto * inp_attn = build_attn_inp_kv();
+
+        ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+        for (int il = 0; il < n_layer; ++il) {
+            ggml_tensor * inpSA = inpL;
+
+            // norm
+            cur = build_norm(inpL,
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM_RMS, il);
+            cb(cur, "attn_norm", il);
+
+            // self_attention
+            {
+                // compute Q and K and RoPE them
+                ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
+
+                ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
+
+                ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
+
+                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+                Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+                Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
+                cb(Qcur, "Qcur_normed", il);
+
+                Qcur = ggml_rope_ext(
+                        ctx0, Qcur, inp_pos, nullptr,
+                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow
+                        );
+
+                Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
+                cb(Kcur, "Kcur_normed", il);
+
+                Kcur = ggml_rope_ext(
+                        ctx0, Kcur, inp_pos, nullptr,
+                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow
+                        );
+
+                cb(Qcur, "Qcur", il);
+                cb(Kcur, "Kcur", il);
+                cb(Vcur, "Vcur", il);
+
+                cur = build_attn(inp_attn,
+                        model.layers[il].wo, model.layers[il].bo,
+                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+            }
+
+            if (il == n_layer - 1 && inp_out_ids) {
+                cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            }
+
+            ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+            cb(ffn_inp, "ffn_inp", il);
+
+            // MoE branch
+            cur = build_norm(ffn_inp,
+                    model.layers[il].ffn_norm, NULL,
+                    LLM_NORM_RMS, il);
+            cb(cur, "ffn_norm", il);
+
+            ggml_tensor * probs = build_lora_mm(model.layers[il].ffn_gate_inp, cur); // [n_expert, n_tokens]
+            cb(probs, "ffn_moe_logits", il);
+
+            ggml_tensor * moe_out =
+                build_moe_ffn(cur,
+                        nullptr,
+                        model.layers[il].ffn_up_exps,
+                        model.layers[il].ffn_gate_exps,
+                        model.layers[il].ffn_down_exps,
+                        nullptr,
+                        n_expert, n_expert_used,
+                        LLM_FFN_SILU, true,
+                        false, 0.0,
+                        LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
+                        il, probs);
+            cb(moe_out, "ffn_moe_out", il);
+            cur = moe_out;
+
+            // TODO: Only do the expert selection and weights once
+            moe_out =
+                build_moe_ffn(cur,
+                        nullptr,
+                        model.layers[il].ffn_up_chexps,
+                        model.layers[il].ffn_gate_chexps,
+                        model.layers[il].ffn_down_chexps,
+                        nullptr,
+                        n_chunk_expert, n_expert_used > n_chunk_expert ? n_chunk_expert : n_expert_used,
+                        LLM_FFN_SILU, true,
+                        false, 0.0,
+                        LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
+                        il, probs);
+            cb(moe_out, "ffn_adj_moe_out", il);
+
+            cur = ggml_add(ctx0, cur, ggml_scale(ctx0, moe_out, hparams.expert_group_scale));
+            cb(cur, "ffn_final_moe_out", il);
+
+            cur = ggml_add(ctx0, cur, ffn_inp);
+
+            cur = build_cvec(cur, il);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = build_norm(cur,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, -1);
+
+        cb(cur, "result_norm", -1);
+        res->t_embd = cur;
+
+        // lm_head
+        cur = build_lora_mm(model.output, cur);
+
+        cb(cur, "result_output", -1);
+        res->t_logits = cur;
+
+        ggml_build_forward_expand(gf, cur);
+    }
+};
+
+struct llm_build_apertus : public llm_graph_context {
+    llm_build_apertus(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+        ggml_tensor * cur;
+        ggml_tensor * inpL;
+
+        inpL = build_inp_embd(model.tok_embd);
+
+        ggml_tensor * inp_pos = build_inp_pos();
+        auto * inp_attn = build_attn_inp_kv();
+
+        const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f / sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
+
+        ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+        for (int il = 0; il < n_layer; ++il) {
+            ggml_tensor * inpSA = inpL;
+
+            cur = build_norm(inpL,
+                    model.layers[il].attn_norm, nullptr,
+                    LLM_NORM_RMS, il);
+            cb(cur, "attn_norm", il);
+
+            // self-attention
+            {
+                ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
+
+                // compute Q and K and RoPE them
+                ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
+
+                ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
+
+                ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
+
+                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+                Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
+                cb(Qcur, "Qcur_normed", il);
+
+                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+                Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
+                cb(Kcur, "Kcur_normed", il);
+
+                Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+                Qcur = ggml_rope_ext(
+                        ctx0, Qcur, inp_pos, rope_factors,
+                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow
+                        );
+
+                Kcur = ggml_rope_ext(
+                        ctx0, Kcur, inp_pos, rope_factors,
+                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow
+                        );
+
+                cb(Qcur, "Qcur_pos", il);
+                cb(Kcur, "Kcur_pos", il);
+                cb(Vcur, "Vcur_pos", il);
+
+                cur = build_attn(inp_attn,
+                        model.layers[il].wo, model.layers[il].bo,
+                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
+                cb(cur, "attn_out", il);
+            }
+
+            if (il == n_layer - 1 && inp_out_ids) {
+                cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            }
+
+            ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+            cb(ffn_inp, "ffn_inp", il);
+
+            // feed-forward network with xIELU activation
+            {
+                cur = build_norm(ffn_inp,
+                        model.layers[il].ffn_norm, nullptr,
+                        LLM_NORM_RMS, il);
+                cb(cur, "ffn_norm", il);
+
+                // Up projection
+                ggml_tensor * up = build_lora_mm(model.layers[il].ffn_up, cur);
+                cb(up, "ffn_up", il);
+
+                float alpha_n_val = hparams.xielu_alpha_n[il];
+                float alpha_p_val = hparams.xielu_alpha_p[il];
+                float beta_val = hparams.xielu_beta[il];
+                float eps_val = hparams.xielu_eps[il];
+
+                // Apply xIELU activation
+                ggml_tensor * activated = ggml_xielu(ctx0, up, alpha_n_val, alpha_p_val, beta_val, eps_val);
+                cb(activated, "ffn_xielu", il);
+
+                // Down projection
+                cur = build_lora_mm(model.layers[il].ffn_down, activated);
+                cb(cur, "ffn_down", il);
+            }
+
+            cur = ggml_add(ctx0, cur, ffn_inp);
+            cb(cur, "ffn_out", il);
+
+            cur = build_cvec(cur, il);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = build_norm(cur,
+                model.output_norm, nullptr,
+                LLM_NORM_RMS, -1);
+
+        cb(cur, "result_norm", -1);
+        res->t_embd = cur;
+
+        // lm_head
+        cur = build_lora_mm(model.output, cur);
+
+        cb(cur, "result_output", -1);
+        res->t_logits = cur;
+
+        ggml_build_forward_expand(gf, cur);
+    }
+};
+
 llama_memory_i * llama_model::create_memory(const llama_memory_params & params, llama_cparams & cparams) const {
    llama_memory_i * res;
 
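In the GroveMoE graph above, one set of router logits (`probs`) drives two `build_moe_ffn` passes: the main experts, then `n_expert / n_group_experts` chunk experts whose output is scaled by `hparams.expert_group_scale` and added back before the residual. A scalar stand-in for that combination, with made-up values:

```cpp
#include <cstdio>

// Hedged scalar sketch of the GroveMoE combination built above; real code
// operates on tensors via build_moe_ffn and ggml_scale/ggml_add:
//   cur = moe_out; cur += expert_group_scale * chunk_moe_out(cur)
float grove_combine(float experts_out, float chunk_experts_out, float expert_group_scale) {
    return experts_out + expert_group_scale * chunk_experts_out;
}

int main() {
    // Illustrative numbers only; the scale comes from the GGUF metadata.
    const float y = grove_combine(1.25f, 0.40f, 0.05f);
    printf("combined activation: %f\n", y); // 1.25 + 0.05 * 0.40 = 1.27
    return 0;
}
```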
@@ -19376,6 +19880,7 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
                llm = std::make_unique<llm_build_falcon_h1>(*this, params);
            } break;
        case LLM_ARCH_LFM2:
+        case LLM_ARCH_LFM2MOE:
            {
                llm = std::make_unique<llm_build_lfm2>(*this, params);
            } break;
@@ -19387,6 +19892,14 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
                    llm = std::make_unique<llm_build_smallthinker<false>>(*this, params);
                }
            } break;
+        case LLM_ARCH_GROVEMOE:
+            {
+                llm = std::make_unique<llm_build_grovemoe>(*this, params);
+            } break;
+        case LLM_ARCH_APERTUS:
+            {
+                llm = std::make_unique<llm_build_apertus>(*this, params);
+            } break;
        default:
            GGML_ABORT("fatal error");
    }
@@ -19394,6 +19907,12 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
    // add on pooling layer
    llm->build_pooling(cls, cls_b, cls_out, cls_out_b);
 
+    // if the gguf model was converted with --sentence-transformers-dense-modules
+    // there will be two additional dense projection layers
+    // dense linear projections are applied after pooling
+    // TODO: move reranking logic here and generalize
+    llm->build_dense_out(dense_2_out_layers, dense_3_out_layers);
+
    return llm->res->get_gf();
 }
 
@@ -19418,6 +19937,7 @@ llama_model_params llama_model_default_params() {
        /*.use_mlock =*/ false,
        /*.check_tensors =*/ false,
        /*.use_extra_bufts =*/ true,
+        /*.no_host =*/ false,
    };
 
    return result;
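The new `no_host` default above pairs with the `make_cpu_buft_list` change earlier in this diff: when set, the loader skips `ggml_backend_dev_host_buffer_type()` (pinned host) buffers. A minimal sketch of opting in from the C API, assuming the `llama_model_params` field is exposed as shown and the loader entry points are otherwise unchanged:

```cpp
#include "llama.h"

// Minimal sketch (assumes the `no_host` field shown in this diff): load a
// model without pinned host staging buffers. The model path is illustrative.
int main() {
    llama_model_params mparams = llama_model_default_params();
    mparams.no_host = true; // skip ggml_backend_dev_host_buffer_type() buffers

    llama_model * model = llama_model_load_from_file("model.gguf", mparams);
    if (model == NULL) {
        return 1;
    }

    llama_model_free(model);
    return 0;
}
```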
@@ -19589,9 +20109,12 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
        case LLM_ARCH_OPENAI_MOE:
        case LLM_ARCH_HUNYUAN_DENSE:
        case LLM_ARCH_LFM2:
+        case LLM_ARCH_LFM2MOE:
        case LLM_ARCH_SMALLTHINKER:
        case LLM_ARCH_GLM4_MOE:
        case LLM_ARCH_SEED_OSS:
+        case LLM_ARCH_GROVEMOE:
+        case LLM_ARCH_APERTUS:
            return LLAMA_ROPE_TYPE_NEOX;
 
        case LLM_ARCH_QWEN2VL:
@@ -19702,6 +20225,10 @@ bool llama_model_is_recurrent(const llama_model * model) {
    return llm_arch_is_recurrent(model->arch);
 }
 
+bool llama_model_is_hybrid(const llama_model * model) {
+    return llm_arch_is_hybrid(model->arch);
+}
+
 bool llama_model_is_diffusion(const llama_model * model) {
    return llm_arch_is_diffusion(model->arch);
 }