@fugood/llama.node 1.2.2 → 1.2.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +14 -14
- package/scripts/llama.cpp.patch +33 -11
- package/src/llama.cpp/CMakeLists.txt +1 -0
- package/src/llama.cpp/common/CMakeLists.txt +46 -2
- package/src/llama.cpp/common/arg.cpp +423 -186
- package/src/llama.cpp/common/arg.h +0 -1
- package/src/llama.cpp/common/chat-parser.cpp +154 -13
- package/src/llama.cpp/common/chat-parser.h +3 -0
- package/src/llama.cpp/common/chat.cpp +217 -6
- package/src/llama.cpp/common/chat.h +5 -3
- package/src/llama.cpp/common/common.cpp +23 -6
- package/src/llama.cpp/common/common.h +6 -4
- package/src/llama.cpp/common/http.h +73 -0
- package/src/llama.cpp/common/sampling.cpp +1 -0
- package/src/llama.cpp/ggml/CMakeLists.txt +7 -6
- package/src/llama.cpp/ggml/include/ggml-backend.h +4 -1
- package/src/llama.cpp/ggml/include/ggml-rpc.h +8 -9
- package/src/llama.cpp/ggml/include/ggml-zdnn.h +3 -0
- package/src/llama.cpp/ggml/include/ggml.h +22 -0
- package/src/llama.cpp/ggml/src/CMakeLists.txt +3 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +12 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +1 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +12 -12
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +100 -3
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +2 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +0 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +18 -3
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +10 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +209 -96
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +32 -44
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +107 -83
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +27 -19
- package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +8 -8
- package/src/llama.cpp/ggml/src/ggml-cpu/spacemit/ime.cpp +1024 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/spacemit/ime.h +13 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/spacemit/ime1_kernels.cpp +3196 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/spacemit/ime_kernels.h +26 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.cpp +103 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.h +1 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +66 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +6 -5
- package/src/llama.cpp/include/llama.h +23 -11
- package/src/llama.cpp/src/llama-arch.cpp +93 -0
- package/src/llama.cpp/src/llama-arch.h +22 -0
- package/src/llama.cpp/src/llama-chat.cpp +1 -1
- package/src/llama.cpp/src/llama-context.cpp +157 -0
- package/src/llama.cpp/src/llama-context.h +10 -0
- package/src/llama.cpp/src/llama-graph.cpp +57 -22
- package/src/llama.cpp/src/llama-graph.h +10 -1
- package/src/llama.cpp/src/llama-hparams.h +17 -2
- package/src/llama.cpp/src/llama-kv-cache-iswa.cpp +10 -2
- package/src/llama.cpp/src/llama-kv-cache-iswa.h +2 -0
- package/src/llama.cpp/src/llama-kv-cache.cpp +10 -5
- package/src/llama.cpp/src/llama-kv-cache.h +2 -0
- package/src/llama.cpp/src/llama-memory-hybrid.cpp +19 -9
- package/src/llama.cpp/src/llama-memory-hybrid.h +2 -0
- package/src/llama.cpp/src/llama-memory-recurrent.cpp +19 -3
- package/src/llama.cpp/src/llama-memory-recurrent.h +3 -0
- package/src/llama.cpp/src/llama-memory.h +3 -0
- package/src/llama.cpp/src/llama-model-loader.cpp +2 -0
- package/src/llama.cpp/src/llama-model.cpp +582 -45
- package/src/llama.cpp/src/llama-model.h +23 -1
- package/src/llama.cpp/src/llama-sampling.cpp +5 -0
- package/src/llama.cpp/src/llama-vocab.cpp +7 -1
- package/src/llama.cpp/src/llama-vocab.h +41 -40
- package/src/llama.cpp/src/unicode.h +43 -0
package/src/llama.cpp/src/llama-model.cpp

@@ -66,6 +66,7 @@ const char * llm_type_name(llm_type type) {
case LLM_TYPE_1_7B: return "1.7B";
case LLM_TYPE_1_8B: return "1.8B";
case LLM_TYPE_2B: return "2B";
+ case LLM_TYPE_2_6B: return "2.6B";
case LLM_TYPE_2_8B: return "2.8B";
case LLM_TYPE_2_9B: return "2.9B";
case LLM_TYPE_3B: return "3B";
@@ -113,6 +114,7 @@ const char * llm_type_name(llm_type type) {
case LLM_TYPE_17B_16E: return "17Bx16E (Scout)";
case LLM_TYPE_17B_128E: return "17Bx128E (Maverick)";
case LLM_TYPE_A13B: return "A13B";
+ case LLM_TYPE_8B_A1B: return "8B.A1B";
case LLM_TYPE_21B_A3B: return "21B.A3B";
case LLM_TYPE_30B_A3B: return "30B.A3B";
case LLM_TYPE_106B_A12B: return "106B.A12B";
@@ -309,7 +311,7 @@ static ggml_backend_buffer_type_t select_weight_buft(const llama_hparams & hpara
}

// CPU: ACCEL -> GPU host -> CPU extra -> CPU
- static buft_list_t make_cpu_buft_list(const std::vector<ggml_backend_dev_t> & devices, bool use_extra_bufts) {
+ static buft_list_t make_cpu_buft_list(const std::vector<ggml_backend_dev_t> & devices, bool use_extra_bufts, bool no_host) {
buft_list_t buft_list;

// add ACCEL buffer types
@@ -330,11 +332,13 @@ static buft_list_t make_cpu_buft_list(const std::vector<ggml_backend_dev_t> & de
// generally, this will be done using the first device in the list
// a better approach would be to handle this on a weight-by-weight basis using the offload_op
// function of the device to determine if it would benefit from being stored in a host buffer
-
-
-
-
-
+ if (!no_host) {
+ for (auto * dev : devices) {
+ ggml_backend_buffer_type_t buft = ggml_backend_dev_host_buffer_type(dev);
+ if (buft) {
+ buft_list.emplace_back(dev, buft);
+ break;
+ }
}
}

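The hunk above wires the new `no_host` model parameter into CPU buffer-type selection: when it is set, the GPU host (pinned) buffer type is skipped and weights fall back to ordinary CPU buffers. The following is a minimal sketch of that priority-list idea in isolation; the type and function names here are illustrative placeholders, not llama.cpp API (the real code works on ggml backend devices as shown in the diff).

    #include <vector>

    // Hypothetical stand-in for the ggml buffer types consulted by make_cpu_buft_list().
    enum class BufType { Accel, GpuHost, CpuExtra, Cpu };

    std::vector<BufType> make_cpu_buft_priority(bool use_extra_bufts, bool no_host) {
        std::vector<BufType> prio;
        prio.push_back(BufType::Accel);            // accelerator-backed buffers first
        if (!no_host) {                            // new opt-out introduced in this release
            prio.push_back(BufType::GpuHost);      // pinned host memory for faster host-to-device copies
        }
        if (use_extra_bufts) {
            prio.push_back(BufType::CpuExtra);     // e.g. repacked CPU weight buffers
        }
        prio.push_back(BufType::Cpu);              // plain CPU memory is always the final fallback
        return prio;
    }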
@@ -511,9 +515,13 @@ void llama_model::load_hparams(llama_model_loader & ml) {
llm_arch_is_recurrent(ml.get_arch()));

std::fill(hparams.rope_sections.begin(), hparams.rope_sections.end(), 0);
-
std::fill(hparams.swa_layers.begin(), hparams.swa_layers.end(), 0);

+ std::fill(hparams.xielu_alpha_n.begin(), hparams.xielu_alpha_n.end(), 0.0f);
+ std::fill(hparams.xielu_alpha_p.begin(), hparams.xielu_alpha_p.end(), 0.0f);
+ std::fill(hparams.xielu_beta.begin(), hparams.xielu_beta.end(), 0.0f);
+ std::fill(hparams.xielu_eps.begin(), hparams.xielu_eps.end(), 0.0f);
+
ml.get_key_or_arr(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff_arr, hparams.n_layer, false);
ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, hparams.n_layer, false);

@@ -674,10 +682,17 @@ void llama_model::load_hparams(llama_model_loader & ml) {
} break;
case LLM_ARCH_MINICPM:
{
+ // Backward-compatible defaults for older MiniCPM GGUFs
+ hparams.f_embedding_scale = 12.0f;
+ hparams.f_residual_scale = 1.4f / sqrtf(float(hparams.n_layer));
+ hparams.f_logit_scale = hparams.n_embd ? (256.0f / float(hparams.n_embd)) : 1.0f;
+
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
-
-
- ml.get_key(
+
+ // Optional KV reads, override defaults if present in newer GGUF exports
+ ml.get_key(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale, /*required=*/false);
+ ml.get_key(LLM_KV_RESIDUAL_SCALE, hparams.f_residual_scale, /*required=*/false);
+ ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale, /*required=*/false);

// MiniCPM uses rope by default, unlike Granite which uses it as a switch
hparams.rope_finetuned = true;
@@ -1076,7 +1091,11 @@ void llama_model::load_hparams(llama_model_loader & ml) {
}
break;
default: type = LLM_TYPE_UNKNOWN;
-
+ }
+
+ // Load attention parameters
+ ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH, hparams.n_embd_head_k, false);
+ ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH, hparams.n_embd_head_v, false);
} break;
case LLM_ARCH_GPT2:
{
@@ -1199,12 +1218,21 @@ void llama_model::load_hparams(llama_model_loader & ml) {
hparams.set_swa_pattern(6);

hparams.causal_attn = false; // embeddings do not use causal attention
- hparams.rope_freq_base_train_swa
+ hparams.rope_freq_base_train_swa = 10000.0f;
hparams.rope_freq_scale_train_swa = 1.0f;

- ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW,
+ ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
- ml.get_key(LLM_KV_POOLING_TYPE,
+ ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
+
+ //applied only if model converted with --sentence-transformers-dense-modules
+ ml.get_key(LLM_KV_DENSE_2_FEAT_IN, hparams.dense_2_feat_in, false);
+ ml.get_key(LLM_KV_DENSE_2_FEAT_OUT, hparams.dense_2_feat_out, false);
+ ml.get_key(LLM_KV_DENSE_3_FEAT_IN, hparams.dense_3_feat_in, false);
+ ml.get_key(LLM_KV_DENSE_3_FEAT_OUT, hparams.dense_3_feat_out, false);
+
+ GGML_ASSERT((hparams.dense_2_feat_in == 0 || hparams.dense_2_feat_in == hparams.n_embd) && "dense_2_feat_in must be equal to n_embd");
+ GGML_ASSERT((hparams.dense_3_feat_out == 0 || hparams.dense_3_feat_out == hparams.n_embd) && "dense_3_feat_out must be equal to n_embd");

switch (hparams.n_layer) {
case 24: type = LLM_TYPE_0_3B; break;
@@ -1977,12 +2005,28 @@ void llama_model::load_hparams(llama_model_loader & ml) {
for (uint32_t il = 0; il < hparams.n_layer; ++il) {
hparams.recurrent_layer_arr[il] = hparams.n_head_kv(il) == 0;
}
-
-
- case
- case
-
+ hparams.n_layer_dense_lead = hparams.n_layer;
+ switch (hparams.n_ff()) {
+ case 4608: type = LLM_TYPE_350M; break;
+ case 6912: type = LLM_TYPE_700M; break;
+ case 8192: type = LLM_TYPE_1_2B; break;
+ case 10752: type = LLM_TYPE_2_6B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
+ case LLM_ARCH_LFM2MOE:
+ {
+ ml.get_key(LLM_KV_SHORTCONV_L_CACHE, hparams.n_shortconv_l_cache);
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
+ ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
+ ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func);
+
+ for (uint32_t il = 0; il < hparams.n_layer; ++il) {
+ hparams.recurrent_layer_arr[il] = hparams.n_head_kv(il) == 0;
}
+
+ type = LLM_TYPE_8B_A1B;
} break;
case LLM_ARCH_SMALLTHINKER:
{
@@ -2007,6 +2051,32 @@ void llama_model::load_hparams(llama_model_loader & ml) {
default: type = LLM_TYPE_UNKNOWN;
}
} break;
+ case LLM_ARCH_GROVEMOE:
+ {
+ ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
+ ml.get_key(LLM_KV_EXPERT_CHUNK_FEED_FORWARD_LENGTH, hparams.n_ff_chexp);
+ ml.get_key(LLM_KV_EXPERT_GROUP_SCALE, hparams.expert_group_scale);
+ ml.get_key(LLM_KV_EXPERTS_PER_GROUP, hparams.n_group_experts);
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+ switch (hparams.n_layer) {
+ case 48: type = LLM_TYPE_30B_A3B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
+ case LLM_ARCH_APERTUS:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ ml.get_key_or_arr(LLM_KV_XIELU_ALPHA_N, hparams.xielu_alpha_n, hparams.n_layer);
+ ml.get_key_or_arr(LLM_KV_XIELU_ALPHA_P, hparams.xielu_alpha_p, hparams.n_layer);
+ ml.get_key_or_arr(LLM_KV_XIELU_BETA, hparams.xielu_beta, hparams.n_layer);
+ ml.get_key_or_arr(LLM_KV_XIELU_EPS, hparams.xielu_eps, hparams.n_layer);
+
+ switch (hparams.n_layer) {
+ case 32: type = LLM_TYPE_8B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
default: throw std::runtime_error("unsupported model architecture");
}

@@ -2040,7 +2110,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
LLAMA_LOG_INFO("%s: loading model tensors, this can take a while... (mmap = %s)\n", __func__, ml.use_mmap ? "true" : "false");

// build a list of buffer types for the CPU and GPU devices
- pimpl->cpu_buft_list = make_cpu_buft_list(devices, params.use_extra_bufts);
+ pimpl->cpu_buft_list = make_cpu_buft_list(devices, params.use_extra_bufts, params.no_host);
for (auto * dev : devices) {
buft_list_t buft_list = make_gpu_buft_list(dev, split_mode, tensor_split);
// add CPU buffer types as a fallback
@@ -3165,6 +3235,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
}

+ // output rerank head
+ cls_out = create_tensor(tn(LLM_TENSOR_CLS_OUT, "weight"), {n_embd, hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
+
for (int i = 0; i < n_layer; ++i) {
auto & layer = layers[i];

@@ -3367,17 +3440,17 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
} break;
case LLM_ARCH_PLAMO2:
{
+ // mamba parameters
const uint32_t d_conv = hparams.ssm_d_conv;
const uint32_t d_state = hparams.ssm_d_state;
const uint32_t num_heads = hparams.ssm_dt_rank;
const uint32_t intermediate_size = hparams.ssm_d_inner;
- const uint32_t head_dim = intermediate_size / num_heads;
- const uint32_t qk_dim = head_dim;
- const uint32_t v_dim = head_dim;
- const int64_t num_attention_heads = hparams.n_head();
- const int64_t q_num_heads = num_attention_heads;
const int64_t dt_dim = std::max(64, int(hparams.n_embd / 16));

+ // attention parameters
+ const uint32_t qk_dim = hparams.n_embd_head_k;
+ const uint32_t v_dim = hparams.n_embd_head_v;
+
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

// output
@@ -3411,6 +3484,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
layer.ssm_b_norm = create_tensor(tn(LLM_TENSOR_SSM_B_NORM, i), {d_state}, 0);
layer.ssm_c_norm = create_tensor(tn(LLM_TENSOR_SSM_C_NORM, i), {d_state}, 0);
} else {
+ const int64_t num_attention_heads = hparams.n_head(i);
+ const int64_t q_num_heads = num_attention_heads;
const int64_t num_key_value_heads = hparams.n_head_kv(i);
const int64_t k_num_heads = num_key_value_heads;
const int64_t v_num_heads = num_key_value_heads;
@@ -3419,8 +3494,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
const int64_t v_proj_dim = v_num_heads * v_dim;

layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, q_proj_dim + k_proj_dim + v_proj_dim}, 0);
- layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {
- layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {
+ layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {qk_dim, num_attention_heads}, 0);
+ layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {qk_dim, k_num_heads}, 0);
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {q_num_heads * v_dim, n_embd}, 0);
}

@@ -3620,6 +3695,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
}

+ // Dense linear weights
+ dense_2_out_layers = create_tensor(tn(LLM_TENSOR_DENSE_2_OUT, "weight"), {n_embd, hparams.dense_2_feat_out}, TENSOR_NOT_REQUIRED);
+ dense_3_out_layers = create_tensor(tn(LLM_TENSOR_DENSE_3_OUT, "weight"), {hparams.dense_3_feat_in, n_embd}, TENSOR_NOT_REQUIRED);
+
+
for (int i = 0; i < n_layer; ++i) {
auto & layer = layers[i];

@@ -4800,11 +4880,13 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
// NextN/MTP tensors (preserved but unused) - conditionally load for last nextn_predict_layers
if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
layer.nextn.eh_proj = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i), { 2 * n_embd, n_embd }, flags);
- layer.nextn.embed_tokens = create_tensor(tn(LLM_TENSOR_NEXTN_EMBED_TOKENS, "weight", i), { n_embd, n_vocab }, flags);
layer.nextn.enorm = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM, "weight", i), { n_embd }, flags);
layer.nextn.hnorm = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM, "weight", i), { n_embd }, flags);
-
-
+
+ // Optional tensors
+ layer.nextn.embed_tokens = create_tensor(tn(LLM_TENSOR_NEXTN_EMBED_TOKENS, "weight", i), { n_embd, n_vocab }, flags | TENSOR_NOT_REQUIRED);
+ layer.nextn.shared_head_head = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "weight", i), { n_embd, n_vocab }, flags | TENSOR_NOT_REQUIRED);
+ layer.nextn.shared_head_norm = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "weight", i), { n_embd }, flags | TENSOR_NOT_REQUIRED);
}
}
}
@@ -5762,6 +5844,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
}
} break;
case LLM_ARCH_LFM2:
+ case LLM_ARCH_LFM2MOE:
{
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);
@@ -5773,11 +5856,23 @@ bool llama_model::load_tensors(llama_model_loader & ml) {

for (int i = 0; i < n_layer; ++i) {
auto & layer = layers[i];
-
+
+ const bool is_moe_layer = i >= static_cast<int>(hparams.n_layer_dense_lead);
+
+ // ffn/moe is same for transformer and conv layers
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
-
-
-
+ if (is_moe_layer) {
+ GGML_ASSERT(n_expert && n_expert_used);
+ layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
+ layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, hparams.n_ff_exp, n_expert}, 0);
+ layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {hparams.n_ff_exp, n_embd, n_expert}, 0);
+ layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, hparams.n_ff_exp, n_expert}, 0);
+ layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, 0);
+ } else { // dense
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+ }

// for operator_norm
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
@@ -5835,6 +5930,95 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, 0);
}
} break;
+ case LLM_ARCH_GROVEMOE:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+ // if output is NULL, init from the input tok embed
+ if (output == NULL) {
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+ }
+
+ GGML_ASSERT(n_expert > 0 && "n_expert must be > 0 for GROVEMOE");
+ GGML_ASSERT(n_expert_used > 0 && "n_expert_used must be > 0 for GROVEMOE");
+ GGML_ASSERT(hparams.n_group_experts > 0 && "n_group_experts must be > 0 for GROVEMOE");
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
+
+ layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
+ layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
+
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+ layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
+
+ // MoE branch
+ const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
+ const int64_t n_ff_chexp = hparams.n_ff_chexp ? hparams.n_ff_chexp : n_embd_head_k;
+ const int64_t n_chunk_expert = n_expert / hparams.n_group_experts;
+
+ layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
+ layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
+ layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
+
+ layer.ffn_gate_chexps = create_tensor(tn(LLM_TENSOR_FFN_GATE_CHEXPS, "weight", i), { n_embd, n_ff_chexp, n_chunk_expert}, 0);
+ layer.ffn_down_chexps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_CHEXPS, "weight", i), {n_ff_chexp, n_embd, n_chunk_expert}, 0);
+ layer.ffn_up_chexps = create_tensor(tn(LLM_TENSOR_FFN_UP_CHEXPS, "weight", i), { n_embd, n_ff_chexp, n_chunk_expert}, 0);
+ }
+ } break;
+ case LLM_ARCH_APERTUS:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, 0);
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
+
+ if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) {
+ layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), { n_rot/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+ layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), { n_rot/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+ } else {
+ layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), { n_rot/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+ }
+
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head }, 0);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_gqa }, 0);
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_gqa }, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0);
+
+ // optional bias tensors
+ layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), { n_embd }, TENSOR_NOT_REQUIRED);
+ layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), { n_embd_gqa }, TENSOR_NOT_REQUIRED);
+ layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), { n_embd_gqa }, TENSOR_NOT_REQUIRED);
+ layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), { n_embd }, TENSOR_NOT_REQUIRED);
+
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd }, 0);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, 0);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, n_ff }, 0);
+
+ // Q and K layernorms for Apertus
+ layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), { n_embd_head_k }, 0);
+ layer.attn_q_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), { n_embd_head_k }, TENSOR_NOT_REQUIRED);
+ layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), { n_embd_head_k }, 0);
+ layer.attn_k_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), { n_embd_head_k }, TENSOR_NOT_REQUIRED);
+ }
+ } break;
default:
throw std::runtime_error("unknown architecture");
}
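The GROVEMOE tensor shapes above encode the grouped-expert layout: the n_expert routed experts are split into n_group_experts groups, each contributing n_chunk_expert = n_expert / n_group_experts "chunk" experts, and the chunk FFN width falls back to n_embd_head_k when the GGUF does not carry n_ff_chexp. A small self-contained sketch of those sizing fallbacks; the numeric values are illustrative placeholders, not values read from a real GroveMoE GGUF.

    #include <cstdint>
    #include <cstdio>

    int main() {
        const int64_t n_expert        = 64;  // illustrative only
        const int64_t n_group_experts = 8;   // LLM_KV_EXPERTS_PER_GROUP
        const int64_t n_ff_chexp_kv   = 0;   // 0 models an absent LLM_KV_EXPERT_CHUNK_FEED_FORWARD_LENGTH key
        const int64_t n_embd_head_k   = 128; // fallback width, as in the diff

        const int64_t n_chunk_expert = n_expert / n_group_experts;          // experts per chunk tensor
        const int64_t n_ff_chexp     = n_ff_chexp_kv ? n_ff_chexp_kv : n_embd_head_k;

        std::printf("chunk experts: %lld, chunk FFN width: %lld\n",
                    (long long) n_chunk_expert, (long long) n_ff_chexp);
        return 0;
    }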
@@ -6003,6 +6187,14 @@ size_t llama_model::n_devices() const {
return devices.size();
}

+ std::map<ggml_backend_buffer_type_t, size_t> llama_model::memory_breakdown() const {
+ std::map<ggml_backend_buffer_type_t, size_t> ret;
+ for (const ggml_backend_buffer_ptr & buf_ptr : pimpl->bufs) {
+ ret[ggml_backend_buffer_get_type(buf_ptr.get())] += ggml_backend_buffer_get_size(buf_ptr.get());
+ }
+ return ret;
+ }
+
uint64_t llama_model::n_elements() const {
return pimpl->n_elements;
}
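The new `llama_model::memory_breakdown()` aggregates the model's allocated buffers per ggml buffer type. A hedged sketch of how an internal caller might log such a map is shown below; it assumes access to the internal method (it is not part of the public llama.h surface in this diff) and uses only `ggml_backend_buft_name()` from ggml-backend.h.

    #include "ggml-backend.h"

    #include <cstddef>
    #include <cstdio>
    #include <map>

    // Print one line per buffer type, e.g. "CPU", "CUDA0", "CUDA_Host", with its total size.
    static void print_memory_breakdown(const std::map<ggml_backend_buffer_type_t, size_t> & breakdown) {
        for (const auto & [buft, size] : breakdown) {
            std::printf("%-24s %8.2f MiB\n", ggml_backend_buft_name(buft), size / (1024.0 * 1024.0));
        }
    }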
@@ -6161,11 +6353,18 @@ void llama_model::print_info() const {
LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm);
}

- if (arch == LLM_ARCH_SMALLTHINKER) {
+ if (arch == LLM_ARCH_SMALLTHINKER || arch == LLM_ARCH_LFM2MOE) {
LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
LLAMA_LOG_INFO("%s: expert_gating_func = %s\n", __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func));
}

+ if (arch == LLM_ARCH_GROVEMOE) {
+ LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
+ LLAMA_LOG_INFO("%s: n_ff_chexp = %d\n", __func__, hparams.n_ff_chexp);
+ LLAMA_LOG_INFO("%s: n_group_experts = %d\n", __func__, hparams.n_group_experts);
+ LLAMA_LOG_INFO("%s: expert_group_scale = %.2f\n", __func__, hparams.expert_group_scale);
+ }
+
vocab.print_info();
}

@@ -7689,6 +7888,8 @@ struct llm_build_bert : public llm_graph_context {
}

if (model.layers[il].attn_q_norm) {
+ Qcur = ggml_reshape_2d(ctx0, Qcur, n_embd_head*n_head, n_tokens);
+
Qcur = build_norm(Qcur,
model.layers[il].attn_q_norm,
model.layers[il].attn_q_norm_b,
@@ -7698,6 +7899,8 @@ struct llm_build_bert : public llm_graph_context {
}

if (model.layers[il].attn_k_norm) {
+ Kcur = ggml_reshape_2d(ctx0, Kcur, n_embd_head*n_head_kv, n_tokens);
+
Kcur = build_norm(Kcur,
model.layers[il].attn_k_norm,
model.layers[il].attn_k_norm_b,
@@ -8080,6 +8283,9 @@ struct llm_build_mpt : public llm_graph_context {

// Q/K Layernorm
if (model.layers[il].attn_q_norm) {
+ Qcur = ggml_reshape_2d(ctx0, Qcur, n_embd_head*n_head, n_tokens);
+ Kcur = ggml_reshape_2d(ctx0, Kcur, n_embd_head*n_head_kv, n_tokens);
+
Qcur = build_norm(Qcur,
model.layers[il].attn_q_norm,
model.layers[il].attn_q_norm_b,
@@ -11664,6 +11870,7 @@ struct llm_graph_context_mamba : public llm_graph_context {
// TODO: skip computing output earlier for unused tokens

y = ggml_add(ctx0, y, ggml_mul(ctx0, x, model.layers[il].ssm_d));
+ cb(y, "mamba2_y_add_d", il);
y = ggml_swiglu_split(ctx0, ggml_cont(ctx0, z), y);

// grouped RMS norm
@@ -14618,6 +14825,7 @@ struct llm_build_nemotron_h : public llm_graph_context_mamba {
ggml_tensor * inpL;

inpL = build_inp_embd(model.tok_embd);
+ ggml_build_forward_expand(gf, inpL);

auto * inp = build_inp_mem_hybrid();

@@ -14649,7 +14857,7 @@ struct llm_build_nemotron_h : public llm_graph_context_mamba {

// add residual
cur = ggml_add(ctx0, cur, inpSA);
- cb(cur, "
+ cb(cur, "nemotron_h_block_out", il);

// input for next layer
inpL = cur;
@@ -17520,6 +17728,7 @@ private:
const int64_t n_embd_head_q = hparams.n_embd_head_k;
const int64_t n_embd_head_k = hparams.n_embd_head_k;
const int64_t n_embd_head_v = hparams.n_embd_head_v;
+ int32_t n_head = hparams.n_head(il);
int32_t n_head_kv = hparams.n_head_kv(il);

const int64_t q_offset = 0;
@@ -18436,6 +18645,8 @@ struct llm_build_lfm2 : public llm_graph_context {
ggml_tensor * inp_out_ids = build_inp_out_ids();

for (int il = 0; il < n_layer; ++il) {
+ const bool is_moe_layer = il >= static_cast<int>(hparams.n_layer_dense_lead);
+
auto * prev_cur = cur;
cur = build_norm(cur, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
cb(cur, "model.layers.{}.operator_norm", il);
@@ -18450,7 +18661,16 @@ struct llm_build_lfm2 : public llm_graph_context {
}

cur = ggml_add(ctx0, prev_cur, cur);
-
+
+ auto * ffn_norm_out = build_norm(cur, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il);
+ cb(ffn_norm_out, "model.layers.{}.ffn_norm", il);
+
+ ggml_tensor * ffn_out = is_moe_layer ?
+ build_moe_feed_forward(ffn_norm_out, il) :
+ build_dense_feed_forward(ffn_norm_out, il);
+ cb(ffn_norm_out, "model.layers.{}.ffn_out", il);
+
+ cur = ggml_add(ctx0, cur, ffn_out);
}

cur = build_norm(cur, model.tok_norm, NULL, LLM_NORM_RMS, -1);
@@ -18465,23 +18685,32 @@ struct llm_build_lfm2 : public llm_graph_context {
ggml_build_forward_expand(gf, cur);
}

- ggml_tensor *
-
-
-
+ ggml_tensor * build_moe_feed_forward(ggml_tensor * cur,
+ int il) const {
+ return build_moe_ffn(cur,
+ model.layers[il].ffn_gate_inp,
+ model.layers[il].ffn_up_exps,
+ model.layers[il].ffn_gate_exps,
+ model.layers[il].ffn_down_exps,
+ model.layers[il].ffn_exp_probs_b,
+ n_expert, n_expert_used,
+ LLM_FFN_SILU, true,
+ false, 0.0,
+ static_cast<llama_expert_gating_func_type>(hparams.expert_gating_func),
+ il);
+ }

+ ggml_tensor * build_dense_feed_forward(ggml_tensor * cur,
+ int il) const {
GGML_ASSERT(!model.layers[il].ffn_up_b);
GGML_ASSERT(!model.layers[il].ffn_gate_b);
GGML_ASSERT(!model.layers[il].ffn_down_b);
-
+ return build_ffn(cur,
model.layers[il].ffn_up, NULL, NULL,
model.layers[il].ffn_gate, NULL, NULL,
model.layers[il].ffn_down, NULL, NULL,
NULL,
LLM_FFN_SILU, LLM_FFN_PAR, il);
- cb(cur, "model.layers.{}.feed_forward.w2", il);
-
- return cur;
}

ggml_tensor * build_attn_block(ggml_tensor * cur,
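The LFM2 hunks above split the FFN into a dense path and a MoE path: layers before `n_layer_dense_lead` keep a plain gated FFN, later layers route tokens through experts. A self-contained sketch of that layer layout follows; the counts are illustrative placeholders, not values taken from a real LFM2-MoE GGUF.

    #include <cstdint>
    #include <cstdio>

    int main() {
        const uint32_t n_layer            = 8; // illustrative only
        const uint32_t n_layer_dense_lead = 2; // LLM_KV_LEADING_DENSE_BLOCK_COUNT

        for (uint32_t il = 0; il < n_layer; ++il) {
            const bool is_moe_layer = il >= n_layer_dense_lead; // same test as the diff
            std::printf("layer %2u: %s\n", il, is_moe_layer ? "MoE FFN" : "dense FFN");
        }
        return 0;
    }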
@@ -18851,6 +19080,291 @@ struct llm_build_smallthinker : public llm_graph_context{
}
};

+ struct llm_build_grovemoe : public llm_graph_context {
+ llm_build_grovemoe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_chunk_expert = n_expert / hparams.n_group_experts;
+
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+
+ inpL = build_inp_embd(model.tok_embd);
+
+ // inp_pos - contains the positions
+ ggml_tensor * inp_pos = build_inp_pos();
+
+ auto * inp_attn = build_attn_inp_kv();
+
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+ for (int il = 0; il < n_layer; ++il) {
+ ggml_tensor * inpSA = inpL;
+
+ // norm
+ cur = build_norm(inpL,
+ model.layers[il].attn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "attn_norm", il);
+
+ // self_attention
+ {
+ // compute Q and K and RoPE them
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);
+
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);
+
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
+
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+ Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
+ cb(Qcur, "Qcur_normed", il);
+
+ Qcur = ggml_rope_ext(
+ ctx0, Qcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
+ cb(Kcur, "Kcur_normed", il);
+
+ Kcur = ggml_rope_ext(
+ ctx0, Kcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ cur = build_attn(inp_attn,
+ model.layers[il].wo, model.layers[il].bo,
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ }
+
+ if (il == n_layer - 1 && inp_out_ids) {
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }
+
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il);
+
+ // MoE branch
+ cur = build_norm(ffn_inp,
+ model.layers[il].ffn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "ffn_norm", il);
+
+ ggml_tensor * probs = build_lora_mm(model.layers[il].ffn_gate_inp, cur); // [n_expert, n_tokens]
+ cb(probs, "ffn_moe_logits", il);
+
+ ggml_tensor * moe_out =
+ build_moe_ffn(cur,
+ nullptr,
+ model.layers[il].ffn_up_exps,
+ model.layers[il].ffn_gate_exps,
+ model.layers[il].ffn_down_exps,
+ nullptr,
+ n_expert, n_expert_used,
+ LLM_FFN_SILU, true,
+ false, 0.0,
+ LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
+ il, probs);
+ cb(moe_out, "ffn_moe_out", il);
+ cur = moe_out;
+
+ // TODO: Only do the expert selection and weights once
+ moe_out =
+ build_moe_ffn(cur,
+ nullptr,
+ model.layers[il].ffn_up_chexps,
+ model.layers[il].ffn_gate_chexps,
+ model.layers[il].ffn_down_chexps,
+ nullptr,
+ n_chunk_expert, n_expert_used > n_chunk_expert ? n_chunk_expert : n_expert_used,
+ LLM_FFN_SILU, true,
+ false, 0.0,
+ LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
+ il, probs);
+ cb(moe_out, "ffn_adj_moe_out", il);
+
+ cur = ggml_add(ctx0, cur, ggml_scale(ctx0, moe_out, hparams.expert_group_scale));
+ cb(cur, "ffn_final_moe_out", il);
+
+ cur = ggml_add(ctx0, cur, ffn_inp);
+
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+
+ cur = inpL;
+
+ cur = build_norm(cur,
+ model.output_norm, NULL,
+ LLM_NORM_RMS, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
+
+ // lm_head
+ cur = build_lora_mm(model.output, cur);
+
+ cb(cur, "result_output", -1);
+ res->t_logits = cur;
+
+ ggml_build_forward_expand(gf, cur);
+ }
+ };
+
+ struct llm_build_apertus : public llm_graph_context {
+ llm_build_apertus(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+
+ inpL = build_inp_embd(model.tok_embd);
+
+ ggml_tensor * inp_pos = build_inp_pos();
+ auto * inp_attn = build_attn_inp_kv();
+
+ const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f / sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
+
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+ for (int il = 0; il < n_layer; ++il) {
+ ggml_tensor * inpSA = inpL;
+
+ cur = build_norm(inpL,
+ model.layers[il].attn_norm, nullptr,
+ LLM_NORM_RMS, il);
+ cb(cur, "attn_norm", il);
+
+ // self-attention
+ {
+ ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
+
+ // compute Q and K and RoPE them
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);
+
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);
+
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
+
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
+ cb(Qcur, "Qcur_normed", il);
+
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
+ cb(Kcur, "Kcur_normed", il);
+
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+ Qcur = ggml_rope_ext(
+ ctx0, Qcur, inp_pos, rope_factors,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ Kcur = ggml_rope_ext(
+ ctx0, Kcur, inp_pos, rope_factors,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ cb(Qcur, "Qcur_pos", il);
+ cb(Kcur, "Kcur_pos", il);
+ cb(Vcur, "Vcur_pos", il);
+
+ cur = build_attn(inp_attn,
+ model.layers[il].wo, model.layers[il].bo,
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
+ cb(cur, "attn_out", il);
+ }
+
+ if (il == n_layer - 1 && inp_out_ids) {
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }
+
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il);
+
+ // feed-forward network with xIELU activation
+ {
+ cur = build_norm(ffn_inp,
+ model.layers[il].ffn_norm, nullptr,
+ LLM_NORM_RMS, il);
+ cb(cur, "ffn_norm", il);
+
+ // Up projection
+ ggml_tensor * up = build_lora_mm(model.layers[il].ffn_up, cur);
+ cb(up, "ffn_up", il);
+
+ float alpha_n_val = hparams.xielu_alpha_n[il];
+ float alpha_p_val = hparams.xielu_alpha_p[il];
+ float beta_val = hparams.xielu_beta[il];
+ float eps_val = hparams.xielu_eps[il];
+
+ // Apply xIELU activation
+ ggml_tensor * activated = ggml_xielu(ctx0, up, alpha_n_val, alpha_p_val, beta_val, eps_val);
+ cb(activated, "ffn_xielu", il);
+
+ // Down projection
+ cur = build_lora_mm(model.layers[il].ffn_down, activated);
+ cb(cur, "ffn_down", il);
+ }
+
+ cur = ggml_add(ctx0, cur, ffn_inp);
+ cb(cur, "ffn_out", il);
+
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+
+ cur = inpL;
+
+ cur = build_norm(cur,
+ model.output_norm, nullptr,
+ LLM_NORM_RMS, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
+
+ // lm_head
+ cur = build_lora_mm(model.output, cur);
+
+ cb(cur, "result_output", -1);
+ res->t_logits = cur;
+
+ ggml_build_forward_expand(gf, cur);
+ }
+ };
+
llama_memory_i * llama_model::create_memory(const llama_memory_params & params, llama_cparams & cparams) const {
llama_memory_i * res;

@@ -19366,6 +19880,7 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
llm = std::make_unique<llm_build_falcon_h1>(*this, params);
} break;
case LLM_ARCH_LFM2:
+ case LLM_ARCH_LFM2MOE:
{
llm = std::make_unique<llm_build_lfm2>(*this, params);
} break;
@@ -19377,6 +19892,14 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
llm = std::make_unique<llm_build_smallthinker<false>>(*this, params);
}
} break;
+ case LLM_ARCH_GROVEMOE:
+ {
+ llm = std::make_unique<llm_build_grovemoe>(*this, params);
+ } break;
+ case LLM_ARCH_APERTUS:
+ {
+ llm = std::make_unique<llm_build_apertus>(*this, params);
+ } break;
default:
GGML_ABORT("fatal error");
}
@@ -19384,6 +19907,12 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
// add on pooling layer
llm->build_pooling(cls, cls_b, cls_out, cls_out_b);

+ // if the gguf model was converted with --sentence-transformers-dense-modules
+ // there will be two additional dense projection layers
+ // dense linear projections are applied after pooling
+ // TODO: move reranking logic here and generalize
+ llm->build_dense_out(dense_2_out_layers, dense_3_out_layers);
+
return llm->res->get_gf();
}

@@ -19408,6 +19937,7 @@ llama_model_params llama_model_default_params() {
/*.use_mlock =*/ false,
/*.check_tensors =*/ false,
/*.use_extra_bufts =*/ true,
+ /*.no_host =*/ false,
};

return result;
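`llama_model_default_params()` now initializes a `no_host` field, so loaders that want to keep weights out of pinned host buffers can flip it before loading. A hedged usage sketch follows; the field name is taken from the initializer comment above and is assumed to be exposed by llama.h in this release, while `llama_model_default_params()` and `llama_model_load_from_file()` are existing public API.

    #include "llama.h"

    // Load a model while skipping ggml host (pinned) buffer types,
    // so weights stay in plain CPU memory.
    llama_model * load_without_host_buffers(const char * path) {
        llama_model_params mparams = llama_model_default_params();
        mparams.no_host = true; // assumed field, mirroring /*.no_host =*/ false above
        return llama_model_load_from_file(path, mparams);
    }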
@@ -19579,9 +20109,12 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
case LLM_ARCH_OPENAI_MOE:
case LLM_ARCH_HUNYUAN_DENSE:
case LLM_ARCH_LFM2:
+ case LLM_ARCH_LFM2MOE:
case LLM_ARCH_SMALLTHINKER:
case LLM_ARCH_GLM4_MOE:
case LLM_ARCH_SEED_OSS:
+ case LLM_ARCH_GROVEMOE:
+ case LLM_ARCH_APERTUS:
return LLAMA_ROPE_TYPE_NEOX;

case LLM_ARCH_QWEN2VL:
@@ -19692,6 +20225,10 @@ bool llama_model_is_recurrent(const llama_model * model) {
return llm_arch_is_recurrent(model->arch);
}

+ bool llama_model_is_hybrid(const llama_model * model) {
+ return llm_arch_is_hybrid(model->arch);
+ }
+
bool llama_model_is_diffusion(const llama_model * model) {
return llm_arch_is_diffusion(model->arch);
}
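The new `llama_model_is_hybrid()` joins the existing `llama_model_is_recurrent()` and `llama_model_is_diffusion()` predicates; given the `include/llama.h` changes listed at the top of this diff (+23 -11), it is presumably exported there as well. A hedged usage sketch: `llama_model_is_recurrent()` is existing public API, while the hybrid predicate is assumed to be declared in llama.h in this release.

    #include "llama.h"

    #include <cstdio>

    // Describe what kind of per-sequence memory a model needs, based on its architecture.
    void describe_memory_layout(const llama_model * model) {
        if (llama_model_is_hybrid(model)) {
            std::printf("hybrid: attention KV cache plus recurrent state\n");
        } else if (llama_model_is_recurrent(model)) {
            std::printf("recurrent: state cache only (e.g. Mamba-style layers)\n");
        } else {
            std::printf("standard: attention KV cache only\n");
        }
    }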