@fugood/llama.node 1.4.11 → 1.4.12
This diff shows the published contents of the two package versions as they appear in their respective public registries. It is provided for informational purposes only.
- package/package.json +15 -15
- package/scripts/llama.cpp.patch +30 -30
- package/src/llama.cpp/common/arg.cpp +29 -14
- package/src/llama.cpp/common/arg.h +1 -0
- package/src/llama.cpp/common/chat-parser.cpp +11 -0
- package/src/llama.cpp/common/chat.cpp +32 -3
- package/src/llama.cpp/common/chat.h +1 -0
- package/src/llama.cpp/common/common.cpp +23 -23
- package/src/llama.cpp/common/common.h +1 -1
- package/src/llama.cpp/ggml/CMakeLists.txt +13 -1
- package/src/llama.cpp/ggml/include/ggml-backend.h +1 -1
- package/src/llama.cpp/ggml/src/CMakeLists.txt +23 -9
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +12 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +1 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +86 -25
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +15 -8
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +768 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +0 -4
- package/src/llama.cpp/include/llama.h +13 -4
- package/src/llama.cpp/src/CMakeLists.txt +4 -0
- package/src/llama.cpp/src/llama-adapter.cpp +12 -3
- package/src/llama.cpp/src/llama-adapter.h +7 -1
- package/src/llama.cpp/src/llama-arch.cpp +76 -0
- package/src/llama.cpp/src/llama-arch.h +7 -0
- package/src/llama.cpp/src/llama-chat.cpp +11 -0
- package/src/llama.cpp/src/llama-chat.h +1 -0
- package/src/llama.cpp/src/llama-context.cpp +22 -21
- package/src/llama.cpp/src/llama-hparams.h +4 -3
- package/src/llama.cpp/src/llama-kv-cache.h +1 -1
- package/src/llama.cpp/src/llama-mmap.cpp +11 -4
- package/src/llama.cpp/src/llama-model-loader.cpp +23 -0
- package/src/llama.cpp/src/llama-model-loader.h +2 -0
- package/src/llama.cpp/src/llama-model.cpp +287 -16
- package/src/llama.cpp/src/llama-model.h +13 -2
- package/src/llama.cpp/src/llama-sampling.cpp +44 -33
- package/src/llama.cpp/src/llama-sampling.h +3 -0
- package/src/llama.cpp/src/llama-vocab.cpp +101 -33
- package/src/llama.cpp/src/llama-vocab.h +2 -0
- package/src/llama.cpp/src/llama.cpp +52 -37
- package/src/llama.cpp/src/models/bert.cpp +4 -2
- package/src/llama.cpp/src/models/cogvlm.cpp +5 -3
- package/src/llama.cpp/src/models/deepseek2.cpp +1 -1
- package/src/llama.cpp/src/models/gemma-embedding.cpp +2 -6
- package/src/llama.cpp/src/models/gemma3.cpp +3 -4
- package/src/llama.cpp/src/models/gemma3n-iswa.cpp +4 -7
- package/src/llama.cpp/src/models/llama.cpp +19 -6
- package/src/llama.cpp/src/models/maincoder.cpp +117 -0
- package/src/llama.cpp/src/models/mimo2-iswa.cpp +123 -0
- package/src/llama.cpp/src/models/models.h +18 -0
- package/src/llama.cpp/src/models/modern-bert.cpp +115 -0
- package/src/llama.cpp/src/models/plamo3.cpp +128 -0
- package/src/llama.cpp/src/unicode.cpp +23 -14
package/src/llama.cpp/src/llama-model.cpp

@@ -31,12 +31,14 @@ const char * llm_type_name(llm_type type) {
         case LLM_TYPE_17M: return "17M";
         case LLM_TYPE_22M: return "22M";
         case LLM_TYPE_33M: return "33M";
+        case LLM_TYPE_47M: return "47M";
         case LLM_TYPE_60M: return "60M";
         case LLM_TYPE_70M: return "70M";
         case LLM_TYPE_80M: return "80M";
         case LLM_TYPE_109M: return "109M";
         case LLM_TYPE_137M: return "137M";
         case LLM_TYPE_140M: return "140M";
+        case LLM_TYPE_149M: return "149M";
         case LLM_TYPE_160M: return "160M";
         case LLM_TYPE_190M: return "190M";
         case LLM_TYPE_220M: return "220M";
@@ -46,6 +48,7 @@ const char * llm_type_name(llm_type type) {
         case LLM_TYPE_335M: return "335M";
         case LLM_TYPE_350M: return "350M";
         case LLM_TYPE_360M: return "360M";
+        case LLM_TYPE_395M: return "395M";
         case LLM_TYPE_410M: return "410M";
         case LLM_TYPE_450M: return "450M";
         case LLM_TYPE_475M: return "475M";
@@ -123,10 +126,12 @@ const char * llm_type_name(llm_type type) {
         case LLM_TYPE_31B_A3_5B: return "31B.A3.5B";
         case LLM_TYPE_80B_A3B: return "80B.A3B";
         case LLM_TYPE_100B_A6B: return "100B.A6B";
+        case LLM_TYPE_102B_A12B: return "102B.A12B";
         case LLM_TYPE_106B_A12B: return "106B.A12B";
         case LLM_TYPE_230B_A10B: return "230B.A10B";
         case LLM_TYPE_235B_A22B: return "235B.A22B";
         case LLM_TYPE_300B_A47B: return "300B.A47B";
+        case LLM_TYPE_310B_A15B: return "310B.A15B";
         case LLM_TYPE_355B_A32B: return "355B.A32B";
         case LLM_TYPE_E2B: return "E2B";
         case LLM_TYPE_E4B: return "E4B";
@@ -603,7 +608,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
 
     ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false);
 
-    if (arch == LLM_ARCH_LLAMA || arch == LLM_ARCH_DECI || arch == LLM_ARCH_FALCON) {
+    if (arch == LLM_ARCH_LLAMA || arch == LLM_ARCH_DECI || arch == LLM_ARCH_FALCON || arch == LLM_ARCH_LLAMA_EMBED) {
         if (hparams.n_rot != hparams.n_embd_head_k) {
             throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd_head_k));
         }
@@ -627,6 +632,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
     // arch-specific KVs
     switch (arch) {
         case LLM_ARCH_LLAMA:
+        case LLM_ARCH_LLAMA_EMBED:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
 
@@ -875,6 +881,34 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     default: type = LLM_TYPE_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_MODERN_BERT:
+            {
+                const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
+                if (found_swa && hparams.n_swa > 0) {
+                    uint32_t swa_period = 3;
+                    hparams.swa_type = LLAMA_SWA_TYPE_SYMMETRIC;
+
+                    ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa);
+                    ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
+                    hparams.set_swa_pattern(swa_period);
+                } else {
+                    hparams.swa_type = LLAMA_SWA_TYPE_NONE;
+                }
+
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+                ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
+                ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
+
+                switch (hparams.n_layer) {
+                    case 12:
+                        type = LLM_TYPE_47M; break; // granite-embedding-small
+                    case 22:
+                        type = LLM_TYPE_149M; break; // modern-bert-base
+                    case 28:
+                        type = LLM_TYPE_395M; break; // modern-bert-large
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+            } break;
         case LLM_ARCH_JINA_BERT_V2:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
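Several of the new architectures in this diff (MODERN_BERT above, PLAMO3 and MIMO2 further down) drive per-layer sliding-window attention from a period read out of LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN. As a rough standalone sketch of what such a periodic pattern means (illustrative only; the helper below is hypothetical and the exact layer convention used by hparams.set_swa_pattern() is not shown in this diff):

```cpp
#include <cstdint>
#include <vector>

// Illustrative sketch: with period 3, two out of every three layers use
// windowed (local) attention and every third layer attends globally.
// The real mapping lives in llama_hparams::set_swa_pattern(); this helper
// only conveys the idea of a periodic layer mask.
static std::vector<bool> periodic_swa_mask(uint32_t n_layer, uint32_t period) {
    std::vector<bool> is_swa(n_layer, false);
    for (uint32_t il = 0; il < n_layer; ++il) {
        is_swa[il] = (period != 0) && ((il + 1) % period != 0);
    }
    return is_swa;
}
```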
@@ -1076,6 +1110,14 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     default: type = LLM_TYPE_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_MAINCODER:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+                switch (hparams.n_layer) {
+                    case 32: type = LLM_TYPE_1B; break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+            } break;
         case LLM_ARCH_QWEN3VL:
             {
                 ml.get_key(LLM_KV_NUM_DEEPSTACK_LAYERS, hparams.n_deepstack_layers, false);
@@ -1194,6 +1236,26 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                 ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH, hparams.n_embd_head_k, false);
                 ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH, hparams.n_embd_head_v, false);
             } break;
+        case LLM_ARCH_PLAMO3:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+                const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
+                if (found_swa && hparams.n_swa > 0) {
+                    uint32_t swa_period = 8;
+                    hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
+                    hparams.rope_freq_scale_train_swa = 1.0f;
+                    ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa);
+                    ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
+                    hparams.set_swa_pattern(swa_period);
+                } else {
+                    hparams.swa_type = LLAMA_SWA_TYPE_NONE;
+                }
+
+                switch (hparams.n_layer) {
+                    case 24: type = LLM_TYPE_2B; break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+            } break;
         case LLM_ARCH_GPT2:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -1629,7 +1691,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                 ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH_MLA, hparams.n_embd_head_v_mla, false);
                 ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
                 ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
-                ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
+                ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false);
                 ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
                 ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
                 if (hparams.expert_gating_func == LLAMA_EXPERT_GATING_FUNC_TYPE_NONE) {
@@ -1725,6 +1787,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
 
                 switch (hparams.n_layer) {
                     case 47: type = LLM_TYPE_106B_A12B; break; // GLM-4.5-Air (46 layers + 1 NextN layer)
+                    case 48: type = LLM_TYPE_102B_A12B; break; // Solar Open
                     case 93: type = LLM_TYPE_355B_A32B; break; // GLM-4.5 (92 layers + 1 NextN layer)
                     default: type = LLM_TYPE_UNKNOWN;
                 }
@@ -2307,6 +2370,22 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     default: type = LLM_TYPE_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_MIMO2:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+                hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
+
+                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
+                ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
+                ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa);
+                ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, hparams.swa_layers, hparams.n_layer);
+
+                switch (hparams.n_layer) {
+                    case 48: type = LLM_TYPE_310B_A15B; break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+            } break;
         default: throw std::runtime_error("unsupported model architecture");
     }
 
@@ -2329,11 +2408,11 @@ void llama_model::load_vocab(llama_model_loader & ml) {
 
 bool llama_model::load_tensors(llama_model_loader & ml) {
     const auto & split_mode   = params.split_mode;
-    const auto & n_gpu_layers = params.n_gpu_layers;
     const auto & use_mlock    = params.use_mlock;
     const auto & tensor_split = params.tensor_split;
 
-    const int n_layer = hparams.n_layer;
+    const int n_layer      = hparams.n_layer;
+    const int n_gpu_layers = this->n_gpu_layers();
 
     const bool use_mmap_buffer = true;
 
@@ -2621,6 +2700,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
         case LLM_ARCH_GRANITE:
         case LLM_ARCH_GRANITE_MOE:
        case LLM_ARCH_MISTRAL3:
+        case LLM_ARCH_LLAMA_EMBED:
             {
                 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
 
@@ -3155,6 +3235,37 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                     layer.layer_out_norm_b = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "bias", i), {n_embd}, 0);
                 }
             } break;
+        case LLM_ARCH_MODERN_BERT:
+            {
+                tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+                tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);
+
+                output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+
+                for(int i = 0; i < n_layer; ++i) {
+                    auto& layer = layers[i];
+
+                    if ( i != 0 ) {
+                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+                    } else{
+                        // layer 0 uses identity
+                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
+                    }
+
+
+                    layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, 3 * n_embd }, 0);
+                    layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+
+                    layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, 2 * n_ff}, 0);
+                    layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
+                    layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+                }
+
+                cls = create_tensor(tn(LLM_TENSOR_CLS, "weight"), {n_embd, n_embd}, TENSOR_NOT_REQUIRED);
+                cls_out = create_tensor(tn(LLM_TENSOR_CLS_OUT, "weight"), {n_embd, hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
+                cls_out_b = create_tensor(tn(LLM_TENSOR_CLS_OUT, "bias"), {hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
+
+            } break;
         case LLM_ARCH_NEO_BERT:
             {
                 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -3219,7 +3330,14 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                     layer.attn_norm_2_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
 
                     layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
-
+
+                    const auto tn_ffn_up_weight = tn(LLM_TENSOR_FFN_UP, "weight", i);
+                    ggml_tensor * t_ffn_up = ml.get_tensor_meta(tn_ffn_up_weight.str().c_str());
+                    const int64_t n_ffn_up = t_ffn_up ? t_ffn_up->ne[1] : n_ff;
+
+                    GGML_ASSERT(n_ffn_up == n_ff || n_ffn_up == n_ff * 2);
+                    layer.ffn_up = create_tensor(tn_ffn_up_weight, {n_embd, n_ffn_up}, 0);
+                    layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ffn_up}, TENSOR_NOT_REQUIRED);
 
                     layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
                     layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
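The JINA_BERT_V2 change above probes the GGUF metadata for the second dimension of ffn_up so it can accept either a plain MLP (n_ff) or a fused gate+up projection (2*n_ff). A minimal standalone sketch of that shape check (hypothetical helper, not part of llama.cpp):

```cpp
#include <cassert>
#include <cstdint>

// Hypothetical helper mirroring the shape probe above: a fused gate+up
// projection packs both halves into one matrix, so its second dimension
// is 2*n_ff rather than n_ff.
static bool is_fused_gate_up(int64_t ne1, int64_t n_ff) {
    assert(ne1 == n_ff || ne1 == 2 * n_ff);
    return ne1 == 2 * n_ff;
}
```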
@@ -3747,6 +3865,44 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                    layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, i), {n_embd}, 0);
                 }
             } break;
+        case LLM_ARCH_PLAMO3:
+            {
+                const int64_t head_dim_q = hparams.n_embd_head_k;
+                const int64_t head_dim_v = hparams.n_embd_head_v;
+
+                tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+                output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+                if (output == NULL) {
+                    output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+                }
+
+                for (int i = 0; i < n_layer; ++i) {
+                    auto & layer = layers[i];
+
+                    const int64_t num_attention_heads = hparams.n_head(i);
+                    const int64_t num_key_value_heads = hparams.n_head_kv(i);
+                    const int64_t q_proj_dim = num_attention_heads * head_dim_q;
+                    const int64_t k_proj_dim = num_key_value_heads * head_dim_q;
+                    const int64_t v_proj_dim = num_key_value_heads * head_dim_v;
+                    const int64_t n_ff_cur = hparams.n_ff(i);
+
+                    layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+                    layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i),
+                            {n_embd,q_proj_dim + k_proj_dim + v_proj_dim}, 0);
+                    layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {head_dim_q}, 0);
+                    layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {head_dim_q}, 0);
+                    layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {num_attention_heads * head_dim_v, n_embd}, 0);
+                    layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, i), {n_embd}, 0);
+
+                    layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+                    layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, i), {n_embd}, 0);
+
+                    layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff_cur * 2}, 0);
+                    layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff_cur, n_embd}, 0);
+                }
+            } break;
         case LLM_ARCH_GPT2:
             {
                 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -4637,7 +4793,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
 
                 // output
                 output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-                output
+                // try to load output.weight, if not found, use token_embd (tied embeddings)
+                output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+                if (!output) {
+                    output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+                }
 
                 for (int i = 0; i < n_layer; ++i) {
                     auto & layer = layers[i];
@@ -4700,7 +4860,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
 
                 // output
                 output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-                output
+                // try to load output.weight, if not found, use token_embd (tied embeddings)
+                output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+                if (!output) {
+                    output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+                }
 
                 for (int i = 0; i < n_layer; ++i) {
                     auto & layer = layers[i];
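The two hunks above (and the MAINCODER and PLAMO3 loaders elsewhere in this diff) all adopt the same tied-embedding fallback: load output.weight if it exists, otherwise reuse token_embd.weight. A toy sketch of the intent (illustrative names, not the llama.cpp API):

```cpp
// Illustrative only: when a GGUF ships without a dedicated output projection,
// the token embedding matrix is reused for the logits head (weight tying)
// instead of treating the missing tensor as an error.
struct toy_weights {
    const float * tok_embd = nullptr; // [n_embd x n_vocab]
    const float * output   = nullptr; // [n_embd x n_vocab] or absent
};

static const float * resolve_output_head(const toy_weights & w) {
    return w.output ? w.output : w.tok_embd;
}
```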
@@ -5067,9 +5231,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                    layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head }, flags);
                    layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_k_gqa }, flags);
                    layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_v_gqa }, flags);
-                    layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), { n_embd_head_k * n_head }, flags);
-                    layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), { n_embd_k_gqa }, flags);
-                    layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), { n_embd_v_gqa }, flags);
+                    layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), { n_embd_head_k * n_head }, TENSOR_NOT_REQUIRED | flags);
+                    layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), { n_embd_k_gqa }, TENSOR_NOT_REQUIRED | flags);
+                    layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), { n_embd_v_gqa }, TENSOR_NOT_REQUIRED | flags);
 
                    layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, flags);
 
@@ -5181,9 +5345,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                 const int64_t n_group = hparams.ssm_n_group;
                 const int64_t d_in_proj = 2*d_inner + 2*n_group*d_state + n_ssm_head;
 
-                const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
-                const int64_t n_ff_shexp = hparams.n_ff_shexp;
-
                 // embeddings
                 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
 
@@ -5235,6 +5396,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                        layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
                    } else {
                        if (n_expert != 0) {
+                            const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
+                            const int64_t n_ff_shexp = hparams.n_ff_shexp;
+
                            layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert}, 0);
                            layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert }, 0);
 
@@ -6584,6 +6748,75 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                    layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { hparams.n_ff_shexp, n_embd }, 0);
                 }
             } break;
+        case LLM_ARCH_MIMO2:
+            {
+                tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+                // output
+                output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
+
+                for (int i = 0; i < n_layer; ++i) {
+                    auto & layer = layers[i];
+                    uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(i);
+                    uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(i);
+                    uint32_t n_head = hparams.n_head(i);
+
+                    layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head }, 0);
+                    layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_k_gqa }, 0);
+                    layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_v_gqa }, 0);
+                    layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_v * n_head, n_embd }, 0);
+
+                    layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+                    layer.attn_sinks = create_tensor(tn(LLM_TENSOR_ATTN_SINKS, "weight", i), {n_head}, TENSOR_NOT_REQUIRED);
+
+                    layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+                    // non-MoE branch
+                    layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
+                    layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, TENSOR_NOT_REQUIRED);
+                    layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
+
+                    // MoE branch
+                    int64_t n_ff_exp = hparams.n_ff_exp;
+                    layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, TENSOR_NOT_REQUIRED);
+                    layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, TENSOR_NOT_REQUIRED);
+                    layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, TENSOR_NOT_REQUIRED);
+                    layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, TENSOR_NOT_REQUIRED);
+                    layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED);
+                }
+            } break;
+        case LLM_ARCH_MAINCODER:
+            {
+                tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+                // output
+                output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+                // if output is NULL, init from the input tok embed
+                if (output == NULL) {
+                    output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+                }
+
+                for (int i = 0; i < n_layer; ++i) {
+                    auto & layer = layers[i];
+
+                    layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+                    layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
+                    layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
+                    layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
+                    layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
+
+                    layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
+                    layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
+
+                    layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+                    layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+                    layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+                    layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+                }
+            } break;
         default:
             throw std::runtime_error("unknown architecture");
         }
@@ -6765,6 +6998,14 @@ size_t llama_model::n_devices() const {
     return devices.size();
 }
 
+uint32_t llama_model::n_gpu_layers() const {
+    return params.n_gpu_layers >= 0 ? params.n_gpu_layers : hparams.n_layer + 1;
+}
+
+llama_split_mode llama_model::split_mode() const {
+    return params.split_mode;
+}
+
 std::map<ggml_backend_buffer_type_t, size_t> llama_model::memory_breakdown() const {
     std::map<ggml_backend_buffer_type_t, size_t> ret;
     for (const auto & [ctx, bufs] : pimpl->ctxs_bufs) {
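Two small public accessors are added above; together with the new n_gpu_layers default of -1 (see the llama_model_default_params() hunk further down), a negative request now resolves to "offload every layer plus the output layer". A rough sketch of that resolution rule (hypothetical helper for illustration; the real logic is the one-liner above):

```cpp
#include <cstdint>

// Hypothetical illustration of llama_model::n_gpu_layers() above:
// a non-negative request is passed through unchanged, while the new
// default of -1 resolves to all repeating layers plus the output layer.
static uint32_t resolve_n_gpu_layers(int32_t requested, uint32_t n_layer) {
    return requested >= 0 ? (uint32_t) requested : n_layer + 1;
}

// resolve_n_gpu_layers(-1, 32) == 33   // default: offload everything
// resolve_n_gpu_layers(10, 32) == 10   // explicit count wins
```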
@@ -7089,6 +7330,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
         case LLM_ARCH_NOMIC_BERT_MOE:
         case LLM_ARCH_NEO_BERT:
         case LLM_ARCH_WAVTOKENIZER_DEC:
+        case LLM_ARCH_MODERN_BERT:
         case LLM_ARCH_GEMMA_EMBEDDING:
         case LLM_ARCH_DREAM:
         case LLM_ARCH_LLADA:
@@ -7206,16 +7448,24 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
     switch (arch) {
         case LLM_ARCH_LLAMA:
             {
-                llm = std::make_unique<llm_build_llama>(*this, params);
+                llm = std::make_unique<llm_build_llama<false>>(*this, params);
             } break;
         case LLM_ARCH_LLAMA4:
             {
                 if (hparams.swa_type == LLAMA_SWA_TYPE_NONE) {
-                    llm = std::make_unique<llm_build_llama>(*this, params);
+                    llm = std::make_unique<llm_build_llama<false>>(*this, params);
                 } else {
                     llm = std::make_unique<llm_build_llama_iswa>(*this, params);
                 }
             } break;
+        case LLM_ARCH_LLAMA_EMBED:
+            {
+                llm = std::make_unique<llm_build_llama<true>>(*this, params);
+            } break;
+        case LLM_ARCH_MAINCODER:
+            {
+                llm = std::make_unique<llm_build_maincoder>(*this, params);
+            } break;
         case LLM_ARCH_DECI:
             {
                 llm = std::make_unique<llm_build_deci>(*this, params);
@@ -7248,6 +7498,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
             {
                 llm = std::make_unique<llm_build_bert>(*this, params);
             } break;
+        case LLM_ARCH_MODERN_BERT:
+            {
+                llm = std::make_unique<llm_build_modern_bert>(*this, params);
+            } break;
         case LLM_ARCH_NEO_BERT:
             {
                 llm = std::make_unique<llm_build_neo_bert>(*this, params);
@@ -7337,6 +7591,14 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
             {
                 llm = std::make_unique<llm_build_plamo2>(*this, params);
             } break;
+        case LLM_ARCH_PLAMO3:
+            {
+                if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
+                    llm = std::make_unique<llm_build_plamo3<true>> (*this, params);
+                } else {
+                    llm = std::make_unique<llm_build_plamo3<false>>(*this, params);
+                }
+            } break;
         case LLM_ARCH_GPT2:
             {
                 llm = std::make_unique<llm_build_gpt2>(*this, params);
@@ -7637,6 +7899,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
             {
                 llm = std::make_unique<llm_build_mistral3>(*this, params);
             } break;
+        case LLM_ARCH_MIMO2:
+            {
+                llm = std::make_unique<llm_build_mimo2_iswa>(*this, params);
+            } break;
         default:
             GGML_ABORT("fatal error");
     }
@@ -7662,7 +7928,7 @@ llama_model_params llama_model_default_params() {
     llama_model_params result = {
         /*.devices =*/ nullptr,
         /*.tensor_buft_overrides =*/ nullptr,
-        /*.n_gpu_layers =*/
+        /*.n_gpu_layers =*/ -1,
         /*.split_mode =*/ LLAMA_SPLIT_MODE_LAYER,
         /*.main_gpu =*/ 0,
         /*.tensor_split =*/ nullptr,
@@ -7807,6 +8073,8 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
         case LLM_ARCH_ERNIE4_5:
         case LLM_ARCH_ERNIE4_5_MOE:
         case LLM_ARCH_MISTRAL3:
+        case LLM_ARCH_LLAMA_EMBED:
+        case LLM_ARCH_MAINCODER:
            return LLAMA_ROPE_TYPE_NORM;
 
        // the pairs of head values are offset by n_rot/2
@@ -7816,6 +8084,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
         case LLM_ARCH_DBRX:
         case LLM_ARCH_BERT:
         case LLM_ARCH_JINA_BERT_V3:
+        case LLM_ARCH_MODERN_BERT:
         case LLM_ARCH_NOMIC_BERT:
         case LLM_ARCH_NOMIC_BERT_MOE:
         case LLM_ARCH_STABLELM:
@@ -7835,6 +8104,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
         case LLM_ARCH_PHIMOE:
         case LLM_ARCH_PLAMO:
         case LLM_ARCH_PLAMO2:
+        case LLM_ARCH_PLAMO3:
         case LLM_ARCH_GEMMA:
         case LLM_ARCH_GEMMA2:
         case LLM_ARCH_GEMMA3:
@@ -7865,6 +8135,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
         case LLM_ARCH_PANGU_EMBED:
         case LLM_ARCH_AFMOE:
         case LLM_ARCH_QWEN3NEXT:
+        case LLM_ARCH_MIMO2:
            return LLAMA_ROPE_TYPE_NEOX;
 
         case LLM_ARCH_QWEN2VL:

package/src/llama.cpp/src/llama-model.h

@@ -24,12 +24,14 @@ enum llm_type {
     LLM_TYPE_17M,
     LLM_TYPE_22M,
     LLM_TYPE_33M,
+    LLM_TYPE_47M,
     LLM_TYPE_60M,
     LLM_TYPE_70M,
     LLM_TYPE_80M,
     LLM_TYPE_109M,
     LLM_TYPE_137M,
     LLM_TYPE_140M,
+    LLM_TYPE_149M,
     LLM_TYPE_160M,
     LLM_TYPE_190M,
     LLM_TYPE_220M,
@@ -39,6 +41,7 @@ enum llm_type {
     LLM_TYPE_335M,
     LLM_TYPE_350M,
     LLM_TYPE_360M,
+    LLM_TYPE_395M,
     LLM_TYPE_410M,
     LLM_TYPE_450M,
     LLM_TYPE_475M,
@@ -116,10 +119,12 @@ enum llm_type {
     LLM_TYPE_31B_A3_5B,
     LLM_TYPE_80B_A3B, // Qwen3 Next
     LLM_TYPE_100B_A6B,
+    LLM_TYPE_102B_A12B, // Solar-Open
     LLM_TYPE_106B_A12B, // GLM-4.5-Air
     LLM_TYPE_230B_A10B, // Minimax M2
     LLM_TYPE_235B_A22B,
     LLM_TYPE_300B_A47B, // Ernie MoE big
+    LLM_TYPE_310B_A15B, // MiMo-V2-Flash
     LLM_TYPE_355B_A32B, // GLM-4.5
     LLM_TYPE_E2B,
     LLM_TYPE_E4B,
@@ -462,8 +467,6 @@ struct llama_model {
     struct ggml_tensor * dense_2_out_layers = nullptr;
     struct ggml_tensor * dense_3_out_layers = nullptr;
 
-    llama_model_params params;
-
     // gguf metadata
     std::unordered_map<std::string, std::string> gguf_kv;
 
@@ -473,6 +476,9 @@ struct llama_model {
     // for quantize-stats only
     std::vector<std::pair<std::string, struct ggml_tensor *>> tensors_by_name;
 
+    // for keeping track of extra nodes used by lora adapters
+    uint32_t n_lora_nodes = 0;
+
     int64_t t_load_us = 0;
     int64_t t_start_us = 0;
 
@@ -494,6 +500,9 @@ struct llama_model {
     size_t n_tensors() const;
     size_t n_devices() const;
 
+    uint32_t n_gpu_layers() const;
+    llama_split_mode split_mode() const;
+
     std::map<ggml_backend_buffer_type_t, size_t> memory_breakdown() const;
 
     // total number of parameters in the model
@@ -522,6 +531,8 @@ struct llama_model {
     ggml_cgraph * build_graph(const llm_graph_params & params) const;
 
 private:
+    llama_model_params params;
+
     struct impl;
     std::unique_ptr<impl> pimpl;
 };