@fugood/llama.node 1.4.11 → 1.4.13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +15 -15
- package/scripts/llama.cpp.patch +31 -31
- package/src/llama.cpp/common/arg.cpp +128 -59
- package/src/llama.cpp/common/arg.h +1 -0
- package/src/llama.cpp/common/chat-parser.cpp +11 -0
- package/src/llama.cpp/common/chat.cpp +36 -7
- package/src/llama.cpp/common/chat.h +1 -0
- package/src/llama.cpp/common/common.cpp +42 -23
- package/src/llama.cpp/common/common.h +11 -1
- package/src/llama.cpp/common/llguidance.cpp +10 -6
- package/src/llama.cpp/common/regex-partial.cpp +13 -13
- package/src/llama.cpp/common/sampling.cpp +58 -14
- package/src/llama.cpp/common/sampling.h +3 -1
- package/src/llama.cpp/ggml/CMakeLists.txt +13 -1
- package/src/llama.cpp/ggml/include/ggml-backend.h +1 -1
- package/src/llama.cpp/ggml/src/CMakeLists.txt +23 -9
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +12 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +1 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +86 -25
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +15 -8
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +768 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +0 -4
- package/src/llama.cpp/include/llama.h +100 -12
- package/src/llama.cpp/src/CMakeLists.txt +4 -0
- package/src/llama.cpp/src/llama-adapter.cpp +12 -3
- package/src/llama.cpp/src/llama-adapter.h +7 -1
- package/src/llama.cpp/src/llama-arch.cpp +78 -0
- package/src/llama.cpp/src/llama-arch.h +8 -0
- package/src/llama.cpp/src/llama-chat.cpp +11 -0
- package/src/llama.cpp/src/llama-chat.h +1 -0
- package/src/llama.cpp/src/llama-context.cpp +637 -49
- package/src/llama.cpp/src/llama-context.h +43 -1
- package/src/llama.cpp/src/llama-grammar.cpp +40 -13
- package/src/llama.cpp/src/llama-grammar.h +2 -0
- package/src/llama.cpp/src/llama-graph.cpp +173 -5
- package/src/llama.cpp/src/llama-graph.h +71 -6
- package/src/llama.cpp/src/llama-hparams.cpp +4 -0
- package/src/llama.cpp/src/llama-hparams.h +12 -5
- package/src/llama.cpp/src/llama-kv-cache.h +1 -1
- package/src/llama.cpp/src/llama-mmap.cpp +11 -4
- package/src/llama.cpp/src/llama-model-loader.cpp +23 -0
- package/src/llama.cpp/src/llama-model-loader.h +2 -0
- package/src/llama.cpp/src/llama-model-saver.cpp +3 -0
- package/src/llama.cpp/src/llama-model.cpp +337 -26
- package/src/llama.cpp/src/llama-model.h +13 -2
- package/src/llama.cpp/src/llama-sampling.cpp +1259 -186
- package/src/llama.cpp/src/llama-sampling.h +19 -7
- package/src/llama.cpp/src/llama-vocab.cpp +101 -33
- package/src/llama.cpp/src/llama-vocab.h +2 -0
- package/src/llama.cpp/src/llama.cpp +87 -64
- package/src/llama.cpp/src/models/afmoe.cpp +9 -5
- package/src/llama.cpp/src/models/bert.cpp +4 -2
- package/src/llama.cpp/src/models/cogvlm.cpp +5 -3
- package/src/llama.cpp/src/models/cohere2-iswa.cpp +3 -0
- package/src/llama.cpp/src/models/deepseek2.cpp +1 -1
- package/src/llama.cpp/src/models/gemma-embedding.cpp +2 -6
- package/src/llama.cpp/src/models/gemma2-iswa.cpp +5 -2
- package/src/llama.cpp/src/models/gemma3.cpp +3 -4
- package/src/llama.cpp/src/models/gemma3n-iswa.cpp +4 -7
- package/src/llama.cpp/src/models/llama-iswa.cpp +6 -2
- package/src/llama.cpp/src/models/llama.cpp +19 -6
- package/src/llama.cpp/src/models/maincoder.cpp +117 -0
- package/src/llama.cpp/src/models/mimo2-iswa.cpp +123 -0
- package/src/llama.cpp/src/models/models.h +18 -0
- package/src/llama.cpp/src/models/modern-bert.cpp +116 -0
- package/src/llama.cpp/src/models/openai-moe-iswa.cpp +5 -2
- package/src/llama.cpp/src/models/plamo3.cpp +128 -0
- package/src/llama.cpp/src/models/smallthinker.cpp +11 -5
- package/src/llama.cpp/src/unicode.cpp +23 -14
package/src/llama.cpp/src/llama-model.cpp
@@ -31,12 +31,14 @@ const char * llm_type_name(llm_type type) {
 case LLM_TYPE_17M: return "17M";
 case LLM_TYPE_22M: return "22M";
 case LLM_TYPE_33M: return "33M";
+case LLM_TYPE_47M: return "47M";
 case LLM_TYPE_60M: return "60M";
 case LLM_TYPE_70M: return "70M";
 case LLM_TYPE_80M: return "80M";
 case LLM_TYPE_109M: return "109M";
 case LLM_TYPE_137M: return "137M";
 case LLM_TYPE_140M: return "140M";
+case LLM_TYPE_149M: return "149M";
 case LLM_TYPE_160M: return "160M";
 case LLM_TYPE_190M: return "190M";
 case LLM_TYPE_220M: return "220M";
@@ -46,6 +48,7 @@ const char * llm_type_name(llm_type type) {
 case LLM_TYPE_335M: return "335M";
 case LLM_TYPE_350M: return "350M";
 case LLM_TYPE_360M: return "360M";
+case LLM_TYPE_395M: return "395M";
 case LLM_TYPE_410M: return "410M";
 case LLM_TYPE_450M: return "450M";
 case LLM_TYPE_475M: return "475M";
@@ -123,10 +126,12 @@ const char * llm_type_name(llm_type type) {
 case LLM_TYPE_31B_A3_5B: return "31B.A3.5B";
 case LLM_TYPE_80B_A3B: return "80B.A3B";
 case LLM_TYPE_100B_A6B: return "100B.A6B";
+case LLM_TYPE_102B_A12B: return "102B.A12B";
 case LLM_TYPE_106B_A12B: return "106B.A12B";
 case LLM_TYPE_230B_A10B: return "230B.A10B";
 case LLM_TYPE_235B_A22B: return "235B.A22B";
 case LLM_TYPE_300B_A47B: return "300B.A47B";
+case LLM_TYPE_310B_A15B: return "310B.A15B";
 case LLM_TYPE_355B_A32B: return "355B.A32B";
 case LLM_TYPE_E2B: return "E2B";
 case LLM_TYPE_E4B: return "E4B";
@@ -502,6 +507,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {

 ml.get_key(LLM_KV_CONTEXT_LENGTH, hparams.n_ctx_train);
 ml.get_key(LLM_KV_EMBEDDING_LENGTH, hparams.n_embd);
+ml.get_key(LLM_KV_EMBEDDING_LENGTH_OUT, hparams.n_embd_out, false);
 ml.get_key(LLM_KV_BLOCK_COUNT, hparams.n_layer);
 ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert, false);
 ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false);
@@ -573,6 +579,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
 hparams.rope_scaling_type_train = llama_rope_scaling_type_from_string(rope_scaling);
 GGML_ASSERT(hparams.rope_scaling_type_train != LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED);

+// TODO: Handle SWA metadata similarly when models start implementing it
 // rope_freq_scale (inverse of the kv) is optional
 float ropescale = 0.0f;
 if (!ml.get_key(LLM_KV_ROPE_SCALING_FACTOR, ropescale, false)) {
@@ -581,10 +588,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
 }
 hparams.rope_freq_scale_train = ropescale == 0.0f ? 1.0f : 1.0f/ropescale;

-// by default assume that the sliding-window layers use the same scaling type as the non-sliding-window layers
-hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
-hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
-
 ml.get_key(LLM_KV_ROPE_SCALING_ATTN_FACTOR, hparams.rope_attn_factor, false);

 // non-transformer models do not have attention heads
@@ -603,7 +606,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {

 ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false);

-if (arch == LLM_ARCH_LLAMA || arch == LLM_ARCH_DECI || arch == LLM_ARCH_FALCON) {
+if (arch == LLM_ARCH_LLAMA || arch == LLM_ARCH_DECI || arch == LLM_ARCH_FALCON || arch == LLM_ARCH_LLAMA_EMBED) {
 if (hparams.n_rot != hparams.n_embd_head_k) {
 throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd_head_k));
 }
@@ -627,6 +630,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
 // arch-specific KVs
 switch (arch) {
 case LLM_ARCH_LLAMA:
+case LLM_ARCH_LLAMA_EMBED:
 {
 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);

@@ -671,6 +675,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
 hparams.f_attn_temp_scale = 0.1f;
 hparams.f_attn_temp_offset = 1.0f;
 hparams.set_swa_pattern(4); // pattern: 3 chunked - 1 full
+
+hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
+hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
+ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
 }

 switch (hparams.n_expert) {
@@ -716,6 +724,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
 if (hparams.n_swa > 0) {
 hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
 hparams.set_swa_pattern(4);
+
+hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
+hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
+ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
 } else {
 hparams.swa_type = LLAMA_SWA_TYPE_NONE;
 }
@@ -875,6 +887,34 @@ void llama_model::load_hparams(llama_model_loader & ml) {
 default: type = LLM_TYPE_UNKNOWN;
 }
 } break;
+case LLM_ARCH_MODERN_BERT:
+{
+const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
+if (found_swa && hparams.n_swa > 0) {
+uint32_t swa_period = 3;
+hparams.swa_type = LLAMA_SWA_TYPE_SYMMETRIC;
+
+ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa);
+ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
+hparams.set_swa_pattern(swa_period);
+} else {
+hparams.swa_type = LLAMA_SWA_TYPE_NONE;
+}
+
+ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
+ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
+
+switch (hparams.n_layer) {
+case 12:
+type = LLM_TYPE_47M; break; // granite-embedding-small
+case 22:
+type = LLM_TYPE_149M; break; // modern-bert-base
+case 28:
+type = LLM_TYPE_395M; break; // modern-bert-large
+default: type = LLM_TYPE_UNKNOWN;
+}
+} break;
 case LLM_ARCH_JINA_BERT_V2:
 {
 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -1076,6 +1116,14 @@ void llama_model::load_hparams(llama_model_loader & ml) {
 default: type = LLM_TYPE_UNKNOWN;
 }
 } break;
+case LLM_ARCH_MAINCODER:
+{
+ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+switch (hparams.n_layer) {
+case 32: type = LLM_TYPE_1B; break;
+default: type = LLM_TYPE_UNKNOWN;
+}
+} break;
 case LLM_ARCH_QWEN3VL:
 {
 ml.get_key(LLM_KV_NUM_DEEPSTACK_LAYERS, hparams.n_deepstack_layers, false);
@@ -1194,6 +1242,25 @@ void llama_model::load_hparams(llama_model_loader & ml) {
 ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH, hparams.n_embd_head_k, false);
 ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH, hparams.n_embd_head_v, false);
 } break;
+case LLM_ARCH_PLAMO3:
+{
+ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
+if (found_swa && hparams.n_swa > 0) {
+uint32_t swa_period = 8;
+hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
+ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa);
+ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
+hparams.set_swa_pattern(swa_period);
+} else {
+hparams.swa_type = LLAMA_SWA_TYPE_NONE;
+}
+
+switch (hparams.n_layer) {
+case 24: type = LLM_TYPE_2B; break;
+default: type = LLM_TYPE_UNKNOWN;
+}
+} break;
 case LLM_ARCH_GPT2:
 {
 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -1247,7 +1314,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
 hparams.n_swa = 4096; // default value of gemma 2
 hparams.set_swa_pattern(2);
 hparams.attn_soft_cap = true;
+hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
+hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;

+ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
 ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
 ml.get_key(LLM_KV_ATTN_LOGIT_SOFTCAPPING, hparams.f_attn_logit_softcapping, false);
@@ -1272,8 +1342,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
 hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
 hparams.set_swa_pattern(6);

-hparams.rope_freq_base_train_swa = 10000.0f;
-hparams.rope_freq_scale_train_swa = 1.0f;
+ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
 } else {
 hparams.swa_type = LLAMA_SWA_TYPE_NONE;
 }
@@ -1303,10 +1372,9 @@ void llama_model::load_hparams(llama_model_loader & ml) {
 hparams.set_swa_pattern(5);

 hparams.n_layer_kv_from_start = 20;
-hparams.rope_freq_base_train_swa = 10000.0f;
-hparams.rope_freq_scale_train_swa = 1.0f;
 hparams.f_attention_scale = 1.0f;

+ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
 ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);

@@ -1322,9 +1390,8 @@ void llama_model::load_hparams(llama_model_loader & ml) {
 hparams.set_swa_pattern(6);

 hparams.causal_attn = false; // embeddings do not use causal attention
-hparams.rope_freq_base_train_swa = 10000.0f;
-hparams.rope_freq_scale_train_swa = 1.0f;

+ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
 ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
 ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
@@ -1463,7 +1530,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
 {
 hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
 hparams.set_swa_pattern(4);
+hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
+hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;

+ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
 ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
 ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -1502,6 +1572,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
 if (found_swa && hparams.n_swa > 0) {
 hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
 hparams.set_swa_pattern(4);
+
+hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
+hparams.rope_freq_scale_train_swa = 1.0; // See olmo2.cpp
+ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
 } else {
 hparams.swa_type = LLAMA_SWA_TYPE_NONE;
 }
@@ -1629,7 +1703,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
 ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH_MLA, hparams.n_embd_head_v_mla, false);
 ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
 ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
-ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
+ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false);
 ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
 ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
 if (hparams.expert_gating_func == LLAMA_EXPERT_GATING_FUNC_TYPE_NONE) {
@@ -1725,6 +1799,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {

 switch (hparams.n_layer) {
 case 47: type = LLM_TYPE_106B_A12B; break; // GLM-4.5-Air (46 layers + 1 NextN layer)
+case 48: type = LLM_TYPE_102B_A12B; break; // Solar Open
 case 93: type = LLM_TYPE_355B_A32B; break; // GLM-4.5 (92 layers + 1 NextN layer)
 default: type = LLM_TYPE_UNKNOWN;
 }
@@ -1843,6 +1918,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
 hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
 hparams.n_swa = 4096;
 hparams.set_swa_pattern(4);
+
+hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
+hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
+ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
 }

 ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
@@ -2145,6 +2224,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
 hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
 hparams.set_swa_pattern(2);

+hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
+hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
+ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
+
 switch (hparams.n_layer) {
 case 24: type = LLM_TYPE_20B; break;
 case 36: type = LLM_TYPE_120B; break;
@@ -2189,6 +2272,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
 hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
 hparams.n_swa = 4096;
 hparams.set_swa_pattern(4, true);
+
+hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
+hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
+ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
 } else {
 hparams.swa_type = LLAMA_SWA_TYPE_NONE;
 hparams.n_no_rope_layer_step = hparams.n_layer;
@@ -2307,6 +2394,22 @@ void llama_model::load_hparams(llama_model_loader & ml) {
 default: type = LLM_TYPE_UNKNOWN;
 }
 } break;
+case LLM_ARCH_MIMO2:
+{
+ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
+
+ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
+ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
+ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa);
+ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, hparams.swa_layers, hparams.n_layer);
+
+switch (hparams.n_layer) {
+case 48: type = LLM_TYPE_310B_A15B; break;
+default: type = LLM_TYPE_UNKNOWN;
+}
+} break;
 default: throw std::runtime_error("unsupported model architecture");
 }

@@ -2329,11 +2432,11 @@ void llama_model::load_vocab(llama_model_loader & ml) {

 bool llama_model::load_tensors(llama_model_loader & ml) {
 const auto & split_mode = params.split_mode;
-const auto & n_gpu_layers = params.n_gpu_layers;
 const auto & use_mlock = params.use_mlock;
 const auto & tensor_split = params.tensor_split;

-const int n_layer = hparams.n_layer;
+const int n_layer = hparams.n_layer;
+const int n_gpu_layers = this->n_gpu_layers();

 const bool use_mmap_buffer = true;

@@ -2621,6 +2724,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
 case LLM_ARCH_GRANITE:
 case LLM_ARCH_GRANITE_MOE:
 case LLM_ARCH_MISTRAL3:
+case LLM_ARCH_LLAMA_EMBED:
 {
 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

@@ -3155,6 +3259,37 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
 layer.layer_out_norm_b = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "bias", i), {n_embd}, 0);
 }
 } break;
+case LLM_ARCH_MODERN_BERT:
+{
+tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);
+
+output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+
+for(int i = 0; i < n_layer; ++i) {
+auto& layer = layers[i];
+
+if ( i != 0 ) {
+layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+} else{
+// layer 0 uses identity
+layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
+}
+
+
+layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, 3 * n_embd }, 0);
+layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+
+layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, 2 * n_ff}, 0);
+layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
+layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+}
+
+cls = create_tensor(tn(LLM_TENSOR_CLS, "weight"), {n_embd, n_embd}, TENSOR_NOT_REQUIRED);
+cls_out = create_tensor(tn(LLM_TENSOR_CLS_OUT, "weight"), {n_embd, hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
+cls_out_b = create_tensor(tn(LLM_TENSOR_CLS_OUT, "bias"), {hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
+
+} break;
 case LLM_ARCH_NEO_BERT:
 {
 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -3219,7 +3354,14 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
 layer.attn_norm_2_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);

 layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
-
+
+const auto tn_ffn_up_weight = tn(LLM_TENSOR_FFN_UP, "weight", i);
+ggml_tensor * t_ffn_up = ml.get_tensor_meta(tn_ffn_up_weight.str().c_str());
+const int64_t n_ffn_up = t_ffn_up ? t_ffn_up->ne[1] : n_ff;
+
+GGML_ASSERT(n_ffn_up == n_ff || n_ffn_up == n_ff * 2);
+layer.ffn_up = create_tensor(tn_ffn_up_weight, {n_embd, n_ffn_up}, 0);
+layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ffn_up}, TENSOR_NOT_REQUIRED);

 layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
 layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
@@ -3747,6 +3889,44 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
 layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, i), {n_embd}, 0);
 }
 } break;
+case LLM_ARCH_PLAMO3:
+{
+const int64_t head_dim_q = hparams.n_embd_head_k;
+const int64_t head_dim_v = hparams.n_embd_head_v;
+
+tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+if (output == NULL) {
+output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+}
+
+for (int i = 0; i < n_layer; ++i) {
+auto & layer = layers[i];
+
+const int64_t num_attention_heads = hparams.n_head(i);
+const int64_t num_key_value_heads = hparams.n_head_kv(i);
+const int64_t q_proj_dim = num_attention_heads * head_dim_q;
+const int64_t k_proj_dim = num_key_value_heads * head_dim_q;
+const int64_t v_proj_dim = num_key_value_heads * head_dim_v;
+const int64_t n_ff_cur = hparams.n_ff(i);
+
+layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i),
+{n_embd,q_proj_dim + k_proj_dim + v_proj_dim}, 0);
+layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {head_dim_q}, 0);
+layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {head_dim_q}, 0);
+layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {num_attention_heads * head_dim_v, n_embd}, 0);
+layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, i), {n_embd}, 0);
+
+layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, i), {n_embd}, 0);
+
+layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff_cur * 2}, 0);
+layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff_cur, n_embd}, 0);
+}
+} break;
 case LLM_ARCH_GPT2:
 {
 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -4637,7 +4817,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) {

 // output
 output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-output
+// try to load output.weight, if not found, use token_embd (tied embeddings)
+output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+if (!output) {
+output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+}

 for (int i = 0; i < n_layer; ++i) {
 auto & layer = layers[i];
@@ -4700,7 +4884,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) {

 // output
 output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-output
+// try to load output.weight, if not found, use token_embd (tied embeddings)
+output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+if (!output) {
+output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+}

 for (int i = 0; i < n_layer; ++i) {
 auto & layer = layers[i];
@@ -5067,9 +5255,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
 layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head }, flags);
 layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_k_gqa }, flags);
 layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_v_gqa }, flags);
-layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), { n_embd_head_k * n_head }, flags);
-layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), { n_embd_k_gqa }, flags);
-layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), { n_embd_v_gqa }, flags);
+layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), { n_embd_head_k * n_head }, TENSOR_NOT_REQUIRED | flags);
+layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), { n_embd_k_gqa }, TENSOR_NOT_REQUIRED | flags);
+layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), { n_embd_v_gqa }, TENSOR_NOT_REQUIRED | flags);

 layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, flags);

@@ -5181,9 +5369,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
 const int64_t n_group = hparams.ssm_n_group;
 const int64_t d_in_proj = 2*d_inner + 2*n_group*d_state + n_ssm_head;

-const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
-const int64_t n_ff_shexp = hparams.n_ff_shexp;
-
 // embeddings
 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

@@ -5235,6 +5420,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
 layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
 } else {
 if (n_expert != 0) {
+const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
+const int64_t n_ff_shexp = hparams.n_ff_shexp;
+
 layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert}, 0);
 layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert }, 0);

@@ -6282,6 +6470,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
 layer.shortconv.out_proj = create_tensor(tn(LLM_TENSOR_SHORTCONV_OUTPROJ, "weight", i), {n_embd, n_embd}, 0);
 }
 }
+
+// for LFM2-ColBert-350M
+dense_2_out_layers = create_tensor(tn(LLM_TENSOR_DENSE_2_OUT, "weight"), {n_embd, hparams.get_n_embd_out()}, TENSOR_NOT_REQUIRED);
 } break;
 case LLM_ARCH_SMALLTHINKER:
 {
@@ -6584,6 +6775,75 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
 layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { hparams.n_ff_shexp, n_embd }, 0);
 }
 } break;
+case LLM_ARCH_MIMO2:
+{
+tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+// output
+output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
+
+for (int i = 0; i < n_layer; ++i) {
+auto & layer = layers[i];
+uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(i);
+uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(i);
+uint32_t n_head = hparams.n_head(i);
+
+layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head }, 0);
+layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_k_gqa }, 0);
+layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_v_gqa }, 0);
+layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_v * n_head, n_embd }, 0);
+
+layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+layer.attn_sinks = create_tensor(tn(LLM_TENSOR_ATTN_SINKS, "weight", i), {n_head}, TENSOR_NOT_REQUIRED);
+
+layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+// non-MoE branch
+layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
+layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, TENSOR_NOT_REQUIRED);
+layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
+
+// MoE branch
+int64_t n_ff_exp = hparams.n_ff_exp;
+layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, TENSOR_NOT_REQUIRED);
+layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, TENSOR_NOT_REQUIRED);
+layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, TENSOR_NOT_REQUIRED);
+layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, TENSOR_NOT_REQUIRED);
+layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED);
+}
+} break;
+case LLM_ARCH_MAINCODER:
+{
+tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+// output
+output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+// if output is NULL, init from the input tok embed
+if (output == NULL) {
+output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+}
+
+for (int i = 0; i < n_layer; ++i) {
+auto & layer = layers[i];
+
+layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
+layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
+layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
+layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
+
+layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
+layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
+
+layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+}
+} break;
 default:
 throw std::runtime_error("unknown architecture");
 }
@@ -6765,6 +7025,14 @@ size_t llama_model::n_devices() const {
 return devices.size();
 }

+uint32_t llama_model::n_gpu_layers() const {
+return params.n_gpu_layers >= 0 ? params.n_gpu_layers : hparams.n_layer + 1;
+}
+
+llama_split_mode llama_model::split_mode() const {
+return params.split_mode;
+}
+
 std::map<ggml_backend_buffer_type_t, size_t> llama_model::memory_breakdown() const {
 std::map<ggml_backend_buffer_type_t, size_t> ret;
 for (const auto & [ctx, bufs] : pimpl->ctxs_bufs) {
@@ -6857,6 +7125,10 @@ void llama_model::print_info() const {
 LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type.c_str());
 LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train);
 LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);
+if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
+LLAMA_LOG_INFO("%s: freq_base_swa = %.1f\n", __func__, hparams.rope_freq_base_train_swa);
+LLAMA_LOG_INFO("%s: freq_scale_swa = %g\n", __func__, hparams.rope_freq_scale_train_swa);
+}
 LLAMA_LOG_INFO("%s: n_ctx_orig_yarn = %u\n", __func__, hparams.n_ctx_orig_yarn);
 LLAMA_LOG_INFO("%s: rope_yarn_log_mul= %.4f\n", __func__, hparams.rope_yarn_log_mul);
 LLAMA_LOG_INFO("%s: rope_finetuned = %s\n", __func__, hparams.rope_finetuned ? "yes" : "unknown");
@@ -7089,6 +7361,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
 case LLM_ARCH_NOMIC_BERT_MOE:
 case LLM_ARCH_NEO_BERT:
 case LLM_ARCH_WAVTOKENIZER_DEC:
+case LLM_ARCH_MODERN_BERT:
 case LLM_ARCH_GEMMA_EMBEDDING:
 case LLM_ARCH_DREAM:
 case LLM_ARCH_LLADA:
@@ -7206,16 +7479,24 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
 switch (arch) {
 case LLM_ARCH_LLAMA:
 {
-llm = std::make_unique<llm_build_llama>(*this, params);
+llm = std::make_unique<llm_build_llama<false>>(*this, params);
 } break;
 case LLM_ARCH_LLAMA4:
 {
 if (hparams.swa_type == LLAMA_SWA_TYPE_NONE) {
-llm = std::make_unique<llm_build_llama>(*this, params);
+llm = std::make_unique<llm_build_llama<false>>(*this, params);
 } else {
 llm = std::make_unique<llm_build_llama_iswa>(*this, params);
 }
 } break;
+case LLM_ARCH_LLAMA_EMBED:
+{
+llm = std::make_unique<llm_build_llama<true>>(*this, params);
+} break;
+case LLM_ARCH_MAINCODER:
+{
+llm = std::make_unique<llm_build_maincoder>(*this, params);
+} break;
 case LLM_ARCH_DECI:
 {
 llm = std::make_unique<llm_build_deci>(*this, params);
@@ -7248,6 +7529,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
 {
 llm = std::make_unique<llm_build_bert>(*this, params);
 } break;
+case LLM_ARCH_MODERN_BERT:
+{
+llm = std::make_unique<llm_build_modern_bert>(*this, params);
+} break;
 case LLM_ARCH_NEO_BERT:
 {
 llm = std::make_unique<llm_build_neo_bert>(*this, params);
@@ -7337,6 +7622,14 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
 {
 llm = std::make_unique<llm_build_plamo2>(*this, params);
 } break;
+case LLM_ARCH_PLAMO3:
+{
+if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
+llm = std::make_unique<llm_build_plamo3<true>> (*this, params);
+} else {
+llm = std::make_unique<llm_build_plamo3<false>>(*this, params);
+}
+} break;
 case LLM_ARCH_GPT2:
 {
 llm = std::make_unique<llm_build_gpt2>(*this, params);
@@ -7637,6 +7930,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
 {
 llm = std::make_unique<llm_build_mistral3>(*this, params);
 } break;
+case LLM_ARCH_MIMO2:
+{
+llm = std::make_unique<llm_build_mimo2_iswa>(*this, params);
+} break;
 default:
 GGML_ABORT("fatal error");
 }
@@ -7644,12 +7941,17 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
 // add on pooling layer
 llm->build_pooling(cls, cls_b, cls_out, cls_out_b);

+// add backend sampling layers (if any)
+llm->build_sampling();
+
 // if the gguf model was converted with --sentence-transformers-dense-modules
 // there will be two additional dense projection layers
 // dense linear projections are applied after pooling
 // TODO: move reranking logic here and generalize
 llm->build_dense_out(dense_2_out_layers, dense_3_out_layers);

+llm->res->set_outputs();
+
 return llm->res->get_gf();
 }

@@ -7662,7 +7964,7 @@ llama_model_params llama_model_default_params() {
 llama_model_params result = {
 /*.devices =*/ nullptr,
 /*.tensor_buft_overrides =*/ nullptr,
-/*.n_gpu_layers =*/
+/*.n_gpu_layers =*/ -1,
 /*.split_mode =*/ LLAMA_SPLIT_MODE_LAYER,
 /*.main_gpu =*/ 0,
 /*.tensor_split =*/ nullptr,
@@ -7705,6 +8007,10 @@ int32_t llama_model_n_embd_inp(const llama_model * model) {
 return model->hparams.n_embd_inp();
 }

+int32_t llama_model_n_embd_out(const llama_model * model) {
+return model->hparams.get_n_embd_out();
+}
+
 int32_t llama_model_n_layer(const llama_model * model) {
 return model->hparams.n_layer;
 }
@@ -7807,6 +8113,8 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
 case LLM_ARCH_ERNIE4_5:
 case LLM_ARCH_ERNIE4_5_MOE:
 case LLM_ARCH_MISTRAL3:
+case LLM_ARCH_LLAMA_EMBED:
+case LLM_ARCH_MAINCODER:
 return LLAMA_ROPE_TYPE_NORM;

 // the pairs of head values are offset by n_rot/2
@@ -7816,6 +8124,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
 case LLM_ARCH_DBRX:
 case LLM_ARCH_BERT:
 case LLM_ARCH_JINA_BERT_V3:
+case LLM_ARCH_MODERN_BERT:
 case LLM_ARCH_NOMIC_BERT:
 case LLM_ARCH_NOMIC_BERT_MOE:
 case LLM_ARCH_STABLELM:
@@ -7835,6 +8144,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
 case LLM_ARCH_PHIMOE:
 case LLM_ARCH_PLAMO:
 case LLM_ARCH_PLAMO2:
+case LLM_ARCH_PLAMO3:
 case LLM_ARCH_GEMMA:
 case LLM_ARCH_GEMMA2:
 case LLM_ARCH_GEMMA3:
@@ -7865,6 +8175,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
 case LLM_ARCH_PANGU_EMBED:
 case LLM_ARCH_AFMOE:
 case LLM_ARCH_QWEN3NEXT:
+case LLM_ARCH_MIMO2:
 return LLAMA_ROPE_TYPE_NEOX;

 case LLM_ARCH_QWEN2VL: