@fugood/llama.node 1.4.11 → 1.4.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69)
  1. package/package.json +15 -15
  2. package/scripts/llama.cpp.patch +31 -31
  3. package/src/llama.cpp/common/arg.cpp +128 -59
  4. package/src/llama.cpp/common/arg.h +1 -0
  5. package/src/llama.cpp/common/chat-parser.cpp +11 -0
  6. package/src/llama.cpp/common/chat.cpp +36 -7
  7. package/src/llama.cpp/common/chat.h +1 -0
  8. package/src/llama.cpp/common/common.cpp +42 -23
  9. package/src/llama.cpp/common/common.h +11 -1
  10. package/src/llama.cpp/common/llguidance.cpp +10 -6
  11. package/src/llama.cpp/common/regex-partial.cpp +13 -13
  12. package/src/llama.cpp/common/sampling.cpp +58 -14
  13. package/src/llama.cpp/common/sampling.h +3 -1
  14. package/src/llama.cpp/ggml/CMakeLists.txt +13 -1
  15. package/src/llama.cpp/ggml/include/ggml-backend.h +1 -1
  16. package/src/llama.cpp/ggml/src/CMakeLists.txt +23 -9
  17. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +12 -2
  18. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +1 -1
  19. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +86 -25
  20. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +15 -8
  21. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +768 -0
  22. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +0 -4
  23. package/src/llama.cpp/include/llama.h +100 -12
  24. package/src/llama.cpp/src/CMakeLists.txt +4 -0
  25. package/src/llama.cpp/src/llama-adapter.cpp +12 -3
  26. package/src/llama.cpp/src/llama-adapter.h +7 -1
  27. package/src/llama.cpp/src/llama-arch.cpp +78 -0
  28. package/src/llama.cpp/src/llama-arch.h +8 -0
  29. package/src/llama.cpp/src/llama-chat.cpp +11 -0
  30. package/src/llama.cpp/src/llama-chat.h +1 -0
  31. package/src/llama.cpp/src/llama-context.cpp +637 -49
  32. package/src/llama.cpp/src/llama-context.h +43 -1
  33. package/src/llama.cpp/src/llama-grammar.cpp +40 -13
  34. package/src/llama.cpp/src/llama-grammar.h +2 -0
  35. package/src/llama.cpp/src/llama-graph.cpp +173 -5
  36. package/src/llama.cpp/src/llama-graph.h +71 -6
  37. package/src/llama.cpp/src/llama-hparams.cpp +4 -0
  38. package/src/llama.cpp/src/llama-hparams.h +12 -5
  39. package/src/llama.cpp/src/llama-kv-cache.h +1 -1
  40. package/src/llama.cpp/src/llama-mmap.cpp +11 -4
  41. package/src/llama.cpp/src/llama-model-loader.cpp +23 -0
  42. package/src/llama.cpp/src/llama-model-loader.h +2 -0
  43. package/src/llama.cpp/src/llama-model-saver.cpp +3 -0
  44. package/src/llama.cpp/src/llama-model.cpp +337 -26
  45. package/src/llama.cpp/src/llama-model.h +13 -2
  46. package/src/llama.cpp/src/llama-sampling.cpp +1259 -186
  47. package/src/llama.cpp/src/llama-sampling.h +19 -7
  48. package/src/llama.cpp/src/llama-vocab.cpp +101 -33
  49. package/src/llama.cpp/src/llama-vocab.h +2 -0
  50. package/src/llama.cpp/src/llama.cpp +87 -64
  51. package/src/llama.cpp/src/models/afmoe.cpp +9 -5
  52. package/src/llama.cpp/src/models/bert.cpp +4 -2
  53. package/src/llama.cpp/src/models/cogvlm.cpp +5 -3
  54. package/src/llama.cpp/src/models/cohere2-iswa.cpp +3 -0
  55. package/src/llama.cpp/src/models/deepseek2.cpp +1 -1
  56. package/src/llama.cpp/src/models/gemma-embedding.cpp +2 -6
  57. package/src/llama.cpp/src/models/gemma2-iswa.cpp +5 -2
  58. package/src/llama.cpp/src/models/gemma3.cpp +3 -4
  59. package/src/llama.cpp/src/models/gemma3n-iswa.cpp +4 -7
  60. package/src/llama.cpp/src/models/llama-iswa.cpp +6 -2
  61. package/src/llama.cpp/src/models/llama.cpp +19 -6
  62. package/src/llama.cpp/src/models/maincoder.cpp +117 -0
  63. package/src/llama.cpp/src/models/mimo2-iswa.cpp +123 -0
  64. package/src/llama.cpp/src/models/models.h +18 -0
  65. package/src/llama.cpp/src/models/modern-bert.cpp +116 -0
  66. package/src/llama.cpp/src/models/openai-moe-iswa.cpp +5 -2
  67. package/src/llama.cpp/src/models/plamo3.cpp +128 -0
  68. package/src/llama.cpp/src/models/smallthinker.cpp +11 -5
  69. package/src/llama.cpp/src/unicode.cpp +23 -14
package/src/llama.cpp/src/llama-model.cpp
@@ -31,12 +31,14 @@ const char * llm_type_name(llm_type type) {
  case LLM_TYPE_17M: return "17M";
  case LLM_TYPE_22M: return "22M";
  case LLM_TYPE_33M: return "33M";
+ case LLM_TYPE_47M: return "47M";
  case LLM_TYPE_60M: return "60M";
  case LLM_TYPE_70M: return "70M";
  case LLM_TYPE_80M: return "80M";
  case LLM_TYPE_109M: return "109M";
  case LLM_TYPE_137M: return "137M";
  case LLM_TYPE_140M: return "140M";
+ case LLM_TYPE_149M: return "149M";
  case LLM_TYPE_160M: return "160M";
  case LLM_TYPE_190M: return "190M";
  case LLM_TYPE_220M: return "220M";
@@ -46,6 +48,7 @@ const char * llm_type_name(llm_type type) {
  case LLM_TYPE_335M: return "335M";
  case LLM_TYPE_350M: return "350M";
  case LLM_TYPE_360M: return "360M";
+ case LLM_TYPE_395M: return "395M";
  case LLM_TYPE_410M: return "410M";
  case LLM_TYPE_450M: return "450M";
  case LLM_TYPE_475M: return "475M";
@@ -123,10 +126,12 @@ const char * llm_type_name(llm_type type) {
  case LLM_TYPE_31B_A3_5B: return "31B.A3.5B";
  case LLM_TYPE_80B_A3B: return "80B.A3B";
  case LLM_TYPE_100B_A6B: return "100B.A6B";
+ case LLM_TYPE_102B_A12B: return "102B.A12B";
  case LLM_TYPE_106B_A12B: return "106B.A12B";
  case LLM_TYPE_230B_A10B: return "230B.A10B";
  case LLM_TYPE_235B_A22B: return "235B.A22B";
  case LLM_TYPE_300B_A47B: return "300B.A47B";
+ case LLM_TYPE_310B_A15B: return "310B.A15B";
  case LLM_TYPE_355B_A32B: return "355B.A32B";
  case LLM_TYPE_E2B: return "E2B";
  case LLM_TYPE_E4B: return "E4B";
@@ -502,6 +507,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {

  ml.get_key(LLM_KV_CONTEXT_LENGTH, hparams.n_ctx_train);
  ml.get_key(LLM_KV_EMBEDDING_LENGTH, hparams.n_embd);
+ ml.get_key(LLM_KV_EMBEDDING_LENGTH_OUT, hparams.n_embd_out, false);
  ml.get_key(LLM_KV_BLOCK_COUNT, hparams.n_layer);
  ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert, false);
  ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false);
@@ -573,6 +579,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  hparams.rope_scaling_type_train = llama_rope_scaling_type_from_string(rope_scaling);
  GGML_ASSERT(hparams.rope_scaling_type_train != LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED);

+ // TODO: Handle SWA metadata similarly when models start implementing it
  // rope_freq_scale (inverse of the kv) is optional
  float ropescale = 0.0f;
  if (!ml.get_key(LLM_KV_ROPE_SCALING_FACTOR, ropescale, false)) {
@@ -581,10 +588,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  }
  hparams.rope_freq_scale_train = ropescale == 0.0f ? 1.0f : 1.0f/ropescale;

- // by default assume that the sliding-window layers use the same scaling type as the non-sliding-window layers
- hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
- hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
-
  ml.get_key(LLM_KV_ROPE_SCALING_ATTN_FACTOR, hparams.rope_attn_factor, false);

  // non-transformer models do not have attention heads
@@ -603,7 +606,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {

  ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false);

- if (arch == LLM_ARCH_LLAMA || arch == LLM_ARCH_DECI || arch == LLM_ARCH_FALCON) {
+ if (arch == LLM_ARCH_LLAMA || arch == LLM_ARCH_DECI || arch == LLM_ARCH_FALCON || arch == LLM_ARCH_LLAMA_EMBED) {
  if (hparams.n_rot != hparams.n_embd_head_k) {
  throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd_head_k));
  }
@@ -627,6 +630,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  // arch-specific KVs
  switch (arch) {
  case LLM_ARCH_LLAMA:
+ case LLM_ARCH_LLAMA_EMBED:
  {
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);

@@ -671,6 +675,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  hparams.f_attn_temp_scale = 0.1f;
  hparams.f_attn_temp_offset = 1.0f;
  hparams.set_swa_pattern(4); // pattern: 3 chunked - 1 full
+
+ hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
+ hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
+ ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
  }

  switch (hparams.n_expert) {
@@ -716,6 +724,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  if (hparams.n_swa > 0) {
  hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
  hparams.set_swa_pattern(4);
+
+ hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
+ hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
+ ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
  } else {
  hparams.swa_type = LLAMA_SWA_TYPE_NONE;
  }
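Note: the SWA-capable branches above and below all repeat the same three lines, now that the old global default was removed in the -581,10 hunk: seed the sliding-window rope frequency/scale from the full-attention training values, then let the optional LLM_KV_ROPE_FREQ_BASE_SWA key override the base. A hypothetical helper, not part of this diff or of llama.cpp, written only to make the repeated pattern explicit:

// Hypothetical sketch -- llama.cpp does not define this helper; the diff instead
// repeats these lines inside each architecture case that enables SWA.
static void init_swa_rope_defaults(llama_model_loader & ml, llama_hparams & hparams) {
    // default: sliding-window layers reuse the full-attention rope parameters
    hparams.rope_freq_base_train_swa  = hparams.rope_freq_base_train;
    hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
    // optional per-model override; the trailing 'false' marks the GGUF key as not required
    ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
}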
@@ -875,6 +887,34 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  default: type = LLM_TYPE_UNKNOWN;
  }
  } break;
+ case LLM_ARCH_MODERN_BERT:
+ {
+ const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
+ if (found_swa && hparams.n_swa > 0) {
+ uint32_t swa_period = 3;
+ hparams.swa_type = LLAMA_SWA_TYPE_SYMMETRIC;
+
+ ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa);
+ ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
+ hparams.set_swa_pattern(swa_period);
+ } else {
+ hparams.swa_type = LLAMA_SWA_TYPE_NONE;
+ }
+
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+ ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
+ ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
+
+ switch (hparams.n_layer) {
+ case 12:
+ type = LLM_TYPE_47M; break; // granite-embedding-small
+ case 22:
+ type = LLM_TYPE_149M; break; // modern-bert-base
+ case 28:
+ type = LLM_TYPE_395M; break; // modern-bert-large
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
  case LLM_ARCH_JINA_BERT_V2:
  {
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -1076,6 +1116,14 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  default: type = LLM_TYPE_UNKNOWN;
  }
  } break;
+ case LLM_ARCH_MAINCODER:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ switch (hparams.n_layer) {
+ case 32: type = LLM_TYPE_1B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
  case LLM_ARCH_QWEN3VL:
  {
  ml.get_key(LLM_KV_NUM_DEEPSTACK_LAYERS, hparams.n_deepstack_layers, false);
@@ -1194,6 +1242,25 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH, hparams.n_embd_head_k, false);
  ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH, hparams.n_embd_head_v, false);
  } break;
+ case LLM_ARCH_PLAMO3:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
+ if (found_swa && hparams.n_swa > 0) {
+ uint32_t swa_period = 8;
+ hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
+ ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa);
+ ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
+ hparams.set_swa_pattern(swa_period);
+ } else {
+ hparams.swa_type = LLAMA_SWA_TYPE_NONE;
+ }
+
+ switch (hparams.n_layer) {
+ case 24: type = LLM_TYPE_2B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
  case LLM_ARCH_GPT2:
  {
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -1247,7 +1314,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  hparams.n_swa = 4096; // default value of gemma 2
  hparams.set_swa_pattern(2);
  hparams.attn_soft_cap = true;
+ hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
+ hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;

+ ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
  ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  ml.get_key(LLM_KV_ATTN_LOGIT_SOFTCAPPING, hparams.f_attn_logit_softcapping, false);
@@ -1272,8 +1342,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
  hparams.set_swa_pattern(6);

- hparams.rope_freq_base_train_swa = 10000.0f;
- hparams.rope_freq_scale_train_swa = 1.0f;
+ ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
  } else {
  hparams.swa_type = LLAMA_SWA_TYPE_NONE;
  }
@@ -1303,10 +1372,9 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  hparams.set_swa_pattern(5);

  hparams.n_layer_kv_from_start = 20;
- hparams.rope_freq_base_train_swa = 10000.0f;
- hparams.rope_freq_scale_train_swa = 1.0f;
  hparams.f_attention_scale = 1.0f;

+ ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
  ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);

@@ -1322,9 +1390,8 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  hparams.set_swa_pattern(6);

  hparams.causal_attn = false; // embeddings do not use causal attention
- hparams.rope_freq_base_train_swa = 10000.0f;
- hparams.rope_freq_scale_train_swa = 1.0f;

+ ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
  ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
@@ -1463,7 +1530,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  {
  hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
  hparams.set_swa_pattern(4);
+ hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
+ hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;

+ ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
  ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
  ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -1502,6 +1572,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  if (found_swa && hparams.n_swa > 0) {
  hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
  hparams.set_swa_pattern(4);
+
+ hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
+ hparams.rope_freq_scale_train_swa = 1.0; // See olmo2.cpp
+ ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
  } else {
  hparams.swa_type = LLAMA_SWA_TYPE_NONE;
  }
@@ -1629,7 +1703,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH_MLA, hparams.n_embd_head_v_mla, false);
  ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
  ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
- ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
+ ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false);
  ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
  ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
  if (hparams.expert_gating_func == LLAMA_EXPERT_GATING_FUNC_TYPE_NONE) {
@@ -1725,6 +1799,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {

  switch (hparams.n_layer) {
  case 47: type = LLM_TYPE_106B_A12B; break; // GLM-4.5-Air (46 layers + 1 NextN layer)
+ case 48: type = LLM_TYPE_102B_A12B; break; // Solar Open
  case 93: type = LLM_TYPE_355B_A32B; break; // GLM-4.5 (92 layers + 1 NextN layer)
  default: type = LLM_TYPE_UNKNOWN;
  }
@@ -1843,6 +1918,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
  hparams.n_swa = 4096;
  hparams.set_swa_pattern(4);
+
+ hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
+ hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
+ ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
  }

  ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
@@ -2145,6 +2224,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
  hparams.set_swa_pattern(2);

+ hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
+ hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
+ ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
+
  switch (hparams.n_layer) {
  case 24: type = LLM_TYPE_20B; break;
  case 36: type = LLM_TYPE_120B; break;
@@ -2189,6 +2272,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
  hparams.n_swa = 4096;
  hparams.set_swa_pattern(4, true);
+
+ hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
+ hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
+ ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
  } else {
  hparams.swa_type = LLAMA_SWA_TYPE_NONE;
  hparams.n_no_rope_layer_step = hparams.n_layer;
@@ -2307,6 +2394,22 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  default: type = LLM_TYPE_UNKNOWN;
  }
  } break;
+ case LLM_ARCH_MIMO2:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+ hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
+
+ ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
+ ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
+ ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa);
+ ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, hparams.swa_layers, hparams.n_layer);
+
+ switch (hparams.n_layer) {
+ case 48: type = LLM_TYPE_310B_A15B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
  default: throw std::runtime_error("unsupported model architecture");
  }

@@ -2329,11 +2432,11 @@ void llama_model::load_vocab(llama_model_loader & ml) {

  bool llama_model::load_tensors(llama_model_loader & ml) {
  const auto & split_mode = params.split_mode;
- const auto & n_gpu_layers = params.n_gpu_layers;
  const auto & use_mlock = params.use_mlock;
  const auto & tensor_split = params.tensor_split;

- const int n_layer = hparams.n_layer;
+ const int n_layer = hparams.n_layer;
+ const int n_gpu_layers = this->n_gpu_layers();

  const bool use_mmap_buffer = true;

@@ -2621,6 +2724,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  case LLM_ARCH_GRANITE:
  case LLM_ARCH_GRANITE_MOE:
  case LLM_ARCH_MISTRAL3:
+ case LLM_ARCH_LLAMA_EMBED:
  {
  tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

@@ -3155,6 +3259,37 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  layer.layer_out_norm_b = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "bias", i), {n_embd}, 0);
  }
  } break;
+ case LLM_ARCH_MODERN_BERT:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+ tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);
+
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ if (i != 0) {
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+ } else {
+ // layer 0 uses identity
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
+ }
+
+
+ layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, 3 * n_embd }, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, 2 * n_ff}, 0);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+ }
+
+ cls = create_tensor(tn(LLM_TENSOR_CLS, "weight"), {n_embd, n_embd}, TENSOR_NOT_REQUIRED);
+ cls_out = create_tensor(tn(LLM_TENSOR_CLS_OUT, "weight"), {n_embd, hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
+ cls_out_b = create_tensor(tn(LLM_TENSOR_CLS_OUT, "bias"), {hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
+
+ } break;
  case LLM_ARCH_NEO_BERT:
  {
  tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -3219,7 +3354,14 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  layer.attn_norm_2_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);

  layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
- layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, layer.ffn_gate ? n_ff : n_ff * 2}, 0);
+
+ const auto tn_ffn_up_weight = tn(LLM_TENSOR_FFN_UP, "weight", i);
+ ggml_tensor * t_ffn_up = ml.get_tensor_meta(tn_ffn_up_weight.str().c_str());
+ const int64_t n_ffn_up = t_ffn_up ? t_ffn_up->ne[1] : n_ff;
+
+ GGML_ASSERT(n_ffn_up == n_ff || n_ffn_up == n_ff * 2);
+ layer.ffn_up = create_tensor(tn_ffn_up_weight, {n_embd, n_ffn_up}, 0);
+ layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ffn_up}, TENSOR_NOT_REQUIRED);

  layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
  layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
@@ -3747,6 +3889,44 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, i), {n_embd}, 0);
  }
  } break;
+ case LLM_ARCH_PLAMO3:
+ {
+ const int64_t head_dim_q = hparams.n_embd_head_k;
+ const int64_t head_dim_v = hparams.n_embd_head_v;
+
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+ if (output == NULL) {
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+ }
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ const int64_t num_attention_heads = hparams.n_head(i);
+ const int64_t num_key_value_heads = hparams.n_head_kv(i);
+ const int64_t q_proj_dim = num_attention_heads * head_dim_q;
+ const int64_t k_proj_dim = num_key_value_heads * head_dim_q;
+ const int64_t v_proj_dim = num_key_value_heads * head_dim_v;
+ const int64_t n_ff_cur = hparams.n_ff(i);
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+ layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i),
+ {n_embd, q_proj_dim + k_proj_dim + v_proj_dim}, 0);
+ layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {head_dim_q}, 0);
+ layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {head_dim_q}, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {num_attention_heads * head_dim_v, n_embd}, 0);
+ layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, i), {n_embd}, 0);
+
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+ layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, i), {n_embd}, 0);
+
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff_cur * 2}, 0);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff_cur, n_embd}, 0);
+ }
+ } break;
  case LLM_ARCH_GPT2:
  {
  tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -4637,7 +4817,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) {

  // output
  output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
- output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
+ // try to load output.weight, if not found, use token_embd (tied embeddings)
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+ if (!output) {
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+ }

  for (int i = 0; i < n_layer; ++i) {
  auto & layer = layers[i];
@@ -4700,7 +4884,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) {

  // output
  output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
- output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
+ // try to load output.weight, if not found, use token_embd (tied embeddings)
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+ if (!output) {
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+ }

  for (int i = 0; i < n_layer; ++i) {
  auto & layer = layers[i];
@@ -5067,9 +5255,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head }, flags);
  layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_k_gqa }, flags);
  layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_v_gqa }, flags);
- layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), { n_embd_head_k * n_head }, flags);
- layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), { n_embd_k_gqa }, flags);
- layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), { n_embd_v_gqa }, flags);
+ layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), { n_embd_head_k * n_head }, TENSOR_NOT_REQUIRED | flags);
+ layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), { n_embd_k_gqa }, TENSOR_NOT_REQUIRED | flags);
+ layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), { n_embd_v_gqa }, TENSOR_NOT_REQUIRED | flags);

  layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, flags);

@@ -5181,9 +5369,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  const int64_t n_group = hparams.ssm_n_group;
  const int64_t d_in_proj = 2*d_inner + 2*n_group*d_state + n_ssm_head;

- const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
- const int64_t n_ff_shexp = hparams.n_ff_shexp;
-
  // embeddings
  tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

@@ -5235,6 +5420,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  } else {
  if (n_expert != 0) {
+ const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
+ const int64_t n_ff_shexp = hparams.n_ff_shexp;
+
  layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert}, 0);
  layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert }, 0);

@@ -6282,6 +6470,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  layer.shortconv.out_proj = create_tensor(tn(LLM_TENSOR_SHORTCONV_OUTPROJ, "weight", i), {n_embd, n_embd}, 0);
  }
  }
+
+ // for LFM2-ColBert-350M
+ dense_2_out_layers = create_tensor(tn(LLM_TENSOR_DENSE_2_OUT, "weight"), {n_embd, hparams.get_n_embd_out()}, TENSOR_NOT_REQUIRED);
  } break;
  case LLM_ARCH_SMALLTHINKER:
  {
@@ -6584,6 +6775,75 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { hparams.n_ff_shexp, n_embd }, 0);
  }
  } break;
+ case LLM_ARCH_MIMO2:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+ uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(i);
+ uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(i);
+ uint32_t n_head = hparams.n_head(i);
+
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head }, 0);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_k_gqa }, 0);
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_v_gqa }, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_v * n_head, n_embd }, 0);
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+ layer.attn_sinks = create_tensor(tn(LLM_TENSOR_ATTN_SINKS, "weight", i), {n_head}, TENSOR_NOT_REQUIRED);
+
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+ // non-MoE branch
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, TENSOR_NOT_REQUIRED);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
+
+ // MoE branch
+ int64_t n_ff_exp = hparams.n_ff_exp;
+ layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, TENSOR_NOT_REQUIRED);
+ layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, TENSOR_NOT_REQUIRED);
+ layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, TENSOR_NOT_REQUIRED);
+ layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, TENSOR_NOT_REQUIRED);
+ layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED);
+ }
+ } break;
+ case LLM_ARCH_MAINCODER:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+ // if output is NULL, init from the input tok embed
+ if (output == NULL) {
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+ }
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
+
+ layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
+ layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
+
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+ }
+ } break;
  default:
  throw std::runtime_error("unknown architecture");
  }
@@ -6765,6 +7025,14 @@ size_t llama_model::n_devices() const {
  return devices.size();
  }

+ uint32_t llama_model::n_gpu_layers() const {
+ return params.n_gpu_layers >= 0 ? params.n_gpu_layers : hparams.n_layer + 1;
+ }
+
+ llama_split_mode llama_model::split_mode() const {
+ return params.split_mode;
+ }
+
  std::map<ggml_backend_buffer_type_t, size_t> llama_model::memory_breakdown() const {
  std::map<ggml_backend_buffer_type_t, size_t> ret;
  for (const auto & [ctx, bufs] : pimpl->ctxs_bufs) {
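Note: load_tensors above now resolves the layer count through the new llama_model::n_gpu_layers() accessor instead of reading params.n_gpu_layers directly. A minimal standalone sketch of the resolution rule that accessor encodes (illustration only; the real code is the one-liner added in this hunk):

#include <cstdint>

// Mirrors llama_model::n_gpu_layers(): a negative request (the new default of -1,
// see the llama_model_default_params() hunk further down) means
// "all repeating layers plus the output layer".
static uint32_t resolve_n_gpu_layers(int32_t requested, uint32_t n_layer) {
    return requested >= 0 ? (uint32_t) requested : n_layer + 1;
}

// e.g. resolve_n_gpu_layers(-1, 32) == 33, resolve_n_gpu_layers(10, 32) == 10, resolve_n_gpu_layers(0, 32) == 0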
@@ -6857,6 +7125,10 @@ void llama_model::print_info() const {
  LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type.c_str());
  LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train);
  LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);
+ if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
+ LLAMA_LOG_INFO("%s: freq_base_swa = %.1f\n", __func__, hparams.rope_freq_base_train_swa);
+ LLAMA_LOG_INFO("%s: freq_scale_swa = %g\n", __func__, hparams.rope_freq_scale_train_swa);
+ }
  LLAMA_LOG_INFO("%s: n_ctx_orig_yarn = %u\n", __func__, hparams.n_ctx_orig_yarn);
  LLAMA_LOG_INFO("%s: rope_yarn_log_mul= %.4f\n", __func__, hparams.rope_yarn_log_mul);
  LLAMA_LOG_INFO("%s: rope_finetuned = %s\n", __func__, hparams.rope_finetuned ? "yes" : "unknown");
@@ -7089,6 +7361,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
  case LLM_ARCH_NOMIC_BERT_MOE:
  case LLM_ARCH_NEO_BERT:
  case LLM_ARCH_WAVTOKENIZER_DEC:
+ case LLM_ARCH_MODERN_BERT:
  case LLM_ARCH_GEMMA_EMBEDDING:
  case LLM_ARCH_DREAM:
  case LLM_ARCH_LLADA:
@@ -7206,16 +7479,24 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
  switch (arch) {
  case LLM_ARCH_LLAMA:
  {
- llm = std::make_unique<llm_build_llama>(*this, params);
+ llm = std::make_unique<llm_build_llama<false>>(*this, params);
  } break;
  case LLM_ARCH_LLAMA4:
  {
  if (hparams.swa_type == LLAMA_SWA_TYPE_NONE) {
- llm = std::make_unique<llm_build_llama>(*this, params);
+ llm = std::make_unique<llm_build_llama<false>>(*this, params);
  } else {
  llm = std::make_unique<llm_build_llama_iswa>(*this, params);
  }
  } break;
+ case LLM_ARCH_LLAMA_EMBED:
+ {
+ llm = std::make_unique<llm_build_llama<true>>(*this, params);
+ } break;
+ case LLM_ARCH_MAINCODER:
+ {
+ llm = std::make_unique<llm_build_maincoder>(*this, params);
+ } break;
  case LLM_ARCH_DECI:
  {
  llm = std::make_unique<llm_build_deci>(*this, params);
@@ -7248,6 +7529,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
  {
  llm = std::make_unique<llm_build_bert>(*this, params);
  } break;
+ case LLM_ARCH_MODERN_BERT:
+ {
+ llm = std::make_unique<llm_build_modern_bert>(*this, params);
+ } break;
  case LLM_ARCH_NEO_BERT:
  {
  llm = std::make_unique<llm_build_neo_bert>(*this, params);
@@ -7337,6 +7622,14 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
  {
  llm = std::make_unique<llm_build_plamo2>(*this, params);
  } break;
+ case LLM_ARCH_PLAMO3:
+ {
+ if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
+ llm = std::make_unique<llm_build_plamo3<true>> (*this, params);
+ } else {
+ llm = std::make_unique<llm_build_plamo3<false>>(*this, params);
+ }
+ } break;
  case LLM_ARCH_GPT2:
  {
  llm = std::make_unique<llm_build_gpt2>(*this, params);
@@ -7637,6 +7930,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
  {
  llm = std::make_unique<llm_build_mistral3>(*this, params);
  } break;
+ case LLM_ARCH_MIMO2:
+ {
+ llm = std::make_unique<llm_build_mimo2_iswa>(*this, params);
+ } break;
  default:
  GGML_ABORT("fatal error");
  }
@@ -7644,12 +7941,17 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
  // add on pooling layer
  llm->build_pooling(cls, cls_b, cls_out, cls_out_b);

+ // add backend sampling layers (if any)
+ llm->build_sampling();
+
  // if the gguf model was converted with --sentence-transformers-dense-modules
  // there will be two additional dense projection layers
  // dense linear projections are applied after pooling
  // TODO: move reranking logic here and generalize
  llm->build_dense_out(dense_2_out_layers, dense_3_out_layers);

+ llm->res->set_outputs();
+
  return llm->res->get_gf();
  }

@@ -7662,7 +7964,7 @@ llama_model_params llama_model_default_params() {
  llama_model_params result = {
  /*.devices =*/ nullptr,
  /*.tensor_buft_overrides =*/ nullptr,
- /*.n_gpu_layers =*/ 999,
+ /*.n_gpu_layers =*/ -1,
  /*.split_mode =*/ LLAMA_SPLIT_MODE_LAYER,
  /*.main_gpu =*/ 0,
  /*.tensor_split =*/ nullptr,
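Note: with the default changed from 999 to -1, leaving n_gpu_layers untouched still offloads everything (resolved through llama_model::n_gpu_layers() as sketched earlier), while a non-negative value keeps its usual meaning. A hedged consumer-side sketch; llama_model_load_from_file is the regular llama.h loader and is assumed to be otherwise unaffected by this diff:

#include "llama.h"

// Load a model, optionally overriding the new -1 ("offload all layers") default.
static llama_model * load_with_offload(const char * path, int32_t n_gpu_layers /* -1 = all */) {
    llama_model_params mparams = llama_model_default_params(); // n_gpu_layers == -1 as of this version
    mparams.n_gpu_layers = n_gpu_layers;                        // explicit counts behave as before
    return llama_model_load_from_file(path, mparams);
}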
@@ -7705,6 +8007,10 @@ int32_t llama_model_n_embd_inp(const llama_model * model) {
  return model->hparams.n_embd_inp();
  }

+ int32_t llama_model_n_embd_out(const llama_model * model) {
+ return model->hparams.get_n_embd_out();
+ }
+
  int32_t llama_model_n_layer(const llama_model * model) {
  return model->hparams.n_layer;
  }
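Note: the new llama_model_n_embd_out() getter complements llama_model_n_embd(): models converted with an output projection (for example the LFM2-ColBert-350M dense_2_out tensor loaded earlier, driven by LLM_KV_EMBEDDING_LENGTH_OUT) can report an output embedding width that differs from the hidden size. A hedged sketch of sizing an embedding buffer with it; the assumption that get_n_embd_out() falls back to n_embd when no output projection exists is not shown in this excerpt:

#include <vector>
#include "llama.h"

// Allocate per-token embedding storage using the model's output width
// (assumed to equal n_embd for models without a dense output projection).
static std::vector<float> make_embd_buffer(const llama_model * model, int32_t n_tokens) {
    const int32_t n_embd_out = llama_model_n_embd_out(model);
    return std::vector<float>((size_t) n_embd_out * (size_t) n_tokens);
}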
@@ -7807,6 +8113,8 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
  case LLM_ARCH_ERNIE4_5:
  case LLM_ARCH_ERNIE4_5_MOE:
  case LLM_ARCH_MISTRAL3:
+ case LLM_ARCH_LLAMA_EMBED:
+ case LLM_ARCH_MAINCODER:
  return LLAMA_ROPE_TYPE_NORM;

  // the pairs of head values are offset by n_rot/2
@@ -7816,6 +8124,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
  case LLM_ARCH_DBRX:
  case LLM_ARCH_BERT:
  case LLM_ARCH_JINA_BERT_V3:
+ case LLM_ARCH_MODERN_BERT:
  case LLM_ARCH_NOMIC_BERT:
  case LLM_ARCH_NOMIC_BERT_MOE:
  case LLM_ARCH_STABLELM:
@@ -7835,6 +8144,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
  case LLM_ARCH_PHIMOE:
  case LLM_ARCH_PLAMO:
  case LLM_ARCH_PLAMO2:
+ case LLM_ARCH_PLAMO3:
  case LLM_ARCH_GEMMA:
  case LLM_ARCH_GEMMA2:
  case LLM_ARCH_GEMMA3:
@@ -7865,6 +8175,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
  case LLM_ARCH_PANGU_EMBED:
  case LLM_ARCH_AFMOE:
  case LLM_ARCH_QWEN3NEXT:
+ case LLM_ARCH_MIMO2:
  return LLAMA_ROPE_TYPE_NEOX;

  case LLM_ARCH_QWEN2VL: