@fugood/llama.node 1.4.11 → 1.4.12

This diff shows the content of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registry.
Files changed (52)
  1. package/package.json +15 -15
  2. package/scripts/llama.cpp.patch +30 -30
  3. package/src/llama.cpp/common/arg.cpp +29 -14
  4. package/src/llama.cpp/common/arg.h +1 -0
  5. package/src/llama.cpp/common/chat-parser.cpp +11 -0
  6. package/src/llama.cpp/common/chat.cpp +32 -3
  7. package/src/llama.cpp/common/chat.h +1 -0
  8. package/src/llama.cpp/common/common.cpp +23 -23
  9. package/src/llama.cpp/common/common.h +1 -1
  10. package/src/llama.cpp/ggml/CMakeLists.txt +13 -1
  11. package/src/llama.cpp/ggml/include/ggml-backend.h +1 -1
  12. package/src/llama.cpp/ggml/src/CMakeLists.txt +23 -9
  13. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +12 -2
  14. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +1 -1
  15. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +86 -25
  16. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +15 -8
  17. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +768 -0
  18. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +0 -4
  19. package/src/llama.cpp/include/llama.h +13 -4
  20. package/src/llama.cpp/src/CMakeLists.txt +4 -0
  21. package/src/llama.cpp/src/llama-adapter.cpp +12 -3
  22. package/src/llama.cpp/src/llama-adapter.h +7 -1
  23. package/src/llama.cpp/src/llama-arch.cpp +76 -0
  24. package/src/llama.cpp/src/llama-arch.h +7 -0
  25. package/src/llama.cpp/src/llama-chat.cpp +11 -0
  26. package/src/llama.cpp/src/llama-chat.h +1 -0
  27. package/src/llama.cpp/src/llama-context.cpp +22 -21
  28. package/src/llama.cpp/src/llama-hparams.h +4 -3
  29. package/src/llama.cpp/src/llama-kv-cache.h +1 -1
  30. package/src/llama.cpp/src/llama-mmap.cpp +11 -4
  31. package/src/llama.cpp/src/llama-model-loader.cpp +23 -0
  32. package/src/llama.cpp/src/llama-model-loader.h +2 -0
  33. package/src/llama.cpp/src/llama-model.cpp +287 -16
  34. package/src/llama.cpp/src/llama-model.h +13 -2
  35. package/src/llama.cpp/src/llama-sampling.cpp +44 -33
  36. package/src/llama.cpp/src/llama-sampling.h +3 -0
  37. package/src/llama.cpp/src/llama-vocab.cpp +101 -33
  38. package/src/llama.cpp/src/llama-vocab.h +2 -0
  39. package/src/llama.cpp/src/llama.cpp +52 -37
  40. package/src/llama.cpp/src/models/bert.cpp +4 -2
  41. package/src/llama.cpp/src/models/cogvlm.cpp +5 -3
  42. package/src/llama.cpp/src/models/deepseek2.cpp +1 -1
  43. package/src/llama.cpp/src/models/gemma-embedding.cpp +2 -6
  44. package/src/llama.cpp/src/models/gemma3.cpp +3 -4
  45. package/src/llama.cpp/src/models/gemma3n-iswa.cpp +4 -7
  46. package/src/llama.cpp/src/models/llama.cpp +19 -6
  47. package/src/llama.cpp/src/models/maincoder.cpp +117 -0
  48. package/src/llama.cpp/src/models/mimo2-iswa.cpp +123 -0
  49. package/src/llama.cpp/src/models/models.h +18 -0
  50. package/src/llama.cpp/src/models/modern-bert.cpp +115 -0
  51. package/src/llama.cpp/src/models/plamo3.cpp +128 -0
  52. package/src/llama.cpp/src/unicode.cpp +23 -14
package/src/llama.cpp/src/llama-model.cpp
@@ -31,12 +31,14 @@ const char * llm_type_name(llm_type type) {
  case LLM_TYPE_17M: return "17M";
  case LLM_TYPE_22M: return "22M";
  case LLM_TYPE_33M: return "33M";
+ case LLM_TYPE_47M: return "47M";
  case LLM_TYPE_60M: return "60M";
  case LLM_TYPE_70M: return "70M";
  case LLM_TYPE_80M: return "80M";
  case LLM_TYPE_109M: return "109M";
  case LLM_TYPE_137M: return "137M";
  case LLM_TYPE_140M: return "140M";
+ case LLM_TYPE_149M: return "149M";
  case LLM_TYPE_160M: return "160M";
  case LLM_TYPE_190M: return "190M";
  case LLM_TYPE_220M: return "220M";
@@ -46,6 +48,7 @@ const char * llm_type_name(llm_type type) {
  case LLM_TYPE_335M: return "335M";
  case LLM_TYPE_350M: return "350M";
  case LLM_TYPE_360M: return "360M";
+ case LLM_TYPE_395M: return "395M";
  case LLM_TYPE_410M: return "410M";
  case LLM_TYPE_450M: return "450M";
  case LLM_TYPE_475M: return "475M";
@@ -123,10 +126,12 @@ const char * llm_type_name(llm_type type) {
  case LLM_TYPE_31B_A3_5B: return "31B.A3.5B";
  case LLM_TYPE_80B_A3B: return "80B.A3B";
  case LLM_TYPE_100B_A6B: return "100B.A6B";
+ case LLM_TYPE_102B_A12B: return "102B.A12B";
  case LLM_TYPE_106B_A12B: return "106B.A12B";
  case LLM_TYPE_230B_A10B: return "230B.A10B";
  case LLM_TYPE_235B_A22B: return "235B.A22B";
  case LLM_TYPE_300B_A47B: return "300B.A47B";
+ case LLM_TYPE_310B_A15B: return "310B.A15B";
  case LLM_TYPE_355B_A32B: return "355B.A32B";
  case LLM_TYPE_E2B: return "E2B";
  case LLM_TYPE_E4B: return "E4B";
@@ -603,7 +608,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {

  ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false);

- if (arch == LLM_ARCH_LLAMA || arch == LLM_ARCH_DECI || arch == LLM_ARCH_FALCON) {
+ if (arch == LLM_ARCH_LLAMA || arch == LLM_ARCH_DECI || arch == LLM_ARCH_FALCON || arch == LLM_ARCH_LLAMA_EMBED) {
  if (hparams.n_rot != hparams.n_embd_head_k) {
  throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd_head_k));
  }
@@ -627,6 +632,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  // arch-specific KVs
  switch (arch) {
  case LLM_ARCH_LLAMA:
+ case LLM_ARCH_LLAMA_EMBED:
  {
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);

@@ -875,6 +881,34 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  default: type = LLM_TYPE_UNKNOWN;
  }
  } break;
+ case LLM_ARCH_MODERN_BERT:
+ {
+ const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
+ if (found_swa && hparams.n_swa > 0) {
+ uint32_t swa_period = 3;
+ hparams.swa_type = LLAMA_SWA_TYPE_SYMMETRIC;
+
+ ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa);
+ ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
+ hparams.set_swa_pattern(swa_period);
+ } else {
+ hparams.swa_type = LLAMA_SWA_TYPE_NONE;
+ }
+
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+ ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
+ ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
+
+ switch (hparams.n_layer) {
+ case 12:
+ type = LLM_TYPE_47M; break; // granite-embedding-small
+ case 22:
+ type = LLM_TYPE_149M; break; // modern-bert-base
+ case 28:
+ type = LLM_TYPE_395M; break; // modern-bert-large
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
  case LLM_ARCH_JINA_BERT_V2:
  {
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
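Note: the new LLM_ARCH_MODERN_BERT branch above enables a symmetric sliding-window attention (SWA) layout and derives the per-layer pattern from LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN via hparams.set_swa_pattern(swa_period). A rough standalone sketch of the idea follows; the helper below and the "every Nth layer keeps full attention" convention are illustrative assumptions, not the llama.cpp implementation:

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    // Hypothetical period-based layout: within each group of `period` layers,
    // all but the last use the sliding window; the last keeps full attention.
    static std::vector<bool> swa_layers_from_period(uint32_t n_layer, uint32_t period) {
        std::vector<bool> is_swa(n_layer, false);
        for (uint32_t il = 0; il < n_layer; ++il) {
            is_swa[il] = period > 0 && (il % period) < (period - 1);
        }
        return is_swa;
    }

    int main() {
        for (bool b : swa_layers_from_period(6, 3)) {
            std::printf("%d ", b ? 1 : 0); // prints: 1 1 0 1 1 0
        }
        std::printf("\n");
    }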
@@ -1076,6 +1110,14 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  default: type = LLM_TYPE_UNKNOWN;
  }
  } break;
+ case LLM_ARCH_MAINCODER:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ switch (hparams.n_layer) {
+ case 32: type = LLM_TYPE_1B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
  case LLM_ARCH_QWEN3VL:
  {
  ml.get_key(LLM_KV_NUM_DEEPSTACK_LAYERS, hparams.n_deepstack_layers, false);
@@ -1194,6 +1236,26 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH, hparams.n_embd_head_k, false);
  ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH, hparams.n_embd_head_v, false);
  } break;
+ case LLM_ARCH_PLAMO3:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
+ if (found_swa && hparams.n_swa > 0) {
+ uint32_t swa_period = 8;
+ hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
+ hparams.rope_freq_scale_train_swa = 1.0f;
+ ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa);
+ ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
+ hparams.set_swa_pattern(swa_period);
+ } else {
+ hparams.swa_type = LLAMA_SWA_TYPE_NONE;
+ }
+
+ switch (hparams.n_layer) {
+ case 24: type = LLM_TYPE_2B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
  case LLM_ARCH_GPT2:
  {
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -1629,7 +1691,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH_MLA, hparams.n_embd_head_v_mla, false);
  ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
  ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
- ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
+ ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false);
  ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
  ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
  if (hparams.expert_gating_func == LLAMA_EXPERT_GATING_FUNC_TYPE_NONE) {
@@ -1725,6 +1787,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {

  switch (hparams.n_layer) {
  case 47: type = LLM_TYPE_106B_A12B; break; // GLM-4.5-Air (46 layers + 1 NextN layer)
+ case 48: type = LLM_TYPE_102B_A12B; break; // Solar Open
  case 93: type = LLM_TYPE_355B_A32B; break; // GLM-4.5 (92 layers + 1 NextN layer)
  default: type = LLM_TYPE_UNKNOWN;
  }
@@ -2307,6 +2370,22 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  default: type = LLM_TYPE_UNKNOWN;
  }
  } break;
+ case LLM_ARCH_MIMO2:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+ hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
+
+ ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
+ ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
+ ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa);
+ ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, hparams.swa_layers, hparams.n_layer);
+
+ switch (hparams.n_layer) {
+ case 48: type = LLM_TYPE_310B_A15B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
  default: throw std::runtime_error("unsupported model architecture");
  }

@@ -2329,11 +2408,11 @@ void llama_model::load_vocab(llama_model_loader & ml) {

  bool llama_model::load_tensors(llama_model_loader & ml) {
  const auto & split_mode = params.split_mode;
- const auto & n_gpu_layers = params.n_gpu_layers;
  const auto & use_mlock = params.use_mlock;
  const auto & tensor_split = params.tensor_split;

- const int n_layer = hparams.n_layer;
+ const int n_layer = hparams.n_layer;
+ const int n_gpu_layers = this->n_gpu_layers();

  const bool use_mmap_buffer = true;

@@ -2621,6 +2700,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  case LLM_ARCH_GRANITE:
  case LLM_ARCH_GRANITE_MOE:
  case LLM_ARCH_MISTRAL3:
+ case LLM_ARCH_LLAMA_EMBED:
  {
  tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

@@ -3155,6 +3235,37 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  layer.layer_out_norm_b = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "bias", i), {n_embd}, 0);
  }
  } break;
+ case LLM_ARCH_MODERN_BERT:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+ tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);
+
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+
+ for(int i = 0; i < n_layer; ++i) {
+ auto& layer = layers[i];
+
+ if ( i != 0 ) {
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+ } else{
+ // layer 0 uses identity
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
+ }
+
+
+ layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, 3 * n_embd }, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, 2 * n_ff}, 0);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+ }
+
+ cls = create_tensor(tn(LLM_TENSOR_CLS, "weight"), {n_embd, n_embd}, TENSOR_NOT_REQUIRED);
+ cls_out = create_tensor(tn(LLM_TENSOR_CLS_OUT, "weight"), {n_embd, hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
+ cls_out_b = create_tensor(tn(LLM_TENSOR_CLS_OUT, "bias"), {hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
+
+ } break;
  case LLM_ARCH_NEO_BERT:
  {
  tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -3219,7 +3330,14 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  layer.attn_norm_2_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);

  layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
- layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, layer.ffn_gate ? n_ff : n_ff * 2}, 0);
+
+ const auto tn_ffn_up_weight = tn(LLM_TENSOR_FFN_UP, "weight", i);
+ ggml_tensor * t_ffn_up = ml.get_tensor_meta(tn_ffn_up_weight.str().c_str());
+ const int64_t n_ffn_up = t_ffn_up ? t_ffn_up->ne[1] : n_ff;
+
+ GGML_ASSERT(n_ffn_up == n_ff || n_ffn_up == n_ff * 2);
+ layer.ffn_up = create_tensor(tn_ffn_up_weight, {n_embd, n_ffn_up}, 0);
+ layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ffn_up}, TENSOR_NOT_REQUIRED);

  layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
  layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
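Note: the hunk above stops guessing the FFN up-projection width from whether ffn_gate exists and instead reads it from the tensor's own metadata (ml.get_tensor_meta(...)->ne[1]), accepting either n_ff or 2*n_ff and picking up an optional bias. A minimal sketch of that shape-driven decision, using a stand-in struct for the tensor metadata (names here are illustrative, not the ggml API):

    #include <cassert>
    #include <cstdint>

    struct tensor_meta { int64_t ne[4]; }; // stand-in for ggml tensor metadata

    // Plain MLP checkpoints store ffn_up as {n_embd, n_ff}; fused gate+up
    // variants store {n_embd, 2*n_ff}. Trust the checkpoint, not a heuristic.
    static int64_t resolve_ffn_up_cols(const tensor_meta * t, int64_t n_ff) {
        const int64_t n_ffn_up = t ? t->ne[1] : n_ff; // fall back when the tensor is absent
        assert(n_ffn_up == n_ff || n_ffn_up == 2 * n_ff);
        return n_ffn_up;
    }

    int main() {
        tensor_meta fused = { { 4096, 2 * 11008, 1, 1 } };
        return resolve_ffn_up_cols(&fused, 11008) == 2 * 11008 ? 0 : 1;
    }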
@@ -3747,6 +3865,44 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, i), {n_embd}, 0);
  }
  } break;
+ case LLM_ARCH_PLAMO3:
+ {
+ const int64_t head_dim_q = hparams.n_embd_head_k;
+ const int64_t head_dim_v = hparams.n_embd_head_v;
+
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+ if (output == NULL) {
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+ }
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ const int64_t num_attention_heads = hparams.n_head(i);
+ const int64_t num_key_value_heads = hparams.n_head_kv(i);
+ const int64_t q_proj_dim = num_attention_heads * head_dim_q;
+ const int64_t k_proj_dim = num_key_value_heads * head_dim_q;
+ const int64_t v_proj_dim = num_key_value_heads * head_dim_v;
+ const int64_t n_ff_cur = hparams.n_ff(i);
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+ layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i),
+ {n_embd,q_proj_dim + k_proj_dim + v_proj_dim}, 0);
+ layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {head_dim_q}, 0);
+ layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {head_dim_q}, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {num_attention_heads * head_dim_v, n_embd}, 0);
+ layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, i), {n_embd}, 0);
+
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+ layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, i), {n_embd}, 0);
+
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff_cur * 2}, 0);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff_cur, n_embd}, 0);
+ }
+ } break;
  case LLM_ARCH_GPT2:
  {
  tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -4637,7 +4793,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) {

  // output
  output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
- output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
+ // try to load output.weight, if not found, use token_embd (tied embeddings)
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+ if (!output) {
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+ }

  for (int i = 0; i < n_layer; ++i) {
  auto & layer = layers[i];
@@ -4700,7 +4860,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) {

  // output
  output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
- output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
+ // try to load output.weight, if not found, use token_embd (tied embeddings)
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+ if (!output) {
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+ }

  for (int i = 0; i < n_layer; ++i) {
  auto & layer = layers[i];
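Note: the two hunks above make output.weight optional; when the checkpoint omits it, the token embedding matrix is reused as the output head (tied embeddings) via the TENSOR_DUPLICATED flag. A minimal sketch of that fallback pattern with placeholder types (not the actual create_tensor API):

    #include <cstdio>

    struct tensor { const char * name; }; // placeholder for a loaded weight

    // Prefer a dedicated output head; otherwise tie it to the token embeddings.
    static const tensor * pick_output_head(const tensor * output, const tensor * tok_embd) {
        return output ? output : tok_embd;
    }

    int main() {
        tensor tok_embd = { "token_embd.weight" };
        std::printf("%s\n", pick_output_head(nullptr, &tok_embd)->name); // prints: token_embd.weight
    }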
@@ -5067,9 +5231,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head }, flags);
  layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_k_gqa }, flags);
  layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_v_gqa }, flags);
- layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), { n_embd_head_k * n_head }, flags);
- layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), { n_embd_k_gqa }, flags);
- layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), { n_embd_v_gqa }, flags);
+ layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), { n_embd_head_k * n_head }, TENSOR_NOT_REQUIRED | flags);
+ layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), { n_embd_k_gqa }, TENSOR_NOT_REQUIRED | flags);
+ layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), { n_embd_v_gqa }, TENSOR_NOT_REQUIRED | flags);

  layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, flags);

@@ -5181,9 +5345,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  const int64_t n_group = hparams.ssm_n_group;
  const int64_t d_in_proj = 2*d_inner + 2*n_group*d_state + n_ssm_head;

- const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
- const int64_t n_ff_shexp = hparams.n_ff_shexp;
-
  // embeddings
  tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

@@ -5235,6 +5396,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  } else {
  if (n_expert != 0) {
+ const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
+ const int64_t n_ff_shexp = hparams.n_ff_shexp;
+
  layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert}, 0);
  layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert }, 0);

@@ -6584,6 +6748,75 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { hparams.n_ff_shexp, n_embd }, 0);
  }
  } break;
+ case LLM_ARCH_MIMO2:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+ uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(i);
+ uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(i);
+ uint32_t n_head = hparams.n_head(i);
+
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head }, 0);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_k_gqa }, 0);
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_v_gqa }, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_v * n_head, n_embd }, 0);
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+ layer.attn_sinks = create_tensor(tn(LLM_TENSOR_ATTN_SINKS, "weight", i), {n_head}, TENSOR_NOT_REQUIRED);
+
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+ // non-MoE branch
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, TENSOR_NOT_REQUIRED);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
+
+ // MoE branch
+ int64_t n_ff_exp = hparams.n_ff_exp;
+ layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, TENSOR_NOT_REQUIRED);
+ layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, TENSOR_NOT_REQUIRED);
+ layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, TENSOR_NOT_REQUIRED);
+ layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, TENSOR_NOT_REQUIRED);
+ layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED);
+ }
+ } break;
+ case LLM_ARCH_MAINCODER:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+ // if output is NULL, init from the input tok embed
+ if (output == NULL) {
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+ }
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
+
+ layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
+ layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
+
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+ }
+ } break;
  default:
  throw std::runtime_error("unknown architecture");
  }
@@ -6765,6 +6998,14 @@ size_t llama_model::n_devices() const {
  return devices.size();
  }

+ uint32_t llama_model::n_gpu_layers() const {
+ return params.n_gpu_layers >= 0 ? params.n_gpu_layers : hparams.n_layer + 1;
+ }
+
+ llama_split_mode llama_model::split_mode() const {
+ return params.split_mode;
+ }
+
  std::map<ggml_backend_buffer_type_t, size_t> llama_model::memory_breakdown() const {
  std::map<ggml_backend_buffer_type_t, size_t> ret;
  for (const auto & [ctx, bufs] : pimpl->ctxs_bufs) {
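Note: n_gpu_layers is now resolved through an accessor instead of being read raw from the params: a non-negative value is used as-is, while a negative value means "offload every layer plus the output layer". A standalone sketch mirroring that resolution outside the llama_model class:

    #include <cstdint>
    #include <cstdio>

    // Negative request -> offload all n_layer blocks plus the output layer.
    static uint32_t resolve_n_gpu_layers(int32_t requested, uint32_t n_layer) {
        return requested >= 0 ? (uint32_t) requested : n_layer + 1;
    }

    int main() {
        std::printf("%u\n", resolve_n_gpu_layers(-1, 32)); // 33: everything, incl. output layer
        std::printf("%u\n", resolve_n_gpu_layers(10, 32)); // 10: partial offload
    }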
@@ -7089,6 +7330,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
  case LLM_ARCH_NOMIC_BERT_MOE:
  case LLM_ARCH_NEO_BERT:
  case LLM_ARCH_WAVTOKENIZER_DEC:
+ case LLM_ARCH_MODERN_BERT:
  case LLM_ARCH_GEMMA_EMBEDDING:
  case LLM_ARCH_DREAM:
  case LLM_ARCH_LLADA:
@@ -7206,16 +7448,24 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
  switch (arch) {
  case LLM_ARCH_LLAMA:
  {
- llm = std::make_unique<llm_build_llama>(*this, params);
+ llm = std::make_unique<llm_build_llama<false>>(*this, params);
  } break;
  case LLM_ARCH_LLAMA4:
  {
  if (hparams.swa_type == LLAMA_SWA_TYPE_NONE) {
- llm = std::make_unique<llm_build_llama>(*this, params);
+ llm = std::make_unique<llm_build_llama<false>>(*this, params);
  } else {
  llm = std::make_unique<llm_build_llama_iswa>(*this, params);
  }
  } break;
+ case LLM_ARCH_LLAMA_EMBED:
+ {
+ llm = std::make_unique<llm_build_llama<true>>(*this, params);
+ } break;
+ case LLM_ARCH_MAINCODER:
+ {
+ llm = std::make_unique<llm_build_maincoder>(*this, params);
+ } break;
  case LLM_ARCH_DECI:
  {
  llm = std::make_unique<llm_build_deci>(*this, params);
@@ -7248,6 +7498,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
  {
  llm = std::make_unique<llm_build_bert>(*this, params);
  } break;
+ case LLM_ARCH_MODERN_BERT:
+ {
+ llm = std::make_unique<llm_build_modern_bert>(*this, params);
+ } break;
  case LLM_ARCH_NEO_BERT:
  {
  llm = std::make_unique<llm_build_neo_bert>(*this, params);
@@ -7337,6 +7591,14 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
  {
  llm = std::make_unique<llm_build_plamo2>(*this, params);
  } break;
+ case LLM_ARCH_PLAMO3:
+ {
+ if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
+ llm = std::make_unique<llm_build_plamo3<true>> (*this, params);
+ } else {
+ llm = std::make_unique<llm_build_plamo3<false>>(*this, params);
+ }
+ } break;
  case LLM_ARCH_GPT2:
  {
  llm = std::make_unique<llm_build_gpt2>(*this, params);
@@ -7637,6 +7899,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
  {
  llm = std::make_unique<llm_build_mistral3>(*this, params);
  } break;
+ case LLM_ARCH_MIMO2:
+ {
+ llm = std::make_unique<llm_build_mimo2_iswa>(*this, params);
+ } break;
  default:
  GGML_ABORT("fatal error");
  }
@@ -7662,7 +7928,7 @@ llama_model_params llama_model_default_params() {
  llama_model_params result = {
  /*.devices =*/ nullptr,
  /*.tensor_buft_overrides =*/ nullptr,
- /*.n_gpu_layers =*/ 999,
+ /*.n_gpu_layers =*/ -1,
  /*.split_mode =*/ LLAMA_SPLIT_MODE_LAYER,
  /*.main_gpu =*/ 0,
  /*.tensor_split =*/ nullptr,
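Note: the default n_gpu_layers changes from 999 to -1. With the accessor above, -1 still means "offload everything", but code that compared against the old literal 999 should check for a negative value instead, and CPU-only loading still requires an explicit 0. A short usage sketch against the public llama.h API (model path and entry-point names assumed from current llama.h):

    #include "llama.h"

    int main() {
        llama_model_params mparams = llama_model_default_params();
        // mparams.n_gpu_layers is now -1: offload all layers by default.
        mparams.n_gpu_layers = 0; // opt out and keep everything on the CPU
        llama_model * model = llama_model_load_from_file("model.gguf", mparams);
        if (model != nullptr) {
            llama_model_free(model);
        }
        return 0;
    }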
@@ -7807,6 +8073,8 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
  case LLM_ARCH_ERNIE4_5:
  case LLM_ARCH_ERNIE4_5_MOE:
  case LLM_ARCH_MISTRAL3:
+ case LLM_ARCH_LLAMA_EMBED:
+ case LLM_ARCH_MAINCODER:
  return LLAMA_ROPE_TYPE_NORM;

  // the pairs of head values are offset by n_rot/2
@@ -7816,6 +8084,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
  case LLM_ARCH_DBRX:
  case LLM_ARCH_BERT:
  case LLM_ARCH_JINA_BERT_V3:
+ case LLM_ARCH_MODERN_BERT:
  case LLM_ARCH_NOMIC_BERT:
  case LLM_ARCH_NOMIC_BERT_MOE:
  case LLM_ARCH_STABLELM:
@@ -7835,6 +8104,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
  case LLM_ARCH_PHIMOE:
  case LLM_ARCH_PLAMO:
  case LLM_ARCH_PLAMO2:
+ case LLM_ARCH_PLAMO3:
  case LLM_ARCH_GEMMA:
  case LLM_ARCH_GEMMA2:
  case LLM_ARCH_GEMMA3:
@@ -7865,6 +8135,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
  case LLM_ARCH_PANGU_EMBED:
  case LLM_ARCH_AFMOE:
  case LLM_ARCH_QWEN3NEXT:
+ case LLM_ARCH_MIMO2:
  return LLAMA_ROPE_TYPE_NEOX;

  case LLM_ARCH_QWEN2VL:
package/src/llama.cpp/src/llama-model.h
@@ -24,12 +24,14 @@ enum llm_type {
  LLM_TYPE_17M,
  LLM_TYPE_22M,
  LLM_TYPE_33M,
+ LLM_TYPE_47M,
  LLM_TYPE_60M,
  LLM_TYPE_70M,
  LLM_TYPE_80M,
  LLM_TYPE_109M,
  LLM_TYPE_137M,
  LLM_TYPE_140M,
+ LLM_TYPE_149M,
  LLM_TYPE_160M,
  LLM_TYPE_190M,
  LLM_TYPE_220M,
@@ -39,6 +41,7 @@ enum llm_type {
  LLM_TYPE_335M,
  LLM_TYPE_350M,
  LLM_TYPE_360M,
+ LLM_TYPE_395M,
  LLM_TYPE_410M,
  LLM_TYPE_450M,
  LLM_TYPE_475M,
@@ -116,10 +119,12 @@ enum llm_type {
  LLM_TYPE_31B_A3_5B,
  LLM_TYPE_80B_A3B, // Qwen3 Next
  LLM_TYPE_100B_A6B,
+ LLM_TYPE_102B_A12B, // Solar-Open
  LLM_TYPE_106B_A12B, // GLM-4.5-Air
  LLM_TYPE_230B_A10B, // Minimax M2
  LLM_TYPE_235B_A22B,
  LLM_TYPE_300B_A47B, // Ernie MoE big
+ LLM_TYPE_310B_A15B, // /MiMo-V2-Flash
  LLM_TYPE_355B_A32B, // GLM-4.5
  LLM_TYPE_E2B,
  LLM_TYPE_E4B,
@@ -462,8 +467,6 @@ struct llama_model {
  struct ggml_tensor * dense_2_out_layers = nullptr;
  struct ggml_tensor * dense_3_out_layers = nullptr;

- llama_model_params params;
-
  // gguf metadata
  std::unordered_map<std::string, std::string> gguf_kv;

@@ -473,6 +476,9 @@ struct llama_model {
  // for quantize-stats only
  std::vector<std::pair<std::string, struct ggml_tensor *>> tensors_by_name;

+ // for keeping track of extra nodes used by lora adapters
+ uint32_t n_lora_nodes = 0;
+
  int64_t t_load_us = 0;
  int64_t t_start_us = 0;

@@ -494,6 +500,9 @@ struct llama_model {
  size_t n_tensors() const;
  size_t n_devices() const;

+ uint32_t n_gpu_layers() const;
+ llama_split_mode split_mode() const;
+
  std::map<ggml_backend_buffer_type_t, size_t> memory_breakdown() const;

  // total number of parameters in the model
@@ -522,6 +531,8 @@ struct llama_model {
  ggml_cgraph * build_graph(const llm_graph_params & params) const;

  private:
+ llama_model_params params;
+
  struct impl;
  std::unique_ptr<impl> pimpl;
  };