@fugood/llama.node 1.4.11 → 1.4.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69)
  1. package/package.json +15 -15
  2. package/scripts/llama.cpp.patch +31 -31
  3. package/src/llama.cpp/common/arg.cpp +128 -59
  4. package/src/llama.cpp/common/arg.h +1 -0
  5. package/src/llama.cpp/common/chat-parser.cpp +11 -0
  6. package/src/llama.cpp/common/chat.cpp +36 -7
  7. package/src/llama.cpp/common/chat.h +1 -0
  8. package/src/llama.cpp/common/common.cpp +42 -23
  9. package/src/llama.cpp/common/common.h +11 -1
  10. package/src/llama.cpp/common/llguidance.cpp +10 -6
  11. package/src/llama.cpp/common/regex-partial.cpp +13 -13
  12. package/src/llama.cpp/common/sampling.cpp +58 -14
  13. package/src/llama.cpp/common/sampling.h +3 -1
  14. package/src/llama.cpp/ggml/CMakeLists.txt +13 -1
  15. package/src/llama.cpp/ggml/include/ggml-backend.h +1 -1
  16. package/src/llama.cpp/ggml/src/CMakeLists.txt +23 -9
  17. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +12 -2
  18. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +1 -1
  19. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +86 -25
  20. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +15 -8
  21. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +768 -0
  22. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +0 -4
  23. package/src/llama.cpp/include/llama.h +100 -12
  24. package/src/llama.cpp/src/CMakeLists.txt +4 -0
  25. package/src/llama.cpp/src/llama-adapter.cpp +12 -3
  26. package/src/llama.cpp/src/llama-adapter.h +7 -1
  27. package/src/llama.cpp/src/llama-arch.cpp +78 -0
  28. package/src/llama.cpp/src/llama-arch.h +8 -0
  29. package/src/llama.cpp/src/llama-chat.cpp +11 -0
  30. package/src/llama.cpp/src/llama-chat.h +1 -0
  31. package/src/llama.cpp/src/llama-context.cpp +637 -49
  32. package/src/llama.cpp/src/llama-context.h +43 -1
  33. package/src/llama.cpp/src/llama-grammar.cpp +40 -13
  34. package/src/llama.cpp/src/llama-grammar.h +2 -0
  35. package/src/llama.cpp/src/llama-graph.cpp +173 -5
  36. package/src/llama.cpp/src/llama-graph.h +71 -6
  37. package/src/llama.cpp/src/llama-hparams.cpp +4 -0
  38. package/src/llama.cpp/src/llama-hparams.h +12 -5
  39. package/src/llama.cpp/src/llama-kv-cache.h +1 -1
  40. package/src/llama.cpp/src/llama-mmap.cpp +11 -4
  41. package/src/llama.cpp/src/llama-model-loader.cpp +23 -0
  42. package/src/llama.cpp/src/llama-model-loader.h +2 -0
  43. package/src/llama.cpp/src/llama-model-saver.cpp +3 -0
  44. package/src/llama.cpp/src/llama-model.cpp +337 -26
  45. package/src/llama.cpp/src/llama-model.h +13 -2
  46. package/src/llama.cpp/src/llama-sampling.cpp +1259 -186
  47. package/src/llama.cpp/src/llama-sampling.h +19 -7
  48. package/src/llama.cpp/src/llama-vocab.cpp +101 -33
  49. package/src/llama.cpp/src/llama-vocab.h +2 -0
  50. package/src/llama.cpp/src/llama.cpp +87 -64
  51. package/src/llama.cpp/src/models/afmoe.cpp +9 -5
  52. package/src/llama.cpp/src/models/bert.cpp +4 -2
  53. package/src/llama.cpp/src/models/cogvlm.cpp +5 -3
  54. package/src/llama.cpp/src/models/cohere2-iswa.cpp +3 -0
  55. package/src/llama.cpp/src/models/deepseek2.cpp +1 -1
  56. package/src/llama.cpp/src/models/gemma-embedding.cpp +2 -6
  57. package/src/llama.cpp/src/models/gemma2-iswa.cpp +5 -2
  58. package/src/llama.cpp/src/models/gemma3.cpp +3 -4
  59. package/src/llama.cpp/src/models/gemma3n-iswa.cpp +4 -7
  60. package/src/llama.cpp/src/models/llama-iswa.cpp +6 -2
  61. package/src/llama.cpp/src/models/llama.cpp +19 -6
  62. package/src/llama.cpp/src/models/maincoder.cpp +117 -0
  63. package/src/llama.cpp/src/models/mimo2-iswa.cpp +123 -0
  64. package/src/llama.cpp/src/models/models.h +18 -0
  65. package/src/llama.cpp/src/models/modern-bert.cpp +116 -0
  66. package/src/llama.cpp/src/models/openai-moe-iswa.cpp +5 -2
  67. package/src/llama.cpp/src/models/plamo3.cpp +128 -0
  68. package/src/llama.cpp/src/models/smallthinker.cpp +11 -5
  69. package/src/llama.cpp/src/unicode.cpp +23 -14
@@ -14,10 +14,6 @@
 #include <arm_neon.h>
 #endif
 
-#if defined(__F16C__)
-#include <immintrin.h>
-#endif
-
 #if defined(__riscv_v_intrinsic)
 #include <riscv_vector.h>
 #endif
@@ -286,7 +286,7 @@ extern "C" {
         // NULL-terminated list of buffer types to use for tensors that match a pattern
         const struct llama_model_tensor_buft_override * tensor_buft_overrides;
 
-        int32_t n_gpu_layers; // number of layers to store in VRAM
+        int32_t n_gpu_layers; // number of layers to store in VRAM, a negative value means all layers
         enum llama_split_mode split_mode; // how to split the model across multiple GPUs
 
         // the GPU that is used for the entire model when split_mode is LLAMA_SPLIT_MODE_NONE
@@ -316,6 +316,11 @@ extern "C" {
         bool no_alloc; // only load metadata and simulate memory allocations
     };
 
+    struct llama_sampler_seq_config {
+        llama_seq_id           seq_id;
+        struct llama_sampler * sampler;
+    };
+
     // NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations
     // https://github.com/ggml-org/llama.cpp/pull/7544
     struct llama_context_params {
@@ -364,6 +369,12 @@ extern "C" {
         bool kv_unified; // use a unified buffer across the input sequences when computing the attention
                          // try to disable when n_seq_max > 1 for improved performance when the sequences do not share a large prefix
                          // ref: https://github.com/ggml-org/llama.cpp/pull/14363
+
+        // [EXPERIMENTAL]
+        // backend sampler chain configuration (make sure the caller keeps the sampler chains alive)
+        // note: the samplers must be sampler chains (i.e. use llama_sampler_chain_init)
+        struct llama_sampler_seq_config * samplers;
+        size_t                            n_samplers;
     };
 
     // model quantization parameters
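
Note: a minimal sketch of how the new per-sequence backend sampler configuration might be used, assuming the model has already been loaded; the greedy chain and the single sequence are purely illustrative:

    // build a sampler chain and hand it to the context for sequence 0
    struct llama_sampler * chain = llama_sampler_chain_init(llama_sampler_chain_default_params());
    llama_sampler_chain_add(chain, llama_sampler_init_greedy());

    struct llama_sampler_seq_config seq_cfg = { /*.seq_id  =*/ 0, /*.sampler =*/ chain };

    struct llama_context_params cparams = llama_context_default_params();
    cparams.samplers   = &seq_cfg;  // the caller must keep the chain alive for the context lifetime
    cparams.n_samplers = 1;

    struct llama_context * ctx = llama_init_from_model(model, cparams);
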
@@ -467,10 +478,17 @@ extern "C" {
     // Frees all allocated memory
     LLAMA_API void llama_free(struct llama_context * ctx);
 
+    enum llama_params_fit_status {
+        LLAMA_PARAMS_FIT_STATUS_SUCCESS = 0, // found allocations that are projected to fit
+        LLAMA_PARAMS_FIT_STATUS_FAILURE = 1, // could not find allocations that are projected to fit
+        LLAMA_PARAMS_FIT_STATUS_ERROR   = 2, // a hard error occured, e.g. because no model could be found at the specified path
+    };
+
     // fits mparams and cparams to free device memory (assumes system memory is unlimited)
-    // returns true if the parameters could be successfully modified to fit device memory
-    // this function is NOT thread safe because it modifies the global llama logger state
-    LLAMA_API bool llama_params_fit(
+    // - returns true if the parameters could be successfully modified to fit device memory
+    // - this function is NOT thread safe because it modifies the global llama logger state
+    // - only parameters that have the same value as in llama_default_model_params are modified
+    LLAMA_API enum llama_params_fit_status llama_params_fit(
             const char * path_model,
             struct llama_model_params * mparams,
             struct llama_context_params * cparams,
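
Note: call sites that previously checked the bool result now have to branch on the status enum. A hedged sketch of the new handling; the trailing parameters of llama_params_fit are truncated in this hunk, so the call itself is elided:

    enum llama_params_fit_status st = /* llama_params_fit(path_model, &mparams, &cparams, ...) */
                                      LLAMA_PARAMS_FIT_STATUS_SUCCESS;
    switch (st) {
        case LLAMA_PARAMS_FIT_STATUS_SUCCESS: /* proceed with the adjusted mparams/cparams      */ break;
        case LLAMA_PARAMS_FIT_STATUS_FAILURE: /* no fitting allocation was found                */ break;
        case LLAMA_PARAMS_FIT_STATUS_ERROR:   /* hard error, e.g. the model file does not exist */ break;
    }
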
@@ -517,6 +535,7 @@ extern "C" {
     LLAMA_API int32_t llama_model_n_ctx_train(const struct llama_model * model);
     LLAMA_API int32_t llama_model_n_embd     (const struct llama_model * model);
     LLAMA_API int32_t llama_model_n_embd_inp (const struct llama_model * model);
+    LLAMA_API int32_t llama_model_n_embd_out (const struct llama_model * model);
     LLAMA_API int32_t llama_model_n_layer    (const struct llama_model * model);
     LLAMA_API int32_t llama_model_n_head     (const struct llama_model * model);
     LLAMA_API int32_t llama_model_n_head_kv  (const struct llama_model * model);
@@ -600,6 +619,8 @@ extern "C" {
     //
 
     // Load a LoRA adapter from file
+    // The adapter is valid as long as the associated model is not freed
+    // All adapters must be loaded before context creation
     LLAMA_API struct llama_adapter_lora * llama_adapter_lora_init(
             struct llama_model * model,
             const char * path_lora);
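
Note: a sketch of the load order the new comments require; the file paths are illustrative, not part of this package:

    // model first, then every adapter, and only then the context
    struct llama_model * model = llama_model_load_from_file("model.gguf", llama_model_default_params());
    struct llama_adapter_lora * lora = llama_adapter_lora_init(model, "adapter.gguf");

    struct llama_context * ctx = llama_init_from_model(model, llama_context_default_params());
    llama_set_adapter_lora(ctx, lora, 1.0f); // enable the adapter for this context
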
@@ -983,6 +1004,32 @@ extern "C" {
     // otherwise: float[n_embd] (1-dimensional)
     LLAMA_API float * llama_get_embeddings_seq(struct llama_context * ctx, llama_seq_id seq_id);
 
+    //
+    // backend sampling API [EXPERIMENTAL]
+    // note: use only if the llama_context was created with at least one llama_sampler_seq_config
+    //
+
+    // Get the backend sampled token for the ith token.
+    // Returns LLAMA_TOKEN_NULL if no token was sampled.
+    LLAMA_API llama_token llama_get_sampled_token_ith(struct llama_context * ctx, int32_t i);
+
+    // Get the backend sampled probabilites for the ith token
+    // The index matches llama_get_sampled_token_ith().
+    // Returns NULL if no probabilites were generated.
+    LLAMA_API float *  llama_get_sampled_probs_ith      (struct llama_context * ctx, int32_t i);
+    LLAMA_API uint32_t llama_get_sampled_probs_count_ith(struct llama_context * ctx, int32_t i);
+
+    // Get the backend sampled logits for the ith token
+    // Returns NULL if no logits were sampled.
+    LLAMA_API float *  llama_get_sampled_logits_ith      (struct llama_context * ctx, int32_t i);
+    LLAMA_API uint32_t llama_get_sampled_logits_count_ith(struct llama_context * ctx, int32_t i);
+
+    // Get the backend sampled candidates (token ids) for the ith token
+    // These are needed to map probability/logit indices to vocab token ids.
+    // Returns NULL if no candidates were sampled.
+    LLAMA_API llama_token * llama_get_sampled_candidates_ith      (struct llama_context * ctx, int32_t i);
+    LLAMA_API uint32_t      llama_get_sampled_candidates_count_ith(struct llama_context * ctx, int32_t i);
+
     //
     // Vocab
     //
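
Note: a sketch of reading back the backend sampling results after llama_decode(); ctx is assumed to have been created with at least one llama_sampler_seq_config, and output index 0 is illustrative:

    const llama_token tok = llama_get_sampled_token_ith(ctx, 0);
    if (tok != LLAMA_TOKEN_NULL) {
        const float       * probs = llama_get_sampled_probs_ith(ctx, 0);
        const llama_token * cands = llama_get_sampled_candidates_ith(ctx, 0);
        const uint32_t      n     = llama_get_sampled_probs_count_ith(ctx, 0);
        for (uint32_t k = 0; probs && cands && k < n; ++k) {
            // cands[k] maps the k-th probability back to a vocab token id
            printf("token %d  p = %.3f\n", cands[k], probs[k]);
        }
    }
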
@@ -1154,11 +1201,16 @@ extern "C" {
     //
     //    llama_sampler_free(smpl);
     //
-    // TODO: In the future, llama_sampler will be utilized to offload the sampling to the backends (e.g. GPU).
-    //
 
     typedef void * llama_sampler_context_t;
 
+    struct llama_sampler_data {
+        struct ggml_tensor * logits;
+        struct ggml_tensor * probs;
+        struct ggml_tensor * sampled;
+        struct ggml_tensor * candidates;
+    };
+
     // user code can implement the interface below in order to create custom llama_sampler
     struct llama_sampler_i {
         const char * (*name) (const struct llama_sampler * smpl); // can be NULL
@@ -1168,17 +1220,45 @@ extern "C" {
         struct llama_sampler * (*clone) (const struct llama_sampler * smpl); // can be NULL if ctx is NULL
         void                   (*free)  (      struct llama_sampler * smpl); // can be NULL if ctx is NULL
 
-        // TODO: API for internal libllama usage for appending the sampling to an existing ggml_cgraph
-        //void (*apply_ggml) (struct llama_sampler * smpl, ...);
+        // [EXPERIMENTAL]
+        // backend sampling interface:
+
+        // return true if the backend supports all ops needed by the sampler
+        // note: call once per sampler
+        bool (*backend_init)(struct llama_sampler * smpl, ggml_backend_buffer_type_t buft);
+
+        // call after .backend_apply()
+        void (*backend_accept)(
+                struct llama_sampler * smpl,
+                struct ggml_context  * ctx,
+                struct ggml_cgraph   * gf,
+                struct ggml_tensor   * selected_token);
+
+        // call after .backend_init()
+        void (*backend_apply)(
+                struct llama_sampler      * smpl,
+                struct ggml_context       * ctx,
+                struct ggml_cgraph        * gf,
+                struct llama_sampler_data * data);
+
+        // called before graph execution to set inputs for the current ubatch
+        void (*backend_set_input)(struct llama_sampler * smpl);
     };
 
     struct llama_sampler {
-        const struct llama_sampler_i * iface;
-        llama_sampler_context_t        ctx;
+        struct llama_sampler_i * iface;
+
+        llama_sampler_context_t  ctx;
     };
 
+    // [EXPERIMENTAL]
+    // attach a sampler to the context
+    // note: prefer initializing the context with llama_context_params.samplers when possible
+    // note: changing the samplers of a context can cause graph reallocations and degraded performance
+    LLAMA_API bool llama_set_sampler(struct llama_context * ctx, llama_seq_id seq_id, struct llama_sampler * smpl);
+
     // mirror of llama_sampler_i:
-    LLAMA_API struct llama_sampler * llama_sampler_init  (const struct llama_sampler_i * iface, llama_sampler_context_t ctx);
+    LLAMA_API struct llama_sampler * llama_sampler_init  (      struct llama_sampler_i * iface, llama_sampler_context_t ctx);
     LLAMA_API const char *           llama_sampler_name  (const struct llama_sampler * smpl);
     LLAMA_API void                   llama_sampler_accept(      struct llama_sampler * smpl, llama_token token);
     LLAMA_API void                   llama_sampler_apply (      struct llama_sampler * smpl, llama_token_data_array * cur_p);
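
Note: a sketch of attaching a different backend sampler chain after the context exists; the chain contents are illustrative, and reading the bool return as "could not be attached" is an assumption, not documented in this hunk:

    struct llama_sampler * chain = llama_sampler_chain_init(llama_sampler_chain_default_params());
    llama_sampler_chain_add(chain, llama_sampler_init_top_k(40));
    llama_sampler_chain_add(chain, llama_sampler_init_dist(LLAMA_DEFAULT_SEED));

    if (!llama_set_sampler(ctx, /*seq_id=*/ 0, chain)) {
        // assumption: attachment failed, fall back to CPU-side sampling via llama_sampler_sample()
    }
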
@@ -1194,7 +1274,15 @@ extern "C" {
 
     // important: takes ownership of the sampler object and will free it when llama_sampler_free is called
     LLAMA_API void llama_sampler_chain_add(struct llama_sampler * chain, struct llama_sampler * smpl);
-    LLAMA_API struct llama_sampler * llama_sampler_chain_get(const struct llama_sampler * chain, int32_t i);
+
+    // return NULL if:
+    // - the sampler is NULL
+    // - the sampler is not a llama_sampler_chain
+    // - the index is out of bounds, unless i == -1
+    // - if i == -1, returns the chain itself (can be used to check if the sampler is a chain)
+    LLAMA_API struct llama_sampler * llama_sampler_chain_get(struct llama_sampler * chain, int32_t i);
+
+    // the total number of samplers in the chain
     LLAMA_API int                    llama_sampler_chain_n  (const struct llama_sampler * chain);
 
     // after removing a sampler, the chain will no longer own it, and it will not be freed when the chain is freed
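
Note: the i == -1 convention doubles as a type check; a one-line sketch with an arbitrary sampler smpl:

    const bool is_chain = llama_sampler_chain_get(smpl, -1) != NULL;
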
@@ -87,9 +87,12 @@ add_library(llama
             models/llada.cpp
             models/llama-iswa.cpp
             models/llama.cpp
+            models/maincoder.cpp
             models/mamba.cpp
+            models/mimo2-iswa.cpp
             models/minicpm3.cpp
             models/minimax-m2.cpp
+            models/modern-bert.cpp
             models/mpt.cpp
             models/nemotron-h.cpp
             models/nemotron.cpp
@@ -105,6 +108,7 @@ add_library(llama
             models/phi3.cpp
             models/plamo.cpp
             models/plamo2.cpp
+            models/plamo3.cpp
             models/plm.cpp
             models/qwen.cpp
             models/qwen2.cpp
@@ -146,9 +146,11 @@ llama_adapter_lora_weight * llama_adapter_lora::get_weight(ggml_tensor * w) {
     return nullptr;
 }
 
-static void llama_adapter_lora_init_impl(llama_model & model, const char * path_lora, llama_adapter_lora & adapter) {
+static void llama_adapter_lora_init_impl(const char * path_lora, llama_adapter_lora & adapter) {
     LLAMA_LOG_INFO("%s: loading lora adapter from '%s' ...\n", __func__, path_lora);
 
+    llama_model & model = adapter.model;
+
     ggml_context * ctx_init;
     gguf_init_params meta_gguf_params = {
         /* .no_alloc = */ true,
@@ -411,14 +413,17 @@ static void llama_adapter_lora_init_impl(llama_model & model, const char * path_
         }
     }
 
+    // update number of nodes used
+    model.n_lora_nodes += adapter.get_n_nodes();
+
     LLAMA_LOG_INFO("%s: loaded %zu tensors from lora file\n", __func__, adapter.ab_map.size()*2);
 }
 
 llama_adapter_lora * llama_adapter_lora_init(llama_model * model, const char * path_lora) {
-    llama_adapter_lora * adapter = new llama_adapter_lora();
+    llama_adapter_lora * adapter = new llama_adapter_lora(*model);
 
     try {
-        llama_adapter_lora_init_impl(*model, path_lora, *adapter);
+        llama_adapter_lora_init_impl(path_lora, *adapter);
         return adapter;
     } catch (const std::exception & err) {
         LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what());
@@ -469,6 +474,10 @@ int32_t llama_adapter_meta_val_str_by_index(const llama_adapter_lora * adapter,
 }
 
 void llama_adapter_lora_free(llama_adapter_lora * adapter) {
+    // update number of nodes used
+    GGML_ASSERT(adapter->model.n_lora_nodes >= adapter->get_n_nodes());
+    adapter->model.n_lora_nodes -= adapter->get_n_nodes();
+
     delete adapter;
 }
 
@@ -59,6 +59,8 @@ struct llama_adapter_lora_weight {
 };
 
 struct llama_adapter_lora {
+    llama_model & model;
+
     // map tensor name to lora_a_b
     std::unordered_map<std::string, llama_adapter_lora_weight> ab_map;
 
@@ -73,10 +75,14 @@ struct llama_adapter_lora {
     // activated lora (aLoRA)
     std::vector<llama_token> alora_invocation_tokens;
 
-    llama_adapter_lora() = default;
+    llama_adapter_lora(llama_model & model) : model(model) {}
     ~llama_adapter_lora() = default;
 
     llama_adapter_lora_weight * get_weight(ggml_tensor * w);
+
+    uint32_t get_n_nodes() const {
+        return ab_map.size() * 6u; // a, b, scale, add, 2 x mul_mat
+    }
 };
 
 using llama_adapter_loras = std::unordered_map<llama_adapter_lora *, float>;
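
Note on the node accounting: an adapter whose ab_map holds, say, 224 A/B tensor pairs (an illustrative figure) reserves 224 * 6 = 1344 extra graph nodes in model.n_lora_nodes, and llama_adapter_lora_free releases the same amount, matching the GGML_ASSERT in the hunk above.
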
@@ -20,6 +20,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_STARCODER, "starcoder" },
     { LLM_ARCH_REFACT, "refact" },
     { LLM_ARCH_BERT, "bert" },
+    { LLM_ARCH_MODERN_BERT, "modern-bert" },
     { LLM_ARCH_NOMIC_BERT, "nomic-bert" },
     { LLM_ARCH_NOMIC_BERT_MOE, "nomic-bert-moe" },
     { LLM_ARCH_NEO_BERT, "neo-bert" },
@@ -41,6 +42,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_PHIMOE, "phimoe" },
     { LLM_ARCH_PLAMO, "plamo" },
     { LLM_ARCH_PLAMO2, "plamo2" },
+    { LLM_ARCH_PLAMO3, "plamo3" },
     { LLM_ARCH_CODESHELL, "codeshell" },
     { LLM_ARCH_ORION, "orion" },
     { LLM_ARCH_INTERNLM2, "internlm2" },
@@ -114,6 +116,9 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_RND1, "rnd1" },
     { LLM_ARCH_PANGU_EMBED, "pangu-embedded" },
     { LLM_ARCH_MISTRAL3, "mistral3" },
+    { LLM_ARCH_MIMO2, "mimo2" },
+    { LLM_ARCH_LLAMA_EMBED, "llama-embed" },
+    { LLM_ARCH_MAINCODER, "maincoder" },
     { LLM_ARCH_UNKNOWN, "(unknown)" },
 };
 
@@ -147,6 +152,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_VOCAB_SIZE, "%s.vocab_size" },
     { LLM_KV_CONTEXT_LENGTH, "%s.context_length" },
     { LLM_KV_EMBEDDING_LENGTH, "%s.embedding_length" },
+    { LLM_KV_EMBEDDING_LENGTH_OUT, "%s.embedding_length_out" },
     { LLM_KV_FEATURES_LENGTH, "%s.features_length" },
     { LLM_KV_BLOCK_COUNT, "%s.block_count" },
     { LLM_KV_LEADING_DENSE_BLOCK_COUNT, "%s.leading_dense_block_count" },
@@ -204,6 +210,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_ATTENTION_GATE_LORA_RANK, "%s.attention.gate_lora_rank" },
     { LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, "%s.attention.relative_buckets_count" },
     { LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" },
+    { LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, "%s.attention.sliding_window_pattern" },
     { LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
     { LLM_KV_ATTENTION_OUTPUT_SCALE, "%s.attention.output_scale" },
     { LLM_KV_ATTENTION_TEMPERATURE_LENGTH, "%s.attention.temperature_length" },
@@ -214,6 +221,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
     { LLM_KV_ROPE_DIMENSION_SECTIONS, "%s.rope.dimension_sections" },
     { LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" },
+    { LLM_KV_ROPE_FREQ_BASE_SWA, "%s.rope.freq_base_swa" },
     { LLM_KV_ROPE_SCALE_LINEAR, "%s.rope.scale_linear" },
     { LLM_KV_ROPE_SCALING_TYPE, "%s.rope.scaling.type" },
     { LLM_KV_ROPE_SCALING_FACTOR, "%s.rope.scaling.factor" },
@@ -497,6 +505,7 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
         case LLM_ARCH_LLAMA:
         case LLM_ARCH_DECI:
         case LLM_ARCH_MISTRAL3:
+        case LLM_ARCH_LLAMA_EMBED:
             return {
                 LLM_TENSOR_TOKEN_EMBD,
                 LLM_TENSOR_OUTPUT_NORM,
@@ -778,6 +787,20 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
                 LLM_TENSOR_CLS,
                 LLM_TENSOR_CLS_OUT,
             };
+        case LLM_ARCH_MODERN_BERT:
+            return {
+                LLM_TENSOR_TOKEN_EMBD,
+                LLM_TENSOR_TOKEN_EMBD_NORM,
+                LLM_TENSOR_OUTPUT_NORM,
+                LLM_TENSOR_ATTN_NORM,
+                LLM_TENSOR_ATTN_OUT,
+                LLM_TENSOR_ATTN_QKV,
+                LLM_TENSOR_FFN_DOWN,
+                LLM_TENSOR_FFN_UP,
+                LLM_TENSOR_FFN_NORM,
+                LLM_TENSOR_CLS,
+                LLM_TENSOR_CLS_OUT,
+            };
         case LLM_ARCH_JINA_BERT_V2:
             return {
                 LLM_TENSOR_TOKEN_EMBD,
@@ -1057,6 +1080,22 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
                 LLM_TENSOR_ATTN_POST_NORM,
                 LLM_TENSOR_FFN_POST_NORM,
             };
+        case LLM_ARCH_PLAMO3:
+            return {
+                LLM_TENSOR_TOKEN_EMBD,
+                LLM_TENSOR_OUTPUT_NORM,
+                LLM_TENSOR_OUTPUT,
+                LLM_TENSOR_ATTN_NORM,
+                LLM_TENSOR_ATTN_QKV,
+                LLM_TENSOR_ATTN_Q_NORM,
+                LLM_TENSOR_ATTN_K_NORM,
+                LLM_TENSOR_ATTN_OUT,
+                LLM_TENSOR_ATTN_POST_NORM,
+                LLM_TENSOR_FFN_NORM,
+                LLM_TENSOR_FFN_POST_NORM,
+                LLM_TENSOR_FFN_DOWN,
+                LLM_TENSOR_FFN_UP,
+            };
         case LLM_ARCH_CODESHELL:
             return {
                 LLM_TENSOR_TOKEN_EMBD,
@@ -2037,6 +2076,7 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
                 LLM_TENSOR_TOKEN_EMBD,
                 LLM_TENSOR_OUTPUT_NORM_LFM2,
                 LLM_TENSOR_OUTPUT,
+                LLM_TENSOR_DENSE_2_OUT,
             };
         case LLM_ARCH_LFM2MOE:
             return {
@@ -2171,11 +2211,49 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
                 LLM_TENSOR_VISEXP_FFN_DOWN,
                 LLM_TENSOR_VISEXP_FFN_UP,
             };
+        case LLM_ARCH_MIMO2:
+            return {
+                LLM_TENSOR_TOKEN_EMBD,
+                LLM_TENSOR_OUTPUT_NORM,
+                LLM_TENSOR_OUTPUT,
+                LLM_TENSOR_ATTN_NORM,
+                LLM_TENSOR_ATTN_Q,
+                LLM_TENSOR_ATTN_K,
+                LLM_TENSOR_ATTN_V,
+                LLM_TENSOR_ATTN_SINKS,
+                LLM_TENSOR_ATTN_OUT,
+                LLM_TENSOR_FFN_NORM,
+                LLM_TENSOR_FFN_GATE,
+                LLM_TENSOR_FFN_DOWN,
+                LLM_TENSOR_FFN_UP,
+                LLM_TENSOR_FFN_GATE_INP,
+                LLM_TENSOR_FFN_GATE_EXPS,
+                LLM_TENSOR_FFN_DOWN_EXPS,
+                LLM_TENSOR_FFN_UP_EXPS,
+                LLM_TENSOR_FFN_EXP_PROBS_B,
+            };
         case LLM_ARCH_GPTJ:
         case LLM_ARCH_UNKNOWN:
             return {
                 LLM_TENSOR_TOKEN_EMBD,
             };
+        case LLM_ARCH_MAINCODER:
+            return {
+                LLM_TENSOR_TOKEN_EMBD,
+                LLM_TENSOR_OUTPUT_NORM,
+                LLM_TENSOR_OUTPUT,
+                LLM_TENSOR_ATTN_NORM,
+                LLM_TENSOR_ATTN_Q,
+                LLM_TENSOR_ATTN_Q_NORM,
+                LLM_TENSOR_ATTN_K,
+                LLM_TENSOR_ATTN_K_NORM,
+                LLM_TENSOR_ATTN_V,
+                LLM_TENSOR_ATTN_OUT,
+                LLM_TENSOR_FFN_NORM,
+                LLM_TENSOR_FFN_GATE,
+                LLM_TENSOR_FFN_DOWN,
+                LLM_TENSOR_FFN_UP,
+            };
         default:
             GGML_ABORT("unknown architecture for tensor mapping");
     }
@@ -24,6 +24,7 @@ enum llm_arch {
     LLM_ARCH_STARCODER,
     LLM_ARCH_REFACT,
     LLM_ARCH_BERT,
+    LLM_ARCH_MODERN_BERT,
     LLM_ARCH_NOMIC_BERT,
     LLM_ARCH_NOMIC_BERT_MOE,
     LLM_ARCH_NEO_BERT,
@@ -45,6 +46,7 @@ enum llm_arch {
     LLM_ARCH_PHIMOE,
     LLM_ARCH_PLAMO,
     LLM_ARCH_PLAMO2,
+    LLM_ARCH_PLAMO3,
     LLM_ARCH_CODESHELL,
     LLM_ARCH_ORION,
     LLM_ARCH_INTERNLM2,
@@ -118,6 +120,9 @@ enum llm_arch {
     LLM_ARCH_RND1,
     LLM_ARCH_PANGU_EMBED,
     LLM_ARCH_MISTRAL3,
+    LLM_ARCH_MIMO2,
+    LLM_ARCH_LLAMA_EMBED,
+    LLM_ARCH_MAINCODER,
     LLM_ARCH_UNKNOWN,
 };
 
@@ -151,6 +156,7 @@ enum llm_kv {
     LLM_KV_VOCAB_SIZE,
     LLM_KV_CONTEXT_LENGTH,
     LLM_KV_EMBEDDING_LENGTH,
+    LLM_KV_EMBEDDING_LENGTH_OUT,
     LLM_KV_FEATURES_LENGTH,
     LLM_KV_BLOCK_COUNT,
     LLM_KV_LEADING_DENSE_BLOCK_COUNT,
@@ -208,6 +214,7 @@ enum llm_kv {
     LLM_KV_ATTENTION_GATE_LORA_RANK,
     LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT,
     LLM_KV_ATTENTION_SLIDING_WINDOW,
+    LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN,
     LLM_KV_ATTENTION_SCALE,
     LLM_KV_ATTENTION_OUTPUT_SCALE,
     LLM_KV_ATTENTION_TEMPERATURE_LENGTH,
@@ -218,6 +225,7 @@ enum llm_kv {
     LLM_KV_ROPE_DIMENSION_COUNT,
     LLM_KV_ROPE_DIMENSION_SECTIONS,
     LLM_KV_ROPE_FREQ_BASE,
+    LLM_KV_ROPE_FREQ_BASE_SWA,
     LLM_KV_ROPE_SCALE_LINEAR,
     LLM_KV_ROPE_SCALING_TYPE,
     LLM_KV_ROPE_SCALING_FACTOR,
@@ -74,6 +74,7 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
     { "seed_oss", LLM_CHAT_TEMPLATE_SEED_OSS },
     { "grok-2", LLM_CHAT_TEMPLATE_GROK_2 },
     { "pangu-embedded", LLM_CHAT_TEMPLATE_PANGU_EMBED },
+    { "solar-open", LLM_CHAT_TEMPLATE_SOLAR_OPEN },
 };
 
 llm_chat_template llm_chat_template_from_str(const std::string & name) {
@@ -216,6 +217,8 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
         return LLM_CHAT_TEMPLATE_GROK_2;
     } else if (tmpl_contains(LU8("[unused9]系统:[unused10]"))) {
         return LLM_CHAT_TEMPLATE_PANGU_EMBED;
+    } else if (tmpl_contains("<|begin|>") && tmpl_contains("<|end|>") && tmpl_contains("<|content|>")) {
+        return LLM_CHAT_TEMPLATE_SOLAR_OPEN;
     }
     return LLM_CHAT_TEMPLATE_UNKNOWN;
 }
@@ -845,6 +848,14 @@ int32_t llm_chat_apply_template(
         if (add_ass) {
             ss << "[unused9]助手:";
         }
+    } else if (tmpl == LLM_CHAT_TEMPLATE_SOLAR_OPEN) {
+        for (auto message : chat) {
+            std::string role(message->role);
+            ss << "<|begin|>" << role << "<|content|>" << message->content << "<|end|>";
+        }
+        if (add_ass) {
+            ss << "<|begin|>assistant";
+        }
     } else {
         // template not supported
         return -1;
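
For reference, with add_ass set the new solar-open branch renders a system + user exchange as a single string of the form:

    <|begin|>system<|content|>You are a helpful assistant.<|end|><|begin|>user<|content|>Hello<|end|><|begin|>assistant

(the message texts are placeholders; the token structure follows directly from the code above).
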
@@ -54,6 +54,7 @@ enum llm_chat_template {
     LLM_CHAT_TEMPLATE_SEED_OSS,
     LLM_CHAT_TEMPLATE_GROK_2,
     LLM_CHAT_TEMPLATE_PANGU_EMBED,
+    LLM_CHAT_TEMPLATE_SOLAR_OPEN,
     LLM_CHAT_TEMPLATE_UNKNOWN,
 };