@fugood/llama.node 1.4.11 → 1.4.12

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52)
  1. package/package.json +15 -15
  2. package/scripts/llama.cpp.patch +30 -30
  3. package/src/llama.cpp/common/arg.cpp +29 -14
  4. package/src/llama.cpp/common/arg.h +1 -0
  5. package/src/llama.cpp/common/chat-parser.cpp +11 -0
  6. package/src/llama.cpp/common/chat.cpp +32 -3
  7. package/src/llama.cpp/common/chat.h +1 -0
  8. package/src/llama.cpp/common/common.cpp +23 -23
  9. package/src/llama.cpp/common/common.h +1 -1
  10. package/src/llama.cpp/ggml/CMakeLists.txt +13 -1
  11. package/src/llama.cpp/ggml/include/ggml-backend.h +1 -1
  12. package/src/llama.cpp/ggml/src/CMakeLists.txt +23 -9
  13. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +12 -2
  14. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +1 -1
  15. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +86 -25
  16. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +15 -8
  17. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +768 -0
  18. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +0 -4
  19. package/src/llama.cpp/include/llama.h +13 -4
  20. package/src/llama.cpp/src/CMakeLists.txt +4 -0
  21. package/src/llama.cpp/src/llama-adapter.cpp +12 -3
  22. package/src/llama.cpp/src/llama-adapter.h +7 -1
  23. package/src/llama.cpp/src/llama-arch.cpp +76 -0
  24. package/src/llama.cpp/src/llama-arch.h +7 -0
  25. package/src/llama.cpp/src/llama-chat.cpp +11 -0
  26. package/src/llama.cpp/src/llama-chat.h +1 -0
  27. package/src/llama.cpp/src/llama-context.cpp +22 -21
  28. package/src/llama.cpp/src/llama-hparams.h +4 -3
  29. package/src/llama.cpp/src/llama-kv-cache.h +1 -1
  30. package/src/llama.cpp/src/llama-mmap.cpp +11 -4
  31. package/src/llama.cpp/src/llama-model-loader.cpp +23 -0
  32. package/src/llama.cpp/src/llama-model-loader.h +2 -0
  33. package/src/llama.cpp/src/llama-model.cpp +287 -16
  34. package/src/llama.cpp/src/llama-model.h +13 -2
  35. package/src/llama.cpp/src/llama-sampling.cpp +44 -33
  36. package/src/llama.cpp/src/llama-sampling.h +3 -0
  37. package/src/llama.cpp/src/llama-vocab.cpp +101 -33
  38. package/src/llama.cpp/src/llama-vocab.h +2 -0
  39. package/src/llama.cpp/src/llama.cpp +52 -37
  40. package/src/llama.cpp/src/models/bert.cpp +4 -2
  41. package/src/llama.cpp/src/models/cogvlm.cpp +5 -3
  42. package/src/llama.cpp/src/models/deepseek2.cpp +1 -1
  43. package/src/llama.cpp/src/models/gemma-embedding.cpp +2 -6
  44. package/src/llama.cpp/src/models/gemma3.cpp +3 -4
  45. package/src/llama.cpp/src/models/gemma3n-iswa.cpp +4 -7
  46. package/src/llama.cpp/src/models/llama.cpp +19 -6
  47. package/src/llama.cpp/src/models/maincoder.cpp +117 -0
  48. package/src/llama.cpp/src/models/mimo2-iswa.cpp +123 -0
  49. package/src/llama.cpp/src/models/models.h +18 -0
  50. package/src/llama.cpp/src/models/modern-bert.cpp +115 -0
  51. package/src/llama.cpp/src/models/plamo3.cpp +128 -0
  52. package/src/llama.cpp/src/unicode.cpp +23 -14
@@ -14,10 +14,6 @@
  #include <arm_neon.h>
  #endif

- #if defined(__F16C__)
- #include <immintrin.h>
- #endif
-
  #if defined(__riscv_v_intrinsic)
  #include <riscv_vector.h>
  #endif
@@ -286,7 +286,7 @@ extern "C" {
  // NULL-terminated list of buffer types to use for tensors that match a pattern
  const struct llama_model_tensor_buft_override * tensor_buft_overrides;

- int32_t n_gpu_layers; // number of layers to store in VRAM
+ int32_t n_gpu_layers; // number of layers to store in VRAM, a negative value means all layers
  enum llama_split_mode split_mode; // how to split the model across multiple GPUs

  // the GPU that is used for the entire model when split_mode is LLAMA_SPLIT_MODE_NONE
@@ -467,10 +467,17 @@ extern "C" {
  // Frees all allocated memory
  LLAMA_API void llama_free(struct llama_context * ctx);

+ enum llama_params_fit_status {
+ LLAMA_PARAMS_FIT_STATUS_SUCCESS = 0, // found allocations that are projected to fit
+ LLAMA_PARAMS_FIT_STATUS_FAILURE = 1, // could not find allocations that are projected to fit
+ LLAMA_PARAMS_FIT_STATUS_ERROR = 2, // a hard error occured, e.g. because no model could be found at the specified path
+ };
+
  // fits mparams and cparams to free device memory (assumes system memory is unlimited)
- // returns true if the parameters could be successfully modified to fit device memory
- // this function is NOT thread safe because it modifies the global llama logger state
- LLAMA_API bool llama_params_fit(
+ // - returns true if the parameters could be successfully modified to fit device memory
+ // - this function is NOT thread safe because it modifies the global llama logger state
+ // - only parameters that have the same value as in llama_default_model_params are modified
+ LLAMA_API enum llama_params_fit_status llama_params_fit(
  const char * path_model,
  struct llama_model_params * mparams,
  struct llama_context_params * cparams,
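Sketch (not part of the diff): the enum above replaces llama_params_fit's previous bool return. The remaining parameters of llama_params_fit are not shown in this hunk, so the example below only illustrates how caller code might branch on the new status values; the helper name is hypothetical.

#include "llama.h"

// hypothetical caller-side helper: maps the new fit status to a short description
static const char * params_fit_status_str(enum llama_params_fit_status status) {
    switch (status) {
        case LLAMA_PARAMS_FIT_STATUS_SUCCESS: return "parameters adjusted to fit device memory";
        case LLAMA_PARAMS_FIT_STATUS_FAILURE: return "no fitting allocation found";
        case LLAMA_PARAMS_FIT_STATUS_ERROR:   return "hard error (e.g. model not found)";
    }
    return "unknown status";
}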
@@ -600,6 +607,8 @@ extern "C" {
  //

  // Load a LoRA adapter from file
+ // The adapter is valid as long as the associated model is not freed
+ // All adapters must be loaded before context creation
  LLAMA_API struct llama_adapter_lora * llama_adapter_lora_init(
  struct llama_model * model,
  const char * path_lora);
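Sketch (not part of the diff) of the ordering these new comments require, assuming the standard llama.h entry points (llama_model_load_from_file, llama_init_from_model, llama_set_adapter_lora); file paths are placeholders and error handling is omitted.

#include "llama.h"

int main(void) {
    struct llama_model * model = llama_model_load_from_file("model.gguf", llama_model_default_params());
    // load every adapter against the model first ...
    struct llama_adapter_lora * lora = llama_adapter_lora_init(model, "adapter.gguf");
    // ... then create the context, which sizes its compute graph for the adapters already loaded
    struct llama_context * ctx = llama_init_from_model(model, llama_context_default_params());
    llama_set_adapter_lora(ctx, lora, 1.0f);
    // ... run inference ...
    llama_free(ctx);
    llama_adapter_lora_free(lora);
    llama_model_free(model);
    return 0;
}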
@@ -87,9 +87,12 @@ add_library(llama
  models/llada.cpp
  models/llama-iswa.cpp
  models/llama.cpp
+ models/maincoder.cpp
  models/mamba.cpp
+ models/mimo2-iswa.cpp
  models/minicpm3.cpp
  models/minimax-m2.cpp
+ models/modern-bert.cpp
  models/mpt.cpp
  models/nemotron-h.cpp
  models/nemotron.cpp
@@ -105,6 +108,7 @@ add_library(llama
  models/phi3.cpp
  models/plamo.cpp
  models/plamo2.cpp
+ models/plamo3.cpp
  models/plm.cpp
  models/qwen.cpp
  models/qwen2.cpp
@@ -146,9 +146,11 @@ llama_adapter_lora_weight * llama_adapter_lora::get_weight(ggml_tensor * w) {
  return nullptr;
  }

- static void llama_adapter_lora_init_impl(llama_model & model, const char * path_lora, llama_adapter_lora & adapter) {
+ static void llama_adapter_lora_init_impl(const char * path_lora, llama_adapter_lora & adapter) {
  LLAMA_LOG_INFO("%s: loading lora adapter from '%s' ...\n", __func__, path_lora);

+ llama_model & model = adapter.model;
+
  ggml_context * ctx_init;
  gguf_init_params meta_gguf_params = {
  /* .no_alloc = */ true,
@@ -411,14 +413,17 @@ static void llama_adapter_lora_init_impl(llama_model & model, const char * path_
  }
  }

+ // update number of nodes used
+ model.n_lora_nodes += adapter.get_n_nodes();
+
  LLAMA_LOG_INFO("%s: loaded %zu tensors from lora file\n", __func__, adapter.ab_map.size()*2);
  }

  llama_adapter_lora * llama_adapter_lora_init(llama_model * model, const char * path_lora) {
- llama_adapter_lora * adapter = new llama_adapter_lora();
+ llama_adapter_lora * adapter = new llama_adapter_lora(*model);

  try {
- llama_adapter_lora_init_impl(*model, path_lora, *adapter);
+ llama_adapter_lora_init_impl(path_lora, *adapter);
  return adapter;
  } catch (const std::exception & err) {
  LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what());
@@ -469,6 +474,10 @@ int32_t llama_adapter_meta_val_str_by_index(const llama_adapter_lora * adapter,
  }

  void llama_adapter_lora_free(llama_adapter_lora * adapter) {
+ // update number of nodes used
+ GGML_ASSERT(adapter->model.n_lora_nodes >= adapter->get_n_nodes());
+ adapter->model.n_lora_nodes -= adapter->get_n_nodes();
+
  delete adapter;
  }

@@ -59,6 +59,8 @@ struct llama_adapter_lora_weight {
  };

  struct llama_adapter_lora {
+ llama_model & model;
+
  // map tensor name to lora_a_b
  std::unordered_map<std::string, llama_adapter_lora_weight> ab_map;

@@ -73,10 +75,14 @@ struct llama_adapter_lora {
  // activated lora (aLoRA)
  std::vector<llama_token> alora_invocation_tokens;

- llama_adapter_lora() = default;
+ llama_adapter_lora(llama_model & model) : model(model) {}
  ~llama_adapter_lora() = default;

  llama_adapter_lora_weight * get_weight(ggml_tensor * w);
+
+ uint32_t get_n_nodes() const {
+ return ab_map.size() * 6u; // a, b, scale, add, 2 x mul_mat
+ }
  };

  using llama_adapter_loras = std::unordered_map<llama_adapter_lora *, float>;
@@ -20,6 +20,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
  { LLM_ARCH_STARCODER, "starcoder" },
  { LLM_ARCH_REFACT, "refact" },
  { LLM_ARCH_BERT, "bert" },
+ { LLM_ARCH_MODERN_BERT, "modern-bert" },
  { LLM_ARCH_NOMIC_BERT, "nomic-bert" },
  { LLM_ARCH_NOMIC_BERT_MOE, "nomic-bert-moe" },
  { LLM_ARCH_NEO_BERT, "neo-bert" },
@@ -41,6 +42,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
  { LLM_ARCH_PHIMOE, "phimoe" },
  { LLM_ARCH_PLAMO, "plamo" },
  { LLM_ARCH_PLAMO2, "plamo2" },
+ { LLM_ARCH_PLAMO3, "plamo3" },
  { LLM_ARCH_CODESHELL, "codeshell" },
  { LLM_ARCH_ORION, "orion" },
  { LLM_ARCH_INTERNLM2, "internlm2" },
@@ -114,6 +116,9 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
  { LLM_ARCH_RND1, "rnd1" },
  { LLM_ARCH_PANGU_EMBED, "pangu-embedded" },
  { LLM_ARCH_MISTRAL3, "mistral3" },
+ { LLM_ARCH_MIMO2, "mimo2" },
+ { LLM_ARCH_LLAMA_EMBED, "llama-embed" },
+ { LLM_ARCH_MAINCODER, "maincoder" },
  { LLM_ARCH_UNKNOWN, "(unknown)" },
  };

@@ -204,6 +209,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
  { LLM_KV_ATTENTION_GATE_LORA_RANK, "%s.attention.gate_lora_rank" },
  { LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, "%s.attention.relative_buckets_count" },
  { LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" },
+ { LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, "%s.attention.sliding_window_pattern" },
  { LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
  { LLM_KV_ATTENTION_OUTPUT_SCALE, "%s.attention.output_scale" },
  { LLM_KV_ATTENTION_TEMPERATURE_LENGTH, "%s.attention.temperature_length" },
@@ -214,6 +220,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
  { LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
  { LLM_KV_ROPE_DIMENSION_SECTIONS, "%s.rope.dimension_sections" },
  { LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" },
+ { LLM_KV_ROPE_FREQ_BASE_SWA, "%s.rope.freq_base_swa" },
  { LLM_KV_ROPE_SCALE_LINEAR, "%s.rope.scale_linear" },
  { LLM_KV_ROPE_SCALING_TYPE, "%s.rope.scaling.type" },
  { LLM_KV_ROPE_SCALING_FACTOR, "%s.rope.scaling.factor" },
@@ -497,6 +504,7 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
  case LLM_ARCH_LLAMA:
  case LLM_ARCH_DECI:
  case LLM_ARCH_MISTRAL3:
+ case LLM_ARCH_LLAMA_EMBED:
  return {
  LLM_TENSOR_TOKEN_EMBD,
  LLM_TENSOR_OUTPUT_NORM,
@@ -778,6 +786,20 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
  LLM_TENSOR_CLS,
  LLM_TENSOR_CLS_OUT,
  };
+ case LLM_ARCH_MODERN_BERT:
+ return {
+ LLM_TENSOR_TOKEN_EMBD,
+ LLM_TENSOR_TOKEN_EMBD_NORM,
+ LLM_TENSOR_OUTPUT_NORM,
+ LLM_TENSOR_ATTN_NORM,
+ LLM_TENSOR_ATTN_OUT,
+ LLM_TENSOR_ATTN_QKV,
+ LLM_TENSOR_FFN_DOWN,
+ LLM_TENSOR_FFN_UP,
+ LLM_TENSOR_FFN_NORM,
+ LLM_TENSOR_CLS,
+ LLM_TENSOR_CLS_OUT,
+ };
  case LLM_ARCH_JINA_BERT_V2:
  return {
  LLM_TENSOR_TOKEN_EMBD,
@@ -1057,6 +1079,22 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
  LLM_TENSOR_ATTN_POST_NORM,
  LLM_TENSOR_FFN_POST_NORM,
  };
+ case LLM_ARCH_PLAMO3:
+ return {
+ LLM_TENSOR_TOKEN_EMBD,
+ LLM_TENSOR_OUTPUT_NORM,
+ LLM_TENSOR_OUTPUT,
+ LLM_TENSOR_ATTN_NORM,
+ LLM_TENSOR_ATTN_QKV,
+ LLM_TENSOR_ATTN_Q_NORM,
+ LLM_TENSOR_ATTN_K_NORM,
+ LLM_TENSOR_ATTN_OUT,
+ LLM_TENSOR_ATTN_POST_NORM,
+ LLM_TENSOR_FFN_NORM,
+ LLM_TENSOR_FFN_POST_NORM,
+ LLM_TENSOR_FFN_DOWN,
+ LLM_TENSOR_FFN_UP,
+ };
  case LLM_ARCH_CODESHELL:
  return {
  LLM_TENSOR_TOKEN_EMBD,
@@ -2171,11 +2209,49 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
  LLM_TENSOR_VISEXP_FFN_DOWN,
  LLM_TENSOR_VISEXP_FFN_UP,
  };
+ case LLM_ARCH_MIMO2:
+ return {
+ LLM_TENSOR_TOKEN_EMBD,
+ LLM_TENSOR_OUTPUT_NORM,
+ LLM_TENSOR_OUTPUT,
+ LLM_TENSOR_ATTN_NORM,
+ LLM_TENSOR_ATTN_Q,
+ LLM_TENSOR_ATTN_K,
+ LLM_TENSOR_ATTN_V,
+ LLM_TENSOR_ATTN_SINKS,
+ LLM_TENSOR_ATTN_OUT,
+ LLM_TENSOR_FFN_NORM,
+ LLM_TENSOR_FFN_GATE,
+ LLM_TENSOR_FFN_DOWN,
+ LLM_TENSOR_FFN_UP,
+ LLM_TENSOR_FFN_GATE_INP,
+ LLM_TENSOR_FFN_GATE_EXPS,
+ LLM_TENSOR_FFN_DOWN_EXPS,
+ LLM_TENSOR_FFN_UP_EXPS,
+ LLM_TENSOR_FFN_EXP_PROBS_B,
+ };
  case LLM_ARCH_GPTJ:
  case LLM_ARCH_UNKNOWN:
  return {
  LLM_TENSOR_TOKEN_EMBD,
  };
+ case LLM_ARCH_MAINCODER:
+ return {
+ LLM_TENSOR_TOKEN_EMBD,
+ LLM_TENSOR_OUTPUT_NORM,
+ LLM_TENSOR_OUTPUT,
+ LLM_TENSOR_ATTN_NORM,
+ LLM_TENSOR_ATTN_Q,
+ LLM_TENSOR_ATTN_Q_NORM,
+ LLM_TENSOR_ATTN_K,
+ LLM_TENSOR_ATTN_K_NORM,
+ LLM_TENSOR_ATTN_V,
+ LLM_TENSOR_ATTN_OUT,
+ LLM_TENSOR_FFN_NORM,
+ LLM_TENSOR_FFN_GATE,
+ LLM_TENSOR_FFN_DOWN,
+ LLM_TENSOR_FFN_UP,
+ };
  default:
  GGML_ABORT("unknown architecture for tensor mapping");
  }
@@ -24,6 +24,7 @@ enum llm_arch {
  LLM_ARCH_STARCODER,
  LLM_ARCH_REFACT,
  LLM_ARCH_BERT,
+ LLM_ARCH_MODERN_BERT,
  LLM_ARCH_NOMIC_BERT,
  LLM_ARCH_NOMIC_BERT_MOE,
  LLM_ARCH_NEO_BERT,
@@ -45,6 +46,7 @@ enum llm_arch {
  LLM_ARCH_PHIMOE,
  LLM_ARCH_PLAMO,
  LLM_ARCH_PLAMO2,
+ LLM_ARCH_PLAMO3,
  LLM_ARCH_CODESHELL,
  LLM_ARCH_ORION,
  LLM_ARCH_INTERNLM2,
@@ -118,6 +120,9 @@ enum llm_arch {
  LLM_ARCH_RND1,
  LLM_ARCH_PANGU_EMBED,
  LLM_ARCH_MISTRAL3,
+ LLM_ARCH_MIMO2,
+ LLM_ARCH_LLAMA_EMBED,
+ LLM_ARCH_MAINCODER,
  LLM_ARCH_UNKNOWN,
  };

@@ -208,6 +213,7 @@ enum llm_kv {
  LLM_KV_ATTENTION_GATE_LORA_RANK,
  LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT,
  LLM_KV_ATTENTION_SLIDING_WINDOW,
+ LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN,
  LLM_KV_ATTENTION_SCALE,
  LLM_KV_ATTENTION_OUTPUT_SCALE,
  LLM_KV_ATTENTION_TEMPERATURE_LENGTH,
@@ -218,6 +224,7 @@ enum llm_kv {
  LLM_KV_ROPE_DIMENSION_COUNT,
  LLM_KV_ROPE_DIMENSION_SECTIONS,
  LLM_KV_ROPE_FREQ_BASE,
+ LLM_KV_ROPE_FREQ_BASE_SWA,
  LLM_KV_ROPE_SCALE_LINEAR,
  LLM_KV_ROPE_SCALING_TYPE,
  LLM_KV_ROPE_SCALING_FACTOR,
@@ -74,6 +74,7 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
  { "seed_oss", LLM_CHAT_TEMPLATE_SEED_OSS },
  { "grok-2", LLM_CHAT_TEMPLATE_GROK_2 },
  { "pangu-embedded", LLM_CHAT_TEMPLATE_PANGU_EMBED },
+ { "solar-open", LLM_CHAT_TEMPLATE_SOLAR_OPEN },
  };

  llm_chat_template llm_chat_template_from_str(const std::string & name) {
@@ -216,6 +217,8 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
  return LLM_CHAT_TEMPLATE_GROK_2;
  } else if (tmpl_contains(LU8("[unused9]系统:[unused10]"))) {
  return LLM_CHAT_TEMPLATE_PANGU_EMBED;
+ } else if (tmpl_contains("<|begin|>") && tmpl_contains("<|end|>") && tmpl_contains("<|content|>")) {
+ return LLM_CHAT_TEMPLATE_SOLAR_OPEN;
  }
  return LLM_CHAT_TEMPLATE_UNKNOWN;
  }
@@ -845,6 +848,14 @@ int32_t llm_chat_apply_template(
  if (add_ass) {
  ss << "[unused9]助手:";
  }
+ } else if (tmpl == LLM_CHAT_TEMPLATE_SOLAR_OPEN) {
+ for (auto message : chat) {
+ std::string role(message->role);
+ ss << "<|begin|>" << role << "<|content|>" << message->content << "<|end|>";
+ }
+ if (add_ass) {
+ ss << "<|begin|>assistant";
+ }
  } else {
  // template not supported
  return -1;
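For reference (the message text is illustrative, not from the diff), the new solar-open branch above concatenates messages without separators, so a short system/user exchange with add_ass enabled renders as a single line:

<|begin|>system<|content|>You are a helpful assistant.<|end|><|begin|>user<|content|>Hello!<|end|><|begin|>assistant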
@@ -54,6 +54,7 @@ enum llm_chat_template {
  LLM_CHAT_TEMPLATE_SEED_OSS,
  LLM_CHAT_TEMPLATE_GROK_2,
  LLM_CHAT_TEMPLATE_PANGU_EMBED,
+ LLM_CHAT_TEMPLATE_SOLAR_OPEN,
  LLM_CHAT_TEMPLATE_UNKNOWN,
  };

@@ -294,8 +294,8 @@ llama_context::llama_context(
  // enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary
  bool pipeline_parallel =
  model.n_devices() > 1 &&
- model.params.n_gpu_layers > (int) model.hparams.n_layer &&
- model.params.split_mode == LLAMA_SPLIT_MODE_LAYER &&
+ model.n_gpu_layers() > model.hparams.n_layer &&
+ model.split_mode() == LLAMA_SPLIT_MODE_LAYER &&
  cparams.offload_kqv &&
  !model.has_tensor_overrides();

@@ -459,23 +459,22 @@ llama_context::llama_context(
  }

  llama_context::~llama_context() {
- // FIXME this currently results in a use-after-free bug if the model is freed before the context
- // if (!model.hparams.no_alloc) {
- // for (size_t i = 0; i < backend_ptrs.size(); ++i) {
- // ggml_backend_t backend = backend_ptrs[i];
- // ggml_backend_buffer_type_t buft = backend_buft[i];
-
- // const size_t size_exp = backend_buf_exp_size[i];
- // const size_t size_act = ggml_backend_sched_get_buffer_size(sched.get(), backend);
- // if (size_exp == size_act) {
- // LLAMA_LOG_DEBUG("%s: %10s compute buffer size is %8.4f MiB, matches expectation of %8.4f MiB\n",
- // __func__, ggml_backend_buft_name(buft), size_act / (1024.0*1024.0), size_exp / (1024.0*1024.0));
- // } else {
- // LLAMA_LOG_WARN("%s: %10s compute buffer size of %8.4f MiB, does not match expectation of %8.4f MiB\n",
- // __func__, ggml_backend_buft_name(buft), size_act / (1024.0*1024.0), size_exp / (1024.0*1024.0));
- // }
- // }
- // }
+ if (!model.hparams.no_alloc) {
+ for (size_t i = 0; i < backend_ptrs.size(); ++i) {
+ ggml_backend_t backend = backend_ptrs[i];
+ ggml_backend_buffer_type_t buft = backend_buft[i];
+
+ const size_t size_exp = backend_buf_exp_size[i];
+ const size_t size_act = ggml_backend_sched_get_buffer_size(sched.get(), backend);
+ if (size_exp == size_act) {
+ LLAMA_LOG_DEBUG("%s: %10s compute buffer size is %8.4f MiB, matches expectation of %8.4f MiB\n",
+ __func__, ggml_backend_buft_name(buft), size_act / (1024.0*1024.0), size_exp / (1024.0*1024.0));
+ } else {
+ LLAMA_LOG_WARN("%s: %10s compute buffer size of %8.4f MiB, does not match expectation of %8.4f MiB\n",
+ __func__, ggml_backend_buft_name(buft), size_act / (1024.0*1024.0), size_exp / (1024.0*1024.0));
+ }
+ }
+ }
  ggml_opt_free(opt_ctx);
  }

@@ -1443,7 +1442,9 @@ uint32_t llama_context::graph_max_nodes(uint32_t n_tokens) const {
  if (model.arch == LLM_ARCH_QWEN3NEXT) {
  return std::max<uint32_t>(n_tokens * 40, 32u * model.n_tensors());
  }
- return std::max<uint32_t>(1024u, 8u*model.n_tensors());
+ uint32_t res = std::max<uint32_t>(1024u, 8u*model.n_tensors());
+ res += model.n_lora_nodes;
+ return res;
  }

  llm_graph_result * llama_context::get_gf_res_reserve() const {
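Worked example with hypothetical numbers: an adapter whose ab_map holds 160 A/B tensor pairs reports get_n_nodes() = 160 * 6 = 960, so for a model with 1200 tensors the graph budget grows from max(1024, 8 * 1200) = 9600 to 9600 + 960 = 10560 nodes while that adapter is loaded.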
@@ -1571,7 +1572,7 @@ llm_graph_cb llama_context::graph_get_cb() const {

  // norm may be automatically assigned to the backend of the previous layer, increasing data transfer between backends
  // FIXME: fix in ggml_backend_sched
- const bool full_offload = model.params.n_gpu_layers > (int) model.hparams.n_layer;
+ const bool full_offload = model.n_gpu_layers() > model.hparams.n_layer;
  if (ubatch.n_tokens < 32 || full_offload) {
  if (il != -1 && strcmp(name, "norm") == 0) {
  const auto & dev_layer = model.dev_layer(il);
@@ -123,10 +123,11 @@ struct llama_hparams {
  llama_swa_type swa_type = LLAMA_SWA_TYPE_NONE;
  // the size of the sliding window (0 - no SWA)
  uint32_t n_swa = 0;
- // if swa_layers[il] == true, then layer il is SWA
- // if swa_layers[il] == false, then layer il is dense (i.e. non-SWA)
+ // if swa_layers[il] == 1, then layer il is SWA
+ // if swa_layers[il] == 0, then layer il is dense (i.e. non-SWA)
  // by default, all layers are dense
- std::array<bool, LLAMA_MAX_LAYERS> swa_layers;
+ // note: using uint32_t type for compatibility reason
+ std::array<uint32_t, LLAMA_MAX_LAYERS> swa_layers;

  // for State Space Models
  uint32_t ssm_d_conv = 0;
@@ -305,7 +305,7 @@ public:
  bool do_shift,
  stream_copy_info sc_info);

- // used to create a batch procesing context from a batch
+ // used to create a batch processing context from a batch
  llama_kv_cache_context(
  llama_kv_cache * kv,
  slot_info_vec_t sinfos,
@@ -240,9 +240,10 @@ struct llama_file::impl {
  throw std::runtime_error("unexpectedly reached end of file");
  }
  } else {
- bool successful = false;
- while (!successful) {
- off_t ret = read(fd, ptr, len);
+ size_t bytes_read = 0;
+ while (bytes_read < len) {
+ const size_t to_read = len - bytes_read;
+ ssize_t ret = ::read(fd, reinterpret_cast<char *>(ptr) + bytes_read, to_read);

  if (ret == -1) {
  if (errno == EINTR) {
@@ -251,10 +252,16 @@ struct llama_file::impl {
  throw std::runtime_error(format("read error: %s", strerror(errno)));
  }
  if (ret == 0) {
+ // EOF: allow if this read was only pulling alignment padding past file end
+ off_t pos = lseek(fd, 0, SEEK_CUR);
+ if (pos != -1 && (size_t) pos == size) {
+ std::memset(reinterpret_cast<char *>(ptr) + bytes_read, 0, len - bytes_read);
+ return;
+ }
  throw std::runtime_error("unexpectedly reached end of file");
  }

- successful = true;
+ bytes_read += (size_t) ret;
  }
  }
  }
@@ -462,6 +462,29 @@ namespace GGUFMeta {
  return get_key_or_arr(llm_kv(kid), result, n, required);
  }

+ bool llama_model_loader::get_key_or_arr(enum llm_kv kid, uint32_t & result, bool required) {
+ const std::string key = llm_kv(kid);
+
+ const int id = gguf_find_key(meta.get(), key.c_str());
+
+ if (id < 0) {
+ if (required) {
+ throw std::runtime_error(format("key not found in model: %s", key.c_str()));
+ }
+ return false;
+ }
+
+ // throw and error if type is an array
+ if (gguf_get_kv_type(meta.get(), id) == GGUF_TYPE_ARRAY) {
+ if (required) {
+ throw std::runtime_error(format("expected scalar, found array for key: %s", key.c_str()));
+ }
+ return false;
+ }
+
+ return get_key(key, result, required);
+ }
+
  // TODO: this is not very clever - figure out something better
  template bool llama_model_loader::get_key_or_arr<std::array<int, 4>>(enum llm_kv kid, std::array<int, 4> & result, uint32_t n, bool required);
  template bool llama_model_loader::get_key_or_arr<std::array<uint32_t, 512>>(enum llm_kv kid, std::array<uint32_t, 512> & result, uint32_t n, bool required);
@@ -131,6 +131,8 @@ struct llama_model_loader {
  template<typename T>
  bool get_key_or_arr(enum llm_kv kid, T & result, uint32_t n, bool required = true);

+ bool get_key_or_arr(enum llm_kv kid, uint32_t & result, bool required = true);
+
  std::string get_arch_name() const;

  enum llm_arch get_arch() const;
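Sketch (not part of the diff) of how loader code might use the new scalar overload; the helper name is hypothetical and the key is one of the KV names added in this release.

// hypothetical helper: reads an optional scalar hparam via the new overload;
// leaves the result at 0 if the key is absent or stored as an array
static uint32_t read_swa_pattern(llama_model_loader & ml) {
    uint32_t swa_pattern = 0;
    ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_pattern, /*required=*/false);
    return swa_pattern;
}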