@fugood/llama.node 1.2.3 → 1.2.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57)
  1. package/package.json +14 -14
  2. package/scripts/llama.cpp.patch +33 -11
  3. package/src/llama.cpp/CMakeLists.txt +1 -0
  4. package/src/llama.cpp/common/CMakeLists.txt +46 -2
  5. package/src/llama.cpp/common/arg.cpp +322 -70
  6. package/src/llama.cpp/common/arg.h +0 -1
  7. package/src/llama.cpp/common/chat-parser.cpp +154 -13
  8. package/src/llama.cpp/common/chat-parser.h +3 -0
  9. package/src/llama.cpp/common/chat.cpp +217 -6
  10. package/src/llama.cpp/common/chat.h +5 -3
  11. package/src/llama.cpp/common/common.cpp +22 -6
  12. package/src/llama.cpp/common/common.h +6 -4
  13. package/src/llama.cpp/common/http.h +73 -0
  14. package/src/llama.cpp/ggml/CMakeLists.txt +7 -6
  15. package/src/llama.cpp/ggml/include/ggml-backend.h +2 -0
  16. package/src/llama.cpp/ggml/include/ggml-rpc.h +8 -9
  17. package/src/llama.cpp/ggml/include/ggml.h +22 -0
  18. package/src/llama.cpp/ggml/src/CMakeLists.txt +3 -0
  19. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +12 -2
  20. package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +1 -0
  21. package/src/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +12 -12
  22. package/src/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +100 -3
  23. package/src/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +0 -1
  24. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1 -0
  25. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +10 -0
  26. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +209 -96
  27. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +32 -44
  28. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +107 -83
  29. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +17 -17
  30. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +8 -8
  31. package/src/llama.cpp/ggml/src/ggml-cpu/spacemit/ime.cpp +1024 -0
  32. package/src/llama.cpp/ggml/src/ggml-cpu/spacemit/ime.h +13 -0
  33. package/src/llama.cpp/ggml/src/ggml-cpu/spacemit/ime1_kernels.cpp +3196 -0
  34. package/src/llama.cpp/ggml/src/ggml-cpu/spacemit/ime_kernels.h +26 -0
  35. package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.cpp +103 -0
  36. package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.h +1 -0
  37. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +66 -0
  38. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +6 -5
  39. package/src/llama.cpp/include/llama.h +8 -0
  40. package/src/llama.cpp/src/llama-arch.cpp +93 -0
  41. package/src/llama.cpp/src/llama-arch.h +22 -0
  42. package/src/llama.cpp/src/llama-chat.cpp +1 -1
  43. package/src/llama.cpp/src/llama-context.cpp +6 -0
  44. package/src/llama.cpp/src/llama-graph.cpp +57 -22
  45. package/src/llama.cpp/src/llama-graph.h +10 -1
  46. package/src/llama.cpp/src/llama-hparams.h +17 -2
  47. package/src/llama.cpp/src/llama-kv-cache-iswa.cpp +2 -2
  48. package/src/llama.cpp/src/llama-kv-cache.cpp +2 -5
  49. package/src/llama.cpp/src/llama-memory-hybrid.cpp +11 -9
  50. package/src/llama.cpp/src/llama-memory-recurrent.cpp +11 -3
  51. package/src/llama.cpp/src/llama-model-loader.cpp +2 -0
  52. package/src/llama.cpp/src/llama-model.cpp +568 -41
  53. package/src/llama.cpp/src/llama-model.h +18 -0
  54. package/src/llama.cpp/src/llama-sampling.cpp +5 -0
  55. package/src/llama.cpp/src/llama-vocab.cpp +7 -1
  56. package/src/llama.cpp/src/llama-vocab.h +41 -40
  57. package/src/llama.cpp/src/unicode.h +43 -0

package/src/llama.cpp/common/chat-parser.cpp

@@ -3,9 +3,12 @@
 #include "log.h"
 #include "regex-partial.h"
 
+#include <algorithm>
+#include <cctype>
 #include <optional>
 #include <stdexcept>
 #include <string>
+#include <string_view>
 #include <vector>
 
 using json = nlohmann::ordered_json;
@@ -75,6 +78,35 @@ bool common_chat_msg_parser::add_tool_calls(const json & arr) {
     }
     return true;
 }
+
+bool common_chat_msg_parser::add_tool_call_short_form(const json & tool_call) {
+    if (!tool_call.is_object() || tool_call.size() != 1) {
+        return false;
+    }
+
+    // Get the tool name (the single key in the object)
+    auto it = tool_call.begin();
+    std::string name = it.key();
+
+    if (name.empty()) {
+        return false;
+    }
+
+    // Get the arguments (the nested object)
+    const json & args_json = it.value();
+    std::string arguments = "";
+
+    if (args_json.is_object()) {
+        arguments = args_json.dump();
+    } else if (args_json.is_string()) {
+        arguments = args_json;
+    } else if (!args_json.is_null()) {
+        // For other types, convert to string representation
+        arguments = args_json.dump();
+    }
+
+    return add_tool_call(name, "", arguments);
+}
 
 void common_chat_msg_parser::finish() {
     if (!is_partial_ && pos_ != input_.size()) {
         throw std::runtime_error("Unexpected content at end of input");// + input_.substr(pos_));
@@ -137,6 +169,27 @@ void common_chat_msg_parser::consume_literal(const std::string & literal) {
 }
 
 bool common_chat_msg_parser::try_parse_reasoning(const std::string & start_think, const std::string & end_think) {
+    std::string pending_reasoning_prefix;
+
+    if (syntax_.reasoning_format == COMMON_REASONING_FORMAT_NONE) {
+        return false;
+    }
+
+    auto set_reasoning_prefix = [&](size_t prefix_pos) {
+        if (!syntax_.thinking_forced_open || syntax_.reasoning_in_content) {
+            return;
+        }
+        if (prefix_pos + start_think.size() > input_.size()) {
+            pending_reasoning_prefix.clear();
+            return;
+        }
+        // Capture the exact literal that opened the reasoning section so we can
+        // surface it back to callers. This ensures formats that force the
+        // reasoning tag open (e.g. DeepSeek R1) retain their original prefix
+        // instead of dropping it during parsing.
+        pending_reasoning_prefix = input_.substr(prefix_pos, start_think.size());
+    };
+
     auto handle_reasoning = [&](const std::string & reasoning, bool closed) {
         auto stripped_reasoning = string_strip(reasoning);
         if (stripped_reasoning.empty()) {
@@ -149,28 +202,116 @@ bool common_chat_msg_parser::try_parse_reasoning(const std::string & start_think
                 add_content(syntax_.reasoning_format == COMMON_REASONING_FORMAT_DEEPSEEK ? "</think>" : end_think);
             }
         } else {
+            if (!pending_reasoning_prefix.empty()) {
+                add_reasoning_content(pending_reasoning_prefix);
+                pending_reasoning_prefix.clear();
+            }
             add_reasoning_content(stripped_reasoning);
         }
     };
-    if (syntax_.reasoning_format != COMMON_REASONING_FORMAT_NONE) {
-        if (syntax_.thinking_forced_open || try_consume_literal(start_think)) {
-            if (auto res = try_find_literal(end_think)) {
-                handle_reasoning(res->prelude, /* closed */ true);
-                consume_spaces();
-                return true;
-            }
-            auto rest = consume_rest();
+
+    const size_t saved_pos = pos_;
+    const size_t saved_content_size = result_.content.size();
+    const size_t saved_reasoning_size = result_.reasoning_content.size();
+
+    auto restore_state = [&]() {
+        move_to(saved_pos);
+        result_.content.resize(saved_content_size);
+        result_.reasoning_content.resize(saved_reasoning_size);
+    };
+
+    // Allow leading whitespace to be preserved as content when reasoning is present at the start
+    size_t cursor = pos_;
+    size_t whitespace_end = cursor;
+    while (whitespace_end < input_.size() && std::isspace(static_cast<unsigned char>(input_[whitespace_end]))) {
+        ++whitespace_end;
+    }
+
+    if (whitespace_end >= input_.size()) {
+        restore_state();
+        if (syntax_.thinking_forced_open) {
+            auto rest = input_.substr(saved_pos);
             if (!rest.empty()) {
                 handle_reasoning(rest, /* closed */ !is_partial());
             }
-            // Allow unclosed thinking tags, for now (https://github.com/ggml-org/llama.cpp/issues/13812, https://github.com/ggml-org/llama.cpp/issues/13877)
-            // if (!syntax_.thinking_forced_open) {
-            //     throw common_chat_msg_partial_exception(end_think);
-            // }
+            move_to(input_.size());
+            return true;
+        }
+        return false;
+    }
+
+    cursor = whitespace_end;
+    const size_t remaining = input_.size() - cursor;
+    const size_t start_prefix = std::min(start_think.size(), remaining);
+    const bool has_start_tag = input_.compare(cursor, start_prefix, start_think, 0, start_prefix) == 0;
+
+    if (has_start_tag && start_prefix < start_think.size()) {
+        move_to(input_.size());
+        return true;
+    }
+
+    if (has_start_tag) {
+        if (whitespace_end > pos_) {
+            add_content(input_.substr(pos_, whitespace_end - pos_));
+        }
+        set_reasoning_prefix(cursor);
+        cursor += start_think.size();
+    } else if (syntax_.thinking_forced_open) {
+        cursor = whitespace_end;
+    } else {
+        restore_state();
+        return false;
+    }
+    while (true) {
+        if (cursor >= input_.size()) {
+            move_to(input_.size());
             return true;
         }
+
+        size_t end_pos = input_.find(end_think, cursor);
+        if (end_pos == std::string::npos) {
+            std::string_view remaining_view(input_.data() + cursor, input_.size() - cursor);
+            size_t partial_off = string_find_partial_stop(remaining_view, end_think);
+            size_t reasoning_end = partial_off == std::string::npos ? input_.size() : cursor + partial_off;
+            if (reasoning_end > cursor) {
+                handle_reasoning(input_.substr(cursor, reasoning_end - cursor), /* closed */ partial_off == std::string::npos && !is_partial());
+            }
+            move_to(input_.size());
+            return true;
+        }
+
+        if (end_pos > cursor) {
+            handle_reasoning(input_.substr(cursor, end_pos - cursor), /* closed */ true);
+        } else {
+            handle_reasoning("", /* closed */ true);
+        }
+
+        cursor = end_pos + end_think.size();
+
+        while (cursor < input_.size() && std::isspace(static_cast<unsigned char>(input_[cursor]))) {
+            ++cursor;
+        }
+
+        const size_t next_remaining = input_.size() - cursor;
+        if (next_remaining == 0) {
+            move_to(cursor);
+            return true;
+        }
+
+        const size_t next_prefix = std::min(start_think.size(), next_remaining);
+        if (input_.compare(cursor, next_prefix, start_think, 0, next_prefix) == 0) {
+            if (next_prefix < start_think.size()) {
+                move_to(input_.size());
+                return true;
+            }
+            set_reasoning_prefix(cursor);
+            cursor += start_think.size();
+            continue;
+        }
+
+        move_to(cursor);
+        return true;
     }
-    return false;
 }
 
 std::string common_chat_msg_parser::consume_rest() {
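
For orientation, a minimal sketch of how the rewritten routine behaves, assuming reasoning parsing is enabled (reasoning_format not NONE); "parser" stands for a common_chat_msg_parser instance and the strings are made-up examples, not taken from the diff:

    // input: "<think>plan the answer</think>Hello!"
    parser.try_parse_reasoning("<think>", "</think>");
    // -> "plan the answer" goes to reasoning_content; "Hello!" is left at the
    //    cursor for the caller (e.g. add_content(consume_rest())) to emit as content.
    // A truncated streaming chunk such as "<think>plan the ans" is still captured
    // as unclosed reasoning rather than rejected.
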

package/src/llama.cpp/common/chat-parser.h

@@ -64,6 +64,9 @@ class common_chat_msg_parser {
     // Adds an array of tool calls using their "name", "id" and "arguments" fields.
     bool add_tool_calls(const nlohmann::ordered_json & arr);
 
+    // Adds a tool call using the short form: { "tool_name": { "arg1": val, "arg2": val } }
+    bool add_tool_call_short_form(const nlohmann::ordered_json & tool_call);
+
     void finish();
 
     bool consume_spaces();
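
For illustration, a minimal sketch of the short form accepted by the new helper; "parser" and the tool name are assumptions, the mapping follows the comment above:

    // { "tool_name": { ...arguments... } } -> one tool call with an empty id
    nlohmann::ordered_json tc = { { "get_weather", { { "city", "Paris" } } } };
    parser.add_tool_call_short_form(tc);
    // -> name "get_weather", id "", arguments serialized as {"city":"Paris"}
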

package/src/llama.cpp/common/chat.cpp

@@ -612,6 +612,7 @@ const char * common_chat_format_name(common_chat_format format) {
         case COMMON_CHAT_FORMAT_CONTENT_ONLY: return "Content-only";
         case COMMON_CHAT_FORMAT_GENERIC: return "Generic";
         case COMMON_CHAT_FORMAT_MISTRAL_NEMO: return "Mistral Nemo";
+        case COMMON_CHAT_FORMAT_MAGISTRAL: return "Magistral";
         case COMMON_CHAT_FORMAT_LLAMA_3_X: return "Llama 3.x";
         case COMMON_CHAT_FORMAT_LLAMA_3_X_WITH_BUILTIN_TOOLS: return "Llama 3.x with builtin tools";
         case COMMON_CHAT_FORMAT_DEEPSEEK_R1: return "DeepSeek R1";
@@ -625,6 +626,7 @@ const char * common_chat_format_name(common_chat_format format) {
         case COMMON_CHAT_FORMAT_GPT_OSS: return "GPT-OSS";
         case COMMON_CHAT_FORMAT_SEED_OSS: return "Seed-OSS";
         case COMMON_CHAT_FORMAT_NEMOTRON_V2: return "Nemotron V2";
+        case COMMON_CHAT_FORMAT_APERTUS: return "Apertus";
         default:
             throw std::runtime_error("Unknown chat format");
     }
@@ -788,6 +790,7 @@ static std::string apply(
     }
     tmpl_inputs.add_generation_prompt = inputs.add_generation_prompt;
     tmpl_inputs.extra_context = inputs.extra_context;
+    tmpl_inputs.extra_context["enable_thinking"] = inputs.enable_thinking;
     if (additional_context) {
        tmpl_inputs.extra_context.merge_patch(*additional_context);
     }
@@ -968,6 +971,65 @@ static common_chat_params common_chat_params_init_mistral_nemo(const common_chat
     data.format = COMMON_CHAT_FORMAT_MISTRAL_NEMO;
     return data;
 }
+
+static common_chat_params common_chat_params_init_magistral(const common_chat_template & tmpl, const struct templates_params & inputs) {
+    common_chat_params data;
+    data.prompt = apply(tmpl, inputs);
+    data.format = COMMON_CHAT_FORMAT_MAGISTRAL;
+    data.preserved_tokens = {
+        "[THINK]",
+        "[/THINK]",
+    };
+
+    if (inputs.tools.is_array() && !inputs.tools.empty()) {
+        data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
+        data.grammar = build_grammar([&](const common_grammar_builder & builder) {
+            auto schemas = json::array();
+            foreach_function(inputs.tools, [&](const json & tool) {
+                const auto & function = tool.at("function");
+                schemas.push_back({
+                    {"type", "object"},
+                    {"properties", {
+                        {"name", {
+                            {"type", "string"},
+                            {"const", function.at("name")},
+                        }},
+                        {"arguments", function.at("parameters")},
+                        {"id", {
+                            {"type", "string"},
+                            {"pattern", "^[a-zA-Z0-9]{9}$"},
+                        }},
+                    }},
+                    {"required", json::array({"name", "arguments", "id"})},
+                });
+            });
+            auto schema = json {
+                {"type", "array"},
+                {"items", schemas.size() == 1 ? schemas[0] : json {{"anyOf", schemas}}},
+                {"minItems", 1},
+            };
+            if (!inputs.parallel_tool_calls) {
+                schema["maxItems"] = 1;
+            }
+            builder.add_rule("root", "\"[TOOL_CALLS]\" " + builder.add_schema("tool_calls", schema));
+        });
+        data.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "[TOOL_CALLS]"});
+        data.preserved_tokens.push_back("[TOOL_CALLS]");
+    } else {
+        data.grammar_lazy = false;
+        if (!inputs.json_schema.is_null()) {
+            if (!inputs.grammar.empty()) {
+                throw std::runtime_error("Either \"json_schema\" or \"grammar\" can be specified, but not both");
+            }
+            data.grammar = json_schema_to_grammar(inputs.json_schema);
+        } else {
+            data.grammar = inputs.grammar;
+        }
+    }
+
+    return data;
+}
+
 static void common_chat_parse_mistral_nemo(common_chat_msg_parser & builder) {
     if (!builder.syntax().parse_tool_calls) {
         builder.add_content(builder.consume_rest());
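
For illustration, an assumed example of the assistant output this Magistral handler targets: optional [THINK]/[/THINK] reasoning, then either plain content or a [TOOL_CALLS] JSON array whose entries carry "name", "arguments" and a nine-character alphanumeric "id", as constrained by the grammar above and consumed by the parser added below:

    [THINK]The user wants the weather, so call the tool.[/THINK]
    [TOOL_CALLS][{"name": "get_weather", "arguments": {"city": "Paris"}, "id": "abc123XYZ"}]
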
@@ -978,6 +1040,18 @@ static void common_chat_parse_mistral_nemo(common_chat_msg_parser & builder) {
     parse_prefixed_json_tool_call_array(builder, prefix);
 }
 
+static void common_chat_parse_magistral(common_chat_msg_parser & builder) {
+    builder.try_parse_reasoning("[THINK]", "[/THINK]");
+
+    if (!builder.syntax().parse_tool_calls) {
+        builder.add_content(builder.consume_rest());
+        return;
+    }
+
+    static const common_regex prefix(regex_escape("[TOOL_CALLS]"));
+    parse_prefixed_json_tool_call_array(builder, prefix);
+}
+
 static common_chat_params common_chat_params_init_command_r7b(const common_chat_template & tmpl, const struct templates_params & inputs) {
     common_chat_params data;
 
@@ -1250,7 +1324,78 @@ static common_chat_params common_chat_params_init_nemotron_v2(const common_chat_
     }
     return data;
 }
+
+static common_chat_params common_chat_params_init_apertus(const common_chat_template & tmpl, const struct templates_params & inputs) {
+    common_chat_params data;
+
+    // Generate the prompt using the apply() function with the template
+    data.prompt = apply(tmpl, inputs);
+    data.format = COMMON_CHAT_FORMAT_APERTUS;
+
+    // Handle thinking tags appropriately based on inputs.enable_thinking
+    if (string_ends_with(data.prompt, "<|inner_prefix|>")) {
+        if (!inputs.enable_thinking) {
+            data.prompt += "<|inner_suffix|>";
+        } else {
+            data.thinking_forced_open = true;
+        }
+    }
+
+    // When tools are present, build grammar for the <|tools_prefix|> format
+    if (!inputs.tools.is_null() && inputs.tools.is_array() && !inputs.tools.empty()) {
+        data.grammar_lazy = true;
+        data.grammar = build_grammar([&](const common_grammar_builder & builder) {
+            auto schemas = json::array();
+            foreach_function(inputs.tools, [&](const json & tool) {
+                const auto & function = tool.at("function");
+                schemas.push_back({
+                    { "type", "object" },
+                    { "properties",
+                      {
+                          { function.at("name"), function.at("parameters") }
+                      } },
+                    { "required", json::array({ function.at("name") }) },
+                });
+            });
+            auto schema = json{
+                { "type", "array" },
+                { "items", schemas.size() == 1 ? schemas[0] : json{ { "anyOf", schemas } } },
+                { "minItems", 1 },
+            };
+            if (!inputs.parallel_tool_calls) {
+                schema["maxItems"] = 1;
+            }
+            builder.add_rule("root",
+                std::string(data.thinking_forced_open ? "( \"<|inner_suffix|>\" space )? " : "") +
+                "\"<|tools_prefix|>\"" + builder.add_schema("tool_calls", schema) + "\"<|tools_suffix|>\"");
+        });
+        data.grammar_triggers.push_back({ COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL,
+            // If thinking_forced_open, then we capture the <|inner_suffix|> tag in the grammar,
+            // (important for required tool choice) and in the trigger's first capture (decides what is sent to the grammar)
+            std::string(data.thinking_forced_open ?
+                "[\\s\\S]*?(<\\|inner_suffix\\|>\\s*)" :
+                "(?:<\\|inner_prefix\\|>[\\s\\S]*?<\\|inner_suffix\\|>\\s*)?") +
+            "(<\\|tools_prefix\\|>)[\\s\\S]*" });
+        data.preserved_tokens = {
+            "<|system_start|>",
+            "<|system_end|>",
+            "<|developer_start|>",
+            "<|developer_end|>",
+            "<|user_start|>",
+            "<|user_end|>",
+            "<|assistant_start|>",
+            "<|assistant_end|>",
+            "<|inner_prefix|>",
+            "<|inner_suffix|>",
+            "<|tools_prefix|>",
+            "<|tools_suffix|>",
+        };
+    }
+    return data;
+}
 static void common_chat_parse_llama_3_1(common_chat_msg_parser & builder, bool with_builtin_tools = false) {
+    builder.try_parse_reasoning("<think>", "</think>");
+
     if (!builder.syntax().parse_tool_calls) {
         builder.add_content(builder.consume_rest());
         return;
@@ -1602,17 +1747,36 @@ static common_chat_params common_chat_params_init_gpt_oss(const common_chat_temp
             );
         });
 
-        auto recipient_in_role = builder.add_rule("recipient_in_role",
-            "\"<|start|>assistant\"? \" to=functions.\" ( " +
-            string_join(tool_rules_recipient_in_role, " | ") + " )"
-        );
-
         auto recipient_in_channel = builder.add_rule("recipient_in_channel",
             channel + " \" to=functions.\" ( " +
            string_join(tool_rules_recipient_in_channel, " | ") + " )"
         );
 
-        builder.add_rule("root", recipient_in_role + " | " + recipient_in_channel);
+        if (data.grammar_lazy) {
+            auto recipient_in_role = builder.add_rule("recipient_in_role",
+                "\"<|start|>assistant\"? \" to=functions.\" ( " +
+                string_join(tool_rules_recipient_in_role, " | ") + " )"
+            );
+
+            builder.add_rule("root", recipient_in_role + " | " + recipient_in_channel);
+        } else {
+            auto not_end = builder.add_rule("not-end",
+                "[^<] | \"<\" [^|] | \"<|\" [^e] | \"<|e\" [^n] | \"<|en\" [^d] | \"<|end\" [^|] | \"<|end|\" [^>]");
+            auto analysis = builder.add_rule("analysis",
+                "\"<|channel|>analysis<|message|>\" ( " + not_end + " )* \"<|end|>\"");
+            auto commentary = builder.add_rule("commentary",
+                "\"<|channel|>commentary<|message|>\" ( " + not_end + " )* \"<|end|>\"");
+
+            auto recipient_in_role = builder.add_rule("recipient_in_role",
+                "\" to=functions.\" ( " + string_join(tool_rules_recipient_in_role, " | ") + " )"
+            );
+
+            builder.add_rule("root",
+                "( " + analysis + " \"<|start|>assistant\" )? " +
+                "( " + commentary + " \"<|start|>assistant\" )? " +
+                "( " + recipient_in_role + " | " + recipient_in_channel + " )"
+            );
+        }
 
         // Trigger on tool calls that appear in the commentary channel
         data.grammar_triggers.push_back({
@@ -2290,6 +2454,37 @@ static void common_chat_parse_nemotron_v2(common_chat_msg_parser & builder) {
     builder.add_content(builder.consume_rest());
 }
 
+static void common_chat_parse_apertus(common_chat_msg_parser & builder) {
+    // Parse thinking tags
+    builder.try_parse_reasoning("<|inner_prefix|>", "<|inner_suffix|>");
+    if (!builder.syntax().parse_tool_calls) {
+        builder.add_content(builder.consume_rest());
+        return;
+    }
+
+    // Look for tool calls
+    static const common_regex tool_call_regex(regex_escape("<|tools_prefix|>"));
+    if (auto res = builder.try_find_regex(tool_call_regex)) {
+        builder.move_to(res->groups[0].end);
+
+        auto tool_calls_data = builder.consume_json();
+        if (tool_calls_data.json.is_array()) {
+            builder.consume_spaces();
+            if (!builder.try_consume_literal("<|tools_suffix|>")) {
+                throw common_chat_msg_partial_exception("Incomplete tool call");
+            }
+            for (const auto & value : tool_calls_data.json) {
+                if (value.is_object()) {
+                    builder.add_tool_call_short_form(value);
+                }
+            }
+        } else {
+            throw common_chat_msg_partial_exception("Incomplete tool call");
+        }
+    }
+    builder.add_content(builder.consume_rest());
+}
+
 static void common_chat_parse_seed_oss(common_chat_msg_parser & builder) {
     // Parse thinking tags first - this handles the main reasoning content
     builder.try_parse_reasoning("<seed:think>", "</seed:think>");
@@ -2534,6 +2729,11 @@ static common_chat_params common_chat_templates_apply_jinja(
         return common_chat_params_init_nemotron_v2(tmpl, params);
     }
 
+    // Apertus format detection
+    if (src.find("<|system_start|>") != std::string::npos && src.find("<|tools_prefix|>") != std::string::npos) {
+        return common_chat_params_init_apertus(tmpl, params);
+    }
+
     // Use generic handler when mixing tools + JSON schema.
     // TODO: support that mix in handlers below.
     if ((params.tools.is_array() && params.json_schema.is_object())) {
@@ -2562,6 +2762,10 @@
         return common_chat_params_init_llama_3_x(tmpl, params, allow_python_tag_builtin_tools);
     }
 
+    if (src.find("[THINK]") != std::string::npos && src.find("[/THINK]") != std::string::npos) {
+        return common_chat_params_init_magistral(tmpl, params);
+    }
+
     // Plain handler (no tools)
     if (params.tools.is_null() || inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_NONE) {
         return common_chat_params_init_without_tools(tmpl, params);
@@ -2646,6 +2850,7 @@ common_chat_params common_chat_templates_apply(
 }
 
 static void common_chat_parse_content_only(common_chat_msg_parser & builder) {
+    builder.try_parse_reasoning("<think>", "</think>");
     builder.add_content(builder.consume_rest());
 }
 
@@ -2662,6 +2867,9 @@ static void common_chat_parse(common_chat_msg_parser & builder) {
         case COMMON_CHAT_FORMAT_MISTRAL_NEMO:
             common_chat_parse_mistral_nemo(builder);
             break;
+        case COMMON_CHAT_FORMAT_MAGISTRAL:
+            common_chat_parse_magistral(builder);
+            break;
         case COMMON_CHAT_FORMAT_LLAMA_3_X:
             common_chat_parse_llama_3_1(builder);
             break;
@@ -2701,6 +2909,9 @@ static void common_chat_parse(common_chat_msg_parser & builder) {
         case COMMON_CHAT_FORMAT_NEMOTRON_V2:
             common_chat_parse_nemotron_v2(builder);
             break;
+        case COMMON_CHAT_FORMAT_APERTUS:
+            common_chat_parse_apertus(builder);
+            break;
         default:
             throw std::runtime_error(std::string("Unsupported format: ") + common_chat_format_name(builder.syntax().format));
     }

package/src/llama.cpp/common/chat.h

@@ -44,8 +44,8 @@ struct common_chat_msg_content_part {
 struct common_chat_msg {
     std::string role;
     std::string content;
-    std::vector<common_chat_msg_content_part> content_parts = {};
-    std::vector<common_chat_tool_call> tool_calls = {};
+    std::vector<common_chat_msg_content_part> content_parts;
+    std::vector<common_chat_tool_call> tool_calls;
     std::string reasoning_content;
     std::string tool_name;
     std::string tool_call_id;
@@ -55,7 +55,7 @@ struct common_chat_msg {
     bool empty() const {
         return content.empty() && content_parts.empty() && tool_calls.empty() && reasoning_content.empty() && tool_name.empty() && tool_call_id.empty();
     }
-    void ensure_tool_call_ids_set(std::vector<std::string> & ids_cache, const std::function<std::string()> & gen_tool_call_id) {
+    void set_tool_call_ids(std::vector<std::string> & ids_cache, const std::function<std::string()> & gen_tool_call_id) {
         for (auto i = 0u; i < tool_calls.size(); i++) {
             if (ids_cache.size() <= i) {
                 auto id = tool_calls[i].id;
@@ -112,6 +112,7 @@ enum common_chat_format {
     COMMON_CHAT_FORMAT_CONTENT_ONLY,
     COMMON_CHAT_FORMAT_GENERIC,
     COMMON_CHAT_FORMAT_MISTRAL_NEMO,
+    COMMON_CHAT_FORMAT_MAGISTRAL,
     COMMON_CHAT_FORMAT_LLAMA_3_X,
     COMMON_CHAT_FORMAT_LLAMA_3_X_WITH_BUILTIN_TOOLS,
     COMMON_CHAT_FORMAT_DEEPSEEK_R1,
@@ -125,6 +126,7 @@ enum common_chat_format {
     COMMON_CHAT_FORMAT_GPT_OSS,
     COMMON_CHAT_FORMAT_SEED_OSS,
     COMMON_CHAT_FORMAT_NEMOTRON_V2,
+    COMMON_CHAT_FORMAT_APERTUS,
 
     COMMON_CHAT_FORMAT_COUNT, // Not a format, just the # formats
 };

package/src/llama.cpp/common/common.cpp

@@ -51,6 +51,11 @@
 #include <unistd.h>
 #endif
 
+#if defined(__linux__)
+#include <sys/types.h>
+#include <pwd.h>
+#endif
+
 #if defined(_MSC_VER)
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif
@@ -865,8 +870,20 @@ std::string fs_get_cache_directory() {
 #if defined(__linux__) || defined(__FreeBSD__) || defined(_AIX) || defined(__OpenBSD__)
     if (std::getenv("XDG_CACHE_HOME")) {
         cache_directory = std::getenv("XDG_CACHE_HOME");
-    } else {
+    } else if (std::getenv("HOME")) {
         cache_directory = std::getenv("HOME") + std::string("/.cache/");
+    } else {
+#if defined(__linux__)
+        /* no $HOME is defined, fallback to getpwuid */
+        struct passwd *pw = getpwuid(getuid());
+        if ((!pw) || (!pw->pw_dir)) {
+            throw std::runtime_error("Failed to find $HOME directory");
+        }
+
+        cache_directory = std::string(pw->pw_dir) + std::string("/.cache/");
+#else /* defined(__linux__) */
+        throw std::runtime_error("Failed to find $HOME directory");
+#endif /* defined(__linux__) */
     }
 #elif defined(__APPLE__)
     cache_directory = std::getenv("HOME") + std::string("/Library/Caches/");
@@ -961,15 +978,13 @@ struct common_init_result common_init_from_params(common_params & params) {
 
     bool has_eos = llama_vocab_eos(vocab) != LLAMA_TOKEN_NULL;
     bool has_sep = llama_vocab_sep(vocab) != LLAMA_TOKEN_NULL;
+    bool has_rerank_prompt = llama_model_chat_template(model, "rerank") != NULL;
 
-    if (!has_eos && !has_sep) {
-        LOG_WRN("%s: warning: vocab does not have an EOS token or SEP token, reranking will not work\n", __func__);
+    if (!has_eos && !has_sep && !has_rerank_prompt) {
+        LOG_WRN("%s: warning: vocab does not have an EOS token, SEP token, or rerank prompt. Reranking will not work\n", __func__);
         ok = false;
     } else if (!has_eos) {
         LOG_WRN("%s: warning: vocab does not have an EOS token, using SEP token as fallback\n", __func__);
-    } else if (!has_sep) {
-        LOG_WRN("%s: warning: vocab does not have a SEP token, reranking will not work\n", __func__);
-        ok = false;
     }
 
     if (!ok) {
@@ -1119,6 +1134,7 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
     mparams.use_mlock = params.use_mlock;
     mparams.check_tensors = params.check_tensors;
     mparams.use_extra_bufts = !params.no_extra_bufts;
+    mparams.no_host = params.no_host;
 
     if (params.kv_overrides.empty()) {
         mparams.kv_overrides = NULL;

package/src/llama.cpp/common/common.h

@@ -379,7 +379,7 @@ struct common_params {
     bool simple_io = false; // improves compatibility with subprocesses and limited consoles
     bool cont_batching = true; // insert new sequences for decoding on-the-fly
     bool no_perf = false; // disable performance metrics
-    bool ctx_shift = false; // context shift on infinite text generation
+    bool ctx_shift = false;            // context shift on infinite text generation
     bool swa_full = false; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
     bool kv_unified = false; // enable unified KV cache
 
@@ -393,6 +393,7 @@
     bool check_tensors = false; // validate tensor data
     bool no_op_offload = false; // globally disable offload host tensor operations to device
     bool no_extra_bufts = false; // disable extra buffer types (used for weight repacking)
+    bool no_host = false; // bypass host buffer allowing extra buffers to be used
 
     bool single_turn = false; // single turn chat conversation
 
@@ -425,7 +426,8 @@ struct common_params {
     int32_t timeout_write = timeout_read; // http write timeout in seconds
     int32_t n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool)
     int32_t n_cache_reuse = 0; // min chunk size to reuse from the cache via KV shifting
-    int32_t n_swa_checkpoints = 3; // max number of SWA checkpoints per slot
+    int32_t n_ctx_checkpoints = 8; // max number of context checkpoints per slot
+    int32_t cache_ram_mib = 8192; // 0 = no limit, 1 = 1 MiB, etc.
 
     std::string hostname = "127.0.0.1";
     std::string public_path = ""; // NOLINT
@@ -433,7 +435,7 @@
     std::string chat_template = ""; // NOLINT
     bool use_jinja = false; // NOLINT
     bool enable_chat_template = true;
-    common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_AUTO;
+    common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
     int reasoning_budget = -1;
     bool prefill_assistant = true; // if true, any trailing assistant message will be prefilled into the response
 
@@ -739,7 +741,7 @@ const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
 // MoE utils
 //
 
-const char * const LLM_FFN_EXPS_REGEX = "\\.ffn_(up|down|gate)_exps";
+const char * const LLM_FFN_EXPS_REGEX = "\\.ffn_(up|down|gate)_(ch|)exps";
 
 static std::string llm_ffn_exps_block_regex(int idx) {
     return string_format("blk\\.%d%s", idx, LLM_FFN_EXPS_REGEX);