@fugood/llama.node 1.2.3 → 1.2.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +14 -14
- package/scripts/llama.cpp.patch +33 -11
- package/src/llama.cpp/CMakeLists.txt +1 -0
- package/src/llama.cpp/common/CMakeLists.txt +46 -2
- package/src/llama.cpp/common/arg.cpp +484 -204
- package/src/llama.cpp/common/arg.h +0 -1
- package/src/llama.cpp/common/chat-parser.cpp +156 -15
- package/src/llama.cpp/common/chat-parser.h +3 -0
- package/src/llama.cpp/common/chat.cpp +217 -6
- package/src/llama.cpp/common/chat.h +5 -3
- package/src/llama.cpp/common/common.cpp +22 -6
- package/src/llama.cpp/common/common.h +6 -4
- package/src/llama.cpp/common/http.h +73 -0
- package/src/llama.cpp/common/json-partial.cpp +51 -0
- package/src/llama.cpp/ggml/CMakeLists.txt +7 -6
- package/src/llama.cpp/ggml/include/ggml-backend.h +2 -0
- package/src/llama.cpp/ggml/include/ggml-rpc.h +8 -9
- package/src/llama.cpp/ggml/include/ggml.h +22 -0
- package/src/llama.cpp/ggml/src/CMakeLists.txt +3 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +12 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +1 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +12 -12
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +100 -3
- package/src/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +0 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +10 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +209 -96
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +32 -44
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +107 -83
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +17 -17
- package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +8 -8
- package/src/llama.cpp/ggml/src/ggml-cpu/spacemit/ime.cpp +1024 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/spacemit/ime.h +13 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/spacemit/ime1_kernels.cpp +3196 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/spacemit/ime_kernels.h +26 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.cpp +103 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.h +1 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +66 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +11 -9
- package/src/llama.cpp/include/llama.h +8 -0
- package/src/llama.cpp/src/llama-arch.cpp +93 -0
- package/src/llama.cpp/src/llama-arch.h +22 -0
- package/src/llama.cpp/src/llama-chat.cpp +1 -1
- package/src/llama.cpp/src/llama-context.cpp +6 -0
- package/src/llama.cpp/src/llama-graph.cpp +57 -22
- package/src/llama.cpp/src/llama-graph.h +10 -1
- package/src/llama.cpp/src/llama-hparams.cpp +5 -1
- package/src/llama.cpp/src/llama-hparams.h +17 -2
- package/src/llama.cpp/src/llama-kv-cache-iswa.cpp +2 -2
- package/src/llama.cpp/src/llama-kv-cache.cpp +2 -5
- package/src/llama.cpp/src/llama-memory-hybrid.cpp +11 -9
- package/src/llama.cpp/src/llama-memory-recurrent.cpp +11 -3
- package/src/llama.cpp/src/llama-model-loader.cpp +2 -0
- package/src/llama.cpp/src/llama-model.cpp +572 -45
- package/src/llama.cpp/src/llama-model.h +18 -0
- package/src/llama.cpp/src/llama-sampling.cpp +5 -0
- package/src/llama.cpp/src/llama-vocab.cpp +7 -1
- package/src/llama.cpp/src/llama-vocab.h +41 -40
- package/src/llama.cpp/src/unicode.h +43 -0
|
@@ -78,7 +78,6 @@ bool common_params_parse(int argc, char ** argv, common_params & params, llama_e
|
|
|
78
78
|
|
|
79
79
|
// function to be used by test-arg-parser
|
|
80
80
|
common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);
|
|
81
|
-
bool common_has_curl();
|
|
82
81
|
|
|
83
82
|
struct common_remote_params {
|
|
84
83
|
std::vector<std::string> headers;
|
|
@@ -3,9 +3,12 @@
|
|
|
3
3
|
#include "log.h"
|
|
4
4
|
#include "regex-partial.h"
|
|
5
5
|
|
|
6
|
+
#include <algorithm>
|
|
7
|
+
#include <cctype>
|
|
6
8
|
#include <optional>
|
|
7
9
|
#include <stdexcept>
|
|
8
10
|
#include <string>
|
|
11
|
+
#include <string_view>
|
|
9
12
|
#include <vector>
|
|
10
13
|
|
|
11
14
|
using json = nlohmann::ordered_json;
|
|
@@ -75,6 +78,35 @@ bool common_chat_msg_parser::add_tool_calls(const json & arr) {
|
|
|
75
78
|
}
|
|
76
79
|
return true;
|
|
77
80
|
}
|
|
81
|
+
|
|
82
|
+
bool common_chat_msg_parser::add_tool_call_short_form(const json & tool_call) {
|
|
83
|
+
if (!tool_call.is_object() || tool_call.size() != 1) {
|
|
84
|
+
return false;
|
|
85
|
+
}
|
|
86
|
+
|
|
87
|
+
// Get the tool name (the single key in the object)
|
|
88
|
+
auto it = tool_call.begin();
|
|
89
|
+
std::string name = it.key();
|
|
90
|
+
|
|
91
|
+
if (name.empty()) {
|
|
92
|
+
return false;
|
|
93
|
+
}
|
|
94
|
+
|
|
95
|
+
// Get the arguments (the nested object)
|
|
96
|
+
const json & args_json = it.value();
|
|
97
|
+
std::string arguments = "";
|
|
98
|
+
|
|
99
|
+
if (args_json.is_object()) {
|
|
100
|
+
arguments = args_json.dump();
|
|
101
|
+
} else if (args_json.is_string()) {
|
|
102
|
+
arguments = args_json;
|
|
103
|
+
} else if (!args_json.is_null()) {
|
|
104
|
+
// For other types, convert to string representation
|
|
105
|
+
arguments = args_json.dump();
|
|
106
|
+
}
|
|
107
|
+
|
|
108
|
+
return add_tool_call(name, "", arguments);
|
|
109
|
+
}
|
|
78
110
|
void common_chat_msg_parser::finish() {
|
|
79
111
|
if (!is_partial_ && pos_ != input_.size()) {
|
|
80
112
|
throw std::runtime_error("Unexpected content at end of input");// + input_.substr(pos_));
|
|
@@ -137,6 +169,27 @@ void common_chat_msg_parser::consume_literal(const std::string & literal) {
|
|
|
137
169
|
}
|
|
138
170
|
|
|
139
171
|
bool common_chat_msg_parser::try_parse_reasoning(const std::string & start_think, const std::string & end_think) {
|
|
172
|
+
std::string pending_reasoning_prefix;
|
|
173
|
+
|
|
174
|
+
if (syntax_.reasoning_format == COMMON_REASONING_FORMAT_NONE) {
|
|
175
|
+
return false;
|
|
176
|
+
}
|
|
177
|
+
|
|
178
|
+
auto set_reasoning_prefix = [&](size_t prefix_pos) {
|
|
179
|
+
if (!syntax_.thinking_forced_open || syntax_.reasoning_in_content) {
|
|
180
|
+
return;
|
|
181
|
+
}
|
|
182
|
+
if (prefix_pos + start_think.size() > input_.size()) {
|
|
183
|
+
pending_reasoning_prefix.clear();
|
|
184
|
+
return;
|
|
185
|
+
}
|
|
186
|
+
// Capture the exact literal that opened the reasoning section so we can
|
|
187
|
+
// surface it back to callers. This ensures formats that force the
|
|
188
|
+
// reasoning tag open (e.g. DeepSeek R1) retain their original prefix
|
|
189
|
+
// instead of dropping it during parsing.
|
|
190
|
+
pending_reasoning_prefix = input_.substr(prefix_pos, start_think.size());
|
|
191
|
+
};
|
|
192
|
+
|
|
140
193
|
auto handle_reasoning = [&](const std::string & reasoning, bool closed) {
|
|
141
194
|
auto stripped_reasoning = string_strip(reasoning);
|
|
142
195
|
if (stripped_reasoning.empty()) {
|
|
@@ -149,28 +202,116 @@ bool common_chat_msg_parser::try_parse_reasoning(const std::string & start_think
|
|
|
149
202
|
add_content(syntax_.reasoning_format == COMMON_REASONING_FORMAT_DEEPSEEK ? "</think>" : end_think);
|
|
150
203
|
}
|
|
151
204
|
} else {
|
|
205
|
+
if (!pending_reasoning_prefix.empty()) {
|
|
206
|
+
add_reasoning_content(pending_reasoning_prefix);
|
|
207
|
+
pending_reasoning_prefix.clear();
|
|
208
|
+
}
|
|
152
209
|
add_reasoning_content(stripped_reasoning);
|
|
153
210
|
}
|
|
154
211
|
};
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
212
|
+
|
|
213
|
+
const size_t saved_pos = pos_;
|
|
214
|
+
const size_t saved_content_size = result_.content.size();
|
|
215
|
+
const size_t saved_reasoning_size = result_.reasoning_content.size();
|
|
216
|
+
|
|
217
|
+
auto restore_state = [&]() {
|
|
218
|
+
move_to(saved_pos);
|
|
219
|
+
result_.content.resize(saved_content_size);
|
|
220
|
+
result_.reasoning_content.resize(saved_reasoning_size);
|
|
221
|
+
};
|
|
222
|
+
|
|
223
|
+
// Allow leading whitespace to be preserved as content when reasoning is present at the start
|
|
224
|
+
size_t cursor = pos_;
|
|
225
|
+
size_t whitespace_end = cursor;
|
|
226
|
+
while (whitespace_end < input_.size() && std::isspace(static_cast<unsigned char>(input_[whitespace_end]))) {
|
|
227
|
+
++whitespace_end;
|
|
228
|
+
}
|
|
229
|
+
|
|
230
|
+
if (whitespace_end >= input_.size()) {
|
|
231
|
+
restore_state();
|
|
232
|
+
if (syntax_.thinking_forced_open) {
|
|
233
|
+
auto rest = input_.substr(saved_pos);
|
|
163
234
|
if (!rest.empty()) {
|
|
164
235
|
handle_reasoning(rest, /* closed */ !is_partial());
|
|
165
236
|
}
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
237
|
+
move_to(input_.size());
|
|
238
|
+
return true;
|
|
239
|
+
}
|
|
240
|
+
return false;
|
|
241
|
+
}
|
|
242
|
+
|
|
243
|
+
cursor = whitespace_end;
|
|
244
|
+
const size_t remaining = input_.size() - cursor;
|
|
245
|
+
const size_t start_prefix = std::min(start_think.size(), remaining);
|
|
246
|
+
const bool has_start_tag = input_.compare(cursor, start_prefix, start_think, 0, start_prefix) == 0;
|
|
247
|
+
|
|
248
|
+
if (has_start_tag && start_prefix < start_think.size()) {
|
|
249
|
+
move_to(input_.size());
|
|
250
|
+
return true;
|
|
251
|
+
}
|
|
252
|
+
|
|
253
|
+
if (has_start_tag) {
|
|
254
|
+
if (whitespace_end > pos_) {
|
|
255
|
+
add_content(input_.substr(pos_, whitespace_end - pos_));
|
|
256
|
+
}
|
|
257
|
+
set_reasoning_prefix(cursor);
|
|
258
|
+
cursor += start_think.size();
|
|
259
|
+
} else if (syntax_.thinking_forced_open) {
|
|
260
|
+
cursor = whitespace_end;
|
|
261
|
+
} else {
|
|
262
|
+
restore_state();
|
|
263
|
+
return false;
|
|
264
|
+
}
|
|
265
|
+
while (true) {
|
|
266
|
+
if (cursor >= input_.size()) {
|
|
267
|
+
move_to(input_.size());
|
|
170
268
|
return true;
|
|
171
269
|
}
|
|
270
|
+
|
|
271
|
+
size_t end_pos = input_.find(end_think, cursor);
|
|
272
|
+
if (end_pos == std::string::npos) {
|
|
273
|
+
std::string_view remaining_view(input_.data() + cursor, input_.size() - cursor);
|
|
274
|
+
size_t partial_off = string_find_partial_stop(remaining_view, end_think);
|
|
275
|
+
size_t reasoning_end = partial_off == std::string::npos ? input_.size() : cursor + partial_off;
|
|
276
|
+
if (reasoning_end > cursor) {
|
|
277
|
+
handle_reasoning(input_.substr(cursor, reasoning_end - cursor), /* closed */ partial_off == std::string::npos && !is_partial());
|
|
278
|
+
}
|
|
279
|
+
move_to(input_.size());
|
|
280
|
+
return true;
|
|
281
|
+
}
|
|
282
|
+
|
|
283
|
+
if (end_pos > cursor) {
|
|
284
|
+
handle_reasoning(input_.substr(cursor, end_pos - cursor), /* closed */ true);
|
|
285
|
+
} else {
|
|
286
|
+
handle_reasoning("", /* closed */ true);
|
|
287
|
+
}
|
|
288
|
+
|
|
289
|
+
cursor = end_pos + end_think.size();
|
|
290
|
+
|
|
291
|
+
while (cursor < input_.size() && std::isspace(static_cast<unsigned char>(input_[cursor]))) {
|
|
292
|
+
++cursor;
|
|
293
|
+
}
|
|
294
|
+
|
|
295
|
+
const size_t next_remaining = input_.size() - cursor;
|
|
296
|
+
if (next_remaining == 0) {
|
|
297
|
+
move_to(cursor);
|
|
298
|
+
return true;
|
|
299
|
+
}
|
|
300
|
+
|
|
301
|
+
const size_t next_prefix = std::min(start_think.size(), next_remaining);
|
|
302
|
+
if (input_.compare(cursor, next_prefix, start_think, 0, next_prefix) == 0) {
|
|
303
|
+
if (next_prefix < start_think.size()) {
|
|
304
|
+
move_to(input_.size());
|
|
305
|
+
return true;
|
|
306
|
+
}
|
|
307
|
+
set_reasoning_prefix(cursor);
|
|
308
|
+
cursor += start_think.size();
|
|
309
|
+
continue;
|
|
310
|
+
}
|
|
311
|
+
|
|
312
|
+
move_to(cursor);
|
|
313
|
+
return true;
|
|
172
314
|
}
|
|
173
|
-
return false;
|
|
174
315
|
}
|
|
175
316
|
|
|
176
317
|
std::string common_chat_msg_parser::consume_rest() {
|
|
@@ -291,7 +432,7 @@ std::optional<common_chat_msg_parser::consume_json_result> common_chat_msg_parse
|
|
|
291
432
|
if (is_arguments_path({})) {
|
|
292
433
|
// Entire JSON is the arguments and was parsed fully.
|
|
293
434
|
return consume_json_result {
|
|
294
|
-
partial->json.dump(),
|
|
435
|
+
partial->json.dump(/* indent */ -1, /* indent_char */ ' ', /* ensure_ascii */ true),
|
|
295
436
|
/* .is_partial = */ false,
|
|
296
437
|
};
|
|
297
438
|
}
|
|
@@ -303,7 +444,7 @@ std::optional<common_chat_msg_parser::consume_json_result> common_chat_msg_parse
|
|
|
303
444
|
std::vector<std::string> path;
|
|
304
445
|
std::function<json(const json &)> remove_unsupported_healings_and_dump_args = [&](const json & j) -> json {
|
|
305
446
|
if (is_arguments_path(path)) {
|
|
306
|
-
auto arguments = j.dump();
|
|
447
|
+
auto arguments = j.dump(/* indent */ -1, /* indent_char */ ' ', /* ensure_ascii */ true);
|
|
307
448
|
if (is_partial() && !partial->healing_marker.marker.empty()) {
|
|
308
449
|
auto idx = arguments.find(partial->healing_marker.json_dump_marker);
|
|
309
450
|
if (idx != std::string::npos) {
|
|
@@ -64,6 +64,9 @@ class common_chat_msg_parser {
|
|
|
64
64
|
// Adds an array of tool calls using their "name", "id" and "arguments" fields.
|
|
65
65
|
bool add_tool_calls(const nlohmann::ordered_json & arr);
|
|
66
66
|
|
|
67
|
+
// Adds a tool call using the short form: { "tool_name": { "arg1": val, "arg2": val } }
|
|
68
|
+
bool add_tool_call_short_form(const nlohmann::ordered_json & tool_call);
|
|
69
|
+
|
|
67
70
|
void finish();
|
|
68
71
|
|
|
69
72
|
bool consume_spaces();
|
|
@@ -612,6 +612,7 @@ const char * common_chat_format_name(common_chat_format format) {
|
|
|
612
612
|
case COMMON_CHAT_FORMAT_CONTENT_ONLY: return "Content-only";
|
|
613
613
|
case COMMON_CHAT_FORMAT_GENERIC: return "Generic";
|
|
614
614
|
case COMMON_CHAT_FORMAT_MISTRAL_NEMO: return "Mistral Nemo";
|
|
615
|
+
case COMMON_CHAT_FORMAT_MAGISTRAL: return "Magistral";
|
|
615
616
|
case COMMON_CHAT_FORMAT_LLAMA_3_X: return "Llama 3.x";
|
|
616
617
|
case COMMON_CHAT_FORMAT_LLAMA_3_X_WITH_BUILTIN_TOOLS: return "Llama 3.x with builtin tools";
|
|
617
618
|
case COMMON_CHAT_FORMAT_DEEPSEEK_R1: return "DeepSeek R1";
|
|
@@ -625,6 +626,7 @@ const char * common_chat_format_name(common_chat_format format) {
|
|
|
625
626
|
case COMMON_CHAT_FORMAT_GPT_OSS: return "GPT-OSS";
|
|
626
627
|
case COMMON_CHAT_FORMAT_SEED_OSS: return "Seed-OSS";
|
|
627
628
|
case COMMON_CHAT_FORMAT_NEMOTRON_V2: return "Nemotron V2";
|
|
629
|
+
case COMMON_CHAT_FORMAT_APERTUS: return "Apertus";
|
|
628
630
|
default:
|
|
629
631
|
throw std::runtime_error("Unknown chat format");
|
|
630
632
|
}
|
|
@@ -788,6 +790,7 @@ static std::string apply(
|
|
|
788
790
|
}
|
|
789
791
|
tmpl_inputs.add_generation_prompt = inputs.add_generation_prompt;
|
|
790
792
|
tmpl_inputs.extra_context = inputs.extra_context;
|
|
793
|
+
tmpl_inputs.extra_context["enable_thinking"] = inputs.enable_thinking;
|
|
791
794
|
if (additional_context) {
|
|
792
795
|
tmpl_inputs.extra_context.merge_patch(*additional_context);
|
|
793
796
|
}
|
|
@@ -968,6 +971,65 @@ static common_chat_params common_chat_params_init_mistral_nemo(const common_chat
|
|
|
968
971
|
data.format = COMMON_CHAT_FORMAT_MISTRAL_NEMO;
|
|
969
972
|
return data;
|
|
970
973
|
}
|
|
974
|
+
|
|
975
|
+
static common_chat_params common_chat_params_init_magistral(const common_chat_template & tmpl, const struct templates_params & inputs) {
|
|
976
|
+
common_chat_params data;
|
|
977
|
+
data.prompt = apply(tmpl, inputs);
|
|
978
|
+
data.format = COMMON_CHAT_FORMAT_MAGISTRAL;
|
|
979
|
+
data.preserved_tokens = {
|
|
980
|
+
"[THINK]",
|
|
981
|
+
"[/THINK]",
|
|
982
|
+
};
|
|
983
|
+
|
|
984
|
+
if (inputs.tools.is_array() && !inputs.tools.empty()) {
|
|
985
|
+
data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
|
|
986
|
+
data.grammar = build_grammar([&](const common_grammar_builder & builder) {
|
|
987
|
+
auto schemas = json::array();
|
|
988
|
+
foreach_function(inputs.tools, [&](const json & tool) {
|
|
989
|
+
const auto & function = tool.at("function");
|
|
990
|
+
schemas.push_back({
|
|
991
|
+
{"type", "object"},
|
|
992
|
+
{"properties", {
|
|
993
|
+
{"name", {
|
|
994
|
+
{"type", "string"},
|
|
995
|
+
{"const", function.at("name")},
|
|
996
|
+
}},
|
|
997
|
+
{"arguments", function.at("parameters")},
|
|
998
|
+
{"id", {
|
|
999
|
+
{"type", "string"},
|
|
1000
|
+
{"pattern", "^[a-zA-Z0-9]{9}$"},
|
|
1001
|
+
}},
|
|
1002
|
+
}},
|
|
1003
|
+
{"required", json::array({"name", "arguments", "id"})},
|
|
1004
|
+
});
|
|
1005
|
+
});
|
|
1006
|
+
auto schema = json {
|
|
1007
|
+
{"type", "array"},
|
|
1008
|
+
{"items", schemas.size() == 1 ? schemas[0] : json {{"anyOf", schemas}}},
|
|
1009
|
+
{"minItems", 1},
|
|
1010
|
+
};
|
|
1011
|
+
if (!inputs.parallel_tool_calls) {
|
|
1012
|
+
schema["maxItems"] = 1;
|
|
1013
|
+
}
|
|
1014
|
+
builder.add_rule("root", "\"[TOOL_CALLS]\" " + builder.add_schema("tool_calls", schema));
|
|
1015
|
+
});
|
|
1016
|
+
data.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "[TOOL_CALLS]"});
|
|
1017
|
+
data.preserved_tokens.push_back("[TOOL_CALLS]");
|
|
1018
|
+
} else {
|
|
1019
|
+
data.grammar_lazy = false;
|
|
1020
|
+
if (!inputs.json_schema.is_null()) {
|
|
1021
|
+
if (!inputs.grammar.empty()) {
|
|
1022
|
+
throw std::runtime_error("Either \"json_schema\" or \"grammar\" can be specified, but not both");
|
|
1023
|
+
}
|
|
1024
|
+
data.grammar = json_schema_to_grammar(inputs.json_schema);
|
|
1025
|
+
} else {
|
|
1026
|
+
data.grammar = inputs.grammar;
|
|
1027
|
+
}
|
|
1028
|
+
}
|
|
1029
|
+
|
|
1030
|
+
return data;
|
|
1031
|
+
}
|
|
1032
|
+
|
|
971
1033
|
static void common_chat_parse_mistral_nemo(common_chat_msg_parser & builder) {
|
|
972
1034
|
if (!builder.syntax().parse_tool_calls) {
|
|
973
1035
|
builder.add_content(builder.consume_rest());
|
|
@@ -978,6 +1040,18 @@ static void common_chat_parse_mistral_nemo(common_chat_msg_parser & builder) {
|
|
|
978
1040
|
parse_prefixed_json_tool_call_array(builder, prefix);
|
|
979
1041
|
}
|
|
980
1042
|
|
|
1043
|
+
static void common_chat_parse_magistral(common_chat_msg_parser & builder) {
|
|
1044
|
+
builder.try_parse_reasoning("[THINK]", "[/THINK]");
|
|
1045
|
+
|
|
1046
|
+
if (!builder.syntax().parse_tool_calls) {
|
|
1047
|
+
builder.add_content(builder.consume_rest());
|
|
1048
|
+
return;
|
|
1049
|
+
}
|
|
1050
|
+
|
|
1051
|
+
static const common_regex prefix(regex_escape("[TOOL_CALLS]"));
|
|
1052
|
+
parse_prefixed_json_tool_call_array(builder, prefix);
|
|
1053
|
+
}
|
|
1054
|
+
|
|
981
1055
|
static common_chat_params common_chat_params_init_command_r7b(const common_chat_template & tmpl, const struct templates_params & inputs) {
|
|
982
1056
|
common_chat_params data;
|
|
983
1057
|
|
|
@@ -1250,7 +1324,78 @@ static common_chat_params common_chat_params_init_nemotron_v2(const common_chat_
|
|
|
1250
1324
|
}
|
|
1251
1325
|
return data;
|
|
1252
1326
|
}
|
|
1327
|
+
|
|
1328
|
+
static common_chat_params common_chat_params_init_apertus(const common_chat_template & tmpl, const struct templates_params & inputs) {
|
|
1329
|
+
common_chat_params data;
|
|
1330
|
+
|
|
1331
|
+
// Generate the prompt using the apply() function with the template
|
|
1332
|
+
data.prompt = apply(tmpl, inputs);
|
|
1333
|
+
data.format = COMMON_CHAT_FORMAT_APERTUS;
|
|
1334
|
+
|
|
1335
|
+
// Handle thinking tags appropriately based on inputs.enable_thinking
|
|
1336
|
+
if (string_ends_with(data.prompt, "<|inner_prefix|>")) {
|
|
1337
|
+
if (!inputs.enable_thinking) {
|
|
1338
|
+
data.prompt += "<|inner_suffix|>";
|
|
1339
|
+
} else {
|
|
1340
|
+
data.thinking_forced_open = true;
|
|
1341
|
+
}
|
|
1342
|
+
}
|
|
1343
|
+
|
|
1344
|
+
// When tools are present, build grammar for the <|tools_prefix|> format
|
|
1345
|
+
if (!inputs.tools.is_null() && inputs.tools.is_array() && !inputs.tools.empty()) {
|
|
1346
|
+
data.grammar_lazy = true;
|
|
1347
|
+
data.grammar = build_grammar([&](const common_grammar_builder & builder) {
|
|
1348
|
+
auto schemas = json::array();
|
|
1349
|
+
foreach_function(inputs.tools, [&](const json & tool) {
|
|
1350
|
+
const auto & function = tool.at("function");
|
|
1351
|
+
schemas.push_back({
|
|
1352
|
+
{ "type", "object" },
|
|
1353
|
+
{ "properties",
|
|
1354
|
+
{
|
|
1355
|
+
{ function.at("name"), function.at("parameters") }
|
|
1356
|
+
} },
|
|
1357
|
+
{ "required", json::array({ function.at("name") }) },
|
|
1358
|
+
});
|
|
1359
|
+
});
|
|
1360
|
+
auto schema = json{
|
|
1361
|
+
{ "type", "array" },
|
|
1362
|
+
{ "items", schemas.size() == 1 ? schemas[0] : json{ { "anyOf", schemas } } },
|
|
1363
|
+
{ "minItems", 1 },
|
|
1364
|
+
};
|
|
1365
|
+
if (!inputs.parallel_tool_calls) {
|
|
1366
|
+
schema["maxItems"] = 1;
|
|
1367
|
+
}
|
|
1368
|
+
builder.add_rule("root",
|
|
1369
|
+
std::string(data.thinking_forced_open ? "( \"<|inner_suffix|>\" space )? " : "") +
|
|
1370
|
+
"\"<|tools_prefix|>\"" + builder.add_schema("tool_calls", schema) + "\"<|tools_suffix|>\"");
|
|
1371
|
+
});
|
|
1372
|
+
data.grammar_triggers.push_back({ COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL,
|
|
1373
|
+
// If thinking_forced_open, then we capture the <|inner_suffix|> tag in the grammar,
|
|
1374
|
+
// (important for required tool choice) and in the trigger's first capture (decides what is sent to the grammar)
|
|
1375
|
+
std::string(data.thinking_forced_open ?
|
|
1376
|
+
"[\\s\\S]*?(<\\|inner_suffix\\|>\\s*)" :
|
|
1377
|
+
"(?:<\\|inner_prefix\\|>[\\s\\S]*?<\\|inner_suffix\\|>\\s*)?") +
|
|
1378
|
+
"(<\\|tools_prefix\\|>)[\\s\\S]*" });
|
|
1379
|
+
data.preserved_tokens = {
|
|
1380
|
+
"<|system_start|>",
|
|
1381
|
+
"<|system_end|>",
|
|
1382
|
+
"<|developer_start|>",
|
|
1383
|
+
"<|developer_end|>",
|
|
1384
|
+
"<|user_start|>",
|
|
1385
|
+
"<|user_end|>",
|
|
1386
|
+
"<|assistant_start|>",
|
|
1387
|
+
"<|assistant_end|>",
|
|
1388
|
+
"<|inner_prefix|>",
|
|
1389
|
+
"<|inner_suffix|>",
|
|
1390
|
+
"<|tools_prefix|>",
|
|
1391
|
+
"<|tools_suffix|>",
|
|
1392
|
+
};
|
|
1393
|
+
}
|
|
1394
|
+
return data;
|
|
1395
|
+
}
|
|
1253
1396
|
static void common_chat_parse_llama_3_1(common_chat_msg_parser & builder, bool with_builtin_tools = false) {
|
|
1397
|
+
builder.try_parse_reasoning("<think>", "</think>");
|
|
1398
|
+
|
|
1254
1399
|
if (!builder.syntax().parse_tool_calls) {
|
|
1255
1400
|
builder.add_content(builder.consume_rest());
|
|
1256
1401
|
return;
|
|
@@ -1602,17 +1747,36 @@ static common_chat_params common_chat_params_init_gpt_oss(const common_chat_temp
|
|
|
1602
1747
|
);
|
|
1603
1748
|
});
|
|
1604
1749
|
|
|
1605
|
-
auto recipient_in_role = builder.add_rule("recipient_in_role",
|
|
1606
|
-
"\"<|start|>assistant\"? \" to=functions.\" ( " +
|
|
1607
|
-
string_join(tool_rules_recipient_in_role, " | ") + " )"
|
|
1608
|
-
);
|
|
1609
|
-
|
|
1610
1750
|
auto recipient_in_channel = builder.add_rule("recipient_in_channel",
|
|
1611
1751
|
channel + " \" to=functions.\" ( " +
|
|
1612
1752
|
string_join(tool_rules_recipient_in_channel, " | ") + " )"
|
|
1613
1753
|
);
|
|
1614
1754
|
|
|
1615
|
-
|
|
1755
|
+
if (data.grammar_lazy) {
|
|
1756
|
+
auto recipient_in_role = builder.add_rule("recipient_in_role",
|
|
1757
|
+
"\"<|start|>assistant\"? \" to=functions.\" ( " +
|
|
1758
|
+
string_join(tool_rules_recipient_in_role, " | ") + " )"
|
|
1759
|
+
);
|
|
1760
|
+
|
|
1761
|
+
builder.add_rule("root", recipient_in_role + " | " + recipient_in_channel);
|
|
1762
|
+
} else {
|
|
1763
|
+
auto not_end = builder.add_rule("not-end",
|
|
1764
|
+
"[^<] | \"<\" [^|] | \"<|\" [^e] | \"<|e\" [^n] | \"<|en\" [^d] | \"<|end\" [^|] | \"<|end|\" [^>]");
|
|
1765
|
+
auto analysis = builder.add_rule("analysis",
|
|
1766
|
+
"\"<|channel|>analysis<|message|>\" ( " + not_end + " )* \"<|end|>\"");
|
|
1767
|
+
auto commentary = builder.add_rule("commentary",
|
|
1768
|
+
"\"<|channel|>commentary<|message|>\" ( " + not_end + " )* \"<|end|>\"");
|
|
1769
|
+
|
|
1770
|
+
auto recipient_in_role = builder.add_rule("recipient_in_role",
|
|
1771
|
+
"\" to=functions.\" ( " + string_join(tool_rules_recipient_in_role, " | ") + " )"
|
|
1772
|
+
);
|
|
1773
|
+
|
|
1774
|
+
builder.add_rule("root",
|
|
1775
|
+
"( " + analysis + " \"<|start|>assistant\" )? " +
|
|
1776
|
+
"( " + commentary + " \"<|start|>assistant\" )? " +
|
|
1777
|
+
"( " + recipient_in_role + " | " + recipient_in_channel + " )"
|
|
1778
|
+
);
|
|
1779
|
+
}
|
|
1616
1780
|
|
|
1617
1781
|
// Trigger on tool calls that appear in the commentary channel
|
|
1618
1782
|
data.grammar_triggers.push_back({
|
|
@@ -2290,6 +2454,37 @@ static void common_chat_parse_nemotron_v2(common_chat_msg_parser & builder) {
|
|
|
2290
2454
|
builder.add_content(builder.consume_rest());
|
|
2291
2455
|
}
|
|
2292
2456
|
|
|
2457
|
+
static void common_chat_parse_apertus(common_chat_msg_parser & builder) {
|
|
2458
|
+
// Parse thinking tags
|
|
2459
|
+
builder.try_parse_reasoning("<|inner_prefix|>", "<|inner_suffix|>");
|
|
2460
|
+
if (!builder.syntax().parse_tool_calls) {
|
|
2461
|
+
builder.add_content(builder.consume_rest());
|
|
2462
|
+
return;
|
|
2463
|
+
}
|
|
2464
|
+
|
|
2465
|
+
// Look for tool calls
|
|
2466
|
+
static const common_regex tool_call_regex(regex_escape("<|tools_prefix|>"));
|
|
2467
|
+
if (auto res = builder.try_find_regex(tool_call_regex)) {
|
|
2468
|
+
builder.move_to(res->groups[0].end);
|
|
2469
|
+
|
|
2470
|
+
auto tool_calls_data = builder.consume_json();
|
|
2471
|
+
if (tool_calls_data.json.is_array()) {
|
|
2472
|
+
builder.consume_spaces();
|
|
2473
|
+
if (!builder.try_consume_literal("<|tools_suffix|>")) {
|
|
2474
|
+
throw common_chat_msg_partial_exception("Incomplete tool call");
|
|
2475
|
+
}
|
|
2476
|
+
for (const auto & value : tool_calls_data.json) {
|
|
2477
|
+
if (value.is_object()) {
|
|
2478
|
+
builder.add_tool_call_short_form(value);
|
|
2479
|
+
}
|
|
2480
|
+
}
|
|
2481
|
+
} else {
|
|
2482
|
+
throw common_chat_msg_partial_exception("Incomplete tool call");
|
|
2483
|
+
}
|
|
2484
|
+
}
|
|
2485
|
+
builder.add_content(builder.consume_rest());
|
|
2486
|
+
}
|
|
2487
|
+
|
|
2293
2488
|
static void common_chat_parse_seed_oss(common_chat_msg_parser & builder) {
|
|
2294
2489
|
// Parse thinking tags first - this handles the main reasoning content
|
|
2295
2490
|
builder.try_parse_reasoning("<seed:think>", "</seed:think>");
|
|
@@ -2534,6 +2729,11 @@ static common_chat_params common_chat_templates_apply_jinja(
|
|
|
2534
2729
|
return common_chat_params_init_nemotron_v2(tmpl, params);
|
|
2535
2730
|
}
|
|
2536
2731
|
|
|
2732
|
+
// Apertus format detection
|
|
2733
|
+
if (src.find("<|system_start|>") != std::string::npos && src.find("<|tools_prefix|>") != std::string::npos) {
|
|
2734
|
+
return common_chat_params_init_apertus(tmpl, params);
|
|
2735
|
+
}
|
|
2736
|
+
|
|
2537
2737
|
// Use generic handler when mixing tools + JSON schema.
|
|
2538
2738
|
// TODO: support that mix in handlers below.
|
|
2539
2739
|
if ((params.tools.is_array() && params.json_schema.is_object())) {
|
|
@@ -2562,6 +2762,10 @@ static common_chat_params common_chat_templates_apply_jinja(
|
|
|
2562
2762
|
return common_chat_params_init_llama_3_x(tmpl, params, allow_python_tag_builtin_tools);
|
|
2563
2763
|
}
|
|
2564
2764
|
|
|
2765
|
+
if (src.find("[THINK]") != std::string::npos && src.find("[/THINK]") != std::string::npos) {
|
|
2766
|
+
return common_chat_params_init_magistral(tmpl, params);
|
|
2767
|
+
}
|
|
2768
|
+
|
|
2565
2769
|
// Plain handler (no tools)
|
|
2566
2770
|
if (params.tools.is_null() || inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_NONE) {
|
|
2567
2771
|
return common_chat_params_init_without_tools(tmpl, params);
|
|
@@ -2646,6 +2850,7 @@ common_chat_params common_chat_templates_apply(
|
|
|
2646
2850
|
}
|
|
2647
2851
|
|
|
2648
2852
|
static void common_chat_parse_content_only(common_chat_msg_parser & builder) {
|
|
2853
|
+
builder.try_parse_reasoning("<think>", "</think>");
|
|
2649
2854
|
builder.add_content(builder.consume_rest());
|
|
2650
2855
|
}
|
|
2651
2856
|
|
|
@@ -2662,6 +2867,9 @@ static void common_chat_parse(common_chat_msg_parser & builder) {
|
|
|
2662
2867
|
case COMMON_CHAT_FORMAT_MISTRAL_NEMO:
|
|
2663
2868
|
common_chat_parse_mistral_nemo(builder);
|
|
2664
2869
|
break;
|
|
2870
|
+
case COMMON_CHAT_FORMAT_MAGISTRAL:
|
|
2871
|
+
common_chat_parse_magistral(builder);
|
|
2872
|
+
break;
|
|
2665
2873
|
case COMMON_CHAT_FORMAT_LLAMA_3_X:
|
|
2666
2874
|
common_chat_parse_llama_3_1(builder);
|
|
2667
2875
|
break;
|
|
@@ -2701,6 +2909,9 @@ static void common_chat_parse(common_chat_msg_parser & builder) {
|
|
|
2701
2909
|
case COMMON_CHAT_FORMAT_NEMOTRON_V2:
|
|
2702
2910
|
common_chat_parse_nemotron_v2(builder);
|
|
2703
2911
|
break;
|
|
2912
|
+
case COMMON_CHAT_FORMAT_APERTUS:
|
|
2913
|
+
common_chat_parse_apertus(builder);
|
|
2914
|
+
break;
|
|
2704
2915
|
default:
|
|
2705
2916
|
throw std::runtime_error(std::string("Unsupported format: ") + common_chat_format_name(builder.syntax().format));
|
|
2706
2917
|
}
|
|
@@ -44,8 +44,8 @@ struct common_chat_msg_content_part {
|
|
|
44
44
|
struct common_chat_msg {
|
|
45
45
|
std::string role;
|
|
46
46
|
std::string content;
|
|
47
|
-
std::vector<common_chat_msg_content_part> content_parts
|
|
48
|
-
std::vector<common_chat_tool_call> tool_calls
|
|
47
|
+
std::vector<common_chat_msg_content_part> content_parts;
|
|
48
|
+
std::vector<common_chat_tool_call> tool_calls;
|
|
49
49
|
std::string reasoning_content;
|
|
50
50
|
std::string tool_name;
|
|
51
51
|
std::string tool_call_id;
|
|
@@ -55,7 +55,7 @@ struct common_chat_msg {
|
|
|
55
55
|
bool empty() const {
|
|
56
56
|
return content.empty() && content_parts.empty() && tool_calls.empty() && reasoning_content.empty() && tool_name.empty() && tool_call_id.empty();
|
|
57
57
|
}
|
|
58
|
-
void
|
|
58
|
+
void set_tool_call_ids(std::vector<std::string> & ids_cache, const std::function<std::string()> & gen_tool_call_id) {
|
|
59
59
|
for (auto i = 0u; i < tool_calls.size(); i++) {
|
|
60
60
|
if (ids_cache.size() <= i) {
|
|
61
61
|
auto id = tool_calls[i].id;
|
|
@@ -112,6 +112,7 @@ enum common_chat_format {
|
|
|
112
112
|
COMMON_CHAT_FORMAT_CONTENT_ONLY,
|
|
113
113
|
COMMON_CHAT_FORMAT_GENERIC,
|
|
114
114
|
COMMON_CHAT_FORMAT_MISTRAL_NEMO,
|
|
115
|
+
COMMON_CHAT_FORMAT_MAGISTRAL,
|
|
115
116
|
COMMON_CHAT_FORMAT_LLAMA_3_X,
|
|
116
117
|
COMMON_CHAT_FORMAT_LLAMA_3_X_WITH_BUILTIN_TOOLS,
|
|
117
118
|
COMMON_CHAT_FORMAT_DEEPSEEK_R1,
|
|
@@ -125,6 +126,7 @@ enum common_chat_format {
|
|
|
125
126
|
COMMON_CHAT_FORMAT_GPT_OSS,
|
|
126
127
|
COMMON_CHAT_FORMAT_SEED_OSS,
|
|
127
128
|
COMMON_CHAT_FORMAT_NEMOTRON_V2,
|
|
129
|
+
COMMON_CHAT_FORMAT_APERTUS,
|
|
128
130
|
|
|
129
131
|
COMMON_CHAT_FORMAT_COUNT, // Not a format, just the # formats
|
|
130
132
|
};
|
|
@@ -51,6 +51,11 @@
|
|
|
51
51
|
#include <unistd.h>
|
|
52
52
|
#endif
|
|
53
53
|
|
|
54
|
+
#if defined(__linux__)
|
|
55
|
+
#include <sys/types.h>
|
|
56
|
+
#include <pwd.h>
|
|
57
|
+
#endif
|
|
58
|
+
|
|
54
59
|
#if defined(_MSC_VER)
|
|
55
60
|
#pragma warning(disable: 4244 4267) // possible loss of data
|
|
56
61
|
#endif
|
|
@@ -865,8 +870,20 @@ std::string fs_get_cache_directory() {
|
|
|
865
870
|
#if defined(__linux__) || defined(__FreeBSD__) || defined(_AIX) || defined(__OpenBSD__)
|
|
866
871
|
if (std::getenv("XDG_CACHE_HOME")) {
|
|
867
872
|
cache_directory = std::getenv("XDG_CACHE_HOME");
|
|
868
|
-
} else {
|
|
873
|
+
} else if (std::getenv("HOME")) {
|
|
869
874
|
cache_directory = std::getenv("HOME") + std::string("/.cache/");
|
|
875
|
+
} else {
|
|
876
|
+
#if defined(__linux__)
|
|
877
|
+
/* no $HOME is defined, fallback to getpwuid */
|
|
878
|
+
struct passwd *pw = getpwuid(getuid());
|
|
879
|
+
if ((!pw) || (!pw->pw_dir)) {
|
|
880
|
+
throw std::runtime_error("Failed to find $HOME directory");
|
|
881
|
+
}
|
|
882
|
+
|
|
883
|
+
cache_directory = std::string(pw->pw_dir) + std::string("/.cache/");
|
|
884
|
+
#else /* defined(__linux__) */
|
|
885
|
+
throw std::runtime_error("Failed to find $HOME directory");
|
|
886
|
+
#endif /* defined(__linux__) */
|
|
870
887
|
}
|
|
871
888
|
#elif defined(__APPLE__)
|
|
872
889
|
cache_directory = std::getenv("HOME") + std::string("/Library/Caches/");
|
|
@@ -961,15 +978,13 @@ struct common_init_result common_init_from_params(common_params & params) {
|
|
|
961
978
|
|
|
962
979
|
bool has_eos = llama_vocab_eos(vocab) != LLAMA_TOKEN_NULL;
|
|
963
980
|
bool has_sep = llama_vocab_sep(vocab) != LLAMA_TOKEN_NULL;
|
|
981
|
+
bool has_rerank_prompt = llama_model_chat_template(model, "rerank") != NULL;
|
|
964
982
|
|
|
965
|
-
if (!has_eos && !has_sep) {
|
|
966
|
-
LOG_WRN("%s: warning: vocab does not have an EOS token
|
|
983
|
+
if (!has_eos && !has_sep && !has_rerank_prompt) {
|
|
984
|
+
LOG_WRN("%s: warning: vocab does not have an EOS token, SEP token, or rerank prompt. Reranking will not work\n", __func__);
|
|
967
985
|
ok = false;
|
|
968
986
|
} else if (!has_eos) {
|
|
969
987
|
LOG_WRN("%s: warning: vocab does not have an EOS token, using SEP token as fallback\n", __func__);
|
|
970
|
-
} else if (!has_sep) {
|
|
971
|
-
LOG_WRN("%s: warning: vocab does not have a SEP token, reranking will not work\n", __func__);
|
|
972
|
-
ok = false;
|
|
973
988
|
}
|
|
974
989
|
|
|
975
990
|
if (!ok) {
|
|
@@ -1119,6 +1134,7 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
|
|
|
1119
1134
|
mparams.use_mlock = params.use_mlock;
|
|
1120
1135
|
mparams.check_tensors = params.check_tensors;
|
|
1121
1136
|
mparams.use_extra_bufts = !params.no_extra_bufts;
|
|
1137
|
+
mparams.no_host = params.no_host;
|
|
1122
1138
|
|
|
1123
1139
|
if (params.kv_overrides.empty()) {
|
|
1124
1140
|
mparams.kv_overrides = NULL;
|