@fugood/llama.node 1.2.3 → 1.2.4
This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
- package/package.json +14 -14
- package/scripts/llama.cpp.patch +33 -11
- package/src/llama.cpp/CMakeLists.txt +1 -0
- package/src/llama.cpp/common/CMakeLists.txt +46 -2
- package/src/llama.cpp/common/arg.cpp +322 -70
- package/src/llama.cpp/common/arg.h +0 -1
- package/src/llama.cpp/common/chat-parser.cpp +154 -13
- package/src/llama.cpp/common/chat-parser.h +3 -0
- package/src/llama.cpp/common/chat.cpp +217 -6
- package/src/llama.cpp/common/chat.h +5 -3
- package/src/llama.cpp/common/common.cpp +22 -6
- package/src/llama.cpp/common/common.h +6 -4
- package/src/llama.cpp/common/http.h +73 -0
- package/src/llama.cpp/ggml/CMakeLists.txt +7 -6
- package/src/llama.cpp/ggml/include/ggml-backend.h +2 -0
- package/src/llama.cpp/ggml/include/ggml-rpc.h +8 -9
- package/src/llama.cpp/ggml/include/ggml.h +22 -0
- package/src/llama.cpp/ggml/src/CMakeLists.txt +3 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +12 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +1 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +12 -12
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +100 -3
- package/src/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +0 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +10 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +209 -96
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +32 -44
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +107 -83
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +17 -17
- package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +8 -8
- package/src/llama.cpp/ggml/src/ggml-cpu/spacemit/ime.cpp +1024 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/spacemit/ime.h +13 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/spacemit/ime1_kernels.cpp +3196 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/spacemit/ime_kernels.h +26 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.cpp +103 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.h +1 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +66 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +6 -5
- package/src/llama.cpp/include/llama.h +8 -0
- package/src/llama.cpp/src/llama-arch.cpp +93 -0
- package/src/llama.cpp/src/llama-arch.h +22 -0
- package/src/llama.cpp/src/llama-chat.cpp +1 -1
- package/src/llama.cpp/src/llama-context.cpp +6 -0
- package/src/llama.cpp/src/llama-graph.cpp +57 -22
- package/src/llama.cpp/src/llama-graph.h +10 -1
- package/src/llama.cpp/src/llama-hparams.h +17 -2
- package/src/llama.cpp/src/llama-kv-cache-iswa.cpp +2 -2
- package/src/llama.cpp/src/llama-kv-cache.cpp +2 -5
- package/src/llama.cpp/src/llama-memory-hybrid.cpp +11 -9
- package/src/llama.cpp/src/llama-memory-recurrent.cpp +11 -3
- package/src/llama.cpp/src/llama-model-loader.cpp +2 -0
- package/src/llama.cpp/src/llama-model.cpp +568 -41
- package/src/llama.cpp/src/llama-model.h +18 -0
- package/src/llama.cpp/src/llama-sampling.cpp +5 -0
- package/src/llama.cpp/src/llama-vocab.cpp +7 -1
- package/src/llama.cpp/src/llama-vocab.h +41 -40
- package/src/llama.cpp/src/unicode.h +43 -0
package/src/llama.cpp/common/chat-parser.cpp

@@ -3,9 +3,12 @@
 #include "log.h"
 #include "regex-partial.h"
 
+#include <algorithm>
+#include <cctype>
 #include <optional>
 #include <stdexcept>
 #include <string>
+#include <string_view>
 #include <vector>
 
 using json = nlohmann::ordered_json;
@@ -75,6 +78,35 @@ bool common_chat_msg_parser::add_tool_calls(const json & arr) {
     }
     return true;
 }
+
+bool common_chat_msg_parser::add_tool_call_short_form(const json & tool_call) {
+    if (!tool_call.is_object() || tool_call.size() != 1) {
+        return false;
+    }
+
+    // Get the tool name (the single key in the object)
+    auto it = tool_call.begin();
+    std::string name = it.key();
+
+    if (name.empty()) {
+        return false;
+    }
+
+    // Get the arguments (the nested object)
+    const json & args_json = it.value();
+    std::string arguments = "";
+
+    if (args_json.is_object()) {
+        arguments = args_json.dump();
+    } else if (args_json.is_string()) {
+        arguments = args_json;
+    } else if (!args_json.is_null()) {
+        // For other types, convert to string representation
+        arguments = args_json.dump();
+    }
+
+    return add_tool_call(name, "", arguments);
+}
 void common_chat_msg_parser::finish() {
     if (!is_partial_ && pos_ != input_.size()) {
         throw std::runtime_error("Unexpected content at end of input");// + input_.substr(pos_));
@@ -137,6 +169,27 @@ void common_chat_msg_parser::consume_literal(const std::string & literal) {
 }
 
 bool common_chat_msg_parser::try_parse_reasoning(const std::string & start_think, const std::string & end_think) {
+    std::string pending_reasoning_prefix;
+
+    if (syntax_.reasoning_format == COMMON_REASONING_FORMAT_NONE) {
+        return false;
+    }
+
+    auto set_reasoning_prefix = [&](size_t prefix_pos) {
+        if (!syntax_.thinking_forced_open || syntax_.reasoning_in_content) {
+            return;
+        }
+        if (prefix_pos + start_think.size() > input_.size()) {
+            pending_reasoning_prefix.clear();
+            return;
+        }
+        // Capture the exact literal that opened the reasoning section so we can
+        // surface it back to callers. This ensures formats that force the
+        // reasoning tag open (e.g. DeepSeek R1) retain their original prefix
+        // instead of dropping it during parsing.
+        pending_reasoning_prefix = input_.substr(prefix_pos, start_think.size());
+    };
+
     auto handle_reasoning = [&](const std::string & reasoning, bool closed) {
         auto stripped_reasoning = string_strip(reasoning);
         if (stripped_reasoning.empty()) {
@@ -149,28 +202,116 @@ bool common_chat_msg_parser::try_parse_reasoning(const std::string & start_think
                 add_content(syntax_.reasoning_format == COMMON_REASONING_FORMAT_DEEPSEEK ? "</think>" : end_think);
             }
         } else {
+            if (!pending_reasoning_prefix.empty()) {
+                add_reasoning_content(pending_reasoning_prefix);
+                pending_reasoning_prefix.clear();
+            }
             add_reasoning_content(stripped_reasoning);
         }
     };
-
-
-
-
-
-
-
-
+
+    const size_t saved_pos = pos_;
+    const size_t saved_content_size = result_.content.size();
+    const size_t saved_reasoning_size = result_.reasoning_content.size();
+
+    auto restore_state = [&]() {
+        move_to(saved_pos);
+        result_.content.resize(saved_content_size);
+        result_.reasoning_content.resize(saved_reasoning_size);
+    };
+
+    // Allow leading whitespace to be preserved as content when reasoning is present at the start
+    size_t cursor = pos_;
+    size_t whitespace_end = cursor;
+    while (whitespace_end < input_.size() && std::isspace(static_cast<unsigned char>(input_[whitespace_end]))) {
+        ++whitespace_end;
+    }
+
+    if (whitespace_end >= input_.size()) {
+        restore_state();
+        if (syntax_.thinking_forced_open) {
+            auto rest = input_.substr(saved_pos);
             if (!rest.empty()) {
                 handle_reasoning(rest, /* closed */ !is_partial());
             }
-
-
-
-
+            move_to(input_.size());
+            return true;
+        }
+        return false;
+    }
+
+    cursor = whitespace_end;
+    const size_t remaining = input_.size() - cursor;
+    const size_t start_prefix = std::min(start_think.size(), remaining);
+    const bool has_start_tag = input_.compare(cursor, start_prefix, start_think, 0, start_prefix) == 0;
+
+    if (has_start_tag && start_prefix < start_think.size()) {
+        move_to(input_.size());
+        return true;
+    }
+
+    if (has_start_tag) {
+        if (whitespace_end > pos_) {
+            add_content(input_.substr(pos_, whitespace_end - pos_));
+        }
+        set_reasoning_prefix(cursor);
+        cursor += start_think.size();
+    } else if (syntax_.thinking_forced_open) {
+        cursor = whitespace_end;
+    } else {
+        restore_state();
+        return false;
+    }
+    while (true) {
+        if (cursor >= input_.size()) {
+            move_to(input_.size());
             return true;
         }
+
+        size_t end_pos = input_.find(end_think, cursor);
+        if (end_pos == std::string::npos) {
+            std::string_view remaining_view(input_.data() + cursor, input_.size() - cursor);
+            size_t partial_off = string_find_partial_stop(remaining_view, end_think);
+            size_t reasoning_end = partial_off == std::string::npos ? input_.size() : cursor + partial_off;
+            if (reasoning_end > cursor) {
+                handle_reasoning(input_.substr(cursor, reasoning_end - cursor), /* closed */ partial_off == std::string::npos && !is_partial());
+            }
+            move_to(input_.size());
+            return true;
+        }
+
+        if (end_pos > cursor) {
+            handle_reasoning(input_.substr(cursor, end_pos - cursor), /* closed */ true);
+        } else {
+            handle_reasoning("", /* closed */ true);
+        }
+
+        cursor = end_pos + end_think.size();
+
+        while (cursor < input_.size() && std::isspace(static_cast<unsigned char>(input_[cursor]))) {
+            ++cursor;
+        }
+
+        const size_t next_remaining = input_.size() - cursor;
+        if (next_remaining == 0) {
+            move_to(cursor);
+            return true;
+        }
+
+        const size_t next_prefix = std::min(start_think.size(), next_remaining);
+        if (input_.compare(cursor, next_prefix, start_think, 0, next_prefix) == 0) {
+            if (next_prefix < start_think.size()) {
+                move_to(input_.size());
+                return true;
+            }
+            set_reasoning_prefix(cursor);
+            cursor += start_think.size();
+            continue;
+        }
+
+        move_to(cursor);
+        return true;
     }
-    return false;
 }
 
 std::string common_chat_msg_parser::consume_rest() {
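The rewritten try_parse_reasoning() walks the input with a cursor so it can handle several reasoning blocks in one message, and it leans on string_find_partial_stop() to hold back a closing tag that is cut off mid-stream. A minimal standalone sketch of that partial-stop idea (the helper below is a simplified stand-in written for illustration, not the llama.cpp implementation):

#include <algorithm>
#include <iostream>
#include <string>
#include <string_view>

// Simplified stand-in for string_find_partial_stop(): returns the offset where
// a proper prefix of `stop` starts at the very end of `text`, or npos if the
// text cannot be hiding a truncated stop tag.
static size_t find_partial_stop(std::string_view text, std::string_view stop) {
    for (size_t len = std::min(text.size(), stop.size() - 1); len > 0; --len) {
        if (text.compare(text.size() - len, len, stop, 0, len) == 0) {
            return text.size() - len;
        }
    }
    return std::string_view::npos;
}

int main() {
    // "</thi" is a truncated "</think>": the reasoning before it can be
    // surfaced now, while the tail is held back until more tokens arrive.
    std::string_view chunk = "The capital of France is Paris.</thi";
    size_t off = find_partial_stop(chunk, "</think>");
    std::cout << "reasoning so far: " << chunk.substr(0, off) << "\n";
}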
package/src/llama.cpp/common/chat-parser.h

@@ -64,6 +64,9 @@ class common_chat_msg_parser {
     // Adds an array of tool calls using their "name", "id" and "arguments" fields.
     bool add_tool_calls(const nlohmann::ordered_json & arr);
 
+    // Adds a tool call using the short form: { "tool_name": { "arg1": val, "arg2": val } }
+    bool add_tool_call_short_form(const nlohmann::ordered_json & tool_call);
+
     void finish();
 
     bool consume_spaces();
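The short form maps a single key (the tool name) to its arguments object. A minimal sketch of how such a payload decomposes with nlohmann::json, which this code already uses ("get_weather" is an invented tool name, and the snippet assumes the nlohmann/json single header is on the include path):

#include <nlohmann/json.hpp>
#include <iostream>

int main() {
    // One key = tool name, value = the arguments object.
    nlohmann::ordered_json tool_call = {
        { "get_weather", { { "city", "Paris" }, { "unit", "celsius" } } }
    };
    auto it = tool_call.begin();
    std::cout << "name: " << it.key() << "\n";               // get_weather
    std::cout << "arguments: " << it.value().dump() << "\n"; // {"city":"Paris","unit":"celsius"}
}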
package/src/llama.cpp/common/chat.cpp

@@ -612,6 +612,7 @@ const char * common_chat_format_name(common_chat_format format) {
         case COMMON_CHAT_FORMAT_CONTENT_ONLY: return "Content-only";
         case COMMON_CHAT_FORMAT_GENERIC: return "Generic";
         case COMMON_CHAT_FORMAT_MISTRAL_NEMO: return "Mistral Nemo";
+        case COMMON_CHAT_FORMAT_MAGISTRAL: return "Magistral";
         case COMMON_CHAT_FORMAT_LLAMA_3_X: return "Llama 3.x";
         case COMMON_CHAT_FORMAT_LLAMA_3_X_WITH_BUILTIN_TOOLS: return "Llama 3.x with builtin tools";
         case COMMON_CHAT_FORMAT_DEEPSEEK_R1: return "DeepSeek R1";
@@ -625,6 +626,7 @@ const char * common_chat_format_name(common_chat_format format) {
         case COMMON_CHAT_FORMAT_GPT_OSS: return "GPT-OSS";
         case COMMON_CHAT_FORMAT_SEED_OSS: return "Seed-OSS";
         case COMMON_CHAT_FORMAT_NEMOTRON_V2: return "Nemotron V2";
+        case COMMON_CHAT_FORMAT_APERTUS: return "Apertus";
         default:
             throw std::runtime_error("Unknown chat format");
     }
@@ -788,6 +790,7 @@ static std::string apply(
     }
     tmpl_inputs.add_generation_prompt = inputs.add_generation_prompt;
    tmpl_inputs.extra_context = inputs.extra_context;
+    tmpl_inputs.extra_context["enable_thinking"] = inputs.enable_thinking;
     if (additional_context) {
         tmpl_inputs.extra_context.merge_patch(*additional_context);
     }
@@ -968,6 +971,65 @@ static common_chat_params common_chat_params_init_mistral_nemo(const common_chat
     data.format = COMMON_CHAT_FORMAT_MISTRAL_NEMO;
     return data;
 }
+
+static common_chat_params common_chat_params_init_magistral(const common_chat_template & tmpl, const struct templates_params & inputs) {
+    common_chat_params data;
+    data.prompt = apply(tmpl, inputs);
+    data.format = COMMON_CHAT_FORMAT_MAGISTRAL;
+    data.preserved_tokens = {
+        "[THINK]",
+        "[/THINK]",
+    };
+
+    if (inputs.tools.is_array() && !inputs.tools.empty()) {
+        data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
+        data.grammar = build_grammar([&](const common_grammar_builder & builder) {
+            auto schemas = json::array();
+            foreach_function(inputs.tools, [&](const json & tool) {
+                const auto & function = tool.at("function");
+                schemas.push_back({
+                    {"type", "object"},
+                    {"properties", {
+                        {"name", {
+                            {"type", "string"},
+                            {"const", function.at("name")},
+                        }},
+                        {"arguments", function.at("parameters")},
+                        {"id", {
+                            {"type", "string"},
+                            {"pattern", "^[a-zA-Z0-9]{9}$"},
+                        }},
+                    }},
+                    {"required", json::array({"name", "arguments", "id"})},
+                });
+            });
+            auto schema = json {
+                {"type", "array"},
+                {"items", schemas.size() == 1 ? schemas[0] : json {{"anyOf", schemas}}},
+                {"minItems", 1},
+            };
+            if (!inputs.parallel_tool_calls) {
+                schema["maxItems"] = 1;
+            }
+            builder.add_rule("root", "\"[TOOL_CALLS]\" " + builder.add_schema("tool_calls", schema));
+        });
+        data.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "[TOOL_CALLS]"});
+        data.preserved_tokens.push_back("[TOOL_CALLS]");
+    } else {
+        data.grammar_lazy = false;
+        if (!inputs.json_schema.is_null()) {
+            if (!inputs.grammar.empty()) {
+                throw std::runtime_error("Either \"json_schema\" or \"grammar\" can be specified, but not both");
+            }
+            data.grammar = json_schema_to_grammar(inputs.json_schema);
+        } else {
+            data.grammar = inputs.grammar;
+        }
+    }
+
+    return data;
+}
+
 static void common_chat_parse_mistral_nemo(common_chat_msg_parser & builder) {
     if (!builder.syntax().parse_tool_calls) {
         builder.add_content(builder.consume_rest());
@@ -978,6 +1040,18 @@ static void common_chat_parse_mistral_nemo(common_chat_msg_parser & builder) {
     parse_prefixed_json_tool_call_array(builder, prefix);
 }
 
+static void common_chat_parse_magistral(common_chat_msg_parser & builder) {
+    builder.try_parse_reasoning("[THINK]", "[/THINK]");
+
+    if (!builder.syntax().parse_tool_calls) {
+        builder.add_content(builder.consume_rest());
+        return;
+    }
+
+    static const common_regex prefix(regex_escape("[TOOL_CALLS]"));
+    parse_prefixed_json_tool_call_array(builder, prefix);
+}
+
 static common_chat_params common_chat_params_init_command_r7b(const common_chat_template & tmpl, const struct templates_params & inputs) {
     common_chat_params data;
 
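Putting the two Magistral pieces together: the handler strips [THINK]…[/THINK] reasoning first, then parses an optional [TOOL_CALLS]-prefixed JSON array whose items carry "name", "arguments" and a 9-character alphanumeric "id" (enforced by the grammar's pattern). A hypothetical model output in that shape, for illustration only:

[THINK]The user asked for the weather, so I should call the tool.[/THINK][TOOL_CALLS][{"name": "get_weather", "arguments": {"city": "Paris"}, "id": "a1b2c3d4e"}]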
@@ -1250,7 +1324,78 @@ static common_chat_params common_chat_params_init_nemotron_v2(const common_chat_
     }
     return data;
 }
+
+static common_chat_params common_chat_params_init_apertus(const common_chat_template & tmpl, const struct templates_params & inputs) {
+    common_chat_params data;
+
+    // Generate the prompt using the apply() function with the template
+    data.prompt = apply(tmpl, inputs);
+    data.format = COMMON_CHAT_FORMAT_APERTUS;
+
+    // Handle thinking tags appropriately based on inputs.enable_thinking
+    if (string_ends_with(data.prompt, "<|inner_prefix|>")) {
+        if (!inputs.enable_thinking) {
+            data.prompt += "<|inner_suffix|>";
+        } else {
+            data.thinking_forced_open = true;
+        }
+    }
+
+    // When tools are present, build grammar for the <|tools_prefix|> format
+    if (!inputs.tools.is_null() && inputs.tools.is_array() && !inputs.tools.empty()) {
+        data.grammar_lazy = true;
+        data.grammar = build_grammar([&](const common_grammar_builder & builder) {
+            auto schemas = json::array();
+            foreach_function(inputs.tools, [&](const json & tool) {
+                const auto & function = tool.at("function");
+                schemas.push_back({
+                    { "type", "object" },
+                    { "properties",
+                      {
+                          { function.at("name"), function.at("parameters") }
+                      } },
+                    { "required", json::array({ function.at("name") }) },
+                });
+            });
+            auto schema = json{
+                { "type", "array" },
+                { "items", schemas.size() == 1 ? schemas[0] : json{ { "anyOf", schemas } } },
+                { "minItems", 1 },
+            };
+            if (!inputs.parallel_tool_calls) {
+                schema["maxItems"] = 1;
+            }
+            builder.add_rule("root",
+                std::string(data.thinking_forced_open ? "( \"<|inner_suffix|>\" space )? " : "") +
+                "\"<|tools_prefix|>\"" + builder.add_schema("tool_calls", schema) + "\"<|tools_suffix|>\"");
+        });
+        data.grammar_triggers.push_back({ COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL,
+            // If thinking_forced_open, then we capture the <|inner_suffix|> tag in the grammar,
+            // (important for required tool choice) and in the trigger's first capture (decides what is sent to the grammar)
+            std::string(data.thinking_forced_open ?
+                "[\\s\\S]*?(<\\|inner_suffix\\|>\\s*)" :
+                "(?:<\\|inner_prefix\\|>[\\s\\S]*?<\\|inner_suffix\\|>\\s*)?") +
+                "(<\\|tools_prefix\\|>)[\\s\\S]*" });
+        data.preserved_tokens = {
+            "<|system_start|>",
+            "<|system_end|>",
+            "<|developer_start|>",
+            "<|developer_end|>",
+            "<|user_start|>",
+            "<|user_end|>",
+            "<|assistant_start|>",
+            "<|assistant_end|>",
+            "<|inner_prefix|>",
+            "<|inner_suffix|>",
+            "<|tools_prefix|>",
+            "<|tools_suffix|>",
+        };
+    }
+    return data;
+}
 static void common_chat_parse_llama_3_1(common_chat_msg_parser & builder, bool with_builtin_tools = false) {
+    builder.try_parse_reasoning("<think>", "</think>");
+
     if (!builder.syntax().parse_tool_calls) {
         builder.add_content(builder.consume_rest());
         return;
@@ -1602,17 +1747,36 @@ static common_chat_params common_chat_params_init_gpt_oss(const common_chat_temp
             );
         });
 
-        auto recipient_in_role = builder.add_rule("recipient_in_role",
-            "\"<|start|>assistant\"? \" to=functions.\" ( " +
-            string_join(tool_rules_recipient_in_role, " | ") + " )"
-        );
-
         auto recipient_in_channel = builder.add_rule("recipient_in_channel",
             channel + " \" to=functions.\" ( " +
             string_join(tool_rules_recipient_in_channel, " | ") + " )"
         );
 
-
+        if (data.grammar_lazy) {
+            auto recipient_in_role = builder.add_rule("recipient_in_role",
+                "\"<|start|>assistant\"? \" to=functions.\" ( " +
+                string_join(tool_rules_recipient_in_role, " | ") + " )"
+            );
+
+            builder.add_rule("root", recipient_in_role + " | " + recipient_in_channel);
+        } else {
+            auto not_end = builder.add_rule("not-end",
+                "[^<] | \"<\" [^|] | \"<|\" [^e] | \"<|e\" [^n] | \"<|en\" [^d] | \"<|end\" [^|] | \"<|end|\" [^>]");
+            auto analysis = builder.add_rule("analysis",
+                "\"<|channel|>analysis<|message|>\" ( " + not_end + " )* \"<|end|>\"");
+            auto commentary = builder.add_rule("commentary",
+                "\"<|channel|>commentary<|message|>\" ( " + not_end + " )* \"<|end|>\"");
+
+            auto recipient_in_role = builder.add_rule("recipient_in_role",
+                "\" to=functions.\" ( " + string_join(tool_rules_recipient_in_role, " | ") + " )"
+            );
+
+            builder.add_rule("root",
+                "( " + analysis + " \"<|start|>assistant\" )? " +
+                "( " + commentary + " \"<|start|>assistant\" )? " +
+                "( " + recipient_in_role + " | " + recipient_in_channel + " )"
+            );
+        }
 
         // Trigger on tool calls that appear in the commentary channel
         data.grammar_triggers.push_back({
@@ -2290,6 +2454,37 @@ static void common_chat_parse_nemotron_v2(common_chat_msg_parser & builder) {
     builder.add_content(builder.consume_rest());
 }
 
+static void common_chat_parse_apertus(common_chat_msg_parser & builder) {
+    // Parse thinking tags
+    builder.try_parse_reasoning("<|inner_prefix|>", "<|inner_suffix|>");
+    if (!builder.syntax().parse_tool_calls) {
+        builder.add_content(builder.consume_rest());
+        return;
+    }
+
+    // Look for tool calls
+    static const common_regex tool_call_regex(regex_escape("<|tools_prefix|>"));
+    if (auto res = builder.try_find_regex(tool_call_regex)) {
+        builder.move_to(res->groups[0].end);
+
+        auto tool_calls_data = builder.consume_json();
+        if (tool_calls_data.json.is_array()) {
+            builder.consume_spaces();
+            if (!builder.try_consume_literal("<|tools_suffix|>")) {
+                throw common_chat_msg_partial_exception("Incomplete tool call");
+            }
+            for (const auto & value : tool_calls_data.json) {
+                if (value.is_object()) {
+                    builder.add_tool_call_short_form(value);
+                }
+            }
+        } else {
+            throw common_chat_msg_partial_exception("Incomplete tool call");
+        }
+    }
+    builder.add_content(builder.consume_rest());
+}
+
 static void common_chat_parse_seed_oss(common_chat_msg_parser & builder) {
     // Parse thinking tags first - this handles the main reasoning content
     builder.try_parse_reasoning("<seed:think>", "</seed:think>");
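In other words, an Apertus tool call is a JSON array of the short-form objects described above, wrapped in <|tools_prefix|>…<|tools_suffix|> and optionally preceded by <|inner_prefix|>…<|inner_suffix|> reasoning. A hypothetical assistant output in that shape (tool name invented for illustration):

<|inner_prefix|>I need the current weather before answering.<|inner_suffix|><|tools_prefix|>[{"get_weather": {"city": "Paris"}}]<|tools_suffix|>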
@@ -2534,6 +2729,11 @@ static common_chat_params common_chat_templates_apply_jinja(
         return common_chat_params_init_nemotron_v2(tmpl, params);
     }
 
+    // Apertus format detection
+    if (src.find("<|system_start|>") != std::string::npos && src.find("<|tools_prefix|>") != std::string::npos) {
+        return common_chat_params_init_apertus(tmpl, params);
+    }
+
     // Use generic handler when mixing tools + JSON schema.
     // TODO: support that mix in handlers below.
     if ((params.tools.is_array() && params.json_schema.is_object())) {
@@ -2562,6 +2762,10 @@ static common_chat_params common_chat_templates_apply_jinja(
         return common_chat_params_init_llama_3_x(tmpl, params, allow_python_tag_builtin_tools);
     }
 
+    if (src.find("[THINK]") != std::string::npos && src.find("[/THINK]") != std::string::npos) {
+        return common_chat_params_init_magistral(tmpl, params);
+    }
+
     // Plain handler (no tools)
     if (params.tools.is_null() || inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_NONE) {
         return common_chat_params_init_without_tools(tmpl, params);
@@ -2646,6 +2850,7 @@ common_chat_params common_chat_templates_apply(
 }
 
 static void common_chat_parse_content_only(common_chat_msg_parser & builder) {
+    builder.try_parse_reasoning("<think>", "</think>");
     builder.add_content(builder.consume_rest());
 }
 
@@ -2662,6 +2867,9 @@ static void common_chat_parse(common_chat_msg_parser & builder) {
         case COMMON_CHAT_FORMAT_MISTRAL_NEMO:
             common_chat_parse_mistral_nemo(builder);
             break;
+        case COMMON_CHAT_FORMAT_MAGISTRAL:
+            common_chat_parse_magistral(builder);
+            break;
         case COMMON_CHAT_FORMAT_LLAMA_3_X:
             common_chat_parse_llama_3_1(builder);
             break;
@@ -2701,6 +2909,9 @@ static void common_chat_parse(common_chat_msg_parser & builder) {
         case COMMON_CHAT_FORMAT_NEMOTRON_V2:
             common_chat_parse_nemotron_v2(builder);
             break;
+        case COMMON_CHAT_FORMAT_APERTUS:
+            common_chat_parse_apertus(builder);
+            break;
         default:
             throw std::runtime_error(std::string("Unsupported format: ") + common_chat_format_name(builder.syntax().format));
     }
package/src/llama.cpp/common/chat.h

@@ -44,8 +44,8 @@ struct common_chat_msg_content_part {
 struct common_chat_msg {
     std::string role;
     std::string content;
-    std::vector<common_chat_msg_content_part> content_parts
-    std::vector<common_chat_tool_call> tool_calls
+    std::vector<common_chat_msg_content_part> content_parts;
+    std::vector<common_chat_tool_call> tool_calls;
     std::string reasoning_content;
     std::string tool_name;
     std::string tool_call_id;

@@ -55,7 +55,7 @@ struct common_chat_msg {
     bool empty() const {
         return content.empty() && content_parts.empty() && tool_calls.empty() && reasoning_content.empty() && tool_name.empty() && tool_call_id.empty();
     }
-    void
+    void set_tool_call_ids(std::vector<std::string> & ids_cache, const std::function<std::string()> & gen_tool_call_id) {
         for (auto i = 0u; i < tool_calls.size(); i++) {
             if (ids_cache.size() <= i) {
                 auto id = tool_calls[i].id;

@@ -112,6 +112,7 @@ enum common_chat_format {
     COMMON_CHAT_FORMAT_CONTENT_ONLY,
     COMMON_CHAT_FORMAT_GENERIC,
     COMMON_CHAT_FORMAT_MISTRAL_NEMO,
+    COMMON_CHAT_FORMAT_MAGISTRAL,
     COMMON_CHAT_FORMAT_LLAMA_3_X,
     COMMON_CHAT_FORMAT_LLAMA_3_X_WITH_BUILTIN_TOOLS,
     COMMON_CHAT_FORMAT_DEEPSEEK_R1,

@@ -125,6 +126,7 @@ enum common_chat_format {
     COMMON_CHAT_FORMAT_GPT_OSS,
     COMMON_CHAT_FORMAT_SEED_OSS,
     COMMON_CHAT_FORMAT_NEMOTRON_V2,
+    COMMON_CHAT_FORMAT_APERTUS,
 
     COMMON_CHAT_FORMAT_COUNT, // Not a format, just the # formats
 };
package/src/llama.cpp/common/common.cpp

@@ -51,6 +51,11 @@
 #include <unistd.h>
 #endif
 
+#if defined(__linux__)
+#include <sys/types.h>
+#include <pwd.h>
+#endif
+
 #if defined(_MSC_VER)
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif

@@ -865,8 +870,20 @@ std::string fs_get_cache_directory() {
 #if defined(__linux__) || defined(__FreeBSD__) || defined(_AIX) || defined(__OpenBSD__)
     if (std::getenv("XDG_CACHE_HOME")) {
         cache_directory = std::getenv("XDG_CACHE_HOME");
-    } else {
+    } else if (std::getenv("HOME")) {
         cache_directory = std::getenv("HOME") + std::string("/.cache/");
+    } else {
+#if defined(__linux__)
+        /* no $HOME is defined, fallback to getpwuid */
+        struct passwd *pw = getpwuid(getuid());
+        if ((!pw) || (!pw->pw_dir)) {
+            throw std::runtime_error("Failed to find $HOME directory");
+        }
+
+        cache_directory = std::string(pw->pw_dir) + std::string("/.cache/");
+#else /* defined(__linux__) */
+        throw std::runtime_error("Failed to find $HOME directory");
+#endif /* defined(__linux__) */
     }
 #elif defined(__APPLE__)
     cache_directory = std::getenv("HOME") + std::string("/Library/Caches/");
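The cache-directory lookup now resolves in the order XDG_CACHE_HOME, then HOME, then the passwd database. A minimal standalone sketch of that order on Linux (simplified; the non-Linux branches and the caller's surrounding logic are omitted):

#include <cstdlib>
#include <pwd.h>
#include <stdexcept>
#include <string>
#include <unistd.h>

// Simplified illustration of the new lookup order on Linux.
std::string cache_dir() {
    if (const char * xdg = std::getenv("XDG_CACHE_HOME")) {
        return std::string(xdg) + "/";
    }
    if (const char * home = std::getenv("HOME")) {
        return std::string(home) + "/.cache/";
    }
    // $HOME can legitimately be unset (daemons, minimal containers);
    // fall back to the passwd entry for the current uid.
    if (struct passwd * pw = getpwuid(getuid()); pw && pw->pw_dir) {
        return std::string(pw->pw_dir) + "/.cache/";
    }
    throw std::runtime_error("Failed to find $HOME directory");
}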
@@ -961,15 +978,13 @@ struct common_init_result common_init_from_params(common_params & params) {
 
     bool has_eos = llama_vocab_eos(vocab) != LLAMA_TOKEN_NULL;
     bool has_sep = llama_vocab_sep(vocab) != LLAMA_TOKEN_NULL;
+    bool has_rerank_prompt = llama_model_chat_template(model, "rerank") != NULL;
 
-    if (!has_eos && !has_sep) {
-        LOG_WRN("%s: warning: vocab does not have an EOS token or SEP token, reranking will not work\n", __func__);
+    if (!has_eos && !has_sep && !has_rerank_prompt) {
+        LOG_WRN("%s: warning: vocab does not have an EOS token, SEP token, or rerank prompt. Reranking will not work\n", __func__);
         ok = false;
     } else if (!has_eos) {
         LOG_WRN("%s: warning: vocab does not have an EOS token, using SEP token as fallback\n", __func__);
-    } else if (!has_sep) {
-        LOG_WRN("%s: warning: vocab does not have a SEP token, reranking will not work\n", __func__);
-        ok = false;
     }
 
     if (!ok) {

@@ -1119,6 +1134,7 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
     mparams.use_mlock = params.use_mlock;
     mparams.check_tensors = params.check_tensors;
     mparams.use_extra_bufts = !params.no_extra_bufts;
+    mparams.no_host = params.no_host;
 
     if (params.kv_overrides.empty()) {
         mparams.kv_overrides = NULL;
package/src/llama.cpp/common/common.h

@@ -379,7 +379,7 @@ struct common_params {
     bool simple_io = false; // improves compatibility with subprocesses and limited consoles
     bool cont_batching = true; // insert new sequences for decoding on-the-fly
     bool no_perf = false; // disable performance metrics
-    bool ctx_shift = false;
+    bool ctx_shift = false; // context shift on infinite text generation
     bool swa_full = false; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
     bool kv_unified = false; // enable unified KV cache
 

@@ -393,6 +393,7 @@ struct common_params {
     bool check_tensors = false; // validate tensor data
     bool no_op_offload = false; // globally disable offload host tensor operations to device
     bool no_extra_bufts = false; // disable extra buffer types (used for weight repacking)
+    bool no_host = false; // bypass host buffer allowing extra buffers to be used
 
     bool single_turn = false; // single turn chat conversation
 

@@ -425,7 +426,8 @@ struct common_params {
     int32_t timeout_write = timeout_read; // http write timeout in seconds
     int32_t n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool)
     int32_t n_cache_reuse = 0; // min chunk size to reuse from the cache via KV shifting
-    int32_t
+    int32_t n_ctx_checkpoints = 8; // max number of context checkpoints per slot
+    int32_t cache_ram_mib = 8192; // 0 = no limit, 1 = 1 MiB, etc.
 
     std::string hostname = "127.0.0.1";
     std::string public_path = ""; // NOLINT

@@ -433,7 +435,7 @@ struct common_params {
     std::string chat_template = ""; // NOLINT
     bool use_jinja = false; // NOLINT
     bool enable_chat_template = true;
-    common_reasoning_format reasoning_format =
+    common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
     int reasoning_budget = -1;
     bool prefill_assistant = true; // if true, any trailing assistant message will be prefilled into the response
 

@@ -739,7 +741,7 @@ const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
 // MoE utils
 //
 
-const char * const LLM_FFN_EXPS_REGEX = "\\.ffn_(up|down|gate)_exps";
+const char * const LLM_FFN_EXPS_REGEX = "\\.ffn_(up|down|gate)_(ch|)exps";
 
 static std::string llm_ffn_exps_block_regex(int idx) {
     return string_format("blk\\.%d%s", idx, LLM_FFN_EXPS_REGEX);