@fugood/llama.node 1.4.15 → 1.6.0-rc.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/binding.ts +1 -5
- package/lib/index.js +2 -2
- package/lib/index.ts +2 -2
- package/package.json +15 -15
- package/scripts/llama.cpp.patch +76 -61
- package/src/LlamaContext.cpp +20 -32
- package/src/llama.cpp/common/CMakeLists.txt +12 -0
- package/src/llama.cpp/common/arg.cpp +20 -0
- package/src/llama.cpp/common/chat-parser.cpp +3 -3
- package/src/llama.cpp/common/chat-parser.h +4 -4
- package/src/llama.cpp/common/chat.cpp +289 -34
- package/src/llama.cpp/common/chat.h +32 -20
- package/src/llama.cpp/common/common.cpp +0 -1
- package/src/llama.cpp/common/common.h +31 -25
- package/src/llama.cpp/common/download.cpp +19 -14
- package/src/llama.cpp/common/jinja/caps.cpp +237 -0
- package/src/llama.cpp/common/jinja/caps.h +24 -0
- package/src/llama.cpp/common/jinja/lexer.cpp +341 -0
- package/src/llama.cpp/common/jinja/lexer.h +157 -0
- package/src/llama.cpp/common/jinja/parser.cpp +591 -0
- package/src/llama.cpp/common/jinja/parser.h +21 -0
- package/src/llama.cpp/common/jinja/runtime.cpp +865 -0
- package/src/llama.cpp/common/jinja/runtime.h +628 -0
- package/src/llama.cpp/common/jinja/string.cpp +207 -0
- package/src/llama.cpp/common/jinja/string.h +58 -0
- package/src/llama.cpp/common/jinja/utils.h +49 -0
- package/src/llama.cpp/common/jinja/value.cpp +1221 -0
- package/src/llama.cpp/common/jinja/value.h +464 -0
- package/src/llama.cpp/common/json-partial.h +1 -0
- package/src/llama.cpp/common/sampling.cpp +52 -19
- package/src/llama.cpp/ggml/include/ggml.h +39 -7
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +4 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +63 -37
- package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +31 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +18 -0
- package/src/llama.cpp/include/llama-cpp.h +3 -1
- package/src/llama.cpp/include/llama.h +29 -2
- package/src/llama.cpp/src/llama-adapter.cpp +7 -13
- package/src/llama.cpp/src/llama-adapter.h +1 -3
- package/src/llama.cpp/src/llama-context.cpp +232 -144
- package/src/llama.cpp/src/llama-context.h +10 -0
- package/src/llama.cpp/src/llama-cparams.h +2 -0
- package/src/llama.cpp/src/llama-hparams.cpp +0 -36
- package/src/llama.cpp/src/llama-hparams.h +38 -1
- package/src/llama.cpp/src/llama-kv-cache.cpp +201 -59
- package/src/llama.cpp/src/llama-kv-cache.h +0 -2
- package/src/llama.cpp/src/llama-mmap.cpp +5 -1
- package/src/llama.cpp/src/llama-model-loader.cpp +21 -7
- package/src/llama.cpp/src/llama-model.cpp +5 -1
- package/src/llama.cpp/src/llama-model.h +3 -2
- package/src/llama.cpp/src/llama-sampling.cpp +170 -13
|
@@ -7,6 +7,14 @@
|
|
|
7
7
|
#include "log.h"
|
|
8
8
|
#include "regex-partial.h"
|
|
9
9
|
|
|
10
|
+
// #include <minja/chat-template.hpp>
|
|
11
|
+
// #include <minja/minja.hpp>
|
|
12
|
+
|
|
13
|
+
#include "jinja/parser.h"
|
|
14
|
+
#include "jinja/value.h"
|
|
15
|
+
#include "jinja/runtime.h"
|
|
16
|
+
#include "jinja/caps.h"
|
|
17
|
+
|
|
10
18
|
#include <algorithm>
|
|
11
19
|
#include <cstdio>
|
|
12
20
|
#include <cctype>
|
|
@@ -132,6 +140,77 @@ std::vector<common_chat_msg_diff> common_chat_msg_diff::compute_diffs(const comm
|
|
|
132
140
|
return diffs;
|
|
133
141
|
}
|
|
134
142
|
|
|
143
|
+
using chat_template_caps = jinja::caps;
|
|
144
|
+
|
|
145
|
+
struct common_chat_template {
|
|
146
|
+
jinja::program prog;
|
|
147
|
+
std::string bos_tok;
|
|
148
|
+
std::string eos_tok;
|
|
149
|
+
std::string src;
|
|
150
|
+
chat_template_caps caps;
|
|
151
|
+
|
|
152
|
+
common_chat_template(const std::string & src, const std::string & bos_token, const std::string & eos_token) {
|
|
153
|
+
jinja::lexer lexer;
|
|
154
|
+
auto lexer_res = lexer.tokenize(src);
|
|
155
|
+
this->prog = jinja::parse_from_tokens(lexer_res);
|
|
156
|
+
|
|
157
|
+
this->src = lexer_res.source;
|
|
158
|
+
this->bos_tok = bos_token;
|
|
159
|
+
this->eos_tok = eos_token;
|
|
160
|
+
|
|
161
|
+
this->caps = jinja::caps_get(prog);
|
|
162
|
+
// LOG_INF("%s: caps:\n%s\n", __func__, this->caps.to_string().c_str());
|
|
163
|
+
}
|
|
164
|
+
|
|
165
|
+
const std::string & source() const { return src; }
|
|
166
|
+
const std::string & bos_token() const { return bos_tok; }
|
|
167
|
+
const std::string & eos_token() const { return eos_tok; }
|
|
168
|
+
|
|
169
|
+
// TODO: this is ugly, refactor it somehow
|
|
170
|
+
json add_system(const json & messages, const std::string & system_prompt) const {
|
|
171
|
+
GGML_ASSERT(messages.is_array());
|
|
172
|
+
auto msgs_copy = messages;
|
|
173
|
+
if (!caps.supports_system_role) {
|
|
174
|
+
if (msgs_copy.empty()) {
|
|
175
|
+
msgs_copy.insert(msgs_copy.begin(), json{
|
|
176
|
+
{"role", "user"},
|
|
177
|
+
{"content", system_prompt}
|
|
178
|
+
});
|
|
179
|
+
} else {
|
|
180
|
+
auto & first_msg = msgs_copy[0];
|
|
181
|
+
if (!first_msg.contains("content")) {
|
|
182
|
+
first_msg["content"] = "";
|
|
183
|
+
}
|
|
184
|
+
first_msg["content"] = system_prompt + "\n\n"
|
|
185
|
+
+ first_msg["content"].get<std::string>();
|
|
186
|
+
}
|
|
187
|
+
} else {
|
|
188
|
+
if (msgs_copy.empty() || msgs_copy[0].at("role") != "system") {
|
|
189
|
+
msgs_copy.insert(msgs_copy.begin(), json{
|
|
190
|
+
{"role", "system"},
|
|
191
|
+
{"content", system_prompt}
|
|
192
|
+
});
|
|
193
|
+
} else if (msgs_copy[0].at("role") == "system") {
|
|
194
|
+
msgs_copy[0]["content"] = system_prompt;
|
|
195
|
+
}
|
|
196
|
+
}
|
|
197
|
+
return msgs_copy;
|
|
198
|
+
}
|
|
199
|
+
|
|
200
|
+
chat_template_caps original_caps() const {
|
|
201
|
+
return caps;
|
|
202
|
+
}
|
|
203
|
+
|
|
204
|
+
};
|
|
205
|
+
|
|
206
|
+
struct common_chat_templates {
|
|
207
|
+
bool add_bos;
|
|
208
|
+
bool add_eos;
|
|
209
|
+
bool has_explicit_template; // Model had builtin template or template overridde was specified.
|
|
210
|
+
std::unique_ptr<common_chat_template> template_default; // always set (defaults to chatml)
|
|
211
|
+
std::unique_ptr<common_chat_template> template_tool_use;
|
|
212
|
+
};
|
|
213
|
+
|
|
135
214
|
struct templates_params {
|
|
136
215
|
json messages;
|
|
137
216
|
json tools;
|
|
@@ -148,6 +227,7 @@ struct templates_params {
|
|
|
148
227
|
bool add_bos;
|
|
149
228
|
bool add_eos;
|
|
150
229
|
bool is_inference = true;
|
|
230
|
+
bool mark_input = true; // whether to mark input strings in the jinja context
|
|
151
231
|
};
|
|
152
232
|
|
|
153
233
|
common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::string & tool_choice) {
|
|
@@ -521,18 +601,49 @@ bool common_chat_templates_was_explicit(const struct common_chat_templates * tmp
|
|
|
521
601
|
return tmpls->has_explicit_template;
|
|
522
602
|
}
|
|
523
603
|
|
|
524
|
-
|
|
525
|
-
if (variant
|
|
526
|
-
if (
|
|
604
|
+
std::string common_chat_templates_source(const struct common_chat_templates * tmpls, const std::string & variant) {
|
|
605
|
+
if (!variant.empty()) {
|
|
606
|
+
if (variant == "tool_use") {
|
|
527
607
|
if (tmpls->template_tool_use) {
|
|
528
|
-
return tmpls->template_tool_use->source()
|
|
608
|
+
return tmpls->template_tool_use->source();
|
|
529
609
|
}
|
|
530
|
-
return
|
|
610
|
+
return "";
|
|
531
611
|
} else {
|
|
532
|
-
LOG_DBG("%s: unknown template variant: %s\n", __func__, variant);
|
|
612
|
+
LOG_DBG("%s: unknown template variant: %s\n", __func__, variant.c_str());
|
|
533
613
|
}
|
|
534
614
|
}
|
|
535
|
-
return tmpls->template_default->source()
|
|
615
|
+
return tmpls->template_default->source();
|
|
616
|
+
}
|
|
617
|
+
|
|
618
|
+
// Copies the capability flags of the selected template into the public caps struct.
// "tool_use" selects the tool-use template; every other variant selects the default one.
// When the selected template pointer is null the struct's defaults are returned untouched.
common_chat_template_caps common_chat_templates_get_caps(const struct common_chat_templates * tmpls, const std::string & variant) {
    const common_chat_template * selected = (variant == "tool_use")
        ? tmpls->template_tool_use.get()
        : tmpls->template_default.get();

    common_chat_template_caps out;
    if (selected != nullptr) {
        const auto detected = selected->original_caps();
        out.supports_tools               = detected.supports_tools;
        out.supports_tool_calls          = detected.supports_tool_calls;
        out.supports_system_role         = detected.supports_system_role;
        out.supports_parallel_tool_calls = detected.supports_parallel_tool_calls;
    }
    return out;
}
|
|
638
|
+
|
|
639
|
+
bool common_chat_templates_has_variant(const struct common_chat_templates * tmpls, const std::string & variant) {
|
|
640
|
+
if (variant.empty() || variant == "default") {
|
|
641
|
+
return tmpls->template_default != nullptr;
|
|
642
|
+
}
|
|
643
|
+
if (variant == "tool_use") {
|
|
644
|
+
return tmpls->template_tool_use != nullptr;
|
|
645
|
+
}
|
|
646
|
+
return false;
|
|
536
647
|
}
|
|
537
648
|
|
|
538
649
|
common_chat_templates_ptr common_chat_templates_init(
|
|
@@ -614,14 +725,16 @@ common_chat_templates_ptr common_chat_templates_init(
|
|
|
614
725
|
tmpls->add_bos = add_bos;
|
|
615
726
|
tmpls->add_eos = add_eos;
|
|
616
727
|
try {
|
|
617
|
-
tmpls->template_default = std::make_unique<
|
|
728
|
+
tmpls->template_default = std::make_unique<common_chat_template>(default_template_src, token_bos, token_eos);
|
|
618
729
|
} catch (const std::exception & e) {
|
|
619
|
-
LOG_ERR("%s:
|
|
620
|
-
|
|
730
|
+
LOG_ERR("%s: error: %s\n", __func__, e.what());
|
|
731
|
+
LOG_ERR("%s: failed to initialize chat template\n", __func__);
|
|
732
|
+
LOG_ERR("%s: please consider disabling jinja via --no-jinja, or using another chat template\n", __func__);
|
|
733
|
+
throw e;
|
|
621
734
|
}
|
|
622
735
|
if (!template_tool_use_src.empty()) {
|
|
623
736
|
try {
|
|
624
|
-
tmpls->template_tool_use = std::make_unique<
|
|
737
|
+
tmpls->template_tool_use = std::make_unique<common_chat_template>(template_tool_use_src, token_bos, token_eos);
|
|
625
738
|
} catch (const std::exception & e) {
|
|
626
739
|
LOG_ERR("%s: failed to parse tool use chat template (ignoring it): %s\n", __func__, e.what());
|
|
627
740
|
}
|
|
@@ -726,27 +839,44 @@ static std::string apply(
|
|
|
726
839
|
const std::optional<json> & tools_override = std::nullopt,
|
|
727
840
|
const std::optional<json> & additional_context = std::nullopt)
|
|
728
841
|
{
|
|
729
|
-
|
|
730
|
-
|
|
731
|
-
|
|
732
|
-
|
|
733
|
-
|
|
734
|
-
|
|
735
|
-
|
|
736
|
-
|
|
737
|
-
|
|
738
|
-
|
|
739
|
-
|
|
740
|
-
|
|
741
|
-
|
|
742
|
-
|
|
743
|
-
|
|
744
|
-
|
|
745
|
-
|
|
746
|
-
|
|
747
|
-
|
|
748
|
-
|
|
749
|
-
|
|
842
|
+
jinja::context ctx(tmpl.source());
|
|
843
|
+
|
|
844
|
+
nlohmann::ordered_json inp = nlohmann::ordered_json{
|
|
845
|
+
{"messages", messages_override.has_value() ? *messages_override : inputs.messages},
|
|
846
|
+
{"tools", tools_override.has_value() ? *tools_override : inputs.tools},
|
|
847
|
+
{"bos_token", tmpl.bos_token()},
|
|
848
|
+
{"eos_token", tmpl.eos_token()},
|
|
849
|
+
};
|
|
850
|
+
if (inputs.extra_context.is_object()) {
|
|
851
|
+
// TODO: do we need to merge, or replacing is fine?
|
|
852
|
+
for (const auto & [k, v] : inputs.extra_context.items()) {
|
|
853
|
+
inp[k] = v;
|
|
854
|
+
}
|
|
855
|
+
}
|
|
856
|
+
if (additional_context.has_value()) {
|
|
857
|
+
// TODO: merge properly instead of overwriting (matching old behavior)
|
|
858
|
+
for (const auto & [k, v] : additional_context->items()) {
|
|
859
|
+
inp[k] = v;
|
|
860
|
+
}
|
|
861
|
+
}
|
|
862
|
+
if (inputs.add_generation_prompt) {
|
|
863
|
+
inp["add_generation_prompt"] = true;
|
|
864
|
+
}
|
|
865
|
+
// Remove tools key when null, so templates can check "{% if tools is defined %}"
|
|
866
|
+
if (inp["tools"].is_null() || (inp["tools"].is_array() && inp["tools"].empty())) {
|
|
867
|
+
inp.erase("tools");
|
|
868
|
+
}
|
|
869
|
+
|
|
870
|
+
jinja::global_from_json(ctx, inp, inputs.mark_input);
|
|
871
|
+
|
|
872
|
+
// render
|
|
873
|
+
jinja::runtime runtime(ctx);
|
|
874
|
+
const jinja::value results = runtime.execute(tmpl.prog);
|
|
875
|
+
auto parts = runtime.gather_string_parts(results);
|
|
876
|
+
|
|
877
|
+
std::string result = parts->as_string().str();
|
|
878
|
+
|
|
879
|
+
// TODO: improve this later
|
|
750
880
|
if (inputs.add_bos && string_starts_with(result, tmpl.bos_token())) {
|
|
751
881
|
result = result.substr(tmpl.bos_token().size());
|
|
752
882
|
}
|
|
@@ -833,10 +963,17 @@ static common_chat_params common_chat_params_init_generic(const common_chat_temp
|
|
|
833
963
|
builder.add_schema("root", schema);
|
|
834
964
|
});
|
|
835
965
|
|
|
836
|
-
auto tweaked_messages =
|
|
966
|
+
auto tweaked_messages = tmpl.add_system(
|
|
837
967
|
inputs.messages,
|
|
838
968
|
"Respond in JSON format, either with `tool_call` (a request to call tools) or with `response` reply to the user's request");
|
|
839
969
|
|
|
970
|
+
// ensure all messages have a "content" field
|
|
971
|
+
for (auto & message : tweaked_messages) {
|
|
972
|
+
if (!message.contains("content") || message["content"].is_null()) {
|
|
973
|
+
message["content"] = "";
|
|
974
|
+
}
|
|
975
|
+
}
|
|
976
|
+
|
|
840
977
|
data.prompt = apply(tmpl, inputs, /* messages_override= */ tweaked_messages);
|
|
841
978
|
data.format = COMMON_CHAT_FORMAT_GENERIC;
|
|
842
979
|
return data;
|
|
@@ -1351,7 +1488,7 @@ static common_chat_params common_chat_params_init_llama_3_x(const common_chat_te
|
|
|
1351
1488
|
data.prompt = apply(tmpl, inputs, /* messages_override =*/ std::nullopt, /* tools_override= */ std::nullopt, json {
|
|
1352
1489
|
{"date_string", format_time(inputs.now, "%d %b %Y")},
|
|
1353
1490
|
{"tools_in_user_message", false},
|
|
1354
|
-
{"builtin_tools", builtin_tools
|
|
1491
|
+
{"builtin_tools", builtin_tools},
|
|
1355
1492
|
});
|
|
1356
1493
|
return data;
|
|
1357
1494
|
}
|
|
@@ -2656,6 +2793,107 @@ static common_chat_params common_chat_params_init_seed_oss(
|
|
|
2656
2793
|
return data;
|
|
2657
2794
|
}
|
|
2658
2795
|
|
|
2796
|
+
// various workarounds for known issues with certain templates or model behaviors
|
|
2797
|
+
// TODO @ngxson : improve this (how?)
|
|
2798
|
+
namespace workaround {
|
|
2799
|
+
|
|
2800
|
+
// if first message is system and template does not support it, merge it with next message
|
|
2801
|
+
// Drops a leading "system" message. When another message follows, the system text is
// first prepended to that message's content, separated by a single "\n".
// Does nothing when the list is empty or the first role is not "system".
// An absent or null "content" on either message counts as empty text; content of any
// other non-string type still makes get<std::string>() throw.
static void system_message_not_supported(json & messages) {
    if (messages.empty() || messages.front().at("role") != "system") {
        return;
    }

    if (messages.size() > 1) {
        LOG_DBG("Merging system prompt into next message\n");
        // messages that only carry tool calls often have null or missing content
        const auto text_of = [](const json & msg) -> std::string {
            if (!msg.contains("content") || msg.at("content").is_null()) {
                return std::string();
            }
            return msg.at("content").get<std::string>();
        };
        // build the merged text fully before touching the array
        const std::string merged = text_of(messages.front()) + "\n" + text_of(messages[1]);
        messages[1]["content"] = merged;
    } else {
        LOG_WRN("Removing system prompt due to template not supporting system role\n");
    }

    // the system message is removed in both cases
    messages.erase(messages.begin());
}
|
|
2816
|
+
|
|
2817
|
+
// For every tool call in `messages` whose function.arguments is a JSON string,
// replaces that string with the JSON value parsed from it.
// Throws std::runtime_error when such a string is not valid JSON.
static void func_args_not_string(json & messages) {
    GGML_ASSERT(messages.is_array());
    for (auto & msg : messages) {
        if (!msg.contains("tool_calls")) {
            continue;
        }
        for (auto & call : msg["tool_calls"]) {
            const bool has_args = call.contains("function") && call["function"].contains("arguments");
            if (!has_args) {
                continue;
            }
            auto & arguments = call["function"]["arguments"];
            if (!arguments.is_string()) {
                continue; // already structured, nothing to convert
            }
            try {
                arguments = json::parse(arguments.get<std::string>());
            } catch (const std::exception & err) {
                throw std::runtime_error("Failed to parse tool call arguments as JSON: " + std::string(err.what()));
            }
        }
    }
}
|
|
2836
|
+
|
|
2837
|
+
// For every message that has a "tool_calls" key: removes the key and appends
// {"tool_calls": ...}, serialized with `indent_spaces` spaces of indentation, to the
// message's text content.
// An absent or null "content" counts as empty text; content of any other non-string
// type makes get<std::string>() throw, in which case the message is left unmodified.
static void move_tool_calls_to_content(json & messages, int indent_spaces = 2) {
    GGML_ASSERT(messages.is_array());
    for (auto & message : messages) {
        if (!message.contains("tool_calls")) {
            continue;
        }

        // Read the existing text before mutating anything, so a failure here
        // cannot leave the message with its tool calls already erased.
        std::string content_new;
        if (message.contains("content") && !message.at("content").is_null()) {
            content_new = message.at("content").get<std::string>();
        }

        const json tool_calls_new = json{
            {"tool_calls", message.at("tool_calls")}
        };
        message.erase("tool_calls");

        // error_handler_t::replace substitutes invalid UTF-8 instead of throwing
        message["content"] = content_new + tool_calls_new.dump(indent_spaces, ' ', false, json::error_handler_t::replace);
    }
}
|
|
2851
|
+
|
|
2852
|
+
// TODO @ngxson : we may remove support for generic schema in the future
|
|
2853
|
+
// Flattens tool calls of the form {"type": "function", "function": {name, arguments}, "id"}
// into {"name", "arguments", "id"}. Only entries whose "type" equals "function" and whose
// "function" is an object are rewritten. Keys whose value is absent or null are not re-added.
static void use_generic_schema(json & messages) {
    GGML_ASSERT(messages.is_array());
    for (auto & message : messages) {
        const bool has_call_list = message.contains("tool_calls") && message.at("tool_calls").is_array();
        if (!has_call_list) {
            continue;
        }

        for (auto & call : message.at("tool_calls")) {
            const bool is_function_call =
                call.contains("type") && call.at("type") == "function" &&
                call.contains("function") && call.at("function").is_object();
            if (!is_function_call) {
                continue;
            }

            // Take copies first: the erase() calls below destroy the nested values.
            json fn_name;
            json fn_args;
            json call_id;
            {
                const auto & fn = call.at("function");
                if (fn.contains("name")) {
                    fn_name = fn.at("name");
                }
                if (fn.contains("arguments")) {
                    fn_args = fn.at("arguments");
                }
            }
            if (call.contains("id")) {
                call_id = call.at("id");
            }

            call.erase("type");
            call.erase("function");
            call.erase("id");

            // Re-insert in the desired key order: name, arguments, id
            if (!fn_name.is_null()) {
                call["name"] = fn_name;
            }
            if (!fn_args.is_null()) {
                call["arguments"] = fn_args;
            }
            if (!call_id.is_null()) {
                call["id"] = call_id;
            }
        }
    }
}
|
|
2894
|
+
|
|
2895
|
+
} // namespace workaround
|
|
2896
|
+
|
|
2659
2897
|
static common_chat_params common_chat_templates_apply_jinja(
|
|
2660
2898
|
const struct common_chat_templates * tmpls,
|
|
2661
2899
|
const struct common_chat_templates_inputs & inputs)
|
|
@@ -2677,6 +2915,10 @@ static common_chat_params common_chat_templates_apply_jinja(
|
|
|
2677
2915
|
params.add_bos = tmpls->add_bos;
|
|
2678
2916
|
params.add_eos = tmpls->add_eos;
|
|
2679
2917
|
|
|
2918
|
+
if (!tmpl.original_caps().supports_system_role) {
|
|
2919
|
+
workaround::system_message_not_supported(params.messages);
|
|
2920
|
+
}
|
|
2921
|
+
|
|
2680
2922
|
params.extra_context = json::object();
|
|
2681
2923
|
for (auto el : inputs.chat_template_kwargs) {
|
|
2682
2924
|
params.extra_context[el.first] = json::parse(el.second);
|
|
@@ -2715,11 +2957,15 @@ static common_chat_params common_chat_templates_apply_jinja(
|
|
|
2715
2957
|
|
|
2716
2958
|
// Command R7B: : use handler in all cases except json schema (thinking / tools).
|
|
2717
2959
|
if (src.find("<|END_THINKING|><|START_ACTION|>") != std::string::npos && params.json_schema.is_null()) {
|
|
2960
|
+
workaround::func_args_not_string(params.messages);
|
|
2718
2961
|
return common_chat_params_init_command_r7b(tmpl, params);
|
|
2719
2962
|
}
|
|
2720
2963
|
|
|
2721
2964
|
// Granite (IBM) - detects thinking / tools support
|
|
2722
2965
|
if (src.find("elif thinking") != std::string::npos && src.find("<|tool_call|>") != std::string::npos) {
|
|
2966
|
+
workaround::func_args_not_string(params.messages);
|
|
2967
|
+
workaround::use_generic_schema(params.messages);
|
|
2968
|
+
workaround::move_tool_calls_to_content(params.messages);
|
|
2723
2969
|
return common_chat_params_init_granite(tmpl, params);
|
|
2724
2970
|
}
|
|
2725
2971
|
|
|
@@ -2728,6 +2974,7 @@ static common_chat_params common_chat_templates_apply_jinja(
|
|
|
2728
2974
|
src.find("<arg_key>") != std::string::npos &&
|
|
2729
2975
|
src.find("<arg_value>") != std::string::npos &&
|
|
2730
2976
|
params.json_schema.is_null()) {
|
|
2977
|
+
workaround::func_args_not_string(params.messages);
|
|
2731
2978
|
return common_chat_params_init_glm_4_5(tmpl, params);
|
|
2732
2979
|
}
|
|
2733
2980
|
|
|
@@ -2739,6 +2986,7 @@ static common_chat_params common_chat_templates_apply_jinja(
|
|
|
2739
2986
|
src.find("<function=") != std::string::npos &&
|
|
2740
2987
|
src.find("<parameters>") != std::string::npos &&
|
|
2741
2988
|
src.find("<parameter=") != std::string::npos) {
|
|
2989
|
+
workaround::func_args_not_string(params.messages);
|
|
2742
2990
|
// Nemotron 3 Nano 30B A3B
|
|
2743
2991
|
if (src.find("<think>") != std::string::npos) {
|
|
2744
2992
|
return common_chat_params_init_nemotron_v3(tmpl, params);
|
|
@@ -2775,6 +3023,7 @@ static common_chat_params common_chat_templates_apply_jinja(
|
|
|
2775
3023
|
|
|
2776
3024
|
// Seed-OSS
|
|
2777
3025
|
if (src.find("<seed:think>") != std::string::npos) {
|
|
3026
|
+
workaround::func_args_not_string(params.messages);
|
|
2778
3027
|
return common_chat_params_init_seed_oss(tmpl, params, inputs);
|
|
2779
3028
|
}
|
|
2780
3029
|
|
|
@@ -2796,6 +3045,7 @@ static common_chat_params common_chat_templates_apply_jinja(
|
|
|
2796
3045
|
|
|
2797
3046
|
// MiniMax-M2 format detection
|
|
2798
3047
|
if (src.find("]~!b[") != std::string::npos && src.find("]~b]") != std::string::npos) {
|
|
3048
|
+
workaround::func_args_not_string(params.messages);
|
|
2799
3049
|
return common_chat_params_init_minimax_m2(tmpl, params);
|
|
2800
3050
|
}
|
|
2801
3051
|
|
|
@@ -2842,6 +3092,7 @@ static common_chat_params common_chat_templates_apply_jinja(
|
|
|
2842
3092
|
// Llama 3.1, 3.2, 3.3 (also requires date_string so using it even w/o tools)
|
|
2843
3093
|
if (src.find("<|start_header_id|>ipython<|end_header_id|>") != std::string::npos) {
|
|
2844
3094
|
auto allow_python_tag_builtin_tools = src.find("<|python_tag|>") != std::string::npos;
|
|
3095
|
+
workaround::func_args_not_string(params.messages);
|
|
2845
3096
|
return common_chat_params_init_llama_3_x(tmpl, params, allow_python_tag_builtin_tools);
|
|
2846
3097
|
}
|
|
2847
3098
|
|
|
@@ -2870,10 +3121,14 @@ static common_chat_params common_chat_templates_apply_jinja(
|
|
|
2870
3121
|
|
|
2871
3122
|
// Mistral Nemo (w/ tools)
|
|
2872
3123
|
if (src.find("[TOOL_CALLS]") != std::string::npos) {
|
|
3124
|
+
workaround::func_args_not_string(params.messages);
|
|
2873
3125
|
return common_chat_params_init_mistral_nemo(tmpl, params);
|
|
2874
3126
|
}
|
|
2875
3127
|
|
|
2876
3128
|
// Generic fallback
|
|
3129
|
+
workaround::func_args_not_string(params.messages);
|
|
3130
|
+
workaround::use_generic_schema(params.messages);
|
|
3131
|
+
workaround::move_tool_calls_to_content(params.messages);
|
|
2877
3132
|
return common_chat_params_init_generic(tmpl, params);
|
|
2878
3133
|
}
|
|
2879
3134
|
|
|
@@ -10,18 +10,7 @@
|
|
|
10
10
|
#include <vector>
|
|
11
11
|
#include <map>
|
|
12
12
|
|
|
13
|
-
|
|
14
|
-
#include "minja/minja.hpp"
|
|
15
|
-
|
|
16
|
-
typedef minja::chat_template common_chat_template;
|
|
17
|
-
|
|
18
|
-
struct common_chat_templates {
|
|
19
|
-
bool add_bos;
|
|
20
|
-
bool add_eos;
|
|
21
|
-
bool has_explicit_template; // Model had builtin template or template overridde was specified.
|
|
22
|
-
std::unique_ptr<common_chat_template> template_default; // always set (defaults to chatml)
|
|
23
|
-
std::unique_ptr<common_chat_template> template_tool_use;
|
|
24
|
-
};
|
|
13
|
+
struct common_chat_templates;
|
|
25
14
|
|
|
26
15
|
struct common_chat_tool_call {
|
|
27
16
|
std::string name;
|
|
@@ -156,7 +145,7 @@ struct common_chat_templates_inputs {
|
|
|
156
145
|
std::vector<common_chat_tool> tools;
|
|
157
146
|
common_chat_tool_choice tool_choice = COMMON_CHAT_TOOL_CHOICE_AUTO;
|
|
158
147
|
bool parallel_tool_calls = false;
|
|
159
|
-
common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_NONE;
|
|
148
|
+
common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_NONE; // TODO: refactor this to "bool enable_thinking"
|
|
160
149
|
bool enable_thinking = true;
|
|
161
150
|
std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
|
|
162
151
|
std::map<std::string, std::string> chat_template_kwargs;
|
|
@@ -176,14 +165,21 @@ struct common_chat_params {
|
|
|
176
165
|
std::string parser;
|
|
177
166
|
};
|
|
178
167
|
|
|
179
|
-
|
|
168
|
+
// per-message parsing syntax
|
|
169
|
+
// should be derived from common_chat_params
|
|
170
|
+
struct common_chat_parser_params {
|
|
180
171
|
common_chat_format format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
|
|
181
|
-
common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_NONE;
|
|
172
|
+
common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_NONE; // TODO: refactor this to "bool parse_reasoning"
|
|
182
173
|
// Whether reasoning_content should be inlined in the content (e.g. for reasoning_format=deepseek in stream mode)
|
|
183
174
|
bool reasoning_in_content = false;
|
|
184
175
|
bool thinking_forced_open = false;
|
|
185
176
|
bool parse_tool_calls = true;
|
|
186
177
|
common_peg_arena parser = {};
|
|
178
|
+
common_chat_parser_params() = default;
|
|
179
|
+
common_chat_parser_params(const common_chat_params & chat_params) {
|
|
180
|
+
format = chat_params.format;
|
|
181
|
+
thinking_forced_open = chat_params.thinking_forced_open;
|
|
182
|
+
}
|
|
187
183
|
};
|
|
188
184
|
|
|
189
185
|
// Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
|
|
@@ -202,7 +198,7 @@ common_chat_templates_ptr common_chat_templates_init(
|
|
|
202
198
|
const std::string & eos_token_override = "");
|
|
203
199
|
|
|
204
200
|
bool common_chat_templates_was_explicit(const struct common_chat_templates * tmpls);
|
|
205
|
-
|
|
201
|
+
std::string common_chat_templates_source(const struct common_chat_templates * tmpls, const std::string & variant = "");
|
|
206
202
|
|
|
207
203
|
|
|
208
204
|
struct common_chat_params common_chat_templates_apply(
|
|
@@ -224,15 +220,31 @@ std::string common_chat_format_example(
|
|
|
224
220
|
const std::map<std::string, std::string> & chat_template_kwargs);
|
|
225
221
|
|
|
226
222
|
const char* common_chat_format_name(common_chat_format format);
|
|
227
|
-
const
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
223
|
+
common_chat_msg common_chat_parse(const std::string & input, bool is_partial, const common_chat_parser_params & syntax);
|
|
224
|
+
common_chat_msg common_chat_peg_parse(const common_peg_arena & parser, const std::string & input, bool is_partial, const common_chat_parser_params & syntax);
|
|
225
|
+
|
|
226
|
+
// used by arg and server
|
|
227
|
+
const char * common_reasoning_format_name(common_reasoning_format format);
|
|
228
|
+
common_reasoning_format common_reasoning_format_from_name(const std::string & format);
|
|
231
229
|
|
|
232
230
|
common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::string & tool_choice);
|
|
233
231
|
|
|
234
232
|
bool common_chat_templates_support_enable_thinking(const common_chat_templates * chat_templates);
|
|
235
233
|
|
|
234
|
+
// Capability flags of a chat template, for exposing capabilities to external code.
// Every flag defaults to true.
struct common_chat_template_caps {
    bool supports_tools               = true;
    bool supports_tool_calls          = true;
    bool supports_system_role         = true;
    bool supports_parallel_tool_calls = true;
};
|
|
241
|
+
|
|
242
|
+
// Get template capabilities for a specific variant ("" for default, "tool_use" for tool_use template)
|
|
243
|
+
common_chat_template_caps common_chat_templates_get_caps(const struct common_chat_templates * tmpls, const std::string & variant = "");
|
|
244
|
+
|
|
245
|
+
// Check if a template variant exists
|
|
246
|
+
bool common_chat_templates_has_variant(const struct common_chat_templates * tmpls, const std::string & variant);
|
|
247
|
+
|
|
236
248
|
// Parses a JSON array of messages in OpenAI's chat completion API format.
|
|
237
249
|
// T can be std::string containing JSON or nlohmann::ordered_json
|
|
238
250
|
template <class T> std::vector<common_chat_msg> common_chat_msgs_parse_oaicompat(const T & messages);
|
|
@@ -1172,7 +1172,6 @@ common_init_result::common_init_result(common_params & params) :
|
|
|
1172
1172
|
pimpl->samplers_seq_config[i] = { i, common_sampler_get(pimpl->samplers[i].get()) };
|
|
1173
1173
|
}
|
|
1174
1174
|
|
|
1175
|
-
// TODO: temporarily gated behind a flag
|
|
1176
1175
|
if (params.sampling.backend_sampling) {
|
|
1177
1176
|
cparams.samplers = pimpl->samplers_seq_config.data();
|
|
1178
1177
|
cparams.n_samplers = pimpl->samplers_seq_config.size();
|
|
@@ -57,6 +57,8 @@ extern const char * LLAMA_COMMIT;
|
|
|
57
57
|
extern const char * LLAMA_COMPILER;
|
|
58
58
|
extern const char * LLAMA_BUILD_TARGET;
|
|
59
59
|
|
|
60
|
+
const static std::string build_info("b" + std::to_string(LLAMA_BUILD_NUMBER) + "-" + LLAMA_COMMIT);
|
|
61
|
+
|
|
60
62
|
struct common_control_vector_load_info;
|
|
61
63
|
|
|
62
64
|
//
|
|
@@ -119,6 +121,7 @@ enum common_sampler_type {
|
|
|
119
121
|
COMMON_SAMPLER_TYPE_INFILL = 9,
|
|
120
122
|
COMMON_SAMPLER_TYPE_PENALTIES = 10,
|
|
121
123
|
COMMON_SAMPLER_TYPE_TOP_N_SIGMA = 11,
|
|
124
|
+
COMMON_SAMPLER_TYPE_ADAPTIVE_P = 12,
|
|
122
125
|
};
|
|
123
126
|
|
|
124
127
|
// dimensionality reduction methods, used by cvector-generator
|
|
@@ -166,32 +169,34 @@ enum common_params_sampling_config : uint64_t {
|
|
|
166
169
|
struct common_params_sampling {
|
|
167
170
|
uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler
|
|
168
171
|
|
|
169
|
-
int32_t n_prev = 64;
|
|
170
|
-
int32_t n_probs = 0;
|
|
171
|
-
int32_t min_keep = 0;
|
|
172
|
-
int32_t top_k = 40;
|
|
173
|
-
float top_p = 0.95f;
|
|
174
|
-
float min_p = 0.05f;
|
|
175
|
-
float xtc_probability = 0.00f;
|
|
176
|
-
float xtc_threshold = 0.10f;
|
|
177
|
-
float typ_p = 1.00f;
|
|
178
|
-
float temp = 0.80f;
|
|
179
|
-
float dynatemp_range = 0.00f;
|
|
180
|
-
float dynatemp_exponent = 1.00f;
|
|
181
|
-
int32_t penalty_last_n = 64;
|
|
182
|
-
float penalty_repeat = 1.00f;
|
|
183
|
-
float penalty_freq = 0.00f;
|
|
184
|
-
float penalty_present = 0.00f;
|
|
185
|
-
float dry_multiplier = 0.0f;
|
|
186
|
-
float dry_base = 1.75f;
|
|
187
|
-
int32_t dry_allowed_length = 2;
|
|
188
|
-
int32_t dry_penalty_last_n = -1;
|
|
189
|
-
|
|
190
|
-
float
|
|
191
|
-
|
|
192
|
-
float
|
|
172
|
+
int32_t n_prev = 64; // number of previous tokens to remember
|
|
173
|
+
int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
|
|
174
|
+
int32_t min_keep = 0; // 0 = disabled, otherwise samplers should return at least min_keep tokens
|
|
175
|
+
int32_t top_k = 40; // <= 0 to use vocab size
|
|
176
|
+
float top_p = 0.95f; // 1.0 = disabled
|
|
177
|
+
float min_p = 0.05f; // 0.0 = disabled
|
|
178
|
+
float xtc_probability = 0.00f; // 0.0 = disabled
|
|
179
|
+
float xtc_threshold = 0.10f; // > 0.5 disables XTC
|
|
180
|
+
float typ_p = 1.00f; // typical_p, 1.0 = disabled
|
|
181
|
+
float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
|
|
182
|
+
float dynatemp_range = 0.00f; // 0.0 = disabled
|
|
183
|
+
float dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler
|
|
184
|
+
int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
|
|
185
|
+
float penalty_repeat = 1.00f; // 1.0 = disabled
|
|
186
|
+
float penalty_freq = 0.00f; // 0.0 = disabled
|
|
187
|
+
float penalty_present = 0.00f; // 0.0 = disabled
|
|
188
|
+
float dry_multiplier = 0.0f; // 0.0 = disabled; DRY repetition penalty for tokens extending repetition:
|
|
189
|
+
float dry_base = 1.75f; // 0.0 = disabled; multiplier * base ^ (length of sequence before token - allowed length)
|
|
190
|
+
int32_t dry_allowed_length = 2; // tokens extending repetitions beyond this receive penalty
|
|
191
|
+
int32_t dry_penalty_last_n = -1; // how many tokens to scan for repetitions (0 = disable penalty, -1 = context size)
|
|
192
|
+
float adaptive_target = -1.0f; // select tokens near this probability (valid range 0.0 to 1.0; negative = disabled)
|
|
193
|
+
float adaptive_decay = 0.90f; // EMA decay for adaptation; history ≈ 1/(1-decay) tokens (0.0 - 0.99)
|
|
194
|
+
int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
|
|
195
|
+
float top_n_sigma = -1.00f; // -1.0 = disabled
|
|
196
|
+
float mirostat_tau = 5.00f; // target entropy
|
|
197
|
+
float mirostat_eta = 0.10f; // learning rate
|
|
193
198
|
bool ignore_eos = false;
|
|
194
|
-
bool no_perf = false;
|
|
199
|
+
bool no_perf = false; // disable performance metrics
|
|
195
200
|
bool timing_per_token = false;
|
|
196
201
|
|
|
197
202
|
uint64_t user_sampling_config = 0; // bitfield to track user-specified samplers
|
|
@@ -281,6 +286,7 @@ struct common_params_diffusion {
|
|
|
281
286
|
};
|
|
282
287
|
|
|
283
288
|
// reasoning API response format (not to be confused as chat template's reasoning format)
|
|
289
|
+
// only used by server
|
|
284
290
|
enum common_reasoning_format {
|
|
285
291
|
COMMON_REASONING_FORMAT_NONE,
|
|
286
292
|
COMMON_REASONING_FORMAT_AUTO, // Same as deepseek, using `message.reasoning_content`
|