@fugood/llama.node 1.4.15 → 1.5.0-rc.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. package/lib/binding.ts +1 -5
  2. package/lib/index.js +2 -2
  3. package/lib/index.ts +2 -2
  4. package/package.json +15 -15
  5. package/scripts/llama.cpp.patch +76 -61
  6. package/src/LlamaContext.cpp +20 -32
  7. package/src/llama.cpp/common/CMakeLists.txt +12 -0
  8. package/src/llama.cpp/common/arg.cpp +20 -0
  9. package/src/llama.cpp/common/chat.cpp +289 -34
  10. package/src/llama.cpp/common/chat.h +16 -13
  11. package/src/llama.cpp/common/common.cpp +0 -1
  12. package/src/llama.cpp/common/common.h +28 -25
  13. package/src/llama.cpp/common/jinja/caps.cpp +237 -0
  14. package/src/llama.cpp/common/jinja/caps.h +24 -0
  15. package/src/llama.cpp/common/jinja/lexer.cpp +341 -0
  16. package/src/llama.cpp/common/jinja/lexer.h +157 -0
  17. package/src/llama.cpp/common/jinja/parser.cpp +591 -0
  18. package/src/llama.cpp/common/jinja/parser.h +21 -0
  19. package/src/llama.cpp/common/jinja/runtime.cpp +865 -0
  20. package/src/llama.cpp/common/jinja/runtime.h +628 -0
  21. package/src/llama.cpp/common/jinja/string.cpp +207 -0
  22. package/src/llama.cpp/common/jinja/string.h +58 -0
  23. package/src/llama.cpp/common/jinja/utils.h +49 -0
  24. package/src/llama.cpp/common/jinja/value.cpp +1221 -0
  25. package/src/llama.cpp/common/jinja/value.h +464 -0
  26. package/src/llama.cpp/common/sampling.cpp +52 -19
  27. package/src/llama.cpp/ggml/include/ggml.h +39 -7
  28. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +4 -0
  29. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +63 -37
  30. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +31 -0
  31. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +18 -0
  32. package/src/llama.cpp/include/llama-cpp.h +3 -1
  33. package/src/llama.cpp/include/llama.h +29 -2
  34. package/src/llama.cpp/src/llama-adapter.cpp +7 -13
  35. package/src/llama.cpp/src/llama-adapter.h +1 -3
  36. package/src/llama.cpp/src/llama-context.cpp +232 -144
  37. package/src/llama.cpp/src/llama-context.h +10 -0
  38. package/src/llama.cpp/src/llama-cparams.h +2 -0
  39. package/src/llama.cpp/src/llama-hparams.cpp +0 -36
  40. package/src/llama.cpp/src/llama-hparams.h +38 -1
  41. package/src/llama.cpp/src/llama-kv-cache.cpp +201 -59
  42. package/src/llama.cpp/src/llama-kv-cache.h +0 -2
  43. package/src/llama.cpp/src/llama-mmap.cpp +5 -1
  44. package/src/llama.cpp/src/llama-model-loader.cpp +21 -7
  45. package/src/llama.cpp/src/llama-model.cpp +5 -1
  46. package/src/llama.cpp/src/llama-model.h +3 -2
  47. package/src/llama.cpp/src/llama-sampling.cpp +170 -13
@@ -7,6 +7,14 @@
7
7
  #include "log.h"
8
8
  #include "regex-partial.h"
9
9
 
10
+ // #include <minja/chat-template.hpp>
11
+ // #include <minja/minja.hpp>
12
+
13
+ #include "jinja/parser.h"
14
+ #include "jinja/value.h"
15
+ #include "jinja/runtime.h"
16
+ #include "jinja/caps.h"
17
+
10
18
  #include <algorithm>
11
19
  #include <cstdio>
12
20
  #include <cctype>
@@ -132,6 +140,77 @@ std::vector<common_chat_msg_diff> common_chat_msg_diff::compute_diffs(const comm
132
140
  return diffs;
133
141
  }
134
142
 
143
+ using chat_template_caps = jinja::caps;
144
+
145
+ struct common_chat_template {
146
+ jinja::program prog;
147
+ std::string bos_tok;
148
+ std::string eos_tok;
149
+ std::string src;
150
+ chat_template_caps caps;
151
+
152
+ common_chat_template(const std::string & src, const std::string & bos_token, const std::string & eos_token) {
153
+ jinja::lexer lexer;
154
+ auto lexer_res = lexer.tokenize(src);
155
+ this->prog = jinja::parse_from_tokens(lexer_res);
156
+
157
+ this->src = lexer_res.source;
158
+ this->bos_tok = bos_token;
159
+ this->eos_tok = eos_token;
160
+
161
+ this->caps = jinja::caps_get(prog);
162
+ // LOG_INF("%s: caps:\n%s\n", __func__, this->caps.to_string().c_str());
163
+ }
164
+
165
+ const std::string & source() const { return src; }
166
+ const std::string & bos_token() const { return bos_tok; }
167
+ const std::string & eos_token() const { return eos_tok; }
168
+
169
+ // TODO: this is ugly, refactor it somehow
170
+ json add_system(const json & messages, const std::string & system_prompt) const {
171
+ GGML_ASSERT(messages.is_array());
172
+ auto msgs_copy = messages;
173
+ if (!caps.supports_system_role) {
174
+ if (msgs_copy.empty()) {
175
+ msgs_copy.insert(msgs_copy.begin(), json{
176
+ {"role", "user"},
177
+ {"content", system_prompt}
178
+ });
179
+ } else {
180
+ auto & first_msg = msgs_copy[0];
181
+ if (!first_msg.contains("content")) {
182
+ first_msg["content"] = "";
183
+ }
184
+ first_msg["content"] = system_prompt + "\n\n"
185
+ + first_msg["content"].get<std::string>();
186
+ }
187
+ } else {
188
+ if (msgs_copy.empty() || msgs_copy[0].at("role") != "system") {
189
+ msgs_copy.insert(msgs_copy.begin(), json{
190
+ {"role", "system"},
191
+ {"content", system_prompt}
192
+ });
193
+ } else if (msgs_copy[0].at("role") == "system") {
194
+ msgs_copy[0]["content"] = system_prompt;
195
+ }
196
+ }
197
+ return msgs_copy;
198
+ }
199
+
200
+ chat_template_caps original_caps() const {
201
+ return caps;
202
+ }
203
+
204
+ };
205
+
206
+ struct common_chat_templates {
207
+ bool add_bos;
208
+ bool add_eos;
209
+ bool has_explicit_template; // Model had a built-in template or a template override was specified.
210
+ std::unique_ptr<common_chat_template> template_default; // always set (defaults to chatml)
211
+ std::unique_ptr<common_chat_template> template_tool_use;
212
+ };
213
+
135
214
  struct templates_params {
136
215
  json messages;
137
216
  json tools;
@@ -148,6 +227,7 @@ struct templates_params {
148
227
  bool add_bos;
149
228
  bool add_eos;
150
229
  bool is_inference = true;
230
+ bool mark_input = true; // whether to mark input strings in the jinja context
151
231
  };
152
232
 
153
233
  common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::string & tool_choice) {
@@ -521,18 +601,49 @@ bool common_chat_templates_was_explicit(const struct common_chat_templates * tmp
521
601
  return tmpls->has_explicit_template;
522
602
  }
523
603
 
524
- const char * common_chat_templates_source(const struct common_chat_templates * tmpls, const char * variant) {
525
- if (variant != nullptr) {
526
- if (strcmp(variant, "tool_use") == 0) {
604
+ std::string common_chat_templates_source(const struct common_chat_templates * tmpls, const std::string & variant) {
605
+ if (!variant.empty()) {
606
+ if (variant == "tool_use") {
527
607
  if (tmpls->template_tool_use) {
528
- return tmpls->template_tool_use->source().c_str();
608
+ return tmpls->template_tool_use->source();
529
609
  }
530
- return nullptr;
610
+ return "";
531
611
  } else {
532
- LOG_DBG("%s: unknown template variant: %s\n", __func__, variant);
612
+ LOG_DBG("%s: unknown template variant: %s\n", __func__, variant.c_str());
533
613
  }
534
614
  }
535
- return tmpls->template_default->source().c_str();
615
+ return tmpls->template_default->source();
616
+ }
617
+
618
+ common_chat_template_caps common_chat_templates_get_caps(const struct common_chat_templates * tmpls, const std::string & variant) {
619
+ common_chat_template_caps result;
620
+ const common_chat_template * tmpl = nullptr;
621
+
622
+ if (!variant.empty() && variant == "tool_use") {
623
+ tmpl = tmpls->template_tool_use.get();
624
+ } else {
625
+ tmpl = tmpls->template_default.get();
626
+ }
627
+
628
+ if (tmpl) {
629
+ auto caps = tmpl->original_caps();
630
+ result.supports_tools = caps.supports_tools;
631
+ result.supports_tool_calls = caps.supports_tool_calls;
632
+ result.supports_system_role = caps.supports_system_role;
633
+ result.supports_parallel_tool_calls = caps.supports_parallel_tool_calls;
634
+ }
635
+
636
+ return result;
637
+ }
638
+
639
+ bool common_chat_templates_has_variant(const struct common_chat_templates * tmpls, const std::string & variant) {
640
+ if (variant.empty() || variant == "default") {
641
+ return tmpls->template_default != nullptr;
642
+ }
643
+ if (variant == "tool_use") {
644
+ return tmpls->template_tool_use != nullptr;
645
+ }
646
+ return false;
536
647
  }
537
648
 
538
649
  common_chat_templates_ptr common_chat_templates_init(
@@ -614,14 +725,16 @@ common_chat_templates_ptr common_chat_templates_init(
614
725
  tmpls->add_bos = add_bos;
615
726
  tmpls->add_eos = add_eos;
616
727
  try {
617
- tmpls->template_default = std::make_unique<minja::chat_template>(default_template_src, token_bos, token_eos);
728
+ tmpls->template_default = std::make_unique<common_chat_template>(default_template_src, token_bos, token_eos);
618
729
  } catch (const std::exception & e) {
619
- LOG_ERR("%s: failed to parse chat template (defaulting to chatml): %s \n", __func__, e.what());
620
- tmpls->template_default = std::make_unique<minja::chat_template>(CHATML_TEMPLATE_SRC, token_bos, token_eos);
730
+ LOG_ERR("%s: error: %s\n", __func__, e.what());
731
+ LOG_ERR("%s: failed to initialize chat template\n", __func__);
732
+ LOG_ERR("%s: please consider disabling jinja via --no-jinja, or using another chat template\n", __func__);
733
+ throw e;
621
734
  }
622
735
  if (!template_tool_use_src.empty()) {
623
736
  try {
624
- tmpls->template_tool_use = std::make_unique<minja::chat_template>(template_tool_use_src, token_bos, token_eos);
737
+ tmpls->template_tool_use = std::make_unique<common_chat_template>(template_tool_use_src, token_bos, token_eos);
625
738
  } catch (const std::exception & e) {
626
739
  LOG_ERR("%s: failed to parse tool use chat template (ignoring it): %s\n", __func__, e.what());
627
740
  }
@@ -726,27 +839,44 @@ static std::string apply(
726
839
  const std::optional<json> & tools_override = std::nullopt,
727
840
  const std::optional<json> & additional_context = std::nullopt)
728
841
  {
729
- minja::chat_template_inputs tmpl_inputs;
730
- tmpl_inputs.messages = messages_override ? *messages_override : inputs.messages;
731
- if (tools_override) {
732
- tmpl_inputs.tools = *tools_override;
733
- } else {
734
- tmpl_inputs.tools = inputs.tools.empty() ? json() : inputs.tools;
735
- }
736
- tmpl_inputs.add_generation_prompt = inputs.add_generation_prompt;
737
- tmpl_inputs.extra_context = inputs.extra_context;
738
- tmpl_inputs.extra_context["enable_thinking"] = inputs.enable_thinking;
739
- if (additional_context) {
740
- tmpl_inputs.extra_context.merge_patch(*additional_context);
741
- }
742
- // TODO: add flag to control date/time, if only for testing purposes.
743
- tmpl_inputs.now = inputs.now;
744
-
745
- minja::chat_template_options tmpl_opts;
746
- // To avoid double BOS / EOS tokens, we're manually removing begining / trailing tokens
747
- // instead of using `chat_template_options.use_bos_token = false`, since these tokens
748
- // may be needed inside the template / between messages too.
749
- auto result = tmpl.apply(tmpl_inputs, tmpl_opts);
842
+ jinja::context ctx(tmpl.source());
843
+
844
+ nlohmann::ordered_json inp = nlohmann::ordered_json{
845
+ {"messages", messages_override.has_value() ? *messages_override : inputs.messages},
846
+ {"tools", tools_override.has_value() ? *tools_override : inputs.tools},
847
+ {"bos_token", tmpl.bos_token()},
848
+ {"eos_token", tmpl.eos_token()},
849
+ };
850
+ if (inputs.extra_context.is_object()) {
851
+ // TODO: do we need to merge, or replacing is fine?
852
+ for (const auto & [k, v] : inputs.extra_context.items()) {
853
+ inp[k] = v;
854
+ }
855
+ }
856
+ if (additional_context.has_value()) {
857
+ // TODO: merge properly instead of overwriting (matching old behavior)
858
+ for (const auto & [k, v] : additional_context->items()) {
859
+ inp[k] = v;
860
+ }
861
+ }
862
+ if (inputs.add_generation_prompt) {
863
+ inp["add_generation_prompt"] = true;
864
+ }
865
+ // Remove tools key when null or an empty array, so templates can check "{% if tools is defined %}"
866
+ if (inp["tools"].is_null() || (inp["tools"].is_array() && inp["tools"].empty())) {
867
+ inp.erase("tools");
868
+ }
869
+
870
+ jinja::global_from_json(ctx, inp, inputs.mark_input);
871
+
872
+ // render
873
+ jinja::runtime runtime(ctx);
874
+ const jinja::value results = runtime.execute(tmpl.prog);
875
+ auto parts = runtime.gather_string_parts(results);
876
+
877
+ std::string result = parts->as_string().str();
878
+
879
+ // TODO: improve this later
750
880
  if (inputs.add_bos && string_starts_with(result, tmpl.bos_token())) {
751
881
  result = result.substr(tmpl.bos_token().size());
752
882
  }
@@ -833,10 +963,17 @@ static common_chat_params common_chat_params_init_generic(const common_chat_temp
833
963
  builder.add_schema("root", schema);
834
964
  });
835
965
 
836
- auto tweaked_messages = common_chat_template::add_system(
966
+ auto tweaked_messages = tmpl.add_system(
837
967
  inputs.messages,
838
968
  "Respond in JSON format, either with `tool_call` (a request to call tools) or with `response` reply to the user's request");
839
969
 
970
+ // ensure all messages have a "content" field
971
+ for (auto & message : tweaked_messages) {
972
+ if (!message.contains("content") || message["content"].is_null()) {
973
+ message["content"] = "";
974
+ }
975
+ }
976
+
840
977
  data.prompt = apply(tmpl, inputs, /* messages_override= */ tweaked_messages);
841
978
  data.format = COMMON_CHAT_FORMAT_GENERIC;
842
979
  return data;
@@ -1351,7 +1488,7 @@ static common_chat_params common_chat_params_init_llama_3_x(const common_chat_te
1351
1488
  data.prompt = apply(tmpl, inputs, /* messages_override =*/ std::nullopt, /* tools_override= */ std::nullopt, json {
1352
1489
  {"date_string", format_time(inputs.now, "%d %b %Y")},
1353
1490
  {"tools_in_user_message", false},
1354
- {"builtin_tools", builtin_tools.empty() ? json() : builtin_tools},
1491
+ {"builtin_tools", builtin_tools},
1355
1492
  });
1356
1493
  return data;
1357
1494
  }
@@ -2656,6 +2793,107 @@ static common_chat_params common_chat_params_init_seed_oss(
2656
2793
  return data;
2657
2794
  }
2658
2795
 
2796
+ // various workarounds for known issues with certain templates or model behaviors
2797
+ // TODO @ngxson : improve this (how?)
2798
+ namespace workaround {
2799
+
2800
+ // if first message is system and template does not support it, merge it with next message (or drop it when it is the only message)
2801
+ static void system_message_not_supported(json & messages) {
2802
+ if (!messages.empty() && messages.front().at("role") == "system") {
2803
+ if (messages.size() > 1) {
2804
+ LOG_DBG("Merging system prompt into next message\n");
2805
+ auto & first_msg = messages.front();
2806
+ auto & second_msg = messages[1];
2807
+ second_msg["content"] = first_msg.at("content").get<std::string>()
2808
+ + "\n" + second_msg.at("content").get<std::string>();
2809
+ messages.erase(messages.begin());
2810
+ } else {
2811
+ LOG_WRN("Removing system prompt due to template not supporting system role\n");
2812
+ messages.erase(messages.begin());
2813
+ }
2814
+ }
2815
+ }
2816
+
2817
+ static void func_args_not_string(json & messages) {
2818
+ GGML_ASSERT(messages.is_array());
2819
+ for (auto & message : messages) {
2820
+ if (message.contains("tool_calls")) {
2821
+ for (auto & tool_call : message["tool_calls"]) {
2822
+ if (tool_call.contains("function") && tool_call["function"].contains("arguments")) {
2823
+ auto & args = tool_call["function"]["arguments"];
2824
+ if (args.is_string()) {
2825
+ try {
2826
+ args = json::parse(args.get<std::string>());
2827
+ } catch (const std::exception & e) {
2828
+ throw std::runtime_error("Failed to parse tool call arguments as JSON: " + std::string(e.what()));
2829
+ }
2830
+ }
2831
+ }
2832
+ }
2833
+ }
2834
+ }
2835
+ }
2836
+
2837
+ static void move_tool_calls_to_content(json & messages, int indent_spaces = 2) {
2838
+ GGML_ASSERT(messages.is_array());
2839
+ for (auto & message : messages) {
2840
+ if (message.contains("tool_calls")) {
2841
+ auto tool_calls_new = json{
2842
+ {"tool_calls", message.at("tool_calls")}
2843
+ };
2844
+ message.erase("tool_calls");
2845
+ auto content = message.at("content");
2846
+ std::string content_new = content.is_null() ? "" : content.get<std::string>();
2847
+ message["content"] = content_new + tool_calls_new.dump(indent_spaces, ' ', false, json::error_handler_t::replace);
2848
+ }
2849
+ }
2850
+ }
2851
+
2852
+ // TODO @ngxson : we may remove support for generic schema in the future
2853
+ static void use_generic_schema(json & messages) {
2854
+ GGML_ASSERT(messages.is_array());
2855
+ for (auto & message : messages) {
2856
+ if (message.contains("tool_calls") && message.at("tool_calls").is_array()) {
2857
+ auto & tool_calls = message.at("tool_calls");
2858
+ for (auto & tool_call : tool_calls) {
2859
+ if (tool_call.contains("type") && tool_call.at("type") == "function" &&
2860
+ tool_call.contains("function") && tool_call.at("function").is_object()) {
2861
+ // Copy values before erasing to avoid use-after-free
2862
+ json name_value;
2863
+ json arguments_value;
2864
+ json id_value;
2865
+ const auto & function = tool_call.at("function");
2866
+ if (function.contains("name")) {
2867
+ name_value = function.at("name");
2868
+ }
2869
+ if (function.contains("arguments")) {
2870
+ arguments_value = function.at("arguments");
2871
+ }
2872
+ if (tool_call.contains("id")) {
2873
+ id_value = tool_call.at("id");
2874
+ }
2875
+ // Now safely erase and assign in the correct order
2876
+ tool_call.erase("type");
2877
+ tool_call.erase("function");
2878
+ tool_call.erase("id");
2879
+ // Reassign in desired order: name, arguments, id
2880
+ if (!name_value.is_null()) {
2881
+ tool_call["name"] = name_value;
2882
+ }
2883
+ if (!arguments_value.is_null()) {
2884
+ tool_call["arguments"] = arguments_value;
2885
+ }
2886
+ if (!id_value.is_null()) {
2887
+ tool_call["id"] = id_value;
2888
+ }
2889
+ }
2890
+ }
2891
+ }
2892
+ }
2893
+ }
2894
+
2895
+ } // namespace workaround
2896
+
2659
2897
  static common_chat_params common_chat_templates_apply_jinja(
2660
2898
  const struct common_chat_templates * tmpls,
2661
2899
  const struct common_chat_templates_inputs & inputs)
@@ -2677,6 +2915,10 @@ static common_chat_params common_chat_templates_apply_jinja(
2677
2915
  params.add_bos = tmpls->add_bos;
2678
2916
  params.add_eos = tmpls->add_eos;
2679
2917
 
2918
+ if (!tmpl.original_caps().supports_system_role) {
2919
+ workaround::system_message_not_supported(params.messages);
2920
+ }
2921
+
2680
2922
  params.extra_context = json::object();
2681
2923
  for (auto el : inputs.chat_template_kwargs) {
2682
2924
  params.extra_context[el.first] = json::parse(el.second);
@@ -2715,11 +2957,15 @@ static common_chat_params common_chat_templates_apply_jinja(
2715
2957
 
2716
2958
  // Command R7B: use handler in all cases except json schema (thinking / tools).
2717
2959
  if (src.find("<|END_THINKING|><|START_ACTION|>") != std::string::npos && params.json_schema.is_null()) {
2960
+ workaround::func_args_not_string(params.messages);
2718
2961
  return common_chat_params_init_command_r7b(tmpl, params);
2719
2962
  }
2720
2963
 
2721
2964
  // Granite (IBM) - detects thinking / tools support
2722
2965
  if (src.find("elif thinking") != std::string::npos && src.find("<|tool_call|>") != std::string::npos) {
2966
+ workaround::func_args_not_string(params.messages);
2967
+ workaround::use_generic_schema(params.messages);
2968
+ workaround::move_tool_calls_to_content(params.messages);
2723
2969
  return common_chat_params_init_granite(tmpl, params);
2724
2970
  }
2725
2971
 
@@ -2728,6 +2974,7 @@ static common_chat_params common_chat_templates_apply_jinja(
2728
2974
  src.find("<arg_key>") != std::string::npos &&
2729
2975
  src.find("<arg_value>") != std::string::npos &&
2730
2976
  params.json_schema.is_null()) {
2977
+ workaround::func_args_not_string(params.messages);
2731
2978
  return common_chat_params_init_glm_4_5(tmpl, params);
2732
2979
  }
2733
2980
 
@@ -2739,6 +2986,7 @@ static common_chat_params common_chat_templates_apply_jinja(
2739
2986
  src.find("<function=") != std::string::npos &&
2740
2987
  src.find("<parameters>") != std::string::npos &&
2741
2988
  src.find("<parameter=") != std::string::npos) {
2989
+ workaround::func_args_not_string(params.messages);
2742
2990
  // Nemotron 3 Nano 30B A3B
2743
2991
  if (src.find("<think>") != std::string::npos) {
2744
2992
  return common_chat_params_init_nemotron_v3(tmpl, params);
@@ -2775,6 +3023,7 @@ static common_chat_params common_chat_templates_apply_jinja(
2775
3023
 
2776
3024
  // Seed-OSS
2777
3025
  if (src.find("<seed:think>") != std::string::npos) {
3026
+ workaround::func_args_not_string(params.messages);
2778
3027
  return common_chat_params_init_seed_oss(tmpl, params, inputs);
2779
3028
  }
2780
3029
 
@@ -2796,6 +3045,7 @@ static common_chat_params common_chat_templates_apply_jinja(
2796
3045
 
2797
3046
  // MiniMax-M2 format detection
2798
3047
  if (src.find("]~!b[") != std::string::npos && src.find("]~b]") != std::string::npos) {
3048
+ workaround::func_args_not_string(params.messages);
2799
3049
  return common_chat_params_init_minimax_m2(tmpl, params);
2800
3050
  }
2801
3051
 
@@ -2842,6 +3092,7 @@ static common_chat_params common_chat_templates_apply_jinja(
2842
3092
  // Llama 3.1, 3.2, 3.3 (also requires date_string so using it even w/o tools)
2843
3093
  if (src.find("<|start_header_id|>ipython<|end_header_id|>") != std::string::npos) {
2844
3094
  auto allow_python_tag_builtin_tools = src.find("<|python_tag|>") != std::string::npos;
3095
+ workaround::func_args_not_string(params.messages);
2845
3096
  return common_chat_params_init_llama_3_x(tmpl, params, allow_python_tag_builtin_tools);
2846
3097
  }
2847
3098
 
@@ -2870,10 +3121,14 @@ static common_chat_params common_chat_templates_apply_jinja(
2870
3121
 
2871
3122
  // Mistral Nemo (w/ tools)
2872
3123
  if (src.find("[TOOL_CALLS]") != std::string::npos) {
3124
+ workaround::func_args_not_string(params.messages);
2873
3125
  return common_chat_params_init_mistral_nemo(tmpl, params);
2874
3126
  }
2875
3127
 
2876
3128
  // Generic fallback
3129
+ workaround::func_args_not_string(params.messages);
3130
+ workaround::use_generic_schema(params.messages);
3131
+ workaround::move_tool_calls_to_content(params.messages);
2877
3132
  return common_chat_params_init_generic(tmpl, params);
2878
3133
  }
2879
3134
 
@@ -10,18 +10,7 @@
10
10
  #include <vector>
11
11
  #include <map>
12
12
 
13
- #include "minja/chat-template.hpp"
14
- #include "minja/minja.hpp"
15
-
16
- typedef minja::chat_template common_chat_template;
17
-
18
- struct common_chat_templates {
19
- bool add_bos;
20
- bool add_eos;
21
- bool has_explicit_template; // Model had builtin template or template overridde was specified.
22
- std::unique_ptr<common_chat_template> template_default; // always set (defaults to chatml)
23
- std::unique_ptr<common_chat_template> template_tool_use;
24
- };
13
+ struct common_chat_templates;
25
14
 
26
15
  struct common_chat_tool_call {
27
16
  std::string name;
@@ -202,7 +191,7 @@ common_chat_templates_ptr common_chat_templates_init(
202
191
  const std::string & eos_token_override = "");
203
192
 
204
193
  bool common_chat_templates_was_explicit(const struct common_chat_templates * tmpls);
205
- const char * common_chat_templates_source(const struct common_chat_templates * tmpls, const char * variant = nullptr);
194
+ std::string common_chat_templates_source(const struct common_chat_templates * tmpls, const std::string & variant = "");
206
195
 
207
196
 
208
197
  struct common_chat_params common_chat_templates_apply(
@@ -233,6 +222,20 @@ common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::strin
233
222
 
234
223
  bool common_chat_templates_support_enable_thinking(const common_chat_templates * chat_templates);
235
224
 
225
+ // Template capabilities structure (for exposing capabilities to external code)
226
+ struct common_chat_template_caps {
227
+ bool supports_tools = true;
228
+ bool supports_tool_calls = true;
229
+ bool supports_system_role = true;
230
+ bool supports_parallel_tool_calls = true;
231
+ };
232
+
233
+ // Get template capabilities for a specific variant ("tool_use" for the tool_use template; any other value, including "", selects the default template)
234
+ common_chat_template_caps common_chat_templates_get_caps(const struct common_chat_templates * tmpls, const std::string & variant = "");
235
+
236
+ // Check if a template variant exists
237
+ bool common_chat_templates_has_variant(const struct common_chat_templates * tmpls, const std::string & variant);
238
+
236
239
  // Parses a JSON array of messages in OpenAI's chat completion API format.
237
240
  // T can be std::string containing JSON or nlohmann::ordered_json
238
241
  template <class T> std::vector<common_chat_msg> common_chat_msgs_parse_oaicompat(const T & messages);
@@ -1172,7 +1172,6 @@ common_init_result::common_init_result(common_params & params) :
1172
1172
  pimpl->samplers_seq_config[i] = { i, common_sampler_get(pimpl->samplers[i].get()) };
1173
1173
  }
1174
1174
 
1175
- // TODO: temporarily gated behind a flag
1176
1175
  if (params.sampling.backend_sampling) {
1177
1176
  cparams.samplers = pimpl->samplers_seq_config.data();
1178
1177
  cparams.n_samplers = pimpl->samplers_seq_config.size();
@@ -119,6 +119,7 @@ enum common_sampler_type {
119
119
  COMMON_SAMPLER_TYPE_INFILL = 9,
120
120
  COMMON_SAMPLER_TYPE_PENALTIES = 10,
121
121
  COMMON_SAMPLER_TYPE_TOP_N_SIGMA = 11,
122
+ COMMON_SAMPLER_TYPE_ADAPTIVE_P = 12,
122
123
  };
123
124
 
124
125
  // dimensionality reduction methods, used by cvector-generator
@@ -166,32 +167,34 @@ enum common_params_sampling_config : uint64_t {
166
167
  struct common_params_sampling {
167
168
  uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler
168
169
 
169
- int32_t n_prev = 64; // number of previous tokens to remember
170
- int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
171
- int32_t min_keep = 0; // 0 = disabled, otherwise samplers should return at least min_keep tokens
172
- int32_t top_k = 40; // <= 0 to use vocab size
173
- float top_p = 0.95f; // 1.0 = disabled
174
- float min_p = 0.05f; // 0.0 = disabled
175
- float xtc_probability = 0.00f; // 0.0 = disabled
176
- float xtc_threshold = 0.10f; // > 0.5 disables XTC
177
- float typ_p = 1.00f; // typical_p, 1.0 = disabled
178
- float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
179
- float dynatemp_range = 0.00f; // 0.0 = disabled
180
- float dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler
181
- int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
182
- float penalty_repeat = 1.00f; // 1.0 = disabled
183
- float penalty_freq = 0.00f; // 0.0 = disabled
184
- float penalty_present = 0.00f; // 0.0 = disabled
185
- float dry_multiplier = 0.0f; // 0.0 = disabled; DRY repetition penalty for tokens extending repetition:
186
- float dry_base = 1.75f; // 0.0 = disabled; multiplier * base ^ (length of sequence before token - allowed length)
187
- int32_t dry_allowed_length = 2; // tokens extending repetitions beyond this receive penalty
188
- int32_t dry_penalty_last_n = -1; // how many tokens to scan for repetitions (0 = disable penalty, -1 = context size)
189
- int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
190
- float top_n_sigma = -1.00f;// -1.0 = disabled
191
- float mirostat_tau = 5.00f; // target entropy
192
- float mirostat_eta = 0.10f; // learning rate
170
+ int32_t n_prev = 64; // number of previous tokens to remember
171
+ int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
172
+ int32_t min_keep = 0; // 0 = disabled, otherwise samplers should return at least min_keep tokens
173
+ int32_t top_k = 40; // <= 0 to use vocab size
174
+ float top_p = 0.95f; // 1.0 = disabled
175
+ float min_p = 0.05f; // 0.0 = disabled
176
+ float xtc_probability = 0.00f; // 0.0 = disabled
177
+ float xtc_threshold = 0.10f; // > 0.5 disables XTC
178
+ float typ_p = 1.00f; // typical_p, 1.0 = disabled
179
+ float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
180
+ float dynatemp_range = 0.00f; // 0.0 = disabled
181
+ float dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler
182
+ int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
183
+ float penalty_repeat = 1.00f; // 1.0 = disabled
184
+ float penalty_freq = 0.00f; // 0.0 = disabled
185
+ float penalty_present = 0.00f; // 0.0 = disabled
186
+ float dry_multiplier = 0.0f; // 0.0 = disabled; DRY repetition penalty for tokens extending repetition:
187
+ float dry_base = 1.75f; // 0.0 = disabled; multiplier * base ^ (length of sequence before token - allowed length)
188
+ int32_t dry_allowed_length = 2; // tokens extending repetitions beyond this receive penalty
189
+ int32_t dry_penalty_last_n = -1; // how many tokens to scan for repetitions (0 = disable penalty, -1 = context size)
190
+ float adaptive_target = -1.0f; // select tokens near this probability (valid range 0.0 to 1.0; negative = disabled)
191
+ float adaptive_decay = 0.90f; // EMA decay for adaptation; history ≈ 1/(1-decay) tokens (0.0 - 0.99)
192
+ int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
193
+ float top_n_sigma = -1.00f; // -1.0 = disabled
194
+ float mirostat_tau = 5.00f; // target entropy
195
+ float mirostat_eta = 0.10f; // learning rate
193
196
  bool ignore_eos = false;
194
- bool no_perf = false; // disable performance metrics
197
+ bool no_perf = false; // disable performance metrics
195
198
  bool timing_per_token = false;
196
199
 
197
200
  uint64_t user_sampling_config = 0; // bitfield to track user-specified samplers