@fugood/llama.node 1.4.5 → 1.4.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/lib/binding.ts CHANGED

@@ -309,6 +309,45 @@ export type BackendDeviceInfo = {
   metadata?: Record<string, any>
 }
 
+export type BenchResult = {
+  /** Maximum KV cache size */
+  nKvMax: number
+  /** Batch size */
+  nBatch: number
+  /** Micro-batch size */
+  nUBatch: number
+  /** Flash attention type (0=disabled, 1=enabled, 2=auto) */
+  flashAttn: number
+  /** Whether prompt processing is shared */
+  isPpShared: boolean
+  /** Number of GPU layers */
+  nGpuLayers: number
+  /** Number of threads */
+  nThreads: number
+  /** Number of threads for batch processing */
+  nThreadsBatch: number
+  /** Prompt processing tokens count */
+  pp: number
+  /** Text generation tokens count */
+  tg: number
+  /** Parallel level */
+  pl: number
+  /** KV cache used */
+  nKv: number
+  /** Time for prompt processing (ms) */
+  tPp: number
+  /** Speed of prompt processing (tokens/sec) */
+  speedPp: number
+  /** Time for text generation (ms) */
+  tTg: number
+  /** Speed of text generation (tokens/sec) */
+  speedTg: number
+  /** Total time (ms) */
+  t: number
+  /** Overall speed (tokens/sec) */
+  speed: number
+}
+
 export type ModelInfo = {
   desc: string
   nEmbd: number

@@ -573,6 +612,16 @@ export interface LlamaContext {
    */
   clearCache(clearData?: boolean): void
 
+  /**
+   * Run a benchmark to measure model performance
+   * @param pp Number of tokens to process for prompt processing benchmark
+   * @param tg Number of tokens to generate for text generation benchmark
+   * @param pl Parallel level (number of sequences)
+   * @param nr Number of repetitions
+   * @returns Benchmark results
+   */
+  bench(pp: number, tg: number, pl: number, nr: number): Promise<BenchResult>
+
   // static
   loadModelInfo(path: string, skip: string[]): Promise<GGUFModelInfo>
   toggleNativeLog(
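The new `bench` API declared above can be exercised roughly as follows. This is a minimal sketch, not taken from the package: the model path and `loadModel` options are illustrative assumptions; only `bench()` and the `BenchResult` fields come from this release.

```ts
import { loadModel } from '@fugood/llama.node'

// Hypothetical model path/options; adjust to your setup.
const ctx = await loadModel({ model: './model.gguf' })

// 512 prompt tokens, 128 generated tokens, 1 parallel sequence, 3 repetitions
const r = await ctx.bench(512, 128, 1, 3)
console.log(`pp: ${r.speedPp.toFixed(1)} t/s, tg: ${r.speedTg.toFixed(1)} t/s`)
```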
package/lib/index.js CHANGED

@@ -204,6 +204,19 @@ class LlamaContextWrapper {
   clearCache(clearData) {
     this.ctx.clearCache(clearData);
   }
+  /**
+   * Run a benchmark to measure model performance
+   * @param pp Number of tokens to process for prompt processing benchmark
+   * @param tg Number of tokens to generate for text generation benchmark
+   * @param pl Parallel level (number of sequences)
+   * @param nr Number of repetitions
+   * @returns Benchmark results including timing and speed metrics
+   */
+  bench(pp, tg, pl, nr) {
+    return __awaiter(this, void 0, void 0, function* () {
+      return this.ctx.bench(pp, tg, pl, nr);
+    });
+  }
 }
 const loadModel = (options, onProgress) => __awaiter(void 0, void 0, void 0, function* () {
   var _a, _b;
package/lib/index.ts CHANGED

@@ -16,6 +16,7 @@ import type {
   JinjaFormattedChatResult,
   Tool,
   GGUFModelInfo,
+  BenchResult,
 } from './binding'
 import { BUILD_NUMBER, BUILD_COMMIT } from './version'
 import { LlamaParallelAPI } from './parallel'

@@ -309,6 +310,18 @@ class LlamaContextWrapper {
   clearCache(clearData?: boolean): void {
     this.ctx.clearCache(clearData)
   }
+
+  /**
+   * Run a benchmark to measure model performance
+   * @param pp Number of tokens to process for prompt processing benchmark
+   * @param tg Number of tokens to generate for text generation benchmark
+   * @param pl Parallel level (number of sequences)
+   * @param nr Number of repetitions
+   * @returns Benchmark results including timing and speed metrics
+   */
+  async bench(pp: number, tg: number, pl: number, nr: number): Promise<BenchResult> {
+    return this.ctx.bench(pp, tg, pl, nr)
+  }
 }
 
 export const loadModel = async (
package/package.json CHANGED

@@ -1,7 +1,7 @@
 {
   "name": "@fugood/llama.node",
   "access": "public",
-  "version": "1.4.5",
+  "version": "1.4.7",
   "description": "An another Node binding of llama.cpp",
   "main": "lib/index.js",
   "scripts": {

@@ -72,20 +72,20 @@
     "CMakeLists.txt"
   ],
   "optionalDependencies": {
-    "@fugood/node-llama-darwin-arm64": "1.4.5",
-    "@fugood/node-llama-darwin-x64": "1.4.5",
-    "@fugood/node-llama-linux-arm64": "1.4.5",
-    "@fugood/node-llama-linux-arm64-cuda": "1.4.5",
-    "@fugood/node-llama-linux-arm64-snapdragon": "1.4.5",
-    "@fugood/node-llama-linux-arm64-vulkan": "1.4.5",
-    "@fugood/node-llama-linux-x64": "1.4.5",
-    "@fugood/node-llama-linux-x64-cuda": "1.4.5",
-    "@fugood/node-llama-linux-x64-vulkan": "1.4.5",
-    "@fugood/node-llama-win32-arm64": "1.4.5",
-    "@fugood/node-llama-win32-arm64-vulkan": "1.4.5",
-    "@fugood/node-llama-win32-x64": "1.4.5",
-    "@fugood/node-llama-win32-x64-cuda": "1.4.5",
-    "@fugood/node-llama-win32-x64-vulkan": "1.4.5"
+    "@fugood/node-llama-darwin-arm64": "1.4.7",
+    "@fugood/node-llama-darwin-x64": "1.4.7",
+    "@fugood/node-llama-linux-arm64": "1.4.7",
+    "@fugood/node-llama-linux-arm64-cuda": "1.4.7",
+    "@fugood/node-llama-linux-arm64-snapdragon": "1.4.7",
+    "@fugood/node-llama-linux-arm64-vulkan": "1.4.7",
+    "@fugood/node-llama-linux-x64": "1.4.7",
+    "@fugood/node-llama-linux-x64-cuda": "1.4.7",
+    "@fugood/node-llama-linux-x64-vulkan": "1.4.7",
+    "@fugood/node-llama-win32-arm64": "1.4.7",
+    "@fugood/node-llama-win32-arm64-vulkan": "1.4.7",
+    "@fugood/node-llama-win32-x64": "1.4.7",
+    "@fugood/node-llama-win32-x64-cuda": "1.4.7",
+    "@fugood/node-llama-win32-x64-vulkan": "1.4.7"
   },
   "devDependencies": {
     "@babel/preset-env": "^7.24.4",
@@ -35,10 +35,10 @@ index 74a7b6a46..7b7a1bd50 100644
  while (!sv.empty() && std::isspace(static_cast<unsigned char>(sv.back()))) {
      sv.remove_suffix(1);
 diff --git a/src/llama.cpp/common/chat.cpp b/src/llama.cpp/common/chat.cpp
-index 41a5bb42d..da5cf4b94 100644
+index c371edaa5..ec032e351 100644
 --- a/src/llama.cpp/common/chat.cpp
 +++ b/src/llama.cpp/common/chat.cpp
-@@ -6,9 +6,6 @@
+@@ -7,9 +7,6 @@
  #include "log.h"
  #include "regex-partial.h"
 
@@ -48,7 +48,7 @@ index 41a5bb42d..da5cf4b94 100644
  #include <algorithm>
  #include <cstdio>
  #include <cctype>
-@@ -134,16 +131,6 @@ std::vector<common_chat_msg_diff> common_chat_msg_diff::compute_diffs(const comm
+@@ -135,16 +132,6 @@ std::vector<common_chat_msg_diff> common_chat_msg_diff::compute_diffs(const comm
  return diffs;
  }
 
@@ -65,7 +65,7 @@ index 41a5bb42d..da5cf4b94 100644
  struct templates_params {
      json messages;
      json tools;
-@@ -720,7 +707,7 @@ static std::string apply(
+@@ -732,7 +719,7 @@ static std::string apply(
      tmpl_inputs.extra_context.merge_patch(*additional_context);
  }
  // TODO: add flag to control date/time, if only for testing purposes.
@@ -99,10 +99,10 @@ index 6085510a4..263076ce2 100644
  struct common_chat_tool_call {
      std::string name;
 diff --git a/src/llama.cpp/common/common.cpp b/src/llama.cpp/common/common.cpp
-index f07af1d86..1b10c7b13 100644
+index 0497f90a2..29b36f3fe 100644
 --- a/src/llama.cpp/common/common.cpp
 +++ b/src/llama.cpp/common/common.cpp
-@@ -1236,6 +1236,7 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
+@@ -1280,6 +1280,7 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
      mparams.n_gpu_layers = params.n_gpu_layers;
  }
 
@@ -111,7 +111,7 @@ index f07af1d86..1b10c7b13 100644
  mparams.split_mode = params.split_mode;
  mparams.tensor_split = params.tensor_split;
 diff --git a/src/llama.cpp/common/common.h b/src/llama.cpp/common/common.h
-index 179113a4d..78aa24bc3 100644
+index d28e48991..562203d02 100644
 --- a/src/llama.cpp/common/common.h
 +++ b/src/llama.cpp/common/common.h
 @@ -302,6 +302,7 @@ struct lr_opt {
@@ -123,7 +123,7 @@ index 179113a4d..78aa24bc3 100644
  int32_t n_ctx = 4096; // context size
  int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
 diff --git a/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt b/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt
-index 7e53a57b7..a328d4db4 100644
+index fc31089f3..aa9befe4c 100644
 --- a/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt
 +++ b/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt
 @@ -106,7 +106,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
@@ -203,6 +203,9 @@ void LlamaContext::Init(Napi::Env env, Napi::Object &exports) {
            static_cast<napi_property_attributes>(napi_enumerable)),
        InstanceMethod<&LlamaContext::ClearCache>(
            "clearCache",
+           static_cast<napi_property_attributes>(napi_enumerable)),
+       InstanceMethod<&LlamaContext::Bench>(
+           "bench",
            static_cast<napi_property_attributes>(napi_enumerable))});
   Napi::FunctionReference *constructor = new Napi::FunctionReference();
   *constructor = Napi::Persistent(func);
@@ -1529,3 +1532,69 @@ void LlamaContext::ClearCache(const Napi::CallbackInfo &info) {
 
   _rn_ctx->clearCache(clear_data);
 }
+
+// bench(pp: number, tg: number, pl: number, nr: number): Promise<BenchResult>
+Napi::Value LlamaContext::Bench(const Napi::CallbackInfo &info) {
+  Napi::Env env = info.Env();
+
+  if (info.Length() < 4) {
+    Napi::TypeError::New(env, "Expected 4 arguments: pp, tg, pl, nr")
+        .ThrowAsJavaScriptException();
+    return env.Undefined();
+  }
+
+  if (!_rn_ctx) {
+    Napi::TypeError::New(env, "Context is disposed").ThrowAsJavaScriptException();
+    return env.Undefined();
+  }
+
+  if (!_rn_ctx->completion) {
+    Napi::TypeError::New(env, "Completion context not initialized")
+        .ThrowAsJavaScriptException();
+    return env.Undefined();
+  }
+
+  int pp = info[0].ToNumber().Int32Value();
+  int tg = info[1].ToNumber().Int32Value();
+  int pl = info[2].ToNumber().Int32Value();
+  int nr = info[3].ToNumber().Int32Value();
+
+  std::string result;
+  try {
+    result = _rn_ctx->completion->bench(pp, tg, pl, nr);
+  } catch (const std::exception &e) {
+    Napi::Error::New(env, e.what()).ThrowAsJavaScriptException();
+    return env.Undefined();
+  }
+
+  // Parse the JSON result and return as object
+  try {
+    auto parsed = json::parse(result);
+    Napi::Object benchResult = Napi::Object::New(env);
+
+    benchResult.Set("nKvMax", Napi::Number::New(env, parsed["n_kv_max"].get<int>()));
+    benchResult.Set("nBatch", Napi::Number::New(env, parsed["n_batch"].get<int>()));
+    benchResult.Set("nUBatch", Napi::Number::New(env, parsed["n_ubatch"].get<int>()));
+    benchResult.Set("flashAttn", Napi::Number::New(env, parsed["flash_attn"].get<int>()));
+    benchResult.Set("isPpShared", Napi::Boolean::New(env, parsed["is_pp_shared"].get<int>() != 0));
+    benchResult.Set("nGpuLayers", Napi::Number::New(env, parsed["n_gpu_layers"].get<int>()));
+    benchResult.Set("nThreads", Napi::Number::New(env, parsed["n_threads"].get<int>()));
+    benchResult.Set("nThreadsBatch", Napi::Number::New(env, parsed["n_threads_batch"].get<int>()));
+    benchResult.Set("pp", Napi::Number::New(env, parsed["pp"].get<int>()));
+    benchResult.Set("tg", Napi::Number::New(env, parsed["tg"].get<int>()));
+    benchResult.Set("pl", Napi::Number::New(env, parsed["pl"].get<int>()));
+    benchResult.Set("nKv", Napi::Number::New(env, parsed["n_kv"].get<int>()));
+    benchResult.Set("tPp", Napi::Number::New(env, parsed["t_pp"].get<double>()));
+    benchResult.Set("speedPp", Napi::Number::New(env, parsed["speed_pp"].get<double>()));
+    benchResult.Set("tTg", Napi::Number::New(env, parsed["t_tg"].get<double>()));
+    benchResult.Set("speedTg", Napi::Number::New(env, parsed["speed_tg"].get<double>()));
+    benchResult.Set("t", Napi::Number::New(env, parsed["t"].get<double>()));
+    benchResult.Set("speed", Napi::Number::New(env, parsed["speed"].get<double>()));
+
+    return benchResult;
+  } catch (const std::exception &e) {
+    Napi::Error::New(env, std::string("Failed to parse benchmark result: ") + e.what())
+        .ThrowAsJavaScriptException();
+    return env.Undefined();
+  }
+}
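As the hunk above shows, the native layer parses the JSON string returned by `bench` and maps its snake_case keys (`n_kv_max`, `speed_pp`, ...) onto the camelCase `BenchResult` fields. A hedged consumer-side sketch of those fields (the `summarize` helper and the import path are assumptions, not part of this diff):

```ts
import type { BenchResult } from '@fugood/llama.node/lib/binding'

// Hypothetical helper: render a BenchResult using the field meanings
// documented in binding.ts (flashAttn: 0=disabled, 1=enabled, 2=auto).
function summarize(r: BenchResult): string {
  const fa = ['disabled', 'enabled', 'auto'][r.flashAttn] ?? 'unknown'
  return [
    `flash_attn=${fa} gpu_layers=${r.nGpuLayers} threads=${r.nThreads}`,
    `pp: ${r.pp} tokens in ${r.tPp.toFixed(1)} ms (${r.speedPp.toFixed(1)} t/s)`,
    `tg: ${r.tg} tokens in ${r.tTg.toFixed(1)} ms (${r.speedTg.toFixed(1)} t/s)`,
    `total: ${r.t.toFixed(1)} ms (${r.speed.toFixed(1)} t/s)`,
  ].join('\n')
}
```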
@@ -72,6 +72,9 @@ private:
   // Cache management
   void ClearCache(const Napi::CallbackInfo &info);
 
+  // Benchmarking
+  Napi::Value Bench(const Napi::CallbackInfo &info);
+
   std::string _info;
   std::vector<std::string> _used_devices;
   Napi::Object _meta;
@@ -724,16 +724,10 @@ inline void parse_msg_with_xml_tool_calls(common_chat_msg_parser & builder, cons
     if (reasoning_unclosed) {
         if (auto pos = content.find(end_think); pos == std::string::npos && builder.pos() != builder.input().size()) {
             unclosed_reasoning_content += content;
-            if (form.allow_toolcall_in_think) {
-                builder.move_to(tc->groups[0].begin);
-                if (!builder.try_consume_xml_tool_calls(form)) {
-                    unclosed_reasoning_content += tool_call_start;
-                    builder.move_to(tc->groups[0].end);
-                }
-            } else {
+            if (!(form.allow_toolcall_in_think && tc)) {
                 unclosed_reasoning_content += tool_call_start;
+                continue;
             }
-            continue;
         } else {
             reasoning_unclosed = false;
             std::string reasoning_content;

@@ -781,8 +775,12 @@ inline void parse_msg_with_xml_tool_calls(common_chat_msg_parser & builder, cons
             }
         } else {
             // This <tool_call> start is in thinking block, skip this tool call
-            auto pos = think_start + start_think.size();
-            unclosed_reasoning_content = content.substr(pos) + tool_call_start;
+            // This <tool_call> start is in thinking block
+            if (form.allow_toolcall_in_think) {
+                unclosed_reasoning_content = content.substr(think_start + start_think.size());
+            } else {
+                unclosed_reasoning_content = content.substr(think_start + start_think.size()) + tool_call_start;
+            }
             reasoning_unclosed = true;
             content.resize(think_start);
             toolcall_in_think = true;
@@ -805,14 +803,35 @@ inline void parse_msg_with_xml_tool_calls(common_chat_msg_parser & builder, cons
     }
 
     // remove potential partial suffix
-    if (content.size() > 0 && builder.pos() == builder.input().size() && unclosed_reasoning_content.empty()) {
-        rstrip(content);
-        trim_potential_partial_word(content);
-        rstrip(content);
+    if (builder.pos() == builder.input().size()) {
+        if (unclosed_reasoning_content.empty()) {
+            rstrip(content);
+            trim_potential_partial_word(content);
+            rstrip(content);
+        } else {
+            rstrip(unclosed_reasoning_content);
+            trim_potential_partial_word(unclosed_reasoning_content);
+            rstrip(unclosed_reasoning_content);
+        }
+    }
+
+    // consume unclosed_reasoning_content if allow_toolcall_in_think is set
+    if (form.allow_toolcall_in_think && !unclosed_reasoning_content.empty()) {
+        if (builder.syntax().reasoning_format != COMMON_REASONING_FORMAT_NONE && !builder.syntax().reasoning_in_content) {
+            builder.add_reasoning_content(unclosed_reasoning_content);
+        } else {
+            if (content.empty()) {
+                content = start_think + unclosed_reasoning_content;
+            } else {
+                content += "\n\n" + start_think;
+                content += unclosed_reasoning_content;
+            }
+        }
+        unclosed_reasoning_content.clear();
     }
 
     // Add content
-    if (content.size() != 0) {
+    if (!content.empty()) {
         // If there are multiple content blocks
         if (builder.syntax().reasoning_format != COMMON_REASONING_FORMAT_NONE && !builder.syntax().reasoning_in_content && builder.result().content.size() != 0) {
             builder.add_content("\n\n");

@@ -820,7 +839,7 @@ inline void parse_msg_with_xml_tool_calls(common_chat_msg_parser & builder, cons
         builder.add_content(content);
     }
 
-    // This <tool_call> start is in thinking block, skip this tool call
+    // This <tool_call> start is in thinking block and toolcall_in_think not set, skip this tool call
     if (toolcall_in_think && !form.allow_toolcall_in_think) {
         continue;
     }

@@ -829,7 +848,7 @@ inline void parse_msg_with_xml_tool_calls(common_chat_msg_parser & builder, cons
     if (!tc) {
         GGML_ASSERT(builder.pos() == builder.input().size());
         GGML_ASSERT(unclosed_reasoning_content.empty());
-        GGML_ASSERT(!reasoning_unclosed);
+        if (!form.allow_toolcall_in_think) GGML_ASSERT(!reasoning_unclosed);
         break;
     }
 

@@ -854,7 +873,6 @@ inline void parse_msg_with_xml_tool_calls(common_chat_msg_parser & builder, cons
 
 /**
  * Parse content uses reasoning and XML-Style tool call
- * TODO: Note that form.allow_toolcall_in_think is not tested yet. If anyone confirms it works, this comment can be removed.
  */
 void common_chat_msg_parser::consume_reasoning_with_xml_tool_calls(const struct xml_tool_call_format & form, const std::string & start_think, const std::string & end_think) {
     parse_msg_with_xml_tool_calls(*this, form, start_think, end_think);
@@ -31,7 +31,7 @@ struct xml_tool_call_format {
     std::optional<std::string> last_val_end = std::nullopt;
     std::optional<std::string> last_tool_end = std::nullopt;
     bool trim_raw_argval = false;
-    bool allow_toolcall_in_think = false; // TODO: UNTESTED!!!
+    bool allow_toolcall_in_think = false;
 };
 
 // make a GBNF that accept any strings except those containing any of the forbidden strings.

@@ -917,12 +917,13 @@ static void common_chat_parse_kimi_k2(common_chat_msg_parser & builder) {
         form.tool_start = "<|tool_call_begin|>";
         form.tool_sep = "<|tool_call_argument_begin|>{";
         form.key_start = "\"";
-        form.key_val_sep = "\": ";
-        form.val_end = ", ";
+        form.key_val_sep = "\":";
+        form.val_end = ",";
         form.tool_end = "}<|tool_call_end|>";
         form.scope_end = "<|tool_calls_section_end|>";
         form.raw_argval = false;
         form.last_val_end = "";
+        form.allow_toolcall_in_think = true;
         return form;
     })();
     builder.consume_reasoning_with_xml_tool_calls(form, "<think>", "</think>");
@@ -1,5 +1,6 @@
 #include "chat.h"
 #include "chat-parser.h"
+#include "chat-peg-parser.h"
 #include "common.h"
 #include "json-partial.h"
 #include "json-schema-to-grammar.h"

@@ -137,6 +138,7 @@ struct templates_params {
     common_chat_tool_choice tool_choice;
     json json_schema;
     bool parallel_tool_calls;
+    common_reasoning_format reasoning_format;
     bool stream;
     std::string grammar;
     bool add_generation_prompt = true;

@@ -576,6 +578,16 @@ common_chat_templates_ptr common_chat_templates_init(
             "{%- if false %}");
     }
 
+    // TODO @aldehir : this is a temporary fix, pending Minja changes
+    // Ref: https://github.com/ggml-org/llama.cpp/pull/17713#issuecomment-3631342664
+    if (default_template_src.find("[TOOL_CALLS]") != std::string::npos
+        // search for the error message and patch it
+        && default_template_src.find("if (message['content'] is none or") != std::string::npos) {
+        string_replace_all(default_template_src,
+            "{%- if (message['content'] is none or message['content'] == '' or message['content']|length == 0) and (message['tool_calls'] is not defined or message['tool_calls'] is none or message['tool_calls']|length == 0) %}",
+            "{%- if false %}");
+    }
+
     std::string token_bos = bos_token_override;
     std::string token_eos = eos_token_override;
     bool add_bos = false;
@@ -974,6 +986,118 @@ static common_chat_params common_chat_params_init_lfm2(const common_chat_templat
     return data;
 }
 
+static common_chat_params common_chat_params_init_ministral_3(const common_chat_template & tmpl, const struct templates_params & inputs) {
+    common_chat_params data;
+
+    // Build up messages to follow the format: https://huggingface.co/mistralai/Ministral-3-14B-Reasoning-2512/blob/main/chat_template.jinja
+    auto adjusted_messages = json::array();
+    for (const auto & msg : inputs.messages) {
+        auto role = msg.value("role", "");
+        if (role != "system" && role != "assistant") {
+            // Only adjust system and assistant messages. Interestingly, the system message may contain thinking.
+            adjusted_messages.push_back(msg);
+            continue;
+        }
+
+        auto content = json::array();
+
+        // If message contains `reasoning_content`, add it as a block of type `thinking`
+        if (msg.contains("reasoning_content") && msg.at("reasoning_content").is_string()) {
+            content.push_back({
+                {"type", "thinking"},
+                {"thinking", msg.at("reasoning_content").get<std::string>()},
+            });
+        }
+
+        // If message contains `content`, add it as a block of type `text`
+        if (msg.contains("content")) {
+            if (msg.at("content").is_string()) {
+                content.push_back({
+                    {"type", "text"},
+                    {"text", msg.at("content").get<std::string>()},
+                });
+            } else if (msg.at("content").is_array()) {
+                auto blocks = msg.at("content");
+                content.insert(content.end(), blocks.begin(), blocks.end());
+            }
+        }
+
+        auto adjusted = msg;
+        adjusted["content"] = content;
+        adjusted.erase("reasoning_content");
+        adjusted_messages.push_back(adjusted);
+    }
+
+    auto has_tools = inputs.tools.is_array() && !inputs.tools.empty();
+    auto extract_reasoning = inputs.reasoning_format != COMMON_REASONING_FORMAT_NONE;
+    auto include_grammar = true;
+
+    data.prompt = apply(tmpl, inputs, /* messages_override = */ adjusted_messages);
+    data.format = COMMON_CHAT_FORMAT_PEG_NATIVE;
+    data.preserved_tokens = {
+        "[THINK]",
+        "[/THINK]",
+        "[TOOL_CALLS]",
+        "[ARGS]",
+    };
+
+    auto parser = build_chat_peg_native_parser([&](common_chat_peg_native_builder & p) {
+        auto reasoning = extract_reasoning ? p.optional("[THINK]" + p.reasoning(p.until("[/THINK]")) + "[/THINK]") : p.eps();
+
+        // Response format parser
+        if (inputs.json_schema.is_object() && !inputs.json_schema.empty()) {
+            // Ministral wants to emit json surrounded by code fences
+            return reasoning << "```json" << p.content(p.schema(p.json(), "response-format", inputs.json_schema)) << "```";
+        }
+
+        // Tool call parser
+        if (has_tools && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE) {
+            auto tool_choice = p.choice();
+            foreach_function(inputs.tools, [&](const json & tool) {
+                const auto & function = tool.at("function");
+                std::string name = function.at("name");
+                const auto & schema = function.at("parameters");
+
+                tool_choice |= p.rule("tool-" + name,
+                    p.tool_open(p.tool_name(p.literal(name)) + "[ARGS]")
+                    + p.tool_args(p.schema(p.json(), "tool-" + name + "-schema", schema))
+                );
+            });
+
+            auto min_calls = inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED ? 1 : 0;
+            auto max_calls = inputs.parallel_tool_calls ? -1 : 1;
+            auto tool_calls = p.trigger_rule("tool-call", p.repeat("[TOOL_CALLS]" + tool_choice, min_calls, max_calls));
+
+            return reasoning << p.content(p.until("[TOOL_CALLS]")) << tool_calls;
+        }
+
+        // Content only parser
+        include_grammar = false;
+        return reasoning << p.content(p.rest());
+    });
+
+    data.parser = parser.save();
+
+    if (include_grammar) {
+        data.grammar_lazy = has_tools && inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_AUTO;
+
+        data.grammar = build_grammar([&](const common_grammar_builder & builder) {
+            foreach_function(inputs.tools, [&](const json & tool) {
+                const auto & function = tool.at("function");
+                auto schema = function.at("parameters");
+                builder.resolve_refs(schema);
+            });
+            parser.build_grammar(builder, data.grammar_lazy);
+        });
+
+        data.grammar_triggers = {
+            {COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "[TOOL_CALLS]"}
+        };
+    }
+
+    return data;
+}
+
 static common_chat_params common_chat_params_init_magistral(const common_chat_template & tmpl, const struct templates_params & inputs) {
     common_chat_params data;
     data.prompt = apply(tmpl, inputs);
@@ -2328,6 +2452,7 @@ static common_chat_params common_chat_templates_apply_jinja(
     params.messages = common_chat_msgs_to_json_oaicompat<json>(inputs.messages, /* concat_text= */ !tmpl.original_caps().requires_typed_content);
     params.add_generation_prompt = inputs.add_generation_prompt;
     params.tool_choice = inputs.tool_choice;
+    params.reasoning_format = inputs.reasoning_format;
     params.enable_thinking = inputs.enable_thinking;
     params.grammar = inputs.grammar;
     params.now = inputs.now;

@@ -2491,6 +2616,13 @@ static common_chat_params common_chat_templates_apply_jinja(
         return common_chat_params_init_llama_3_x(tmpl, params, allow_python_tag_builtin_tools);
     }
 
+    // Ministral/Mistral Large 3
+    if (src.find("[SYSTEM_PROMPT]") != std::string::npos &&
+        src.find("[TOOL_CALLS]") != std::string::npos &&
+        src.find("[ARGS]") != std::string::npos) {
+        return common_chat_params_init_ministral_3(tmpl, params);
+    }
+
     if (src.find("[THINK]") != std::string::npos && src.find("[/THINK]") != std::string::npos) {
         return common_chat_params_init_magistral(tmpl, params);
     }