@fugood/llama.node 1.4.5 → 1.4.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/binding.ts +49 -0
- package/lib/index.js +13 -0
- package/lib/index.ts +13 -0
- package/package.json +15 -15
- package/scripts/llama.cpp.patch +8 -8
- package/src/LlamaContext.cpp +69 -0
- package/src/LlamaContext.h +3 -0
- package/src/llama.cpp/common/chat-parser-xml-toolcall.cpp +36 -18
- package/src/llama.cpp/common/chat-parser-xml-toolcall.h +1 -1
- package/src/llama.cpp/common/chat-parser.cpp +3 -2
- package/src/llama.cpp/common/chat.cpp +132 -0
- package/src/llama.cpp/common/console.cpp +582 -29
- package/src/llama.cpp/ggml/CMakeLists.txt +1 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +3 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +9 -0
- package/src/llama.cpp/src/CMakeLists.txt +2 -1
- package/src/llama.cpp/src/llama-context.cpp +6 -6
- package/src/llama.cpp/src/llama-context.h +1 -1
- package/src/llama.cpp/src/llama-grammar.cpp +233 -33
- package/src/llama.cpp/src/llama-grammar.h +20 -1
- package/src/llama.cpp/src/llama-graph.cpp +1 -1
- package/src/llama.cpp/src/llama-model.cpp +20 -8
- package/src/llama.cpp/src/models/{gemma3-iswa.cpp → gemma3.cpp} +30 -5
- package/src/llama.cpp/src/models/models.h +3 -2
package/lib/binding.ts
CHANGED

@@ -309,6 +309,45 @@ export type BackendDeviceInfo = {
   metadata?: Record<string, any>
 }

+export type BenchResult = {
+  /** Maximum KV cache size */
+  nKvMax: number
+  /** Batch size */
+  nBatch: number
+  /** Micro-batch size */
+  nUBatch: number
+  /** Flash attention type (0=disabled, 1=enabled, 2=auto) */
+  flashAttn: number
+  /** Whether prompt processing is shared */
+  isPpShared: boolean
+  /** Number of GPU layers */
+  nGpuLayers: number
+  /** Number of threads */
+  nThreads: number
+  /** Number of threads for batch processing */
+  nThreadsBatch: number
+  /** Prompt processing tokens count */
+  pp: number
+  /** Text generation tokens count */
+  tg: number
+  /** Parallel level */
+  pl: number
+  /** KV cache used */
+  nKv: number
+  /** Time for prompt processing (ms) */
+  tPp: number
+  /** Speed of prompt processing (tokens/sec) */
+  speedPp: number
+  /** Time for text generation (ms) */
+  tTg: number
+  /** Speed of text generation (tokens/sec) */
+  speedTg: number
+  /** Total time (ms) */
+  t: number
+  /** Overall speed (tokens/sec) */
+  speed: number
+}
+
 export type ModelInfo = {
   desc: string
   nEmbd: number

@@ -573,6 +612,16 @@ export interface LlamaContext {
    */
   clearCache(clearData?: boolean): void

+  /**
+   * Run a benchmark to measure model performance
+   * @param pp Number of tokens to process for prompt processing benchmark
+   * @param tg Number of tokens to generate for text generation benchmark
+   * @param pl Parallel level (number of sequences)
+   * @param nr Number of repetitions
+   * @returns Benchmark results
+   */
+  bench(pp: number, tg: number, pl: number, nr: number): Promise<BenchResult>
+
   // static
   loadModelInfo(path: string, skip: string[]): Promise<GGUFModelInfo>
   toggleNativeLog(
package/lib/index.js
CHANGED

@@ -204,6 +204,19 @@ class LlamaContextWrapper {
     clearCache(clearData) {
         this.ctx.clearCache(clearData);
     }
+    /**
+     * Run a benchmark to measure model performance
+     * @param pp Number of tokens to process for prompt processing benchmark
+     * @param tg Number of tokens to generate for text generation benchmark
+     * @param pl Parallel level (number of sequences)
+     * @param nr Number of repetitions
+     * @returns Benchmark results including timing and speed metrics
+     */
+    bench(pp, tg, pl, nr) {
+        return __awaiter(this, void 0, void 0, function* () {
+            return this.ctx.bench(pp, tg, pl, nr);
+        });
+    }
 }
 const loadModel = (options, onProgress) => __awaiter(void 0, void 0, void 0, function* () {
     var _a, _b;
package/lib/index.ts
CHANGED

@@ -16,6 +16,7 @@ import type {
   JinjaFormattedChatResult,
   Tool,
   GGUFModelInfo,
+  BenchResult,
 } from './binding'
 import { BUILD_NUMBER, BUILD_COMMIT } from './version'
 import { LlamaParallelAPI } from './parallel'

@@ -309,6 +310,18 @@ class LlamaContextWrapper {
   clearCache(clearData?: boolean): void {
     this.ctx.clearCache(clearData)
   }
+
+  /**
+   * Run a benchmark to measure model performance
+   * @param pp Number of tokens to process for prompt processing benchmark
+   * @param tg Number of tokens to generate for text generation benchmark
+   * @param pl Parallel level (number of sequences)
+   * @param nr Number of repetitions
+   * @returns Benchmark results including timing and speed metrics
+   */
+  async bench(pp: number, tg: number, pl: number, nr: number): Promise<BenchResult> {
+    return this.ctx.bench(pp, tg, pl, nr)
+  }
 }

 export const loadModel = async (
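Together, these changes expose the native benchmark through the JS/TS wrappers. A minimal usage sketch follows; the `model` option name and the benchmark argument values are illustrative assumptions, not taken from this diff:

```ts
import { loadModel } from '@fugood/llama.node'

// pp=512 prompt tokens, tg=128 generated tokens, pl=1 parallel sequence, nr=3 repetitions
const ctx = await loadModel({ model: './model.gguf' }) // option name assumed
const result = await ctx.bench(512, 128, 1, 3)
console.log(`pp ${result.speedPp.toFixed(2)} t/s, tg ${result.speedTg.toFixed(2)} t/s`)
```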
package/package.json
CHANGED

@@ -1,7 +1,7 @@
 {
   "name": "@fugood/llama.node",
   "access": "public",
-  "version": "1.4.5",
+  "version": "1.4.7",
   "description": "An another Node binding of llama.cpp",
   "main": "lib/index.js",
   "scripts": {

@@ -72,20 +72,20 @@
     "CMakeLists.txt"
   ],
   "optionalDependencies": {
-    "@fugood/node-llama-darwin-arm64": "1.4.5",
-    "@fugood/node-llama-darwin-x64": "1.4.5",
-    "@fugood/node-llama-linux-arm64": "1.4.5",
-    "@fugood/node-llama-linux-arm64-cuda": "1.4.5",
-    "@fugood/node-llama-linux-arm64-snapdragon": "1.4.5",
-    "@fugood/node-llama-linux-arm64-vulkan": "1.4.5",
-    "@fugood/node-llama-linux-x64": "1.4.5",
-    "@fugood/node-llama-linux-x64-cuda": "1.4.5",
-    "@fugood/node-llama-linux-x64-vulkan": "1.4.5",
-    "@fugood/node-llama-win32-arm64": "1.4.5",
-    "@fugood/node-llama-win32-arm64-vulkan": "1.4.5",
-    "@fugood/node-llama-win32-x64": "1.4.5",
-    "@fugood/node-llama-win32-x64-cuda": "1.4.5",
-    "@fugood/node-llama-win32-x64-vulkan": "1.4.5",
+    "@fugood/node-llama-darwin-arm64": "1.4.7",
+    "@fugood/node-llama-darwin-x64": "1.4.7",
+    "@fugood/node-llama-linux-arm64": "1.4.7",
+    "@fugood/node-llama-linux-arm64-cuda": "1.4.7",
+    "@fugood/node-llama-linux-arm64-snapdragon": "1.4.7",
+    "@fugood/node-llama-linux-arm64-vulkan": "1.4.7",
+    "@fugood/node-llama-linux-x64": "1.4.7",
+    "@fugood/node-llama-linux-x64-cuda": "1.4.7",
+    "@fugood/node-llama-linux-x64-vulkan": "1.4.7",
+    "@fugood/node-llama-win32-arm64": "1.4.7",
+    "@fugood/node-llama-win32-arm64-vulkan": "1.4.7",
+    "@fugood/node-llama-win32-x64": "1.4.7",
+    "@fugood/node-llama-win32-x64-cuda": "1.4.7",
+    "@fugood/node-llama-win32-x64-vulkan": "1.4.7"
   },
   "devDependencies": {
     "@babel/preset-env": "^7.24.4",
package/scripts/llama.cpp.patch
CHANGED

@@ -35,10 +35,10 @@ index 74a7b6a46..7b7a1bd50 100644
 while (!sv.empty() && std::isspace(static_cast<unsigned char>(sv.back()))) {
 sv.remove_suffix(1);
 diff --git a/src/llama.cpp/common/chat.cpp b/src/llama.cpp/common/chat.cpp
-index
+index c371edaa5..ec032e351 100644
 --- a/src/llama.cpp/common/chat.cpp
 +++ b/src/llama.cpp/common/chat.cpp
-@@ -
+@@ -7,9 +7,6 @@
 #include "log.h"
 #include "regex-partial.h"

@@ -48,7 +48,7 @@ index 41a5bb42d..da5cf4b94 100644
 #include <algorithm>
 #include <cstdio>
 #include <cctype>
-@@ -
+@@ -135,16 +132,6 @@ std::vector<common_chat_msg_diff> common_chat_msg_diff::compute_diffs(const comm
 return diffs;
 }

@@ -65,7 +65,7 @@ index 41a5bb42d..da5cf4b94 100644
 struct templates_params {
 json messages;
 json tools;
-@@ -
+@@ -732,7 +719,7 @@ static std::string apply(
 tmpl_inputs.extra_context.merge_patch(*additional_context);
 }
 // TODO: add flag to control date/time, if only for testing purposes.

@@ -99,10 +99,10 @@ index 6085510a4..263076ce2 100644
 struct common_chat_tool_call {
 std::string name;
 diff --git a/src/llama.cpp/common/common.cpp b/src/llama.cpp/common/common.cpp
-index
+index 0497f90a2..29b36f3fe 100644
 --- a/src/llama.cpp/common/common.cpp
 +++ b/src/llama.cpp/common/common.cpp
-@@ -
+@@ -1280,6 +1280,7 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
 mparams.n_gpu_layers = params.n_gpu_layers;
 }

@@ -111,7 +111,7 @@ index f07af1d86..1b10c7b13 100644
 mparams.split_mode = params.split_mode;
 mparams.tensor_split = params.tensor_split;
 diff --git a/src/llama.cpp/common/common.h b/src/llama.cpp/common/common.h
-index
+index d28e48991..562203d02 100644
 --- a/src/llama.cpp/common/common.h
 +++ b/src/llama.cpp/common/common.h
 @@ -302,6 +302,7 @@ struct lr_opt {

@@ -123,7 +123,7 @@ index 179113a4d..78aa24bc3 100644
 int32_t n_ctx = 4096; // context size
 int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
 diff --git a/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt b/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt
-index
+index fc31089f3..aa9befe4c 100644
 --- a/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt
 +++ b/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt
 @@ -106,7 +106,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
package/src/LlamaContext.cpp
CHANGED

@@ -203,6 +203,9 @@ void LlamaContext::Init(Napi::Env env, Napi::Object &exports) {
           static_cast<napi_property_attributes>(napi_enumerable)),
       InstanceMethod<&LlamaContext::ClearCache>(
           "clearCache",
+          static_cast<napi_property_attributes>(napi_enumerable)),
+      InstanceMethod<&LlamaContext::Bench>(
+          "bench",
           static_cast<napi_property_attributes>(napi_enumerable))});
   Napi::FunctionReference *constructor = new Napi::FunctionReference();
   *constructor = Napi::Persistent(func);

@@ -1529,3 +1532,69 @@ void LlamaContext::ClearCache(const Napi::CallbackInfo &info) {

   _rn_ctx->clearCache(clear_data);
 }
+
+// bench(pp: number, tg: number, pl: number, nr: number): Promise<BenchResult>
+Napi::Value LlamaContext::Bench(const Napi::CallbackInfo &info) {
+  Napi::Env env = info.Env();
+
+  if (info.Length() < 4) {
+    Napi::TypeError::New(env, "Expected 4 arguments: pp, tg, pl, nr")
+        .ThrowAsJavaScriptException();
+    return env.Undefined();
+  }
+
+  if (!_rn_ctx) {
+    Napi::TypeError::New(env, "Context is disposed").ThrowAsJavaScriptException();
+    return env.Undefined();
+  }
+
+  if (!_rn_ctx->completion) {
+    Napi::TypeError::New(env, "Completion context not initialized")
+        .ThrowAsJavaScriptException();
+    return env.Undefined();
+  }
+
+  int pp = info[0].ToNumber().Int32Value();
+  int tg = info[1].ToNumber().Int32Value();
+  int pl = info[2].ToNumber().Int32Value();
+  int nr = info[3].ToNumber().Int32Value();
+
+  std::string result;
+  try {
+    result = _rn_ctx->completion->bench(pp, tg, pl, nr);
+  } catch (const std::exception &e) {
+    Napi::Error::New(env, e.what()).ThrowAsJavaScriptException();
+    return env.Undefined();
+  }
+
+  // Parse the JSON result and return as object
+  try {
+    auto parsed = json::parse(result);
+    Napi::Object benchResult = Napi::Object::New(env);
+
+    benchResult.Set("nKvMax", Napi::Number::New(env, parsed["n_kv_max"].get<int>()));
+    benchResult.Set("nBatch", Napi::Number::New(env, parsed["n_batch"].get<int>()));
+    benchResult.Set("nUBatch", Napi::Number::New(env, parsed["n_ubatch"].get<int>()));
+    benchResult.Set("flashAttn", Napi::Number::New(env, parsed["flash_attn"].get<int>()));
+    benchResult.Set("isPpShared", Napi::Boolean::New(env, parsed["is_pp_shared"].get<int>() != 0));
+    benchResult.Set("nGpuLayers", Napi::Number::New(env, parsed["n_gpu_layers"].get<int>()));
+    benchResult.Set("nThreads", Napi::Number::New(env, parsed["n_threads"].get<int>()));
+    benchResult.Set("nThreadsBatch", Napi::Number::New(env, parsed["n_threads_batch"].get<int>()));
+    benchResult.Set("pp", Napi::Number::New(env, parsed["pp"].get<int>()));
+    benchResult.Set("tg", Napi::Number::New(env, parsed["tg"].get<int>()));
+    benchResult.Set("pl", Napi::Number::New(env, parsed["pl"].get<int>()));
+    benchResult.Set("nKv", Napi::Number::New(env, parsed["n_kv"].get<int>()));
+    benchResult.Set("tPp", Napi::Number::New(env, parsed["t_pp"].get<double>()));
+    benchResult.Set("speedPp", Napi::Number::New(env, parsed["speed_pp"].get<double>()));
+    benchResult.Set("tTg", Napi::Number::New(env, parsed["t_tg"].get<double>()));
+    benchResult.Set("speedTg", Napi::Number::New(env, parsed["speed_tg"].get<double>()));
+    benchResult.Set("t", Napi::Number::New(env, parsed["t"].get<double>()));
+    benchResult.Set("speed", Napi::Number::New(env, parsed["speed"].get<double>()));
+
+    return benchResult;
+  } catch (const std::exception &e) {
+    Napi::Error::New(env, std::string("Failed to parse benchmark result: ") + e.what())
+        .ThrowAsJavaScriptException();
+    return env.Undefined();
+  }
+}
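The native method runs `completion->bench(...)` and maps the snake_case JSON fields onto the camelCase keys of the `BenchResult` type declared in `lib/binding.ts`. A small, illustrative helper (the type import path is assumed, not shown in this diff) that folds the timing fields into a one-line summary:

```ts
import type { BenchResult } from '@fugood/llama.node' // type re-export path assumed

// Illustrative only: summarize a benchmark run in one line.
function formatBench(r: BenchResult): string {
  return (
    `pp ${r.pp} tok in ${r.tPp.toFixed(1)} ms (${r.speedPp.toFixed(2)} t/s), ` +
    `tg ${r.tg} tok in ${r.tTg.toFixed(1)} ms (${r.speedTg.toFixed(2)} t/s), ` +
    `total ${r.t.toFixed(1)} ms (${r.speed.toFixed(2)} t/s)`
  )
}
```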
package/src/LlamaContext.h
CHANGED

package/src/llama.cpp/common/chat-parser-xml-toolcall.cpp
CHANGED

@@ -724,16 +724,10 @@ inline void parse_msg_with_xml_tool_calls(common_chat_msg_parser & builder, cons
     if (reasoning_unclosed) {
         if (auto pos = content.find(end_think); pos == std::string::npos && builder.pos() != builder.input().size()) {
             unclosed_reasoning_content += content;
-            if (form.allow_toolcall_in_think) {
-                builder.move_to(tc->groups[0].begin);
-                if (!builder.try_consume_xml_tool_calls(form)) {
-                    unclosed_reasoning_content += tool_call_start;
-                    builder.move_to(tc->groups[0].end);
-                }
-            } else {
+            if (!(form.allow_toolcall_in_think && tc)) {
                 unclosed_reasoning_content += tool_call_start;
+                continue;
             }
-            continue;
         } else {
             reasoning_unclosed = false;
             std::string reasoning_content;

@@ -781,8 +775,12 @@ inline void parse_msg_with_xml_tool_calls(common_chat_msg_parser & builder, cons
             }
         } else {
             // This <tool_call> start is in thinking block, skip this tool call
-
-
+            // This <tool_call> start is in thinking block
+            if (form.allow_toolcall_in_think) {
+                unclosed_reasoning_content = content.substr(think_start + start_think.size());
+            } else {
+                unclosed_reasoning_content = content.substr(think_start + start_think.size()) + tool_call_start;
+            }
             reasoning_unclosed = true;
             content.resize(think_start);
             toolcall_in_think = true;

@@ -805,14 +803,35 @@ inline void parse_msg_with_xml_tool_calls(common_chat_msg_parser & builder, cons
         }

         // remove potential partial suffix
-        if (
-
-
-
+        if (builder.pos() == builder.input().size()) {
+            if (unclosed_reasoning_content.empty()) {
+                rstrip(content);
+                trim_potential_partial_word(content);
+                rstrip(content);
+            } else {
+                rstrip(unclosed_reasoning_content);
+                trim_potential_partial_word(unclosed_reasoning_content);
+                rstrip(unclosed_reasoning_content);
+            }
+        }
+
+        // consume unclosed_reasoning_content if allow_toolcall_in_think is set
+        if (form.allow_toolcall_in_think && !unclosed_reasoning_content.empty()) {
+            if (builder.syntax().reasoning_format != COMMON_REASONING_FORMAT_NONE && !builder.syntax().reasoning_in_content) {
+                builder.add_reasoning_content(unclosed_reasoning_content);
+            } else {
+                if (content.empty()) {
+                    content = start_think + unclosed_reasoning_content;
+                } else {
+                    content += "\n\n" + start_think;
+                    content += unclosed_reasoning_content;
+                }
+            }
+            unclosed_reasoning_content.clear();
         }

         // Add content
-        if (content.
+        if (!content.empty()) {
             // If there are multiple content blocks
             if (builder.syntax().reasoning_format != COMMON_REASONING_FORMAT_NONE && !builder.syntax().reasoning_in_content && builder.result().content.size() != 0) {
                 builder.add_content("\n\n");

@@ -820,7 +839,7 @@ inline void parse_msg_with_xml_tool_calls(common_chat_msg_parser & builder, cons
             builder.add_content(content);
         }

-        // This <tool_call> start is in thinking block, skip this tool call
+        // This <tool_call> start is in thinking block and toolcall_in_think not set, skip this tool call
         if (toolcall_in_think && !form.allow_toolcall_in_think) {
             continue;
         }

@@ -829,7 +848,7 @@ inline void parse_msg_with_xml_tool_calls(common_chat_msg_parser & builder, cons
         if (!tc) {
             GGML_ASSERT(builder.pos() == builder.input().size());
             GGML_ASSERT(unclosed_reasoning_content.empty());
-            GGML_ASSERT(!reasoning_unclosed);
+            if (!form.allow_toolcall_in_think) GGML_ASSERT(!reasoning_unclosed);
             break;
         }

@@ -854,7 +873,6 @@ inline void parse_msg_with_xml_tool_calls(common_chat_msg_parser & builder, cons

 /**
  * Parse content uses reasoning and XML-Style tool call
- * TODO: Note that form.allow_toolcall_in_think is not tested yet. If anyone confirms it works, this comment can be removed.
  */
 void common_chat_msg_parser::consume_reasoning_with_xml_tool_calls(const struct xml_tool_call_format & form, const std::string & start_think, const std::string & end_think) {
     parse_msg_with_xml_tool_calls(*this, form, start_think, end_think);
package/src/llama.cpp/common/chat-parser-xml-toolcall.h
CHANGED

@@ -31,7 +31,7 @@ struct xml_tool_call_format {
     std::optional<std::string> last_val_end = std::nullopt;
     std::optional<std::string> last_tool_end = std::nullopt;
     bool trim_raw_argval = false;
-    bool allow_toolcall_in_think = false;
+    bool allow_toolcall_in_think = false;
 };

 // make a GBNF that accept any strings except those containing any of the forbidden strings.
package/src/llama.cpp/common/chat-parser.cpp
CHANGED

@@ -917,12 +917,13 @@ static void common_chat_parse_kimi_k2(common_chat_msg_parser & builder) {
         form.tool_start = "<|tool_call_begin|>";
         form.tool_sep = "<|tool_call_argument_begin|>{";
         form.key_start = "\"";
-        form.key_val_sep = "\":
-        form.val_end = ",
+        form.key_val_sep = "\":";
+        form.val_end = ",";
         form.tool_end = "}<|tool_call_end|>";
         form.scope_end = "<|tool_calls_section_end|>";
         form.raw_argval = false;
         form.last_val_end = "";
+        form.allow_toolcall_in_think = true;
         return form;
     })();
     builder.consume_reasoning_with_xml_tool_calls(form, "<think>", "</think>");
package/src/llama.cpp/common/chat.cpp
CHANGED

@@ -1,5 +1,6 @@
 #include "chat.h"
 #include "chat-parser.h"
+#include "chat-peg-parser.h"
 #include "common.h"
 #include "json-partial.h"
 #include "json-schema-to-grammar.h"

@@ -137,6 +138,7 @@ struct templates_params {
     common_chat_tool_choice tool_choice;
     json json_schema;
     bool parallel_tool_calls;
+    common_reasoning_format reasoning_format;
     bool stream;
     std::string grammar;
     bool add_generation_prompt = true;

@@ -576,6 +578,16 @@ common_chat_templates_ptr common_chat_templates_init(
             "{%- if false %}");
     }

+    // TODO @aldehir : this is a temporary fix, pending Minja changes
+    // Ref: https://github.com/ggml-org/llama.cpp/pull/17713#issuecomment-3631342664
+    if (default_template_src.find("[TOOL_CALLS]") != std::string::npos
+        // search for the error message and patch it
+        && default_template_src.find("if (message['content'] is none or") != std::string::npos) {
+        string_replace_all(default_template_src,
+            "{%- if (message['content'] is none or message['content'] == '' or message['content']|length == 0) and (message['tool_calls'] is not defined or message['tool_calls'] is none or message['tool_calls']|length == 0) %}",
+            "{%- if false %}");
+    }
+
     std::string token_bos = bos_token_override;
     std::string token_eos = eos_token_override;
     bool add_bos = false;

@@ -974,6 +986,118 @@ static common_chat_params common_chat_params_init_lfm2(const common_chat_templat
     return data;
 }

+static common_chat_params common_chat_params_init_ministral_3(const common_chat_template & tmpl, const struct templates_params & inputs) {
+    common_chat_params data;
+
+    // Build up messages to follow the format: https://huggingface.co/mistralai/Ministral-3-14B-Reasoning-2512/blob/main/chat_template.jinja
+    auto adjusted_messages = json::array();
+    for (const auto & msg : inputs.messages) {
+        auto role = msg.value("role", "");
+        if (role != "system" && role != "assistant") {
+            // Only adjust system and assistant messages. Interestingly, the system message may contain thinking.
+            adjusted_messages.push_back(msg);
+            continue;
+        }
+
+        auto content = json::array();
+
+        // If message contains `reasoning_content`, add it as a block of type `thinking`
+        if (msg.contains("reasoning_content") && msg.at("reasoning_content").is_string()) {
+            content.push_back({
+                {"type", "thinking"},
+                {"thinking", msg.at("reasoning_content").get<std::string>()},
+            });
+        }
+
+        // If message contains `content`, add it as a block of type `text`
+        if (msg.contains("content")) {
+            if (msg.at("content").is_string()) {
+                content.push_back({
+                    {"type", "text"},
+                    {"text", msg.at("content").get<std::string>()},
+                });
+            } else if (msg.at("content").is_array()) {
+                auto blocks = msg.at("content");
+                content.insert(content.end(), blocks.begin(), blocks.end());
+            }
+        }
+
+        auto adjusted = msg;
+        adjusted["content"] = content;
+        adjusted.erase("reasoning_content");
+        adjusted_messages.push_back(adjusted);
+    }
+
+    auto has_tools = inputs.tools.is_array() && !inputs.tools.empty();
+    auto extract_reasoning = inputs.reasoning_format != COMMON_REASONING_FORMAT_NONE;
+    auto include_grammar = true;
+
+    data.prompt = apply(tmpl, inputs, /* messages_override = */ adjusted_messages);
+    data.format = COMMON_CHAT_FORMAT_PEG_NATIVE;
+    data.preserved_tokens = {
+        "[THINK]",
+        "[/THINK]",
+        "[TOOL_CALLS]",
+        "[ARGS]",
+    };
+
+    auto parser = build_chat_peg_native_parser([&](common_chat_peg_native_builder & p) {
+        auto reasoning = extract_reasoning ? p.optional("[THINK]" + p.reasoning(p.until("[/THINK]")) + "[/THINK]") : p.eps();
+
+        // Response format parser
+        if (inputs.json_schema.is_object() && !inputs.json_schema.empty()) {
+            // Ministral wants to emit json surrounded by code fences
+            return reasoning << "```json" << p.content(p.schema(p.json(), "response-format", inputs.json_schema)) << "```";
+        }
+
+        // Tool call parser
+        if (has_tools && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE) {
+            auto tool_choice = p.choice();
+            foreach_function(inputs.tools, [&](const json & tool) {
+                const auto & function = tool.at("function");
+                std::string name = function.at("name");
+                const auto & schema = function.at("parameters");
+
+                tool_choice |= p.rule("tool-" + name,
+                    p.tool_open(p.tool_name(p.literal(name)) + "[ARGS]")
+                    + p.tool_args(p.schema(p.json(), "tool-" + name + "-schema", schema))
+                );
+            });
+
+            auto min_calls = inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED ? 1 : 0;
+            auto max_calls = inputs.parallel_tool_calls ? -1 : 1;
+            auto tool_calls = p.trigger_rule("tool-call", p.repeat("[TOOL_CALLS]" + tool_choice, min_calls, max_calls));
+
+            return reasoning << p.content(p.until("[TOOL_CALLS]")) << tool_calls;
+        }
+
+        // Content only parser
+        include_grammar = false;
+        return reasoning << p.content(p.rest());
+    });
+
+    data.parser = parser.save();
+
+    if (include_grammar) {
+        data.grammar_lazy = has_tools && inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_AUTO;
+
+        data.grammar = build_grammar([&](const common_grammar_builder & builder) {
+            foreach_function(inputs.tools, [&](const json & tool) {
+                const auto & function = tool.at("function");
+                auto schema = function.at("parameters");
+                builder.resolve_refs(schema);
+            });
+            parser.build_grammar(builder, data.grammar_lazy);
+        });
+
+        data.grammar_triggers = {
+            {COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "[TOOL_CALLS]"}
+        };
+    }
+
+    return data;
+}
+
 static common_chat_params common_chat_params_init_magistral(const common_chat_template & tmpl, const struct templates_params & inputs) {
     common_chat_params data;
     data.prompt = apply(tmpl, inputs);

@@ -2328,6 +2452,7 @@ static common_chat_params common_chat_templates_apply_jinja(
     params.messages = common_chat_msgs_to_json_oaicompat<json>(inputs.messages, /* concat_text= */ !tmpl.original_caps().requires_typed_content);
     params.add_generation_prompt = inputs.add_generation_prompt;
     params.tool_choice = inputs.tool_choice;
+    params.reasoning_format = inputs.reasoning_format;
     params.enable_thinking = inputs.enable_thinking;
     params.grammar = inputs.grammar;
     params.now = inputs.now;

@@ -2491,6 +2616,13 @@
         return common_chat_params_init_llama_3_x(tmpl, params, allow_python_tag_builtin_tools);
     }

+    // Ministral/Mistral Large 3
+    if (src.find("[SYSTEM_PROMPT]") != std::string::npos &&
+        src.find("[TOOL_CALLS]") != std::string::npos &&
+        src.find("[ARGS]") != std::string::npos) {
+        return common_chat_params_init_ministral_3(tmpl, params);
+    }
+
     if (src.find("[THINK]") != std::string::npos && src.find("[/THINK]") != std::string::npos) {
         return common_chat_params_init_magistral(tmpl, params);
     }
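Taken together, the new Ministral 3 handler converts `reasoning_content`/`content` into typed message blocks before applying the template, and registers a PEG-native parser for the model output. For orientation, that parser accepts assistant text of roughly this shape; the tool name and arguments below are hypothetical, purely to illustrate the token layout:

```ts
// Hypothetical Ministral 3 assistant output accepted by the parser sketched above:
// optional [THINK]...[/THINK] reasoning, free-form content, then tool calls,
// each introduced by [TOOL_CALLS] as <name>[ARGS]<JSON arguments>.
const sample =
  '[THINK]The user wants the weather, so a tool call is needed.[/THINK]' +
  'Let me check that for you.' +
  '[TOOL_CALLS]get_weather[ARGS]{"location": "Paris"}'
```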