@fugood/llama.node 1.4.5 → 1.4.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/binding.ts +49 -0
- package/lib/index.js +13 -0
- package/lib/index.ts +13 -0
- package/package.json +15 -15
- package/scripts/llama.cpp.patch +8 -8
- package/src/LlamaContext.cpp +69 -0
- package/src/LlamaContext.h +3 -0
- package/src/llama.cpp/common/chat-parser-xml-toolcall.cpp +36 -18
- package/src/llama.cpp/common/chat-parser-xml-toolcall.h +1 -1
- package/src/llama.cpp/common/chat-parser.cpp +3 -2
- package/src/llama.cpp/common/chat.cpp +132 -0
- package/src/llama.cpp/common/console.cpp +582 -29
- package/src/llama.cpp/ggml/CMakeLists.txt +1 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +3 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +9 -0
- package/src/llama.cpp/src/CMakeLists.txt +2 -1
- package/src/llama.cpp/src/llama-context.cpp +6 -6
- package/src/llama.cpp/src/llama-context.h +1 -1
- package/src/llama.cpp/src/llama-grammar.cpp +233 -33
- package/src/llama.cpp/src/llama-grammar.h +20 -1
- package/src/llama.cpp/src/llama-graph.cpp +1 -1
- package/src/llama.cpp/src/llama-model.cpp +20 -8
- package/src/llama.cpp/src/models/{gemma3-iswa.cpp → gemma3.cpp} +30 -5
- package/src/llama.cpp/src/models/models.h +3 -2
package/lib/binding.ts
CHANGED

@@ -309,6 +309,45 @@ export type BackendDeviceInfo = {
   metadata?: Record<string, any>
 }

+export type BenchResult = {
+  /** Maximum KV cache size */
+  nKvMax: number
+  /** Batch size */
+  nBatch: number
+  /** Micro-batch size */
+  nUBatch: number
+  /** Flash attention type (0=disabled, 1=enabled, 2=auto) */
+  flashAttn: number
+  /** Whether prompt processing is shared */
+  isPpShared: boolean
+  /** Number of GPU layers */
+  nGpuLayers: number
+  /** Number of threads */
+  nThreads: number
+  /** Number of threads for batch processing */
+  nThreadsBatch: number
+  /** Prompt processing tokens count */
+  pp: number
+  /** Text generation tokens count */
+  tg: number
+  /** Parallel level */
+  pl: number
+  /** KV cache used */
+  nKv: number
+  /** Time for prompt processing (ms) */
+  tPp: number
+  /** Speed of prompt processing (tokens/sec) */
+  speedPp: number
+  /** Time for text generation (ms) */
+  tTg: number
+  /** Speed of text generation (tokens/sec) */
+  speedTg: number
+  /** Total time (ms) */
+  t: number
+  /** Overall speed (tokens/sec) */
+  speed: number
+}
+
 export type ModelInfo = {
   desc: string
   nEmbd: number

@@ -573,6 +612,16 @@ export interface LlamaContext {
    */
   clearCache(clearData?: boolean): void

+  /**
+   * Run a benchmark to measure model performance
+   * @param pp Number of tokens to process for prompt processing benchmark
+   * @param tg Number of tokens to generate for text generation benchmark
+   * @param pl Parallel level (number of sequences)
+   * @param nr Number of repetitions
+   * @returns Benchmark results
+   */
+  bench(pp: number, tg: number, pl: number, nr: number): Promise<BenchResult>
+
   // static
   loadModelInfo(path: string, skip: string[]): Promise<GGUFModelInfo>
   toggleNativeLog(
package/lib/index.js
CHANGED

@@ -204,6 +204,19 @@ class LlamaContextWrapper {
     clearCache(clearData) {
         this.ctx.clearCache(clearData);
     }
+    /**
+     * Run a benchmark to measure model performance
+     * @param pp Number of tokens to process for prompt processing benchmark
+     * @param tg Number of tokens to generate for text generation benchmark
+     * @param pl Parallel level (number of sequences)
+     * @param nr Number of repetitions
+     * @returns Benchmark results including timing and speed metrics
+     */
+    bench(pp, tg, pl, nr) {
+        return __awaiter(this, void 0, void 0, function* () {
+            return this.ctx.bench(pp, tg, pl, nr);
+        });
+    }
 }
 const loadModel = (options, onProgress) => __awaiter(void 0, void 0, void 0, function* () {
     var _a, _b;
package/lib/index.ts
CHANGED

@@ -16,6 +16,7 @@ import type {
   JinjaFormattedChatResult,
   Tool,
   GGUFModelInfo,
+  BenchResult,
 } from './binding'
 import { BUILD_NUMBER, BUILD_COMMIT } from './version'
 import { LlamaParallelAPI } from './parallel'

@@ -309,6 +310,18 @@ class LlamaContextWrapper {
   clearCache(clearData?: boolean): void {
     this.ctx.clearCache(clearData)
   }
+
+  /**
+   * Run a benchmark to measure model performance
+   * @param pp Number of tokens to process for prompt processing benchmark
+   * @param tg Number of tokens to generate for text generation benchmark
+   * @param pl Parallel level (number of sequences)
+   * @param nr Number of repetitions
+   * @returns Benchmark results including timing and speed metrics
+   */
+  async bench(pp: number, tg: number, pl: number, nr: number): Promise<BenchResult> {
+    return this.ctx.bench(pp, tg, pl, nr)
+  }
 }

 export const loadModel = async (
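Together, these changes expose the native benchmark through the JS/TS wrappers. A minimal usage sketch follows; the `model` option name and the benchmark argument values are illustrative assumptions, not taken from this diff:

```ts
import { loadModel } from '@fugood/llama.node'

// pp=512 prompt tokens, tg=128 generated tokens, pl=1 parallel sequence, nr=3 repetitions
const ctx = await loadModel({ model: './model.gguf' }) // option name assumed
const result = await ctx.bench(512, 128, 1, 3)
console.log(`pp ${result.speedPp.toFixed(2)} t/s, tg ${result.speedTg.toFixed(2)} t/s`)
```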
package/package.json
CHANGED

@@ -1,7 +1,7 @@
 {
   "name": "@fugood/llama.node",
   "access": "public",
-  "version": "1.4.5",
+  "version": "1.4.7",
   "description": "An another Node binding of llama.cpp",
   "main": "lib/index.js",
   "scripts": {

@@ -72,20 +72,20 @@
     "CMakeLists.txt"
   ],
   "optionalDependencies": {
-    "@fugood/node-llama-darwin-arm64": "1.4.5",
-    "@fugood/node-llama-darwin-x64": "1.4.5",
-    "@fugood/node-llama-linux-arm64": "1.4.5",
-    "@fugood/node-llama-linux-arm64-cuda": "1.4.5",
-    "@fugood/node-llama-linux-arm64-snapdragon": "1.4.5",
-    "@fugood/node-llama-linux-arm64-vulkan": "1.4.5",
-    "@fugood/node-llama-linux-x64": "1.4.5",
-    "@fugood/node-llama-linux-x64-cuda": "1.4.5",
-    "@fugood/node-llama-linux-x64-vulkan": "1.4.5",
-    "@fugood/node-llama-win32-arm64": "1.4.5",
-    "@fugood/node-llama-win32-arm64-vulkan": "1.4.5",
-    "@fugood/node-llama-win32-x64": "1.4.5",
-    "@fugood/node-llama-win32-x64-cuda": "1.4.5",
-    "@fugood/node-llama-win32-x64-vulkan": "1.4.5",
+    "@fugood/node-llama-darwin-arm64": "1.4.7",
+    "@fugood/node-llama-darwin-x64": "1.4.7",
+    "@fugood/node-llama-linux-arm64": "1.4.7",
+    "@fugood/node-llama-linux-arm64-cuda": "1.4.7",
+    "@fugood/node-llama-linux-arm64-snapdragon": "1.4.7",
+    "@fugood/node-llama-linux-arm64-vulkan": "1.4.7",
+    "@fugood/node-llama-linux-x64": "1.4.7",
+    "@fugood/node-llama-linux-x64-cuda": "1.4.7",
+    "@fugood/node-llama-linux-x64-vulkan": "1.4.7",
+    "@fugood/node-llama-win32-arm64": "1.4.7",
+    "@fugood/node-llama-win32-arm64-vulkan": "1.4.7",
+    "@fugood/node-llama-win32-x64": "1.4.7",
+    "@fugood/node-llama-win32-x64-cuda": "1.4.7",
+    "@fugood/node-llama-win32-x64-vulkan": "1.4.7"
   },
   "devDependencies": {
     "@babel/preset-env": "^7.24.4",
package/scripts/llama.cpp.patch
CHANGED

@@ -35,10 +35,10 @@ index 74a7b6a46..7b7a1bd50 100644
 while (!sv.empty() && std::isspace(static_cast<unsigned char>(sv.back()))) {
 sv.remove_suffix(1);
 diff --git a/src/llama.cpp/common/chat.cpp b/src/llama.cpp/common/chat.cpp
-index
+index c371edaa5..ec032e351 100644
 --- a/src/llama.cpp/common/chat.cpp
 +++ b/src/llama.cpp/common/chat.cpp
-@@ -
+@@ -7,9 +7,6 @@
 #include "log.h"
 #include "regex-partial.h"

@@ -48,7 +48,7 @@ index 41a5bb42d..da5cf4b94 100644
 #include <algorithm>
 #include <cstdio>
 #include <cctype>
-@@ -
+@@ -135,16 +132,6 @@ std::vector<common_chat_msg_diff> common_chat_msg_diff::compute_diffs(const comm
 return diffs;
 }

@@ -65,7 +65,7 @@ index 41a5bb42d..da5cf4b94 100644
 struct templates_params {
 json messages;
 json tools;
-@@ -
+@@ -732,7 +719,7 @@ static std::string apply(
 tmpl_inputs.extra_context.merge_patch(*additional_context);
 }
 // TODO: add flag to control date/time, if only for testing purposes.

@@ -99,10 +99,10 @@ index 6085510a4..263076ce2 100644
 struct common_chat_tool_call {
 std::string name;
 diff --git a/src/llama.cpp/common/common.cpp b/src/llama.cpp/common/common.cpp
-index
+index 0497f90a2..29b36f3fe 100644
 --- a/src/llama.cpp/common/common.cpp
 +++ b/src/llama.cpp/common/common.cpp
-@@ -
+@@ -1280,6 +1280,7 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
 mparams.n_gpu_layers = params.n_gpu_layers;
 }

@@ -111,7 +111,7 @@ index f07af1d86..1b10c7b13 100644
 mparams.split_mode = params.split_mode;
 mparams.tensor_split = params.tensor_split;
 diff --git a/src/llama.cpp/common/common.h b/src/llama.cpp/common/common.h
-index
+index d28e48991..562203d02 100644
 --- a/src/llama.cpp/common/common.h
 +++ b/src/llama.cpp/common/common.h
 @@ -302,6 +302,7 @@ struct lr_opt {

@@ -123,7 +123,7 @@ index 179113a4d..78aa24bc3 100644
 int32_t n_ctx = 4096; // context size
 int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
 diff --git a/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt b/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt
-index
+index fc31089f3..aa9befe4c 100644
 --- a/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt
 +++ b/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt
 @@ -106,7 +106,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
package/src/LlamaContext.cpp
CHANGED

@@ -203,6 +203,9 @@ void LlamaContext::Init(Napi::Env env, Napi::Object &exports) {
           static_cast<napi_property_attributes>(napi_enumerable)),
       InstanceMethod<&LlamaContext::ClearCache>(
           "clearCache",
+          static_cast<napi_property_attributes>(napi_enumerable)),
+      InstanceMethod<&LlamaContext::Bench>(
+          "bench",
           static_cast<napi_property_attributes>(napi_enumerable))});
   Napi::FunctionReference *constructor = new Napi::FunctionReference();
   *constructor = Napi::Persistent(func);

@@ -1529,3 +1532,69 @@ void LlamaContext::ClearCache(const Napi::CallbackInfo &info) {

   _rn_ctx->clearCache(clear_data);
 }
+
+// bench(pp: number, tg: number, pl: number, nr: number): Promise<BenchResult>
+Napi::Value LlamaContext::Bench(const Napi::CallbackInfo &info) {
+  Napi::Env env = info.Env();
+
+  if (info.Length() < 4) {
+    Napi::TypeError::New(env, "Expected 4 arguments: pp, tg, pl, nr")
+        .ThrowAsJavaScriptException();
+    return env.Undefined();
+  }
+
+  if (!_rn_ctx) {
+    Napi::TypeError::New(env, "Context is disposed").ThrowAsJavaScriptException();
+    return env.Undefined();
+  }
+
+  if (!_rn_ctx->completion) {
+    Napi::TypeError::New(env, "Completion context not initialized")
+        .ThrowAsJavaScriptException();
+    return env.Undefined();
+  }
+
+  int pp = info[0].ToNumber().Int32Value();
+  int tg = info[1].ToNumber().Int32Value();
+  int pl = info[2].ToNumber().Int32Value();
+  int nr = info[3].ToNumber().Int32Value();
+
+  std::string result;
+  try {
+    result = _rn_ctx->completion->bench(pp, tg, pl, nr);
+  } catch (const std::exception &e) {
+    Napi::Error::New(env, e.what()).ThrowAsJavaScriptException();
+    return env.Undefined();
+  }
+
+  // Parse the JSON result and return as object
+  try {
+    auto parsed = json::parse(result);
+    Napi::Object benchResult = Napi::Object::New(env);
+
+    benchResult.Set("nKvMax", Napi::Number::New(env, parsed["n_kv_max"].get<int>()));
+    benchResult.Set("nBatch", Napi::Number::New(env, parsed["n_batch"].get<int>()));
+    benchResult.Set("nUBatch", Napi::Number::New(env, parsed["n_ubatch"].get<int>()));
+    benchResult.Set("flashAttn", Napi::Number::New(env, parsed["flash_attn"].get<int>()));
+    benchResult.Set("isPpShared", Napi::Boolean::New(env, parsed["is_pp_shared"].get<int>() != 0));
+    benchResult.Set("nGpuLayers", Napi::Number::New(env, parsed["n_gpu_layers"].get<int>()));
+    benchResult.Set("nThreads", Napi::Number::New(env, parsed["n_threads"].get<int>()));
+    benchResult.Set("nThreadsBatch", Napi::Number::New(env, parsed["n_threads_batch"].get<int>()));
+    benchResult.Set("pp", Napi::Number::New(env, parsed["pp"].get<int>()));
+    benchResult.Set("tg", Napi::Number::New(env, parsed["tg"].get<int>()));
+    benchResult.Set("pl", Napi::Number::New(env, parsed["pl"].get<int>()));
+    benchResult.Set("nKv", Napi::Number::New(env, parsed["n_kv"].get<int>()));
+    benchResult.Set("tPp", Napi::Number::New(env, parsed["t_pp"].get<double>()));
+    benchResult.Set("speedPp", Napi::Number::New(env, parsed["speed_pp"].get<double>()));
+    benchResult.Set("tTg", Napi::Number::New(env, parsed["t_tg"].get<double>()));
+    benchResult.Set("speedTg", Napi::Number::New(env, parsed["speed_tg"].get<double>()));
+    benchResult.Set("t", Napi::Number::New(env, parsed["t"].get<double>()));
+    benchResult.Set("speed", Napi::Number::New(env, parsed["speed"].get<double>()));
+
+    return benchResult;
+  } catch (const std::exception &e) {
+    Napi::Error::New(env, std::string("Failed to parse benchmark result: ") + e.what())
+        .ThrowAsJavaScriptException();
+    return env.Undefined();
+  }
+}
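The native method runs `completion->bench(...)` and maps the snake_case JSON fields onto the camelCase keys of the `BenchResult` type declared in `lib/binding.ts`. A small, illustrative helper (the type import path is assumed, not shown in this diff) that folds the timing fields into a one-line summary:

```ts
import type { BenchResult } from '@fugood/llama.node' // type re-export path assumed

// Illustrative only: summarize a benchmark run in one line.
function formatBench(r: BenchResult): string {
  return (
    `pp ${r.pp} tok in ${r.tPp.toFixed(1)} ms (${r.speedPp.toFixed(2)} t/s), ` +
    `tg ${r.tg} tok in ${r.tTg.toFixed(1)} ms (${r.speedTg.toFixed(2)} t/s), ` +
    `total ${r.t.toFixed(1)} ms (${r.speed.toFixed(2)} t/s)`
  )
}
```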
package/src/LlamaContext.h
CHANGED

package/src/llama.cpp/common/chat-parser-xml-toolcall.cpp
CHANGED

@@ -724,16 +724,10 @@ inline void parse_msg_with_xml_tool_calls(common_chat_msg_parser & builder, cons
     if (reasoning_unclosed) {
         if (auto pos = content.find(end_think); pos == std::string::npos && builder.pos() != builder.input().size()) {
             unclosed_reasoning_content += content;
-            if (form.allow_toolcall_in_think) {
-                builder.move_to(tc->groups[0].begin);
-                if (!builder.try_consume_xml_tool_calls(form)) {
-                    unclosed_reasoning_content += tool_call_start;
-                    builder.move_to(tc->groups[0].end);
-                }
-            } else {
+            if (!(form.allow_toolcall_in_think && tc)) {
                 unclosed_reasoning_content += tool_call_start;
+                continue;
             }
-            continue;
         } else {
             reasoning_unclosed = false;
             std::string reasoning_content;

@@ -781,8 +775,12 @@ inline void parse_msg_with_xml_tool_calls(common_chat_msg_parser & builder, cons
             }
         } else {
             // This <tool_call> start is in thinking block, skip this tool call
-
-
+            // This <tool_call> start is in thinking block
+            if (form.allow_toolcall_in_think) {
+                unclosed_reasoning_content = content.substr(think_start + start_think.size());
+            } else {
+                unclosed_reasoning_content = content.substr(think_start + start_think.size()) + tool_call_start;
+            }
             reasoning_unclosed = true;
             content.resize(think_start);
             toolcall_in_think = true;

@@ -805,14 +803,35 @@ inline void parse_msg_with_xml_tool_calls(common_chat_msg_parser & builder, cons
         }

         // remove potential partial suffix
-        if (
-
-
-
+        if (builder.pos() == builder.input().size()) {
+            if (unclosed_reasoning_content.empty()) {
+                rstrip(content);
+                trim_potential_partial_word(content);
+                rstrip(content);
+            } else {
+                rstrip(unclosed_reasoning_content);
+                trim_potential_partial_word(unclosed_reasoning_content);
+                rstrip(unclosed_reasoning_content);
+            }
+        }
+
+        // consume unclosed_reasoning_content if allow_toolcall_in_think is set
+        if (form.allow_toolcall_in_think && !unclosed_reasoning_content.empty()) {
+            if (builder.syntax().reasoning_format != COMMON_REASONING_FORMAT_NONE && !builder.syntax().reasoning_in_content) {
+                builder.add_reasoning_content(unclosed_reasoning_content);
+            } else {
+                if (content.empty()) {
+                    content = start_think + unclosed_reasoning_content;
+                } else {
+                    content += "\n\n" + start_think;
+                    content += unclosed_reasoning_content;
+                }
+            }
+            unclosed_reasoning_content.clear();
         }

         // Add content
-        if (content.
+        if (!content.empty()) {
             // If there are multiple content blocks
             if (builder.syntax().reasoning_format != COMMON_REASONING_FORMAT_NONE && !builder.syntax().reasoning_in_content && builder.result().content.size() != 0) {
                 builder.add_content("\n\n");

@@ -820,7 +839,7 @@ inline void parse_msg_with_xml_tool_calls(common_chat_msg_parser & builder, cons
             builder.add_content(content);
         }

-        // This <tool_call> start is in thinking block, skip this tool call
+        // This <tool_call> start is in thinking block and toolcall_in_think not set, skip this tool call
         if (toolcall_in_think && !form.allow_toolcall_in_think) {
             continue;
         }

@@ -829,7 +848,7 @@ inline void parse_msg_with_xml_tool_calls(common_chat_msg_parser & builder, cons
         if (!tc) {
             GGML_ASSERT(builder.pos() == builder.input().size());
             GGML_ASSERT(unclosed_reasoning_content.empty());
-            GGML_ASSERT(!reasoning_unclosed);
+            if (!form.allow_toolcall_in_think) GGML_ASSERT(!reasoning_unclosed);
             break;
         }

@@ -854,7 +873,6 @@ inline void parse_msg_with_xml_tool_calls(common_chat_msg_parser & builder, cons

 /**
  * Parse content uses reasoning and XML-Style tool call
- * TODO: Note that form.allow_toolcall_in_think is not tested yet. If anyone confirms it works, this comment can be removed.
  */
 void common_chat_msg_parser::consume_reasoning_with_xml_tool_calls(const struct xml_tool_call_format & form, const std::string & start_think, const std::string & end_think) {
     parse_msg_with_xml_tool_calls(*this, form, start_think, end_think);
package/src/llama.cpp/common/chat-parser-xml-toolcall.h
CHANGED

@@ -31,7 +31,7 @@ struct xml_tool_call_format {
     std::optional<std::string> last_val_end = std::nullopt;
     std::optional<std::string> last_tool_end = std::nullopt;
     bool trim_raw_argval = false;
-    bool allow_toolcall_in_think = false;
+    bool allow_toolcall_in_think = false;
 };

 // make a GBNF that accept any strings except those containing any of the forbidden strings.
package/src/llama.cpp/common/chat-parser.cpp
CHANGED

@@ -917,12 +917,13 @@ static void common_chat_parse_kimi_k2(common_chat_msg_parser & builder) {
         form.tool_start = "<|tool_call_begin|>";
         form.tool_sep = "<|tool_call_argument_begin|>{";
         form.key_start = "\"";
-        form.key_val_sep = "\":
-        form.val_end = ",
+        form.key_val_sep = "\":";
+        form.val_end = ",";
         form.tool_end = "}<|tool_call_end|>";
         form.scope_end = "<|tool_calls_section_end|>";
         form.raw_argval = false;
         form.last_val_end = "";
+        form.allow_toolcall_in_think = true;
         return form;
     })();
     builder.consume_reasoning_with_xml_tool_calls(form, "<think>", "</think>");
package/src/llama.cpp/common/chat.cpp
CHANGED

@@ -1,5 +1,6 @@
 #include "chat.h"
 #include "chat-parser.h"
+#include "chat-peg-parser.h"
 #include "common.h"
 #include "json-partial.h"
 #include "json-schema-to-grammar.h"

@@ -137,6 +138,7 @@ struct templates_params {
     common_chat_tool_choice tool_choice;
     json json_schema;
     bool parallel_tool_calls;
+    common_reasoning_format reasoning_format;
     bool stream;
     std::string grammar;
     bool add_generation_prompt = true;

@@ -576,6 +578,16 @@ common_chat_templates_ptr common_chat_templates_init(
             "{%- if false %}");
     }

+    // TODO @aldehir : this is a temporary fix, pending Minja changes
+    // Ref: https://github.com/ggml-org/llama.cpp/pull/17713#issuecomment-3631342664
+    if (default_template_src.find("[TOOL_CALLS]") != std::string::npos
+        // search for the error message and patch it
+        && default_template_src.find("if (message['content'] is none or") != std::string::npos) {
+        string_replace_all(default_template_src,
+            "{%- if (message['content'] is none or message['content'] == '' or message['content']|length == 0) and (message['tool_calls'] is not defined or message['tool_calls'] is none or message['tool_calls']|length == 0) %}",
+            "{%- if false %}");
+    }
+
     std::string token_bos = bos_token_override;
     std::string token_eos = eos_token_override;
     bool add_bos = false;

@@ -974,6 +986,118 @@ static common_chat_params common_chat_params_init_lfm2(const common_chat_templat
     return data;
 }

+static common_chat_params common_chat_params_init_ministral_3(const common_chat_template & tmpl, const struct templates_params & inputs) {
+    common_chat_params data;
+
+    // Build up messages to follow the format: https://huggingface.co/mistralai/Ministral-3-14B-Reasoning-2512/blob/main/chat_template.jinja
+    auto adjusted_messages = json::array();
+    for (const auto & msg : inputs.messages) {
+        auto role = msg.value("role", "");
+        if (role != "system" && role != "assistant") {
+            // Only adjust system and assistant messages. Interestingly, the system message may contain thinking.
+            adjusted_messages.push_back(msg);
+            continue;
+        }
+
+        auto content = json::array();
+
+        // If message contains `reasoning_content`, add it as a block of type `thinking`
+        if (msg.contains("reasoning_content") && msg.at("reasoning_content").is_string()) {
+            content.push_back({
+                {"type", "thinking"},
+                {"thinking", msg.at("reasoning_content").get<std::string>()},
+            });
+        }
+
+        // If message contains `content`, add it as a block of type `text`
+        if (msg.contains("content")) {
+            if (msg.at("content").is_string()) {
+                content.push_back({
+                    {"type", "text"},
+                    {"text", msg.at("content").get<std::string>()},
+                });
+            } else if (msg.at("content").is_array()) {
+                auto blocks = msg.at("content");
+                content.insert(content.end(), blocks.begin(), blocks.end());
+            }
+        }
+
+        auto adjusted = msg;
+        adjusted["content"] = content;
+        adjusted.erase("reasoning_content");
+        adjusted_messages.push_back(adjusted);
+    }
+
+    auto has_tools = inputs.tools.is_array() && !inputs.tools.empty();
+    auto extract_reasoning = inputs.reasoning_format != COMMON_REASONING_FORMAT_NONE;
+    auto include_grammar = true;
+
+    data.prompt = apply(tmpl, inputs, /* messages_override = */ adjusted_messages);
+    data.format = COMMON_CHAT_FORMAT_PEG_NATIVE;
+    data.preserved_tokens = {
+        "[THINK]",
+        "[/THINK]",
+        "[TOOL_CALLS]",
+        "[ARGS]",
+    };
+
+    auto parser = build_chat_peg_native_parser([&](common_chat_peg_native_builder & p) {
+        auto reasoning = extract_reasoning ? p.optional("[THINK]" + p.reasoning(p.until("[/THINK]")) + "[/THINK]") : p.eps();
+
+        // Response format parser
+        if (inputs.json_schema.is_object() && !inputs.json_schema.empty()) {
+            // Ministral wants to emit json surrounded by code fences
+            return reasoning << "```json" << p.content(p.schema(p.json(), "response-format", inputs.json_schema)) << "```";
+        }
+
+        // Tool call parser
+        if (has_tools && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE) {
+            auto tool_choice = p.choice();
+            foreach_function(inputs.tools, [&](const json & tool) {
+                const auto & function = tool.at("function");
+                std::string name = function.at("name");
+                const auto & schema = function.at("parameters");
+
+                tool_choice |= p.rule("tool-" + name,
+                    p.tool_open(p.tool_name(p.literal(name)) + "[ARGS]")
+                    + p.tool_args(p.schema(p.json(), "tool-" + name + "-schema", schema))
+                );
+            });
+
+            auto min_calls = inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED ? 1 : 0;
+            auto max_calls = inputs.parallel_tool_calls ? -1 : 1;
+            auto tool_calls = p.trigger_rule("tool-call", p.repeat("[TOOL_CALLS]" + tool_choice, min_calls, max_calls));
+
+            return reasoning << p.content(p.until("[TOOL_CALLS]")) << tool_calls;
+        }
+
+        // Content only parser
+        include_grammar = false;
+        return reasoning << p.content(p.rest());
+    });
+
+    data.parser = parser.save();
+
+    if (include_grammar) {
+        data.grammar_lazy = has_tools && inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_AUTO;
+
+        data.grammar = build_grammar([&](const common_grammar_builder & builder) {
+            foreach_function(inputs.tools, [&](const json & tool) {
+                const auto & function = tool.at("function");
+                auto schema = function.at("parameters");
+                builder.resolve_refs(schema);
+            });
+            parser.build_grammar(builder, data.grammar_lazy);
+        });
+
+        data.grammar_triggers = {
+            {COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "[TOOL_CALLS]"}
+        };
+    }
+
+    return data;
+}
+
 static common_chat_params common_chat_params_init_magistral(const common_chat_template & tmpl, const struct templates_params & inputs) {
     common_chat_params data;
     data.prompt = apply(tmpl, inputs);

@@ -2328,6 +2452,7 @@ static common_chat_params common_chat_templates_apply_jinja(
     params.messages = common_chat_msgs_to_json_oaicompat<json>(inputs.messages, /* concat_text= */ !tmpl.original_caps().requires_typed_content);
     params.add_generation_prompt = inputs.add_generation_prompt;
     params.tool_choice = inputs.tool_choice;
+    params.reasoning_format = inputs.reasoning_format;
     params.enable_thinking = inputs.enable_thinking;
     params.grammar = inputs.grammar;
     params.now = inputs.now;

@@ -2491,6 +2616,13 @@
         return common_chat_params_init_llama_3_x(tmpl, params, allow_python_tag_builtin_tools);
     }

+    // Ministral/Mistral Large 3
+    if (src.find("[SYSTEM_PROMPT]") != std::string::npos &&
+        src.find("[TOOL_CALLS]") != std::string::npos &&
+        src.find("[ARGS]") != std::string::npos) {
+        return common_chat_params_init_ministral_3(tmpl, params);
+    }
+
     if (src.find("[THINK]") != std::string::npos && src.find("[/THINK]") != std::string::npos) {
         return common_chat_params_init_magistral(tmpl, params);
     }
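Taken together, the new Ministral 3 handler converts `reasoning_content`/`content` into typed message blocks before applying the template, and registers a PEG-native parser for the model output. For orientation, that parser accepts assistant text of roughly this shape; the tool name and arguments below are hypothetical, purely to illustrate the token layout:

```ts
// Hypothetical Ministral 3 assistant output accepted by the parser sketched above:
// optional [THINK]...[/THINK] reasoning, free-form content, then tool calls,
// each introduced by [TOOL_CALLS] as <name>[ARGS]<JSON arguments>.
const sample =
  '[THINK]The user wants the weather, so a tool call is needed.[/THINK]' +
  'Let me check that for you.' +
  '[TOOL_CALLS]get_weather[ARGS]{"location": "Paris"}'
```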