@fugood/llama.node 1.1.8 → 1.1.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/binding.ts +9 -0
- package/lib/index.js +9 -2
- package/lib/index.ts +57 -30
- package/lib/version.js +2 -2
- package/lib/version.ts +2 -2
- package/package.json +14 -14
- package/scripts/llama.cpp.patch +15 -5
- package/src/LlamaCompletionWorker.cpp +12 -3
- package/src/LlamaCompletionWorker.h +3 -1
- package/src/LlamaContext.cpp +14 -1
- package/src/llama.cpp/common/arg.cpp +6 -4
- package/src/llama.cpp/common/chat.cpp +34 -3
- package/src/llama.cpp/common/common.cpp +0 -15
- package/src/llama.cpp/common/common.h +1 -2
- package/src/llama.cpp/ggml/CMakeLists.txt +0 -1
- package/src/llama.cpp/ggml/include/ggml.h +25 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +316 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +0 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +8 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +6 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +142 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +1 -0
- package/src/llama.cpp/include/llama.h +1 -110
- package/src/llama.cpp/src/CMakeLists.txt +2 -2
- package/src/llama.cpp/src/llama-arch.cpp +19 -0
- package/src/llama.cpp/src/llama-arch.h +1 -0
- package/src/llama.cpp/src/llama-chat.cpp +13 -2
- package/src/llama.cpp/src/llama-chat.h +1 -0
- package/src/llama.cpp/src/llama-context.cpp +5 -192
- package/src/llama.cpp/src/llama-context.h +2 -7
- package/src/llama.cpp/src/llama-cparams.h +0 -1
- package/src/llama.cpp/src/llama-graph.cpp +35 -57
- package/src/llama.cpp/src/llama-graph.h +36 -46
- package/src/llama.cpp/src/llama-hparams.cpp +25 -0
- package/src/llama.cpp/src/llama-hparams.h +6 -0
- package/src/llama.cpp/src/{llama-kv-cache-unified-iswa.cpp → llama-kv-cache-iswa.cpp} +69 -52
- package/src/llama.cpp/src/{llama-kv-cache-unified-iswa.h → llama-kv-cache-iswa.h} +28 -26
- package/src/llama.cpp/src/{llama-kv-cache-unified.cpp → llama-kv-cache.cpp} +123 -474
- package/src/llama.cpp/src/{llama-kv-cache-unified.h → llama-kv-cache.h} +34 -59
- package/src/llama.cpp/src/llama-kv-cells.h +21 -21
- package/src/llama.cpp/src/llama-memory-hybrid.cpp +34 -33
- package/src/llama.cpp/src/llama-memory-hybrid.h +24 -28
- package/src/llama.cpp/src/llama-memory-recurrent.cpp +7 -7
- package/src/llama.cpp/src/llama-memory-recurrent.h +8 -12
- package/src/llama.cpp/src/llama-memory.h +11 -8
- package/src/llama.cpp/src/llama-model.cpp +396 -187
- package/src/llama.cpp/src/llama-model.h +1 -0
package/lib/binding.ts
CHANGED
@@ -59,6 +59,10 @@ export type LlamaModelOptions = {
    * Use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
    */
   swa_full?: boolean
+  /**
+   * Number of layers to keep MoE weights on CPU
+   */
+  n_cpu_moe?: number
   use_mlock?: boolean
   use_mmap?: boolean
   vocab_only?: boolean
@@ -96,6 +100,11 @@ export type LlamaCompletionOptions = {
   enable_thinking?: boolean
   thinking_forced_open?: boolean
   prompt?: string
+  /**
+   * Text to prefill the response with.
+   * This text will be added to the beginning of the generated response.
+   */
+  prefill_text?: string
   temperature?: number
   top_k?: number
   top_p?: number
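For orientation, a minimal usage sketch of the two options introduced above (n_cpu_moe on LlamaModelOptions, prefill_text on LlamaCompletionOptions). The loadModel entry point, the model path, and the message used here are assumptions for illustration and are not part of this diff:

import { loadModel } from '@fugood/llama.node'

// Assumed sketch: keep the MoE expert weights of the first 8 layers on the CPU,
// and seed the assistant response with a prefill string.
const ctx = await loadModel({
  model: './model.gguf', // hypothetical path
  n_cpu_moe: 8, // new option: layers 0..7 keep their ffn_(up|down|gate)_exps tensors on CPU
})

const result = await ctx.completion({
  messages: [{ role: 'user', content: 'Hello!' }],
  prefill_text: 'Sure, ', // new option: prepended to the generated response before parsing
})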
package/lib/index.js
CHANGED
@@ -148,7 +148,12 @@ class LlamaContextWrapper {
             enable_thinking: (_a = params === null || params === void 0 ? void 0 : params.enable_thinking) !== null && _a !== void 0 ? _a : true,
             add_generation_prompt: params === null || params === void 0 ? void 0 : params.add_generation_prompt,
             now: params === null || params === void 0 ? void 0 : params.now,
-            chat_template_kwargs: params === null || params === void 0 ? void 0 : params.chat_template_kwargs
+            chat_template_kwargs: (params === null || params === void 0 ? void 0 : params.chat_template_kwargs)
+                ? Object.entries(params.chat_template_kwargs).reduce((acc, [key, value]) => {
+                    acc[key] = JSON.stringify(value); // Each value is a stringified JSON object
+                    return acc;
+                }, {})
+                : undefined,
         });
         if (!useJinja) {
             return {
@@ -179,7 +184,9 @@ class LlamaContextWrapper {
         return this.ctx.embedding(text);
     }
     rerank(query, documents, params) {
-        return this.ctx
+        return this.ctx
+            .rerank(query, documents, params)
+            .then((results) => {
             // Sort by score descending and add document text for convenience
             return results
                 .map((result) => (Object.assign(Object.assign({}, result), { document: documents[result.index] })))
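The chat_template_kwargs change above means each value is serialized with JSON.stringify before it reaches the native chat-template formatter. A small illustrative sketch of that transformation; the kwargs object here is made up for illustration:

// Illustrative input: arbitrary template kwargs passed by the caller.
const kwargs = { enable_thinking: false, builtin_tools: ['browser'] }

// What the wrapper now does before handing the kwargs to the native formatter:
const serialized = Object.entries(kwargs).reduce((acc, [key, value]) => {
  acc[key] = JSON.stringify(value) // e.g. 'false' and '["browser"]'
  return acc
}, {} as Record<string, string>)

// serialized = { enable_thinking: 'false', builtin_tools: '["browser"]' }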
package/lib/index.ts
CHANGED
@@ -165,11 +165,11 @@ class LlamaContextWrapper {
       response_format?: CompletionResponseFormat
       tools?: Tool[]
       parallel_tool_calls?: boolean
-      tool_choice?: string
-      enable_thinking?: boolean
-      add_generation_prompt?: boolean
-      now?: string | number
-      chat_template_kwargs?: Record<string, string
+      tool_choice?: string
+      enable_thinking?: boolean
+      add_generation_prompt?: boolean
+      now?: string | number
+      chat_template_kwargs?: Record<string, string>
     },
   ): FormattedChatResult {
     const {
@@ -192,7 +192,15 @@ class LlamaContextWrapper {
       enable_thinking: params?.enable_thinking ?? true,
       add_generation_prompt: params?.add_generation_prompt,
       now: params?.now,
-      chat_template_kwargs: params?.chat_template_kwargs
+      chat_template_kwargs: params?.chat_template_kwargs
+        ? Object.entries(params.chat_template_kwargs).reduce(
+            (acc, [key, value]) => {
+              acc[key] = JSON.stringify(value) // Each value is a stringified JSON object
+              return acc
+            },
+            {} as Record<string, any>,
+          )
+        : undefined,
     })
 
     if (!useJinja) {
@@ -218,18 +226,24 @@ class LlamaContextWrapper {
   ): Promise<LlamaCompletionResult> {
     const { messages, media_paths = options.media_paths } =
       this._formatMediaChat(options.messages)
-    return this.ctx.completion(
-
-
-
-
+    return this.ctx.completion(
+      {
+        ...options,
+        messages,
+        media_paths: options.media_paths || media_paths,
+      },
+      callback || (() => {}),
+    )
   }
 
   stopCompletion(): void {
     return this.ctx.stopCompletion()
   }
 
-  tokenize(
+  tokenize(
+    text: string,
+    { media_paths }: { media_paths?: string[] } = {},
+  ): Promise<TokenizeResult> {
     return this.ctx.tokenize(text, media_paths)
   }
 
@@ -241,16 +255,27 @@ class LlamaContextWrapper {
     return this.ctx.embedding(text)
   }
 
-  rerank(
-
-
-
-
-
-
-
-
+  rerank(
+    query: string,
+    documents: string[],
+    params?: RerankParams,
+  ): Promise<Array<RerankResult & { document: string }>> {
+    return this.ctx
+      .rerank(query, documents, params)
+      .then((results: RerankResult[]) => {
+        // Sort by score descending and add document text for convenience
+        return results
+          .map((result: RerankResult) => ({
+            ...result,
+            document: documents[result.index],
+          }))
+          .sort(
+            (
+              a: RerankResult & { document: string },
+              b: RerankResult & { document: string },
+            ) => b.score - a.score,
+          )
+      })
   }
 
   saveSession(path: string): Promise<void> {
@@ -277,10 +302,7 @@ class LlamaContextWrapper {
     return this.ctx.getLoadedLoraAdapters()
   }
 
-  initMultimodal(options: {
-    path: string
-    use_gpu?: boolean
-  }): boolean {
+  initMultimodal(options: { path: string; use_gpu?: boolean }): boolean {
     return this.ctx.initMultimodal(options)
   }
 
@@ -299,7 +321,7 @@ class LlamaContextWrapper {
     return this.ctx.getMultimodalSupport()
   }
 
-  initVocoder(options: { path: string
+  initVocoder(options: { path: string; n_batch?: number }): boolean {
     return this.ctx.initVocoder(options)
   }
 
@@ -311,7 +333,10 @@ class LlamaContextWrapper {
     return this.ctx.isVocoderEnabled()
   }
 
-  getFormattedAudioCompletion(
+  getFormattedAudioCompletion(
+    speaker: string | null,
+    text: string,
+  ): {
     prompt: string
     grammar?: string
   } {
@@ -322,7 +347,7 @@ class LlamaContextWrapper {
     return this.ctx.getAudioCompletionGuideTokens(text)
   }
 
-  decodeAudioTokens(tokens: number[]|Int32Array): Promise<Float32Array> {
+  decodeAudioTokens(tokens: number[] | Int32Array): Promise<Float32Array> {
     return this.ctx.decodeAudioTokens(tokens)
   }
 }
@@ -348,7 +373,9 @@ const modelInfoSkip = [
   'tokenizer.ggml.scores',
 ]
 
-export const loadLlamaModelInfo = async (
+export const loadLlamaModelInfo = async (
+  path: string,
+): Promise<GGUFModelInfo> => {
   const variant = 'default'
   mods[variant] ??= await loadModule(variant)
   refreshNativeLogSetup()
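For reference, a small usage sketch of the reworked rerank wrapper above: it now resolves to entries carrying the original index, the relevance score, and the matching document text, sorted by score descending. The ctx value stands for an already-created context and is an assumption, not part of this diff:

// Assumed: ctx is a LlamaContextWrapper created elsewhere (e.g. via loadModel).
const ranked = await ctx.rerank(
  'What is the capital of France?',
  ['Berlin is the capital of Germany.', 'Paris is the capital of France.'],
)

for (const r of ranked) {
  // index: position in the input array, score: relevance, document: the input text itself
  console.log(r.index, r.score.toFixed(3), r.document)
}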
package/lib/version.js
CHANGED
@@ -1,5 +1,5 @@
 "use strict";
 Object.defineProperty(exports, "__esModule", { value: true });
 exports.BUILD_COMMIT = exports.BUILD_NUMBER = void 0;
-exports.BUILD_NUMBER = '
-exports.BUILD_COMMIT = '
+exports.BUILD_NUMBER = '6250';
+exports.BUILD_COMMIT = 'e92734d51';
package/lib/version.ts
CHANGED
@@ -1,2 +1,2 @@
-export const BUILD_NUMBER = '
-export const BUILD_COMMIT = '
+export const BUILD_NUMBER = '6250';
+export const BUILD_COMMIT = 'e92734d51';
package/package.json
CHANGED
@@ -1,7 +1,7 @@
 {
   "name": "@fugood/llama.node",
   "access": "public",
-  "version": "1.1.
+  "version": "1.1.10",
   "description": "An another Node binding of llama.cpp",
   "main": "lib/index.js",
   "scripts": {
@@ -71,19 +71,19 @@
     "CMakeLists.txt"
   ],
   "optionalDependencies": {
-    "@fugood/node-llama-linux-x64": "1.1.
-    "@fugood/node-llama-linux-x64-vulkan": "1.1.
-    "@fugood/node-llama-linux-x64-cuda": "1.1.
-    "@fugood/node-llama-linux-arm64": "1.1.
-    "@fugood/node-llama-linux-arm64-vulkan": "1.1.
-    "@fugood/node-llama-linux-arm64-cuda": "1.1.
-    "@fugood/node-llama-win32-x64": "1.1.
-    "@fugood/node-llama-win32-x64-vulkan": "1.1.
-    "@fugood/node-llama-win32-x64-cuda": "1.1.
-    "@fugood/node-llama-win32-arm64": "1.1.
-    "@fugood/node-llama-win32-arm64-vulkan": "1.1.
-    "@fugood/node-llama-darwin-x64": "1.1.
-    "@fugood/node-llama-darwin-arm64": "1.1.
+    "@fugood/node-llama-linux-x64": "1.1.10",
+    "@fugood/node-llama-linux-x64-vulkan": "1.1.10",
+    "@fugood/node-llama-linux-x64-cuda": "1.1.10",
+    "@fugood/node-llama-linux-arm64": "1.1.10",
+    "@fugood/node-llama-linux-arm64-vulkan": "1.1.10",
+    "@fugood/node-llama-linux-arm64-cuda": "1.1.10",
+    "@fugood/node-llama-win32-x64": "1.1.10",
+    "@fugood/node-llama-win32-x64-vulkan": "1.1.10",
+    "@fugood/node-llama-win32-x64-cuda": "1.1.10",
+    "@fugood/node-llama-win32-arm64": "1.1.10",
+    "@fugood/node-llama-win32-arm64-vulkan": "1.1.10",
+    "@fugood/node-llama-darwin-x64": "1.1.10",
+    "@fugood/node-llama-darwin-arm64": "1.1.10"
   },
   "devDependencies": {
     "@babel/preset-env": "^7.24.4",
package/scripts/llama.cpp.patch
CHANGED
@@ -1,5 +1,5 @@
 diff --git a/src/llama.cpp/common/chat.cpp b/src/llama.cpp/common/chat.cpp
-index
+index 111b4a21b..16ce87672 100644
 --- a/src/llama.cpp/common/chat.cpp
 +++ b/src/llama.cpp/common/chat.cpp
 @@ -6,9 +6,6 @@
@@ -29,6 +29,16 @@ index 23d3828f9..ca48af00c 100644
  struct templates_params {
      json messages;
      json tools;
+@@ -784,8 +771,7 @@ static std::string apply(
+     if (additional_context) {
+         tmpl_inputs.extra_context.merge_patch(*additional_context);
+     }
+-    // TODO: add flag to control date/time, if only for testing purposes.
+-    // tmpl_inputs.now = std::chrono::system_clock::now();
++    tmpl_inputs.now = inputs.now;
+
+     minja::chat_template_options tmpl_opts;
+     // To avoid double BOS / EOS tokens, we're manually removing begining / trailing tokens
 diff --git a/src/llama.cpp/common/chat.h b/src/llama.cpp/common/chat.h
 index d1e480c91..437e64e29 100644
 --- a/src/llama.cpp/common/chat.h
@@ -54,10 +64,10 @@ index d1e480c91..437e64e29 100644
  struct common_chat_tool_call {
      std::string name;
 diff --git a/src/llama.cpp/common/common.cpp b/src/llama.cpp/common/common.cpp
-index
+index fdce1dcde..55aac3412 100644
 --- a/src/llama.cpp/common/common.cpp
 +++ b/src/llama.cpp/common/common.cpp
-@@ -
+@@ -1103,6 +1103,7 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
      mparams.n_gpu_layers = params.n_gpu_layers;
  }
 
@@ -66,10 +76,10 @@ index 67dd5404f..909a97c66 100644
      mparams.split_mode = params.split_mode;
      mparams.tensor_split = params.tensor_split;
 diff --git a/src/llama.cpp/common/common.h b/src/llama.cpp/common/common.h
-index
+index 390dda5e5..f259ca785 100644
 --- a/src/llama.cpp/common/common.h
 +++ b/src/llama.cpp/common/common.h
-@@ -
+@@ -270,6 +270,7 @@ struct lr_opt {
  struct ggml_opt_optimizer_params common_opt_lr_pars(void * userdata);
 
  struct common_params {
package/src/LlamaCompletionWorker.cpp
CHANGED

@@ -35,12 +35,14 @@ LlamaCompletionWorker::LlamaCompletionWorker(
     const std::vector<std::string> &media_paths,
     const std::vector<llama_token> &guide_tokens,
     bool has_vocoder,
-    tts_type tts_type_val
+    tts_type tts_type_val,
+    const std::string &prefill_text)
     : AsyncWorker(info.Env()), Deferred(info.Env()), _sess(sess),
       _params(params), _stop_words(stop_words), _chat_format(chat_format),
       _thinking_forced_open(thinking_forced_open),
       _reasoning_format(reasoning_format),
       _media_paths(media_paths), _guide_tokens(guide_tokens),
+      _prefill_text(prefill_text),
       _has_vocoder(has_vocoder), _tts_type(tts_type_val) {
   if (!callback.IsEmpty()) {
     _tsfn = Napi::ThreadSafeFunction::New(info.Env(), callback,
@@ -68,8 +70,11 @@ LlamaCompletionWorker::PartialOutput LlamaCompletionWorker::getPartialOutput(con
 
   chat_syntax.parse_tool_calls = true;
 
+  // Combine prefill_text with generated_text for parsing
+  std::string full_text = _prefill_text + generated_text;
+
   // Use is_partial=true for streaming partial output
-  common_chat_msg parsed_msg = common_chat_parse(
+  common_chat_msg parsed_msg = common_chat_parse(full_text, true, chat_syntax);
 
   result.content = parsed_msg.content;
   result.reasoning_content = parsed_msg.reasoning_content;
@@ -156,6 +161,7 @@ void LlamaCompletionWorker::Execute() {
   auto embd = _sess->tokens_ptr();
   embd->reserve(embd->size() + max_len);
 
+
   if (is_enc_dec) {
     if (n_input > 0) {
       // Decode tokens in batches using n_batch as chunk size
@@ -378,8 +384,11 @@ void LlamaCompletionWorker::OnOK() {
   chat_syntax.thinking_forced_open = _thinking_forced_open;
 
   chat_syntax.reasoning_format = common_reasoning_format_from_name(_reasoning_format);
+
+  // Combine prefill_text with generated_text for final parsing
+  std::string full_text = _prefill_text + _result.text;
   common_chat_msg message = common_chat_parse(
-
+      full_text,
       false,
       chat_syntax
   );
package/src/LlamaCompletionWorker.h
CHANGED

@@ -26,7 +26,8 @@ public:
       const std::vector<std::string> &media_paths = {},
       const std::vector<llama_token> &guide_tokens = {},
       bool has_vocoder = false,
-      tts_type tts_type_val = UNKNOWN
+      tts_type tts_type_val = UNKNOWN,
+      const std::string &prefill_text = "");
 
   ~LlamaCompletionWorker();
 
@@ -58,6 +59,7 @@ private:
   std::string _reasoning_format;
   std::vector<std::string> _media_paths;
   std::vector<llama_token> _guide_tokens;
+  std::string _prefill_text;
   std::function<void()> _onComplete;
   bool _has_callback = false;
  bool _interrupted = false;
package/src/LlamaContext.cpp
CHANGED
@@ -15,6 +15,7 @@
 #include "llama-impl.h"
 
 #include <atomic>
+#include <list>
 #include <mutex>
 #include <queue>
 
@@ -258,6 +259,16 @@ LlamaContext::LlamaContext(const Napi::CallbackInfo &info)
   params.numa =
       static_cast<ggml_numa_strategy>(get_option<uint32_t>(options, "numa", 0));
 
+  int n_cpu_moe = get_option<int32_t>(options, "n_cpu_moe", 0);
+  if (n_cpu_moe > 0) {
+    static std::list<std::string> buft_overrides;
+    for (int i = 0; i < n_cpu_moe; ++i) {
+      buft_overrides.push_back(string_format("blk\\.%d\\.ffn_(up|down|gate)_exps", i));
+      params.tensor_buft_overrides.push_back({buft_overrides.back().c_str(), ggml_backend_cpu_buffer_type()});
+    }
+    params.tensor_buft_overrides.push_back({nullptr, nullptr});
+  }
+
   llama_backend_init();
   llama_numa_init(params.numa);
 
@@ -924,6 +935,8 @@ Napi::Value LlamaContext::Completion(const Napi::CallbackInfo &info) {
         json_schema_to_grammar(json::parse(json_schema_str));
   }
 
+  std::string prefill_text = get_option<std::string>(options, "prefill_text", "");
+
   params.n_predict = get_option<int32_t>(options, "n_predict", -1);
   params.sampling.temp = get_option<float>(options, "temperature", 0.80f);
   params.sampling.top_k = get_option<int32_t>(options, "top_k", 40);
@@ -996,7 +1009,7 @@ Napi::Value LlamaContext::Completion(const Napi::CallbackInfo &info) {
   auto *worker =
       new LlamaCompletionWorker(info, _sess, callback, params, stop_words,
                                 chat_format, thinking_forced_open, reasoning_format, media_paths, guide_tokens,
-                                _has_vocoder, _tts_type);
+                                _has_vocoder, _tts_type, prefill_text);
   worker->Queue();
   _wip = worker;
   worker->OnComplete([this]() { _wip = nullptr; });
package/src/llama.cpp/common/arg.cpp
CHANGED

@@ -1532,7 +1532,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY}).set_env("LLAMA_ARG_NO_CONTEXT_SHIFT"));
     add_opt(common_arg(
         {"--context-shift"},
-        string_format("enables context shift on infinite text generation (default: %s)", params.ctx_shift ? "
+        string_format("enables context shift on infinite text generation (default: %s)", params.ctx_shift ? "enabled" : "disabled"),
         [](common_params & params) {
             params.ctx_shift = true;
         }
@@ -1755,7 +1755,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params) {
             params.warmup = false;
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL}));
+    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_PERPLEXITY}));
     add_opt(common_arg(
         {"--spm-infill"},
         string_format(
@@ -2254,9 +2254,11 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
     add_opt(common_arg(
         {"-dt", "--defrag-thold"}, "N",
-        string_format("KV cache defragmentation threshold (
+        string_format("KV cache defragmentation threshold (DEPRECATED)"),
         [](common_params & params, const std::string & value) {
-            params
+            GGML_UNUSED(params);
+            GGML_UNUSED(value);
+            LOG_WRN("DEPRECATED: --defrag-thold is deprecated and no longer necessary to specify\n");
         }
     ).set_env("LLAMA_ARG_DEFRAG_THOLD"));
     add_opt(common_arg(
package/src/llama.cpp/common/chat.cpp
CHANGED

@@ -134,6 +134,7 @@ struct templates_params {
     json extra_context;
     bool add_bos;
     bool add_eos;
+    bool is_inference = true;
 };
 
 common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::string & tool_choice) {
@@ -770,8 +771,7 @@ static std::string apply(
     if (additional_context) {
         tmpl_inputs.extra_context.merge_patch(*additional_context);
    }
-
-    // tmpl_inputs.now = std::chrono::system_clock::now();
+    tmpl_inputs.now = inputs.now;
 
     minja::chat_template_options tmpl_opts;
     // To avoid double BOS / EOS tokens, we're manually removing begining / trailing tokens
@@ -1323,6 +1323,17 @@ static common_chat_params common_chat_params_init_gpt_oss(const common_chat_temp
     common_chat_params data;
     auto prompt = apply(tmpl, inputs);
 
+    // Check if we need to replace the return token with end token during
+    // inference and without generation prompt. For more details see:
+    // https://github.com/ggml-org/llama.cpp/issues/15417
+    if (inputs.is_inference && !inputs.add_generation_prompt) {
+        static constexpr std::string_view return_token = "<|return|>";
+        static constexpr std::string_view end_token = "<|end|>";
+        if (size_t pos = prompt.rfind(return_token); pos != std::string::npos) {
+            prompt.replace(pos, return_token.length(), end_token);
+        }
+    }
+
     data.prompt = prompt;
     data.format = COMMON_CHAT_FORMAT_GPT_OSS;
 
@@ -1336,6 +1347,26 @@ static common_chat_params common_chat_params_init_gpt_oss(const common_chat_temp
         "<|end|>",
     };
 
+    if (!inputs.json_schema.is_null()) {
+        data.grammar_lazy = false;
+        data.grammar = build_grammar([&](const common_grammar_builder & builder) {
+            auto schema = inputs.json_schema;
+            builder.resolve_refs(schema);
+
+            auto not_end = builder.add_rule("not-end",
+                "[^<] | \"<\" [^|] | \"<|\" [^e] | \"<|e\" [^n] | \"<|en\" [^d] | \"<|end\" [^|] | \"<|end|\" [^>]");
+            auto analysis = builder.add_rule("analysis",
+                "\"<|channel|>analysis<|message|>\" ( " + not_end + " )* \"<|end|>\"");
+            auto constraint = builder.add_rule("constraint", "\"<|constrain|>\"? [a-zA-Z0-9_-]+");
+            auto final = builder.add_rule("final",
+                "\"<|channel|>final\" ( \" \" " + constraint + " )? \"<|message|>\" " +
+                builder.add_schema("response", schema)
+            );
+
+            builder.add_rule("root", "( " + analysis + " \"<|start|>assistant\" )? " + final);
+        });
+    }
+
     if (inputs.tools.is_array() && !inputs.tools.empty()) {
         data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
         data.grammar = build_grammar([&](const common_grammar_builder & builder) {
@@ -2096,7 +2127,7 @@ static common_chat_params common_chat_templates_apply_jinja(
     }
 
     // GPT-OSS
-    if (src.find("<|channel|>") != std::string::npos
+    if (src.find("<|channel|>") != std::string::npos) {
         return common_chat_params_init_gpt_oss(tmpl, params);
     }
 
package/src/llama.cpp/common/common.cpp
CHANGED

@@ -558,13 +558,6 @@ std::string string_from(const struct llama_context * ctx, const std::vector<llam
 
         auto detokenized = common_token_to_piece(ctx, token);
 
-        detokenized.erase(
-            std::remove_if(
-                detokenized.begin(),
-                detokenized.end(),
-                [](const unsigned char c) { return !std::isprint(c); }),
-            detokenized.end());
-
         buf << "'" << detokenized << "'"
             << ":" << std::to_string(token);
     }
@@ -589,13 +582,6 @@ std::string string_from(const struct llama_context * ctx, const struct llama_bat
 
         auto detokenized = common_token_to_piece(ctx, batch.token[i]);
 
-        detokenized.erase(
-            std::remove_if(
-                detokenized.begin(),
-                detokenized.end(),
-                [](const unsigned char c) { return !std::isprint(c); }),
-            detokenized.end());
-
         buf << "\n" << std::to_string(i)
             << ", token '" << detokenized << "'"
             << ", pos " << std::to_string(batch.pos[i])
@@ -1167,7 +1153,6 @@ struct llama_context_params common_context_params_to_llama(const common_params &
     cparams.yarn_orig_ctx = params.yarn_orig_ctx;
     cparams.pooling_type = params.pooling_type;
     cparams.attention_type = params.attention_type;
-    cparams.defrag_thold = params.defrag_thold;
     cparams.cb_eval = params.cb_eval;
     cparams.cb_eval_user_data = params.cb_eval_user_data;
     cparams.offload_kqv = !params.no_kv_offload;
package/src/llama.cpp/common/common.h
CHANGED

@@ -289,7 +289,6 @@ struct common_params {
     float yarn_beta_fast = 32.0f; // YaRN low correction dim
     float yarn_beta_slow = 1.0f; // YaRN high correction dim
     int32_t yarn_orig_ctx = 0; // YaRN original context length
-    float defrag_thold = 0.1f; // KV cache defragmentation threshold
 
     // offload params
     std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
@@ -376,7 +375,7 @@ struct common_params {
     bool cont_batching = true; // insert new sequences for decoding on-the-fly
     bool flash_attn = false; // flash attention
     bool no_perf = false; // disable performance metrics
-    bool ctx_shift = false; // context shift on
+    bool ctx_shift = false; // context shift on infinite text generation
     bool swa_full = false; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
     bool kv_unified = false; // enable unified KV cache
 
package/src/llama.cpp/ggml/CMakeLists.txt
CHANGED

@@ -158,7 +158,6 @@ option(GGML_CUDA "ggml: use CUDA"
 option(GGML_MUSA "ggml: use MUSA" OFF)
 option(GGML_CUDA_FORCE_MMQ "ggml: use mmq kernels instead of cuBLAS" OFF)
 option(GGML_CUDA_FORCE_CUBLAS "ggml: always use cuBLAS instead of mmq kernels" OFF)
-option(GGML_CUDA_F16 "ggml: use 16 bit floats for some calculations" OFF)
 set (GGML_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING
     "ggml: max. batch size for using peer access")
 option(GGML_CUDA_NO_PEER_COPY "ggml: do not use peer to peer copies" OFF)
package/src/llama.cpp/ggml/include/ggml.h
CHANGED

@@ -244,6 +244,13 @@
 #define GGML_MROPE_SECTIONS 4
 
 #define GGML_UNUSED(x) (void)(x)
+#ifdef __CUDACC__
+template<typename... Args>
+__host__ __device__ constexpr inline void ggml_unused_vars_impl(Args&&...) noexcept {}
+#define GGML_UNUSED_VARS(...) ggml_unused_vars_impl(__VA_ARGS__)
+#else
+#define GGML_UNUSED_VARS(...) do { (void)sizeof((__VA_ARGS__, 0)); } while(0)
+#endif // __CUDACC__
 
 #define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1))
 
@@ -505,6 +512,7 @@ extern "C" {
         GGML_OP_IM2COL,
         GGML_OP_IM2COL_BACK,
         GGML_OP_CONV_2D,
+        GGML_OP_CONV_3D,
         GGML_OP_CONV_2D_DW,
         GGML_OP_CONV_TRANSPOSE_2D,
         GGML_OP_POOL_1D,
@@ -1933,6 +1941,23 @@ extern "C" {
             int d0, // dilation dimension 0
             int d1); // dilation dimension 1
 
+    GGML_API struct ggml_tensor * ggml_conv_3d(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a, // kernel [KW, KH, KD, IC * OC]
+            struct ggml_tensor * b, // input [W, H, D, C * N]
+            int s0, // stride
+            int s1,
+            int s2,
+            int p0, // padding
+            int p1,
+            int p2,
+            int d0, // dilation
+            int d1,
+            int d2,
+            int n_channels,
+            int n_batch,
+            int n_channels_out);
+
     enum ggml_op_pool {
         GGML_OP_POOL_MAX,
         GGML_OP_POOL_AVG,