@fugood/llama.node 1.1.7 → 1.1.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/binding.ts +4 -0
- package/lib/index.js +9 -2
- package/lib/index.ts +57 -30
- package/lib/version.js +2 -2
- package/lib/version.ts +2 -2
- package/package.json +14 -14
- package/src/LlamaContext.cpp +20 -0
- package/src/common.hpp +8 -1
- package/src/llama.cpp/common/arg.cpp +13 -4
- package/src/llama.cpp/common/chat.cpp +33 -2
- package/src/llama.cpp/common/common.cpp +0 -15
- package/src/llama.cpp/common/common.h +6 -4
- package/src/llama.cpp/ggml/CMakeLists.txt +0 -1
- package/src/llama.cpp/ggml/include/ggml.h +25 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +66 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +316 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +0 -3
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +8 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +6 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +142 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +1 -0
- package/src/llama.cpp/include/llama.h +1 -110
- package/src/llama.cpp/src/CMakeLists.txt +2 -2
- package/src/llama.cpp/src/llama-arch.cpp +19 -0
- package/src/llama.cpp/src/llama-arch.h +1 -0
- package/src/llama.cpp/src/llama-chat.cpp +13 -2
- package/src/llama.cpp/src/llama-chat.h +1 -0
- package/src/llama.cpp/src/llama-context.cpp +5 -197
- package/src/llama.cpp/src/llama-context.h +2 -7
- package/src/llama.cpp/src/llama-cparams.h +0 -1
- package/src/llama.cpp/src/llama-graph.cpp +35 -57
- package/src/llama.cpp/src/llama-graph.h +36 -46
- package/src/llama.cpp/src/{llama-kv-cache-unified-iswa.cpp → llama-kv-cache-iswa.cpp} +47 -47
- package/src/llama.cpp/src/{llama-kv-cache-unified-iswa.h → llama-kv-cache-iswa.h} +26 -26
- package/src/llama.cpp/src/{llama-kv-cache-unified.cpp → llama-kv-cache.cpp} +88 -441
- package/src/llama.cpp/src/{llama-kv-cache-unified.h → llama-kv-cache.h} +20 -43
- package/src/llama.cpp/src/llama-kv-cells.h +21 -21
- package/src/llama.cpp/src/llama-memory-hybrid.cpp +5 -5
- package/src/llama.cpp/src/llama-memory-hybrid.h +6 -6
- package/src/llama.cpp/src/llama-memory-recurrent.h +1 -1
- package/src/llama.cpp/src/llama-memory.h +3 -8
- package/src/llama.cpp/src/llama-model.cpp +449 -246
- package/src/llama.cpp/src/llama-model.h +2 -0
package/lib/binding.ts
CHANGED
@@ -59,6 +59,10 @@ export type LlamaModelOptions = {
    * Use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
    */
   swa_full?: boolean
+  /**
+   * Number of layers to keep MoE weights on CPU
+   */
+  n_cpu_moe?: number
   use_mlock?: boolean
   use_mmap?: boolean
   vocab_only?: boolean
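The new `n_cpu_moe` option keeps the MoE expert weights of the first N layers in CPU memory so only the remaining tensors compete for GPU memory. A minimal usage sketch; `loadModel`, `model`, and `n_gpu_layers` are illustrative assumptions, only `n_cpu_moe` itself comes from this diff:

```ts
// Hypothetical loader call; option names other than n_cpu_moe are assumed.
import { loadModel } from '@fugood/llama.node'

const ctx = await loadModel({
  model: 'path/to/model.gguf',
  n_gpu_layers: 99, // offload the rest of the model as usual
  n_cpu_moe: 8,     // keep MoE expert weights of layers 0..7 on the CPU
})
```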
package/lib/index.js
CHANGED
@@ -148,7 +148,12 @@ class LlamaContextWrapper {
             enable_thinking: (_a = params === null || params === void 0 ? void 0 : params.enable_thinking) !== null && _a !== void 0 ? _a : true,
             add_generation_prompt: params === null || params === void 0 ? void 0 : params.add_generation_prompt,
             now: params === null || params === void 0 ? void 0 : params.now,
-            chat_template_kwargs: params === null || params === void 0 ? void 0 : params.chat_template_kwargs
+            chat_template_kwargs: (params === null || params === void 0 ? void 0 : params.chat_template_kwargs)
+                ? Object.entries(params.chat_template_kwargs).reduce((acc, [key, value]) => {
+                    acc[key] = JSON.stringify(value); // Each value is a stringified JSON object
+                    return acc;
+                }, {})
+                : undefined,
         });
         if (!useJinja) {
             return {
@@ -179,7 +184,9 @@
         return this.ctx.embedding(text);
     }
     rerank(query, documents, params) {
-        return this.ctx
+        return this.ctx
+            .rerank(query, documents, params)
+            .then((results) => {
             // Sort by score descending and add document text for convenience
             return results
                 .map((result) => (Object.assign(Object.assign({}, result), { document: documents[result.index] })))
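The compiled wrapper now JSON-stringifies every value in `chat_template_kwargs` before passing it to the native formatter, so structured values survive the crossing. A small sketch of the transformation (it mirrors the reduce added above; the example keys are illustrative):

```ts
// What the native layer receives: each value stringified per key.
const chat_template_kwargs = {
  reasoning_effort: 'high',
  builtin_tools: ['browser', 'python'],
}
const forNative = Object.fromEntries(
  Object.entries(chat_template_kwargs).map(([k, v]) => [k, JSON.stringify(v)]),
)
// forNative = { reasoning_effort: '"high"', builtin_tools: '["browser","python"]' }
```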
package/lib/index.ts
CHANGED
@@ -165,11 +165,11 @@
       response_format?: CompletionResponseFormat
       tools?: Tool[]
       parallel_tool_calls?: boolean
-      tool_choice?: string
-      enable_thinking?: boolean
-      add_generation_prompt?: boolean
-      now?: string | number
-      chat_template_kwargs?: Record<string, string
+      tool_choice?: string
+      enable_thinking?: boolean
+      add_generation_prompt?: boolean
+      now?: string | number
+      chat_template_kwargs?: Record<string, string>
     },
   ): FormattedChatResult {
     const {
@@ -192,7 +192,15 @@
       enable_thinking: params?.enable_thinking ?? true,
       add_generation_prompt: params?.add_generation_prompt,
       now: params?.now,
-      chat_template_kwargs: params?.chat_template_kwargs
+      chat_template_kwargs: params?.chat_template_kwargs
+        ? Object.entries(params.chat_template_kwargs).reduce(
+            (acc, [key, value]) => {
+              acc[key] = JSON.stringify(value) // Each value is a stringified JSON object
+              return acc
+            },
+            {} as Record<string, any>,
+          )
+        : undefined,
     })

     if (!useJinja) {
@@ -218,18 +226,24 @@
   ): Promise<LlamaCompletionResult> {
     const { messages, media_paths = options.media_paths } =
       this._formatMediaChat(options.messages)
-    return this.ctx.completion(
-
-
-
-
+    return this.ctx.completion(
+      {
+        ...options,
+        messages,
+        media_paths: options.media_paths || media_paths,
+      },
+      callback || (() => {}),
+    )
   }

   stopCompletion(): void {
     return this.ctx.stopCompletion()
   }

-  tokenize(
+  tokenize(
+    text: string,
+    { media_paths }: { media_paths?: string[] } = {},
+  ): Promise<TokenizeResult> {
     return this.ctx.tokenize(text, media_paths)
   }

@@ -241,16 +255,27 @@
     return this.ctx.embedding(text)
   }

-  rerank(
-
-
-
-
-
-
-
-
+  rerank(
+    query: string,
+    documents: string[],
+    params?: RerankParams,
+  ): Promise<Array<RerankResult & { document: string }>> {
+    return this.ctx
+      .rerank(query, documents, params)
+      .then((results: RerankResult[]) => {
+        // Sort by score descending and add document text for convenience
+        return results
+          .map((result: RerankResult) => ({
+            ...result,
+            document: documents[result.index],
+          }))
+          .sort(
+            (
+              a: RerankResult & { document: string },
+              b: RerankResult & { document: string },
+            ) => b.score - a.score,
+          )
+      })
   }

   saveSession(path: string): Promise<void> {
@@ -277,10 +302,7 @@
     return this.ctx.getLoadedLoraAdapters()
   }

-  initMultimodal(options: {
-    path: string
-    use_gpu?: boolean
-  }): boolean {
+  initMultimodal(options: { path: string; use_gpu?: boolean }): boolean {
     return this.ctx.initMultimodal(options)
   }

@@ -299,7 +321,7 @@
     return this.ctx.getMultimodalSupport()
   }

-  initVocoder(options: { path: string
+  initVocoder(options: { path: string; n_batch?: number }): boolean {
     return this.ctx.initVocoder(options)
   }

@@ -311,7 +333,10 @@
     return this.ctx.isVocoderEnabled()
   }

-  getFormattedAudioCompletion(
+  getFormattedAudioCompletion(
+    speaker: string | null,
+    text: string,
+  ): {
     prompt: string
     grammar?: string
   } {
@@ -322,7 +347,7 @@
     return this.ctx.getAudioCompletionGuideTokens(text)
   }

-  decodeAudioTokens(tokens: number[]|Int32Array): Promise<Float32Array> {
+  decodeAudioTokens(tokens: number[] | Int32Array): Promise<Float32Array> {
     return this.ctx.decodeAudioTokens(tokens)
   }
 }
@@ -348,7 +373,9 @@ const modelInfoSkip = [
   'tokenizer.ggml.scores',
 ]

-export const loadLlamaModelInfo = async (
+export const loadLlamaModelInfo = async (
+  path: string,
+): Promise<GGUFModelInfo> => {
   const variant = 'default'
   mods[variant] ??= await loadModule(variant)
   refreshNativeLogSetup()
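With the reshaped `rerank()`, callers get results already sorted by descending score and carrying the matching document text. A short usage sketch, assuming `ctx` is an already-loaded `LlamaContextWrapper` backed by a reranker model:

```ts
// `ctx` is assumed to be a loaded LlamaContextWrapper; setup is omitted.
const ranked = await ctx.rerank('What is the capital of France?', [
  'Berlin is the capital of Germany.',
  'Paris is the capital of France.',
])

for (const { index, score, document } of ranked) {
  // Highest score first; `document` echoes the corresponding input string.
  console.log(index, score.toFixed(3), document)
}
```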
package/lib/version.js
CHANGED
@@ -1,5 +1,5 @@
 "use strict";
 Object.defineProperty(exports, "__esModule", { value: true });
 exports.BUILD_COMMIT = exports.BUILD_NUMBER = void 0;
-exports.BUILD_NUMBER = '
-exports.BUILD_COMMIT = '
+exports.BUILD_NUMBER = '6250';
+exports.BUILD_COMMIT = 'e92734d51';
package/lib/version.ts
CHANGED
@@ -1,2 +1,2 @@
-export const BUILD_NUMBER = '
-export const BUILD_COMMIT = '
+export const BUILD_NUMBER = '6250';
+export const BUILD_COMMIT = 'e92734d51';
package/package.json
CHANGED
@@ -1,7 +1,7 @@
 {
   "name": "@fugood/llama.node",
   "access": "public",
-  "version": "1.1.
+  "version": "1.1.9",
   "description": "An another Node binding of llama.cpp",
   "main": "lib/index.js",
   "scripts": {
@@ -71,19 +71,19 @@
     "CMakeLists.txt"
   ],
   "optionalDependencies": {
-    "@fugood/node-llama-linux-x64": "1.1.
-    "@fugood/node-llama-linux-x64-vulkan": "1.1.
-    "@fugood/node-llama-linux-x64-cuda": "1.1.
-    "@fugood/node-llama-linux-arm64": "1.1.
-    "@fugood/node-llama-linux-arm64-vulkan": "1.1.
-    "@fugood/node-llama-linux-arm64-cuda": "1.1.
-    "@fugood/node-llama-win32-x64": "1.1.
-    "@fugood/node-llama-win32-x64-vulkan": "1.1.
-    "@fugood/node-llama-win32-x64-cuda": "1.1.
-    "@fugood/node-llama-win32-arm64": "1.1.
-    "@fugood/node-llama-win32-arm64-vulkan": "1.1.
-    "@fugood/node-llama-darwin-x64": "1.1.
-    "@fugood/node-llama-darwin-arm64": "1.1.
+    "@fugood/node-llama-linux-x64": "1.1.9",
+    "@fugood/node-llama-linux-x64-vulkan": "1.1.9",
+    "@fugood/node-llama-linux-x64-cuda": "1.1.9",
+    "@fugood/node-llama-linux-arm64": "1.1.9",
+    "@fugood/node-llama-linux-arm64-vulkan": "1.1.9",
+    "@fugood/node-llama-linux-arm64-cuda": "1.1.9",
+    "@fugood/node-llama-win32-x64": "1.1.9",
+    "@fugood/node-llama-win32-x64-vulkan": "1.1.9",
+    "@fugood/node-llama-win32-x64-cuda": "1.1.9",
+    "@fugood/node-llama-win32-arm64": "1.1.9",
+    "@fugood/node-llama-win32-arm64-vulkan": "1.1.9",
+    "@fugood/node-llama-darwin-x64": "1.1.9",
+    "@fugood/node-llama-darwin-arm64": "1.1.9"
   },
   "devDependencies": {
     "@babel/preset-env": "^7.24.4",
package/src/LlamaContext.cpp
CHANGED
@@ -15,6 +15,7 @@
 #include "llama-impl.h"
 
 #include <atomic>
+#include <list>
 #include <mutex>
 #include <queue>
@@ -258,6 +259,16 @@ LlamaContext::LlamaContext(const Napi::CallbackInfo &info)
   params.numa =
       static_cast<ggml_numa_strategy>(get_option<uint32_t>(options, "numa", 0));
 
+  int n_cpu_moe = get_option<int32_t>(options, "n_cpu_moe", 0);
+  if (n_cpu_moe > 0) {
+    static std::list<std::string> buft_overrides;
+    for (int i = 0; i < n_cpu_moe; ++i) {
+      buft_overrides.push_back(string_format("blk\\.%d\\.ffn_(up|down|gate)_exps", i));
+      params.tensor_buft_overrides.push_back({buft_overrides.back().c_str(), ggml_backend_cpu_buffer_type()});
+    }
+    params.tensor_buft_overrides.push_back({nullptr, nullptr});
+  }
+
   llama_backend_init();
   llama_numa_init(params.numa);
 
@@ -636,6 +647,15 @@ Napi::Value LlamaContext::GetFormattedChat(const Napi::CallbackInfo &info) {
         _sess, _templates, messages, chat_template, json_schema_str, tools_str,
         parallel_tool_calls, tool_choice, enable_thinking,
         add_generation_prompt, now_str, chat_template_kwargs);
+  } catch (const nlohmann::json_abi_v3_12_0::detail::parse_error& e) {
+    Napi::Error::New(env, e.what()).ThrowAsJavaScriptException();
+    return env.Undefined();
+  } catch (const std::invalid_argument& e) {
+    Napi::Error::New(env, e.what()).ThrowAsJavaScriptException();
+    return env.Undefined();
+  } catch (const std::runtime_error& e) {
+    Napi::Error::New(env, e.what()).ThrowAsJavaScriptException();
+    return env.Undefined();
   } catch (const std::exception &e) {
     Napi::Error::New(env, e.what()).ThrowAsJavaScriptException();
     return env.Undefined();
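For reference, the constructor turns `n_cpu_moe` into one tensor-buffer-type override per layer, each matching that layer's MoE expert tensors. A hedged illustration that only reproduces the format string from the loop above:

```ts
// Reproduce the override patterns built for n_cpu_moe = 2: each pattern pins
// blk.<i>.ffn_{up,down,gate}_exps to the CPU buffer type.
const nCpuMoe = 2
const patterns = Array.from(
  { length: nCpuMoe },
  (_, i) => `blk\\.${i}\\.ffn_(up|down|gate)_exps`,
)
console.log(patterns)
// [ 'blk\\.0\\.ffn_(up|down|gate)_exps', 'blk\\.1\\.ffn_(up|down|gate)_exps' ]
```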
package/src/common.hpp
CHANGED
@@ -461,7 +461,14 @@ processMediaPrompt(llama_context *ctx, mtmd_context *mtmd_ctx,
   }
 
   // Clear all KV cache entries after position n_past
-
+  auto * kv = llama_get_memory(ctx);
+  bool clear_result = llama_memory_seq_rm(kv, 0, n_past, -1);
+  if (!clear_result) {
+    fprintf(stdout, "[DEBUG] llama_memory_seq_rm failed (likely using a non-Transformer model)! Trying full clear...");
+    llama_memory_clear(kv, false);
+    n_past = 0;
+    new_n_past = n_past;
+  }
 
   size_t num_chunks = mtmd_input_chunks_size(chunks);
 
package/src/llama.cpp/common/arg.cpp
CHANGED
@@ -1530,6 +1530,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.ctx_shift = false;
         }
     ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY}).set_env("LLAMA_ARG_NO_CONTEXT_SHIFT"));
+    add_opt(common_arg(
+        {"--context-shift"},
+        string_format("enables context shift on infinite text generation (default: %s)", params.ctx_shift ? "enabled" : "disabled"),
+        [](common_params & params) {
+            params.ctx_shift = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY}).set_env("LLAMA_ARG_CONTEXT_SHIFT"));
     add_opt(common_arg(
         {"--chunks"}, "N",
         string_format("max number of chunks to process (default: %d, -1 = all)", params.n_chunks),
@@ -1748,7 +1755,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params) {
             params.warmup = false;
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL}));
+    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_PERPLEXITY}));
     add_opt(common_arg(
         {"--spm-infill"},
         string_format(
@@ -1823,7 +1830,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, const std::string & value) {
             params.sampling.top_n_sigma = std::stof(value);
         }
-    ).
+    ).set_sparam());
     add_opt(common_arg(
         {"--xtc-probability"}, "N",
         string_format("xtc probability (default: %.1f, 0.0 = disabled)", (double)params.sampling.xtc_probability),
@@ -2247,9 +2254,11 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
     add_opt(common_arg(
         {"-dt", "--defrag-thold"}, "N",
-        string_format("KV cache defragmentation threshold (
+        string_format("KV cache defragmentation threshold (DEPRECATED)"),
         [](common_params & params, const std::string & value) {
-            params
+            GGML_UNUSED(params);
+            GGML_UNUSED(value);
+            LOG_WRN("DEPRECATED: --defrag-thold is deprecated and no longer necessary to specify\n");
         }
     ).set_env("LLAMA_ARG_DEFRAG_THOLD"));
     add_opt(common_arg(
package/src/llama.cpp/common/chat.cpp
CHANGED
@@ -134,6 +134,7 @@ struct templates_params {
     json extra_context;
     bool add_bos;
     bool add_eos;
+    bool is_inference = true;
 };
 
 common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::string & tool_choice) {
@@ -619,7 +620,6 @@ const char * common_reasoning_format_name(common_reasoning_format format) {
         case COMMON_REASONING_FORMAT_AUTO: return "auto";
         case COMMON_REASONING_FORMAT_DEEPSEEK: return "deepseek";
         case COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY: return "deepseek-legacy";
-        case COMMON_REASONING_FORMAT_GRANITE: return "granite";
         default:
             throw std::runtime_error("Unknown reasoning format");
     }
@@ -1324,6 +1324,17 @@ static common_chat_params common_chat_params_init_gpt_oss(const common_chat_temp
     common_chat_params data;
     auto prompt = apply(tmpl, inputs);
 
+    // Check if we need to replace the return token with end token during
+    // inference and without generation prompt. For more details see:
+    // https://github.com/ggml-org/llama.cpp/issues/15417
+    if (inputs.is_inference && !inputs.add_generation_prompt) {
+        static constexpr std::string_view return_token = "<|return|>";
+        static constexpr std::string_view end_token = "<|end|>";
+        if (size_t pos = prompt.rfind(return_token); pos != std::string::npos) {
+            prompt.replace(pos, return_token.length(), end_token);
+        }
+    }
+
     data.prompt = prompt;
     data.format = COMMON_CHAT_FORMAT_GPT_OSS;
 
@@ -1337,6 +1348,26 @@ static common_chat_params common_chat_params_init_gpt_oss(const common_chat_temp
         "<|end|>",
     };
 
+    if (!inputs.json_schema.is_null()) {
+        data.grammar_lazy = false;
+        data.grammar = build_grammar([&](const common_grammar_builder & builder) {
+            auto schema = inputs.json_schema;
+            builder.resolve_refs(schema);
+
+            auto not_end = builder.add_rule("not-end",
+                "[^<] | \"<\" [^|] | \"<|\" [^e] | \"<|e\" [^n] | \"<|en\" [^d] | \"<|end\" [^|] | \"<|end|\" [^>]");
+            auto analysis = builder.add_rule("analysis",
+                "\"<|channel|>analysis<|message|>\" ( " + not_end + " )* \"<|end|>\"");
+            auto constraint = builder.add_rule("constraint", "\"<|constrain|>\"? [a-zA-Z0-9_-]+");
+            auto final = builder.add_rule("final",
+                "\"<|channel|>final\" ( \" \" " + constraint + " )? \"<|message|>\" " +
+                builder.add_schema("response", schema)
+            );
+
+            builder.add_rule("root", "( " + analysis + " \"<|start|>assistant\" )? " + final);
+        });
+    }
+
     if (inputs.tools.is_array() && !inputs.tools.empty()) {
         data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
         data.grammar = build_grammar([&](const common_grammar_builder & builder) {
@@ -2097,7 +2128,7 @@ static common_chat_params common_chat_templates_apply_jinja(
     }
 
     // GPT-OSS
-    if (src.find("<|channel|>") != std::string::npos
+    if (src.find("<|channel|>") != std::string::npos) {
         return common_chat_params_init_gpt_oss(tmpl, params);
     }
 
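On the Node side this means a JSON schema supplied as a response format can be grammar-enforced for GPT-OSS style templates. A hedged sketch; it assumes the wrapper forwards the schema to the chat pipeline as `json_schema`, and the `response_format` shape follows common OpenAI-compatible usage rather than this diff:

```ts
// `ctx` is an assumed LlamaContextWrapper over a GPT-OSS model; the grammar
// built above constrains the final channel to the supplied schema.
const result = await ctx.completion({
  messages: [{ role: 'user', content: 'Name a city and its country as JSON.' }],
  response_format: {
    type: 'json_schema',
    json_schema: {
      schema: {
        type: 'object',
        properties: { city: { type: 'string' }, country: { type: 'string' } },
        required: ['city', 'country'],
      },
    },
  },
})
console.log(result) // the generated text is expected to satisfy the schema
```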
package/src/llama.cpp/common/common.cpp
CHANGED
@@ -558,13 +558,6 @@ std::string string_from(const struct llama_context * ctx, const std::vector<llam
 
         auto detokenized = common_token_to_piece(ctx, token);
 
-        detokenized.erase(
-            std::remove_if(
-                detokenized.begin(),
-                detokenized.end(),
-                [](const unsigned char c) { return !std::isprint(c); }),
-            detokenized.end());
-
         buf << "'" << detokenized << "'"
             << ":" << std::to_string(token);
     }
@@ -589,13 +582,6 @@ std::string string_from(const struct llama_context * ctx, const struct llama_bat
 
         auto detokenized = common_token_to_piece(ctx, batch.token[i]);
 
-        detokenized.erase(
-            std::remove_if(
-                detokenized.begin(),
-                detokenized.end(),
-                [](const unsigned char c) { return !std::isprint(c); }),
-            detokenized.end());
-
         buf << "\n" << std::to_string(i)
             << ", token '" << detokenized << "'"
             << ", pos " << std::to_string(batch.pos[i])
@@ -1167,7 +1153,6 @@ struct llama_context_params common_context_params_to_llama(const common_params &
     cparams.yarn_orig_ctx = params.yarn_orig_ctx;
     cparams.pooling_type = params.pooling_type;
     cparams.attention_type = params.attention_type;
-    cparams.defrag_thold = params.defrag_thold;
     cparams.cb_eval = params.cb_eval;
     cparams.cb_eval_user_data = params.cb_eval_user_data;
     cparams.offload_kqv = !params.no_kv_offload;
package/src/llama.cpp/common/common.h
CHANGED
@@ -239,12 +239,15 @@ struct common_params_diffusion {
     bool add_gumbel_noise = false; // add gumbel noise to the logits if temp > 0.0
 };
 
+// reasoning API response format (not to be confused as chat template's reasoning format)
 enum common_reasoning_format {
     COMMON_REASONING_FORMAT_NONE,
-    COMMON_REASONING_FORMAT_AUTO,
+    COMMON_REASONING_FORMAT_AUTO, // Same as deepseek, using `message.reasoning_content`
     COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY, // Extract thinking tag contents and return as `message.reasoning_content`, or leave inline in <think> tags in stream mode
     COMMON_REASONING_FORMAT_DEEPSEEK, // Extract thinking tag contents and return as `message.reasoning_content`, including in streaming deltas.
-
+    // do not extend this enum unless you absolutely have to
+    // in most cases, use COMMON_REASONING_FORMAT_AUTO
+    // see: https://github.com/ggml-org/llama.cpp/pull/15408
 };
 
 
@@ -286,7 +289,6 @@ struct common_params {
     float yarn_beta_fast = 32.0f; // YaRN low correction dim
     float yarn_beta_slow = 1.0f; // YaRN high correction dim
     int32_t yarn_orig_ctx = 0; // YaRN original context length
-    float defrag_thold = 0.1f; // KV cache defragmentation threshold
 
     // offload params
     std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
@@ -373,7 +375,7 @@ struct common_params {
     bool cont_batching = true; // insert new sequences for decoding on-the-fly
     bool flash_attn = false; // flash attention
     bool no_perf = false; // disable performance metrics
-    bool ctx_shift = 
+    bool ctx_shift = false; // context shift on infinite text generation
     bool swa_full = false; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
     bool kv_unified = false; // enable unified KV cache
 
package/src/llama.cpp/ggml/CMakeLists.txt
CHANGED
@@ -158,7 +158,6 @@ option(GGML_CUDA "ggml: use CUDA"
 option(GGML_MUSA "ggml: use MUSA" OFF)
 option(GGML_CUDA_FORCE_MMQ "ggml: use mmq kernels instead of cuBLAS" OFF)
 option(GGML_CUDA_FORCE_CUBLAS "ggml: always use cuBLAS instead of mmq kernels" OFF)
-option(GGML_CUDA_F16 "ggml: use 16 bit floats for some calculations" OFF)
 set (GGML_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING
     "ggml: max. batch size for using peer access")
 option(GGML_CUDA_NO_PEER_COPY "ggml: do not use peer to peer copies" OFF)
package/src/llama.cpp/ggml/include/ggml.h
CHANGED
@@ -244,6 +244,13 @@
 #define GGML_MROPE_SECTIONS 4
 
 #define GGML_UNUSED(x) (void)(x)
+#ifdef __CUDACC__
+template<typename... Args>
+__host__ __device__ constexpr inline void ggml_unused_vars_impl(Args&&...) noexcept {}
+#define GGML_UNUSED_VARS(...) ggml_unused_vars_impl(__VA_ARGS__)
+#else
+#define GGML_UNUSED_VARS(...) do { (void)sizeof((__VA_ARGS__, 0)); } while(0)
+#endif // __CUDACC__
 
 #define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1))
 
@@ -505,6 +512,7 @@ extern "C" {
     GGML_OP_IM2COL,
     GGML_OP_IM2COL_BACK,
     GGML_OP_CONV_2D,
+    GGML_OP_CONV_3D,
     GGML_OP_CONV_2D_DW,
     GGML_OP_CONV_TRANSPOSE_2D,
     GGML_OP_POOL_1D,
@@ -1933,6 +1941,23 @@
         int d0, // dilation dimension 0
         int d1); // dilation dimension 1
 
+    GGML_API struct ggml_tensor * ggml_conv_3d(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a, // kernel [KW, KH, KD, IC * OC]
+        struct ggml_tensor * b, // input [W, H, D, C * N]
+        int s0, // stride
+        int s1,
+        int s2,
+        int p0, // padding
+        int p1,
+        int p2,
+        int d0, // dilation
+        int d1,
+        int d2,
+        int n_channels,
+        int n_batch,
+        int n_channels_out);
+
     enum ggml_op_pool {
         GGML_OP_POOL_MAX,
         GGML_OP_POOL_AVG,
package/src/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c
CHANGED
@@ -278,6 +278,72 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
 #endif
 }
 
+void ggml_vec_dot_mxfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
+    assert(nrc == 1);
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
+    assert(n % QK_MXFP4 == 0);
+    static_assert(QK_MXFP4 == QK8_0, "QK_MXFP4 and QK8_0 must be the same");
+
+    const block_mxfp4 * GGML_RESTRICT x = vx;
+    const block_q8_0 * GGML_RESTRICT y = vy;
+
+    const int nb = n / QK_MXFP4;
+
+    int ib = 0;
+    float sumf = 0;
+
+#if defined(__POWER9_VECTOR__)
+    const vector signed char lowMask = vec_splats((signed char)0xF);
+    const vector unsigned char vshift4 = vec_splats((unsigned char)4);
+    vector float vsumf0 = vec_splats(0.0f);
+
+    vector signed char kv = vec_xl(0, (const signed char *)kvalues_mxfp4);
+
+#pragma GCC unroll 8
+    for (; ib < nb; ++ib) {
+        __builtin_prefetch(x[ib].qs, 0, 1);
+        __builtin_prefetch(y[ib].qs, 0, 1);
+
+        vector float vyd = vec_splats(GGML_CPU_FP16_TO_FP32(y[ib].d) *
+                                      GGML_E8M0_TO_FP32_HALF(x[ib].e));
+
+        vector signed char q8y0 = vec_xl( 0, y[ib].qs);
+        vector signed char q8y1 = vec_xl(16, y[ib].qs);
+
+        vector signed char qxs = (vector signed char)vec_xl(0, x[ib].qs);
+
+        vector unsigned char lo_nibbles = (vector unsigned char)vec_and(qxs, lowMask);
+        vector unsigned char hi_nibbles = (vector unsigned char)vec_sr(qxs, vshift4);
+
+        vector signed char q4x0 = vec_perm(kv, kv, lo_nibbles);
+        vector signed char q4x1 = vec_perm(kv, kv, hi_nibbles);
+
+        vector signed short qv0 = vec_add(vec_mule(q4x0, q8y0), vec_mulo(q4x0, q8y0));
+        vector signed short qv1 = vec_add(vec_mule(q4x1, q8y1), vec_mulo(q4x1, q8y1));
+
+        vector signed int vsumi0 = vec_splats((int32_t)0);
+        vsumi0 = vec_sum4s(qv0, vsumi0);
+        vsumi0 = vec_sum4s(qv1, vsumi0);
+
+        vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vyd, vsumf0);
+    }
+
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
+    sumf = vec_extract(vsumf0, 0);
+    *s = sumf;
+#else
+    UNUSED(x);
+    UNUSED(y);
+    UNUSED(ib);
+    UNUSED(sumf);
+    ggml_vec_dot_mxfp4_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
+#endif
+}
+
 void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
     const int qk = QK8_0;
     const int nb = n / qk;