@fugood/llama.node 1.1.8 → 1.1.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/binding.ts +4 -0
- package/lib/index.js +9 -2
- package/lib/index.ts +57 -30
- package/lib/version.js +2 -2
- package/lib/version.ts +2 -2
- package/package.json +14 -14
- package/src/LlamaContext.cpp +11 -0
- package/src/llama.cpp/common/arg.cpp +6 -4
- package/src/llama.cpp/common/chat.cpp +33 -1
- package/src/llama.cpp/common/common.cpp +0 -15
- package/src/llama.cpp/common/common.h +1 -2
- package/src/llama.cpp/ggml/CMakeLists.txt +0 -1
- package/src/llama.cpp/ggml/include/ggml.h +25 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +316 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +0 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +8 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +6 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +142 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +1 -0
- package/src/llama.cpp/include/llama.h +1 -110
- package/src/llama.cpp/src/CMakeLists.txt +2 -2
- package/src/llama.cpp/src/llama-arch.cpp +19 -0
- package/src/llama.cpp/src/llama-arch.h +1 -0
- package/src/llama.cpp/src/llama-chat.cpp +13 -2
- package/src/llama.cpp/src/llama-chat.h +1 -0
- package/src/llama.cpp/src/llama-context.cpp +5 -192
- package/src/llama.cpp/src/llama-context.h +2 -7
- package/src/llama.cpp/src/llama-cparams.h +0 -1
- package/src/llama.cpp/src/llama-graph.cpp +35 -57
- package/src/llama.cpp/src/llama-graph.h +36 -46
- package/src/llama.cpp/src/{llama-kv-cache-unified-iswa.cpp → llama-kv-cache-iswa.cpp} +47 -47
- package/src/llama.cpp/src/{llama-kv-cache-unified-iswa.h → llama-kv-cache-iswa.h} +26 -26
- package/src/llama.cpp/src/{llama-kv-cache-unified.cpp → llama-kv-cache.cpp} +88 -441
- package/src/llama.cpp/src/{llama-kv-cache-unified.h → llama-kv-cache.h} +20 -43
- package/src/llama.cpp/src/llama-kv-cells.h +21 -21
- package/src/llama.cpp/src/llama-memory-hybrid.cpp +5 -5
- package/src/llama.cpp/src/llama-memory-hybrid.h +6 -6
- package/src/llama.cpp/src/llama-memory-recurrent.h +1 -1
- package/src/llama.cpp/src/llama-memory.h +3 -8
- package/src/llama.cpp/src/llama-model.cpp +369 -176
- package/src/llama.cpp/src/llama-model.h +1 -0
package/lib/binding.ts
CHANGED
@@ -59,6 +59,10 @@ export type LlamaModelOptions = {
    * Use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
    */
   swa_full?: boolean
+  /**
+   * Number of layers to keep MoE weights on CPU
+   */
+  n_cpu_moe?: number
   use_mlock?: boolean
   use_mmap?: boolean
   vocab_only?: boolean

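The new `n_cpu_moe` model option sits next to the existing memory-mapping and SWA options. A minimal usage sketch, assuming the type is importable from the package root (the import path and the surrounding option values are illustrative, not taken from the diff):

```ts
// Sketch only: keep the MoE expert weights of the first 8 layers on the CPU,
// while the rest of the model follows the normal offload settings.
import type { LlamaModelOptions } from '@fugood/llama.node' // assumed export path

const modelOptions: Partial<LlamaModelOptions> = {
  swa_full: false, // existing option: full-size SWA cache
  n_cpu_moe: 8,    // new in 1.1.9: number of layers whose MoE weights stay on CPU
  use_mmap: true,
  use_mlock: false,
}
```

On the native side (see `LlamaContext.cpp` below), each of those layers gets a `blk.<i>.ffn_(up|down|gate)_exps` buffer-type override pointing at the CPU backend.
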
package/lib/index.js
CHANGED
@@ -148,7 +148,12 @@ class LlamaContextWrapper {
             enable_thinking: (_a = params === null || params === void 0 ? void 0 : params.enable_thinking) !== null && _a !== void 0 ? _a : true,
             add_generation_prompt: params === null || params === void 0 ? void 0 : params.add_generation_prompt,
             now: params === null || params === void 0 ? void 0 : params.now,
-            chat_template_kwargs: params === null || params === void 0 ? void 0 : params.chat_template_kwargs
+            chat_template_kwargs: (params === null || params === void 0 ? void 0 : params.chat_template_kwargs)
+                ? Object.entries(params.chat_template_kwargs).reduce((acc, [key, value]) => {
+                    acc[key] = JSON.stringify(value); // Each value is a stringified JSON object
+                    return acc;
+                }, {})
+                : undefined,
         });
         if (!useJinja) {
             return {
@@ -179,7 +184,9 @@
         return this.ctx.embedding(text);
     }
     rerank(query, documents, params) {
-        return this.ctx
+        return this.ctx
+            .rerank(query, documents, params)
+            .then((results) => {
             // Sort by score descending and add document text for convenience
             return results
                 .map((result) => (Object.assign(Object.assign({}, result), { document: documents[result.index] })))

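Both the compiled `index.js` above and the TypeScript source below now run every `chat_template_kwargs` value through `JSON.stringify` before handing the map to the native chat formatter. A small illustrative sketch of what a caller-supplied object turns into (the keys and values here are made up for the example, not part of the package):

```ts
// Mirrors the reduce() added in the diff above.
const chat_template_kwargs = {
  reasoning_effort: 'high',   // plain string value
  builtin_tools: ['browser'], // non-string value
}

const forNative = Object.entries(chat_template_kwargs).reduce(
  (acc, [key, value]) => {
    acc[key] = JSON.stringify(value) // each value becomes a JSON string
    return acc
  },
  {} as Record<string, string>,
)

console.log(forNative)
// { reasoning_effort: '"high"', builtin_tools: '["browser"]' }
```
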
package/lib/index.ts
CHANGED
@@ -165,11 +165,11 @@ class LlamaContextWrapper {
       response_format?: CompletionResponseFormat
       tools?: Tool[]
       parallel_tool_calls?: boolean
-      tool_choice?: string
-      enable_thinking?: boolean
-      add_generation_prompt?: boolean
-      now?: string | number
-      chat_template_kwargs?: Record<string, string
+      tool_choice?: string
+      enable_thinking?: boolean
+      add_generation_prompt?: boolean
+      now?: string | number
+      chat_template_kwargs?: Record<string, string>
     },
   ): FormattedChatResult {
     const {
@@ -192,7 +192,15 @@
       enable_thinking: params?.enable_thinking ?? true,
       add_generation_prompt: params?.add_generation_prompt,
       now: params?.now,
-      chat_template_kwargs: params?.chat_template_kwargs
+      chat_template_kwargs: params?.chat_template_kwargs
+        ? Object.entries(params.chat_template_kwargs).reduce(
+            (acc, [key, value]) => {
+              acc[key] = JSON.stringify(value) // Each value is a stringified JSON object
+              return acc
+            },
+            {} as Record<string, any>,
+          )
+        : undefined,
     })

     if (!useJinja) {
@@ -218,18 +226,24 @@
   ): Promise<LlamaCompletionResult> {
     const { messages, media_paths = options.media_paths } =
       this._formatMediaChat(options.messages)
-    return this.ctx.completion(
-
-
-
-
+    return this.ctx.completion(
+      {
+        ...options,
+        messages,
+        media_paths: options.media_paths || media_paths,
+      },
+      callback || (() => {}),
+    )
   }

   stopCompletion(): void {
     return this.ctx.stopCompletion()
   }

-  tokenize(
+  tokenize(
+    text: string,
+    { media_paths }: { media_paths?: string[] } = {},
+  ): Promise<TokenizeResult> {
     return this.ctx.tokenize(text, media_paths)
   }

@@ -241,16 +255,27 @@
     return this.ctx.embedding(text)
   }

-  rerank(
-
-
-
-
-
-
-
-
-
+  rerank(
+    query: string,
+    documents: string[],
+    params?: RerankParams,
+  ): Promise<Array<RerankResult & { document: string }>> {
+    return this.ctx
+      .rerank(query, documents, params)
+      .then((results: RerankResult[]) => {
+        // Sort by score descending and add document text for convenience
+        return results
+          .map((result: RerankResult) => ({
+            ...result,
+            document: documents[result.index],
+          }))
+          .sort(
+            (
+              a: RerankResult & { document: string },
+              b: RerankResult & { document: string },
+            ) => b.score - a.score,
+          )
+      })
   }

   saveSession(path: string): Promise<void> {
@@ -277,10 +302,7 @@
     return this.ctx.getLoadedLoraAdapters()
   }

-  initMultimodal(options: {
-    path: string
-    use_gpu?: boolean
-  }): boolean {
+  initMultimodal(options: { path: string; use_gpu?: boolean }): boolean {
     return this.ctx.initMultimodal(options)
   }

@@ -299,7 +321,7 @@
     return this.ctx.getMultimodalSupport()
   }

-  initVocoder(options: { path: string
+  initVocoder(options: { path: string; n_batch?: number }): boolean {
     return this.ctx.initVocoder(options)
   }

@@ -311,7 +333,10 @@
     return this.ctx.isVocoderEnabled()
   }

-  getFormattedAudioCompletion(
+  getFormattedAudioCompletion(
+    speaker: string | null,
+    text: string,
+  ): {
     prompt: string
     grammar?: string
   } {
@@ -322,7 +347,7 @@
     return this.ctx.getAudioCompletionGuideTokens(text)
   }

-  decodeAudioTokens(tokens: number[]|Int32Array): Promise<Float32Array> {
+  decodeAudioTokens(tokens: number[] | Int32Array): Promise<Float32Array> {
     return this.ctx.decodeAudioTokens(tokens)
   }
 }
@@ -348,7 +373,9 @@ const modelInfoSkip = [
   'tokenizer.ggml.scores',
 ]

-export const loadLlamaModelInfo = async (
+export const loadLlamaModelInfo = async (
+  path: string,
+): Promise<GGUFModelInfo> => {
   const variant = 'default'
   mods[variant] ??= await loadModule(variant)
   refreshNativeLogSetup()

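The reworked `rerank` wrapper now resolves to the native results with the matching `document` attached and already sorted by descending score. A usage sketch, assuming `ctx` is an initialized context from this package and that `RerankResult` exposes the `index` and `score` fields used in the diff; the query and documents are placeholders:

```ts
// Illustrative only: consume the reshaped rerank() result.
const ranked = await ctx.rerank('What is the capital of France?', [
  'Berlin is the capital of Germany.',
  'Paris is the capital of France.',
])

for (const r of ranked) {
  // Entries arrive highest-score first, each carrying the original index
  // and the convenience `document` field added by the wrapper.
  console.log(r.score.toFixed(3), r.index, r.document)
}
```
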
package/lib/version.js
CHANGED
@@ -1,5 +1,5 @@
 "use strict";
 Object.defineProperty(exports, "__esModule", { value: true });
 exports.BUILD_COMMIT = exports.BUILD_NUMBER = void 0;
-exports.BUILD_NUMBER = '
-exports.BUILD_COMMIT = '
+exports.BUILD_NUMBER = '6250';
+exports.BUILD_COMMIT = 'e92734d51';

package/lib/version.ts
CHANGED
@@ -1,2 +1,2 @@
-export const BUILD_NUMBER = '
-export const BUILD_COMMIT = '
+export const BUILD_NUMBER = '6250';
+export const BUILD_COMMIT = 'e92734d51';

package/package.json
CHANGED
@@ -1,7 +1,7 @@
 {
   "name": "@fugood/llama.node",
   "access": "public",
-  "version": "1.1.
+  "version": "1.1.9",
   "description": "An another Node binding of llama.cpp",
   "main": "lib/index.js",
   "scripts": {
@@ -71,19 +71,19 @@
     "CMakeLists.txt"
   ],
   "optionalDependencies": {
-    "@fugood/node-llama-linux-x64": "1.1.
-    "@fugood/node-llama-linux-x64-vulkan": "1.1.
-    "@fugood/node-llama-linux-x64-cuda": "1.1.
-    "@fugood/node-llama-linux-arm64": "1.1.
-    "@fugood/node-llama-linux-arm64-vulkan": "1.1.
-    "@fugood/node-llama-linux-arm64-cuda": "1.1.
-    "@fugood/node-llama-win32-x64": "1.1.
-    "@fugood/node-llama-win32-x64-vulkan": "1.1.
-    "@fugood/node-llama-win32-x64-cuda": "1.1.
-    "@fugood/node-llama-win32-arm64": "1.1.
-    "@fugood/node-llama-win32-arm64-vulkan": "1.1.
-    "@fugood/node-llama-darwin-x64": "1.1.
-    "@fugood/node-llama-darwin-arm64": "1.1.
+    "@fugood/node-llama-linux-x64": "1.1.9",
+    "@fugood/node-llama-linux-x64-vulkan": "1.1.9",
+    "@fugood/node-llama-linux-x64-cuda": "1.1.9",
+    "@fugood/node-llama-linux-arm64": "1.1.9",
+    "@fugood/node-llama-linux-arm64-vulkan": "1.1.9",
+    "@fugood/node-llama-linux-arm64-cuda": "1.1.9",
+    "@fugood/node-llama-win32-x64": "1.1.9",
+    "@fugood/node-llama-win32-x64-vulkan": "1.1.9",
+    "@fugood/node-llama-win32-x64-cuda": "1.1.9",
+    "@fugood/node-llama-win32-arm64": "1.1.9",
+    "@fugood/node-llama-win32-arm64-vulkan": "1.1.9",
+    "@fugood/node-llama-darwin-x64": "1.1.9",
+    "@fugood/node-llama-darwin-arm64": "1.1.9"
   },
   "devDependencies": {
     "@babel/preset-env": "^7.24.4",

package/src/LlamaContext.cpp
CHANGED
@@ -15,6 +15,7 @@
 #include "llama-impl.h"

 #include <atomic>
+#include <list>
 #include <mutex>
 #include <queue>

@@ -258,6 +259,16 @@ LlamaContext::LlamaContext(const Napi::CallbackInfo &info)
   params.numa =
       static_cast<ggml_numa_strategy>(get_option<uint32_t>(options, "numa", 0));

+  int n_cpu_moe = get_option<int32_t>(options, "n_cpu_moe", 0);
+  if (n_cpu_moe > 0) {
+    static std::list<std::string> buft_overrides;
+    for (int i = 0; i < n_cpu_moe; ++i) {
+      buft_overrides.push_back(string_format("blk\\.%d\\.ffn_(up|down|gate)_exps", i));
+      params.tensor_buft_overrides.push_back({buft_overrides.back().c_str(), ggml_backend_cpu_buffer_type()});
+    }
+    params.tensor_buft_overrides.push_back({nullptr, nullptr});
+  }
+
   llama_backend_init();
   llama_numa_init(params.numa);

package/src/llama.cpp/common/arg.cpp
CHANGED
@@ -1532,7 +1532,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY}).set_env("LLAMA_ARG_NO_CONTEXT_SHIFT"));
     add_opt(common_arg(
         {"--context-shift"},
-        string_format("enables context shift on infinite text generation (default: %s)", params.ctx_shift ? "
+        string_format("enables context shift on infinite text generation (default: %s)", params.ctx_shift ? "enabled" : "disabled"),
         [](common_params & params) {
             params.ctx_shift = true;
         }
@@ -1755,7 +1755,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params) {
             params.warmup = false;
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL}));
+    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_PERPLEXITY}));
     add_opt(common_arg(
         {"--spm-infill"},
         string_format(
@@ -2254,9 +2254,11 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
     add_opt(common_arg(
         {"-dt", "--defrag-thold"}, "N",
-        string_format("KV cache defragmentation threshold (
+        string_format("KV cache defragmentation threshold (DEPRECATED)"),
         [](common_params & params, const std::string & value) {
-            params
+            GGML_UNUSED(params);
+            GGML_UNUSED(value);
+            LOG_WRN("DEPRECATED: --defrag-thold is deprecated and no longer necessary to specify\n");
         }
     ).set_env("LLAMA_ARG_DEFRAG_THOLD"));
     add_opt(common_arg(

package/src/llama.cpp/common/chat.cpp
CHANGED
@@ -134,6 +134,7 @@ struct templates_params {
     json extra_context;
     bool add_bos;
     bool add_eos;
+    bool is_inference = true;
 };

 common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::string & tool_choice) {
@@ -1323,6 +1324,17 @@ static common_chat_params common_chat_params_init_gpt_oss(const common_chat_temp
     common_chat_params data;
     auto prompt = apply(tmpl, inputs);

+    // Check if we need to replace the return token with end token during
+    // inference and without generation prompt. For more details see:
+    // https://github.com/ggml-org/llama.cpp/issues/15417
+    if (inputs.is_inference && !inputs.add_generation_prompt) {
+        static constexpr std::string_view return_token = "<|return|>";
+        static constexpr std::string_view end_token = "<|end|>";
+        if (size_t pos = prompt.rfind(return_token); pos != std::string::npos) {
+            prompt.replace(pos, return_token.length(), end_token);
+        }
+    }
+
     data.prompt = prompt;
     data.format = COMMON_CHAT_FORMAT_GPT_OSS;

@@ -1336,6 +1348,26 @@ static common_chat_params common_chat_params_init_gpt_oss(const common_chat_temp
         "<|end|>",
     };

+    if (!inputs.json_schema.is_null()) {
+        data.grammar_lazy = false;
+        data.grammar = build_grammar([&](const common_grammar_builder & builder) {
+            auto schema = inputs.json_schema;
+            builder.resolve_refs(schema);
+
+            auto not_end = builder.add_rule("not-end",
+                "[^<] | \"<\" [^|] | \"<|\" [^e] | \"<|e\" [^n] | \"<|en\" [^d] | \"<|end\" [^|] | \"<|end|\" [^>]");
+            auto analysis = builder.add_rule("analysis",
+                "\"<|channel|>analysis<|message|>\" ( " + not_end + " )* \"<|end|>\"");
+            auto constraint = builder.add_rule("constraint", "\"<|constrain|>\"? [a-zA-Z0-9_-]+");
+            auto final = builder.add_rule("final",
+                "\"<|channel|>final\" ( \" \" " + constraint + " )? \"<|message|>\" " +
+                builder.add_schema("response", schema)
+            );
+
+            builder.add_rule("root", "( " + analysis + " \"<|start|>assistant\" )? " + final);
+        });
+    }
+
     if (inputs.tools.is_array() && !inputs.tools.empty()) {
         data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
         data.grammar = build_grammar([&](const common_grammar_builder & builder) {
@@ -2096,7 +2128,7 @@ static common_chat_params common_chat_templates_apply_jinja(
     }

     // GPT-OSS
-    if (src.find("<|channel|>") != std::string::npos
+    if (src.find("<|channel|>") != std::string::npos) {
         return common_chat_params_init_gpt_oss(tmpl, params);
     }

package/src/llama.cpp/common/common.cpp
CHANGED
@@ -558,13 +558,6 @@ std::string string_from(const struct llama_context * ctx, const std::vector<llam

         auto detokenized = common_token_to_piece(ctx, token);

-        detokenized.erase(
-            std::remove_if(
-                detokenized.begin(),
-                detokenized.end(),
-                [](const unsigned char c) { return !std::isprint(c); }),
-            detokenized.end());
-
         buf << "'" << detokenized << "'"
             << ":" << std::to_string(token);
     }
@@ -589,13 +582,6 @@ std::string string_from(const struct llama_context * ctx, const struct llama_bat

         auto detokenized = common_token_to_piece(ctx, batch.token[i]);

-        detokenized.erase(
-            std::remove_if(
-                detokenized.begin(),
-                detokenized.end(),
-                [](const unsigned char c) { return !std::isprint(c); }),
-            detokenized.end());
-
         buf << "\n" << std::to_string(i)
             << ", token '" << detokenized << "'"
             << ", pos " << std::to_string(batch.pos[i])
@@ -1167,7 +1153,6 @@ struct llama_context_params common_context_params_to_llama(const common_params &
     cparams.yarn_orig_ctx = params.yarn_orig_ctx;
     cparams.pooling_type = params.pooling_type;
     cparams.attention_type = params.attention_type;
-    cparams.defrag_thold = params.defrag_thold;
     cparams.cb_eval = params.cb_eval;
     cparams.cb_eval_user_data = params.cb_eval_user_data;
     cparams.offload_kqv = !params.no_kv_offload;

package/src/llama.cpp/common/common.h
CHANGED
@@ -289,7 +289,6 @@ struct common_params {
     float yarn_beta_fast = 32.0f; // YaRN low correction dim
     float yarn_beta_slow = 1.0f; // YaRN high correction dim
     int32_t yarn_orig_ctx = 0; // YaRN original context length
-    float defrag_thold = 0.1f; // KV cache defragmentation threshold

     // offload params
     std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
@@ -376,7 +375,7 @@ struct common_params {
     bool cont_batching = true; // insert new sequences for decoding on-the-fly
     bool flash_attn = false; // flash attention
     bool no_perf = false; // disable performance metrics
-    bool ctx_shift = false; // context shift on
+    bool ctx_shift = false; // context shift on infinite text generation
     bool swa_full = false; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
     bool kv_unified = false; // enable unified KV cache

package/src/llama.cpp/ggml/CMakeLists.txt
CHANGED
@@ -158,7 +158,6 @@ option(GGML_CUDA "ggml: use CUDA"
 option(GGML_MUSA "ggml: use MUSA" OFF)
 option(GGML_CUDA_FORCE_MMQ "ggml: use mmq kernels instead of cuBLAS" OFF)
 option(GGML_CUDA_FORCE_CUBLAS "ggml: always use cuBLAS instead of mmq kernels" OFF)
-option(GGML_CUDA_F16 "ggml: use 16 bit floats for some calculations" OFF)
 set (GGML_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING
      "ggml: max. batch size for using peer access")
 option(GGML_CUDA_NO_PEER_COPY "ggml: do not use peer to peer copies" OFF)

package/src/llama.cpp/ggml/include/ggml.h
CHANGED
@@ -244,6 +244,13 @@
 #define GGML_MROPE_SECTIONS 4

 #define GGML_UNUSED(x) (void)(x)
+#ifdef __CUDACC__
+template<typename... Args>
+__host__ __device__ constexpr inline void ggml_unused_vars_impl(Args&&...) noexcept {}
+#define GGML_UNUSED_VARS(...) ggml_unused_vars_impl(__VA_ARGS__)
+#else
+#define GGML_UNUSED_VARS(...) do { (void)sizeof((__VA_ARGS__, 0)); } while(0)
+#endif // __CUDACC__

 #define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1))

@@ -505,6 +512,7 @@ extern "C" {
         GGML_OP_IM2COL,
         GGML_OP_IM2COL_BACK,
         GGML_OP_CONV_2D,
+        GGML_OP_CONV_3D,
         GGML_OP_CONV_2D_DW,
         GGML_OP_CONV_TRANSPOSE_2D,
         GGML_OP_POOL_1D,
@@ -1933,6 +1941,23 @@ extern "C" {
             int d0, // dilation dimension 0
             int d1); // dilation dimension 1

+    GGML_API struct ggml_tensor * ggml_conv_3d(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a, // kernel [KW, KH, KD, IC * OC]
+            struct ggml_tensor * b, // input [W, H, D, C * N]
+            int s0, // stride
+            int s1,
+            int s2,
+            int p0, // padding
+            int p1,
+            int p2,
+            int d0, // dilation
+            int d1,
+            int d2,
+            int n_channels,
+            int n_batch,
+            int n_channels_out);
+
     enum ggml_op_pool {
         GGML_OP_POOL_MAX,
         GGML_OP_POOL_AVG,