@fugood/llama.node 1.1.7 → 1.1.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. package/lib/binding.ts +4 -0
  2. package/lib/index.js +9 -2
  3. package/lib/index.ts +57 -30
  4. package/lib/version.js +2 -2
  5. package/lib/version.ts +2 -2
  6. package/package.json +14 -14
  7. package/src/LlamaContext.cpp +20 -0
  8. package/src/common.hpp +8 -1
  9. package/src/llama.cpp/common/arg.cpp +13 -4
  10. package/src/llama.cpp/common/chat.cpp +33 -2
  11. package/src/llama.cpp/common/common.cpp +0 -15
  12. package/src/llama.cpp/common/common.h +6 -4
  13. package/src/llama.cpp/ggml/CMakeLists.txt +0 -1
  14. package/src/llama.cpp/ggml/include/ggml.h +25 -0
  15. package/src/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +66 -0
  16. package/src/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +316 -0
  17. package/src/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +0 -3
  18. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +8 -0
  19. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +6 -0
  20. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +142 -0
  21. package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +1 -0
  22. package/src/llama.cpp/include/llama.h +1 -110
  23. package/src/llama.cpp/src/CMakeLists.txt +2 -2
  24. package/src/llama.cpp/src/llama-arch.cpp +19 -0
  25. package/src/llama.cpp/src/llama-arch.h +1 -0
  26. package/src/llama.cpp/src/llama-chat.cpp +13 -2
  27. package/src/llama.cpp/src/llama-chat.h +1 -0
  28. package/src/llama.cpp/src/llama-context.cpp +5 -197
  29. package/src/llama.cpp/src/llama-context.h +2 -7
  30. package/src/llama.cpp/src/llama-cparams.h +0 -1
  31. package/src/llama.cpp/src/llama-graph.cpp +35 -57
  32. package/src/llama.cpp/src/llama-graph.h +36 -46
  33. package/src/llama.cpp/src/{llama-kv-cache-unified-iswa.cpp → llama-kv-cache-iswa.cpp} +47 -47
  34. package/src/llama.cpp/src/{llama-kv-cache-unified-iswa.h → llama-kv-cache-iswa.h} +26 -26
  35. package/src/llama.cpp/src/{llama-kv-cache-unified.cpp → llama-kv-cache.cpp} +88 -441
  36. package/src/llama.cpp/src/{llama-kv-cache-unified.h → llama-kv-cache.h} +20 -43
  37. package/src/llama.cpp/src/llama-kv-cells.h +21 -21
  38. package/src/llama.cpp/src/llama-memory-hybrid.cpp +5 -5
  39. package/src/llama.cpp/src/llama-memory-hybrid.h +6 -6
  40. package/src/llama.cpp/src/llama-memory-recurrent.h +1 -1
  41. package/src/llama.cpp/src/llama-memory.h +3 -8
  42. package/src/llama.cpp/src/llama-model.cpp +449 -246
  43. package/src/llama.cpp/src/llama-model.h +2 -0
package/lib/binding.ts CHANGED
@@ -59,6 +59,10 @@ export type LlamaModelOptions = {
     * Use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
     */
    swa_full?: boolean
+   /**
+    * Number of layers to keep MoE weights on CPU
+    */
+   n_cpu_moe?: number
    use_mlock?: boolean
    use_mmap?: boolean
    vocab_only?: boolean
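
Note: the new n_cpu_moe option keeps the MoE expert tensors (ffn_up/down/gate_exps) of the first N layers on the CPU buffer type; the expansion into per-layer tensor overrides happens in the LlamaContext.cpp hunk further below. A minimal usage sketch in TypeScript follows; the loadModel entry point and the model field are assumptions about the package's public API and are not part of this diff.

    import { loadModel } from '@fugood/llama.node' // assumed entry point, not shown in this diff

    async function main() {
      // Keep the expert weights of the first 8 layers on CPU while the rest of the
      // model is placed as usual (the same idea as llama.cpp's --n-cpu-moe flag).
      const ctx = await loadModel({
        model: './model.gguf', // hypothetical model path
        n_cpu_moe: 8,
      })
      console.log('model loaded:', ctx != null)
    }

    main()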
package/lib/index.js CHANGED
@@ -148,7 +148,12 @@ class LlamaContextWrapper {
        enable_thinking: (_a = params === null || params === void 0 ? void 0 : params.enable_thinking) !== null && _a !== void 0 ? _a : true,
        add_generation_prompt: params === null || params === void 0 ? void 0 : params.add_generation_prompt,
        now: params === null || params === void 0 ? void 0 : params.now,
-       chat_template_kwargs: params === null || params === void 0 ? void 0 : params.chat_template_kwargs,
+       chat_template_kwargs: (params === null || params === void 0 ? void 0 : params.chat_template_kwargs)
+           ? Object.entries(params.chat_template_kwargs).reduce((acc, [key, value]) => {
+               acc[key] = JSON.stringify(value); // Each value is a stringified JSON object
+               return acc;
+           }, {})
+           : undefined,
    });
    if (!useJinja) {
        return {
@@ -179,7 +184,9 @@
    return this.ctx.embedding(text);
  }
  rerank(query, documents, params) {
-   return this.ctx.rerank(query, documents, params).then((results) => {
+   return this.ctx
+     .rerank(query, documents, params)
+     .then((results) => {
      // Sort by score descending and add document text for convenience
      return results
        .map((result) => (Object.assign(Object.assign({}, result), { document: documents[result.index] })))
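
Note: with this change the wrapper JSON-stringifies every value of chat_template_kwargs before handing the map to the native layer, so callers can pass plain JavaScript values instead of pre-encoded strings. A small illustration of the transformation (the input values are made up for the example):

    // Caller-supplied kwargs with plain JS values:
    const chat_template_kwargs = { enable_thinking: false, custom_tools: ['search'] }

    // What the wrapper now forwards to the native binding:
    const native_kwargs = Object.entries(chat_template_kwargs).reduce(
      (acc, [key, value]) => {
        acc[key] = JSON.stringify(value) // every value becomes a JSON string
        return acc
      },
      {} as Record<string, string>,
    )
    // native_kwargs => { enable_thinking: 'false', custom_tools: '["search"]' }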
package/lib/index.ts CHANGED
@@ -165,11 +165,11 @@ class LlamaContextWrapper {
      response_format?: CompletionResponseFormat
      tools?: Tool[]
      parallel_tool_calls?: boolean
-     tool_choice?: string,
-     enable_thinking?: boolean,
-     add_generation_prompt?: boolean,
-     now?: string | number,
-     chat_template_kwargs?: Record<string, string>,
+     tool_choice?: string
+     enable_thinking?: boolean
+     add_generation_prompt?: boolean
+     now?: string | number
+     chat_template_kwargs?: Record<string, string>
    },
  ): FormattedChatResult {
    const {
@@ -192,7 +192,15 @@ class LlamaContextWrapper {
      enable_thinking: params?.enable_thinking ?? true,
      add_generation_prompt: params?.add_generation_prompt,
      now: params?.now,
-     chat_template_kwargs: params?.chat_template_kwargs,
+     chat_template_kwargs: params?.chat_template_kwargs
+       ? Object.entries(params.chat_template_kwargs).reduce(
+           (acc, [key, value]) => {
+             acc[key] = JSON.stringify(value) // Each value is a stringified JSON object
+             return acc
+           },
+           {} as Record<string, any>,
+         )
+       : undefined,
    })

    if (!useJinja) {
@@ -218,18 +226,24 @@
  ): Promise<LlamaCompletionResult> {
    const { messages, media_paths = options.media_paths } =
      this._formatMediaChat(options.messages)
-   return this.ctx.completion({
-     ...options,
-     messages,
-     media_paths: options.media_paths || media_paths,
-   }, callback || (() => {}))
+   return this.ctx.completion(
+     {
+       ...options,
+       messages,
+       media_paths: options.media_paths || media_paths,
+     },
+     callback || (() => {}),
+   )
  }

  stopCompletion(): void {
    return this.ctx.stopCompletion()
  }

- tokenize(text: string, { media_paths }: { media_paths?: string[] } = {}): Promise<TokenizeResult> {
+ tokenize(
+   text: string,
+   { media_paths }: { media_paths?: string[] } = {},
+ ): Promise<TokenizeResult> {
    return this.ctx.tokenize(text, media_paths)
  }

@@ -241,16 +255,27 @@
    return this.ctx.embedding(text)
  }

- rerank(query: string, documents: string[], params?: RerankParams): Promise<Array<RerankResult & { document: string }>> {
-   return this.ctx.rerank(query, documents, params).then((results: RerankResult[]) => {
-     // Sort by score descending and add document text for convenience
-     return results
-       .map((result: RerankResult) => ({
-         ...result,
-         document: documents[result.index],
-       }))
-       .sort((a: RerankResult & { document: string }, b: RerankResult & { document: string }) => b.score - a.score)
-   })
+ rerank(
+   query: string,
+   documents: string[],
+   params?: RerankParams,
+ ): Promise<Array<RerankResult & { document: string }>> {
+   return this.ctx
+     .rerank(query, documents, params)
+     .then((results: RerankResult[]) => {
+       // Sort by score descending and add document text for convenience
+       return results
+         .map((result: RerankResult) => ({
+           ...result,
+           document: documents[result.index],
+         }))
+         .sort(
+           (
+             a: RerankResult & { document: string },
+             b: RerankResult & { document: string },
+           ) => b.score - a.score,
+         )
+     })
  }

  saveSession(path: string): Promise<void> {
@@ -277,10 +302,7 @@
    return this.ctx.getLoadedLoraAdapters()
  }

- initMultimodal(options: {
-   path: string
-   use_gpu?: boolean
- }): boolean {
+ initMultimodal(options: { path: string; use_gpu?: boolean }): boolean {
    return this.ctx.initMultimodal(options)
  }

@@ -299,7 +321,7 @@
    return this.ctx.getMultimodalSupport()
  }

- initVocoder(options: { path: string, n_batch?: number }): boolean {
+ initVocoder(options: { path: string; n_batch?: number }): boolean {
    return this.ctx.initVocoder(options)
  }

@@ -311,7 +333,10 @@
    return this.ctx.isVocoderEnabled()
  }

- getFormattedAudioCompletion(speaker: string|null, text: string): {
+ getFormattedAudioCompletion(
+   speaker: string | null,
+   text: string,
+ ): {
    prompt: string
    grammar?: string
  } {
@@ -322,7 +347,7 @@
    return this.ctx.getAudioCompletionGuideTokens(text)
  }

- decodeAudioTokens(tokens: number[]|Int32Array): Promise<Float32Array> {
+ decodeAudioTokens(tokens: number[] | Int32Array): Promise<Float32Array> {
    return this.ctx.decodeAudioTokens(tokens)
  }
}
@@ -348,7 +373,9 @@ const modelInfoSkip = [
  'tokenizer.ggml.scores',
]

- export const loadLlamaModelInfo = async (path: string): Promise<GGUFModelInfo> => {
+ export const loadLlamaModelInfo = async (
+   path: string,
+ ): Promise<GGUFModelInfo> => {
    const variant = 'default'
    mods[variant] ??= await loadModule(variant)
    refreshNativeLogSetup()
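
Note: rerank() still resolves to the results sorted by score in descending order, with the source text attached under document; only the call formatting changed in this release. A short usage sketch (the context setup is omitted and the query/documents are illustrative):

    // ctx is an already-initialized LlamaContextWrapper backed by a reranker model.
    const results = await ctx.rerank('What is PPO in reinforcement learning?', [
      'Proximal Policy Optimization is a policy-gradient method.',
      'A recipe for sourdough bread.',
    ])

    for (const r of results) {
      // Highest score first; each entry keeps its original index and carries the document text.
      console.log(r.score.toFixed(3), r.index, r.document)
    }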
package/lib/version.js CHANGED
@@ -1,5 +1,5 @@
  "use strict";
  Object.defineProperty(exports, "__esModule", { value: true });
  exports.BUILD_COMMIT = exports.BUILD_NUMBER = void 0;
- exports.BUILD_NUMBER = '6096';
- exports.BUILD_COMMIT = 'fd1234cb';
+ exports.BUILD_NUMBER = '6250';
+ exports.BUILD_COMMIT = 'e92734d51';
package/lib/version.ts CHANGED
@@ -1,2 +1,2 @@
- export const BUILD_NUMBER = '6096';
- export const BUILD_COMMIT = 'fd1234cb';
+ export const BUILD_NUMBER = '6250';
+ export const BUILD_COMMIT = 'e92734d51';
package/package.json CHANGED
@@ -1,7 +1,7 @@
  {
    "name": "@fugood/llama.node",
    "access": "public",
-   "version": "1.1.7",
+   "version": "1.1.9",
    "description": "An another Node binding of llama.cpp",
    "main": "lib/index.js",
    "scripts": {
@@ -71,19 +71,19 @@
      "CMakeLists.txt"
    ],
    "optionalDependencies": {
-     "@fugood/node-llama-linux-x64": "1.1.7",
-     "@fugood/node-llama-linux-x64-vulkan": "1.1.7",
-     "@fugood/node-llama-linux-x64-cuda": "1.1.7",
-     "@fugood/node-llama-linux-arm64": "1.1.7",
-     "@fugood/node-llama-linux-arm64-vulkan": "1.1.7",
-     "@fugood/node-llama-linux-arm64-cuda": "1.1.7",
-     "@fugood/node-llama-win32-x64": "1.1.7",
-     "@fugood/node-llama-win32-x64-vulkan": "1.1.7",
-     "@fugood/node-llama-win32-x64-cuda": "1.1.7",
-     "@fugood/node-llama-win32-arm64": "1.1.7",
-     "@fugood/node-llama-win32-arm64-vulkan": "1.1.7",
-     "@fugood/node-llama-darwin-x64": "1.1.7",
-     "@fugood/node-llama-darwin-arm64": "1.1.7"
+     "@fugood/node-llama-linux-x64": "1.1.9",
+     "@fugood/node-llama-linux-x64-vulkan": "1.1.9",
+     "@fugood/node-llama-linux-x64-cuda": "1.1.9",
+     "@fugood/node-llama-linux-arm64": "1.1.9",
+     "@fugood/node-llama-linux-arm64-vulkan": "1.1.9",
+     "@fugood/node-llama-linux-arm64-cuda": "1.1.9",
+     "@fugood/node-llama-win32-x64": "1.1.9",
+     "@fugood/node-llama-win32-x64-vulkan": "1.1.9",
+     "@fugood/node-llama-win32-x64-cuda": "1.1.9",
+     "@fugood/node-llama-win32-arm64": "1.1.9",
+     "@fugood/node-llama-win32-arm64-vulkan": "1.1.9",
+     "@fugood/node-llama-darwin-x64": "1.1.9",
+     "@fugood/node-llama-darwin-arm64": "1.1.9"
    },
    "devDependencies": {
      "@babel/preset-env": "^7.24.4",
package/src/LlamaContext.cpp CHANGED
@@ -15,6 +15,7 @@
  #include "llama-impl.h"

  #include <atomic>
+ #include <list>
  #include <mutex>
  #include <queue>

@@ -258,6 +259,16 @@ LlamaContext::LlamaContext(const Napi::CallbackInfo &info)
    params.numa =
        static_cast<ggml_numa_strategy>(get_option<uint32_t>(options, "numa", 0));

+   int n_cpu_moe = get_option<int32_t>(options, "n_cpu_moe", 0);
+   if (n_cpu_moe > 0) {
+     static std::list<std::string> buft_overrides;
+     for (int i = 0; i < n_cpu_moe; ++i) {
+       buft_overrides.push_back(string_format("blk\\.%d\\.ffn_(up|down|gate)_exps", i));
+       params.tensor_buft_overrides.push_back({buft_overrides.back().c_str(), ggml_backend_cpu_buffer_type()});
+     }
+     params.tensor_buft_overrides.push_back({nullptr, nullptr});
+   }
+
    llama_backend_init();
    llama_numa_init(params.numa);

@@ -636,6 +647,15 @@ Napi::Value LlamaContext::GetFormattedChat(const Napi::CallbackInfo &info) {
        _sess, _templates, messages, chat_template, json_schema_str, tools_str,
        parallel_tool_calls, tool_choice, enable_thinking,
        add_generation_prompt, now_str, chat_template_kwargs);
+ } catch (const nlohmann::json_abi_v3_12_0::detail::parse_error& e) {
+   Napi::Error::New(env, e.what()).ThrowAsJavaScriptException();
+   return env.Undefined();
+ } catch (const std::invalid_argument& e) {
+   Napi::Error::New(env, e.what()).ThrowAsJavaScriptException();
+   return env.Undefined();
+ } catch (const std::runtime_error& e) {
+   Napi::Error::New(env, e.what()).ThrowAsJavaScriptException();
+   return env.Undefined();
  } catch (const std::exception &e) {
    Napi::Error::New(env, e.what()).ThrowAsJavaScriptException();
    return env.Undefined();
package/src/common.hpp CHANGED
@@ -461,7 +461,14 @@ processMediaPrompt(llama_context *ctx, mtmd_context *mtmd_ctx,
  }

  // Clear all KV cache entries after position n_past
- llama_memory_seq_rm(llama_get_memory(ctx), 0, n_past, -1);
+ auto * kv = llama_get_memory(ctx);
+ bool clear_result = llama_memory_seq_rm(kv, 0, n_past, -1);
+ if (!clear_result) {
+   fprintf(stdout, "[DEBUG] llama_memory_seq_rm failed (likely using a non-Transformer model)! Trying full clear...");
+   llama_memory_clear(kv, false);
+   n_past = 0;
+   new_n_past = n_past;
+ }

  size_t num_chunks = mtmd_input_chunks_size(chunks);
package/src/llama.cpp/common/arg.cpp CHANGED
@@ -1530,6 +1530,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            params.ctx_shift = false;
        }
    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY}).set_env("LLAMA_ARG_NO_CONTEXT_SHIFT"));
+   add_opt(common_arg(
+       {"--context-shift"},
+       string_format("enables context shift on infinite text generation (default: %s)", params.ctx_shift ? "enabled" : "disabled"),
+       [](common_params & params) {
+           params.ctx_shift = true;
+       }
+   ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY}).set_env("LLAMA_ARG_CONTEXT_SHIFT"));
    add_opt(common_arg(
        {"--chunks"}, "N",
        string_format("max number of chunks to process (default: %d, -1 = all)", params.n_chunks),
@@ -1748,7 +1755,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        [](common_params & params) {
            params.warmup = false;
        }
-   ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL}));
+   ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_PERPLEXITY}));
    add_opt(common_arg(
        {"--spm-infill"},
        string_format(
@@ -1823,7 +1830,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        [](common_params & params, const std::string & value) {
            params.sampling.top_n_sigma = std::stof(value);
        }
-   ).set_examples({LLAMA_EXAMPLE_MAIN}).set_sparam());
+   ).set_sparam());
    add_opt(common_arg(
        {"--xtc-probability"}, "N",
        string_format("xtc probability (default: %.1f, 0.0 = disabled)", (double)params.sampling.xtc_probability),
@@ -2247,9 +2254,11 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
    ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
    add_opt(common_arg(
        {"-dt", "--defrag-thold"}, "N",
-       string_format("KV cache defragmentation threshold (default: %.1f, < 0 - disabled)", (double)params.defrag_thold),
+       string_format("KV cache defragmentation threshold (DEPRECATED)"),
        [](common_params & params, const std::string & value) {
-           params.defrag_thold = std::stof(value);
+           GGML_UNUSED(params);
+           GGML_UNUSED(value);
+           LOG_WRN("DEPRECATED: --defrag-thold is deprecated and no longer necessary to specify\n");
        }
    ).set_env("LLAMA_ARG_DEFRAG_THOLD"));
    add_opt(common_arg(
package/src/llama.cpp/common/chat.cpp CHANGED
@@ -134,6 +134,7 @@ struct templates_params {
    json extra_context;
    bool add_bos;
    bool add_eos;
+   bool is_inference = true;
  };

  common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::string & tool_choice) {
@@ -619,7 +620,6 @@ const char * common_reasoning_format_name(common_reasoning_format format) {
        case COMMON_REASONING_FORMAT_AUTO: return "auto";
        case COMMON_REASONING_FORMAT_DEEPSEEK: return "deepseek";
        case COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY: return "deepseek-legacy";
-       case COMMON_REASONING_FORMAT_GRANITE: return "granite";
        default:
            throw std::runtime_error("Unknown reasoning format");
    }
@@ -1324,6 +1324,17 @@ static common_chat_params common_chat_params_init_gpt_oss(const common_chat_temp
    common_chat_params data;
    auto prompt = apply(tmpl, inputs);

+   // Check if we need to replace the return token with end token during
+   // inference and without generation prompt. For more details see:
+   // https://github.com/ggml-org/llama.cpp/issues/15417
+   if (inputs.is_inference && !inputs.add_generation_prompt) {
+       static constexpr std::string_view return_token = "<|return|>";
+       static constexpr std::string_view end_token = "<|end|>";
+       if (size_t pos = prompt.rfind(return_token); pos != std::string::npos) {
+           prompt.replace(pos, return_token.length(), end_token);
+       }
+   }
+
    data.prompt = prompt;
    data.format = COMMON_CHAT_FORMAT_GPT_OSS;

@@ -1337,6 +1348,26 @@ static common_chat_params common_chat_params_init_gpt_oss(const common_chat_temp
        "<|end|>",
    };

+   if (!inputs.json_schema.is_null()) {
+       data.grammar_lazy = false;
+       data.grammar = build_grammar([&](const common_grammar_builder & builder) {
+           auto schema = inputs.json_schema;
+           builder.resolve_refs(schema);
+
+           auto not_end = builder.add_rule("not-end",
+               "[^<] | \"<\" [^|] | \"<|\" [^e] | \"<|e\" [^n] | \"<|en\" [^d] | \"<|end\" [^|] | \"<|end|\" [^>]");
+           auto analysis = builder.add_rule("analysis",
+               "\"<|channel|>analysis<|message|>\" ( " + not_end + " )* \"<|end|>\"");
+           auto constraint = builder.add_rule("constraint", "\"<|constrain|>\"? [a-zA-Z0-9_-]+");
+           auto final = builder.add_rule("final",
+               "\"<|channel|>final\" ( \" \" " + constraint + " )? \"<|message|>\" " +
+               builder.add_schema("response", schema)
+           );
+
+           builder.add_rule("root", "( " + analysis + " \"<|start|>assistant\" )? " + final);
+       });
+   }
+
    if (inputs.tools.is_array() && !inputs.tools.empty()) {
        data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
        data.grammar = build_grammar([&](const common_grammar_builder & builder) {
@@ -2097,7 +2128,7 @@ static common_chat_params common_chat_templates_apply_jinja(
    }

    // GPT-OSS
-   if (src.find("<|channel|>") != std::string::npos && params.json_schema.is_null()) {
+   if (src.find("<|channel|>") != std::string::npos) {
        return common_chat_params_init_gpt_oss(tmpl, params);
    }

package/src/llama.cpp/common/common.cpp CHANGED
@@ -558,13 +558,6 @@ std::string string_from(const struct llama_context * ctx, const std::vector<llam

        auto detokenized = common_token_to_piece(ctx, token);

-       detokenized.erase(
-           std::remove_if(
-               detokenized.begin(),
-               detokenized.end(),
-               [](const unsigned char c) { return !std::isprint(c); }),
-           detokenized.end());
-
        buf << "'" << detokenized << "'"
            << ":" << std::to_string(token);
    }
@@ -589,13 +582,6 @@ std::string string_from(const struct llama_context * ctx, const struct llama_bat

        auto detokenized = common_token_to_piece(ctx, batch.token[i]);

-       detokenized.erase(
-           std::remove_if(
-               detokenized.begin(),
-               detokenized.end(),
-               [](const unsigned char c) { return !std::isprint(c); }),
-           detokenized.end());
-
        buf << "\n" << std::to_string(i)
            << ", token '" << detokenized << "'"
            << ", pos " << std::to_string(batch.pos[i])
@@ -1167,7 +1153,6 @@ struct llama_context_params common_context_params_to_llama(const common_params &
    cparams.yarn_orig_ctx = params.yarn_orig_ctx;
    cparams.pooling_type = params.pooling_type;
    cparams.attention_type = params.attention_type;
-   cparams.defrag_thold = params.defrag_thold;
    cparams.cb_eval = params.cb_eval;
    cparams.cb_eval_user_data = params.cb_eval_user_data;
    cparams.offload_kqv = !params.no_kv_offload;
package/src/llama.cpp/common/common.h CHANGED
@@ -239,12 +239,15 @@ struct common_params_diffusion {
    bool add_gumbel_noise = false; // add gumbel noise to the logits if temp > 0.0
  };

+ // reasoning API response format (not to be confused as chat template's reasoning format)
  enum common_reasoning_format {
      COMMON_REASONING_FORMAT_NONE,
-     COMMON_REASONING_FORMAT_AUTO,
+     COMMON_REASONING_FORMAT_AUTO, // Same as deepseek, using `message.reasoning_content`
      COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY, // Extract thinking tag contents and return as `message.reasoning_content`, or leave inline in <think> tags in stream mode
      COMMON_REASONING_FORMAT_DEEPSEEK, // Extract thinking tag contents and return as `message.reasoning_content`, including in streaming deltas.
-     COMMON_REASONING_FORMAT_GRANITE, // Extract thinking tag contents and return as `message.reasoning_content`, including in streaming deltas.
+     // do not extend this enum unless you absolutely have to
+     // in most cases, use COMMON_REASONING_FORMAT_AUTO
+     // see: https://github.com/ggml-org/llama.cpp/pull/15408
  };

@@ -286,7 +289,6 @@ struct common_params {
    float yarn_beta_fast = 32.0f; // YaRN low correction dim
    float yarn_beta_slow = 1.0f; // YaRN high correction dim
    int32_t yarn_orig_ctx = 0; // YaRN original context length
-   float defrag_thold = 0.1f; // KV cache defragmentation threshold

    // offload params
    std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
@@ -373,7 +375,7 @@ struct common_params {
    bool cont_batching = true; // insert new sequences for decoding on-the-fly
    bool flash_attn = false; // flash attention
    bool no_perf = false; // disable performance metrics
-   bool ctx_shift = true; // context shift on inifinite text generation
+   bool ctx_shift = false; // context shift on infinite text generation
    bool swa_full = false; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
    bool kv_unified = false; // enable unified KV cache

package/src/llama.cpp/ggml/CMakeLists.txt CHANGED
@@ -158,7 +158,6 @@ option(GGML_CUDA "ggml: use CUDA"
  option(GGML_MUSA "ggml: use MUSA" OFF)
  option(GGML_CUDA_FORCE_MMQ "ggml: use mmq kernels instead of cuBLAS" OFF)
  option(GGML_CUDA_FORCE_CUBLAS "ggml: always use cuBLAS instead of mmq kernels" OFF)
- option(GGML_CUDA_F16 "ggml: use 16 bit floats for some calculations" OFF)
  set (GGML_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING
      "ggml: max. batch size for using peer access")
  option(GGML_CUDA_NO_PEER_COPY "ggml: do not use peer to peer copies" OFF)
package/src/llama.cpp/ggml/include/ggml.h CHANGED
@@ -244,6 +244,13 @@
  #define GGML_MROPE_SECTIONS 4

  #define GGML_UNUSED(x) (void)(x)
+ #ifdef __CUDACC__
+ template<typename... Args>
+ __host__ __device__ constexpr inline void ggml_unused_vars_impl(Args&&...) noexcept {}
+ #define GGML_UNUSED_VARS(...) ggml_unused_vars_impl(__VA_ARGS__)
+ #else
+ #define GGML_UNUSED_VARS(...) do { (void)sizeof((__VA_ARGS__, 0)); } while(0)
+ #endif // __CUDACC__

  #define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1))

@@ -505,6 +512,7 @@ extern "C" {
        GGML_OP_IM2COL,
        GGML_OP_IM2COL_BACK,
        GGML_OP_CONV_2D,
+       GGML_OP_CONV_3D,
        GGML_OP_CONV_2D_DW,
        GGML_OP_CONV_TRANSPOSE_2D,
        GGML_OP_POOL_1D,
@@ -1933,6 +1941,23 @@ extern "C" {
            int d0, // dilation dimension 0
            int d1); // dilation dimension 1

+   GGML_API struct ggml_tensor * ggml_conv_3d(
+           struct ggml_context * ctx,
+           struct ggml_tensor * a, // kernel [KW, KH, KD, IC * OC]
+           struct ggml_tensor * b, // input [W, H, D, C * N]
+           int s0, // stride
+           int s1,
+           int s2,
+           int p0, // padding
+           int p1,
+           int p2,
+           int d0, // dilation
+           int d1,
+           int d2,
+           int n_channels,
+           int n_batch,
+           int n_channels_out);
+
    enum ggml_op_pool {
        GGML_OP_POOL_MAX,
        GGML_OP_POOL_AVG,
package/src/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c CHANGED
@@ -278,6 +278,72 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
  #endif
  }

+ void ggml_vec_dot_mxfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
+     assert(nrc == 1);
+     UNUSED(nrc);
+     UNUSED(bx);
+     UNUSED(by);
+     UNUSED(bs);
+     assert(n % QK_MXFP4 == 0);
+     static_assert(QK_MXFP4 == QK8_0, "QK_MXFP4 and QK8_0 must be the same");
+
+     const block_mxfp4 * GGML_RESTRICT x = vx;
+     const block_q8_0 * GGML_RESTRICT y = vy;
+
+     const int nb = n / QK_MXFP4;
+
+     int ib = 0;
+     float sumf = 0;
+
+ #if defined(__POWER9_VECTOR__)
+     const vector signed char lowMask = vec_splats((signed char)0xF);
+     const vector unsigned char vshift4 = vec_splats((unsigned char)4);
+     vector float vsumf0 = vec_splats(0.0f);
+
+     vector signed char kv = vec_xl(0, (const signed char *)kvalues_mxfp4);
+
+ #pragma GCC unroll 8
+     for (; ib < nb; ++ib) {
+         __builtin_prefetch(x[ib].qs, 0, 1);
+         __builtin_prefetch(y[ib].qs, 0, 1);
+
+         vector float vyd = vec_splats(GGML_CPU_FP16_TO_FP32(y[ib].d) *
+                                       GGML_E8M0_TO_FP32_HALF(x[ib].e));
+
+         vector signed char q8y0 = vec_xl( 0, y[ib].qs);
+         vector signed char q8y1 = vec_xl(16, y[ib].qs);
+
+         vector signed char qxs = (vector signed char)vec_xl(0, x[ib].qs);
+
+         vector unsigned char lo_nibbles = (vector unsigned char)vec_and(qxs, lowMask);
+         vector unsigned char hi_nibbles = (vector unsigned char)vec_sr(qxs, vshift4);
+
+         vector signed char q4x0 = vec_perm(kv, kv, lo_nibbles);
+         vector signed char q4x1 = vec_perm(kv, kv, hi_nibbles);
+
+         vector signed short qv0 = vec_add(vec_mule(q4x0, q8y0), vec_mulo(q4x0, q8y0));
+         vector signed short qv1 = vec_add(vec_mule(q4x1, q8y1), vec_mulo(q4x1, q8y1));
+
+         vector signed int vsumi0 = vec_splats((int32_t)0);
+         vsumi0 = vec_sum4s(qv0, vsumi0);
+         vsumi0 = vec_sum4s(qv1, vsumi0);
+
+         vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vyd, vsumf0);
+     }
+
+     vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
+     vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
+     sumf = vec_extract(vsumf0, 0);
+     *s = sumf;
+ #else
+     UNUSED(x);
+     UNUSED(y);
+     UNUSED(ib);
+     UNUSED(sumf);
+     ggml_vec_dot_mxfp4_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
+ #endif
+ }
+
  void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
      const int qk = QK8_0;
      const int nb = n / qk;