@fugood/llama.node 1.1.8 → 1.1.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. package/lib/binding.ts +9 -0
  2. package/lib/index.js +9 -2
  3. package/lib/index.ts +57 -30
  4. package/lib/version.js +2 -2
  5. package/lib/version.ts +2 -2
  6. package/package.json +14 -14
  7. package/scripts/llama.cpp.patch +15 -5
  8. package/src/LlamaCompletionWorker.cpp +12 -3
  9. package/src/LlamaCompletionWorker.h +3 -1
  10. package/src/LlamaContext.cpp +14 -1
  11. package/src/llama.cpp/common/arg.cpp +6 -4
  12. package/src/llama.cpp/common/chat.cpp +34 -3
  13. package/src/llama.cpp/common/common.cpp +0 -15
  14. package/src/llama.cpp/common/common.h +1 -2
  15. package/src/llama.cpp/ggml/CMakeLists.txt +0 -1
  16. package/src/llama.cpp/ggml/include/ggml.h +25 -0
  17. package/src/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +316 -0
  18. package/src/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +0 -2
  19. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +8 -0
  20. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +6 -0
  21. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +142 -0
  22. package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +1 -0
  23. package/src/llama.cpp/include/llama.h +1 -110
  24. package/src/llama.cpp/src/CMakeLists.txt +2 -2
  25. package/src/llama.cpp/src/llama-arch.cpp +19 -0
  26. package/src/llama.cpp/src/llama-arch.h +1 -0
  27. package/src/llama.cpp/src/llama-chat.cpp +13 -2
  28. package/src/llama.cpp/src/llama-chat.h +1 -0
  29. package/src/llama.cpp/src/llama-context.cpp +5 -192
  30. package/src/llama.cpp/src/llama-context.h +2 -7
  31. package/src/llama.cpp/src/llama-cparams.h +0 -1
  32. package/src/llama.cpp/src/llama-graph.cpp +35 -57
  33. package/src/llama.cpp/src/llama-graph.h +36 -46
  34. package/src/llama.cpp/src/llama-hparams.cpp +25 -0
  35. package/src/llama.cpp/src/llama-hparams.h +6 -0
  36. package/src/llama.cpp/src/{llama-kv-cache-unified-iswa.cpp → llama-kv-cache-iswa.cpp} +69 -52
  37. package/src/llama.cpp/src/{llama-kv-cache-unified-iswa.h → llama-kv-cache-iswa.h} +28 -26
  38. package/src/llama.cpp/src/{llama-kv-cache-unified.cpp → llama-kv-cache.cpp} +123 -474
  39. package/src/llama.cpp/src/{llama-kv-cache-unified.h → llama-kv-cache.h} +34 -59
  40. package/src/llama.cpp/src/llama-kv-cells.h +21 -21
  41. package/src/llama.cpp/src/llama-memory-hybrid.cpp +34 -33
  42. package/src/llama.cpp/src/llama-memory-hybrid.h +24 -28
  43. package/src/llama.cpp/src/llama-memory-recurrent.cpp +7 -7
  44. package/src/llama.cpp/src/llama-memory-recurrent.h +8 -12
  45. package/src/llama.cpp/src/llama-memory.h +11 -8
  46. package/src/llama.cpp/src/llama-model.cpp +396 -187
  47. package/src/llama.cpp/src/llama-model.h +1 -0
package/lib/binding.ts CHANGED
@@ -59,6 +59,10 @@ export type LlamaModelOptions = {
  * Use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
  */
  swa_full?: boolean
+ /**
+ * Number of layers to keep MoE weights on CPU
+ */
+ n_cpu_moe?: number
  use_mlock?: boolean
  use_mmap?: boolean
  vocab_only?: boolean
@@ -96,6 +100,11 @@ export type LlamaCompletionOptions = {
  enable_thinking?: boolean
  thinking_forced_open?: boolean
  prompt?: string
+ /**
+ * Text to prefill the response with.
+ * This text will be added to the beginning of the generated response.
+ */
+ prefill_text?: string
  temperature?: number
  top_k?: number
  top_p?: number
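A minimal usage sketch for the two new options typed above (n_cpu_moe on LlamaModelOptions, prefill_text on LlamaCompletionOptions). The loadModel entry point, the model path field, and the result's text property are assumptions for illustration, not part of this diff; adjust them to the package's actual exports:

  import { loadModel } from '@fugood/llama.node' // assumed entry point

  const ctx = await loadModel({
    model: './model.gguf', // hypothetical local GGUF path; field name assumed
    n_cpu_moe: 8,          // keep MoE expert weights of the first 8 layers on the CPU
  })

  const result = await ctx.completion({
    messages: [{ role: 'user', content: 'Hello!' }],
    prefill_text: 'Sure, ', // prepended to the generated response before chat parsing
  })
  console.log(result.text)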
package/lib/index.js CHANGED
@@ -148,7 +148,12 @@ class LlamaContextWrapper {
  enable_thinking: (_a = params === null || params === void 0 ? void 0 : params.enable_thinking) !== null && _a !== void 0 ? _a : true,
  add_generation_prompt: params === null || params === void 0 ? void 0 : params.add_generation_prompt,
  now: params === null || params === void 0 ? void 0 : params.now,
- chat_template_kwargs: params === null || params === void 0 ? void 0 : params.chat_template_kwargs,
+ chat_template_kwargs: (params === null || params === void 0 ? void 0 : params.chat_template_kwargs)
+ ? Object.entries(params.chat_template_kwargs).reduce((acc, [key, value]) => {
+ acc[key] = JSON.stringify(value); // Each value is a stringified JSON object
+ return acc;
+ }, {})
+ : undefined,
  });
  if (!useJinja) {
  return {
@@ -179,7 +184,9 @@ class LlamaContextWrapper {
  return this.ctx.embedding(text);
  }
  rerank(query, documents, params) {
- return this.ctx.rerank(query, documents, params).then((results) => {
+ return this.ctx
+ .rerank(query, documents, params)
+ .then((results) => {
  // Sort by score descending and add document text for convenience
  return results
  .map((result) => (Object.assign(Object.assign({}, result), { document: documents[result.index] })))
package/lib/index.ts CHANGED
@@ -165,11 +165,11 @@ class LlamaContextWrapper {
  response_format?: CompletionResponseFormat
  tools?: Tool[]
  parallel_tool_calls?: boolean
- tool_choice?: string,
- enable_thinking?: boolean,
- add_generation_prompt?: boolean,
- now?: string | number,
- chat_template_kwargs?: Record<string, string>,
+ tool_choice?: string
+ enable_thinking?: boolean
+ add_generation_prompt?: boolean
+ now?: string | number
+ chat_template_kwargs?: Record<string, string>
  },
  ): FormattedChatResult {
  const {
@@ -192,7 +192,15 @@ class LlamaContextWrapper {
  enable_thinking: params?.enable_thinking ?? true,
  add_generation_prompt: params?.add_generation_prompt,
  now: params?.now,
- chat_template_kwargs: params?.chat_template_kwargs,
+ chat_template_kwargs: params?.chat_template_kwargs
+ ? Object.entries(params.chat_template_kwargs).reduce(
+ (acc, [key, value]) => {
+ acc[key] = JSON.stringify(value) // Each value is a stringified JSON object
+ return acc
+ },
+ {} as Record<string, any>,
+ )
+ : undefined,
  })

  if (!useJinja) {
@@ -218,18 +226,24 @@
  ): Promise<LlamaCompletionResult> {
  const { messages, media_paths = options.media_paths } =
  this._formatMediaChat(options.messages)
- return this.ctx.completion({
- ...options,
- messages,
- media_paths: options.media_paths || media_paths,
- }, callback || (() => {}))
+ return this.ctx.completion(
+ {
+ ...options,
+ messages,
+ media_paths: options.media_paths || media_paths,
+ },
+ callback || (() => {}),
+ )
  }

  stopCompletion(): void {
  return this.ctx.stopCompletion()
  }

- tokenize(text: string, { media_paths }: { media_paths?: string[] } = {}): Promise<TokenizeResult> {
+ tokenize(
+ text: string,
+ { media_paths }: { media_paths?: string[] } = {},
+ ): Promise<TokenizeResult> {
  return this.ctx.tokenize(text, media_paths)
  }

@@ -241,16 +255,27 @@
  return this.ctx.embedding(text)
  }

- rerank(query: string, documents: string[], params?: RerankParams): Promise<Array<RerankResult & { document: string }>> {
- return this.ctx.rerank(query, documents, params).then((results: RerankResult[]) => {
- // Sort by score descending and add document text for convenience
- return results
- .map((result: RerankResult) => ({
- ...result,
- document: documents[result.index],
- }))
- .sort((a: RerankResult & { document: string }, b: RerankResult & { document: string }) => b.score - a.score)
- })
+ rerank(
+ query: string,
+ documents: string[],
+ params?: RerankParams,
+ ): Promise<Array<RerankResult & { document: string }>> {
+ return this.ctx
+ .rerank(query, documents, params)
+ .then((results: RerankResult[]) => {
+ // Sort by score descending and add document text for convenience
+ return results
+ .map((result: RerankResult) => ({
+ ...result,
+ document: documents[result.index],
+ }))
+ .sort(
+ (
+ a: RerankResult & { document: string },
+ b: RerankResult & { document: string },
+ ) => b.score - a.score,
+ )
+ })
  }

  saveSession(path: string): Promise<void> {
@@ -277,10 +302,7 @@ class LlamaContextWrapper {
  return this.ctx.getLoadedLoraAdapters()
  }

- initMultimodal(options: {
- path: string
- use_gpu?: boolean
- }): boolean {
+ initMultimodal(options: { path: string; use_gpu?: boolean }): boolean {
  return this.ctx.initMultimodal(options)
  }

@@ -299,7 +321,7 @@ class LlamaContextWrapper {
  return this.ctx.getMultimodalSupport()
  }

- initVocoder(options: { path: string, n_batch?: number }): boolean {
+ initVocoder(options: { path: string; n_batch?: number }): boolean {
  return this.ctx.initVocoder(options)
  }

@@ -311,7 +333,10 @@ class LlamaContextWrapper {
  return this.ctx.isVocoderEnabled()
  }

- getFormattedAudioCompletion(speaker: string|null, text: string): {
+ getFormattedAudioCompletion(
+ speaker: string | null,
+ text: string,
+ ): {
  prompt: string
  grammar?: string
  } {
@@ -322,7 +347,7 @@ class LlamaContextWrapper {
  return this.ctx.getAudioCompletionGuideTokens(text)
  }

- decodeAudioTokens(tokens: number[]|Int32Array): Promise<Float32Array> {
+ decodeAudioTokens(tokens: number[] | Int32Array): Promise<Float32Array> {
  return this.ctx.decodeAudioTokens(tokens)
  }
  }
@@ -348,7 +373,9 @@ const modelInfoSkip = [
  'tokenizer.ggml.scores',
  ]

- export const loadLlamaModelInfo = async (path: string): Promise<GGUFModelInfo> => {
+ export const loadLlamaModelInfo = async (
+ path: string,
+ ): Promise<GGUFModelInfo> => {
  const variant = 'default'
  mods[variant] ??= await loadModule(variant)
  refreshNativeLogSetup()
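The chat_template_kwargs handling in formatMessages above now JSON-stringifies each value before it is passed to the native layer. A standalone sketch of that transformation (the example keys and values are hypothetical):

  const kwargs = { enable_thinking: false, reasoning_effort: 'high' }

  const serialized = Object.entries(kwargs).reduce(
    (acc, [key, value]) => {
      acc[key] = JSON.stringify(value) // mirrors the reducer added above
      return acc
    },
    {} as Record<string, string>,
  )

  // serialized => { enable_thinking: 'false', reasoning_effort: '"high"' }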
package/lib/version.js CHANGED
@@ -1,5 +1,5 @@
  "use strict";
  Object.defineProperty(exports, "__esModule", { value: true });
  exports.BUILD_COMMIT = exports.BUILD_NUMBER = void 0;
- exports.BUILD_NUMBER = '6096';
- exports.BUILD_COMMIT = 'fd1234cb';
+ exports.BUILD_NUMBER = '6250';
+ exports.BUILD_COMMIT = 'e92734d51';
package/lib/version.ts CHANGED
@@ -1,2 +1,2 @@
- export const BUILD_NUMBER = '6096';
- export const BUILD_COMMIT = 'fd1234cb';
+ export const BUILD_NUMBER = '6250';
+ export const BUILD_COMMIT = 'e92734d51';
package/package.json CHANGED
@@ -1,7 +1,7 @@
  {
  "name": "@fugood/llama.node",
  "access": "public",
- "version": "1.1.8",
+ "version": "1.1.10",
  "description": "An another Node binding of llama.cpp",
  "main": "lib/index.js",
  "scripts": {
@@ -71,19 +71,19 @@
  "CMakeLists.txt"
  ],
  "optionalDependencies": {
- "@fugood/node-llama-linux-x64": "1.1.8",
- "@fugood/node-llama-linux-x64-vulkan": "1.1.8",
- "@fugood/node-llama-linux-x64-cuda": "1.1.8",
- "@fugood/node-llama-linux-arm64": "1.1.8",
- "@fugood/node-llama-linux-arm64-vulkan": "1.1.8",
- "@fugood/node-llama-linux-arm64-cuda": "1.1.8",
- "@fugood/node-llama-win32-x64": "1.1.8",
- "@fugood/node-llama-win32-x64-vulkan": "1.1.8",
- "@fugood/node-llama-win32-x64-cuda": "1.1.8",
- "@fugood/node-llama-win32-arm64": "1.1.8",
- "@fugood/node-llama-win32-arm64-vulkan": "1.1.8",
- "@fugood/node-llama-darwin-x64": "1.1.8",
- "@fugood/node-llama-darwin-arm64": "1.1.8"
+ "@fugood/node-llama-linux-x64": "1.1.10",
+ "@fugood/node-llama-linux-x64-vulkan": "1.1.10",
+ "@fugood/node-llama-linux-x64-cuda": "1.1.10",
+ "@fugood/node-llama-linux-arm64": "1.1.10",
+ "@fugood/node-llama-linux-arm64-vulkan": "1.1.10",
+ "@fugood/node-llama-linux-arm64-cuda": "1.1.10",
+ "@fugood/node-llama-win32-x64": "1.1.10",
+ "@fugood/node-llama-win32-x64-vulkan": "1.1.10",
+ "@fugood/node-llama-win32-x64-cuda": "1.1.10",
+ "@fugood/node-llama-win32-arm64": "1.1.10",
+ "@fugood/node-llama-win32-arm64-vulkan": "1.1.10",
+ "@fugood/node-llama-darwin-x64": "1.1.10",
+ "@fugood/node-llama-darwin-arm64": "1.1.10"
  },
  "devDependencies": {
  "@babel/preset-env": "^7.24.4",
package/scripts/llama.cpp.patch CHANGED
@@ -1,5 +1,5 @@
  diff --git a/src/llama.cpp/common/chat.cpp b/src/llama.cpp/common/chat.cpp
- index 23d3828f9..ca48af00c 100644
+ index 111b4a21b..16ce87672 100644
  --- a/src/llama.cpp/common/chat.cpp
  +++ b/src/llama.cpp/common/chat.cpp
  @@ -6,9 +6,6 @@
@@ -29,6 +29,16 @@ index 23d3828f9..ca48af00c 100644
  struct templates_params {
  json messages;
  json tools;
+ @@ -784,8 +771,7 @@ static std::string apply(
+ if (additional_context) {
+ tmpl_inputs.extra_context.merge_patch(*additional_context);
+ }
+ - // TODO: add flag to control date/time, if only for testing purposes.
+ - // tmpl_inputs.now = std::chrono::system_clock::now();
+ + tmpl_inputs.now = inputs.now;
+
+ minja::chat_template_options tmpl_opts;
+ // To avoid double BOS / EOS tokens, we're manually removing begining / trailing tokens
  diff --git a/src/llama.cpp/common/chat.h b/src/llama.cpp/common/chat.h
  index d1e480c91..437e64e29 100644
  --- a/src/llama.cpp/common/chat.h
@@ -54,10 +64,10 @@ index d1e480c91..437e64e29 100644
  struct common_chat_tool_call {
  std::string name;
  diff --git a/src/llama.cpp/common/common.cpp b/src/llama.cpp/common/common.cpp
- index 67dd5404f..909a97c66 100644
+ index fdce1dcde..55aac3412 100644
  --- a/src/llama.cpp/common/common.cpp
  +++ b/src/llama.cpp/common/common.cpp
- @@ -1117,6 +1117,7 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
+ @@ -1103,6 +1103,7 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
  mparams.n_gpu_layers = params.n_gpu_layers;
  }

@@ -66,10 +76,10 @@ index 67dd5404f..909a97c66 100644
  mparams.split_mode = params.split_mode;
  mparams.tensor_split = params.tensor_split;
  diff --git a/src/llama.cpp/common/common.h b/src/llama.cpp/common/common.h
- index 75596e6b3..0e04694c8 100644
+ index 390dda5e5..f259ca785 100644
  --- a/src/llama.cpp/common/common.h
  +++ b/src/llama.cpp/common/common.h
- @@ -267,6 +267,7 @@ struct lr_opt {
+ @@ -270,6 +270,7 @@ struct lr_opt {
  struct ggml_opt_optimizer_params common_opt_lr_pars(void * userdata);

  struct common_params {
package/src/LlamaCompletionWorker.cpp CHANGED
@@ -35,12 +35,14 @@ LlamaCompletionWorker::LlamaCompletionWorker(
  const std::vector<std::string> &media_paths,
  const std::vector<llama_token> &guide_tokens,
  bool has_vocoder,
- tts_type tts_type_val)
+ tts_type tts_type_val,
+ const std::string &prefill_text)
  : AsyncWorker(info.Env()), Deferred(info.Env()), _sess(sess),
  _params(params), _stop_words(stop_words), _chat_format(chat_format),
  _thinking_forced_open(thinking_forced_open),
  _reasoning_format(reasoning_format),
  _media_paths(media_paths), _guide_tokens(guide_tokens),
+ _prefill_text(prefill_text),
  _has_vocoder(has_vocoder), _tts_type(tts_type_val) {
  if (!callback.IsEmpty()) {
  _tsfn = Napi::ThreadSafeFunction::New(info.Env(), callback,
@@ -68,8 +70,11 @@ LlamaCompletionWorker::PartialOutput LlamaCompletionWorker::getPartialOutput(con

  chat_syntax.parse_tool_calls = true;

+ // Combine prefill_text with generated_text for parsing
+ std::string full_text = _prefill_text + generated_text;
+
  // Use is_partial=true for streaming partial output
- common_chat_msg parsed_msg = common_chat_parse(generated_text, true, chat_syntax);
+ common_chat_msg parsed_msg = common_chat_parse(full_text, true, chat_syntax);

  result.content = parsed_msg.content;
  result.reasoning_content = parsed_msg.reasoning_content;
@@ -156,6 +161,7 @@ void LlamaCompletionWorker::Execute() {
  auto embd = _sess->tokens_ptr();
  embd->reserve(embd->size() + max_len);

+
  if (is_enc_dec) {
  if (n_input > 0) {
  // Decode tokens in batches using n_batch as chunk size
@@ -378,8 +384,11 @@ void LlamaCompletionWorker::OnOK() {
  chat_syntax.thinking_forced_open = _thinking_forced_open;

  chat_syntax.reasoning_format = common_reasoning_format_from_name(_reasoning_format);
+
+ // Combine prefill_text with generated_text for final parsing
+ std::string full_text = _prefill_text + _result.text;
  common_chat_msg message = common_chat_parse(
- _result.text,
+ full_text,
  false,
  chat_syntax
  );
package/src/LlamaCompletionWorker.h CHANGED
@@ -26,7 +26,8 @@ public:
  const std::vector<std::string> &media_paths = {},
  const std::vector<llama_token> &guide_tokens = {},
  bool has_vocoder = false,
- tts_type tts_type_val = UNKNOWN);
+ tts_type tts_type_val = UNKNOWN,
+ const std::string &prefill_text = "");

  ~LlamaCompletionWorker();

@@ -58,6 +59,7 @@ private:
  std::string _reasoning_format;
  std::vector<std::string> _media_paths;
  std::vector<llama_token> _guide_tokens;
+ std::string _prefill_text;
  std::function<void()> _onComplete;
  bool _has_callback = false;
  bool _interrupted = false;
package/src/LlamaContext.cpp CHANGED
@@ -15,6 +15,7 @@
  #include "llama-impl.h"

  #include <atomic>
+ #include <list>
  #include <mutex>
  #include <queue>

@@ -258,6 +259,16 @@ LlamaContext::LlamaContext(const Napi::CallbackInfo &info)
  params.numa =
  static_cast<ggml_numa_strategy>(get_option<uint32_t>(options, "numa", 0));

+ int n_cpu_moe = get_option<int32_t>(options, "n_cpu_moe", 0);
+ if (n_cpu_moe > 0) {
+ static std::list<std::string> buft_overrides;
+ for (int i = 0; i < n_cpu_moe; ++i) {
+ buft_overrides.push_back(string_format("blk\\.%d\\.ffn_(up|down|gate)_exps", i));
+ params.tensor_buft_overrides.push_back({buft_overrides.back().c_str(), ggml_backend_cpu_buffer_type()});
+ }
+ params.tensor_buft_overrides.push_back({nullptr, nullptr});
+ }
+
  llama_backend_init();
  llama_numa_init(params.numa);

@@ -924,6 +935,8 @@ Napi::Value LlamaContext::Completion(const Napi::CallbackInfo &info) {
  json_schema_to_grammar(json::parse(json_schema_str));
  }

+ std::string prefill_text = get_option<std::string>(options, "prefill_text", "");
+
  params.n_predict = get_option<int32_t>(options, "n_predict", -1);
  params.sampling.temp = get_option<float>(options, "temperature", 0.80f);
  params.sampling.top_k = get_option<int32_t>(options, "top_k", 40);
@@ -996,7 +1009,7 @@ Napi::Value LlamaContext::Completion(const Napi::CallbackInfo &info) {
  auto *worker =
  new LlamaCompletionWorker(info, _sess, callback, params, stop_words,
  chat_format, thinking_forced_open, reasoning_format, media_paths, guide_tokens,
- _has_vocoder, _tts_type);
+ _has_vocoder, _tts_type, prefill_text);
  worker->Queue();
  _wip = worker;
  worker->OnComplete([this]() { _wip = nullptr; });
package/src/llama.cpp/common/arg.cpp CHANGED
@@ -1532,7 +1532,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY}).set_env("LLAMA_ARG_NO_CONTEXT_SHIFT"));
  add_opt(common_arg(
  {"--context-shift"},
- string_format("enables context shift on infinite text generation (default: %s)", params.ctx_shift ? "disabled" : "enabled"),
+ string_format("enables context shift on infinite text generation (default: %s)", params.ctx_shift ? "enabled" : "disabled"),
  [](common_params & params) {
  params.ctx_shift = true;
  }
@@ -1755,7 +1755,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  [](common_params & params) {
  params.warmup = false;
  }
- ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL}));
+ ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_PERPLEXITY}));
  add_opt(common_arg(
  {"--spm-infill"},
  string_format(
@@ -2254,9 +2254,11 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
  add_opt(common_arg(
  {"-dt", "--defrag-thold"}, "N",
- string_format("KV cache defragmentation threshold (default: %.1f, < 0 - disabled)", (double)params.defrag_thold),
+ string_format("KV cache defragmentation threshold (DEPRECATED)"),
  [](common_params & params, const std::string & value) {
- params.defrag_thold = std::stof(value);
+ GGML_UNUSED(params);
+ GGML_UNUSED(value);
+ LOG_WRN("DEPRECATED: --defrag-thold is deprecated and no longer necessary to specify\n");
  }
  ).set_env("LLAMA_ARG_DEFRAG_THOLD"));
  add_opt(common_arg(
package/src/llama.cpp/common/chat.cpp CHANGED
@@ -134,6 +134,7 @@ struct templates_params {
  json extra_context;
  bool add_bos;
  bool add_eos;
+ bool is_inference = true;
  };

  common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::string & tool_choice) {
@@ -770,8 +771,7 @@ static std::string apply(
  if (additional_context) {
  tmpl_inputs.extra_context.merge_patch(*additional_context);
  }
- // TODO: add flag to control date/time, if only for testing purposes.
- // tmpl_inputs.now = std::chrono::system_clock::now();
+ tmpl_inputs.now = inputs.now;

  minja::chat_template_options tmpl_opts;
  // To avoid double BOS / EOS tokens, we're manually removing begining / trailing tokens
@@ -1323,6 +1323,17 @@ static common_chat_params common_chat_params_init_gpt_oss(const common_chat_temp
  common_chat_params data;
  auto prompt = apply(tmpl, inputs);

+ // Check if we need to replace the return token with end token during
+ // inference and without generation prompt. For more details see:
+ // https://github.com/ggml-org/llama.cpp/issues/15417
+ if (inputs.is_inference && !inputs.add_generation_prompt) {
+ static constexpr std::string_view return_token = "<|return|>";
+ static constexpr std::string_view end_token = "<|end|>";
+ if (size_t pos = prompt.rfind(return_token); pos != std::string::npos) {
+ prompt.replace(pos, return_token.length(), end_token);
+ }
+ }
+
  data.prompt = prompt;
  data.format = COMMON_CHAT_FORMAT_GPT_OSS;

@@ -1336,6 +1347,26 @@ static common_chat_params common_chat_params_init_gpt_oss(const common_chat_temp
  "<|end|>",
  };

+ if (!inputs.json_schema.is_null()) {
+ data.grammar_lazy = false;
+ data.grammar = build_grammar([&](const common_grammar_builder & builder) {
+ auto schema = inputs.json_schema;
+ builder.resolve_refs(schema);
+
+ auto not_end = builder.add_rule("not-end",
+ "[^<] | \"<\" [^|] | \"<|\" [^e] | \"<|e\" [^n] | \"<|en\" [^d] | \"<|end\" [^|] | \"<|end|\" [^>]");
+ auto analysis = builder.add_rule("analysis",
+ "\"<|channel|>analysis<|message|>\" ( " + not_end + " )* \"<|end|>\"");
+ auto constraint = builder.add_rule("constraint", "\"<|constrain|>\"? [a-zA-Z0-9_-]+");
+ auto final = builder.add_rule("final",
+ "\"<|channel|>final\" ( \" \" " + constraint + " )? \"<|message|>\" " +
+ builder.add_schema("response", schema)
+ );
+
+ builder.add_rule("root", "( " + analysis + " \"<|start|>assistant\" )? " + final);
+ });
+ }
+
  if (inputs.tools.is_array() && !inputs.tools.empty()) {
  data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
  data.grammar = build_grammar([&](const common_grammar_builder & builder) {
@@ -2096,7 +2127,7 @@ static common_chat_params common_chat_templates_apply_jinja(
  }

  // GPT-OSS
- if (src.find("<|channel|>") != std::string::npos && params.json_schema.is_null()) {
+ if (src.find("<|channel|>") != std::string::npos) {
  return common_chat_params_init_gpt_oss(tmpl, params);
  }

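With the two GPT-OSS changes above, a request that carries a JSON schema is no longer diverted away from the GPT-OSS handler; instead common_chat_params_init_gpt_oss builds a grammar that constrains the final channel to that schema. From the Node API this path is reached via response_format in the completion options. A sketch reusing the ctx from the earlier example; the OpenAI-style field names are an assumption and should be checked against CompletionResponseFormat in lib/binding.ts:

  const structured = await ctx.completion({
    messages: [{ role: 'user', content: 'Pick a color.' }],
    response_format: {
      type: 'json_schema', // assumed OpenAI-compatible shape
      json_schema: {
        schema: {
          type: 'object',
          properties: { color: { type: 'string' } },
          required: ['color'],
        },
      },
    },
  })
  console.log(structured.text) // final-channel output constrained to the schema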
package/src/llama.cpp/common/common.cpp CHANGED
@@ -558,13 +558,6 @@ std::string string_from(const struct llama_context * ctx, const std::vector<llam

  auto detokenized = common_token_to_piece(ctx, token);

- detokenized.erase(
- std::remove_if(
- detokenized.begin(),
- detokenized.end(),
- [](const unsigned char c) { return !std::isprint(c); }),
- detokenized.end());
-
  buf << "'" << detokenized << "'"
  << ":" << std::to_string(token);
  }
@@ -589,13 +582,6 @@ std::string string_from(const struct llama_context * ctx, const struct llama_bat

  auto detokenized = common_token_to_piece(ctx, batch.token[i]);

- detokenized.erase(
- std::remove_if(
- detokenized.begin(),
- detokenized.end(),
- [](const unsigned char c) { return !std::isprint(c); }),
- detokenized.end());
-
  buf << "\n" << std::to_string(i)
  << ", token '" << detokenized << "'"
  << ", pos " << std::to_string(batch.pos[i])
@@ -1167,7 +1153,6 @@ struct llama_context_params common_context_params_to_llama(const common_params &
  cparams.yarn_orig_ctx = params.yarn_orig_ctx;
  cparams.pooling_type = params.pooling_type;
  cparams.attention_type = params.attention_type;
- cparams.defrag_thold = params.defrag_thold;
  cparams.cb_eval = params.cb_eval;
  cparams.cb_eval_user_data = params.cb_eval_user_data;
  cparams.offload_kqv = !params.no_kv_offload;
package/src/llama.cpp/common/common.h CHANGED
@@ -289,7 +289,6 @@ struct common_params {
  float yarn_beta_fast = 32.0f; // YaRN low correction dim
  float yarn_beta_slow = 1.0f; // YaRN high correction dim
  int32_t yarn_orig_ctx = 0; // YaRN original context length
- float defrag_thold = 0.1f; // KV cache defragmentation threshold

  // offload params
  std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
@@ -376,7 +375,7 @@ struct common_params {
  bool cont_batching = true; // insert new sequences for decoding on-the-fly
  bool flash_attn = false; // flash attention
  bool no_perf = false; // disable performance metrics
- bool ctx_shift = false; // context shift on inifinite text generation
+ bool ctx_shift = false; // context shift on infinite text generation
  bool swa_full = false; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
  bool kv_unified = false; // enable unified KV cache

package/src/llama.cpp/ggml/CMakeLists.txt CHANGED
@@ -158,7 +158,6 @@ option(GGML_CUDA "ggml: use CUDA"
  option(GGML_MUSA "ggml: use MUSA" OFF)
  option(GGML_CUDA_FORCE_MMQ "ggml: use mmq kernels instead of cuBLAS" OFF)
  option(GGML_CUDA_FORCE_CUBLAS "ggml: always use cuBLAS instead of mmq kernels" OFF)
- option(GGML_CUDA_F16 "ggml: use 16 bit floats for some calculations" OFF)
  set (GGML_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING
  "ggml: max. batch size for using peer access")
  option(GGML_CUDA_NO_PEER_COPY "ggml: do not use peer to peer copies" OFF)
package/src/llama.cpp/ggml/include/ggml.h CHANGED
@@ -244,6 +244,13 @@
  #define GGML_MROPE_SECTIONS 4

  #define GGML_UNUSED(x) (void)(x)
+ #ifdef __CUDACC__
+ template<typename... Args>
+ __host__ __device__ constexpr inline void ggml_unused_vars_impl(Args&&...) noexcept {}
+ #define GGML_UNUSED_VARS(...) ggml_unused_vars_impl(__VA_ARGS__)
+ #else
+ #define GGML_UNUSED_VARS(...) do { (void)sizeof((__VA_ARGS__, 0)); } while(0)
+ #endif // __CUDACC__

  #define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1))

@@ -505,6 +512,7 @@ extern "C" {
  GGML_OP_IM2COL,
  GGML_OP_IM2COL_BACK,
  GGML_OP_CONV_2D,
+ GGML_OP_CONV_3D,
  GGML_OP_CONV_2D_DW,
  GGML_OP_CONV_TRANSPOSE_2D,
  GGML_OP_POOL_1D,
@@ -1933,6 +1941,23 @@ extern "C" {
  int d0, // dilation dimension 0
  int d1); // dilation dimension 1

+ GGML_API struct ggml_tensor * ggml_conv_3d(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a, // kernel [KW, KH, KD, IC * OC]
+ struct ggml_tensor * b, // input [W, H, D, C * N]
+ int s0, // stride
+ int s1,
+ int s2,
+ int p0, // padding
+ int p1,
+ int p2,
+ int d0, // dilation
+ int d1,
+ int d2,
+ int n_channels,
+ int n_batch,
+ int n_channels_out);
+
  enum ggml_op_pool {
  GGML_OP_POOL_MAX,
  GGML_OP_POOL_AVG,