@fugood/llama.node 1.1.8 → 1.1.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. package/lib/binding.ts +4 -0
  2. package/lib/index.js +9 -2
  3. package/lib/index.ts +57 -30
  4. package/lib/version.js +2 -2
  5. package/lib/version.ts +2 -2
  6. package/package.json +14 -14
  7. package/src/LlamaContext.cpp +11 -0
  8. package/src/llama.cpp/common/arg.cpp +6 -4
  9. package/src/llama.cpp/common/chat.cpp +33 -1
  10. package/src/llama.cpp/common/common.cpp +0 -15
  11. package/src/llama.cpp/common/common.h +1 -2
  12. package/src/llama.cpp/ggml/CMakeLists.txt +0 -1
  13. package/src/llama.cpp/ggml/include/ggml.h +25 -0
  14. package/src/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +316 -0
  15. package/src/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +0 -2
  16. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +8 -0
  17. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +6 -0
  18. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +142 -0
  19. package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +1 -0
  20. package/src/llama.cpp/include/llama.h +1 -110
  21. package/src/llama.cpp/src/CMakeLists.txt +2 -2
  22. package/src/llama.cpp/src/llama-arch.cpp +19 -0
  23. package/src/llama.cpp/src/llama-arch.h +1 -0
  24. package/src/llama.cpp/src/llama-chat.cpp +13 -2
  25. package/src/llama.cpp/src/llama-chat.h +1 -0
  26. package/src/llama.cpp/src/llama-context.cpp +5 -192
  27. package/src/llama.cpp/src/llama-context.h +2 -7
  28. package/src/llama.cpp/src/llama-cparams.h +0 -1
  29. package/src/llama.cpp/src/llama-graph.cpp +35 -57
  30. package/src/llama.cpp/src/llama-graph.h +36 -46
  31. package/src/llama.cpp/src/{llama-kv-cache-unified-iswa.cpp → llama-kv-cache-iswa.cpp} +47 -47
  32. package/src/llama.cpp/src/{llama-kv-cache-unified-iswa.h → llama-kv-cache-iswa.h} +26 -26
  33. package/src/llama.cpp/src/{llama-kv-cache-unified.cpp → llama-kv-cache.cpp} +88 -441
  34. package/src/llama.cpp/src/{llama-kv-cache-unified.h → llama-kv-cache.h} +20 -43
  35. package/src/llama.cpp/src/llama-kv-cells.h +21 -21
  36. package/src/llama.cpp/src/llama-memory-hybrid.cpp +5 -5
  37. package/src/llama.cpp/src/llama-memory-hybrid.h +6 -6
  38. package/src/llama.cpp/src/llama-memory-recurrent.h +1 -1
  39. package/src/llama.cpp/src/llama-memory.h +3 -8
  40. package/src/llama.cpp/src/llama-model.cpp +369 -176
  41. package/src/llama.cpp/src/llama-model.h +1 -0
package/lib/binding.ts CHANGED
@@ -59,6 +59,10 @@ export type LlamaModelOptions = {
   * Use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
   */
  swa_full?: boolean
+ /**
+  * Number of layers to keep MoE weights on CPU
+  */
+ n_cpu_moe?: number
  use_mlock?: boolean
  use_mmap?: boolean
  vocab_only?: boolean
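Note: `n_cpu_moe` keeps the MoE expert weights of the first N layers in CPU buffers while the rest of the model can be offloaded. A minimal usage sketch, assuming the package's `loadModel` entry point and an illustrative GGUF path (neither appears in this diff):

import { loadModel } from '@fugood/llama.node'

// Hypothetical example: apart from n_cpu_moe and swa_full, the option names
// (model, n_gpu_layers) are assumed from LlamaModelOptions, and the path is illustrative.
const context = await loadModel({
  model: './models/moe-model.gguf',
  n_gpu_layers: 99, // offload what fits on the GPU
  n_cpu_moe: 8,     // keep MoE expert weights of the first 8 layers on CPU
})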
package/lib/index.js CHANGED
@@ -148,7 +148,12 @@ class LlamaContextWrapper {
  enable_thinking: (_a = params === null || params === void 0 ? void 0 : params.enable_thinking) !== null && _a !== void 0 ? _a : true,
  add_generation_prompt: params === null || params === void 0 ? void 0 : params.add_generation_prompt,
  now: params === null || params === void 0 ? void 0 : params.now,
- chat_template_kwargs: params === null || params === void 0 ? void 0 : params.chat_template_kwargs,
+ chat_template_kwargs: (params === null || params === void 0 ? void 0 : params.chat_template_kwargs)
+   ? Object.entries(params.chat_template_kwargs).reduce((acc, [key, value]) => {
+       acc[key] = JSON.stringify(value); // Each value is a stringified JSON object
+       return acc;
+     }, {})
+   : undefined,
  });
  if (!useJinja) {
  return {
@@ -179,7 +184,9 @@ class LlamaContextWrapper {
  return this.ctx.embedding(text);
  }
  rerank(query, documents, params) {
- return this.ctx.rerank(query, documents, params).then((results) => {
+ return this.ctx
+   .rerank(query, documents, params)
+   .then((results) => {
  // Sort by score descending and add document text for convenience
  return results
  .map((result) => (Object.assign(Object.assign({}, result), { document: documents[result.index] })))
package/lib/index.ts CHANGED
@@ -165,11 +165,11 @@ class LlamaContextWrapper {
  response_format?: CompletionResponseFormat
  tools?: Tool[]
  parallel_tool_calls?: boolean
- tool_choice?: string,
- enable_thinking?: boolean,
- add_generation_prompt?: boolean,
- now?: string | number,
- chat_template_kwargs?: Record<string, string>,
+ tool_choice?: string
+ enable_thinking?: boolean
+ add_generation_prompt?: boolean
+ now?: string | number
+ chat_template_kwargs?: Record<string, string>
  },
  ): FormattedChatResult {
  const {
@@ -192,7 +192,15 @@
  enable_thinking: params?.enable_thinking ?? true,
  add_generation_prompt: params?.add_generation_prompt,
  now: params?.now,
- chat_template_kwargs: params?.chat_template_kwargs,
+ chat_template_kwargs: params?.chat_template_kwargs
+   ? Object.entries(params.chat_template_kwargs).reduce(
+       (acc, [key, value]) => {
+         acc[key] = JSON.stringify(value) // Each value is a stringified JSON object
+         return acc
+       },
+       {} as Record<string, any>,
+     )
+   : undefined,
  })

  if (!useJinja) {
@@ -218,18 +226,24 @@
  ): Promise<LlamaCompletionResult> {
  const { messages, media_paths = options.media_paths } =
  this._formatMediaChat(options.messages)
- return this.ctx.completion({
-   ...options,
-   messages,
-   media_paths: options.media_paths || media_paths,
- }, callback || (() => {}))
+ return this.ctx.completion(
+   {
+     ...options,
+     messages,
+     media_paths: options.media_paths || media_paths,
+   },
+   callback || (() => {}),
+ )
  }

  stopCompletion(): void {
  return this.ctx.stopCompletion()
  }

- tokenize(text: string, { media_paths }: { media_paths?: string[] } = {}): Promise<TokenizeResult> {
+ tokenize(
+   text: string,
+   { media_paths }: { media_paths?: string[] } = {},
+ ): Promise<TokenizeResult> {
  return this.ctx.tokenize(text, media_paths)
  }

@@ -241,16 +255,27 @@
  return this.ctx.embedding(text)
  }

- rerank(query: string, documents: string[], params?: RerankParams): Promise<Array<RerankResult & { document: string }>> {
-   return this.ctx.rerank(query, documents, params).then((results: RerankResult[]) => {
-     // Sort by score descending and add document text for convenience
-     return results
-       .map((result: RerankResult) => ({
-         ...result,
-         document: documents[result.index],
-       }))
-       .sort((a: RerankResult & { document: string }, b: RerankResult & { document: string }) => b.score - a.score)
-   })
+ rerank(
+   query: string,
+   documents: string[],
+   params?: RerankParams,
+ ): Promise<Array<RerankResult & { document: string }>> {
+   return this.ctx
+     .rerank(query, documents, params)
+     .then((results: RerankResult[]) => {
+       // Sort by score descending and add document text for convenience
+       return results
+         .map((result: RerankResult) => ({
+           ...result,
+           document: documents[result.index],
+         }))
+         .sort(
+           (
+             a: RerankResult & { document: string },
+             b: RerankResult & { document: string },
+           ) => b.score - a.score,
+         )
+     })
  }

  saveSession(path: string): Promise<void> {
@@ -277,10 +302,7 @@
  return this.ctx.getLoadedLoraAdapters()
  }

- initMultimodal(options: {
-   path: string
-   use_gpu?: boolean
- }): boolean {
+ initMultimodal(options: { path: string; use_gpu?: boolean }): boolean {
  return this.ctx.initMultimodal(options)
  }

@@ -299,7 +321,7 @@
  return this.ctx.getMultimodalSupport()
  }

- initVocoder(options: { path: string, n_batch?: number }): boolean {
+ initVocoder(options: { path: string; n_batch?: number }): boolean {
  return this.ctx.initVocoder(options)
  }

@@ -311,7 +333,10 @@
  return this.ctx.isVocoderEnabled()
  }

- getFormattedAudioCompletion(speaker: string|null, text: string): {
+ getFormattedAudioCompletion(
+   speaker: string | null,
+   text: string,
+ ): {
  prompt: string
  grammar?: string
  } {
@@ -322,7 +347,7 @@
  return this.ctx.getAudioCompletionGuideTokens(text)
  }

- decodeAudioTokens(tokens: number[]|Int32Array): Promise<Float32Array> {
+ decodeAudioTokens(tokens: number[] | Int32Array): Promise<Float32Array> {
  return this.ctx.decodeAudioTokens(tokens)
  }
  }
@@ -348,7 +373,9 @@ const modelInfoSkip = [
  'tokenizer.ggml.scores',
  ]

- export const loadLlamaModelInfo = async (path: string): Promise<GGUFModelInfo> => {
+ export const loadLlamaModelInfo = async (
+   path: string,
+ ): Promise<GGUFModelInfo> => {
  const variant = 'default'
  mods[variant] ??= await loadModule(variant)
  refreshNativeLogSetup()
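The `chat_template_kwargs` change above serializes every value with `JSON.stringify` before it is handed to the native layer, so the Jinja template receives JSON-encoded strings rather than raw JavaScript values. A small standalone sketch of that transform (the `reasoning_effort` key is only an illustration, not part of this diff):

// Same reduce as in the wrapper above, shown on its own.
const chat_template_kwargs: Record<string, string> = { reasoning_effort: 'high' }

const native_kwargs = Object.entries(chat_template_kwargs).reduce(
  (acc, [key, value]) => {
    acc[key] = JSON.stringify(value) // 'high' becomes '"high"'
    return acc
  },
  {} as Record<string, string>,
)
// native_kwargs is { reasoning_effort: '"high"' }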
package/lib/version.js CHANGED
@@ -1,5 +1,5 @@
  "use strict";
  Object.defineProperty(exports, "__esModule", { value: true });
  exports.BUILD_COMMIT = exports.BUILD_NUMBER = void 0;
- exports.BUILD_NUMBER = '6096';
- exports.BUILD_COMMIT = 'fd1234cb';
+ exports.BUILD_NUMBER = '6250';
+ exports.BUILD_COMMIT = 'e92734d51';
package/lib/version.ts CHANGED
@@ -1,2 +1,2 @@
- export const BUILD_NUMBER = '6096';
- export const BUILD_COMMIT = 'fd1234cb';
+ export const BUILD_NUMBER = '6250';
+ export const BUILD_COMMIT = 'e92734d51';
package/package.json CHANGED
@@ -1,7 +1,7 @@
  {
  "name": "@fugood/llama.node",
  "access": "public",
- "version": "1.1.8",
+ "version": "1.1.9",
  "description": "An another Node binding of llama.cpp",
  "main": "lib/index.js",
  "scripts": {
@@ -71,19 +71,19 @@
  "CMakeLists.txt"
  ],
  "optionalDependencies": {
- "@fugood/node-llama-linux-x64": "1.1.8",
- "@fugood/node-llama-linux-x64-vulkan": "1.1.8",
- "@fugood/node-llama-linux-x64-cuda": "1.1.8",
- "@fugood/node-llama-linux-arm64": "1.1.8",
- "@fugood/node-llama-linux-arm64-vulkan": "1.1.8",
- "@fugood/node-llama-linux-arm64-cuda": "1.1.8",
- "@fugood/node-llama-win32-x64": "1.1.8",
- "@fugood/node-llama-win32-x64-vulkan": "1.1.8",
- "@fugood/node-llama-win32-x64-cuda": "1.1.8",
- "@fugood/node-llama-win32-arm64": "1.1.8",
- "@fugood/node-llama-win32-arm64-vulkan": "1.1.8",
- "@fugood/node-llama-darwin-x64": "1.1.8",
- "@fugood/node-llama-darwin-arm64": "1.1.8"
+ "@fugood/node-llama-linux-x64": "1.1.9",
+ "@fugood/node-llama-linux-x64-vulkan": "1.1.9",
+ "@fugood/node-llama-linux-x64-cuda": "1.1.9",
+ "@fugood/node-llama-linux-arm64": "1.1.9",
+ "@fugood/node-llama-linux-arm64-vulkan": "1.1.9",
+ "@fugood/node-llama-linux-arm64-cuda": "1.1.9",
+ "@fugood/node-llama-win32-x64": "1.1.9",
+ "@fugood/node-llama-win32-x64-vulkan": "1.1.9",
+ "@fugood/node-llama-win32-x64-cuda": "1.1.9",
+ "@fugood/node-llama-win32-arm64": "1.1.9",
+ "@fugood/node-llama-win32-arm64-vulkan": "1.1.9",
+ "@fugood/node-llama-darwin-x64": "1.1.9",
+ "@fugood/node-llama-darwin-arm64": "1.1.9"
  },
  "devDependencies": {
  "@babel/preset-env": "^7.24.4",
package/src/LlamaContext.cpp CHANGED
@@ -15,6 +15,7 @@
  #include "llama-impl.h"

  #include <atomic>
+ #include <list>
  #include <mutex>
  #include <queue>

@@ -258,6 +259,16 @@ LlamaContext::LlamaContext(const Napi::CallbackInfo &info)
  params.numa =
      static_cast<ggml_numa_strategy>(get_option<uint32_t>(options, "numa", 0));

+ int n_cpu_moe = get_option<int32_t>(options, "n_cpu_moe", 0);
+ if (n_cpu_moe > 0) {
+   static std::list<std::string> buft_overrides;
+   for (int i = 0; i < n_cpu_moe; ++i) {
+     buft_overrides.push_back(string_format("blk\\.%d\\.ffn_(up|down|gate)_exps", i));
+     params.tensor_buft_overrides.push_back({buft_overrides.back().c_str(), ggml_backend_cpu_buffer_type()});
+   }
+   params.tensor_buft_overrides.push_back({nullptr, nullptr});
+ }
+
  llama_backend_init();
  llama_numa_init(params.numa);
package/src/llama.cpp/common/arg.cpp CHANGED
@@ -1532,7 +1532,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY}).set_env("LLAMA_ARG_NO_CONTEXT_SHIFT"));
  add_opt(common_arg(
  {"--context-shift"},
- string_format("enables context shift on infinite text generation (default: %s)", params.ctx_shift ? "disabled" : "enabled"),
+ string_format("enables context shift on infinite text generation (default: %s)", params.ctx_shift ? "enabled" : "disabled"),
  [](common_params & params) {
  params.ctx_shift = true;
  }
@@ -1755,7 +1755,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  [](common_params & params) {
  params.warmup = false;
  }
- ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL}));
+ ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_PERPLEXITY}));
  add_opt(common_arg(
  {"--spm-infill"},
  string_format(
@@ -2254,9 +2254,11 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
  add_opt(common_arg(
  {"-dt", "--defrag-thold"}, "N",
- string_format("KV cache defragmentation threshold (default: %.1f, < 0 - disabled)", (double)params.defrag_thold),
+ string_format("KV cache defragmentation threshold (DEPRECATED)"),
  [](common_params & params, const std::string & value) {
-   params.defrag_thold = std::stof(value);
+   GGML_UNUSED(params);
+   GGML_UNUSED(value);
+   LOG_WRN("DEPRECATED: --defrag-thold is deprecated and no longer necessary to specify\n");
  }
  ).set_env("LLAMA_ARG_DEFRAG_THOLD"));
  add_opt(common_arg(
package/src/llama.cpp/common/chat.cpp CHANGED
@@ -134,6 +134,7 @@ struct templates_params {
  json extra_context;
  bool add_bos;
  bool add_eos;
+ bool is_inference = true;
  };

  common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::string & tool_choice) {
@@ -1323,6 +1324,17 @@ static common_chat_params common_chat_params_init_gpt_oss(const common_chat_temp
  common_chat_params data;
  auto prompt = apply(tmpl, inputs);

+ // Check if we need to replace the return token with end token during
+ // inference and without generation prompt. For more details see:
+ // https://github.com/ggml-org/llama.cpp/issues/15417
+ if (inputs.is_inference && !inputs.add_generation_prompt) {
+   static constexpr std::string_view return_token = "<|return|>";
+   static constexpr std::string_view end_token = "<|end|>";
+   if (size_t pos = prompt.rfind(return_token); pos != std::string::npos) {
+     prompt.replace(pos, return_token.length(), end_token);
+   }
+ }
+
  data.prompt = prompt;
  data.format = COMMON_CHAT_FORMAT_GPT_OSS;

@@ -1336,6 +1348,26 @@ static common_chat_params common_chat_params_init_gpt_oss(const common_chat_temp
  "<|end|>",
  };

+ if (!inputs.json_schema.is_null()) {
+   data.grammar_lazy = false;
+   data.grammar = build_grammar([&](const common_grammar_builder & builder) {
+     auto schema = inputs.json_schema;
+     builder.resolve_refs(schema);
+
+     auto not_end = builder.add_rule("not-end",
+       "[^<] | \"<\" [^|] | \"<|\" [^e] | \"<|e\" [^n] | \"<|en\" [^d] | \"<|end\" [^|] | \"<|end|\" [^>]");
+     auto analysis = builder.add_rule("analysis",
+       "\"<|channel|>analysis<|message|>\" ( " + not_end + " )* \"<|end|>\"");
+     auto constraint = builder.add_rule("constraint", "\"<|constrain|>\"? [a-zA-Z0-9_-]+");
+     auto final = builder.add_rule("final",
+       "\"<|channel|>final\" ( \" \" " + constraint + " )? \"<|message|>\" " +
+       builder.add_schema("response", schema)
+     );
+
+     builder.add_rule("root", "( " + analysis + " \"<|start|>assistant\" )? " + final);
+   });
+ }
+
  if (inputs.tools.is_array() && !inputs.tools.empty()) {
  data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
  data.grammar = build_grammar([&](const common_grammar_builder & builder) {
@@ -2096,7 +2128,7 @@ static common_chat_params common_chat_templates_apply_jinja(
  }

  // GPT-OSS
- if (src.find("<|channel|>") != std::string::npos && params.json_schema.is_null()) {
+ if (src.find("<|channel|>") != std::string::npos) {
  return common_chat_params_init_gpt_oss(tmpl, params);
  }

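With the template-selection change above, GPT-OSS templates now go through `common_chat_params_init_gpt_oss` even when a `json_schema` is supplied, and the new grammar constrains the final channel to that schema. A hedged sketch of requesting structured output through the binding, assuming an OpenAI-style `response_format` shape; the exact `CompletionResponseFormat` fields are not shown in this diff:

// Illustrative only: `context` is a loaded GPT-OSS model context, and the
// response_format shape below is an assumption about CompletionResponseFormat.
const result = await context.completion({
  messages: [{ role: 'user', content: 'Name three primary colors as JSON.' }],
  response_format: {
    type: 'json_schema',
    json_schema: {
      schema: {
        type: 'object',
        properties: { colors: { type: 'array', items: { type: 'string' } } },
        required: ['colors'],
      },
    },
  },
})
// result.text should now be constrained by the generated GBNF grammar.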
package/src/llama.cpp/common/common.cpp CHANGED
@@ -558,13 +558,6 @@ std::string string_from(const struct llama_context * ctx, const std::vector<llam

  auto detokenized = common_token_to_piece(ctx, token);

- detokenized.erase(
-   std::remove_if(
-     detokenized.begin(),
-     detokenized.end(),
-     [](const unsigned char c) { return !std::isprint(c); }),
-   detokenized.end());
-
  buf << "'" << detokenized << "'"
      << ":" << std::to_string(token);
  }
@@ -589,13 +582,6 @@ std::string string_from(const struct llama_context * ctx, const struct llama_bat

  auto detokenized = common_token_to_piece(ctx, batch.token[i]);

- detokenized.erase(
-   std::remove_if(
-     detokenized.begin(),
-     detokenized.end(),
-     [](const unsigned char c) { return !std::isprint(c); }),
-   detokenized.end());
-
  buf << "\n" << std::to_string(i)
      << ", token '" << detokenized << "'"
      << ", pos " << std::to_string(batch.pos[i])
@@ -1167,7 +1153,6 @@ struct llama_context_params common_context_params_to_llama(const common_params &
  cparams.yarn_orig_ctx = params.yarn_orig_ctx;
  cparams.pooling_type = params.pooling_type;
  cparams.attention_type = params.attention_type;
- cparams.defrag_thold = params.defrag_thold;
  cparams.cb_eval = params.cb_eval;
  cparams.cb_eval_user_data = params.cb_eval_user_data;
  cparams.offload_kqv = !params.no_kv_offload;
package/src/llama.cpp/common/common.h CHANGED
@@ -289,7 +289,6 @@ struct common_params {
  float yarn_beta_fast = 32.0f; // YaRN low correction dim
  float yarn_beta_slow = 1.0f; // YaRN high correction dim
  int32_t yarn_orig_ctx = 0; // YaRN original context length
- float defrag_thold = 0.1f; // KV cache defragmentation threshold

  // offload params
  std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
@@ -376,7 +375,7 @@ struct common_params {
  bool cont_batching = true; // insert new sequences for decoding on-the-fly
  bool flash_attn = false; // flash attention
  bool no_perf = false; // disable performance metrics
- bool ctx_shift = false; // context shift on inifinite text generation
+ bool ctx_shift = false; // context shift on infinite text generation
  bool swa_full = false; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
  bool kv_unified = false; // enable unified KV cache

package/src/llama.cpp/ggml/CMakeLists.txt CHANGED
@@ -158,7 +158,6 @@ option(GGML_CUDA "ggml: use CUDA"
  option(GGML_MUSA "ggml: use MUSA" OFF)
  option(GGML_CUDA_FORCE_MMQ "ggml: use mmq kernels instead of cuBLAS" OFF)
  option(GGML_CUDA_FORCE_CUBLAS "ggml: always use cuBLAS instead of mmq kernels" OFF)
- option(GGML_CUDA_F16 "ggml: use 16 bit floats for some calculations" OFF)
  set (GGML_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING
      "ggml: max. batch size for using peer access")
  option(GGML_CUDA_NO_PEER_COPY "ggml: do not use peer to peer copies" OFF)
package/src/llama.cpp/ggml/include/ggml.h CHANGED
@@ -244,6 +244,13 @@
  #define GGML_MROPE_SECTIONS 4

  #define GGML_UNUSED(x) (void)(x)
+ #ifdef __CUDACC__
+ template<typename... Args>
+ __host__ __device__ constexpr inline void ggml_unused_vars_impl(Args&&...) noexcept {}
+ #define GGML_UNUSED_VARS(...) ggml_unused_vars_impl(__VA_ARGS__)
+ #else
+ #define GGML_UNUSED_VARS(...) do { (void)sizeof((__VA_ARGS__, 0)); } while(0)
+ #endif // __CUDACC__

  #define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1))

@@ -505,6 +512,7 @@ extern "C" {
  GGML_OP_IM2COL,
  GGML_OP_IM2COL_BACK,
  GGML_OP_CONV_2D,
+ GGML_OP_CONV_3D,
  GGML_OP_CONV_2D_DW,
  GGML_OP_CONV_TRANSPOSE_2D,
  GGML_OP_POOL_1D,
@@ -1933,6 +1941,23 @@ extern "C" {
  int d0, // dilation dimension 0
  int d1); // dilation dimension 1

+ GGML_API struct ggml_tensor * ggml_conv_3d(
+   struct ggml_context * ctx,
+   struct ggml_tensor * a, // kernel [KW, KH, KD, IC * OC]
+   struct ggml_tensor * b, // input [W, H, D, C * N]
+   int s0, // stride
+   int s1,
+   int s2,
+   int p0, // padding
+   int p1,
+   int p2,
+   int d0, // dilation
+   int d1,
+   int d2,
+   int n_channels,
+   int n_batch,
+   int n_channels_out);
+
  enum ggml_op_pool {
  GGML_OP_POOL_MAX,
  GGML_OP_POOL_AVG,