@fugood/llama.node 1.1.5 → 1.1.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. package/lib/binding.ts +4 -0
  2. package/lib/index.js +6 -1
  3. package/lib/index.ts +6 -0
  4. package/lib/version.js +5 -0
  5. package/lib/version.ts +2 -0
  6. package/package.json +14 -14
  7. package/scripts/llama.cpp.patch +19 -15
  8. package/src/LlamaCompletionWorker.cpp +73 -18
  9. package/src/LlamaCompletionWorker.h +8 -0
  10. package/src/llama.cpp/CMakeLists.txt +2 -0
  11. package/src/llama.cpp/common/arg.cpp +147 -46
  12. package/src/llama.cpp/common/chat-parser.cpp +9 -1
  13. package/src/llama.cpp/common/chat.cpp +350 -3
  14. package/src/llama.cpp/common/chat.h +11 -3
  15. package/src/llama.cpp/common/common.cpp +54 -0
  16. package/src/llama.cpp/common/common.h +44 -9
  17. package/src/llama.cpp/ggml/CMakeLists.txt +5 -2
  18. package/src/llama.cpp/ggml/include/ggml-opt.h +25 -6
  19. package/src/llama.cpp/ggml/include/ggml-zdnn.h +16 -0
  20. package/src/llama.cpp/ggml/include/ggml.h +65 -3
  21. package/src/llama.cpp/ggml/src/CMakeLists.txt +13 -1
  22. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +1 -1
  23. package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +61 -0
  24. package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +96 -8
  25. package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +1136 -1077
  26. package/src/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +20 -0
  27. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +20 -1
  28. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +21 -24
  29. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +16 -7
  30. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +270 -11
  31. package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +3 -8
  32. package/src/llama.cpp/ggml/src/ggml-cpu/quants.c +35 -0
  33. package/src/llama.cpp/ggml/src/ggml-cpu/quants.h +8 -0
  34. package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +200 -51
  35. package/src/llama.cpp/ggml/src/ggml-cpu/repack.h +11 -0
  36. package/src/llama.cpp/ggml/src/ggml-cpu/traits.cpp +2 -2
  37. package/src/llama.cpp/ggml/src/ggml-cpu/traits.h +1 -1
  38. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +19 -4
  39. package/src/llama.cpp/include/llama.h +26 -0
  40. package/src/llama.cpp/src/llama-arch.cpp +65 -0
  41. package/src/llama.cpp/src/llama-arch.h +10 -0
  42. package/src/llama.cpp/src/llama-batch.cpp +1 -1
  43. package/src/llama.cpp/src/llama-chat.cpp +15 -4
  44. package/src/llama.cpp/src/llama-chat.h +1 -0
  45. package/src/llama.cpp/src/llama-context.cpp +37 -25
  46. package/src/llama.cpp/src/llama-context.h +6 -5
  47. package/src/llama.cpp/src/llama-graph.cpp +118 -9
  48. package/src/llama.cpp/src/llama-graph.h +38 -0
  49. package/src/llama.cpp/src/llama-hparams.h +5 -3
  50. package/src/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +12 -6
  51. package/src/llama.cpp/src/llama-kv-cache-unified-iswa.h +2 -2
  52. package/src/llama.cpp/src/llama-kv-cache-unified.cpp +93 -69
  53. package/src/llama.cpp/src/llama-kv-cache-unified.h +2 -2
  54. package/src/llama.cpp/src/llama-memory-hybrid.cpp +6 -2
  55. package/src/llama.cpp/src/llama-memory-hybrid.h +2 -2
  56. package/src/llama.cpp/src/llama-memory-recurrent.cpp +6 -2
  57. package/src/llama.cpp/src/llama-memory-recurrent.h +2 -2
  58. package/src/llama.cpp/src/llama-memory.h +2 -2
  59. package/src/llama.cpp/src/llama-model-loader.cpp +1 -0
  60. package/src/llama.cpp/src/llama-model-loader.h +3 -2
  61. package/src/llama.cpp/src/llama-model.cpp +500 -4
  62. package/src/llama.cpp/src/llama-model.h +25 -4
  63. package/src/llama.cpp/src/llama-quant.cpp +37 -1
  64. package/src/llama.cpp/src/llama-vocab.cpp +43 -0
package/lib/binding.ts CHANGED
@@ -167,6 +167,10 @@ export type LlamaCompletionResult = {
 
  export type LlamaCompletionToken = {
  token: string
+ content?: string
+ reasoning_content?: string
+ tool_calls?: ToolCall[]
+ accumulated_text?: string
  }
 
  export type TokenizeResult = {
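
The four optional fields added to `LlamaCompletionToken` surface the partially parsed chat output on every streamed token. Below is a minimal TypeScript sketch of a consumer-side handler for this shape; the handler name and the local `ToolCall` type are illustrative assumptions, only the field names come from the type above.

```ts
// Illustrative per-token handler; only the field names are taken from the
// LlamaCompletionToken type added in binding.ts.
type ToolCall = {
  type: 'function'
  function: { name: string; arguments: string }
  id?: string
}

type CompletionToken = {
  token: string
  content?: string            // parsed assistant text so far
  reasoning_content?: string  // parsed reasoning ("thinking") text so far
  tool_calls?: ToolCall[]     // tool calls parsed from the partial output
  accumulated_text?: string   // raw generated text accumulated so far
}

function handleToken(data: CompletionToken): void {
  if (data.reasoning_content) process.stderr.write(data.reasoning_content)
  if (data.content) process.stdout.write(data.content)
  if (data.tool_calls?.length) console.log('\npartial tool calls:', data.tool_calls)
}
```
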
package/lib/index.js CHANGED
@@ -23,9 +23,10 @@ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, ge
  });
  };
  Object.defineProperty(exports, "__esModule", { value: true });
- exports.loadLlamaModelInfo = exports.initLlama = exports.loadModel = exports.toggleNativeLog = exports.MTMD_DEFAULT_MEDIA_MARKER = void 0;
+ exports.BuildInfo = exports.loadLlamaModelInfo = exports.initLlama = exports.loadModel = exports.toggleNativeLog = exports.MTMD_DEFAULT_MEDIA_MARKER = void 0;
  exports.addNativeLogListener = addNativeLogListener;
  const binding_1 = require("./binding");
+ const version_1 = require("./version");
  __exportStar(require("./binding"), exports);
  exports.MTMD_DEFAULT_MEDIA_MARKER = '<__media__>';
  const mods = {};
@@ -259,3 +260,7 @@ const loadLlamaModelInfo = (path) => __awaiter(void 0, void 0, void 0, function*
  return mods[variant].LlamaContext.loadModelInfo(path, modelInfoSkip);
  });
  exports.loadLlamaModelInfo = loadLlamaModelInfo;
+ exports.BuildInfo = {
+ number: version_1.BUILD_NUMBER,
+ commit: version_1.BUILD_COMMIT,
+ };
package/lib/index.ts CHANGED
@@ -17,6 +17,7 @@ import type {
  Tool,
  GGUFModelInfo,
  } from './binding'
+ import { BUILD_NUMBER, BUILD_COMMIT } from './version'
 
  export * from './binding'
 
@@ -353,3 +354,8 @@ export const loadLlamaModelInfo = async (path: string): Promise<GGUFModelInfo> =
  refreshNativeLogSetup()
  return mods[variant].LlamaContext.loadModelInfo(path, modelInfoSkip)
  }
+
+ export const BuildInfo = {
+ number: BUILD_NUMBER,
+ commit: BUILD_COMMIT,
+ }
package/lib/version.js ADDED
@@ -0,0 +1,5 @@
+ "use strict";
+ Object.defineProperty(exports, "__esModule", { value: true });
+ exports.BUILD_COMMIT = exports.BUILD_NUMBER = void 0;
+ exports.BUILD_NUMBER = '6096';
+ exports.BUILD_COMMIT = 'fd1234cb';
package/lib/version.ts ADDED
@@ -0,0 +1,2 @@
+ export const BUILD_NUMBER = '6096';
+ export const BUILD_COMMIT = 'fd1234cb';
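
Together with the `BuildInfo` export added to `lib/index.ts` and `lib/index.js`, these constants expose the bundled llama.cpp build metadata to consumers. A small usage sketch (the log wording is illustrative):

```ts
import { BuildInfo } from '@fugood/llama.node'

// Prints the upstream llama.cpp build the native addon was built from,
// e.g. "llama.cpp build 6096 (fd1234cb)".
console.log(`llama.cpp build ${BuildInfo.number} (${BuildInfo.commit})`)
```
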
package/package.json CHANGED
@@ -1,7 +1,7 @@
  {
  "name": "@fugood/llama.node",
  "access": "public",
- "version": "1.1.5",
+ "version": "1.1.7",
  "description": "An another Node binding of llama.cpp",
  "main": "lib/index.js",
  "scripts": {
@@ -71,19 +71,19 @@
  "CMakeLists.txt"
  ],
  "optionalDependencies": {
- "@fugood/node-llama-linux-x64": "1.1.5",
- "@fugood/node-llama-linux-x64-vulkan": "1.1.5",
- "@fugood/node-llama-linux-x64-cuda": "1.1.5",
- "@fugood/node-llama-linux-arm64": "1.1.5",
- "@fugood/node-llama-linux-arm64-vulkan": "1.1.5",
- "@fugood/node-llama-linux-arm64-cuda": "1.1.5",
- "@fugood/node-llama-win32-x64": "1.1.5",
- "@fugood/node-llama-win32-x64-vulkan": "1.1.5",
- "@fugood/node-llama-win32-x64-cuda": "1.1.5",
- "@fugood/node-llama-win32-arm64": "1.1.5",
- "@fugood/node-llama-win32-arm64-vulkan": "1.1.5",
- "@fugood/node-llama-darwin-x64": "1.1.5",
- "@fugood/node-llama-darwin-arm64": "1.1.5"
+ "@fugood/node-llama-linux-x64": "1.1.7",
+ "@fugood/node-llama-linux-x64-vulkan": "1.1.7",
+ "@fugood/node-llama-linux-x64-cuda": "1.1.7",
+ "@fugood/node-llama-linux-arm64": "1.1.7",
+ "@fugood/node-llama-linux-arm64-vulkan": "1.1.7",
+ "@fugood/node-llama-linux-arm64-cuda": "1.1.7",
+ "@fugood/node-llama-win32-x64": "1.1.7",
+ "@fugood/node-llama-win32-x64-vulkan": "1.1.7",
+ "@fugood/node-llama-win32-x64-cuda": "1.1.7",
+ "@fugood/node-llama-win32-arm64": "1.1.7",
+ "@fugood/node-llama-win32-arm64-vulkan": "1.1.7",
+ "@fugood/node-llama-darwin-x64": "1.1.7",
+ "@fugood/node-llama-darwin-arm64": "1.1.7"
  },
  "devDependencies": {
  "@babel/preset-env": "^7.24.4",
package/scripts/llama.cpp.patch CHANGED
@@ -1,5 +1,5 @@
  diff --git a/src/llama.cpp/common/chat.cpp b/src/llama.cpp/common/chat.cpp
- index 114dbfcc..6771bd43 100644
+ index 23d3828f9..ca48af00c 100644
  --- a/src/llama.cpp/common/chat.cpp
  +++ b/src/llama.cpp/common/chat.cpp
  @@ -6,9 +6,6 @@
@@ -12,13 +12,15 @@ index 114dbfcc..6771bd43 100644
  #include <cstdio>
  #include <exception>
  #include <iostream>
- @@ -123,14 +120,6 @@ std::vector<common_chat_msg_diff> common_chat_msg_diff::compute_diffs(const comm
+ @@ -123,16 +120,6 @@ std::vector<common_chat_msg_diff> common_chat_msg_diff::compute_diffs(const comm
  return diffs;
  }
 
  -typedef minja::chat_template common_chat_template;
  -
  -struct common_chat_templates {
+ - bool add_bos;
+ - bool add_eos;
  - bool has_explicit_template; // Model had builtin template or template overridde was specified.
  - std::unique_ptr<common_chat_template> template_default; // always set (defaults to chatml)
  - std::unique_ptr<common_chat_template> template_tool_use;
@@ -27,21 +29,23 @@ index 114dbfcc..6771bd43 100644
  struct templates_params {
  json messages;
  json tools;
- diff --git a/common/chat.h b/common/chat.h
- index ca807c14..56649863 100644
+ diff --git a/src/llama.cpp/common/chat.h b/src/llama.cpp/common/chat.h
+ index d1e480c91..437e64e29 100644
  --- a/src/llama.cpp/common/chat.h
  +++ b/src/llama.cpp/common/chat.h
- @@ -9,7 +9,16 @@
+ @@ -9,7 +9,18 @@
  #include <vector>
  #include <map>
 
  -struct common_chat_templates;
- +#include <minja/chat-template.hpp>
- +#include <minja/minja.hpp>
+ +#include "minja/chat-template.hpp"
+ +#include "minja/minja.hpp"
  +
  +typedef minja::chat_template common_chat_template;
  +
  +struct common_chat_templates {
+ + bool add_bos;
+ + bool add_eos;
  + bool has_explicit_template; // Model had builtin template or template overridde was specified.
  + std::unique_ptr<common_chat_template> template_default; // always set (defaults to chatml)
  + std::unique_ptr<common_chat_template> template_tool_use;
@@ -50,10 +54,10 @@ index ca807c14..56649863 100644
  struct common_chat_tool_call {
  std::string name;
  diff --git a/src/llama.cpp/common/common.cpp b/src/llama.cpp/common/common.cpp
- index e4e71ad1..091ddda4 100644
+ index 67dd5404f..909a97c66 100644
  --- a/src/llama.cpp/common/common.cpp
  +++ b/src/llama.cpp/common/common.cpp
- @@ -1101,6 +1101,7 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
+ @@ -1117,6 +1117,7 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
  mparams.n_gpu_layers = params.n_gpu_layers;
  }
 
@@ -62,11 +66,11 @@ index e4e71ad1..091ddda4 100644
  mparams.split_mode = params.split_mode;
  mparams.tensor_split = params.tensor_split;
  diff --git a/src/llama.cpp/common/common.h b/src/llama.cpp/common/common.h
- index 8922090e..3c2d1a6a 100644
+ index 75596e6b3..0e04694c8 100644
  --- a/src/llama.cpp/common/common.h
  +++ b/src/llama.cpp/common/common.h
- @@ -224,6 +224,7 @@ enum common_reasoning_format {
- };
+ @@ -267,6 +267,7 @@ struct lr_opt {
+ struct ggml_opt_optimizer_params common_opt_lr_pars(void * userdata);
 
  struct common_params {
  + bool vocab_only = false;
@@ -74,10 +78,10 @@ index 8922090e..3c2d1a6a 100644
  int32_t n_ctx = 4096; // context size
  int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
  diff --git a/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt b/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt
- index 671fad4d..93fc3cd7 100644
+ index ce0a3e128..df9300224 100644
  --- a/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt
  +++ b/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt
- @@ -104,7 +104,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
+ @@ -106,7 +106,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
  )
 
  if (MSVC AND NOT CMAKE_C_COMPILER_ID STREQUAL "Clang")
@@ -87,7 +91,7 @@ index 671fad4d..93fc3cd7 100644
  check_cxx_compiler_flag(-mfp16-format=ieee GGML_COMPILER_SUPPORTS_FP16_FORMAT_I3E)
  if (NOT "${GGML_COMPILER_SUPPORTS_FP16_FORMAT_I3E}" STREQUAL "")
  diff --git a/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt b/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt
- index b97e7bf9..c3eb9519 100644
+ index b97e7bf99..c3eb9519f 100644
  --- a/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt
  +++ b/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt
  @@ -111,7 +111,7 @@ if (Vulkan_FOUND)
package/src/LlamaCompletionWorker.cpp CHANGED
@@ -55,6 +55,32 @@ LlamaCompletionWorker::~LlamaCompletionWorker() {
  }
  }
 
+ LlamaCompletionWorker::PartialOutput LlamaCompletionWorker::getPartialOutput(const std::string &generated_text) {
+ PartialOutput result;
+
+ try {
+ common_chat_syntax chat_syntax;
+ chat_syntax.format = static_cast<common_chat_format>(_chat_format);
+ chat_syntax.thinking_forced_open = _thinking_forced_open;
+
+ // Set reasoning format using the common function
+ chat_syntax.reasoning_format = common_reasoning_format_from_name(_reasoning_format);
+
+ chat_syntax.parse_tool_calls = true;
+
+ // Use is_partial=true for streaming partial output
+ common_chat_msg parsed_msg = common_chat_parse(generated_text, true, chat_syntax);
+
+ result.content = parsed_msg.content;
+ result.reasoning_content = parsed_msg.reasoning_content;
+ result.tool_calls = parsed_msg.tool_calls;
+ } catch (const std::exception &e) {
+ // If parsing fails, leave content empty - this is expected for partial content
+ }
+
+ return result;
+ }
+
  void LlamaCompletionWorker::Execute() {
  _sess->get_mutex().lock();
  const auto t_main_start = ggml_time_us();
@@ -222,6 +248,13 @@ void LlamaCompletionWorker::Execute() {
 
  // sample the next token
  llama_token new_token_id = common_sampler_sample(sampling.get(), ctx, -1);
+
+ // is it an end of generation?
+ if (llama_vocab_is_eog(vocab, new_token_id)) {
+ _result.stopped_eos = true;
+ break;
+ }
+
  if (_next_token_uses_guide_token && !_guide_tokens.empty() &&
  !llama_vocab_is_control(vocab, new_token_id) &&
  !llama_vocab_is_eog(vocab, new_token_id)) {
@@ -250,21 +283,49 @@ void LlamaCompletionWorker::Execute() {
  if (_has_callback) {
  // TODO: When we got possible stop words (startsWith)
  // we should avoid calling the callback, wait for the next token
- const char *c_token = strdup(token.c_str());
- _tsfn.BlockingCall(c_token, [](Napi::Env env, Napi::Function jsCallback,
- const char *value) {
+ struct TokenData {
+ std::string token;
+ std::string content;
+ std::string reasoning_content;
+ std::vector<common_chat_tool_call> tool_calls;
+ std::string accumulated_text;
+ };
+
+ auto partial = getPartialOutput(_result.text);
+ TokenData *token_data = new TokenData{token, partial.content, partial.reasoning_content, partial.tool_calls, _result.text};
+
+ _tsfn.BlockingCall(token_data, [](Napi::Env env, Napi::Function jsCallback,
+ TokenData *data) {
  auto obj = Napi::Object::New(env);
- obj.Set("token", Napi::String::New(env, value));
- delete value;
+ obj.Set("token", Napi::String::New(env, data->token));
+ if (!data->content.empty()) {
+ obj.Set("content", Napi::String::New(env, data->content));
+ }
+ if (!data->reasoning_content.empty()) {
+ obj.Set("reasoning_content", Napi::String::New(env, data->reasoning_content));
+ }
+ if (!data->tool_calls.empty()) {
+ Napi::Array tool_calls = Napi::Array::New(env);
+ for (size_t i = 0; i < data->tool_calls.size(); i++) {
+ const auto &tc = data->tool_calls[i];
+ Napi::Object tool_call = Napi::Object::New(env);
+ tool_call.Set("type", "function");
+ Napi::Object function = Napi::Object::New(env);
+ function.Set("name", tc.name);
+ function.Set("arguments", tc.arguments);
+ tool_call.Set("function", function);
+ if (!tc.id.empty()) {
+ tool_call.Set("id", tc.id);
+ }
+ tool_calls.Set(i, tool_call);
+ }
+ obj.Set("tool_calls", tool_calls);
+ }
+ obj.Set("accumulated_text", Napi::String::New(env, data->accumulated_text));
+ delete data;
  jsCallback.Call({obj});
  });
  }
- // is it an end of generation?
- if (llama_vocab_is_eog(vocab, new_token_id)) {
- _result.stopped_eos = true;
- // TODO: EOS token should be cut
- break;
- }
  // check for stop words
  if (!_stop_words.empty()) {
  const size_t stop_pos =
@@ -316,13 +377,7 @@ void LlamaCompletionWorker::OnOK() {
 
  chat_syntax.thinking_forced_open = _thinking_forced_open;
 
- if (_reasoning_format == "deepseek") {
- chat_syntax.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
- } else if (_reasoning_format == "deepseek-legacy") {
- chat_syntax.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY;
- } else {
- chat_syntax.reasoning_format = COMMON_REASONING_FORMAT_NONE;
- }
+ chat_syntax.reasoning_format = common_reasoning_format_from_name(_reasoning_format);
  common_chat_msg message = common_chat_parse(
  _result.text,
  false,
package/src/LlamaCompletionWorker.h CHANGED
@@ -42,6 +42,14 @@ protected:
  void OnError(const Napi::Error &err) override;
 
  private:
+ struct PartialOutput {
+ std::string content = "";
+ std::string reasoning_content = "";
+ std::vector<common_chat_tool_call> tool_calls;
+ };
+
+ PartialOutput getPartialOutput(const std::string &generated_text);
+
  LlamaSessionPtr _sess;
  common_params _params;
  std::vector<std::string> _stop_words;
package/src/llama.cpp/CMakeLists.txt CHANGED
@@ -12,6 +12,8 @@ if (NOT XCODE AND NOT MSVC AND NOT CMAKE_BUILD_TYPE)
  set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo")
  endif()
 
+ message("CMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}")
+
  # Add path to modules
  list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/")
 
package/src/llama.cpp/common/arg.cpp CHANGED
@@ -24,6 +24,7 @@
  #include <cstdarg>
  #include <filesystem>
  #include <fstream>
+ #include <list>
  #include <regex>
  #include <set>
  #include <string>
@@ -748,6 +749,39 @@ std::pair<long, std::vector<char>> common_remote_get_content(const std::string &
  // utils
  //
 
+ // Helper function to parse tensor buffer override strings
+ static void parse_tensor_buffer_overrides(const std::string & value, std::vector<llama_model_tensor_buft_override> & overrides) {
+ std::map<std::string, ggml_backend_buffer_type_t> buft_list;
+ for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
+ auto * dev = ggml_backend_dev_get(i);
+ auto * buft = ggml_backend_dev_buffer_type(dev);
+ if (buft) {
+ buft_list[ggml_backend_buft_name(buft)] = buft;
+ }
+ }
+
+ for (const auto & override : string_split<std::string>(value, ',')) {
+ std::string::size_type pos = override.find('=');
+ if (pos == std::string::npos) {
+ throw std::invalid_argument("invalid value");
+ }
+ std::string tensor_name = override.substr(0, pos);
+ std::string buffer_type = override.substr(pos + 1);
+
+ if (buft_list.find(buffer_type) == buft_list.end()) {
+ printf("Available buffer types:\n");
+ for (const auto & it : buft_list) {
+ printf(" %s\n", ggml_backend_buft_name(it.second));
+ }
+ throw std::invalid_argument("unknown buffer type");
+ }
+ // keep strings alive and avoid leaking memory by storing them in a static vector
+ static std::list<std::string> buft_overrides;
+ buft_overrides.push_back(tensor_name);
+ overrides.push_back({buft_overrides.back().c_str(), buft_list.at(buffer_type)});
+ }
+ }
+
  struct handle_model_result {
  bool found_mmproj = false;
  common_params_model mmproj;
@@ -992,6 +1026,10 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
  params.tensor_buft_overrides.push_back({nullptr, nullptr});
  }
 
+ if (!params.speculative.tensor_buft_overrides.empty()) {
+ params.speculative.tensor_buft_overrides.push_back({nullptr, nullptr});
+ }
+
  if (!params.chat_template.empty() && !common_chat_verify_template(params.chat_template, params.use_jinja)) {
  throw std::runtime_error(string_format(
  "error: the supplied chat template is not supported: %s%s\n",
@@ -1200,6 +1238,7 @@ bool common_params_parse(int argc, char ** argv, common_params & params, llama_e
  common_params_print_completion(ctx_arg);
  exit(0);
  }
+ params.lr.init();
  } catch (const std::invalid_argument & ex) {
  fprintf(stderr, "%s\n", ex.what());
  ctx_arg.params = params_org;
@@ -1468,6 +1507,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  params.swa_full = true;
  }
  ).set_env("LLAMA_ARG_SWA_FULL"));
+ add_opt(common_arg(
+ {"--swa-checkpoints"}, "N",
+ string_format("max number of SWA checkpoints per slot to create (default: %d)\n"
+ "[(more info)](https://github.com/ggml-org/llama.cpp/pull/15293)", params.n_swa_checkpoints),
+ [](common_params & params, int value) {
+ params.n_swa_checkpoints = value;
+ }
+ ).set_env("LLAMA_ARG_SWA_CHECKPOINTS").set_examples({LLAMA_EXAMPLE_SERVER}));
  add_opt(common_arg(
  {"--kv-unified", "-kvu"},
  string_format("use single unified KV buffer for the KV cache of all sequences (default: %s)\n"
@@ -2348,47 +2395,58 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  add_opt(common_arg(
  {"--override-tensor", "-ot"}, "<tensor name pattern>=<buffer type>,...",
  "override tensor buffer type", [](common_params & params, const std::string & value) {
- /* static */ std::map<std::string, ggml_backend_buffer_type_t> buft_list;
- if (buft_list.empty()) {
- // enumerate all the devices and add their buffer types to the list
- for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
- auto * dev = ggml_backend_dev_get(i);
- auto * buft = ggml_backend_dev_buffer_type(dev);
- if (buft) {
- buft_list[ggml_backend_buft_name(buft)] = buft;
- }
- }
- }
-
- for (const auto & override : string_split<std::string>(value, ',')) {
- std::string::size_type pos = override.find('=');
- if (pos == std::string::npos) {
- throw std::invalid_argument("invalid value");
- }
- std::string tensor_name = override.substr(0, pos);
- std::string buffer_type = override.substr(pos + 1);
-
- if (buft_list.find(buffer_type) == buft_list.end()) {
- printf("Available buffer types:\n");
- for (const auto & it : buft_list) {
- printf(" %s\n", ggml_backend_buft_name(it.second));
- }
- throw std::invalid_argument("unknown buffer type");
- }
- // FIXME: this leaks memory
- params.tensor_buft_overrides.push_back({strdup(tensor_name.c_str()), buft_list.at(buffer_type)});
- }
+ parse_tensor_buffer_overrides(value, params.tensor_buft_overrides);
  }
  ));
  add_opt(common_arg(
- {"--cpu-moe"},
- "use CPU for Mixture of Experts (MoE) weights",
+ {"--override-tensor-draft", "-otd"}, "<tensor name pattern>=<buffer type>,...",
+ "override tensor buffer type for draft model", [](common_params & params, const std::string & value) {
+ parse_tensor_buffer_overrides(value, params.speculative.tensor_buft_overrides);
+ }
+ ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
+ add_opt(common_arg(
+ {"--cpu-moe", "-cmoe"},
+ "keep all Mixture of Experts (MoE) weights in the CPU",
  [](common_params & params) {
- params.tensor_buft_overrides.push_back({"\\.ffn_up_exps\\.weight$", ggml_backend_cpu_buffer_type()});
- params.tensor_buft_overrides.push_back({"\\.ffn_down_exps\\.weight$", ggml_backend_cpu_buffer_type()});
- params.tensor_buft_overrides.push_back({"\\.ffn_gate_exps\\.weight$", ggml_backend_cpu_buffer_type()});
+ params.tensor_buft_overrides.push_back({"\\.ffn_(up|down|gate)_exps", ggml_backend_cpu_buffer_type()});
  }
  ).set_env("LLAMA_ARG_CPU_MOE"));
+ add_opt(common_arg(
+ {"--n-cpu-moe", "-ncmoe"}, "N",
+ "keep the Mixture of Experts (MoE) weights of the first N layers in the CPU",
+ [](common_params & params, int value) {
+ if (value < 0) {
+ throw std::invalid_argument("invalid value");
+ }
+ for (int i = 0; i < value; ++i) {
+ // keep strings alive and avoid leaking memory by storing them in a static vector
+ static std::list<std::string> buft_overrides;
+ buft_overrides.push_back(string_format("blk\\.%d\\.ffn_(up|down|gate)_exps", i));
+ params.tensor_buft_overrides.push_back({buft_overrides.back().c_str(), ggml_backend_cpu_buffer_type()});
+ }
+ }
+ ).set_env("LLAMA_ARG_N_CPU_MOE"));
+ add_opt(common_arg(
+ {"--cpu-moe-draft", "-cmoed"},
+ "keep all Mixture of Experts (MoE) weights in the CPU for the draft model",
+ [](common_params & params) {
+ params.speculative.tensor_buft_overrides.push_back({"\\.ffn_(up|down|gate)_exps", ggml_backend_cpu_buffer_type()});
+ }
+ ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CPU_MOE_DRAFT"));
+ add_opt(common_arg(
+ {"--n-cpu-moe-draft", "-ncmoed"}, "N",
+ "keep the Mixture of Experts (MoE) weights of the first N layers in the CPU for the draft model",
+ [](common_params & params, int value) {
+ if (value < 0) {
+ throw std::invalid_argument("invalid value");
+ }
+ for (int i = 0; i < value; ++i) {
+ static std::list<std::string> buft_overrides_draft;
+ buft_overrides_draft.push_back(string_format("blk\\.%d\\.ffn_(up|down|gate)_exps", i));
+ params.speculative.tensor_buft_overrides.push_back({buft_overrides_draft.back().c_str(), ggml_backend_cpu_buffer_type()});
+ }
+ }
+ ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_N_CPU_MOE_DRAFT"));
  add_opt(common_arg(
  {"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N",
  "number of layers to store in VRAM",
@@ -2639,7 +2697,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  [](common_params & params, const std::string & value) {
  params.out_file = value;
  }
- ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA, LLAMA_EXAMPLE_TTS}));
+ ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA, LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_FINETUNE}));
  add_opt(common_arg(
  {"-ofreq", "--output-frequency"}, "N",
  string_format("output the imatrix every N iterations (default: %d)", params.n_out_freq),
@@ -2649,10 +2707,10 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
  add_opt(common_arg(
  {"--output-format"}, "{gguf,dat}",
- string_format("output format for imatrix file (default: %s)", params.imat_dat ? "dat" : "gguf"),
+ string_format("output format for imatrix file (default: %s)", params.imat_dat > 0 ? "dat" : "gguf"),
  [](common_params & params, const std::string & value) {
- /**/ if (value == "gguf") { params.imat_dat = false; }
- else if (value == "dat") { params.imat_dat = true; }
+ /**/ if (value == "gguf") { params.imat_dat = -1; }
+ else if (value == "dat") { params.imat_dat = 1; }
  else { throw std::invalid_argument("invalid output format"); }
  }
  ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
@@ -2931,12 +2989,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  "controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:\n"
  "- none: leaves thoughts unparsed in `message.content`\n"
  "- deepseek: puts thoughts in `message.reasoning_content` (except in streaming mode, which behaves as `none`)\n"
- "(default: deepseek)",
+ "(default: auto)",
  [](common_params & params, const std::string & value) {
- /**/ if (value == "deepseek") { params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK; }
- else if (value == "deepseek-legacy") { params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY; }
- else if (value == "none") { params.reasoning_format = COMMON_REASONING_FORMAT_NONE; }
- else { throw std::invalid_argument("invalid value"); }
+ params.reasoning_format = common_reasoning_format_from_name(value);
  }
  ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_THINK"));
  add_opt(common_arg(
@@ -3117,7 +3172,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  params.speculative.cpuparams.n_threads = std::thread::hardware_concurrency();
  }
  }
- ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+ ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
  add_opt(common_arg(
  {"-tbd", "--threads-batch-draft"}, "N",
  "number of threads to use during batch and prompt processing (default: same as --threads-draft)",
@@ -3127,7 +3182,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  params.speculative.cpuparams_batch.n_threads = std::thread::hardware_concurrency();
  }
  }
- ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+ ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
  add_opt(common_arg(
  {"-Cd", "--cpu-mask-draft"}, "M",
  "Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask)",
@@ -3520,5 +3575,51 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
 
 
+ add_opt(
+ common_arg({ "-lr", "--learning-rate" }, "ALPHA",
+ string_format(
+ "adamw or sgd optimizer alpha (default: %.2g); note: sgd alpha recommended ~10x (no momentum)",
+ (double) params.lr.lr0),
+ [](common_params & params, const std::string & value) { params.lr.lr0 = std::stof(value); })
+ .set_examples({ LLAMA_EXAMPLE_FINETUNE }));
+ add_opt(
+ common_arg({ "-lr-min", "--learning-rate-min" }, "ALPHA",
+ string_format(
+ "(if >0) final learning rate after decay (if -decay-epochs is set, default=%.2g)",
+ (double) params.lr.lr_min),
+ [](common_params & params, const std::string & value) { params.lr.lr_min = std::stof(value); })
+ .set_examples({ LLAMA_EXAMPLE_FINETUNE }));
+ add_opt(
+ common_arg({ "-decay-epochs", "--learning-rate-decay-epochs" }, "ALPHA",
+ string_format(
+ "(if >0) decay learning rate to -lr-min after this many epochs (exponential decay, default=%.2g)",
+ (double) params.lr.decay_epochs),
+ [](common_params & params, const std::string & value) { params.lr.decay_epochs = std::stof(value); })
+ .set_examples({ LLAMA_EXAMPLE_FINETUNE }));
+ add_opt(common_arg(
+ { "-wd", "--weight-decay" }, "WD",
+ string_format(
+ "adamw or sgd optimizer weight decay (0 is off; recommend very small e.g. 1e-9) (default: %.2g).",
+ (double) params.lr.wd),
+ [](common_params & params, const std::string & value) { params.lr.wd = std::stof(value); })
+ .set_examples({ LLAMA_EXAMPLE_FINETUNE }));
+ add_opt(common_arg({ "-val-split", "--val-split" }, "FRACTION",
+ string_format("fraction of data to use as validation set for training (default: %.2g).",
+ (double) params.val_split),
+ [](common_params & params, const std::string & value) { params.val_split = std::stof(value); })
+ .set_examples({ LLAMA_EXAMPLE_FINETUNE }));
+ add_opt(common_arg({ "-epochs", "--epochs" }, "N",
+ string_format("optimizer max # of epochs (default: %d)", params.lr.epochs),
+ [](common_params & params, int epochs) { params.lr.epochs = epochs; })
+ .set_examples({ LLAMA_EXAMPLE_FINETUNE }));
+ add_opt(common_arg({ "-opt", "--optimizer" }, "sgd|adamw", "adamw or sgd",
+ [](common_params & params, const std::string & name) {
+ params.optimizer = common_opt_get_optimizer(name.c_str());
+ if (params.optimizer == GGML_OPT_OPTIMIZER_TYPE_COUNT) {
+ throw std::invalid_argument("invalid --optimizer, valid options: adamw, sgd");
+ }
+ })
+ .set_examples({ LLAMA_EXAMPLE_FINETUNE }));
+
  return ctx_arg;
  }
package/src/llama.cpp/common/chat-parser.cpp CHANGED
@@ -55,7 +55,15 @@ bool common_chat_msg_parser::add_tool_call(const std::string & name, const std::
  bool common_chat_msg_parser::add_tool_call(const json & tool_call) {
  std::string name = tool_call.contains("name") ? tool_call.at("name") : "";
  std::string id = tool_call.contains("id") ? tool_call.at("id") : "";
- std::string arguments = tool_call.contains("arguments") ? tool_call.at("arguments") : "";
+ std::string arguments = "";
+ if (tool_call.contains("arguments")) {
+ if (tool_call.at("arguments").is_object()) {
+ arguments = tool_call.at("arguments").dump();
+ } else {
+ arguments = tool_call.at("arguments");
+ }
+ }
+
  return add_tool_call(name, id, arguments);
  }
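
The chat-parser change above serializes object-valued `arguments` to a JSON string, so JavaScript consumers always receive tool-call arguments as a string. A hedged TypeScript sketch of how a caller might decode them; the helper and its tolerance for partial JSON during streaming are assumptions for illustration, not part of this package's API.

```ts
// Assumed tool-call shape as surfaced by the completion callback.
type ToolCall = {
  type: 'function'
  function: { name: string; arguments: string }
  id?: string
}

// Decode the JSON-string arguments; while streaming, the string may still be
// incomplete, in which case the raw text is returned unchanged.
function decodeToolCall(tc: ToolCall): { name: string; args: unknown } {
  try {
    return { name: tc.function.name, args: JSON.parse(tc.function.arguments) }
  } catch {
    return { name: tc.function.name, args: tc.function.arguments }
  }
}
```
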