@fugood/llama.node 1.4.13 → 1.4.14

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/lib/binding.ts CHANGED
@@ -112,7 +112,7 @@ export type CompletionResponseFormat = {
  export type LlamaCompletionOptions = {
  messages?: ChatMessage[]
  jinja?: boolean
- reasoning_format?: string
+ reasoning_format?: 'none' | 'auto' | 'deepseek'
  chat_template?: string
  response_format?: CompletionResponseFormat
  tools?: Tool[]
@@ -200,6 +200,13 @@ export type LlamaParallelCompletionOptions = LlamaCompletionOptions & {
  */
  save_state_path?: string

+ /**
+ * File path to save prompt-only state to after prompt processing.
+ * Useful for fast prompt reuse (especially for recurrent/hybrid models).
+ * Example: `'/path/to/prompt_state.bin'` or `'file:///path/to/prompt_state.bin'`
+ */
+ save_prompt_state_path?: string
+
  /**
  * Number of tokens to load when loading state.
  * If not specified or <= 0, all tokens from the state file will be loaded.
@@ -363,6 +370,8 @@ export type ModelInfo = {
  nEmbd: number
  nParams: number
  size: number
+ is_recurrent: boolean
+ is_hybrid: boolean
  chatTemplates: {
  llamaChat: boolean
  minja: {
@@ -475,6 +484,7 @@ export interface LlamaContext {
  parallel_tool_calls?: boolean
  tool_choice?: string
  enable_thinking?: boolean
+ reasoning_format?: 'none' | 'auto' | 'deepseek'
  add_generation_prompt?: boolean
  now?: string | number
  chat_template_kwargs?: Record<string, string>
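
For reference, a minimal sketch of how the tightened option types read on the caller side (assuming the types are imported from lib/binding; the message shape and file paths below are illustrative, not part of this diff):

import type { LlamaCompletionOptions, LlamaParallelCompletionOptions } from './binding'

// `reasoning_format` is now a string union, so arbitrary strings fail at compile time.
const opts: LlamaCompletionOptions = {
  messages: [{ role: 'user', content: 'Why is the sky blue?' }],
  jinja: true,
  reasoning_format: 'deepseek', // 'none' | 'auto' | 'deepseek'
}

// Parallel-only options additionally accept the new prompt-state path.
const parallelOpts: LlamaParallelCompletionOptions = {
  ...opts,
  save_prompt_state_path: '/path/to/prompt_state.bin',
}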
package/lib/index.js CHANGED
@@ -87,7 +87,7 @@ class LlamaContextWrapper {
  return !!this.ctx.getModelInfo().chatTemplates.llamaChat;
  }
  getFormattedChat(messages, template, params) {
- var _a, _b;
+ var _a, _b, _c;
  const { messages: chat, has_media, media_paths } = (0, utils_1.formatMediaChat)(messages);
  const useJinja = this.isJinjaSupported() && ((_a = params === null || params === void 0 ? void 0 : params.jinja) !== null && _a !== void 0 ? _a : true);
  let tmpl;
@@ -100,6 +100,7 @@ class LlamaContextWrapper {
  parallel_tool_calls: params === null || params === void 0 ? void 0 : params.parallel_tool_calls,
  tool_choice: params === null || params === void 0 ? void 0 : params.tool_choice,
  enable_thinking: (_b = params === null || params === void 0 ? void 0 : params.enable_thinking) !== null && _b !== void 0 ? _b : true,
+ reasoning_format: (_c = params === null || params === void 0 ? void 0 : params.reasoning_format) !== null && _c !== void 0 ? _c : 'none',
  add_generation_prompt: params === null || params === void 0 ? void 0 : params.add_generation_prompt,
  now: params === null || params === void 0 ? void 0 : params.now,
  chat_template_kwargs: (params === null || params === void 0 ? void 0 : params.chat_template_kwargs)
package/lib/index.ts CHANGED
@@ -118,6 +118,7 @@ class LlamaContextWrapper {
  parallel_tool_calls?: boolean
  tool_choice?: string
  enable_thinking?: boolean
+ reasoning_format?: 'none' | 'auto' | 'deepseek'
  add_generation_prompt?: boolean
  now?: string | number
  chat_template_kwargs?: Record<string, string>
@@ -136,6 +137,7 @@ class LlamaContextWrapper {
  parallel_tool_calls: params?.parallel_tool_calls,
  tool_choice: params?.tool_choice,
  enable_thinking: params?.enable_thinking ?? true,
+ reasoning_format: params?.reasoning_format ?? 'none',
  add_generation_prompt: params?.add_generation_prompt,
  now: params?.now,
  chat_template_kwargs: params?.chat_template_kwargs
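
A hedged usage sketch of the wrapper path above, following the getFormattedChat(messages, template, params) signature shown (the wrapper instance `ctx`, the undefined template argument, and the message content are assumptions for illustration):

// `ctx` is assumed to be a LlamaContextWrapper instance obtained from the package's loader.
const formatted = ctx.getFormattedChat(
  [{ role: 'user', content: 'Hello' }],
  undefined, // assumed to fall back to the model's chat template
  { jinja: true, reasoning_format: 'auto' }, // omitted reasoning_format defaults to 'none'
)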
package/lib/parallel.ts CHANGED
@@ -1,10 +1,10 @@
  // Parallel decoding API implementation for llama.node
  import type {
  LlamaContext,
- LlamaCompletionOptions,
  LlamaCompletionToken,
  RerankParams,
  ParallelStatus,
+ LlamaParallelCompletionOptions,
  } from './binding'
  import { formatMediaChat } from './utils'

@@ -68,7 +68,7 @@ export class LlamaParallelAPI {
  * @returns Object with requestId, promise for result, and stop function
  */
  async completion(
- options: LlamaCompletionOptions,
+ options: LlamaParallelCompletionOptions,
  onToken?: (requestId: number, data: LlamaCompletionToken) => void,
  ): Promise<{
  requestId: number
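
With the widened parameter type, a parallel request can carry the parallel-only fields such as save_prompt_state_path added above. A sketch, assuming `parallel` is a LlamaParallelAPI instance and following the signature and documented return shape shown (option values are illustrative):

const { requestId, promise, stop } = await parallel.completion(
  {
    messages: [{ role: 'user', content: 'Summarize this document.' }],
    save_prompt_state_path: '/path/to/prompt_state.bin', // new in 1.4.14
  },
  (id, data) => {
    // stream tokens for request `id` here
  },
)
const result = await promise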
package/package.json CHANGED
@@ -1,7 +1,7 @@
  {
  "name": "@fugood/llama.node",
  "access": "public",
- "version": "1.4.13",
+ "version": "1.4.14",
  "description": "An another Node binding of llama.cpp",
  "main": "lib/index.js",
  "scripts": {
@@ -72,20 +72,20 @@
  "CMakeLists.txt"
  ],
  "optionalDependencies": {
- "@fugood/node-llama-darwin-arm64": "1.4.13",
- "@fugood/node-llama-darwin-x64": "1.4.13",
- "@fugood/node-llama-linux-arm64": "1.4.13",
- "@fugood/node-llama-linux-arm64-cuda": "1.4.13",
- "@fugood/node-llama-linux-arm64-snapdragon": "1.4.13",
- "@fugood/node-llama-linux-arm64-vulkan": "1.4.13",
- "@fugood/node-llama-linux-x64": "1.4.13",
- "@fugood/node-llama-linux-x64-cuda": "1.4.13",
- "@fugood/node-llama-linux-x64-vulkan": "1.4.13",
- "@fugood/node-llama-win32-arm64": "1.4.13",
- "@fugood/node-llama-win32-arm64-vulkan": "1.4.13",
- "@fugood/node-llama-win32-x64": "1.4.13",
- "@fugood/node-llama-win32-x64-cuda": "1.4.13",
- "@fugood/node-llama-win32-x64-vulkan": "1.4.13"
+ "@fugood/node-llama-darwin-arm64": "1.4.14",
+ "@fugood/node-llama-darwin-x64": "1.4.14",
+ "@fugood/node-llama-linux-arm64": "1.4.14",
+ "@fugood/node-llama-linux-arm64-cuda": "1.4.14",
+ "@fugood/node-llama-linux-arm64-snapdragon": "1.4.14",
+ "@fugood/node-llama-linux-arm64-vulkan": "1.4.14",
+ "@fugood/node-llama-linux-x64": "1.4.14",
+ "@fugood/node-llama-linux-x64-cuda": "1.4.14",
+ "@fugood/node-llama-linux-x64-vulkan": "1.4.14",
+ "@fugood/node-llama-win32-arm64": "1.4.14",
+ "@fugood/node-llama-win32-arm64-vulkan": "1.4.14",
+ "@fugood/node-llama-win32-x64": "1.4.14",
+ "@fugood/node-llama-win32-x64-cuda": "1.4.14",
+ "@fugood/node-llama-win32-x64-vulkan": "1.4.14"
  },
  "devDependencies": {
  "@babel/preset-env": "^7.24.4",
@@ -96,7 +96,7 @@ index 8bd4a325f..333b3301f 100644
  struct common_chat_tool_call {
  std::string name;
  diff --git a/src/llama.cpp/common/common.cpp b/src/llama.cpp/common/common.cpp
- index 41b2b6833..fe9ba05aa 100644
+ index 744f0b4ee..04fcebb9e 100644
  --- a/src/llama.cpp/common/common.cpp
  +++ b/src/llama.cpp/common/common.cpp
  @@ -1361,6 +1361,7 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
@@ -108,7 +108,7 @@ index 41b2b6833..fe9ba05aa 100644
  mparams.main_gpu = params.main_gpu;
  mparams.split_mode = params.split_mode;
  diff --git a/src/llama.cpp/common/common.h b/src/llama.cpp/common/common.h
- index d6fd0d37a..477209ce5 100644
+ index 7794c0268..5b77ae0c3 100644
  --- a/src/llama.cpp/common/common.h
  +++ b/src/llama.cpp/common/common.h
  @@ -310,6 +310,7 @@ struct lr_opt {
@@ -595,6 +595,8 @@ Napi::Value LlamaContext::GetModelInfo(const Napi::CallbackInfo &info) {
  details.Set("nEmbd", llama_model_n_embd(model));
  details.Set("nParams", llama_model_n_params(model));
  details.Set("size", llama_model_size(model));
+ details.Set("is_recurrent", llama_model_is_recurrent(model));
+ details.Set("is_hybrid", llama_model_is_hybrid(model));

  Napi::Object chatTemplates = Napi::Object::New(info.Env());
  chatTemplates.Set("llamaChat", _rn_ctx->validateModelChatTemplate(false, nullptr));
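
On the JS side these land in ModelInfo (see the binding.ts hunk above); a brief sketch, assuming `ctx` exposes getModelInfo() as the wrapper code earlier in this diff does:

const info = ctx.getModelInfo()
if (info.is_recurrent || info.is_hybrid) {
  // recurrent/hybrid models are the main beneficiaries of save_prompt_state_path
}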
@@ -703,6 +705,7 @@ Napi::Value LlamaContext::GetFormattedChat(const Napi::CallbackInfo &info) {
  get_option<bool>(params, "parallel_tool_calls", false);
  auto tool_choice = get_option<std::string>(params, "tool_choice", "");
  auto enable_thinking = get_option<bool>(params, "enable_thinking", false);
+ auto reasoning_format = get_option<std::string>(params, "reasoning_format", "none");
  auto add_generation_prompt = get_option<bool>(params, "add_generation_prompt", true);
  auto now_str = get_option<std::string>(params, "now", "");

@@ -721,7 +724,7 @@ Napi::Value LlamaContext::GetFormattedChat(const Napi::CallbackInfo &info) {
  try {
  chatParams = _rn_ctx->getFormattedChatWithJinja(
  messages, chat_template, json_schema_str, tools_str,
- parallel_tool_calls, tool_choice, enable_thinking,
+ parallel_tool_calls, tool_choice, enable_thinking, reasoning_format,
  add_generation_prompt, now_str, chat_template_kwargs);
  } catch (const nlohmann::json_abi_v3_12_0::detail::parse_error& e) {
  Napi::Error::New(env, e.what()).ThrowAsJavaScriptException();
@@ -962,7 +965,7 @@ Napi::Value LlamaContext::Completion(const Napi::CallbackInfo &info) {
  try {
  chatParams = _rn_ctx->getFormattedChatWithJinja(
  json_stringify(messages), chat_template,
- json_schema_str, tools_str, parallel_tool_calls, tool_choice, enable_thinking,
+ json_schema_str, tools_str, parallel_tool_calls, tool_choice, enable_thinking, reasoning_format,
  add_generation_prompt, now_str, chat_template_kwargs);
  } catch (const std::exception &e) {
  Napi::Error::New(env, e.what()).ThrowAsJavaScriptException();
@@ -6,6 +6,7 @@
  #include "log.h"
  #include "sampling.h"
  #include "download.h"
+ #include "preset.h"

  // fix problem with std::min and std::max
  #if defined(_WIN32)
@@ -268,6 +269,46 @@ static void parse_tensor_buffer_overrides(const std::string & value, std::vector
  }
  }

+ static std::string clean_file_name(const std::string & fname) {
+ std::string clean_fname = fname;
+ string_replace_all(clean_fname, "\\", "_");
+ string_replace_all(clean_fname, "/", "_");
+ return clean_fname;
+ }
+
+ static bool common_params_handle_remote_preset(common_params & params, llama_example ex) {
+ GGML_ASSERT(!params.model.hf_repo.empty());
+
+ const bool offline = params.offline;
+ std::string model_endpoint = get_model_endpoint();
+ auto preset_url = model_endpoint + params.model.hf_repo + "/resolve/main/preset.ini";
+
+ // prepare local path for caching
+ auto preset_fname = clean_file_name(params.model.hf_repo + "_preset.ini");
+ auto preset_path = fs_get_cache_file(preset_fname);
+ const int status = common_download_file_single(preset_url, preset_path, params.hf_token, offline);
+ const bool has_preset = status >= 200 && status < 400;
+
+ // remote preset is optional, so we don't error out if not found
+ if (has_preset) {
+ LOG_INF("applying remote preset from %s\n", preset_url.c_str());
+ common_preset_context ctx(ex, /* only_remote_allowed */ true);
+ common_preset global; // unused for now
+ auto remote_presets = ctx.load_from_ini(preset_path, global);
+ if (remote_presets.find(COMMON_PRESET_DEFAULT_NAME) != remote_presets.end()) {
+ common_preset & preset = remote_presets.at(COMMON_PRESET_DEFAULT_NAME);
+ LOG_INF("\n%s", preset.to_ini().c_str()); // to_ini already added trailing newline
+ preset.apply_to_params(params);
+ } else {
+ throw std::runtime_error("Remote preset.ini does not contain [" + std::string(COMMON_PRESET_DEFAULT_NAME) + "] section");
+ }
+ } else {
+ LOG_INF("%s", "no remote preset found, skipping\n");
+ }
+
+ return has_preset;
+ }
+
  struct handle_model_result {
  bool found_mmproj = false;
  common_params_model mmproj;
@@ -309,9 +350,7 @@ static handle_model_result common_params_handle_model(
  // make sure model path is present (for caching purposes)
  if (model.path.empty()) {
  // this is to avoid different repo having same file name, or same file name in different subdirs
- std::string filename = model.hf_repo + "_" + model.hf_file;
- // to make sure we don't have any slashes in the filename
- string_replace_all(filename, "/", "_");
+ std::string filename = clean_file_name(model.hf_repo + "_" + model.hf_file);
  model.path = fs_get_cache_file(filename);
  }

@@ -425,61 +464,87 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
  }
  };

- std::set<std::string> seen_args;
+ auto parse_cli_args = [&]() {
+ std::set<std::string> seen_args;

- for (int i = 1; i < argc; i++) {
- const std::string arg_prefix = "--";
+ for (int i = 1; i < argc; i++) {
+ const std::string arg_prefix = "--";

- std::string arg = argv[i];
- if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
- std::replace(arg.begin(), arg.end(), '_', '-');
- }
- if (arg_to_options.find(arg) == arg_to_options.end()) {
- throw std::invalid_argument(string_format("error: invalid argument: %s", arg.c_str()));
- }
- if (!seen_args.insert(arg).second) {
- LOG_WRN("DEPRECATED: argument '%s' specified multiple times, use comma-separated values instead (only last value will be used)\n", arg.c_str());
- }
- auto & tmp = arg_to_options[arg];
- auto opt = *tmp.first;
- bool is_positive = tmp.second;
- if (opt.has_value_from_env()) {
- fprintf(stderr, "warn: %s environment variable is set, but will be overwritten by command line argument %s\n", opt.env, arg.c_str());
- }
- try {
- if (opt.handler_void) {
- opt.handler_void(params);
- continue;
+ std::string arg = argv[i];
+ if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
+ std::replace(arg.begin(), arg.end(), '_', '-');
  }
- if (opt.handler_bool) {
- opt.handler_bool(params, is_positive);
- continue;
+ if (arg_to_options.find(arg) == arg_to_options.end()) {
+ throw std::invalid_argument(string_format("error: invalid argument: %s", arg.c_str()));
  }
-
- // arg with single value
- check_arg(i);
- std::string val = argv[++i];
- if (opt.handler_int) {
- opt.handler_int(params, std::stoi(val));
- continue;
+ if (!seen_args.insert(arg).second) {
+ LOG_WRN("DEPRECATED: argument '%s' specified multiple times, use comma-separated values instead (only last value will be used)\n", arg.c_str());
  }
- if (opt.handler_string) {
- opt.handler_string(params, val);
- continue;
+ auto & tmp = arg_to_options[arg];
+ auto opt = *tmp.first;
+ bool is_positive = tmp.second;
+ if (opt.has_value_from_env()) {
+ fprintf(stderr, "warn: %s environment variable is set, but will be overwritten by command line argument %s\n", opt.env, arg.c_str());
  }
+ try {
+ if (opt.handler_void) {
+ opt.handler_void(params);
+ continue;
+ }
+ if (opt.handler_bool) {
+ opt.handler_bool(params, is_positive);
+ continue;
+ }

- // arg with 2 values
- check_arg(i);
- std::string val2 = argv[++i];
- if (opt.handler_str_str) {
- opt.handler_str_str(params, val, val2);
- continue;
+ // arg with single value
+ check_arg(i);
+ std::string val = argv[++i];
+ if (opt.handler_int) {
+ opt.handler_int(params, std::stoi(val));
+ continue;
+ }
+ if (opt.handler_string) {
+ opt.handler_string(params, val);
+ continue;
+ }
+
+ // arg with 2 values
+ check_arg(i);
+ std::string val2 = argv[++i];
+ if (opt.handler_str_str) {
+ opt.handler_str_str(params, val, val2);
+ continue;
+ }
+ } catch (std::exception & e) {
+ throw std::invalid_argument(string_format(
+ "error while handling argument \"%s\": %s\n\n"
+ "usage:\n%s\n\nto show complete usage, run with -h",
+ arg.c_str(), e.what(), opt.to_string().c_str()));
  }
- } catch (std::exception & e) {
- throw std::invalid_argument(string_format(
- "error while handling argument \"%s\": %s\n\n"
- "usage:\n%s\n\nto show complete usage, run with -h",
- arg.c_str(), e.what(), opt.to_string().c_str()));
+ }
+ };
+
+ // parse the first time to get -hf option (used for remote preset)
+ parse_cli_args();
+
+ // maybe handle remote preset
+ if (!params.model.hf_repo.empty()) {
+ std::string cli_hf_repo = params.model.hf_repo;
+ bool has_preset = common_params_handle_remote_preset(params, ctx_arg.ex);
+
+ // special case: if hf_repo explicitly set by preset, we need to preserve it (ignore CLI value)
+ // this is useful when we have one HF repo pointing to other HF repos (one model - multiple GGUFs)
+ std::string preset_hf_repo = params.model.hf_repo;
+ bool preset_has_hf_repo = preset_hf_repo != cli_hf_repo;
+
+ if (has_preset) {
+ // re-parse CLI args to override preset values
+ parse_cli_args();
+ }
+
+ // preserve hf_repo from preset if needed
+ if (preset_has_hf_repo) {
+ params.model.hf_repo = preset_hf_repo;
  }
  }

@@ -2088,11 +2153,22 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  add_opt(common_arg(
  {"--mmap"},
  {"--no-mmap"},
- string_format("whether to memory-map model (if disabled, slower load but may reduce pageouts if not using mlock) (default: %s)", params.use_mmap ? "enabled" : "disabled"),
+ string_format("whether to memory-map model. Explicitly enabling mmap disables direct-io. (if mmap disabled, slower load but may reduce pageouts if not using mlock) (default: %s)", params.use_mmap ? "enabled" : "disabled"),
  [](common_params & params, bool value) {
  params.use_mmap = value;
+ if (value) {
+ params.use_direct_io = false; // disable direct io when mmap is explicitly enabled
+ }
  }
  ).set_env("LLAMA_ARG_MMAP"));
+ add_opt(common_arg(
+ {"-dio", "--direct-io"},
+ {"-ndio", "--no-direct-io"},
+ string_format("use DirectIO if available. Takes precedence over --mmap (default: %s)", params.use_direct_io ? "enabled" : "disabled"),
+ [](common_params & params, bool value) {
+ params.use_direct_io = value;
+ }
+ ).set_env("LLAMA_ARG_DIO"));
  add_opt(common_arg(
  {"--numa"}, "TYPE",
  "attempt optimizations that help on some NUMA systems\n"
@@ -2244,7 +2320,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  std::vector<std::string> split_arg{ it, {} };
  if (split_arg.size() >= llama_max_devices()) {
  throw std::invalid_argument(
- string_format("got %d input configs, but system only has %d devices", (int)split_arg.size(), (int)llama_max_devices())
+ string_format("got %zu input configs, but system only has %zu devices", split_arg.size(), llama_max_devices())
  );
  }
  for (size_t i = 0; i < llama_max_devices(); ++i) {
@@ -2284,10 +2360,28 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  }
  ).set_env("LLAMA_ARG_FIT"));
  add_opt(common_arg(
- { "-fitt", "--fit-target" }, "MiB",
- string_format("target margin per device for --fit option, default: %zu", params.fit_params_target/(1024*1024)),
- [](common_params & params, int value) {
- params.fit_params_target = value * size_t(1024*1024);
+ { "-fitt", "--fit-target" }, "MiB0,MiB1,MiB2,...",
+ string_format("target margin per device for --fit, comma-separated list of values, "
+ "single value is broadcast across all devices, default: %zu", params.fit_params_target[0]/(1024*1024)),
+ [](common_params & params, const std::string & value) {
+ std::string arg_next = value;
+
+ // split string by , and /
+ const std::regex regex{ R"([,/]+)" };
+ std::sregex_token_iterator it{ arg_next.begin(), arg_next.end(), regex, -1 };
+ std::vector<std::string> split_arg{ it, {} };
+ if (split_arg.size() >= llama_max_devices()) {
+ throw std::invalid_argument(
+ string_format("got %zu input configs, but system only has %zu devices", split_arg.size(), llama_max_devices())
+ );
+ }
+ if (split_arg.size() == 1) {
+ std::fill(params.fit_params_target.begin(), params.fit_params_target.end(), std::stoul(split_arg[0]) * 1024*1024);
+ return;
+ }
+ for (size_t i = 0; i < split_arg.size(); i++) {
+ params.fit_params_target[i] = std::stoul(split_arg[i]) * 1024*1024;
+ }
  }
  ).set_env("LLAMA_ARG_FIT_TARGET"));
  add_opt(common_arg(
@@ -129,11 +129,3 @@ void common_params_add_preset_options(std::vector<common_arg> & args);

  // initialize argument parser context - used by test-arg-parser and preset
  common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);
-
- struct common_remote_params {
- std::vector<std::string> headers;
- long timeout = 0; // CURLOPT_TIMEOUT, in seconds ; 0 means no timeout
- long max_size = 0; // max size of the response ; unlimited if 0 ; max is 2GB
- };
- // get remote file content, returns <http_code, raw_response_body>
- std::pair<long, std::vector<char>> common_remote_get_content(const std::string & url, const common_remote_params & params);
@@ -1097,7 +1097,7 @@ common_init_result::common_init_result(common_params & params) :
  if (params.fit_params) {
  LOG_INF("%s: fitting params to device memory, for bugs during this step try to reproduce them with -fit off, or provide --verbose logs if the bug only occurs with -fit on\n", __func__);
  llama_params_fit(params.model.path.c_str(), &mparams, &cparams,
- params.tensor_split, params.tensor_buft_overrides.data(), params.fit_params_target, params.fit_params_min_ctx,
+ params.tensor_split, params.tensor_buft_overrides.data(), params.fit_params_target.data(), params.fit_params_min_ctx,
  params.verbosity >= 4 ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_ERROR);
  }

@@ -1367,6 +1367,7 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
  mparams.split_mode = params.split_mode;
  mparams.tensor_split = params.tensor_split;
  mparams.use_mmap = params.use_mmap;
+ mparams.use_direct_io = params.use_direct_io;
  mparams.use_mlock = params.use_mlock;
  mparams.check_tensors = params.check_tensors;
  mparams.use_extra_bufts = !params.no_extra_bufts;
@@ -333,12 +333,14 @@ struct common_params {
  // offload params
  std::vector<ggml_backend_dev_t> devices; // devices to use for offloading

- int32_t n_gpu_layers = -1; // number of layers to store in VRAM, -1 is auto, <= -2 is all
- int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
- float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
- bool fit_params = true; // whether to fit unset model/context parameters to free device memory
- size_t fit_params_target = 1024 * 1024*1024; // margin per device in bytes for fitting parameters to free memory
- int32_t fit_params_min_ctx = 4096; // minimum context size to set when trying to reduce memory use
+ int32_t n_gpu_layers = -1; // number of layers to store in VRAM, -1 is auto, <= -2 is all
+ int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
+ float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
+ bool fit_params = true; // whether to fit unset model/context parameters to free device memory
+ int32_t fit_params_min_ctx = 4096; // minimum context size to set when trying to reduce memory use
+
+ // margin per device in bytes for fitting parameters to free memory:
+ std::vector<size_t> fit_params_target = std::vector<size_t>(llama_max_devices(), 1024 * 1024*1024);

  enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs

@@ -429,7 +431,8 @@ struct common_params {
  bool kv_unified = false; // enable unified KV cache

  bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
- bool use_mmap = true; // use mmap for faster loads
+ bool use_mmap = true; // enable mmap to use filesystem cache
+ bool use_direct_io = true; // read from disk without buffering for faster model loading
  bool use_mlock = false; // use mlock to keep model in memory
  bool verbose_prompt = false; // print prompt tokens before generation
  bool display_prompt = true; // print prompt before generation