@fugood/llama.node 1.4.13 → 1.4.14
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/binding.ts +11 -1
- package/lib/index.js +2 -1
- package/lib/index.ts +2 -0
- package/lib/parallel.ts +2 -2
- package/package.json +15 -15
- package/scripts/llama.cpp.patch +2 -2
- package/src/LlamaContext.cpp +5 -2
- package/src/llama.cpp/common/arg.cpp +150 -56
- package/src/llama.cpp/common/arg.h +0 -8
- package/src/llama.cpp/common/common.cpp +2 -1
- package/src/llama.cpp/common/common.h +10 -7
- package/src/llama.cpp/common/download.cpp +104 -55
- package/src/llama.cpp/common/download.h +26 -5
- package/src/llama.cpp/common/preset.cpp +76 -1
- package/src/llama.cpp/common/preset.h +10 -1
- package/src/llama.cpp/ggml/include/ggml.h +5 -0
- package/src/llama.cpp/include/llama.h +5 -2
- package/src/llama.cpp/src/llama-mmap.cpp +70 -37
- package/src/llama.cpp/src/llama-mmap.h +5 -4
- package/src/llama.cpp/src/llama-model-loader.cpp +17 -5
- package/src/llama.cpp/src/llama-model-loader.h +2 -0
- package/src/llama.cpp/src/llama-model.cpp +15 -5
- package/src/llama.cpp/src/llama-quant.cpp +1 -1
- package/src/llama.cpp/src/llama-sampling.cpp +1 -1
- package/src/llama.cpp/src/llama.cpp +63 -27
package/lib/binding.ts
CHANGED
@@ -112,7 +112,7 @@ export type CompletionResponseFormat = {
 export type LlamaCompletionOptions = {
   messages?: ChatMessage[]
   jinja?: boolean
-  reasoning_format?:
+  reasoning_format?: 'none' | 'auto' | 'deepseek'
   chat_template?: string
   response_format?: CompletionResponseFormat
   tools?: Tool[]
@@ -200,6 +200,13 @@ export type LlamaParallelCompletionOptions = LlamaCompletionOptions & {
    */
   save_state_path?: string
 
+  /**
+   * File path to save prompt-only state to after prompt processing.
+   * Useful for fast prompt reuse (especially for recurrent/hybrid models).
+   * Example: `'/path/to/prompt_state.bin'` or `'file:///path/to/prompt_state.bin'`
+   */
+  save_prompt_state_path?: string
+
   /**
    * Number of tokens to load when loading state.
    * If not specified or <= 0, all tokens from the state file will be loaded.
@@ -363,6 +370,8 @@ export type ModelInfo = {
   nEmbd: number
   nParams: number
   size: number
+  is_recurrent: boolean
+  is_hybrid: boolean
   chatTemplates: {
     llamaChat: boolean
     minja: {
@@ -475,6 +484,7 @@ export interface LlamaContext {
     parallel_tool_calls?: boolean
     tool_choice?: string
     enable_thinking?: boolean
+    reasoning_format?: 'none' | 'auto' | 'deepseek'
     add_generation_prompt?: boolean
     now?: string | number
     chat_template_kwargs?: Record<string, string>

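The new `reasoning_format` union and the `is_recurrent` / `is_hybrid` model-info flags can be exercised from user code roughly as follows. This is a usage sketch only: the `loadModel` entry point, the `model` path option, and the context method names are assumptions based on the typings above, and the GGUF path is a placeholder.

// Sketch only: `loadModel`, the `model` option key, and the context method
// names are assumptions; the option/field shapes come from lib/binding.ts above.
import { loadModel } from '@fugood/llama.node'

async function main() {
  const ctx = await loadModel({ model: '/path/to/model.gguf' }) // placeholder path

  // ModelInfo gained is_recurrent / is_hybrid in 1.4.14
  const info = ctx.getModelInfo()
  console.log('recurrent:', info.is_recurrent, 'hybrid:', info.is_hybrid)

  // reasoning_format is now typed as 'none' | 'auto' | 'deepseek'
  await ctx.completion({
    messages: [{ role: 'user', content: 'Hello' }],
    jinja: true,
    reasoning_format: 'auto',
  })
}

main()
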
package/lib/index.js
CHANGED
@@ -87,7 +87,7 @@ class LlamaContextWrapper {
         return !!this.ctx.getModelInfo().chatTemplates.llamaChat;
     }
     getFormattedChat(messages, template, params) {
-        var _a, _b;
+        var _a, _b, _c;
         const { messages: chat, has_media, media_paths } = (0, utils_1.formatMediaChat)(messages);
         const useJinja = this.isJinjaSupported() && ((_a = params === null || params === void 0 ? void 0 : params.jinja) !== null && _a !== void 0 ? _a : true);
         let tmpl;
@@ -100,6 +100,7 @@ class LlamaContextWrapper {
             parallel_tool_calls: params === null || params === void 0 ? void 0 : params.parallel_tool_calls,
             tool_choice: params === null || params === void 0 ? void 0 : params.tool_choice,
             enable_thinking: (_b = params === null || params === void 0 ? void 0 : params.enable_thinking) !== null && _b !== void 0 ? _b : true,
+            reasoning_format: (_c = params === null || params === void 0 ? void 0 : params.reasoning_format) !== null && _c !== void 0 ? _c : 'none',
             add_generation_prompt: params === null || params === void 0 ? void 0 : params.add_generation_prompt,
             now: params === null || params === void 0 ? void 0 : params.now,
             chat_template_kwargs: (params === null || params === void 0 ? void 0 : params.chat_template_kwargs)

package/lib/index.ts
CHANGED
@@ -118,6 +118,7 @@ class LlamaContextWrapper {
     parallel_tool_calls?: boolean
     tool_choice?: string
     enable_thinking?: boolean
+    reasoning_format?: 'none' | 'auto' | 'deepseek'
     add_generation_prompt?: boolean
     now?: string | number
     chat_template_kwargs?: Record<string, string>
@@ -136,6 +137,7 @@ class LlamaContextWrapper {
       parallel_tool_calls: params?.parallel_tool_calls,
       tool_choice: params?.tool_choice,
       enable_thinking: params?.enable_thinking ?? true,
+      reasoning_format: params?.reasoning_format ?? 'none',
      add_generation_prompt: params?.add_generation_prompt,
      now: params?.now,
      chat_template_kwargs: params?.chat_template_kwargs

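For reference, the wrapper's getFormattedChat now forwards the same reasoning_format value and defaults it to 'none'. A hedged sketch follows: `loadModel` is an assumption and the model path is a placeholder; the method and parameter names come from the wrapper code in this diff.

// Sketch only: `loadModel` is an assumption; getFormattedChat and its params
// follow the wrapper change above, with reasoning_format defaulting to 'none'.
import { loadModel } from '@fugood/llama.node'

async function format() {
  const ctx = await loadModel({ model: '/path/to/model.gguf' }) // placeholder path
  const formatted = await ctx.getFormattedChat(
    [{ role: 'user', content: 'Hello' }],
    undefined, // no explicit template: use the model's chat template
    { jinja: true, reasoning_format: 'deepseek' },
  )
  console.log(formatted)
}

format()
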
package/lib/parallel.ts
CHANGED
@@ -1,10 +1,10 @@
 // Parallel decoding API implementation for llama.node
 import type {
   LlamaContext,
-  LlamaCompletionOptions,
   LlamaCompletionToken,
   RerankParams,
   ParallelStatus,
+  LlamaParallelCompletionOptions,
 } from './binding'
 import { formatMediaChat } from './utils'
 
@@ -68,7 +68,7 @@ export class LlamaParallelAPI {
    * @returns Object with requestId, promise for result, and stop function
    */
   async completion(
-    options:
+    options: LlamaParallelCompletionOptions,
     onToken?: (requestId: number, data: LlamaCompletionToken) => void,
   ): Promise<{
     requestId: number

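A hedged sketch of the updated signature together with the new save_prompt_state_path option from binding.ts: the `loadModel` entry point and the way the parallel API hangs off a loaded context (`ctx.parallel` here) are assumptions, and both file paths are placeholders.

// Sketch only: `loadModel` and the `ctx.parallel` accessor are assumptions;
// the options object follows LlamaParallelCompletionOptions from lib/binding.ts.
import { loadModel } from '@fugood/llama.node'

async function run() {
  const ctx = await loadModel({ model: '/path/to/model.gguf' }) // placeholder path

  const { requestId, promise } = await ctx.parallel.completion(
    {
      messages: [{ role: 'user', content: 'Hello' }],
      // New in 1.4.14: persist the prompt-only state after prompt processing,
      // useful for fast prompt reuse (especially with recurrent/hybrid models).
      save_prompt_state_path: '/path/to/prompt_state.bin', // placeholder path
    },
    (id, token) => {
      // streaming callback: (requestId, LlamaCompletionToken)
    },
  )

  console.log('request', requestId, await promise)
}

run()
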
package/package.json
CHANGED
@@ -1,7 +1,7 @@
 {
   "name": "@fugood/llama.node",
   "access": "public",
-  "version": "1.4.13",
+  "version": "1.4.14",
   "description": "An another Node binding of llama.cpp",
   "main": "lib/index.js",
   "scripts": {
@@ -72,20 +72,20 @@
     "CMakeLists.txt"
   ],
   "optionalDependencies": {
-    "@fugood/node-llama-darwin-arm64": "1.4.13",
-    "@fugood/node-llama-darwin-x64": "1.4.13",
-    "@fugood/node-llama-linux-arm64": "1.4.13",
-    "@fugood/node-llama-linux-arm64-cuda": "1.4.13",
-    "@fugood/node-llama-linux-arm64-snapdragon": "1.4.13",
-    "@fugood/node-llama-linux-arm64-vulkan": "1.4.13",
-    "@fugood/node-llama-linux-x64": "1.4.13",
-    "@fugood/node-llama-linux-x64-cuda": "1.4.13",
-    "@fugood/node-llama-linux-x64-vulkan": "1.4.13",
-    "@fugood/node-llama-win32-arm64": "1.4.13",
-    "@fugood/node-llama-win32-arm64-vulkan": "1.4.13",
-    "@fugood/node-llama-win32-x64": "1.4.13",
-    "@fugood/node-llama-win32-x64-cuda": "1.4.13",
-    "@fugood/node-llama-win32-x64-vulkan": "1.4.13"
+    "@fugood/node-llama-darwin-arm64": "1.4.14",
+    "@fugood/node-llama-darwin-x64": "1.4.14",
+    "@fugood/node-llama-linux-arm64": "1.4.14",
+    "@fugood/node-llama-linux-arm64-cuda": "1.4.14",
+    "@fugood/node-llama-linux-arm64-snapdragon": "1.4.14",
+    "@fugood/node-llama-linux-arm64-vulkan": "1.4.14",
+    "@fugood/node-llama-linux-x64": "1.4.14",
+    "@fugood/node-llama-linux-x64-cuda": "1.4.14",
+    "@fugood/node-llama-linux-x64-vulkan": "1.4.14",
+    "@fugood/node-llama-win32-arm64": "1.4.14",
+    "@fugood/node-llama-win32-arm64-vulkan": "1.4.14",
+    "@fugood/node-llama-win32-x64": "1.4.14",
+    "@fugood/node-llama-win32-x64-cuda": "1.4.14",
+    "@fugood/node-llama-win32-x64-vulkan": "1.4.14"
   },
   "devDependencies": {
     "@babel/preset-env": "^7.24.4",

package/scripts/llama.cpp.patch
CHANGED
@@ -96,7 +96,7 @@ index 8bd4a325f..333b3301f 100644
  struct common_chat_tool_call {
      std::string name;
 diff --git a/src/llama.cpp/common/common.cpp b/src/llama.cpp/common/common.cpp
-index
+index 744f0b4ee..04fcebb9e 100644
 --- a/src/llama.cpp/common/common.cpp
 +++ b/src/llama.cpp/common/common.cpp
 @@ -1361,6 +1361,7 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
@@ -108,7 +108,7 @@ index 41b2b6833..fe9ba05aa 100644
  mparams.main_gpu = params.main_gpu;
  mparams.split_mode = params.split_mode;
 diff --git a/src/llama.cpp/common/common.h b/src/llama.cpp/common/common.h
-index
+index 7794c0268..5b77ae0c3 100644
 --- a/src/llama.cpp/common/common.h
 +++ b/src/llama.cpp/common/common.h
 @@ -310,6 +310,7 @@ struct lr_opt {

package/src/LlamaContext.cpp
CHANGED
@@ -595,6 +595,8 @@ Napi::Value LlamaContext::GetModelInfo(const Napi::CallbackInfo &info) {
   details.Set("nEmbd", llama_model_n_embd(model));
   details.Set("nParams", llama_model_n_params(model));
   details.Set("size", llama_model_size(model));
+  details.Set("is_recurrent", llama_model_is_recurrent(model));
+  details.Set("is_hybrid", llama_model_is_hybrid(model));
 
   Napi::Object chatTemplates = Napi::Object::New(info.Env());
   chatTemplates.Set("llamaChat", _rn_ctx->validateModelChatTemplate(false, nullptr));
@@ -703,6 +705,7 @@ Napi::Value LlamaContext::GetFormattedChat(const Napi::CallbackInfo &info) {
       get_option<bool>(params, "parallel_tool_calls", false);
   auto tool_choice = get_option<std::string>(params, "tool_choice", "");
   auto enable_thinking = get_option<bool>(params, "enable_thinking", false);
+  auto reasoning_format = get_option<std::string>(params, "reasoning_format", "none");
   auto add_generation_prompt = get_option<bool>(params, "add_generation_prompt", true);
   auto now_str = get_option<std::string>(params, "now", "");
 
@@ -721,7 +724,7 @@ Napi::Value LlamaContext::GetFormattedChat(const Napi::CallbackInfo &info) {
   try {
     chatParams = _rn_ctx->getFormattedChatWithJinja(
         messages, chat_template, json_schema_str, tools_str,
-        parallel_tool_calls, tool_choice, enable_thinking,
+        parallel_tool_calls, tool_choice, enable_thinking, reasoning_format,
         add_generation_prompt, now_str, chat_template_kwargs);
   } catch (const nlohmann::json_abi_v3_12_0::detail::parse_error& e) {
     Napi::Error::New(env, e.what()).ThrowAsJavaScriptException();
@@ -962,7 +965,7 @@ Napi::Value LlamaContext::Completion(const Napi::CallbackInfo &info) {
   try {
     chatParams = _rn_ctx->getFormattedChatWithJinja(
         json_stringify(messages), chat_template,
-        json_schema_str, tools_str, parallel_tool_calls, tool_choice, enable_thinking,
+        json_schema_str, tools_str, parallel_tool_calls, tool_choice, enable_thinking, reasoning_format,
         add_generation_prompt, now_str, chat_template_kwargs);
   } catch (const std::exception &e) {
     Napi::Error::New(env, e.what()).ThrowAsJavaScriptException();

package/src/llama.cpp/common/arg.cpp
CHANGED

@@ -6,6 +6,7 @@
 #include "log.h"
 #include "sampling.h"
 #include "download.h"
+#include "preset.h"
 
 // fix problem with std::min and std::max
 #if defined(_WIN32)
@@ -268,6 +269,46 @@ static void parse_tensor_buffer_overrides(const std::string & value, std::vector
     }
 }
 
+static std::string clean_file_name(const std::string & fname) {
+    std::string clean_fname = fname;
+    string_replace_all(clean_fname, "\\", "_");
+    string_replace_all(clean_fname, "/", "_");
+    return clean_fname;
+}
+
+static bool common_params_handle_remote_preset(common_params & params, llama_example ex) {
+    GGML_ASSERT(!params.model.hf_repo.empty());
+
+    const bool offline = params.offline;
+    std::string model_endpoint = get_model_endpoint();
+    auto preset_url = model_endpoint + params.model.hf_repo + "/resolve/main/preset.ini";
+
+    // prepare local path for caching
+    auto preset_fname = clean_file_name(params.model.hf_repo + "_preset.ini");
+    auto preset_path = fs_get_cache_file(preset_fname);
+    const int status = common_download_file_single(preset_url, preset_path, params.hf_token, offline);
+    const bool has_preset = status >= 200 && status < 400;
+
+    // remote preset is optional, so we don't error out if not found
+    if (has_preset) {
+        LOG_INF("applying remote preset from %s\n", preset_url.c_str());
+        common_preset_context ctx(ex, /* only_remote_allowed */ true);
+        common_preset global; // unused for now
+        auto remote_presets = ctx.load_from_ini(preset_path, global);
+        if (remote_presets.find(COMMON_PRESET_DEFAULT_NAME) != remote_presets.end()) {
+            common_preset & preset = remote_presets.at(COMMON_PRESET_DEFAULT_NAME);
+            LOG_INF("\n%s", preset.to_ini().c_str()); // to_ini already added trailing newline
+            preset.apply_to_params(params);
+        } else {
+            throw std::runtime_error("Remote preset.ini does not contain [" + std::string(COMMON_PRESET_DEFAULT_NAME) + "] section");
+        }
+    } else {
+        LOG_INF("%s", "no remote preset found, skipping\n");
+    }
+
+    return has_preset;
+}
+
 struct handle_model_result {
     bool found_mmproj = false;
     common_params_model mmproj;
@@ -309,9 +350,7 @@ static handle_model_result common_params_handle_model(
     // make sure model path is present (for caching purposes)
     if (model.path.empty()) {
         // this is to avoid different repo having same file name, or same file name in different subdirs
-        std::string filename = model.hf_repo + "_" + model.hf_file;
-        // to make sure we don't have any slashes in the filename
-        string_replace_all(filename, "/", "_");
+        std::string filename = clean_file_name(model.hf_repo + "_" + model.hf_file);
         model.path = fs_get_cache_file(filename);
     }
 
@@ -425,61 +464,87 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
         }
     };
 
-
+    auto parse_cli_args = [&]() {
+        std::set<std::string> seen_args;
 
-
-
+        for (int i = 1; i < argc; i++) {
+            const std::string arg_prefix = "--";
 
-
-
-
-        }
-        if (arg_to_options.find(arg) == arg_to_options.end()) {
-            throw std::invalid_argument(string_format("error: invalid argument: %s", arg.c_str()));
-        }
-        if (!seen_args.insert(arg).second) {
-            LOG_WRN("DEPRECATED: argument '%s' specified multiple times, use comma-separated values instead (only last value will be used)\n", arg.c_str());
-        }
-        auto & tmp = arg_to_options[arg];
-        auto opt = *tmp.first;
-        bool is_positive = tmp.second;
-        if (opt.has_value_from_env()) {
-            fprintf(stderr, "warn: %s environment variable is set, but will be overwritten by command line argument %s\n", opt.env, arg.c_str());
-        }
-        try {
-            if (opt.handler_void) {
-                opt.handler_void(params);
-                continue;
+            std::string arg = argv[i];
+            if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
+                std::replace(arg.begin(), arg.end(), '_', '-');
             }
-            if (
-
-                continue;
+            if (arg_to_options.find(arg) == arg_to_options.end()) {
+                throw std::invalid_argument(string_format("error: invalid argument: %s", arg.c_str()));
             }
-
-
-            check_arg(i);
-            std::string val = argv[++i];
-            if (opt.handler_int) {
-                opt.handler_int(params, std::stoi(val));
-                continue;
+            if (!seen_args.insert(arg).second) {
+                LOG_WRN("DEPRECATED: argument '%s' specified multiple times, use comma-separated values instead (only last value will be used)\n", arg.c_str());
             }
-
-
-
+            auto & tmp = arg_to_options[arg];
+            auto opt = *tmp.first;
+            bool is_positive = tmp.second;
+            if (opt.has_value_from_env()) {
+                fprintf(stderr, "warn: %s environment variable is set, but will be overwritten by command line argument %s\n", opt.env, arg.c_str());
             }
+            try {
+                if (opt.handler_void) {
+                    opt.handler_void(params);
+                    continue;
+                }
+                if (opt.handler_bool) {
+                    opt.handler_bool(params, is_positive);
+                    continue;
+                }
 
-
-
-
-
-
-
+                // arg with single value
+                check_arg(i);
+                std::string val = argv[++i];
+                if (opt.handler_int) {
+                    opt.handler_int(params, std::stoi(val));
+                    continue;
+                }
+                if (opt.handler_string) {
+                    opt.handler_string(params, val);
+                    continue;
+                }
+
+                // arg with 2 values
+                check_arg(i);
+                std::string val2 = argv[++i];
+                if (opt.handler_str_str) {
+                    opt.handler_str_str(params, val, val2);
+                    continue;
+                }
+            } catch (std::exception & e) {
+                throw std::invalid_argument(string_format(
+                    "error while handling argument \"%s\": %s\n\n"
+                    "usage:\n%s\n\nto show complete usage, run with -h",
+                    arg.c_str(), e.what(), opt.to_string().c_str()));
             }
-        }
-
-
-
-
+        }
+    };
+
+    // parse the first time to get -hf option (used for remote preset)
+    parse_cli_args();
+
+    // maybe handle remote preset
+    if (!params.model.hf_repo.empty()) {
+        std::string cli_hf_repo = params.model.hf_repo;
+        bool has_preset = common_params_handle_remote_preset(params, ctx_arg.ex);
+
+        // special case: if hf_repo explicitly set by preset, we need to preserve it (ignore CLI value)
+        // this is useful when we have one HF repo pointing to other HF repos (one model - multiple GGUFs)
+        std::string preset_hf_repo = params.model.hf_repo;
+        bool preset_has_hf_repo = preset_hf_repo != cli_hf_repo;
+
+        if (has_preset) {
+            // re-parse CLI args to override preset values
+            parse_cli_args();
+        }
+
+        // preserve hf_repo from preset if needed
+        if (preset_has_hf_repo) {
+            params.model.hf_repo = preset_hf_repo;
         }
     }
 
@@ -2088,11 +2153,22 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     add_opt(common_arg(
         {"--mmap"},
         {"--no-mmap"},
-        string_format("whether to memory-map model (if disabled, slower load but may reduce pageouts if not using mlock) (default: %s)", params.use_mmap ? "enabled" : "disabled"),
+        string_format("whether to memory-map model. Explicitly enabling mmap disables direct-io. (if mmap disabled, slower load but may reduce pageouts if not using mlock) (default: %s)", params.use_mmap ? "enabled" : "disabled"),
         [](common_params & params, bool value) {
             params.use_mmap = value;
+            if (value) {
+                params.use_direct_io = false; // disable direct io when mmap is explicitly enabled
+            }
         }
     ).set_env("LLAMA_ARG_MMAP"));
+    add_opt(common_arg(
+        {"-dio", "--direct-io"},
+        {"-ndio", "--no-direct-io"},
+        string_format("use DirectIO if available. Takes precedence over --mmap (default: %s)", params.use_direct_io ? "enabled" : "disabled"),
+        [](common_params & params, bool value) {
+            params.use_direct_io = value;
+        }
+    ).set_env("LLAMA_ARG_DIO"));
     add_opt(common_arg(
         {"--numa"}, "TYPE",
         "attempt optimizations that help on some NUMA systems\n"
@@ -2244,7 +2320,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             std::vector<std::string> split_arg{ it, {} };
             if (split_arg.size() >= llama_max_devices()) {
                 throw std::invalid_argument(
-                    string_format("got %
+                    string_format("got %zu input configs, but system only has %zu devices", split_arg.size(), llama_max_devices())
                 );
             }
             for (size_t i = 0; i < llama_max_devices(); ++i) {
@@ -2284,10 +2360,28 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_env("LLAMA_ARG_FIT"));
     add_opt(common_arg(
-        { "-fitt", "--fit-target" }, "
-        string_format("target margin per device for --fit
-
-
+        { "-fitt", "--fit-target" }, "MiB0,MiB1,MiB2,...",
+        string_format("target margin per device for --fit, comma-separated list of values, "
+            "single value is broadcast across all devices, default: %zu", params.fit_params_target[0]/(1024*1024)),
+        [](common_params & params, const std::string & value) {
+            std::string arg_next = value;
+
+            // split string by , and /
+            const std::regex regex{ R"([,/]+)" };
+            std::sregex_token_iterator it{ arg_next.begin(), arg_next.end(), regex, -1 };
+            std::vector<std::string> split_arg{ it, {} };
+            if (split_arg.size() >= llama_max_devices()) {
+                throw std::invalid_argument(
+                    string_format("got %zu input configs, but system only has %zu devices", split_arg.size(), llama_max_devices())
+                );
+            }
+            if (split_arg.size() == 1) {
+                std::fill(params.fit_params_target.begin(), params.fit_params_target.end(), std::stoul(split_arg[0]) * 1024*1024);
+                return;
+            }
+            for (size_t i = 0; i < split_arg.size(); i++) {
+                params.fit_params_target[i] = std::stoul(split_arg[i]) * 1024*1024;
+            }
         }
     ).set_env("LLAMA_ARG_FIT_TARGET"));
     add_opt(common_arg(

package/src/llama.cpp/common/arg.h
CHANGED

@@ -129,11 +129,3 @@ void common_params_add_preset_options(std::vector<common_arg> & args);
 
 // initialize argument parser context - used by test-arg-parser and preset
 common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);
-
-struct common_remote_params {
-    std::vector<std::string> headers;
-    long timeout = 0; // CURLOPT_TIMEOUT, in seconds ; 0 means no timeout
-    long max_size = 0; // max size of the response ; unlimited if 0 ; max is 2GB
-};
-// get remote file content, returns <http_code, raw_response_body>
-std::pair<long, std::vector<char>> common_remote_get_content(const std::string & url, const common_remote_params & params);

package/src/llama.cpp/common/common.cpp
CHANGED

@@ -1097,7 +1097,7 @@ common_init_result::common_init_result(common_params & params) :
     if (params.fit_params) {
         LOG_INF("%s: fitting params to device memory, for bugs during this step try to reproduce them with -fit off, or provide --verbose logs if the bug only occurs with -fit on\n", __func__);
         llama_params_fit(params.model.path.c_str(), &mparams, &cparams,
-            params.tensor_split, params.tensor_buft_overrides.data(), params.fit_params_target, params.fit_params_min_ctx,
+            params.tensor_split, params.tensor_buft_overrides.data(), params.fit_params_target.data(), params.fit_params_min_ctx,
             params.verbosity >= 4 ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_ERROR);
     }
 
@@ -1367,6 +1367,7 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
     mparams.split_mode = params.split_mode;
     mparams.tensor_split = params.tensor_split;
     mparams.use_mmap = params.use_mmap;
+    mparams.use_direct_io = params.use_direct_io;
     mparams.use_mlock = params.use_mlock;
     mparams.check_tensors = params.check_tensors;
     mparams.use_extra_bufts = !params.no_extra_bufts;

package/src/llama.cpp/common/common.h
CHANGED

@@ -333,12 +333,14 @@ struct common_params {
     // offload params
     std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
 
-    int32_t n_gpu_layers = -1;
-    int32_t main_gpu = 0;
-    float tensor_split[128] = {0};
-    bool fit_params = true;
-
-
+    int32_t n_gpu_layers = -1; // number of layers to store in VRAM, -1 is auto, <= -2 is all
+    int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
+    float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
+    bool fit_params = true; // whether to fit unset model/context parameters to free device memory
+    int32_t fit_params_min_ctx = 4096; // minimum context size to set when trying to reduce memory use
+
+    // margin per device in bytes for fitting parameters to free memory:
+    std::vector<size_t> fit_params_target = std::vector<size_t>(llama_max_devices(), 1024 * 1024*1024);
 
     enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
 
@@ -429,7 +431,8 @@ struct common_params {
     bool kv_unified = false; // enable unified KV cache
 
     bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
-    bool use_mmap = true; //
+    bool use_mmap = true; // enable mmap to use filesystem cache
+    bool use_direct_io = true; // read from disk without buffering for faster model loading
     bool use_mlock = false; // use mlock to keep model in memory
     bool verbose_prompt = false; // print prompt tokens before generation
     bool display_prompt = true; // print prompt before generation