@fugood/llama.node 1.4.8 → 1.4.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/binding.ts +43 -0
- package/lib/parallel.js +26 -0
- package/lib/parallel.ts +33 -0
- package/package.json +15 -15
- package/scripts/llama.cpp.patch +12 -14
- package/src/LlamaCompletionWorker.cpp +3 -1
- package/src/LlamaCompletionWorker.h +2 -0
- package/src/LlamaContext.cpp +16 -1
- package/src/LlamaContext.h +3 -0
- package/src/llama.cpp/common/CMakeLists.txt +4 -4
- package/src/llama.cpp/common/arg.cpp +159 -42
- package/src/llama.cpp/common/arg.h +10 -1
- package/src/llama.cpp/common/common.cpp +1 -1
- package/src/llama.cpp/common/common.h +6 -2
- package/src/llama.cpp/common/preset.cpp +197 -5
- package/src/llama.cpp/common/preset.h +45 -3
- package/src/llama.cpp/common/sampling.cpp +51 -37
- package/src/llama.cpp/common/sampling.h +6 -3
- package/src/llama.cpp/common/speculative.cpp +1 -1
- package/src/llama.cpp/ggml/CMakeLists.txt +1 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +4 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +283 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +28 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +51 -6
- package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +286 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/repack.h +8 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +41 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +125 -22
- package/src/llama.cpp/src/llama-arch.cpp +1 -1
- package/src/llama.cpp/src/llama-mmap.cpp +123 -28
- package/src/llama.cpp/src/llama-mmap.h +5 -1
- package/src/llama.cpp/src/llama-model-loader.cpp +56 -13
- package/src/llama.cpp/src/llama-model.cpp +7 -5
- package/src/llama.cpp/src/llama-sampling.cpp +16 -0
- package/src/llama.cpp/src/llama.cpp +22 -32
package/src/llama.cpp/common/arg.cpp

@@ -96,6 +96,11 @@ common_arg & common_arg::set_sparam() {
     return *this;
 }

+common_arg & common_arg::set_preset_only() {
+    is_preset_only = true;
+    return *this;
+}
+
 bool common_arg::in_example(enum llama_example ex) {
     return examples.find(ex) != examples.end();
 }
@@ -420,6 +425,8 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
         }
     };

+    std::set<std::string> seen_args;
+
     for (int i = 1; i < argc; i++) {
         const std::string arg_prefix = "--";

@@ -430,6 +437,9 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
         if (arg_to_options.find(arg) == arg_to_options.end()) {
             throw std::invalid_argument(string_format("error: invalid argument: %s", arg.c_str()));
         }
+        if (!seen_args.insert(arg).second) {
+            LOG_WRN("DEPRECATED: argument '%s' specified multiple times, use comma-separated values instead (only last value will be used)\n", arg.c_str());
+        }
         auto & tmp = arg_to_options[arg];
         auto opt = *tmp.first;
         bool is_positive = tmp.second;
@@ -750,6 +760,8 @@ bool common_params_to_map(int argc, char ** argv, llama_example ex, std::map<com
         }
     };

+    std::set<std::string> seen_args;
+
     for (int i = 1; i < argc; i++) {
         const std::string arg_prefix = "--";

@@ -760,8 +772,16 @@ bool common_params_to_map(int argc, char ** argv, llama_example ex, std::map<com
         if (arg_to_options.find(arg) == arg_to_options.end()) {
             throw std::invalid_argument(string_format("error: invalid argument: %s", arg.c_str()));
         }
+        if (!seen_args.insert(arg).second) {
+            LOG_WRN("DEPRECATED: argument '%s' specified multiple times, use comma-separated values instead (only last value will be used)\n", arg.c_str());
+        }
         auto opt = *arg_to_options[arg];
         std::string val;
+        if (opt.value_hint == nullptr && opt.value_hint_2 == nullptr) {
+            // bool arg (need to reverse the meaning for negative args)
+            bool is_neg = std::find(opt.args_neg.begin(), opt.args_neg.end(), arg) != opt.args_neg.end();
+            val = is_neg ? "0" : "1";
+        }
         if (opt.value_hint != nullptr) {
             // arg with single value
             check_arg(i);
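Both parser hunks above rely on the same duplicate-flag detection: `std::set::insert` returns a pair whose `.second` member is `false` when the element was already present, which is what triggers the DEPRECATED warning for a repeated argument. A minimal standalone sketch of that idiom (the names below are illustrative, not from the package):

```cpp
#include <cstdio>
#include <set>
#include <string>

int main() {
    std::set<std::string> seen_args;
    const char * argv_like[] = { "--lora", "a.gguf", "--lora", "b.gguf" };

    for (const char * arg : argv_like) {
        std::string s(arg);
        if (s.rfind("--", 0) != 0) {
            continue; // only track flags, skip their values
        }
        // insert().second is false when the flag was already recorded
        if (!seen_args.insert(s).second) {
            std::printf("DEPRECATED: argument '%s' specified multiple times\n", s.c_str());
        }
    }
    return 0;
}
```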
@@ -863,7 +883,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         sampler_type_chars += common_sampler_type_to_chr(sampler);
         sampler_type_names += common_sampler_type_to_str(sampler) + ";";
     }
-    sampler_type_names.
+    if (!sampler_type_names.empty()) {
+        sampler_type_names.pop_back(); // remove last semicolon
+    }


     /**
@@ -1127,7 +1149,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_env("LLAMA_ARG_CTX_CHECKPOINTS").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
     add_opt(common_arg(
-        {"
+        {"-cram", "--cache-ram"}, "N",
         string_format("set the maximum cache size in MiB (default: %d, -1 - no limit, 0 - disable)"
             "[(more info)](https://github.com/ggml-org/llama.cpp/pull/16391)", params.cache_ram_mib),
         [](common_params & params, int value) {
@@ -1135,7 +1157,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_env("LLAMA_ARG_CACHE_RAM").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
     add_opt(common_arg(
-        {"
+        {"-kvu", "--kv-unified"},
         "use single unified KV buffer shared across all sequences (default: enabled if number of slots is auto)",
         [](common_params & params) {
             params.kv_unified = true;
@@ -1184,7 +1206,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, const std::string & value) {
             params.system_prompt = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_DIFFUSION}));
+    ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_DIFFUSION, LLAMA_EXAMPLE_MTMD}));
     add_opt(common_arg(
         {"--perf"},
         {"--no-perf"},
@@ -1226,13 +1248,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_DIFFUSION}));
     add_opt(common_arg(
         {"--in-file"}, "FNAME",
-        "an input file (
+        "an input file (use comma-separated values to specify multiple files)",
         [](common_params & params, const std::string & value) {
-            std::
-
-
+            for (const auto & item : string_split<std::string>(value, ',')) {
+                std::ifstream file(item);
+                if (!file) {
+                    throw std::runtime_error(string_format("error: failed to open file '%s'\n", item.c_str()));
+                }
+                params.in_files.push_back(item);
             }
-            params.in_files.push_back(value);
         }
     ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
     add_opt(common_arg(
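The --in-file handler above is the first of several options in this release (--image, --lora, --context-file, and others further down) that move from a single value per flag to a comma-separated list split with llama.cpp's `string_split<std::string>(value, ',')` helper. A standalone sketch of the same pattern, where `split_csv` is an illustrative stand-in for that helper:

```cpp
#include <fstream>
#include <iostream>
#include <sstream>
#include <stdexcept>
#include <string>
#include <vector>

// split a comma-separated string into items (stand-in for string_split<std::string>(value, ','))
static std::vector<std::string> split_csv(const std::string & value) {
    std::vector<std::string> out;
    std::stringstream ss(value);
    std::string item;
    while (std::getline(ss, item, ',')) {
        out.push_back(item);
    }
    return out;
}

int main() {
    std::vector<std::string> in_files;
    const std::string value = "prompts-a.txt,prompts-b.txt"; // e.g. --in-file prompts-a.txt,prompts-b.txt

    try {
        for (const auto & item : split_csv(value)) {
            std::ifstream file(item);
            if (!file) { // same early check as the new handler: reject unreadable files
                throw std::runtime_error("failed to open file '" + item + "'");
            }
            in_files.push_back(item);
        }
    } catch (const std::exception & e) {
        std::cerr << e.what() << "\n";
        return 1;
    }
    std::cout << "loaded " << in_files.size() << " input files\n";
    return 0;
}
```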
@@ -1401,7 +1425,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_sparam());
     add_opt(common_arg(
-        {"--
+        {"--sampler-seq", "--sampling-seq"}, "SEQUENCE",
         string_format("simplified sequence for samplers that will be used (default: %s)", sampler_type_chars.c_str()),
         [](common_params & params, const std::string & value) {
             params.sampling.samplers = common_sampler_types_from_chars(value);
@@ -1969,9 +1993,11 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_examples(mmproj_examples).set_env("LLAMA_ARG_MMPROJ_OFFLOAD"));
     add_opt(common_arg(
         {"--image", "--audio"}, "FILE",
-        "path to an image or audio file. use with multimodal models,
+        "path to an image or audio file. use with multimodal models, use comma-separated values for multiple files\n",
         [](common_params & params, const std::string & value) {
-
+            for (const auto & item : string_split<std::string>(value, ',')) {
+                params.image.emplace_back(item);
+            }
         }
     ).set_examples({LLAMA_EXAMPLE_MTMD, LLAMA_EXAMPLE_CLI}));
     add_opt(common_arg(
@@ -2057,26 +2083,26 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ));
     add_opt(common_arg(
-        {"
+        {"-ot", "--override-tensor"}, "<tensor name pattern>=<buffer type>,...",
         "override tensor buffer type", [](common_params & params, const std::string & value) {
             parse_tensor_buffer_overrides(value, params.tensor_buft_overrides);
         }
     ));
     add_opt(common_arg(
-        {"--override-tensor-draft"
+        {"-otd", "--override-tensor-draft"}, "<tensor name pattern>=<buffer type>,...",
         "override tensor buffer type for draft model", [](common_params & params, const std::string & value) {
             parse_tensor_buffer_overrides(value, params.speculative.tensor_buft_overrides);
         }
     ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
     add_opt(common_arg(
-        {"
+        {"-cmoe", "--cpu-moe"},
         "keep all Mixture of Experts (MoE) weights in the CPU",
         [](common_params & params) {
             params.tensor_buft_overrides.push_back(llm_ffn_exps_cpu_override());
         }
     ).set_env("LLAMA_ARG_CPU_MOE"));
     add_opt(common_arg(
-        {"--n-cpu-moe"
+        {"-ncmoe", "--n-cpu-moe"}, "N",
         "keep the Mixture of Experts (MoE) weights of the first N layers in the CPU",
         [](common_params & params, int value) {
             if (value < 0) {
@@ -2091,14 +2117,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_env("LLAMA_ARG_N_CPU_MOE"));
     add_opt(common_arg(
-        {"--cpu-moe-draft"
+        {"-cmoed", "--cpu-moe-draft"},
         "keep all Mixture of Experts (MoE) weights in the CPU for the draft model",
         [](common_params & params) {
             params.speculative.tensor_buft_overrides.push_back(llm_ffn_exps_cpu_override());
         }
     ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_CPU_MOE_DRAFT"));
     add_opt(common_arg(
-        {"--n-cpu-moe-draft"
+        {"-ncmoed", "--n-cpu-moe-draft"}, "N",
         "keep the Mixture of Experts (MoE) weights of the first N layers in the CPU for the draft model",
         [](common_params & params, int value) {
             if (value < 0) {
@@ -2218,12 +2244,39 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ));
     add_opt(common_arg(
-        {"--override-kv"}, "KEY=TYPE:VALUE",
-        "advanced option to override model metadata by key.
-        "types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false",
+        {"--override-kv"}, "KEY=TYPE:VALUE,...",
+        "advanced option to override model metadata by key. to specify multiple overrides, either use comma-separated or repeat this argument.\n"
+        "types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false,tokenizer.ggml.add_eos_token=bool:false",
         [](common_params & params, const std::string & value) {
-
-
+            std::vector<std::string> kv_overrides;
+
+            std::string current;
+            bool escaping = false;
+
+            for (const char c : value) {
+                if (escaping) {
+                    current.push_back(c);
+                    escaping = false;
+                } else if (c == '\\') {
+                    escaping = true;
+                } else if (c == ',') {
+                    kv_overrides.push_back(current);
+                    current.clear();
+                } else {
+                    current.push_back(c);
+                }
+            }
+
+            if (escaping) {
+                current.push_back('\\');
+            }
+
+            kv_overrides.push_back(current);
+
+            for (const auto & kv_override : kv_overrides) {
+                if (!string_parse_kv_override(kv_override.c_str(), params.kv_overrides)) {
+                    throw std::runtime_error(string_format("error: Invalid type for KV override: %s\n", kv_override.c_str()));
+                }
             }
         }
     ));
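The new --override-kv handler above splits on commas but treats a backslash as an escape for the next character, so a comma that belongs to a value can be written as `\,` without being read as a separator between overrides. A standalone sketch of that splitting loop (the `split_kv_overrides` name is illustrative, not from the package):

```cpp
#include <iostream>
#include <string>
#include <vector>

// escape-aware comma split, mirroring the loop in the --override-kv handler above
static std::vector<std::string> split_kv_overrides(const std::string & value) {
    std::vector<std::string> kv_overrides;
    std::string current;
    bool escaping = false;

    for (const char c : value) {
        if (escaping) {
            current.push_back(c);   // escaped character is taken literally
            escaping = false;
        } else if (c == '\\') {
            escaping = true;
        } else if (c == ',') {
            kv_overrides.push_back(current);
            current.clear();
        } else {
            current.push_back(c);
        }
    }
    if (escaping) {
        current.push_back('\\');    // a trailing backslash is kept as-is
    }
    kv_overrides.push_back(current);
    return kv_overrides;
}

int main() {
    // two overrides; the second value contains an escaped comma
    const std::string value = "tokenizer.ggml.add_bos_token=bool:false,some.key=str:a\\,b";
    for (const auto & kv : split_kv_overrides(value)) {
        std::cout << kv << "\n"; // prints the two overrides, with "a,b" unescaped
    }
    return 0;
}
```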
@@ -2237,33 +2290,50 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ));
     add_opt(common_arg(
         {"--lora"}, "FNAME",
-        "path to LoRA adapter (
+        "path to LoRA adapter (use comma-separated values to load multiple adapters)",
         [](common_params & params, const std::string & value) {
-
+            for (const auto & item : string_split<std::string>(value, ',')) {
+                params.lora_adapters.push_back({ item, 1.0, "", "", nullptr });
+            }
         }
         // we define this arg on both COMMON and EXPORT_LORA, so when showing help message of export-lora, it will be categorized as "example-specific" arg
     ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}));
     add_opt(common_arg(
-        {"--lora-scaled"}, "FNAME
-        "path to LoRA adapter with user defined scaling (
-
-
+        {"--lora-scaled"}, "FNAME:SCALE,...",
+        "path to LoRA adapter with user defined scaling (format: FNAME:SCALE,...)\n"
+        "note: use comma-separated values",
+        [](common_params & params, const std::string & value) {
+            for (const auto & item : string_split<std::string>(value, ',')) {
+                auto parts = string_split<std::string>(item, ':');
+                if (parts.size() != 2) {
+                    throw std::invalid_argument("lora-scaled format: FNAME:SCALE");
+                }
+                params.lora_adapters.push_back({ parts[0], std::stof(parts[1]), "", "", nullptr });
+            }
         }
         // we define this arg on both COMMON and EXPORT_LORA, so when showing help message of export-lora, it will be categorized as "example-specific" arg
     ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}));
     add_opt(common_arg(
         {"--control-vector"}, "FNAME",
-        "add a control vector\nnote:
+        "add a control vector\nnote: use comma-separated values to add multiple control vectors",
         [](common_params & params, const std::string & value) {
-
+            for (const auto & item : string_split<std::string>(value, ',')) {
+                params.control_vectors.push_back({ 1.0f, item, });
+            }
         }
     ));
     add_opt(common_arg(
-        {"--control-vector-scaled"}, "FNAME
+        {"--control-vector-scaled"}, "FNAME:SCALE,...",
         "add a control vector with user defined scaling SCALE\n"
-        "note:
-        [](common_params & params, const std::string &
-
+        "note: use comma-separated values (format: FNAME:SCALE,...)",
+        [](common_params & params, const std::string & value) {
+            for (const auto & item : string_split<std::string>(value, ',')) {
+                auto parts = string_split<std::string>(item, ':');
+                if (parts.size() != 2) {
+                    throw std::invalid_argument("control-vector-scaled format: FNAME:SCALE");
+                }
+                params.control_vectors.push_back({ std::stof(parts[1]), parts[0] });
+            }
         }
     ));
     add_opt(common_arg(
@@ -2353,13 +2423,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_env("HF_TOKEN"));
     add_opt(common_arg(
         {"--context-file"}, "FNAME",
-        "file to load context from (
+        "file to load context from (use comma-separated values to specify multiple files)",
         [](common_params & params, const std::string & value) {
-            std::
-
-
+            for (const auto & item : string_split<std::string>(value, ',')) {
+                std::ifstream file(item, std::ios::binary);
+                if (!file) {
+                    throw std::runtime_error(string_format("error: failed to open file '%s'\n", item.c_str()));
+                }
+                params.context_files.push_back(item);
             }
-            params.context_files.push_back(value);
         }
     ).set_examples({LLAMA_EXAMPLE_RETRIEVAL}));
     add_opt(common_arg(
@@ -2550,6 +2622,20 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.api_prefix = value;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_API_PREFIX"));
+    add_opt(common_arg(
+        {"--webui-config"}, "JSON",
+        "JSON that provides default WebUI settings (overrides WebUI defaults)",
+        [](common_params & params, const std::string & value) {
+            params.webui_config_json = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_WEBUI_CONFIG"));
+    add_opt(common_arg(
+        {"--webui-config-file"}, "PATH",
+        "JSON file that provides default WebUI settings (overrides WebUI defaults)",
+        [](common_params & params, const std::string & value) {
+            params.webui_config_json = read_file(value);
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_WEBUI_CONFIG_FILE"));
     add_opt(common_arg(
         {"--webui"},
         {"--no-webui"},
@@ -2566,7 +2652,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_EMBEDDINGS"));
     add_opt(common_arg(
-        {"--
+        {"--rerank", "--reranking"},
         string_format("enable reranking endpoint on server (default: %s)", "disabled"),
         [](common_params & params) {
             params.embedding = true;
@@ -2801,6 +2887,16 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.lora_init_without_apply = true;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}));
+    add_opt(common_arg(
+        {"--sleep-idle-seconds"}, "SECONDS",
+        string_format("number of seconds of idleness after which the server will sleep (default: %d; -1 = disabled)", params.sleep_idle_seconds),
+        [](common_params & params, int value) {
+            if (value == 0 || value < -1) {
+                throw std::invalid_argument("invalid value: cannot be 0 or less than -1");
+            }
+            params.sleep_idle_seconds = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}));
     add_opt(common_arg(
         {"--simple-io"},
         "use basic IO for better compatibility in subprocesses and limited consoles",
@@ -3037,7 +3133,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
     add_opt(common_arg(
-        {"--draft
+        {"--draft", "--draft-n", "--draft-max"}, "N",
         string_format("number of tokens to draft for speculative decoding (default: %d)", params.speculative.n_max),
         [](common_params & params, int value) {
             params.speculative.n_max = value;
@@ -3413,3 +3509,24 @@ common_params_context common_params_parser_init(common_params & params, llama_ex

     return ctx_arg;
 }
+
+void common_params_add_preset_options(std::vector<common_arg> & args) {
+    // arguments below won't be treated as CLI args, only preset options
+    args.push_back(common_arg(
+        {"load-on-startup"}, "NAME",
+        "in server router mode, autoload this model on startup",
+        [](common_params &, const std::string &) { /* unused */ }
+    ).set_env(COMMON_ARG_PRESET_LOAD_ON_STARTUP).set_preset_only());
+
+    // args.push_back(common_arg(
+    //     {"pin"},
+    //     "in server router mode, do not unload this model if models_max is exceeded",
+    //     [](common_params &) { /* unused */ }
+    // ).set_preset_only());
+
+    // args.push_back(common_arg(
+    //     {"unload-idle-seconds"}, "SECONDS",
+    //     "in server router mode, unload models idle for more than this many seconds",
+    //     [](common_params &, int) { /* unused */ }
+    // ).set_preset_only());
+}
package/src/llama.cpp/common/arg.h

@@ -8,6 +8,9 @@
 #include <vector>
 #include <cstring>

+// pseudo-env variable to identify preset-only arguments
+#define COMMON_ARG_PRESET_LOAD_ON_STARTUP "__PRESET_LOAD_ON_STARTUP"
+
 //
 // CLI argument parsing
 //
@@ -22,6 +25,7 @@ struct common_arg {
     const char * env = nullptr;
     std::string help;
     bool is_sparam = false; // is current arg a sampling param?
+    bool is_preset_only = false; // is current arg preset-only (not treated as CLI arg)
     void (*handler_void) (common_params & params) = nullptr;
     void (*handler_string) (common_params & params, const std::string &) = nullptr;
     void (*handler_str_str)(common_params & params, const std::string &, const std::string &) = nullptr;
@@ -70,6 +74,7 @@ struct common_arg {
     common_arg & set_excludes(std::initializer_list<enum llama_example> excludes);
     common_arg & set_env(const char * env);
     common_arg & set_sparam();
+    common_arg & set_preset_only();
     bool in_example(enum llama_example ex);
     bool is_exclude(enum llama_example ex);
     bool get_value_from_env(std::string & output) const;
@@ -114,9 +119,13 @@ struct common_params_context {
 bool common_params_parse(int argc, char ** argv, common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);

 // parse input arguments from CLI into a map
-// TODO: support repeated args in the future
 bool common_params_to_map(int argc, char ** argv, llama_example ex, std::map<common_arg, std::string> & out_map);

+// populate preset-only arguments
+// these arguments are not treated as command line arguments
+// see: https://github.com/ggml-org/llama.cpp/issues/18163
+void common_params_add_preset_options(std::vector<common_arg> & args);
+
 // initialize argument parser context - used by test-arg-parser and preset
 common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);

package/src/llama.cpp/common/common.cpp

@@ -1092,7 +1092,7 @@ common_init_result::common_init_result(common_params & params) :
     auto cparams = common_context_params_to_llama(params);

     if (params.fit_params) {
-        LOG_INF("%s: fitting params to device memory,
+        LOG_INF("%s: fitting params to device memory, for bugs during this step try to reproduce them with -fit off, or provide --verbose logs if the bug only occurs with -fit on\n", __func__);
         llama_params_fit(params.model.path.c_str(), &mparams, &cparams,
             params.tensor_split, params.tensor_buft_overrides.data(), params.fit_params_target, params.fit_params_min_ctx,
             params.verbosity >= 4 ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_ERROR);
package/src/llama.cpp/common/common.h

@@ -476,7 +476,8 @@ struct common_params {
     bool enable_chat_template = true;
     common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
     int reasoning_budget = -1;
-    bool prefill_assistant = true;
+    bool prefill_assistant = true; // if true, any trailing assistant message will be prefilled into the response
+    int sleep_idle_seconds = -1; // if >0, server will sleep after this many seconds of idle time

     std::vector<std::string> api_keys;

@@ -485,8 +486,11 @@
 
     std::map<std::string, std::string> default_template_kwargs;

+    // webui configs
+    bool webui = true;
+    std::string webui_config_json;
+
     // "advanced" endpoints are disabled by default for better security
-    bool webui = true;
     bool endpoint_slots = true;
     bool endpoint_props = false; // only control POST requests, not GET
     bool endpoint_metrics = false;