@fugood/llama.node 1.4.7 → 1.4.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/binding.ts +8 -0
- package/package.json +15 -15
- package/scripts/llama.cpp.patch +22 -23
- package/src/LlamaContext.cpp +2 -2
- package/src/llama.cpp/common/CMakeLists.txt +2 -0
- package/src/llama.cpp/common/arg.cpp +364 -193
- package/src/llama.cpp/common/arg.h +43 -2
- package/src/llama.cpp/common/chat-peg-parser.cpp +16 -2
- package/src/llama.cpp/common/chat.cpp +140 -0
- package/src/llama.cpp/common/common.cpp +130 -67
- package/src/llama.cpp/common/common.h +40 -16
- package/src/llama.cpp/common/console.cpp +98 -18
- package/src/llama.cpp/common/console.h +30 -8
- package/src/llama.cpp/common/download.cpp +69 -25
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +132 -3
- package/src/llama.cpp/common/json-schema-to-grammar.h +20 -0
- package/src/llama.cpp/common/log.cpp +5 -0
- package/src/llama.cpp/common/log.h +1 -0
- package/src/llama.cpp/common/peg-parser.cpp +1 -1
- package/src/llama.cpp/common/preset.cpp +206 -0
- package/src/llama.cpp/common/preset.h +32 -0
- package/src/llama.cpp/common/sampling.cpp +91 -92
- package/src/llama.cpp/common/sampling.h +11 -6
- package/src/llama.cpp/common/speculative.cpp +1 -1
- package/src/llama.cpp/ggml/CMakeLists.txt +4 -0
- package/src/llama.cpp/ggml/include/ggml-alloc.h +9 -0
- package/src/llama.cpp/ggml/include/ggml-backend.h +1 -0
- package/src/llama.cpp/ggml/include/ggml-cpu.h +1 -0
- package/src/llama.cpp/ggml/include/ggml.h +7 -8
- package/src/llama.cpp/ggml/src/CMakeLists.txt +3 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +2 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +60 -39
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +4 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +2 -1
- package/src/llama.cpp/include/llama.h +18 -1
- package/src/llama.cpp/src/llama-arch.cpp +1890 -2248
- package/src/llama.cpp/src/llama-arch.h +9 -2
- package/src/llama.cpp/src/llama-batch.cpp +12 -2
- package/src/llama.cpp/src/llama-batch.h +4 -2
- package/src/llama.cpp/src/llama-context.cpp +93 -23
- package/src/llama.cpp/src/llama-context.h +8 -2
- package/src/llama.cpp/src/llama-graph.cpp +84 -16
- package/src/llama.cpp/src/llama-graph.h +17 -4
- package/src/llama.cpp/src/llama-hparams.cpp +6 -0
- package/src/llama.cpp/src/llama-hparams.h +5 -1
- package/src/llama.cpp/src/llama-impl.cpp +4 -0
- package/src/llama.cpp/src/llama-kv-cache.cpp +90 -42
- package/src/llama.cpp/src/llama-kv-cache.h +19 -2
- package/src/llama.cpp/src/llama-memory-hybrid.cpp +1 -1
- package/src/llama.cpp/src/llama-model-loader.cpp +2 -0
- package/src/llama.cpp/src/llama-model-loader.h +2 -0
- package/src/llama.cpp/src/llama-model.cpp +103 -44
- package/src/llama.cpp/src/llama-model.h +1 -0
- package/src/llama.cpp/src/llama-quant.cpp +1 -1
- package/src/llama.cpp/src/llama-vocab.cpp +2 -1
- package/src/llama.cpp/src/llama.cpp +675 -1
- package/src/llama.cpp/src/models/deepseek2.cpp +9 -5
- package/src/llama.cpp/src/models/glm4-moe.cpp +28 -11
- package/src/llama.cpp/src/models/glm4.cpp +27 -4
- package/src/llama.cpp/src/models/models.h +5 -5
- package/src/llama.cpp/src/models/nemotron-h.cpp +35 -6
- package/src/llama.cpp/src/models/qwen2.cpp +12 -3
- package/src/llama.cpp/src/models/qwen3next.cpp +81 -266
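
Most of the churn in common/arg.cpp below comes from converting paired enable/disable flags into a single common_arg definition that takes a positive and a negative argument list plus a bool-valued handler; the matching LLAMA_ARG_NO_* environment variables remain accepted for backward compatibility (see get_value_from_env / has_value_from_env). A representative entry, excerpted from the hunks that follow:

    add_opt(common_arg(
        {"--context-shift"},
        {"--no-context-shift"},
        string_format("whether to use context shift on infinite text generation (default: %s)", params.ctx_shift ? "enabled" : "disabled"),
        [](common_params & params, bool value) {
            params.ctx_shift = value;
        }
    ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY}).set_env("LLAMA_ARG_CONTEXT_SHIFT"));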
|
@@ -20,6 +20,7 @@
|
|
|
20
20
|
#include <nlohmann/json.hpp>
|
|
21
21
|
|
|
22
22
|
#include <algorithm>
|
|
23
|
+
#include <cinttypes>
|
|
23
24
|
#include <climits>
|
|
24
25
|
#include <cstdarg>
|
|
25
26
|
#include <fstream>
|
|
@@ -47,10 +48,12 @@
|
|
|
47
48
|
#define LLAMA_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083
|
|
48
49
|
|
|
49
50
|
using json = nlohmann::ordered_json;
|
|
51
|
+
using namespace common_arg_utils;
|
|
50
52
|
|
|
51
53
|
static std::initializer_list<enum llama_example> mmproj_examples = {
|
|
52
54
|
LLAMA_EXAMPLE_MTMD,
|
|
53
55
|
LLAMA_EXAMPLE_SERVER,
|
|
56
|
+
LLAMA_EXAMPLE_CLI,
|
|
54
57
|
};
|
|
55
58
|
|
|
56
59
|
static std::string read_file(const std::string & fname) {
|
|
@@ -63,6 +66,15 @@ static std::string read_file(const std::string & fname) {
|
|
|
63
66
|
return content;
|
|
64
67
|
}
|
|
65
68
|
|
|
69
|
+
static const std::vector<common_arg> & get_common_arg_defs() {
|
|
70
|
+
static const std::vector<common_arg> options = [] {
|
|
71
|
+
common_params params;
|
|
72
|
+
auto ctx = common_params_parser_init(params, LLAMA_EXAMPLE_SERVER, nullptr);
|
|
73
|
+
return ctx.options;
|
|
74
|
+
}();
|
|
75
|
+
return options;
|
|
76
|
+
}
|
|
77
|
+
|
|
66
78
|
common_arg & common_arg::set_examples(std::initializer_list<enum llama_example> examples) {
|
|
67
79
|
this->examples = examples;
|
|
68
80
|
return *this;
|
|
@@ -94,6 +106,16 @@ bool common_arg::is_exclude(enum llama_example ex) {
|
|
|
94
106
|
|
|
95
107
|
bool common_arg::get_value_from_env(std::string & output) const {
|
|
96
108
|
if (env == nullptr) return false;
|
|
109
|
+
if (!args_neg.empty()) {
|
|
110
|
+
// for compatibility, we need to check LLAMA_ARG_NO_ env as well
|
|
111
|
+
std::string neg_env = env;
|
|
112
|
+
string_replace_all(neg_env, "LLAMA_ARG_", "LLAMA_ARG_NO_");
|
|
113
|
+
char * neg_value = std::getenv(neg_env.c_str());
|
|
114
|
+
if (neg_value) {
|
|
115
|
+
output = "0"; // falsey
|
|
116
|
+
return true;
|
|
117
|
+
}
|
|
118
|
+
}
|
|
97
119
|
char * value = std::getenv(env);
|
|
98
120
|
if (value) {
|
|
99
121
|
output = value;
|
|
@@ -103,6 +125,14 @@ bool common_arg::get_value_from_env(std::string & output) const {
|
|
|
103
125
|
}
|
|
104
126
|
|
|
105
127
|
bool common_arg::has_value_from_env() const {
|
|
128
|
+
if (env != nullptr && !args_neg.empty()) {
|
|
129
|
+
// for compatibility, we need to check LLAMA_ARG_NO_ env as well
|
|
130
|
+
std::string neg_env = env;
|
|
131
|
+
string_replace_all(neg_env, "LLAMA_ARG_", "LLAMA_ARG_NO_");
|
|
132
|
+
if (std::getenv(neg_env.c_str())) {
|
|
133
|
+
return true;
|
|
134
|
+
}
|
|
135
|
+
}
|
|
106
136
|
return env != nullptr && std::getenv(env);
|
|
107
137
|
}
|
|
108
138
|
|
|
@@ -133,16 +163,17 @@ static std::vector<std::string> break_str_into_lines(std::string input, size_t m
|
|
|
133
163
|
return result;
|
|
134
164
|
}
|
|
135
165
|
|
|
136
|
-
std::string common_arg::to_string() {
|
|
166
|
+
std::string common_arg::to_string() const {
|
|
137
167
|
// params for printing to console
|
|
138
168
|
const static int n_leading_spaces = 40;
|
|
139
169
|
const static int n_char_per_line_help = 70; // TODO: detect this based on current console
|
|
140
170
|
std::string leading_spaces(n_leading_spaces, ' ');
|
|
141
171
|
|
|
142
172
|
std::ostringstream ss;
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
173
|
+
auto all_args = get_args(); // also contains args_neg
|
|
174
|
+
for (const auto & arg : all_args) {
|
|
175
|
+
if (arg == all_args.front()) {
|
|
176
|
+
if (all_args.size() == 1) {
|
|
146
177
|
ss << arg;
|
|
147
178
|
} else {
|
|
148
179
|
// first arg is usually abbreviation, we need padding to make it more beautiful
|
|
@@ -151,7 +182,7 @@ std::string common_arg::to_string() {
|
|
|
151
182
|
ss << tmp << spaces;
|
|
152
183
|
}
|
|
153
184
|
} else {
|
|
154
|
-
ss << arg << (arg !=
|
|
185
|
+
ss << arg << (arg != all_args.back() ? ", " : "");
|
|
155
186
|
}
|
|
156
187
|
}
|
|
157
188
|
if (value_hint) ss << " " << value_hint;
|
|
@@ -170,6 +201,31 @@ std::string common_arg::to_string() {
|
|
|
170
201
|
return ss.str();
|
|
171
202
|
}
|
|
172
203
|
|
|
204
|
+
std::vector<std::string> common_arg::get_args() const {
|
|
205
|
+
std::vector<std::string> result;
|
|
206
|
+
for (const auto & arg : args) {
|
|
207
|
+
result.push_back(std::string(arg));
|
|
208
|
+
}
|
|
209
|
+
for (const auto & arg : args_neg) {
|
|
210
|
+
result.push_back(std::string(arg));
|
|
211
|
+
}
|
|
212
|
+
return result;
|
|
213
|
+
}
|
|
214
|
+
|
|
215
|
+
std::vector<std::string> common_arg::get_env() const {
|
|
216
|
+
std::vector<std::string> result;
|
|
217
|
+
if (env) {
|
|
218
|
+
result.push_back(std::string(env));
|
|
219
|
+
}
|
|
220
|
+
if (!args_neg.empty() && env) {
|
|
221
|
+
// for compatibility, we need to add LLAMA_ARG_NO_ variant
|
|
222
|
+
std::string neg_env = env;
|
|
223
|
+
string_replace_all(neg_env, "LLAMA_ARG_", "LLAMA_ARG_NO_");
|
|
224
|
+
result.push_back(neg_env);
|
|
225
|
+
}
|
|
226
|
+
return result;
|
|
227
|
+
}
|
|
228
|
+
|
|
173
229
|
//
|
|
174
230
|
// utils
|
|
175
231
|
//
|
|
@@ -305,6 +361,16 @@ static std::string get_all_kv_cache_types() {
|
|
|
305
361
|
return msg.str();
|
|
306
362
|
}
|
|
307
363
|
|
|
364
|
+
static bool parse_bool_value(const std::string & value) {
|
|
365
|
+
if (is_truthy(value)) {
|
|
366
|
+
return true;
|
|
367
|
+
} else if (is_falsey(value)) {
|
|
368
|
+
return false;
|
|
369
|
+
} else {
|
|
370
|
+
throw std::invalid_argument("invalid boolean value");
|
|
371
|
+
}
|
|
372
|
+
}
|
|
373
|
+
|
|
308
374
|
//
|
|
309
375
|
// CLI argument parsing functions
|
|
310
376
|
//
|
|
@@ -312,10 +378,13 @@ static std::string get_all_kv_cache_types() {
|
|
|
312
378
|
static bool common_params_parse_ex(int argc, char ** argv, common_params_context & ctx_arg) {
|
|
313
379
|
common_params & params = ctx_arg.params;
|
|
314
380
|
|
|
315
|
-
std::unordered_map<std::string, common_arg
|
|
381
|
+
std::unordered_map<std::string, std::pair<common_arg *, bool>> arg_to_options;
|
|
316
382
|
for (auto & opt : ctx_arg.options) {
|
|
317
383
|
for (const auto & arg : opt.args) {
|
|
318
|
-
arg_to_options[arg] = &opt;
|
|
384
|
+
arg_to_options[arg] = {&opt, /* is_positive */ true};
|
|
385
|
+
}
|
|
386
|
+
for (const auto & arg : opt.args_neg) {
|
|
387
|
+
arg_to_options[arg] = {&opt, /* is_positive */ false};
|
|
319
388
|
}
|
|
320
389
|
}
|
|
321
390
|
|
|
@@ -324,12 +393,15 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
|
|
|
324
393
|
std::string value;
|
|
325
394
|
if (opt.get_value_from_env(value)) {
|
|
326
395
|
try {
|
|
327
|
-
if (opt.handler_void && (value
|
|
396
|
+
if (opt.handler_void && is_truthy(value)) {
|
|
328
397
|
opt.handler_void(params);
|
|
329
398
|
}
|
|
330
399
|
if (opt.handler_int) {
|
|
331
400
|
opt.handler_int(params, std::stoi(value));
|
|
332
401
|
}
|
|
402
|
+
if (opt.handler_bool) {
|
|
403
|
+
opt.handler_bool(params, parse_bool_value(value));
|
|
404
|
+
}
|
|
333
405
|
if (opt.handler_string) {
|
|
334
406
|
opt.handler_string(params, value);
|
|
335
407
|
continue;
|
|
@@ -358,7 +430,9 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
|
|
|
358
430
|
if (arg_to_options.find(arg) == arg_to_options.end()) {
|
|
359
431
|
throw std::invalid_argument(string_format("error: invalid argument: %s", arg.c_str()));
|
|
360
432
|
}
|
|
361
|
-
auto
|
|
433
|
+
auto & tmp = arg_to_options[arg];
|
|
434
|
+
auto opt = *tmp.first;
|
|
435
|
+
bool is_positive = tmp.second;
|
|
362
436
|
if (opt.has_value_from_env()) {
|
|
363
437
|
fprintf(stderr, "warn: %s environment variable is set, but will be overwritten by command line argument %s\n", opt.env, arg.c_str());
|
|
364
438
|
}
|
|
@@ -367,6 +441,10 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
|
|
|
367
441
|
opt.handler_void(params);
|
|
368
442
|
continue;
|
|
369
443
|
}
|
|
444
|
+
if (opt.handler_bool) {
|
|
445
|
+
opt.handler_bool(params, is_positive);
|
|
446
|
+
continue;
|
|
447
|
+
}
|
|
370
448
|
|
|
371
449
|
// arg with single value
|
|
372
450
|
check_arg(i);
|
|
@@ -391,7 +469,7 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
|
|
|
391
469
|
throw std::invalid_argument(string_format(
|
|
392
470
|
"error while handling argument \"%s\": %s\n\n"
|
|
393
471
|
"usage:\n%s\n\nto show complete usage, run with -h",
|
|
394
|
-
arg.c_str(), e.what(),
|
|
472
|
+
arg.c_str(), e.what(), opt.to_string().c_str()));
|
|
395
473
|
}
|
|
396
474
|
}
|
|
397
475
|
|
|
@@ -427,7 +505,7 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
|
|
|
427
505
|
|
|
428
506
|
// model is required (except for server)
|
|
429
507
|
// TODO @ngxson : maybe show a list of available models in CLI in this case
|
|
430
|
-
if (params.model.path.empty() && ctx_arg.ex != LLAMA_EXAMPLE_SERVER && !params.usage) {
|
|
508
|
+
if (params.model.path.empty() && ctx_arg.ex != LLAMA_EXAMPLE_SERVER && !params.usage && !params.completion) {
|
|
431
509
|
throw std::invalid_argument("error: --model is required\n");
|
|
432
510
|
}
|
|
433
511
|
|
|
@@ -452,7 +530,9 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
|
|
|
452
530
|
params.kv_overrides.back().key[0] = 0;
|
|
453
531
|
}
|
|
454
532
|
|
|
455
|
-
|
|
533
|
+
// pad tensor_buft_overrides for llama_params_fit:
|
|
534
|
+
const size_t ntbo = llama_max_tensor_buft_overrides();
|
|
535
|
+
while (params.tensor_buft_overrides.size() < ntbo) {
|
|
456
536
|
params.tensor_buft_overrides.push_back({nullptr, nullptr});
|
|
457
537
|
}
|
|
458
538
|
|
|
@@ -468,6 +548,8 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
|
|
|
468
548
|
));
|
|
469
549
|
}
|
|
470
550
|
|
|
551
|
+
common_log_set_verbosity_thold(params.verbosity);
|
|
552
|
+
|
|
471
553
|
return true;
|
|
472
554
|
}
|
|
473
555
|
|
|
@@ -560,6 +642,7 @@ static void common_params_print_completion(common_params_context & ctx_arg) {
|
|
|
560
642
|
"llama-batched-bench",
|
|
561
643
|
"llama-bench",
|
|
562
644
|
"llama-cli",
|
|
645
|
+
"llama-completion",
|
|
563
646
|
"llama-convert-llama2c-to-ggml",
|
|
564
647
|
"llama-cvector-generator",
|
|
565
648
|
"llama-embedding",
|
|
@@ -644,6 +727,56 @@ static void add_rpc_devices(const std::string & servers) {
|
|
|
644
727
|
}
|
|
645
728
|
}
|
|
646
729
|
|
|
730
|
+
bool common_params_to_map(int argc, char ** argv, llama_example ex, std::map<common_arg, std::string> & out_map) {
|
|
731
|
+
common_params dummy_params;
|
|
732
|
+
common_params_context ctx_arg = common_params_parser_init(dummy_params, ex, nullptr);
|
|
733
|
+
|
|
734
|
+
std::unordered_map<std::string, common_arg *> arg_to_options;
|
|
735
|
+
for (auto & opt : ctx_arg.options) {
|
|
736
|
+
for (const auto & arg : opt.args) {
|
|
737
|
+
arg_to_options[arg] = &opt;
|
|
738
|
+
}
|
|
739
|
+
for (const auto & arg : opt.args_neg) {
|
|
740
|
+
arg_to_options[arg] = &opt;
|
|
741
|
+
}
|
|
742
|
+
}
|
|
743
|
+
|
|
744
|
+
// TODO @ngxson : find a way to deduplicate this code
|
|
745
|
+
|
|
746
|
+
// handle command line arguments
|
|
747
|
+
auto check_arg = [&](int i) {
|
|
748
|
+
if (i+1 >= argc) {
|
|
749
|
+
throw std::invalid_argument("expected value for argument");
|
|
750
|
+
}
|
|
751
|
+
};
|
|
752
|
+
|
|
753
|
+
for (int i = 1; i < argc; i++) {
|
|
754
|
+
const std::string arg_prefix = "--";
|
|
755
|
+
|
|
756
|
+
std::string arg = argv[i];
|
|
757
|
+
if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
|
|
758
|
+
std::replace(arg.begin(), arg.end(), '_', '-');
|
|
759
|
+
}
|
|
760
|
+
if (arg_to_options.find(arg) == arg_to_options.end()) {
|
|
761
|
+
throw std::invalid_argument(string_format("error: invalid argument: %s", arg.c_str()));
|
|
762
|
+
}
|
|
763
|
+
auto opt = *arg_to_options[arg];
|
|
764
|
+
std::string val;
|
|
765
|
+
if (opt.value_hint != nullptr) {
|
|
766
|
+
// arg with single value
|
|
767
|
+
check_arg(i);
|
|
768
|
+
val = argv[++i];
|
|
769
|
+
}
|
|
770
|
+
if (opt.value_hint_2 != nullptr) {
|
|
771
|
+
// TODO: support arg with 2 values
|
|
772
|
+
throw std::invalid_argument("error: argument with 2 values is not yet supported\n");
|
|
773
|
+
}
|
|
774
|
+
out_map[opt] = val;
|
|
775
|
+
}
|
|
776
|
+
|
|
777
|
+
return true;
|
|
778
|
+
}
|
|
779
|
+
|
|
647
780
|
bool common_params_parse(int argc, char ** argv, common_params & params, llama_example ex, void(*print_usage)(int, char **)) {
|
|
648
781
|
auto ctx_arg = common_params_parser_init(params, ex, print_usage);
|
|
649
782
|
const common_params params_org = ctx_arg.params; // the example can modify the default params
|
|
@@ -689,23 +822,30 @@ static std::string list_builtin_chat_templates() {
|
|
|
689
822
|
return msg.str();
|
|
690
823
|
}
|
|
691
824
|
|
|
692
|
-
|
|
693
|
-
return value == "on" || value == "enabled" || value == "1";
|
|
825
|
+
bool common_arg_utils::is_truthy(const std::string & value) {
|
|
826
|
+
return value == "on" || value == "enabled" || value == "true" || value == "1";
|
|
694
827
|
}
|
|
695
828
|
|
|
696
|
-
|
|
697
|
-
return value == "off" || value == "disabled" || value == "0";
|
|
829
|
+
bool common_arg_utils::is_falsey(const std::string & value) {
|
|
830
|
+
return value == "off" || value == "disabled" || value == "false" || value == "0";
|
|
698
831
|
}
|
|
699
832
|
|
|
700
|
-
|
|
833
|
+
bool common_arg_utils::is_autoy(const std::string & value) {
|
|
701
834
|
return value == "auto" || value == "-1";
|
|
702
835
|
}
|
|
703
836
|
|
|
704
837
|
common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **)) {
|
|
705
|
-
// default
|
|
706
|
-
//
|
|
707
|
-
if (ex ==
|
|
708
|
-
params.use_jinja =
|
|
838
|
+
// per-example default params
|
|
839
|
+
// we define here to make sure it's included in llama-gen-docs
|
|
840
|
+
if (ex == LLAMA_EXAMPLE_COMPLETION) {
|
|
841
|
+
params.use_jinja = false; // disable jinja by default
|
|
842
|
+
|
|
843
|
+
} else if (ex == LLAMA_EXAMPLE_MTMD) {
|
|
844
|
+
params.use_jinja = false; // disable jinja by default
|
|
845
|
+
params.sampling.temp = 0.2; // lower temp by default for better quality
|
|
846
|
+
|
|
847
|
+
} else if (ex == LLAMA_EXAMPLE_SERVER) {
|
|
848
|
+
params.n_parallel = -1; // auto by default
|
|
709
849
|
}
|
|
710
850
|
|
|
711
851
|
params.use_color = tty_can_use_colors();
|
|
@@ -785,12 +925,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
785
925
|
}
|
|
786
926
|
));
|
|
787
927
|
add_opt(common_arg(
|
|
928
|
+
{"--display-prompt"},
|
|
788
929
|
{"--no-display-prompt"},
|
|
789
|
-
string_format("
|
|
790
|
-
[](common_params & params) {
|
|
791
|
-
params.display_prompt =
|
|
930
|
+
string_format("whether to print prompt at generation (default: %s)", params.display_prompt ? "true" : "false"),
|
|
931
|
+
[](common_params & params, bool value) {
|
|
932
|
+
params.display_prompt = value;
|
|
792
933
|
}
|
|
793
|
-
).set_examples({
|
|
934
|
+
).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}));
|
|
794
935
|
add_opt(common_arg(
|
|
795
936
|
{"-co", "--color"}, "[on|off|auto]",
|
|
796
937
|
"Colorize output to distinguish prompt and user input from generations ('on', 'off', or 'auto', default: 'auto')\n"
|
|
@@ -807,7 +948,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
807
948
|
string_format("error: unknown value for --color: '%s'\n", value.c_str()));
|
|
808
949
|
}
|
|
809
950
|
}
|
|
810
|
-
).set_examples({
|
|
951
|
+
).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP}));
|
|
811
952
|
add_opt(common_arg(
|
|
812
953
|
{"-t", "--threads"}, "N",
|
|
813
954
|
string_format("number of CPU threads to use during generation (default: %d)", params.cpuparams.n_threads),
|
|
@@ -940,7 +1081,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
940
1081
|
add_opt(common_arg(
|
|
941
1082
|
{"-n", "--predict", "--n-predict"}, "N",
|
|
942
1083
|
string_format(
|
|
943
|
-
ex ==
|
|
1084
|
+
ex == LLAMA_EXAMPLE_COMPLETION
|
|
944
1085
|
? "number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)"
|
|
945
1086
|
: "number of tokens to predict (default: %d, -1 = infinity)",
|
|
946
1087
|
params.n_predict),
|
|
@@ -979,42 +1120,35 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
979
1120
|
).set_env("LLAMA_ARG_SWA_FULL"));
|
|
980
1121
|
add_opt(common_arg(
|
|
981
1122
|
{"--ctx-checkpoints", "--swa-checkpoints"}, "N",
|
|
982
|
-
string_format("max number of context checkpoints to create per slot (default: %d)
|
|
1123
|
+
string_format("max number of context checkpoints to create per slot (default: %d)"
|
|
983
1124
|
"[(more info)](https://github.com/ggml-org/llama.cpp/pull/15293)", params.n_ctx_checkpoints),
|
|
984
1125
|
[](common_params & params, int value) {
|
|
985
1126
|
params.n_ctx_checkpoints = value;
|
|
986
1127
|
}
|
|
987
|
-
).set_env("LLAMA_ARG_CTX_CHECKPOINTS").set_examples({LLAMA_EXAMPLE_SERVER}));
|
|
1128
|
+
).set_env("LLAMA_ARG_CTX_CHECKPOINTS").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
|
|
988
1129
|
add_opt(common_arg(
|
|
989
1130
|
{"--cache-ram", "-cram"}, "N",
|
|
990
|
-
string_format("set the maximum cache size in MiB (default: %d, -1 - no limit, 0 - disable)
|
|
1131
|
+
string_format("set the maximum cache size in MiB (default: %d, -1 - no limit, 0 - disable)"
|
|
991
1132
|
"[(more info)](https://github.com/ggml-org/llama.cpp/pull/16391)", params.cache_ram_mib),
|
|
992
1133
|
[](common_params & params, int value) {
|
|
993
1134
|
params.cache_ram_mib = value;
|
|
994
1135
|
}
|
|
995
|
-
).set_env("LLAMA_ARG_CACHE_RAM").set_examples({LLAMA_EXAMPLE_SERVER}));
|
|
1136
|
+
).set_env("LLAMA_ARG_CACHE_RAM").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
|
|
996
1137
|
add_opt(common_arg(
|
|
997
1138
|
{"--kv-unified", "-kvu"},
|
|
998
|
-
|
|
999
|
-
"[(more info)](https://github.com/ggml-org/llama.cpp/pull/14363)", params.kv_unified ? "true" : "false"),
|
|
1139
|
+
"use single unified KV buffer shared across all sequences (default: enabled if number of slots is auto)",
|
|
1000
1140
|
[](common_params & params) {
|
|
1001
1141
|
params.kv_unified = true;
|
|
1002
1142
|
}
|
|
1003
|
-
).set_env("LLAMA_ARG_KV_UNIFIED"));
|
|
1004
|
-
add_opt(common_arg(
|
|
1005
|
-
{"--no-context-shift"},
|
|
1006
|
-
string_format("disables context shift on infinite text generation (default: %s)", params.ctx_shift ? "disabled" : "enabled"),
|
|
1007
|
-
[](common_params & params) {
|
|
1008
|
-
params.ctx_shift = false;
|
|
1009
|
-
}
|
|
1010
|
-
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY}).set_env("LLAMA_ARG_NO_CONTEXT_SHIFT"));
|
|
1143
|
+
).set_env("LLAMA_ARG_KV_UNIFIED").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_PERPLEXITY}));
|
|
1011
1144
|
add_opt(common_arg(
|
|
1012
1145
|
{"--context-shift"},
|
|
1013
|
-
|
|
1014
|
-
|
|
1015
|
-
|
|
1146
|
+
{"--no-context-shift"},
|
|
1147
|
+
string_format("whether to use context shift on infinite text generation (default: %s)", params.ctx_shift ? "enabled" : "disabled"),
|
|
1148
|
+
[](common_params & params, bool value) {
|
|
1149
|
+
params.ctx_shift = value;
|
|
1016
1150
|
}
|
|
1017
|
-
).set_examples({
|
|
1151
|
+
).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY}).set_env("LLAMA_ARG_CONTEXT_SHIFT"));
|
|
1018
1152
|
add_opt(common_arg(
|
|
1019
1153
|
{"--chunks"}, "N",
|
|
1020
1154
|
string_format("max number of chunks to process (default: %d, -1 = all)", params.n_chunks),
|
|
@@ -1050,15 +1184,24 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
1050
1184
|
[](common_params & params, const std::string & value) {
|
|
1051
1185
|
params.system_prompt = value;
|
|
1052
1186
|
}
|
|
1053
|
-
).set_examples({
|
|
1187
|
+
).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_DIFFUSION}));
|
|
1054
1188
|
add_opt(common_arg(
|
|
1189
|
+
{"--perf"},
|
|
1055
1190
|
{"--no-perf"},
|
|
1056
|
-
string_format("
|
|
1057
|
-
[](common_params & params) {
|
|
1058
|
-
params.no_perf =
|
|
1059
|
-
params.sampling.no_perf =
|
|
1191
|
+
string_format("whether to enable internal libllama performance timings (default: %s)", params.no_perf ? "true" : "false"),
|
|
1192
|
+
[](common_params & params, bool value) {
|
|
1193
|
+
params.no_perf = !value;
|
|
1194
|
+
params.sampling.no_perf = !value;
|
|
1060
1195
|
}
|
|
1061
|
-
).set_env("
|
|
1196
|
+
).set_env("LLAMA_ARG_PERF"));
|
|
1197
|
+
add_opt(common_arg(
|
|
1198
|
+
{"--show-timings"},
|
|
1199
|
+
{"--no-show-timings"},
|
|
1200
|
+
string_format("whether to show timing information after each response (default: %s)", params.show_timings ? "true" : "false"),
|
|
1201
|
+
[](common_params & params, bool value) {
|
|
1202
|
+
params.show_timings = value;
|
|
1203
|
+
}
|
|
1204
|
+
).set_examples({LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_SHOW_TIMINGS"));
|
|
1062
1205
|
add_opt(common_arg(
|
|
1063
1206
|
{"-f", "--file"}, "FNAME",
|
|
1064
1207
|
"a file containing the prompt (default: none)",
|
|
@@ -1080,7 +1223,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
1080
1223
|
params.system_prompt.pop_back();
|
|
1081
1224
|
}
|
|
1082
1225
|
}
|
|
1083
|
-
).set_examples({
|
|
1226
|
+
).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_DIFFUSION}));
|
|
1084
1227
|
add_opt(common_arg(
|
|
1085
1228
|
{"--in-file"}, "FNAME",
|
|
1086
1229
|
"an input file (repeat to specify multiple files)",
|
|
@@ -1110,16 +1253,10 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
1110
1253
|
).set_excludes({LLAMA_EXAMPLE_SERVER}));
|
|
1111
1254
|
add_opt(common_arg(
|
|
1112
1255
|
{"-e", "--escape"},
|
|
1113
|
-
string_format("process escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\) (default: %s)", params.escape ? "true" : "false"),
|
|
1114
|
-
[](common_params & params) {
|
|
1115
|
-
params.escape = true;
|
|
1116
|
-
}
|
|
1117
|
-
));
|
|
1118
|
-
add_opt(common_arg(
|
|
1119
1256
|
{"--no-escape"},
|
|
1120
|
-
"
|
|
1121
|
-
[](common_params & params) {
|
|
1122
|
-
params.escape =
|
|
1257
|
+
string_format("whether to process escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\) (default: %s)", params.escape ? "true" : "false"),
|
|
1258
|
+
[](common_params & params, bool value) {
|
|
1259
|
+
params.escape = value;
|
|
1123
1260
|
}
|
|
1124
1261
|
));
|
|
1125
1262
|
add_opt(common_arg(
|
|
@@ -1128,59 +1265,53 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
1128
1265
|
[](common_params & params, int value) {
|
|
1129
1266
|
params.n_print = value;
|
|
1130
1267
|
}
|
|
1131
|
-
).set_examples({
|
|
1268
|
+
).set_examples({LLAMA_EXAMPLE_COMPLETION}));
|
|
1132
1269
|
add_opt(common_arg(
|
|
1133
1270
|
{"--prompt-cache"}, "FNAME",
|
|
1134
1271
|
"file to cache prompt state for faster startup (default: none)",
|
|
1135
1272
|
[](common_params & params, const std::string & value) {
|
|
1136
1273
|
params.path_prompt_cache = value;
|
|
1137
1274
|
}
|
|
1138
|
-
).set_examples({
|
|
1275
|
+
).set_examples({LLAMA_EXAMPLE_COMPLETION}));
|
|
1139
1276
|
add_opt(common_arg(
|
|
1140
1277
|
{"--prompt-cache-all"},
|
|
1141
1278
|
"if specified, saves user input and generations to cache as well\n",
|
|
1142
1279
|
[](common_params & params) {
|
|
1143
1280
|
params.prompt_cache_all = true;
|
|
1144
1281
|
}
|
|
1145
|
-
).set_examples({
|
|
1282
|
+
).set_examples({LLAMA_EXAMPLE_COMPLETION}));
|
|
1146
1283
|
add_opt(common_arg(
|
|
1147
1284
|
{"--prompt-cache-ro"},
|
|
1148
1285
|
"if specified, uses the prompt cache but does not update it",
|
|
1149
1286
|
[](common_params & params) {
|
|
1150
1287
|
params.prompt_cache_ro = true;
|
|
1151
1288
|
}
|
|
1152
|
-
).set_examples({
|
|
1289
|
+
).set_examples({LLAMA_EXAMPLE_COMPLETION}));
|
|
1153
1290
|
add_opt(common_arg(
|
|
1154
1291
|
{"-r", "--reverse-prompt"}, "PROMPT",
|
|
1155
1292
|
"halt generation at PROMPT, return control in interactive mode\n",
|
|
1156
1293
|
[](common_params & params, const std::string & value) {
|
|
1157
1294
|
params.antiprompt.emplace_back(value);
|
|
1158
1295
|
}
|
|
1159
|
-
).set_examples({
|
|
1296
|
+
).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER}));
|
|
1160
1297
|
add_opt(common_arg(
|
|
1161
1298
|
{"-sp", "--special"},
|
|
1162
1299
|
string_format("special tokens output enabled (default: %s)", params.special ? "true" : "false"),
|
|
1163
1300
|
[](common_params & params) {
|
|
1164
1301
|
params.special = true;
|
|
1165
1302
|
}
|
|
1166
|
-
).set_examples({
|
|
1303
|
+
).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER}));
|
|
1167
1304
|
add_opt(common_arg(
|
|
1168
1305
|
{"-cnv", "--conversation"},
|
|
1169
|
-
"
|
|
1306
|
+
{"-no-cnv", "--no-conversation"},
|
|
1307
|
+
"whether to run in conversation mode:\n"
|
|
1170
1308
|
"- does not print special tokens and suffix/prefix\n"
|
|
1171
1309
|
"- interactive mode is also enabled\n"
|
|
1172
1310
|
"(default: auto enabled if chat template is available)",
|
|
1173
|
-
[](common_params & params) {
|
|
1174
|
-
params.conversation_mode = COMMON_CONVERSATION_MODE_ENABLED;
|
|
1175
|
-
}
|
|
1176
|
-
).set_examples({LLAMA_EXAMPLE_MAIN}));
|
|
1177
|
-
add_opt(common_arg(
|
|
1178
|
-
{"-no-cnv", "--no-conversation"},
|
|
1179
|
-
"force disable conversation mode (default: false)",
|
|
1180
|
-
[](common_params & params) {
|
|
1181
|
-
params.conversation_mode = COMMON_CONVERSATION_MODE_DISABLED;
|
|
1311
|
+
[](common_params & params, bool value) {
|
|
1312
|
+
params.conversation_mode = value ? COMMON_CONVERSATION_MODE_ENABLED : COMMON_CONVERSATION_MODE_DISABLED;
|
|
1182
1313
|
}
|
|
1183
|
-
).set_examples({
|
|
1314
|
+
).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}));
|
|
1184
1315
|
add_opt(common_arg(
|
|
1185
1316
|
{"-st", "--single-turn"},
|
|
1186
1317
|
"run conversation for a single turn only, then exit when done\n"
|
|
@@ -1189,28 +1320,28 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
1189
1320
|
[](common_params & params) {
|
|
1190
1321
|
params.single_turn = true;
|
|
1191
1322
|
}
|
|
1192
|
-
).set_examples({
|
|
1323
|
+
).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}));
|
|
1193
1324
|
add_opt(common_arg(
|
|
1194
1325
|
{"-i", "--interactive"},
|
|
1195
1326
|
string_format("run in interactive mode (default: %s)", params.interactive ? "true" : "false"),
|
|
1196
1327
|
[](common_params & params) {
|
|
1197
1328
|
params.interactive = true;
|
|
1198
1329
|
}
|
|
1199
|
-
).set_examples({
|
|
1330
|
+
).set_examples({LLAMA_EXAMPLE_COMPLETION}));
|
|
1200
1331
|
add_opt(common_arg(
|
|
1201
1332
|
{"-if", "--interactive-first"},
|
|
1202
1333
|
string_format("run in interactive mode and wait for input right away (default: %s)", params.interactive_first ? "true" : "false"),
|
|
1203
1334
|
[](common_params & params) {
|
|
1204
1335
|
params.interactive_first = true;
|
|
1205
1336
|
}
|
|
1206
|
-
).set_examples({
|
|
1337
|
+
).set_examples({LLAMA_EXAMPLE_COMPLETION}));
|
|
1207
1338
|
add_opt(common_arg(
|
|
1208
1339
|
{"-mli", "--multiline-input"},
|
|
1209
1340
|
"allows you to write or paste multiple lines without ending each in '\\'",
|
|
1210
1341
|
[](common_params & params) {
|
|
1211
1342
|
params.multiline_input = true;
|
|
1212
1343
|
}
|
|
1213
|
-
).set_examples({
|
|
1344
|
+
).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}));
|
|
1214
1345
|
add_opt(common_arg(
|
|
1215
1346
|
{"--in-prefix-bos"},
|
|
1216
1347
|
"prefix BOS to user inputs, preceding the `--in-prefix` string",
|
|
@@ -1218,7 +1349,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
1218
1349
|
params.input_prefix_bos = true;
|
|
1219
1350
|
params.enable_chat_template = false;
|
|
1220
1351
|
}
|
|
1221
|
-
).set_examples({
|
|
1352
|
+
).set_examples({LLAMA_EXAMPLE_COMPLETION}));
|
|
1222
1353
|
add_opt(common_arg(
|
|
1223
1354
|
{"--in-prefix"}, "STRING",
|
|
1224
1355
|
"string to prefix user inputs with (default: empty)",
|
|
@@ -1226,7 +1357,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
1226
1357
|
params.input_prefix = value;
|
|
1227
1358
|
params.enable_chat_template = false;
|
|
1228
1359
|
}
|
|
1229
|
-
).set_examples({
|
|
1360
|
+
).set_examples({LLAMA_EXAMPLE_COMPLETION}));
|
|
1230
1361
|
add_opt(common_arg(
|
|
1231
1362
|
{"--in-suffix"}, "STRING",
|
|
1232
1363
|
"string to suffix after user inputs with (default: empty)",
|
|
@@ -1234,14 +1365,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
1234
1365
|
params.input_suffix = value;
|
|
1235
1366
|
params.enable_chat_template = false;
|
|
1236
1367
|
}
|
|
1237
|
-
).set_examples({
|
|
1368
|
+
).set_examples({LLAMA_EXAMPLE_COMPLETION}));
|
|
1238
1369
|
add_opt(common_arg(
|
|
1370
|
+
{"--warmup"},
|
|
1239
1371
|
{"--no-warmup"},
|
|
1240
|
-
"
|
|
1241
|
-
[](common_params & params) {
|
|
1242
|
-
params.warmup =
|
|
1372
|
+
string_format("whether to perform warmup with an empty run (default: %s)", params.warmup ? "enabled" : "disabled"),
|
|
1373
|
+
[](common_params & params, bool value) {
|
|
1374
|
+
params.warmup = value;
|
|
1243
1375
|
}
|
|
1244
|
-
).set_examples({
|
|
1376
|
+
).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MTMD, LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_PERPLEXITY}));
|
|
1245
1377
|
add_opt(common_arg(
|
|
1246
1378
|
{"--spm-infill"},
|
|
1247
1379
|
string_format(
|
|
@@ -1298,7 +1430,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
1298
1430
|
params.sampling.top_k = value;
|
|
1299
1431
|
params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_TOP_K;
|
|
1300
1432
|
}
|
|
1301
|
-
).set_sparam());
|
|
1433
|
+
).set_sparam().set_env("LLAMA_ARG_TOP_K"));
|
|
1302
1434
|
add_opt(common_arg(
|
|
1303
1435
|
{"--top-p"}, "N",
|
|
1304
1436
|
string_format("top-p sampling (default: %.1f, 1.0 = disabled)", (double)params.sampling.top_p),
|
|
@@ -1632,28 +1764,30 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
1632
1764
|
[](common_params & params, int value) {
|
|
1633
1765
|
params.grp_attn_n = value;
|
|
1634
1766
|
}
|
|
1635
|
-
).set_env("LLAMA_ARG_GRP_ATTN_N").set_examples({
|
|
1767
|
+
).set_env("LLAMA_ARG_GRP_ATTN_N").set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_PASSKEY}));
|
|
1636
1768
|
add_opt(common_arg(
|
|
1637
1769
|
{"-gaw", "--grp-attn-w"}, "N",
|
|
1638
1770
|
string_format("group-attention width (default: %d)", params.grp_attn_w),
|
|
1639
1771
|
[](common_params & params, int value) {
|
|
1640
1772
|
params.grp_attn_w = value;
|
|
1641
1773
|
}
|
|
1642
|
-
).set_env("LLAMA_ARG_GRP_ATTN_W").set_examples({
|
|
1774
|
+
).set_env("LLAMA_ARG_GRP_ATTN_W").set_examples({LLAMA_EXAMPLE_COMPLETION}));
|
|
1643
1775
|
add_opt(common_arg(
|
|
1776
|
+
{"-kvo", "--kv-offload"},
|
|
1644
1777
|
{"-nkvo", "--no-kv-offload"},
|
|
1645
|
-
"
|
|
1646
|
-
[](common_params & params) {
|
|
1647
|
-
params.no_kv_offload =
|
|
1778
|
+
string_format("whether to enable KV cache offloading (default: %s)", params.no_kv_offload ? "disabled" : "enabled"),
|
|
1779
|
+
[](common_params & params, bool value) {
|
|
1780
|
+
params.no_kv_offload = !value;
|
|
1648
1781
|
}
|
|
1649
|
-
).set_env("
|
|
1782
|
+
).set_env("LLAMA_ARG_KV_OFFLOAD"));
|
|
1650
1783
|
add_opt(common_arg(
|
|
1784
|
+
{"--repack"},
|
|
1651
1785
|
{"-nr", "--no-repack"},
|
|
1652
|
-
"
|
|
1653
|
-
[](common_params & params) {
|
|
1654
|
-
params.no_extra_bufts =
|
|
1786
|
+
string_format("whether to enable weight repacking (default: %s)", params.no_extra_bufts ? "disabled" : "enabled"),
|
|
1787
|
+
[](common_params & params, bool value) {
|
|
1788
|
+
params.no_extra_bufts = !value;
|
|
1655
1789
|
}
|
|
1656
|
-
).set_env("
|
|
1790
|
+
).set_env("LLAMA_ARG_REPACK"));
|
|
1657
1791
|
add_opt(common_arg(
|
|
1658
1792
|
{"--no-host"},
|
|
1659
1793
|
"bypass host buffer allowing extra buffers to be used",
|
|
@@ -1766,13 +1900,27 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
1766
1900
|
LOG_WRN("DEPRECATED: --defrag-thold is deprecated and no longer necessary to specify\n");
|
|
1767
1901
|
}
|
|
1768
1902
|
).set_env("LLAMA_ARG_DEFRAG_THOLD"));
|
|
1769
|
-
|
|
1770
|
-
|
|
1771
|
-
|
|
1772
|
-
|
|
1773
|
-
params.n_parallel
|
|
1774
|
-
|
|
1775
|
-
|
|
1903
|
+
if (ex == LLAMA_EXAMPLE_SERVER) {
|
|
1904
|
+
// this is to make sure this option appears in the server-specific section of the help message
|
|
1905
|
+
add_opt(common_arg(
|
|
1906
|
+
{"-np", "--parallel"}, "N",
|
|
1907
|
+
string_format("number of server slots (default: %d, -1 = auto)", params.n_parallel),
|
|
1908
|
+
[](common_params & params, int value) {
|
|
1909
|
+
if (value == 0) {
|
|
1910
|
+
throw std::invalid_argument("error: invalid value for n_parallel\n");
|
|
1911
|
+
}
|
|
1912
|
+
params.n_parallel = value;
|
|
1913
|
+
}
|
|
1914
|
+
).set_env("LLAMA_ARG_N_PARALLEL").set_examples({LLAMA_EXAMPLE_SERVER}));
|
|
1915
|
+
} else {
|
|
1916
|
+
add_opt(common_arg(
|
|
1917
|
+
{"-np", "--parallel"}, "N",
|
|
1918
|
+
string_format("number of parallel sequences to decode (default: %d)", params.n_parallel),
|
|
1919
|
+
[](common_params & params, int value) {
|
|
1920
|
+
params.n_parallel = value;
|
|
1921
|
+
}
|
|
1922
|
+
).set_env("LLAMA_ARG_N_PARALLEL"));
|
|
1923
|
+
}
|
|
1776
1924
|
add_opt(common_arg(
|
|
1777
1925
|
{"-ns", "--sequences"}, "N",
|
|
1778
1926
|
string_format("number of sequences to decode (default: %d)", params.n_sequences),
|
|
@@ -1782,20 +1930,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
1782
1930
|
).set_examples({LLAMA_EXAMPLE_PARALLEL}));
|
|
1783
1931
|
add_opt(common_arg(
|
|
1784
1932
|
{"-cb", "--cont-batching"},
|
|
1785
|
-
string_format("enable continuous batching (a.k.a dynamic batching) (default: %s)", params.cont_batching ? "enabled" : "disabled"),
|
|
1786
|
-
[](common_params & params) {
|
|
1787
|
-
params.cont_batching = true;
|
|
1788
|
-
}
|
|
1789
|
-
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CONT_BATCHING"));
|
|
1790
|
-
add_opt(common_arg(
|
|
1791
1933
|
{"-nocb", "--no-cont-batching"},
|
|
1792
|
-
"
|
|
1793
|
-
[](common_params & params) {
|
|
1794
|
-
params.cont_batching =
|
|
1934
|
+
string_format("whether to enable continuous batching (a.k.a dynamic batching) (default: %s)", params.cont_batching ? "enabled" : "disabled"),
|
|
1935
|
+
[](common_params & params, bool value) {
|
|
1936
|
+
params.cont_batching = value;
|
|
1795
1937
|
}
|
|
1796
|
-
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("
|
|
1938
|
+
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CONT_BATCHING"));
|
|
1797
1939
|
add_opt(common_arg(
|
|
1798
|
-
{"--mmproj"}, "FILE",
|
|
1940
|
+
{"-mm", "--mmproj"}, "FILE",
|
|
1799
1941
|
"path to a multimodal projector file. see tools/mtmd/README.md\n"
|
|
1800
1942
|
"note: if -hf is used, this argument can be omitted",
|
|
1801
1943
|
[](common_params & params, const std::string & value) {
|
|
@@ -1803,33 +1945,35 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
1803
1945
|
}
|
|
1804
1946
|
).set_examples(mmproj_examples).set_env("LLAMA_ARG_MMPROJ"));
|
|
1805
1947
|
add_opt(common_arg(
|
|
1806
|
-
{"--mmproj-url"}, "URL",
|
|
1948
|
+
{"-mmu", "--mmproj-url"}, "URL",
|
|
1807
1949
|
"URL to a multimodal projector file. see tools/mtmd/README.md",
|
|
1808
1950
|
[](common_params & params, const std::string & value) {
|
|
1809
1951
|
params.mmproj.url = value;
|
|
1810
1952
|
}
|
|
1811
1953
|
).set_examples(mmproj_examples).set_env("LLAMA_ARG_MMPROJ_URL"));
|
|
1812
1954
|
add_opt(common_arg(
|
|
1813
|
-
{"--
|
|
1814
|
-
"
|
|
1815
|
-
|
|
1816
|
-
|
|
1955
|
+
{"--mmproj-auto"},
|
|
1956
|
+
{"--no-mmproj", "--no-mmproj-auto"},
|
|
1957
|
+
string_format("whether to use multimodal projector file (if available), useful when using -hf (default: %s)", params.no_mmproj ? "disabled" : "enabled"),
|
|
1958
|
+
[](common_params & params, bool value) {
|
|
1959
|
+
params.no_mmproj = !value;
|
|
1817
1960
|
}
|
|
1818
|
-
).set_examples(mmproj_examples).set_env("
|
|
1961
|
+
).set_examples(mmproj_examples).set_env("LLAMA_ARG_MMPROJ_AUTO"));
|
|
1819
1962
|
add_opt(common_arg(
|
|
1963
|
+
{"--mmproj-offload"},
|
|
1820
1964
|
{"--no-mmproj-offload"},
|
|
1821
|
-
"
|
|
1822
|
-
[](common_params & params) {
|
|
1823
|
-
params.mmproj_use_gpu =
|
|
1965
|
+
string_format("whether to enable GPU offloading for multimodal projector (default: %s)", params.mmproj_use_gpu ? "enabled" : "disabled"),
|
|
1966
|
+
[](common_params & params, bool value) {
|
|
1967
|
+
params.mmproj_use_gpu = value;
|
|
1824
1968
|
}
|
|
1825
|
-
).set_examples(mmproj_examples).set_env("
|
|
1969
|
+
).set_examples(mmproj_examples).set_env("LLAMA_ARG_MMPROJ_OFFLOAD"));
|
|
1826
1970
|
add_opt(common_arg(
|
|
1827
1971
|
{"--image", "--audio"}, "FILE",
|
|
1828
1972
|
"path to an image or audio file. use with multimodal models, can be repeated if you have multiple files\n",
|
|
1829
1973
|
[](common_params & params, const std::string & value) {
|
|
1830
1974
|
params.image.emplace_back(value);
|
|
1831
1975
|
}
|
|
1832
|
-
).set_examples({LLAMA_EXAMPLE_MTMD}));
|
|
1976
|
+
).set_examples({LLAMA_EXAMPLE_MTMD, LLAMA_EXAMPLE_CLI}));
|
|
1833
1977
|
add_opt(common_arg(
|
|
1834
1978
|
{"--image-min-tokens"}, "N",
|
|
1835
1979
|
"minimum number of tokens each image can take, only used by vision models with dynamic resolution (default: read from model)",
|
|
@@ -1862,12 +2006,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
1862
2006
|
}
|
|
1863
2007
|
).set_env("LLAMA_ARG_MLOCK"));
|
|
1864
2008
|
add_opt(common_arg(
|
|
2009
|
+
{"--mmap"},
|
|
1865
2010
|
{"--no-mmap"},
|
|
1866
|
-
"
|
|
1867
|
-
[](common_params & params) {
|
|
1868
|
-
params.use_mmap =
|
|
2011
|
+
string_format("whether to memory-map model (if disabled, slower load but may reduce pageouts if not using mlock) (default: %s)", params.use_mmap ? "enabled" : "disabled"),
|
|
2012
|
+
[](common_params & params, bool value) {
|
|
2013
|
+
params.use_mmap = value;
|
|
1869
2014
|
}
|
|
1870
|
-
).set_env("
|
|
2015
|
+
).set_env("LLAMA_ARG_MMAP"));
|
|
1871
2016
|
add_opt(common_arg(
|
|
1872
2017
|
{"--numa"}, "TYPE",
|
|
1873
2018
|
"attempt optimizations that help on some NUMA systems\n"
|
|
@@ -1922,7 +2067,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
1922
2067
|
"override tensor buffer type for draft model", [](common_params & params, const std::string & value) {
|
|
1923
2068
|
parse_tensor_buffer_overrides(value, params.speculative.tensor_buft_overrides);
|
|
1924
2069
|
}
|
|
1925
|
-
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
|
|
2070
|
+
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
|
|
1926
2071
|
add_opt(common_arg(
|
|
1927
2072
|
{"--cpu-moe", "-cmoe"},
|
|
1928
2073
|
"keep all Mixture of Experts (MoE) weights in the CPU",
|
|
@@ -1951,7 +2096,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
1951
2096
|
[](common_params & params) {
|
|
1952
2097
|
params.speculative.tensor_buft_overrides.push_back(llm_ffn_exps_cpu_override());
|
|
1953
2098
|
}
|
|
1954
|
-
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CPU_MOE_DRAFT"));
|
|
2099
|
+
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_CPU_MOE_DRAFT"));
|
|
1955
2100
|
add_opt(common_arg(
|
|
1956
2101
|
{"--n-cpu-moe-draft", "-ncmoed"}, "N",
|
|
1957
2102
|
"keep the Mixture of Experts (MoE) weights of the first N layers in the CPU for the draft model",
|
|
@@ -1965,7 +2110,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
1965
2110
|
params.speculative.tensor_buft_overrides.push_back({buft_overrides_draft.back().c_str(), ggml_backend_cpu_buffer_type()});
|
|
1966
2111
|
}
|
|
1967
2112
|
}
|
|
1968
|
-
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_N_CPU_MOE_DRAFT"));
|
|
2113
|
+
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_N_CPU_MOE_DRAFT"));
|
|
1969
2114
|
add_opt(common_arg(
|
|
1970
2115
|
{"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N",
|
|
1971
2116
|
string_format("max. number of layers to store in VRAM (default: %d)", params.n_gpu_layers),
|
|
@@ -2037,6 +2182,34 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
2037
2182
|
}
|
|
2038
2183
|
}
|
|
2039
2184
|
).set_env("LLAMA_ARG_MAIN_GPU"));
|
|
2185
|
+
add_opt(common_arg(
|
|
2186
|
+
{ "-fit", "--fit" }, "[on|off]",
|
|
2187
|
+
string_format("whether to adjust unset arguments to fit in device memory ('on' or 'off', default: '%s')", params.fit_params ? "on" : "off"),
|
|
2188
|
+
[](common_params & params, const std::string & value) {
|
|
2189
|
+
if (is_truthy(value)) {
|
|
2190
|
+
params.fit_params = true;
|
|
2191
|
+
} else if (is_falsey(value)) {
|
|
2192
|
+
params.fit_params = false;
|
|
2193
|
+
} else {
|
|
2194
|
+
throw std::runtime_error(
|
|
2195
|
+
string_format("error: unkown value for --fit: '%s'\n", value.c_str()));
|
|
2196
|
+
}
|
|
2197
|
+
}
|
|
2198
|
+
).set_env("LLAMA_ARG_FIT"));
|
|
2199
|
+
add_opt(common_arg(
|
|
2200
|
+
{ "-fitt", "--fit-target" }, "MiB",
|
|
2201
|
+
string_format("target margin per device for --fit option, default: %zu", params.fit_params_target/(1024*1024)),
|
|
2202
|
+
[](common_params & params, int value) {
|
|
2203
|
+
params.fit_params_target = value * size_t(1024*1024);
|
|
2204
|
+
}
|
|
2205
|
+
).set_env("LLAMA_ARG_FIT_TARGET"));
|
|
2206
|
+
add_opt(common_arg(
|
|
2207
|
+
{ "-fitc", "--fit-ctx" }, "N",
|
|
2208
|
+
string_format("minimum ctx size that can be set by --fit option, default: %" PRIu32, params.fit_params_min_ctx),
|
|
2209
|
+
[](common_params & params, int value) {
|
|
2210
|
+
params.fit_params_min_ctx = value;
|
|
2211
|
+
}
|
|
2212
|
+
).set_env("LLAMA_ARG_FIT_CTX"));
|
|
2040
2213
|
add_opt(common_arg(
|
|
2041
2214
|
{"--check-tensors"},
|
|
2042
2215
|
string_format("check model tensor data for invalid values (default: %s)", params.check_tensors ? "true" : "false"),
|
|
@@ -2055,10 +2228,11 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
2055
2228
|
}
|
|
2056
2229
|
));
|
|
2057
2230
|
add_opt(common_arg(
|
|
2231
|
+
{"--op-offload"},
|
|
2058
2232
|
{"--no-op-offload"},
|
|
2059
|
-
string_format("
|
|
2060
|
-
[](common_params & params) {
|
|
2061
|
-
params.no_op_offload =
|
|
2233
|
+
string_format("whether to offload host tensor operations to device (default: %s)", params.no_op_offload ? "false" : "true"),
|
|
2234
|
+
[](common_params & params, bool value) {
|
|
2235
|
+
params.no_op_offload = !value;
|
|
2062
2236
|
}
|
|
2063
2237
|
));
|
|
2064
2238
|
add_opt(common_arg(
|
|
@@ -2254,10 +2428,11 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
2254
2428
|
}
|
|
2255
2429
|
).set_examples({LLAMA_EXAMPLE_IMATRIX}));
|
|
2256
2430
|
add_opt(common_arg(
|
|
2431
|
+
{"--ppl"},
|
|
2257
2432
|
{"--no-ppl"},
|
|
2258
|
-
string_format("
|
|
2259
|
-
[](common_params & params) {
|
|
2260
|
-
params.compute_ppl =
|
|
2433
|
+
string_format("whether to compute perplexity (default: %s)", params.compute_ppl ? "true" : "false"),
|
|
2434
|
+
[](common_params & params, bool value) {
|
|
2435
|
+
params.compute_ppl = value;
|
|
2261
2436
|
}
|
|
2262
2437
|
).set_examples({LLAMA_EXAMPLE_IMATRIX}));
|
|
2263
2438
|
add_opt(common_arg(
|
|
@@ -2376,12 +2551,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
2376
2551
|
}
|
|
2377
2552
|
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_API_PREFIX"));
|
|
2378
2553
|
add_opt(common_arg(
|
|
2554
|
+
{"--webui"},
|
|
2379
2555
|
{"--no-webui"},
|
|
2380
|
-
string_format("
|
|
2381
|
-
[](common_params & params) {
|
|
2382
|
-
params.webui =
|
|
2556
|
+
string_format("whether to enable the Web UI (default: %s)", params.webui ? "enabled" : "disabled"),
|
|
2557
|
+
[](common_params & params, bool value) {
|
|
2558
|
+
params.webui = value;
|
|
2383
2559
|
}
|
|
2384
|
-
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("
|
|
2560
|
+
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_WEBUI"));
|
|
2385
2561
|
add_opt(common_arg(
|
|
2386
2562
|
{"--embedding", "--embeddings"},
|
|
2387
2563
|
string_format("restrict to only support embedding use case; use only with dedicated embedding models (default: %s)", params.embedding ? "enabled" : "disabled"),
|
|
@@ -2444,7 +2620,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
2444
2620
|
params.default_template_kwargs[item.key()] = item.value().dump();
|
|
2445
2621
|
}
|
|
2446
2622
|
}
|
|
2447
|
-
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_CHAT_TEMPLATE_KWARGS"));
|
|
2623
|
+
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_CHAT_TEMPLATE_KWARGS"));
|
|
2448
2624
|
add_opt(common_arg(
|
|
2449
2625
|
{"-to", "--timeout"}, "N",
|
|
2450
2626
|
string_format("server read/write timeout in seconds (default: %d)", params.timeout_read),
|
|
@@ -2486,18 +2662,12 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
2486
2662
|
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_PROPS"));
|
|
2487
2663
|
add_opt(common_arg(
|
|
2488
2664
|
{"--slots"},
|
|
2489
|
-
string_format("enable slots monitoring endpoint (default: %s)", params.endpoint_slots ? "enabled" : "disabled"),
|
|
2490
|
-
[](common_params & params) {
|
|
2491
|
-
params.endpoint_slots = true;
|
|
2492
|
-
}
|
|
2493
|
-
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_SLOTS"));
|
|
2494
|
-
add_opt(common_arg(
|
|
2495
2665
|
{"--no-slots"},
|
|
2496
|
-
"
|
|
2497
|
-
[](common_params & params) {
|
|
2498
|
-
params.endpoint_slots =
|
|
2666
|
+
string_format("expose slots monitoring endpoint (default: %s)", params.endpoint_slots ? "enabled" : "disabled"),
|
|
2667
|
+
[](common_params & params, bool value) {
|
|
2668
|
+
params.endpoint_slots = value;
|
|
2499
2669
|
}
|
|
2500
|
-
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("
|
|
2670
|
+
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_SLOTS"));
|
|
2501
2671
|
add_opt(common_arg(
|
|
2502
2672
|
{"--slot-save-path"}, "PATH",
|
|
2503
2673
|
"path to save slot kv cache (default: disabled)",
|
|
@@ -2533,6 +2703,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
2533
2703
|
params.models_dir = value;
|
|
2534
2704
|
}
|
|
2535
2705
|
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_DIR"));
|
|
2706
|
+
add_opt(common_arg(
|
|
2707
|
+
{"--models-preset"}, "PATH",
|
|
2708
|
+
"path to INI file containing model presets for the router server (default: disabled)",
|
|
2709
|
+
[](common_params & params, const std::string & value) {
|
|
2710
|
+
params.models_preset = value;
|
|
2711
|
+
}
|
|
2712
|
+
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_PRESET"));
|
|
2536
2713
|
add_opt(common_arg(
|
|
2537
2714
|
{"--models-max"}, "N",
|
|
2538
2715
|
string_format("for router server, maximum number of models to load simultaneously (default: %d, 0 = unlimited)", params.models_max),
|
|
@@ -2541,26 +2718,21 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
2541
2718
|
}
|
|
2542
2719
|
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_MAX"));
|
|
2543
2720
|
add_opt(common_arg(
|
|
2721
|
+
{"--models-autoload"},
|
|
2544
2722
|
{"--no-models-autoload"},
|
|
2545
|
-
"
|
|
2546
|
-
[](common_params & params) {
|
|
2547
|
-
params.models_autoload =
|
|
2723
|
+
string_format("for router server, whether to automatically load models (default: %s)", params.models_autoload ? "enabled" : "disabled"),
|
|
2724
|
+
[](common_params & params, bool value) {
|
|
2725
|
+
params.models_autoload = value;
|
|
2548
2726
|
}
|
|
2549
|
-
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("
|
|
2727
|
+
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_AUTOLOAD"));
|
|
2550
2728
|
add_opt(common_arg(
|
|
2551
2729
|
{"--jinja"},
|
|
2552
|
-
string_format("use jinja template for chat (default: %s)\n", params.use_jinja ? "enabled" : "disabled"),
|
|
2553
|
-
[](common_params & params) {
|
|
2554
|
-
params.use_jinja = true;
|
|
2555
|
-
}
|
|
2556
|
-
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_JINJA"));
|
|
2557
|
-
add_opt(common_arg(
|
|
2558
2730
|
{"--no-jinja"},
|
|
2559
|
-
-        string_format("
-        [](common_params & params) {
-            params.use_jinja =
+        string_format("whether to use jinja template engine for chat (default: %s)", params.use_jinja ? "enabled" : "disabled"),
+        [](common_params & params, bool value) {
+            params.use_jinja = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_SERVER,
+    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_JINJA"));
     add_opt(common_arg(
         {"--reasoning-format"}, "FORMAT",
         "controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:\n"
@@ -2571,7 +2743,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, const std::string & value) {
             params.reasoning_format = common_reasoning_format_from_name(value);
         }
-    ).set_examples({LLAMA_EXAMPLE_SERVER,
+    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_THINK"));
     add_opt(common_arg(
         {"--reasoning-budget"}, "N",
         "controls the amount of thinking allowed; currently only one of: -1 for unrestricted thinking budget, or 0 to disable thinking (default: -1)",
@@ -2579,7 +2751,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             if (value != 0 && value != -1) { throw std::invalid_argument("invalid value"); }
             params.reasoning_budget = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_SERVER,
+    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_THINK_BUDGET"));
     add_opt(common_arg(
         {"--chat-template"}, "JINJA_TEMPLATE",
         string_format(
@@ -2591,7 +2763,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, const std::string & value) {
             params.chat_template = value;
         }
-    ).set_examples({
+    ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_CHAT_TEMPLATE"));
     add_opt(common_arg(
         {"--chat-template-file"}, "JINJA_TEMPLATE_FILE",
         string_format(
@@ -2603,17 +2775,18 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, const std::string & value) {
             params.chat_template = read_file(value);
         }
-    ).set_examples({
+    ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CHAT_TEMPLATE_FILE"));
     add_opt(common_arg(
+        {"--prefill-assistant"},
         {"--no-prefill-assistant"},
         string_format(
             "whether to prefill the assistant's response if the last message is an assistant message (default: prefill enabled)\n"
             "when this flag is set, if the last message is an assistant message then it will be treated as a full message and not prefilled\n"
         ),
-        [](common_params & params) {
-            params.prefill_assistant =
+        [](common_params & params, bool value) {
+            params.prefill_assistant = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_PREFILL_ASSISTANT"));
     add_opt(common_arg(
         {"-sps", "--slot-prompt-similarity"}, "SIMILARITY",
         string_format("how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity),
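In the hunks above, both `--jinja` and `--prefill-assistant` switch from no-argument flag handlers to handlers that receive an explicit boolean, and `--prefill-assistant` gains a positive spelling next to the existing `--no-prefill-assistant`. The snippet below is a self-contained, hypothetical illustration of that handler shape; `demo_params` and the std::function wrappers are stand-ins, not the actual `common_arg`/`common_params` machinery in common/arg.cpp.

    // Hypothetical illustration of the handler change above: a no-argument
    // flag handler vs. a bool-valued handler that can serve a paired
    // "--foo" / "--no-foo" spelling. Types here are stand-ins.
    #include <cstdio>
    #include <functional>

    struct demo_params {
        bool use_jinja         = false;
        bool prefill_assistant = true;
    };

    int main() {
        demo_params params;

        // old shape: a flag with no value; it can only switch the option one way
        std::function<void(demo_params &)> old_handler =
            [](demo_params & p) { p.use_jinja = true; };

        // new shape: the parser supplies the value (true for the positive flag,
        // false for the negated one) and the handler just stores it
        std::function<void(demo_params &, bool)> new_handler =
            [](demo_params & p, bool value) { p.prefill_assistant = value; };

        old_handler(params);
        new_handler(params, /*value=*/false);   // e.g. "--no-prefill-assistant"

        std::printf("use_jinja=%d prefill_assistant=%d\n",
                    params.use_jinja, params.prefill_assistant);
        return 0;
    }

Assuming the parser passes true for the positive flag and false for the negated one, a single handler can both enable and disable the option, which also lets an environment alias such as LLAMA_ARG_PREFILL_ASSISTANT carry either value.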
@@ -2634,7 +2807,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params) {
             params.simple_io = true;
         }
-    ).set_examples({
+    ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}));
     add_opt(common_arg(
         {"--positive-file"}, "FNAME",
         string_format("positive prompts file, one prompt per line (default: '%s')", params.cvector_positive_file.c_str()),
@@ -2717,7 +2890,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         "Set verbosity level to infinity (i.e. log all messages, useful for debugging)",
         [](common_params & params) {
             params.verbosity = INT_MAX;
-            common_log_set_verbosity_thold(INT_MAX);
         }
     ));
     add_opt(common_arg(
@@ -2738,7 +2910,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         "(default: %d)\n", params.verbosity),
         [](common_params & params, int value) {
             params.verbosity = value;
-            common_log_set_verbosity_thold(value);
         }
     ).set_env("LLAMA_LOG_VERBOSITY"));
     add_opt(common_arg(
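Both verbosity handlers above drop their direct call to common_log_set_verbosity_thold and now only record the value in params.verbosity. Presumably the log threshold is applied once after parsing instead of inside each callback; the sketch below shows that pattern under that assumption (the helper and its caller are hypothetical and not part of this diff).

    // Sketch only: apply the parsed verbosity in one place after argument
    // parsing. common_log_set_verbosity_thold() is the function whose
    // per-option calls are removed in the hunks above.
    #include "common.h"   // common_params
    #include "log.h"      // common_log_set_verbosity_thold

    static void apply_log_verbosity(const common_params & params) {
        // -lv/--verbose stores INT_MAX ("log everything"), --verbosity N stores N;
        // either way the global threshold is set here exactly once.
        common_log_set_verbosity_thold(params.verbosity);
    }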
@@ -2871,14 +3042,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, int value) {
             params.speculative.n_max = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_MAX"));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_DRAFT_MAX"));
     add_opt(common_arg(
         {"--draft-min", "--draft-n-min"}, "N",
         string_format("minimum number of draft tokens to use for speculative decoding (default: %d)", params.speculative.n_min),
         [](common_params & params, int value) {
             params.speculative.n_min = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_MIN"));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_DRAFT_MIN"));
     add_opt(common_arg(
         {"--draft-p-split"}, "P",
         string_format("speculative decoding split probability (default: %.1f)", (double)params.speculative.p_split),
@@ -2892,14 +3063,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, const std::string & value) {
             params.speculative.p_min = std::stof(value);
         }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_P_MIN"));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_DRAFT_P_MIN"));
     add_opt(common_arg(
         {"-cd", "--ctx-size-draft"}, "N",
         string_format("size of the prompt context for the draft model (default: %d, 0 = loaded from model)", params.speculative.n_ctx),
         [](common_params & params, int value) {
             params.speculative.n_ctx = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CTX_SIZE_DRAFT"));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_CTX_SIZE_DRAFT"));
     add_opt(common_arg(
         {"-devd", "--device-draft"}, "<dev1,dev2,..>",
         "comma-separated list of devices to use for offloading the draft model (none = don't offload)\n"
@@ -2907,7 +3078,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, const std::string & value) {
             params.speculative.devices = parse_device_list(value);
         }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
     add_opt(common_arg(
         {"-ngld", "--gpu-layers-draft", "--n-gpu-layers-draft"}, "N",
         "number of layers to store in VRAM for the draft model",
@@ -2919,21 +3090,21 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
                 fprintf(stderr, "warning: consult docs/build.md for compilation instructions\n");
             }
         }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_N_GPU_LAYERS_DRAFT"));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_N_GPU_LAYERS_DRAFT"));
     add_opt(common_arg(
         {"-md", "--model-draft"}, "FNAME",
         "draft model for speculative decoding (default: unused)",
         [](common_params & params, const std::string & value) {
             params.speculative.model.path = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODEL_DRAFT"));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_MODEL_DRAFT"));
     add_opt(common_arg(
         {"--spec-replace"}, "TARGET", "DRAFT",
         "translate the string in TARGET into DRAFT if the draft model and main model are not compatible",
         [](common_params & params, const std::string & tgt, const std::string & dft) {
             params.speculative.replacements.push_back({ tgt, dft });
         }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
     add_opt(common_arg(
         {"-ctkd", "--cache-type-k-draft"}, "TYPE",
         string_format(
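The draft-model options in the hunks above all gain LLAMA_EXAMPLE_CLI in their set_examples() lists, and most keep an environment alias registered through set_env() (LLAMA_ARG_DRAFT_MAX, LLAMA_ARG_DRAFT_MIN, LLAMA_ARG_MODEL_DRAFT, and so on). As a rough, stand-alone illustration of what such an env-backed value amounts to, the snippet below reads one of those variables with std::getenv; this is not how common/arg.cpp resolves them, just the general idea.

    // Hypothetical sketch: consult an option's environment alias before a
    // fallback default. The real lookup is done by the argument parser via
    // the set_env() registrations shown above; this only mirrors the idea.
    #include <cstdio>
    #include <cstdlib>
    #include <string>

    static std::string env_or_default(const char * env_name, const std::string & fallback) {
        const char * value = std::getenv(env_name);   // e.g. "LLAMA_ARG_MODEL_DRAFT"
        return value != nullptr ? std::string(value) : fallback;
    }

    int main() {
        // An empty string matches the "default: unused" wording of -md/--model-draft.
        const std::string draft_model = env_or_default("LLAMA_ARG_MODEL_DRAFT", "");
        std::printf("draft model: %s\n", draft_model.empty() ? "(unused)" : draft_model.c_str());
        return 0;
    }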
@@ -3197,7 +3368,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.use_jinja = true;
             //params.default_template_kwargs["reasoning_effort"] = "\"high\"";
         }
-    ).set_examples({LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));

     add_opt(common_arg(
         {"--gpt-oss-120b-default"},
@@ -3216,7 +3387,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.use_jinja = true;
             //params.default_template_kwargs["reasoning_effort"] = "\"high\"";
         }
-    ).set_examples({LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));

     add_opt(common_arg(
         {"--vision-gemma-4b-default"},
@@ -3227,7 +3398,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.n_ctx = 0;
             params.use_jinja = true;
         }
-    ).set_examples({LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));

     add_opt(common_arg(
         {"--vision-gemma-12b-default"},
@@ -3238,7 +3409,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.n_ctx = 0;
             params.use_jinja = true;
         }
-    ).set_examples({LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));

     return ctx_arg;
 }