@fugood/llama.node 1.4.7 → 1.4.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/binding.ts +8 -0
- package/package.json +15 -15
- package/scripts/llama.cpp.patch +23 -24
- package/src/LlamaContext.cpp +4 -2
- package/src/llama.cpp/common/CMakeLists.txt +2 -0
- package/src/llama.cpp/common/arg.cpp +470 -223
- package/src/llama.cpp/common/arg.h +43 -2
- package/src/llama.cpp/common/chat-peg-parser.cpp +16 -2
- package/src/llama.cpp/common/chat.cpp +140 -0
- package/src/llama.cpp/common/common.cpp +130 -67
- package/src/llama.cpp/common/common.h +44 -17
- package/src/llama.cpp/common/console.cpp +98 -18
- package/src/llama.cpp/common/console.h +30 -8
- package/src/llama.cpp/common/download.cpp +69 -25
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +132 -3
- package/src/llama.cpp/common/json-schema-to-grammar.h +20 -0
- package/src/llama.cpp/common/log.cpp +5 -0
- package/src/llama.cpp/common/log.h +1 -0
- package/src/llama.cpp/common/peg-parser.cpp +1 -1
- package/src/llama.cpp/common/preset.cpp +206 -0
- package/src/llama.cpp/common/preset.h +32 -0
- package/src/llama.cpp/common/sampling.cpp +67 -54
- package/src/llama.cpp/common/sampling.h +8 -0
- package/src/llama.cpp/ggml/CMakeLists.txt +4 -0
- package/src/llama.cpp/ggml/include/ggml-alloc.h +9 -0
- package/src/llama.cpp/ggml/include/ggml-backend.h +1 -0
- package/src/llama.cpp/ggml/include/ggml-cpu.h +1 -0
- package/src/llama.cpp/ggml/include/ggml.h +7 -8
- package/src/llama.cpp/ggml/src/CMakeLists.txt +3 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +4 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +285 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +28 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +111 -45
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +4 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +288 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/repack.h +8 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +41 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +125 -22
- package/src/llama.cpp/include/llama.h +18 -1
- package/src/llama.cpp/src/llama-arch.cpp +1890 -2248
- package/src/llama.cpp/src/llama-arch.h +9 -2
- package/src/llama.cpp/src/llama-batch.cpp +12 -2
- package/src/llama.cpp/src/llama-batch.h +4 -2
- package/src/llama.cpp/src/llama-context.cpp +93 -23
- package/src/llama.cpp/src/llama-context.h +8 -2
- package/src/llama.cpp/src/llama-graph.cpp +84 -16
- package/src/llama.cpp/src/llama-graph.h +17 -4
- package/src/llama.cpp/src/llama-hparams.cpp +6 -0
- package/src/llama.cpp/src/llama-hparams.h +5 -1
- package/src/llama.cpp/src/llama-impl.cpp +4 -0
- package/src/llama.cpp/src/llama-kv-cache.cpp +90 -42
- package/src/llama.cpp/src/llama-kv-cache.h +19 -2
- package/src/llama.cpp/src/llama-memory-hybrid.cpp +1 -1
- package/src/llama.cpp/src/llama-mmap.cpp +123 -28
- package/src/llama.cpp/src/llama-mmap.h +5 -1
- package/src/llama.cpp/src/llama-model-loader.cpp +58 -13
- package/src/llama.cpp/src/llama-model-loader.h +2 -0
- package/src/llama.cpp/src/llama-model.cpp +110 -49
- package/src/llama.cpp/src/llama-model.h +1 -0
- package/src/llama.cpp/src/llama-quant.cpp +1 -1
- package/src/llama.cpp/src/llama-sampling.cpp +16 -0
- package/src/llama.cpp/src/llama-vocab.cpp +2 -1
- package/src/llama.cpp/src/llama.cpp +665 -1
- package/src/llama.cpp/src/models/deepseek2.cpp +9 -5
- package/src/llama.cpp/src/models/glm4-moe.cpp +28 -11
- package/src/llama.cpp/src/models/glm4.cpp +27 -4
- package/src/llama.cpp/src/models/models.h +5 -5
- package/src/llama.cpp/src/models/nemotron-h.cpp +35 -6
- package/src/llama.cpp/src/models/qwen2.cpp +12 -3
- package/src/llama.cpp/src/models/qwen3next.cpp +81 -266
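The largest change shown in this section is to `package/src/llama.cpp/common/arg.cpp` (+470 -223, reproduced below): boolean CLI flags are now declared as paired positive/negative forms (for example `--warmup` / `--no-warmup`, `--mmap` / `--no-mmap`) sharing a single `handler_bool`, negative `LLAMA_ARG_NO_*` environment variables remain honored for compatibility, and environment values are coerced with `is_truthy` / `is_falsey`. The following standalone C++ sketch only mirrors the coercion helpers added in that diff; it is illustrative and not part of the package:

```cpp
// Sketch of the boolean-value coercion introduced in common/arg.cpp below.
// The helper names mirror the diff; this file itself is not shipped anywhere.
#include <iostream>
#include <stdexcept>
#include <string>

static bool is_truthy(const std::string & v) {
    return v == "on" || v == "enabled" || v == "true" || v == "1";
}

static bool is_falsey(const std::string & v) {
    return v == "off" || v == "disabled" || v == "false" || v == "0";
}

// Throws on anything that is neither truthy nor falsey, like the new parse_bool_value() in the diff.
static bool parse_bool_value(const std::string & v) {
    if (is_truthy(v)) return true;
    if (is_falsey(v)) return false;
    throw std::invalid_argument("invalid boolean value: " + v);
}

int main() {
    // e.g. LLAMA_ARG_MMAP=off disables mmap, LLAMA_ARG_MMAP=1 enables it
    std::cout << parse_bool_value("off") << ' ' << parse_bool_value("1") << '\n'; // prints "0 1"
    return 0;
}
```

With this in place, an environment value such as `LLAMA_ARG_MMAP=off` and the `--no-mmap` flag both end up driving the same boolean handler.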
|
@@ -20,6 +20,7 @@
|
|
|
20
20
|
#include <nlohmann/json.hpp>
|
|
21
21
|
|
|
22
22
|
#include <algorithm>
|
|
23
|
+
#include <cinttypes>
|
|
23
24
|
#include <climits>
|
|
24
25
|
#include <cstdarg>
|
|
25
26
|
#include <fstream>
|
|
@@ -47,10 +48,12 @@
|
|
|
47
48
|
#define LLAMA_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083
|
|
48
49
|
|
|
49
50
|
using json = nlohmann::ordered_json;
|
|
51
|
+
using namespace common_arg_utils;
|
|
50
52
|
|
|
51
53
|
static std::initializer_list<enum llama_example> mmproj_examples = {
|
|
52
54
|
LLAMA_EXAMPLE_MTMD,
|
|
53
55
|
LLAMA_EXAMPLE_SERVER,
|
|
56
|
+
LLAMA_EXAMPLE_CLI,
|
|
54
57
|
};
|
|
55
58
|
|
|
56
59
|
static std::string read_file(const std::string & fname) {
|
|
@@ -63,6 +66,15 @@ static std::string read_file(const std::string & fname) {
|
|
|
63
66
|
return content;
|
|
64
67
|
}
|
|
65
68
|
|
|
69
|
+
static const std::vector<common_arg> & get_common_arg_defs() {
|
|
70
|
+
static const std::vector<common_arg> options = [] {
|
|
71
|
+
common_params params;
|
|
72
|
+
auto ctx = common_params_parser_init(params, LLAMA_EXAMPLE_SERVER, nullptr);
|
|
73
|
+
return ctx.options;
|
|
74
|
+
}();
|
|
75
|
+
return options;
|
|
76
|
+
}
|
|
77
|
+
|
|
66
78
|
common_arg & common_arg::set_examples(std::initializer_list<enum llama_example> examples) {
|
|
67
79
|
this->examples = examples;
|
|
68
80
|
return *this;
|
|
@@ -94,6 +106,16 @@ bool common_arg::is_exclude(enum llama_example ex) {
|
|
|
94
106
|
|
|
95
107
|
bool common_arg::get_value_from_env(std::string & output) const {
|
|
96
108
|
if (env == nullptr) return false;
|
|
109
|
+
if (!args_neg.empty()) {
|
|
110
|
+
// for compatibility, we need to check LLAMA_ARG_NO_ env as well
|
|
111
|
+
std::string neg_env = env;
|
|
112
|
+
string_replace_all(neg_env, "LLAMA_ARG_", "LLAMA_ARG_NO_");
|
|
113
|
+
char * neg_value = std::getenv(neg_env.c_str());
|
|
114
|
+
if (neg_value) {
|
|
115
|
+
output = "0"; // falsey
|
|
116
|
+
return true;
|
|
117
|
+
}
|
|
118
|
+
}
|
|
97
119
|
char * value = std::getenv(env);
|
|
98
120
|
if (value) {
|
|
99
121
|
output = value;
|
|
@@ -103,6 +125,14 @@ bool common_arg::get_value_from_env(std::string & output) const {
|
|
|
103
125
|
}
|
|
104
126
|
|
|
105
127
|
bool common_arg::has_value_from_env() const {
|
|
128
|
+
if (env != nullptr && !args_neg.empty()) {
|
|
129
|
+
// for compatibility, we need to check LLAMA_ARG_NO_ env as well
|
|
130
|
+
std::string neg_env = env;
|
|
131
|
+
string_replace_all(neg_env, "LLAMA_ARG_", "LLAMA_ARG_NO_");
|
|
132
|
+
if (std::getenv(neg_env.c_str())) {
|
|
133
|
+
return true;
|
|
134
|
+
}
|
|
135
|
+
}
|
|
106
136
|
return env != nullptr && std::getenv(env);
|
|
107
137
|
}
|
|
108
138
|
|
|
@@ -133,16 +163,17 @@ static std::vector<std::string> break_str_into_lines(std::string input, size_t m
|
|
|
133
163
|
return result;
|
|
134
164
|
}
|
|
135
165
|
|
|
136
|
-
std::string common_arg::to_string() {
|
|
166
|
+
std::string common_arg::to_string() const {
|
|
137
167
|
// params for printing to console
|
|
138
168
|
const static int n_leading_spaces = 40;
|
|
139
169
|
const static int n_char_per_line_help = 70; // TODO: detect this based on current console
|
|
140
170
|
std::string leading_spaces(n_leading_spaces, ' ');
|
|
141
171
|
|
|
142
172
|
std::ostringstream ss;
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
173
|
+
auto all_args = get_args(); // also contains args_neg
|
|
174
|
+
for (const auto & arg : all_args) {
|
|
175
|
+
if (arg == all_args.front()) {
|
|
176
|
+
if (all_args.size() == 1) {
|
|
146
177
|
ss << arg;
|
|
147
178
|
} else {
|
|
148
179
|
// first arg is usually abbreviation, we need padding to make it more beautiful
|
|
@@ -151,7 +182,7 @@ std::string common_arg::to_string() {
|
|
|
151
182
|
ss << tmp << spaces;
|
|
152
183
|
}
|
|
153
184
|
} else {
|
|
154
|
-
ss << arg << (arg !=
|
|
185
|
+
ss << arg << (arg != all_args.back() ? ", " : "");
|
|
155
186
|
}
|
|
156
187
|
}
|
|
157
188
|
if (value_hint) ss << " " << value_hint;
|
|
@@ -170,6 +201,31 @@ std::string common_arg::to_string() {
|
|
|
170
201
|
return ss.str();
|
|
171
202
|
}
|
|
172
203
|
|
|
204
|
+
std::vector<std::string> common_arg::get_args() const {
|
|
205
|
+
std::vector<std::string> result;
|
|
206
|
+
for (const auto & arg : args) {
|
|
207
|
+
result.push_back(std::string(arg));
|
|
208
|
+
}
|
|
209
|
+
for (const auto & arg : args_neg) {
|
|
210
|
+
result.push_back(std::string(arg));
|
|
211
|
+
}
|
|
212
|
+
return result;
|
|
213
|
+
}
|
|
214
|
+
|
|
215
|
+
std::vector<std::string> common_arg::get_env() const {
|
|
216
|
+
std::vector<std::string> result;
|
|
217
|
+
if (env) {
|
|
218
|
+
result.push_back(std::string(env));
|
|
219
|
+
}
|
|
220
|
+
if (!args_neg.empty() && env) {
|
|
221
|
+
// for compatibility, we need to add LLAMA_ARG_NO_ variant
|
|
222
|
+
std::string neg_env = env;
|
|
223
|
+
string_replace_all(neg_env, "LLAMA_ARG_", "LLAMA_ARG_NO_");
|
|
224
|
+
result.push_back(neg_env);
|
|
225
|
+
}
|
|
226
|
+
return result;
|
|
227
|
+
}
|
|
228
|
+
|
|
173
229
|
//
|
|
174
230
|
// utils
|
|
175
231
|
//
|
|
@@ -305,6 +361,16 @@ static std::string get_all_kv_cache_types() {
|
|
|
305
361
|
return msg.str();
|
|
306
362
|
}
|
|
307
363
|
|
|
364
|
+
static bool parse_bool_value(const std::string & value) {
|
|
365
|
+
if (is_truthy(value)) {
|
|
366
|
+
return true;
|
|
367
|
+
} else if (is_falsey(value)) {
|
|
368
|
+
return false;
|
|
369
|
+
} else {
|
|
370
|
+
throw std::invalid_argument("invalid boolean value");
|
|
371
|
+
}
|
|
372
|
+
}
|
|
373
|
+
|
|
308
374
|
//
|
|
309
375
|
// CLI argument parsing functions
|
|
310
376
|
//
|
|
@@ -312,10 +378,13 @@ static std::string get_all_kv_cache_types() {
|
|
|
312
378
|
static bool common_params_parse_ex(int argc, char ** argv, common_params_context & ctx_arg) {
|
|
313
379
|
common_params & params = ctx_arg.params;
|
|
314
380
|
|
|
315
|
-
std::unordered_map<std::string, common_arg
|
|
381
|
+
std::unordered_map<std::string, std::pair<common_arg *, bool>> arg_to_options;
|
|
316
382
|
for (auto & opt : ctx_arg.options) {
|
|
317
383
|
for (const auto & arg : opt.args) {
|
|
318
|
-
arg_to_options[arg] = &opt;
|
|
384
|
+
arg_to_options[arg] = {&opt, /* is_positive */ true};
|
|
385
|
+
}
|
|
386
|
+
for (const auto & arg : opt.args_neg) {
|
|
387
|
+
arg_to_options[arg] = {&opt, /* is_positive */ false};
|
|
319
388
|
}
|
|
320
389
|
}
|
|
321
390
|
|
|
@@ -324,12 +393,15 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
|
|
|
324
393
|
std::string value;
|
|
325
394
|
if (opt.get_value_from_env(value)) {
|
|
326
395
|
try {
|
|
327
|
-
if (opt.handler_void && (value
|
|
396
|
+
if (opt.handler_void && is_truthy(value)) {
|
|
328
397
|
opt.handler_void(params);
|
|
329
398
|
}
|
|
330
399
|
if (opt.handler_int) {
|
|
331
400
|
opt.handler_int(params, std::stoi(value));
|
|
332
401
|
}
|
|
402
|
+
if (opt.handler_bool) {
|
|
403
|
+
opt.handler_bool(params, parse_bool_value(value));
|
|
404
|
+
}
|
|
333
405
|
if (opt.handler_string) {
|
|
334
406
|
opt.handler_string(params, value);
|
|
335
407
|
continue;
|
|
@@ -348,6 +420,8 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
|
|
|
348
420
|
}
|
|
349
421
|
};
|
|
350
422
|
|
|
423
|
+
std::set<std::string> seen_args;
|
|
424
|
+
|
|
351
425
|
for (int i = 1; i < argc; i++) {
|
|
352
426
|
const std::string arg_prefix = "--";
|
|
353
427
|
|
|
@@ -358,7 +432,12 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
|
|
|
358
432
|
if (arg_to_options.find(arg) == arg_to_options.end()) {
|
|
359
433
|
throw std::invalid_argument(string_format("error: invalid argument: %s", arg.c_str()));
|
|
360
434
|
}
|
|
361
|
-
|
|
435
|
+
if (!seen_args.insert(arg).second) {
|
|
436
|
+
LOG_WRN("DEPRECATED: argument '%s' specified multiple times, use comma-separated values instead (only last value will be used)\n", arg.c_str());
|
|
437
|
+
}
|
|
438
|
+
auto & tmp = arg_to_options[arg];
|
|
439
|
+
auto opt = *tmp.first;
|
|
440
|
+
bool is_positive = tmp.second;
|
|
362
441
|
if (opt.has_value_from_env()) {
|
|
363
442
|
fprintf(stderr, "warn: %s environment variable is set, but will be overwritten by command line argument %s\n", opt.env, arg.c_str());
|
|
364
443
|
}
|
|
@@ -367,6 +446,10 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
|
|
|
367
446
|
opt.handler_void(params);
|
|
368
447
|
continue;
|
|
369
448
|
}
|
|
449
|
+
if (opt.handler_bool) {
|
|
450
|
+
opt.handler_bool(params, is_positive);
|
|
451
|
+
continue;
|
|
452
|
+
}
|
|
370
453
|
|
|
371
454
|
// arg with single value
|
|
372
455
|
check_arg(i);
|
|
@@ -391,7 +474,7 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
|
|
|
391
474
|
throw std::invalid_argument(string_format(
|
|
392
475
|
"error while handling argument \"%s\": %s\n\n"
|
|
393
476
|
"usage:\n%s\n\nto show complete usage, run with -h",
|
|
394
|
-
arg.c_str(), e.what(),
|
|
477
|
+
arg.c_str(), e.what(), opt.to_string().c_str()));
|
|
395
478
|
}
|
|
396
479
|
}
|
|
397
480
|
|
|
@@ -427,7 +510,7 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
|
|
|
427
510
|
|
|
428
511
|
// model is required (except for server)
|
|
429
512
|
// TODO @ngxson : maybe show a list of available models in CLI in this case
|
|
430
|
-
if (params.model.path.empty() && ctx_arg.ex != LLAMA_EXAMPLE_SERVER && !params.usage) {
|
|
513
|
+
if (params.model.path.empty() && ctx_arg.ex != LLAMA_EXAMPLE_SERVER && !params.usage && !params.completion) {
|
|
431
514
|
throw std::invalid_argument("error: --model is required\n");
|
|
432
515
|
}
|
|
433
516
|
|
|
@@ -452,7 +535,9 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
|
|
|
452
535
|
params.kv_overrides.back().key[0] = 0;
|
|
453
536
|
}
|
|
454
537
|
|
|
455
|
-
|
|
538
|
+
// pad tensor_buft_overrides for llama_params_fit:
|
|
539
|
+
const size_t ntbo = llama_max_tensor_buft_overrides();
|
|
540
|
+
while (params.tensor_buft_overrides.size() < ntbo) {
|
|
456
541
|
params.tensor_buft_overrides.push_back({nullptr, nullptr});
|
|
457
542
|
}
|
|
458
543
|
|
|
@@ -468,6 +553,8 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
|
|
|
468
553
|
));
|
|
469
554
|
}
|
|
470
555
|
|
|
556
|
+
common_log_set_verbosity_thold(params.verbosity);
|
|
557
|
+
|
|
471
558
|
return true;
|
|
472
559
|
}
|
|
473
560
|
|
|
@@ -560,6 +647,7 @@ static void common_params_print_completion(common_params_context & ctx_arg) {
|
|
|
560
647
|
"llama-batched-bench",
|
|
561
648
|
"llama-bench",
|
|
562
649
|
"llama-cli",
|
|
650
|
+
"llama-completion",
|
|
563
651
|
"llama-convert-llama2c-to-ggml",
|
|
564
652
|
"llama-cvector-generator",
|
|
565
653
|
"llama-embedding",
|
|
@@ -644,6 +732,61 @@ static void add_rpc_devices(const std::string & servers) {
|
|
|
644
732
|
}
|
|
645
733
|
}
|
|
646
734
|
|
|
735
|
+
bool common_params_to_map(int argc, char ** argv, llama_example ex, std::map<common_arg, std::string> & out_map) {
|
|
736
|
+
common_params dummy_params;
|
|
737
|
+
common_params_context ctx_arg = common_params_parser_init(dummy_params, ex, nullptr);
|
|
738
|
+
|
|
739
|
+
std::unordered_map<std::string, common_arg *> arg_to_options;
|
|
740
|
+
for (auto & opt : ctx_arg.options) {
|
|
741
|
+
for (const auto & arg : opt.args) {
|
|
742
|
+
arg_to_options[arg] = &opt;
|
|
743
|
+
}
|
|
744
|
+
for (const auto & arg : opt.args_neg) {
|
|
745
|
+
arg_to_options[arg] = &opt;
|
|
746
|
+
}
|
|
747
|
+
}
|
|
748
|
+
|
|
749
|
+
// TODO @ngxson : find a way to deduplicate this code
|
|
750
|
+
|
|
751
|
+
// handle command line arguments
|
|
752
|
+
auto check_arg = [&](int i) {
|
|
753
|
+
if (i+1 >= argc) {
|
|
754
|
+
throw std::invalid_argument("expected value for argument");
|
|
755
|
+
}
|
|
756
|
+
};
|
|
757
|
+
|
|
758
|
+
std::set<std::string> seen_args;
|
|
759
|
+
|
|
760
|
+
for (int i = 1; i < argc; i++) {
|
|
761
|
+
const std::string arg_prefix = "--";
|
|
762
|
+
|
|
763
|
+
std::string arg = argv[i];
|
|
764
|
+
if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
|
|
765
|
+
std::replace(arg.begin(), arg.end(), '_', '-');
|
|
766
|
+
}
|
|
767
|
+
if (arg_to_options.find(arg) == arg_to_options.end()) {
|
|
768
|
+
throw std::invalid_argument(string_format("error: invalid argument: %s", arg.c_str()));
|
|
769
|
+
}
|
|
770
|
+
if (!seen_args.insert(arg).second) {
|
|
771
|
+
LOG_WRN("DEPRECATED: argument '%s' specified multiple times, use comma-separated values instead (only last value will be used)\n", arg.c_str());
|
|
772
|
+
}
|
|
773
|
+
auto opt = *arg_to_options[arg];
|
|
774
|
+
std::string val;
|
|
775
|
+
if (opt.value_hint != nullptr) {
|
|
776
|
+
// arg with single value
|
|
777
|
+
check_arg(i);
|
|
778
|
+
val = argv[++i];
|
|
779
|
+
}
|
|
780
|
+
if (opt.value_hint_2 != nullptr) {
|
|
781
|
+
// TODO: support arg with 2 values
|
|
782
|
+
throw std::invalid_argument("error: argument with 2 values is not yet supported\n");
|
|
783
|
+
}
|
|
784
|
+
out_map[opt] = val;
|
|
785
|
+
}
|
|
786
|
+
|
|
787
|
+
return true;
|
|
788
|
+
}
|
|
789
|
+
|
|
647
790
|
bool common_params_parse(int argc, char ** argv, common_params & params, llama_example ex, void(*print_usage)(int, char **)) {
|
|
648
791
|
auto ctx_arg = common_params_parser_init(params, ex, print_usage);
|
|
649
792
|
const common_params params_org = ctx_arg.params; // the example can modify the default params
|
|
@@ -689,23 +832,30 @@ static std::string list_builtin_chat_templates() {
|
|
|
689
832
|
return msg.str();
|
|
690
833
|
}
|
|
691
834
|
|
|
692
|
-
|
|
693
|
-
return value == "on" || value == "enabled" || value == "1";
|
|
835
|
+
bool common_arg_utils::is_truthy(const std::string & value) {
|
|
836
|
+
return value == "on" || value == "enabled" || value == "true" || value == "1";
|
|
694
837
|
}
|
|
695
838
|
|
|
696
|
-
|
|
697
|
-
return value == "off" || value == "disabled" || value == "0";
|
|
839
|
+
bool common_arg_utils::is_falsey(const std::string & value) {
|
|
840
|
+
return value == "off" || value == "disabled" || value == "false" || value == "0";
|
|
698
841
|
}
|
|
699
842
|
|
|
700
|
-
|
|
843
|
+
bool common_arg_utils::is_autoy(const std::string & value) {
|
|
701
844
|
return value == "auto" || value == "-1";
|
|
702
845
|
}
|
|
703
846
|
|
|
704
847
|
common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **)) {
|
|
705
|
-
// default
|
|
706
|
-
//
|
|
707
|
-
if (ex ==
|
|
708
|
-
params.use_jinja =
|
|
848
|
+
// per-example default params
|
|
849
|
+
// we define here to make sure it's included in llama-gen-docs
|
|
850
|
+
if (ex == LLAMA_EXAMPLE_COMPLETION) {
|
|
851
|
+
params.use_jinja = false; // disable jinja by default
|
|
852
|
+
|
|
853
|
+
} else if (ex == LLAMA_EXAMPLE_MTMD) {
|
|
854
|
+
params.use_jinja = false; // disable jinja by default
|
|
855
|
+
params.sampling.temp = 0.2; // lower temp by default for better quality
|
|
856
|
+
|
|
857
|
+
} else if (ex == LLAMA_EXAMPLE_SERVER) {
|
|
858
|
+
params.n_parallel = -1; // auto by default
|
|
709
859
|
}
|
|
710
860
|
|
|
711
861
|
params.use_color = tty_can_use_colors();
|
|
@@ -723,7 +873,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
723
873
|
sampler_type_chars += common_sampler_type_to_chr(sampler);
|
|
724
874
|
sampler_type_names += common_sampler_type_to_str(sampler) + ";";
|
|
725
875
|
}
|
|
726
|
-
sampler_type_names.
|
|
876
|
+
if (!sampler_type_names.empty()) {
|
|
877
|
+
sampler_type_names.pop_back(); // remove last semicolon
|
|
878
|
+
}
|
|
727
879
|
|
|
728
880
|
|
|
729
881
|
/**
|
|
@@ -785,12 +937,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
785
937
|
}
|
|
786
938
|
));
|
|
787
939
|
add_opt(common_arg(
|
|
940
|
+
{"--display-prompt"},
|
|
788
941
|
{"--no-display-prompt"},
|
|
789
|
-
string_format("
|
|
790
|
-
[](common_params & params) {
|
|
791
|
-
params.display_prompt =
|
|
942
|
+
string_format("whether to print prompt at generation (default: %s)", params.display_prompt ? "true" : "false"),
|
|
943
|
+
[](common_params & params, bool value) {
|
|
944
|
+
params.display_prompt = value;
|
|
792
945
|
}
|
|
793
|
-
).set_examples({
|
|
946
|
+
).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}));
|
|
794
947
|
add_opt(common_arg(
|
|
795
948
|
{"-co", "--color"}, "[on|off|auto]",
|
|
796
949
|
"Colorize output to distinguish prompt and user input from generations ('on', 'off', or 'auto', default: 'auto')\n"
|
|
@@ -807,7 +960,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
807
960
|
string_format("error: unknown value for --color: '%s'\n", value.c_str()));
|
|
808
961
|
}
|
|
809
962
|
}
|
|
810
|
-
).set_examples({
|
|
963
|
+
).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP}));
|
|
811
964
|
add_opt(common_arg(
|
|
812
965
|
{"-t", "--threads"}, "N",
|
|
813
966
|
string_format("number of CPU threads to use during generation (default: %d)", params.cpuparams.n_threads),
|
|
@@ -940,7 +1093,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
940
1093
|
add_opt(common_arg(
|
|
941
1094
|
{"-n", "--predict", "--n-predict"}, "N",
|
|
942
1095
|
string_format(
|
|
943
|
-
ex ==
|
|
1096
|
+
ex == LLAMA_EXAMPLE_COMPLETION
|
|
944
1097
|
? "number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)"
|
|
945
1098
|
: "number of tokens to predict (default: %d, -1 = infinity)",
|
|
946
1099
|
params.n_predict),
|
|
@@ -979,42 +1132,35 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
979
1132
|
).set_env("LLAMA_ARG_SWA_FULL"));
|
|
980
1133
|
add_opt(common_arg(
|
|
981
1134
|
{"--ctx-checkpoints", "--swa-checkpoints"}, "N",
|
|
982
|
-
string_format("max number of context checkpoints to create per slot (default: %d)
|
|
1135
|
+
string_format("max number of context checkpoints to create per slot (default: %d)"
|
|
983
1136
|
"[(more info)](https://github.com/ggml-org/llama.cpp/pull/15293)", params.n_ctx_checkpoints),
|
|
984
1137
|
[](common_params & params, int value) {
|
|
985
1138
|
params.n_ctx_checkpoints = value;
|
|
986
1139
|
}
|
|
987
|
-
).set_env("LLAMA_ARG_CTX_CHECKPOINTS").set_examples({LLAMA_EXAMPLE_SERVER}));
|
|
1140
|
+
).set_env("LLAMA_ARG_CTX_CHECKPOINTS").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
|
|
988
1141
|
add_opt(common_arg(
|
|
989
1142
|
{"--cache-ram", "-cram"}, "N",
|
|
990
|
-
string_format("set the maximum cache size in MiB (default: %d, -1 - no limit, 0 - disable)
|
|
1143
|
+
string_format("set the maximum cache size in MiB (default: %d, -1 - no limit, 0 - disable)"
|
|
991
1144
|
"[(more info)](https://github.com/ggml-org/llama.cpp/pull/16391)", params.cache_ram_mib),
|
|
992
1145
|
[](common_params & params, int value) {
|
|
993
1146
|
params.cache_ram_mib = value;
|
|
994
1147
|
}
|
|
995
|
-
).set_env("LLAMA_ARG_CACHE_RAM").set_examples({LLAMA_EXAMPLE_SERVER}));
|
|
1148
|
+
).set_env("LLAMA_ARG_CACHE_RAM").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
|
|
996
1149
|
add_opt(common_arg(
|
|
997
1150
|
{"--kv-unified", "-kvu"},
|
|
998
|
-
|
|
999
|
-
"[(more info)](https://github.com/ggml-org/llama.cpp/pull/14363)", params.kv_unified ? "true" : "false"),
|
|
1151
|
+
"use single unified KV buffer shared across all sequences (default: enabled if number of slots is auto)",
|
|
1000
1152
|
[](common_params & params) {
|
|
1001
1153
|
params.kv_unified = true;
|
|
1002
1154
|
}
|
|
1003
|
-
).set_env("LLAMA_ARG_KV_UNIFIED"));
|
|
1004
|
-
add_opt(common_arg(
|
|
1005
|
-
{"--no-context-shift"},
|
|
1006
|
-
string_format("disables context shift on infinite text generation (default: %s)", params.ctx_shift ? "disabled" : "enabled"),
|
|
1007
|
-
[](common_params & params) {
|
|
1008
|
-
params.ctx_shift = false;
|
|
1009
|
-
}
|
|
1010
|
-
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY}).set_env("LLAMA_ARG_NO_CONTEXT_SHIFT"));
|
|
1155
|
+
).set_env("LLAMA_ARG_KV_UNIFIED").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_PERPLEXITY}));
|
|
1011
1156
|
add_opt(common_arg(
|
|
1012
1157
|
{"--context-shift"},
|
|
1013
|
-
|
|
1014
|
-
|
|
1015
|
-
|
|
1158
|
+
{"--no-context-shift"},
|
|
1159
|
+
string_format("whether to use context shift on infinite text generation (default: %s)", params.ctx_shift ? "enabled" : "disabled"),
|
|
1160
|
+
[](common_params & params, bool value) {
|
|
1161
|
+
params.ctx_shift = value;
|
|
1016
1162
|
}
|
|
1017
|
-
).set_examples({
|
|
1163
|
+
).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY}).set_env("LLAMA_ARG_CONTEXT_SHIFT"));
|
|
1018
1164
|
add_opt(common_arg(
|
|
1019
1165
|
{"--chunks"}, "N",
|
|
1020
1166
|
string_format("max number of chunks to process (default: %d, -1 = all)", params.n_chunks),
|
|
@@ -1050,15 +1196,24 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
1050
1196
|
[](common_params & params, const std::string & value) {
|
|
1051
1197
|
params.system_prompt = value;
|
|
1052
1198
|
}
|
|
1053
|
-
).set_examples({
|
|
1199
|
+
).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_DIFFUSION, LLAMA_EXAMPLE_MTMD}));
|
|
1054
1200
|
add_opt(common_arg(
|
|
1201
|
+
{"--perf"},
|
|
1055
1202
|
{"--no-perf"},
|
|
1056
|
-
string_format("
|
|
1057
|
-
[](common_params & params) {
|
|
1058
|
-
params.no_perf =
|
|
1059
|
-
params.sampling.no_perf =
|
|
1203
|
+
string_format("whether to enable internal libllama performance timings (default: %s)", params.no_perf ? "true" : "false"),
|
|
1204
|
+
[](common_params & params, bool value) {
|
|
1205
|
+
params.no_perf = !value;
|
|
1206
|
+
params.sampling.no_perf = !value;
|
|
1060
1207
|
}
|
|
1061
|
-
).set_env("
|
|
1208
|
+
).set_env("LLAMA_ARG_PERF"));
|
|
1209
|
+
add_opt(common_arg(
|
|
1210
|
+
{"--show-timings"},
|
|
1211
|
+
{"--no-show-timings"},
|
|
1212
|
+
string_format("whether to show timing information after each response (default: %s)", params.show_timings ? "true" : "false"),
|
|
1213
|
+
[](common_params & params, bool value) {
|
|
1214
|
+
params.show_timings = value;
|
|
1215
|
+
}
|
|
1216
|
+
).set_examples({LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_SHOW_TIMINGS"));
|
|
1062
1217
|
add_opt(common_arg(
|
|
1063
1218
|
{"-f", "--file"}, "FNAME",
|
|
1064
1219
|
"a file containing the prompt (default: none)",
|
|
@@ -1080,16 +1235,18 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
1080
1235
|
params.system_prompt.pop_back();
|
|
1081
1236
|
}
|
|
1082
1237
|
}
|
|
1083
|
-
).set_examples({
|
|
1238
|
+
).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_DIFFUSION}));
|
|
1084
1239
|
add_opt(common_arg(
|
|
1085
1240
|
{"--in-file"}, "FNAME",
|
|
1086
|
-
"an input file (
|
|
1241
|
+
"an input file (use comma-separated values to specify multiple files)",
|
|
1087
1242
|
[](common_params & params, const std::string & value) {
|
|
1088
|
-
std::
|
|
1089
|
-
|
|
1090
|
-
|
|
1243
|
+
for (const auto & item : string_split<std::string>(value, ',')) {
|
|
1244
|
+
std::ifstream file(item);
|
|
1245
|
+
if (!file) {
|
|
1246
|
+
throw std::runtime_error(string_format("error: failed to open file '%s'\n", item.c_str()));
|
|
1247
|
+
}
|
|
1248
|
+
params.in_files.push_back(item);
|
|
1091
1249
|
}
|
|
1092
|
-
params.in_files.push_back(value);
|
|
1093
1250
|
}
|
|
1094
1251
|
).set_examples({LLAMA_EXAMPLE_IMATRIX}));
|
|
1095
1252
|
add_opt(common_arg(
|
|
@@ -1110,16 +1267,10 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
1110
1267
|
).set_excludes({LLAMA_EXAMPLE_SERVER}));
|
|
1111
1268
|
add_opt(common_arg(
|
|
1112
1269
|
{"-e", "--escape"},
|
|
1113
|
-
string_format("process escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\) (default: %s)", params.escape ? "true" : "false"),
|
|
1114
|
-
[](common_params & params) {
|
|
1115
|
-
params.escape = true;
|
|
1116
|
-
}
|
|
1117
|
-
));
|
|
1118
|
-
add_opt(common_arg(
|
|
1119
1270
|
{"--no-escape"},
|
|
1120
|
-
"
|
|
1121
|
-
[](common_params & params) {
|
|
1122
|
-
params.escape =
|
|
1271
|
+
string_format("whether to process escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\) (default: %s)", params.escape ? "true" : "false"),
|
|
1272
|
+
[](common_params & params, bool value) {
|
|
1273
|
+
params.escape = value;
|
|
1123
1274
|
}
|
|
1124
1275
|
));
|
|
1125
1276
|
add_opt(common_arg(
|
|
@@ -1128,59 +1279,53 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
1128
1279
|
[](common_params & params, int value) {
|
|
1129
1280
|
params.n_print = value;
|
|
1130
1281
|
}
|
|
1131
|
-
).set_examples({
|
|
1282
|
+
).set_examples({LLAMA_EXAMPLE_COMPLETION}));
|
|
1132
1283
|
add_opt(common_arg(
|
|
1133
1284
|
{"--prompt-cache"}, "FNAME",
|
|
1134
1285
|
"file to cache prompt state for faster startup (default: none)",
|
|
1135
1286
|
[](common_params & params, const std::string & value) {
|
|
1136
1287
|
params.path_prompt_cache = value;
|
|
1137
1288
|
}
|
|
1138
|
-
).set_examples({
|
|
1289
|
+
).set_examples({LLAMA_EXAMPLE_COMPLETION}));
|
|
1139
1290
|
add_opt(common_arg(
|
|
1140
1291
|
{"--prompt-cache-all"},
|
|
1141
1292
|
"if specified, saves user input and generations to cache as well\n",
|
|
1142
1293
|
[](common_params & params) {
|
|
1143
1294
|
params.prompt_cache_all = true;
|
|
1144
1295
|
}
|
|
1145
|
-
).set_examples({
|
|
1296
|
+
).set_examples({LLAMA_EXAMPLE_COMPLETION}));
|
|
1146
1297
|
add_opt(common_arg(
|
|
1147
1298
|
{"--prompt-cache-ro"},
|
|
1148
1299
|
"if specified, uses the prompt cache but does not update it",
|
|
1149
1300
|
[](common_params & params) {
|
|
1150
1301
|
params.prompt_cache_ro = true;
|
|
1151
1302
|
}
|
|
1152
|
-
).set_examples({
|
|
1303
|
+
).set_examples({LLAMA_EXAMPLE_COMPLETION}));
|
|
1153
1304
|
add_opt(common_arg(
|
|
1154
1305
|
{"-r", "--reverse-prompt"}, "PROMPT",
|
|
1155
1306
|
"halt generation at PROMPT, return control in interactive mode\n",
|
|
1156
1307
|
[](common_params & params, const std::string & value) {
|
|
1157
1308
|
params.antiprompt.emplace_back(value);
|
|
1158
1309
|
}
|
|
1159
|
-
).set_examples({
|
|
1310
|
+
).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER}));
|
|
1160
1311
|
add_opt(common_arg(
|
|
1161
1312
|
{"-sp", "--special"},
|
|
1162
1313
|
string_format("special tokens output enabled (default: %s)", params.special ? "true" : "false"),
|
|
1163
1314
|
[](common_params & params) {
|
|
1164
1315
|
params.special = true;
|
|
1165
1316
|
}
|
|
1166
|
-
).set_examples({
|
|
1317
|
+
).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER}));
|
|
1167
1318
|
add_opt(common_arg(
|
|
1168
1319
|
{"-cnv", "--conversation"},
|
|
1169
|
-
"
|
|
1320
|
+
{"-no-cnv", "--no-conversation"},
|
|
1321
|
+
"whether to run in conversation mode:\n"
|
|
1170
1322
|
"- does not print special tokens and suffix/prefix\n"
|
|
1171
1323
|
"- interactive mode is also enabled\n"
|
|
1172
1324
|
"(default: auto enabled if chat template is available)",
|
|
1173
|
-
[](common_params & params) {
|
|
1174
|
-
params.conversation_mode = COMMON_CONVERSATION_MODE_ENABLED;
|
|
1325
|
+
[](common_params & params, bool value) {
|
|
1326
|
+
params.conversation_mode = value ? COMMON_CONVERSATION_MODE_ENABLED : COMMON_CONVERSATION_MODE_DISABLED;
|
|
1175
1327
|
}
|
|
1176
|
-
).set_examples({
|
|
1177
|
-
add_opt(common_arg(
|
|
1178
|
-
{"-no-cnv", "--no-conversation"},
|
|
1179
|
-
"force disable conversation mode (default: false)",
|
|
1180
|
-
[](common_params & params) {
|
|
1181
|
-
params.conversation_mode = COMMON_CONVERSATION_MODE_DISABLED;
|
|
1182
|
-
}
|
|
1183
|
-
).set_examples({LLAMA_EXAMPLE_MAIN}));
|
|
1328
|
+
).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}));
|
|
1184
1329
|
add_opt(common_arg(
|
|
1185
1330
|
{"-st", "--single-turn"},
|
|
1186
1331
|
"run conversation for a single turn only, then exit when done\n"
|
|
@@ -1189,28 +1334,28 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
1189
1334
|
[](common_params & params) {
|
|
1190
1335
|
params.single_turn = true;
|
|
1191
1336
|
}
|
|
1192
|
-
).set_examples({
|
|
1337
|
+
).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}));
|
|
1193
1338
|
add_opt(common_arg(
|
|
1194
1339
|
{"-i", "--interactive"},
|
|
1195
1340
|
string_format("run in interactive mode (default: %s)", params.interactive ? "true" : "false"),
|
|
1196
1341
|
[](common_params & params) {
|
|
1197
1342
|
params.interactive = true;
|
|
1198
1343
|
}
|
|
1199
|
-
).set_examples({
|
|
1344
|
+
).set_examples({LLAMA_EXAMPLE_COMPLETION}));
|
|
1200
1345
|
add_opt(common_arg(
|
|
1201
1346
|
{"-if", "--interactive-first"},
|
|
1202
1347
|
string_format("run in interactive mode and wait for input right away (default: %s)", params.interactive_first ? "true" : "false"),
|
|
1203
1348
|
[](common_params & params) {
|
|
1204
1349
|
params.interactive_first = true;
|
|
1205
1350
|
}
|
|
1206
|
-
).set_examples({
|
|
1351
|
+
).set_examples({LLAMA_EXAMPLE_COMPLETION}));
|
|
1207
1352
|
add_opt(common_arg(
|
|
1208
1353
|
{"-mli", "--multiline-input"},
|
|
1209
1354
|
"allows you to write or paste multiple lines without ending each in '\\'",
|
|
1210
1355
|
[](common_params & params) {
|
|
1211
1356
|
params.multiline_input = true;
|
|
1212
1357
|
}
|
|
1213
|
-
).set_examples({
|
|
1358
|
+
).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}));
|
|
1214
1359
|
add_opt(common_arg(
|
|
1215
1360
|
{"--in-prefix-bos"},
|
|
1216
1361
|
"prefix BOS to user inputs, preceding the `--in-prefix` string",
|
|
@@ -1218,7 +1363,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
1218
1363
|
params.input_prefix_bos = true;
|
|
1219
1364
|
params.enable_chat_template = false;
|
|
1220
1365
|
}
|
|
1221
|
-
).set_examples({
|
|
1366
|
+
).set_examples({LLAMA_EXAMPLE_COMPLETION}));
|
|
1222
1367
|
add_opt(common_arg(
|
|
1223
1368
|
{"--in-prefix"}, "STRING",
|
|
1224
1369
|
"string to prefix user inputs with (default: empty)",
|
|
@@ -1226,7 +1371,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
1226
1371
|
params.input_prefix = value;
|
|
1227
1372
|
params.enable_chat_template = false;
|
|
1228
1373
|
}
|
|
1229
|
-
).set_examples({
|
|
1374
|
+
).set_examples({LLAMA_EXAMPLE_COMPLETION}));
|
|
1230
1375
|
add_opt(common_arg(
|
|
1231
1376
|
{"--in-suffix"}, "STRING",
|
|
1232
1377
|
"string to suffix after user inputs with (default: empty)",
|
|
@@ -1234,14 +1379,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
1234
1379
|
params.input_suffix = value;
|
|
1235
1380
|
params.enable_chat_template = false;
|
|
1236
1381
|
}
|
|
1237
|
-
).set_examples({
|
|
1382
|
+
).set_examples({LLAMA_EXAMPLE_COMPLETION}));
|
|
1238
1383
|
add_opt(common_arg(
|
|
1384
|
+
{"--warmup"},
|
|
1239
1385
|
{"--no-warmup"},
|
|
1240
|
-
"
|
|
1241
|
-
[](common_params & params) {
|
|
1242
|
-
params.warmup =
|
|
1386
|
+
string_format("whether to perform warmup with an empty run (default: %s)", params.warmup ? "enabled" : "disabled"),
|
|
1387
|
+
[](common_params & params, bool value) {
|
|
1388
|
+
params.warmup = value;
|
|
1243
1389
|
}
|
|
1244
|
-
).set_examples({
|
|
1390
|
+
).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MTMD, LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_PERPLEXITY}));
|
|
1245
1391
|
add_opt(common_arg(
|
|
1246
1392
|
{"--spm-infill"},
|
|
1247
1393
|
string_format(
|
|
@@ -1298,7 +1444,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
1298
1444
|
params.sampling.top_k = value;
|
|
1299
1445
|
params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_TOP_K;
|
|
1300
1446
|
}
|
|
1301
|
-
).set_sparam());
|
|
1447
|
+
).set_sparam().set_env("LLAMA_ARG_TOP_K"));
|
|
1302
1448
|
add_opt(common_arg(
|
|
1303
1449
|
{"--top-p"}, "N",
|
|
1304
1450
|
string_format("top-p sampling (default: %.1f, 1.0 = disabled)", (double)params.sampling.top_p),
|
|
@@ -1632,28 +1778,30 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
1632
1778
|
[](common_params & params, int value) {
|
|
1633
1779
|
params.grp_attn_n = value;
|
|
1634
1780
|
}
|
|
1635
|
-
).set_env("LLAMA_ARG_GRP_ATTN_N").set_examples({
|
|
1781
|
+
).set_env("LLAMA_ARG_GRP_ATTN_N").set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_PASSKEY}));
|
|
1636
1782
|
add_opt(common_arg(
|
|
1637
1783
|
{"-gaw", "--grp-attn-w"}, "N",
|
|
1638
1784
|
string_format("group-attention width (default: %d)", params.grp_attn_w),
|
|
1639
1785
|
[](common_params & params, int value) {
|
|
1640
1786
|
params.grp_attn_w = value;
|
|
1641
1787
|
}
|
|
1642
|
-
).set_env("LLAMA_ARG_GRP_ATTN_W").set_examples({
|
|
1788
|
+
).set_env("LLAMA_ARG_GRP_ATTN_W").set_examples({LLAMA_EXAMPLE_COMPLETION}));
|
|
1643
1789
|
add_opt(common_arg(
|
|
1790
|
+
{"-kvo", "--kv-offload"},
|
|
1644
1791
|
{"-nkvo", "--no-kv-offload"},
|
|
1645
|
-
"
|
|
1646
|
-
[](common_params & params) {
|
|
1647
|
-
params.no_kv_offload =
|
|
1792
|
+
string_format("whether to enable KV cache offloading (default: %s)", params.no_kv_offload ? "disabled" : "enabled"),
|
|
1793
|
+
[](common_params & params, bool value) {
|
|
1794
|
+
params.no_kv_offload = !value;
|
|
1648
1795
|
}
|
|
1649
|
-
).set_env("
|
|
1796
|
+
).set_env("LLAMA_ARG_KV_OFFLOAD"));
|
|
1650
1797
|
add_opt(common_arg(
|
|
1798
|
+
{"--repack"},
|
|
1651
1799
|
{"-nr", "--no-repack"},
|
|
1652
|
-
"
|
|
1653
|
-
[](common_params & params) {
|
|
1654
|
-
params.no_extra_bufts =
|
|
1800
|
+
string_format("whether to enable weight repacking (default: %s)", params.no_extra_bufts ? "disabled" : "enabled"),
|
|
1801
|
+
[](common_params & params, bool value) {
|
|
1802
|
+
params.no_extra_bufts = !value;
|
|
1655
1803
|
}
|
|
1656
|
-
).set_env("
|
|
1804
|
+
).set_env("LLAMA_ARG_REPACK"));
|
|
1657
1805
|
add_opt(common_arg(
|
|
1658
1806
|
{"--no-host"},
|
|
1659
1807
|
"bypass host buffer allowing extra buffers to be used",
|
|
@@ -1766,13 +1914,27 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
1766
1914
|
LOG_WRN("DEPRECATED: --defrag-thold is deprecated and no longer necessary to specify\n");
|
|
1767
1915
|
}
|
|
1768
1916
|
).set_env("LLAMA_ARG_DEFRAG_THOLD"));
|
|
1769
|
-
|
|
1770
|
-
|
|
1771
|
-
|
|
1772
|
-
|
|
1773
|
-
params.n_parallel
|
|
1774
|
-
|
|
1775
|
-
|
|
1917
|
+
if (ex == LLAMA_EXAMPLE_SERVER) {
|
|
1918
|
+
// this is to make sure this option appears in the server-specific section of the help message
|
|
1919
|
+
add_opt(common_arg(
|
|
1920
|
+
{"-np", "--parallel"}, "N",
|
|
1921
|
+
string_format("number of server slots (default: %d, -1 = auto)", params.n_parallel),
|
|
1922
|
+
[](common_params & params, int value) {
|
|
1923
|
+
if (value == 0) {
|
|
1924
|
+
throw std::invalid_argument("error: invalid value for n_parallel\n");
|
|
1925
|
+
}
|
|
1926
|
+
params.n_parallel = value;
|
|
1927
|
+
}
|
|
1928
|
+
).set_env("LLAMA_ARG_N_PARALLEL").set_examples({LLAMA_EXAMPLE_SERVER}));
|
|
1929
|
+
} else {
|
|
1930
|
+
add_opt(common_arg(
|
|
1931
|
+
{"-np", "--parallel"}, "N",
|
|
1932
|
+
string_format("number of parallel sequences to decode (default: %d)", params.n_parallel),
|
|
1933
|
+
[](common_params & params, int value) {
|
|
1934
|
+
params.n_parallel = value;
|
|
1935
|
+
}
|
|
1936
|
+
).set_env("LLAMA_ARG_N_PARALLEL"));
|
|
1937
|
+
}
|
|
1776
1938
|
add_opt(common_arg(
|
|
1777
1939
|
{"-ns", "--sequences"}, "N",
|
|
1778
1940
|
string_format("number of sequences to decode (default: %d)", params.n_sequences),
|
|
@@ -1782,20 +1944,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
1782
1944
|
).set_examples({LLAMA_EXAMPLE_PARALLEL}));
|
|
1783
1945
|
add_opt(common_arg(
|
|
1784
1946
|
{"-cb", "--cont-batching"},
|
|
1785
|
-
string_format("enable continuous batching (a.k.a dynamic batching) (default: %s)", params.cont_batching ? "enabled" : "disabled"),
|
|
1786
|
-
[](common_params & params) {
|
|
1787
|
-
params.cont_batching = true;
|
|
1788
|
-
}
|
|
1789
|
-
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CONT_BATCHING"));
|
|
1790
|
-
add_opt(common_arg(
|
|
1791
1947
|
{"-nocb", "--no-cont-batching"},
|
|
1792
|
-
"
|
|
1793
|
-
[](common_params & params) {
|
|
1794
|
-
params.cont_batching =
|
|
1948
|
+
string_format("whether to enable continuous batching (a.k.a dynamic batching) (default: %s)", params.cont_batching ? "enabled" : "disabled"),
|
|
1949
|
+
[](common_params & params, bool value) {
|
|
1950
|
+
params.cont_batching = value;
|
|
1795
1951
|
}
|
|
1796
|
-
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("
|
|
1952
|
+
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CONT_BATCHING"));
|
|
1797
1953
|
add_opt(common_arg(
|
|
1798
|
-
{"--mmproj"}, "FILE",
|
|
1954
|
+
{"-mm", "--mmproj"}, "FILE",
|
|
1799
1955
|
"path to a multimodal projector file. see tools/mtmd/README.md\n"
|
|
1800
1956
|
"note: if -hf is used, this argument can be omitted",
|
|
1801
1957
|
[](common_params & params, const std::string & value) {
|
|
@@ -1803,33 +1959,37 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
1803
1959
|
}
|
|
1804
1960
|
).set_examples(mmproj_examples).set_env("LLAMA_ARG_MMPROJ"));
|
|
1805
1961
|
add_opt(common_arg(
|
|
1806
|
-
{"--mmproj-url"}, "URL",
|
|
1962
|
+
{"-mmu", "--mmproj-url"}, "URL",
|
|
1807
1963
|
"URL to a multimodal projector file. see tools/mtmd/README.md",
|
|
1808
1964
|
[](common_params & params, const std::string & value) {
|
|
1809
1965
|
params.mmproj.url = value;
|
|
1810
1966
|
}
|
|
1811
1967
|
).set_examples(mmproj_examples).set_env("LLAMA_ARG_MMPROJ_URL"));
|
|
1812
1968
|
add_opt(common_arg(
|
|
1813
|
-
{"--
|
|
1814
|
-
"
|
|
1815
|
-
|
|
1816
|
-
|
|
1969
|
+
{"--mmproj-auto"},
|
|
1970
|
+
{"--no-mmproj", "--no-mmproj-auto"},
|
|
1971
|
+
string_format("whether to use multimodal projector file (if available), useful when using -hf (default: %s)", params.no_mmproj ? "disabled" : "enabled"),
|
|
1972
|
+
[](common_params & params, bool value) {
|
|
1973
|
+
params.no_mmproj = !value;
|
|
1817
1974
|
}
|
|
1818
|
-
).set_examples(mmproj_examples).set_env("
|
|
1975
|
+
).set_examples(mmproj_examples).set_env("LLAMA_ARG_MMPROJ_AUTO"));
|
|
1819
1976
|
add_opt(common_arg(
|
|
1977
|
+
{"--mmproj-offload"},
|
|
1820
1978
|
{"--no-mmproj-offload"},
|
|
1821
|
-
"
|
|
1822
|
-
[](common_params & params) {
|
|
1823
|
-
params.mmproj_use_gpu =
|
|
1979
|
+
string_format("whether to enable GPU offloading for multimodal projector (default: %s)", params.mmproj_use_gpu ? "enabled" : "disabled"),
|
|
1980
|
+
[](common_params & params, bool value) {
|
|
1981
|
+
params.mmproj_use_gpu = value;
|
|
1824
1982
|
}
|
|
1825
|
-
).set_examples(mmproj_examples).set_env("
|
|
1983
|
+
).set_examples(mmproj_examples).set_env("LLAMA_ARG_MMPROJ_OFFLOAD"));
|
|
1826
1984
|
add_opt(common_arg(
|
|
1827
1985
|
{"--image", "--audio"}, "FILE",
|
|
1828
|
-
"path to an image or audio file. use with multimodal models,
|
|
1986
|
+
"path to an image or audio file. use with multimodal models, use comma-separated values for multiple files\n",
|
|
1829
1987
|
[](common_params & params, const std::string & value) {
|
|
1830
|
-
|
|
1988
|
+
for (const auto & item : string_split<std::string>(value, ',')) {
|
|
1989
|
+
params.image.emplace_back(item);
|
|
1990
|
+
}
|
|
1831
1991
|
}
|
|
1832
|
-
).set_examples({LLAMA_EXAMPLE_MTMD}));
|
|
1992
|
+
).set_examples({LLAMA_EXAMPLE_MTMD, LLAMA_EXAMPLE_CLI}));
|
|
1833
1993
|
add_opt(common_arg(
|
|
1834
1994
|
{"--image-min-tokens"}, "N",
|
|
1835
1995
|
"minimum number of tokens each image can take, only used by vision models with dynamic resolution (default: read from model)",
|
|
@@ -1862,12 +2022,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
1862
2022
|
}
|
|
1863
2023
|
).set_env("LLAMA_ARG_MLOCK"));
|
|
1864
2024
|
add_opt(common_arg(
|
|
2025
|
+
{"--mmap"},
|
|
1865
2026
|
{"--no-mmap"},
|
|
1866
|
-
"
|
|
1867
|
-
[](common_params & params) {
|
|
1868
|
-
params.use_mmap =
|
|
2027
|
+
string_format("whether to memory-map model (if disabled, slower load but may reduce pageouts if not using mlock) (default: %s)", params.use_mmap ? "enabled" : "disabled"),
|
|
2028
|
+
[](common_params & params, bool value) {
|
|
2029
|
+
params.use_mmap = value;
|
|
1869
2030
|
}
|
|
1870
|
-
).set_env("
|
|
2031
|
+
).set_env("LLAMA_ARG_MMAP"));
|
|
1871
2032
|
add_opt(common_arg(
|
|
1872
2033
|
{"--numa"}, "TYPE",
|
|
1873
2034
|
"attempt optimizations that help on some NUMA systems\n"
|
|
@@ -1922,7 +2083,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
1922
2083
|
"override tensor buffer type for draft model", [](common_params & params, const std::string & value) {
|
|
1923
2084
|
parse_tensor_buffer_overrides(value, params.speculative.tensor_buft_overrides);
|
|
1924
2085
|
}
|
|
1925
|
-
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
|
|
2086
|
+
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
|
|
1926
2087
|
add_opt(common_arg(
|
|
1927
2088
|
{"--cpu-moe", "-cmoe"},
|
|
1928
2089
|
"keep all Mixture of Experts (MoE) weights in the CPU",
|
|
@@ -1951,7 +2112,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
1951
2112
|
[](common_params & params) {
|
|
1952
2113
|
params.speculative.tensor_buft_overrides.push_back(llm_ffn_exps_cpu_override());
|
|
1953
2114
|
}
|
|
1954
|
-
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CPU_MOE_DRAFT"));
|
|
2115
|
+
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_CPU_MOE_DRAFT"));
|
|
1955
2116
|
add_opt(common_arg(
|
|
1956
2117
|
{"--n-cpu-moe-draft", "-ncmoed"}, "N",
|
|
1957
2118
|
"keep the Mixture of Experts (MoE) weights of the first N layers in the CPU for the draft model",
|
|
@@ -1965,7 +2126,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
1965
2126
|
params.speculative.tensor_buft_overrides.push_back({buft_overrides_draft.back().c_str(), ggml_backend_cpu_buffer_type()});
|
|
1966
2127
|
}
|
|
1967
2128
|
}
|
|
1968
|
-
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_N_CPU_MOE_DRAFT"));
|
|
2129
|
+
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_N_CPU_MOE_DRAFT"));
|
|
1969
2130
|
add_opt(common_arg(
|
|
1970
2131
|
{"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N",
|
|
1971
2132
|
string_format("max. number of layers to store in VRAM (default: %d)", params.n_gpu_layers),
|
|
@@ -2037,6 +2198,34 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
2037
2198
|
}
|
|
2038
2199
|
}
|
|
2039
2200
|
).set_env("LLAMA_ARG_MAIN_GPU"));
|
|
2201
|
+
add_opt(common_arg(
|
|
2202
|
+
{ "-fit", "--fit" }, "[on|off]",
|
|
2203
|
+
string_format("whether to adjust unset arguments to fit in device memory ('on' or 'off', default: '%s')", params.fit_params ? "on" : "off"),
|
|
2204
|
+
[](common_params & params, const std::string & value) {
|
|
2205
|
+
if (is_truthy(value)) {
|
|
2206
|
+
params.fit_params = true;
|
|
2207
|
+
} else if (is_falsey(value)) {
|
|
2208
|
+
params.fit_params = false;
|
|
2209
|
+
} else {
|
|
2210
|
+
throw std::runtime_error(
|
|
2211
|
+
string_format("error: unkown value for --fit: '%s'\n", value.c_str()));
|
|
2212
|
+
}
|
|
2213
|
+
}
|
|
2214
|
+
).set_env("LLAMA_ARG_FIT"));
|
|
2215
|
+
add_opt(common_arg(
|
|
2216
|
+
{ "-fitt", "--fit-target" }, "MiB",
|
|
2217
|
+
string_format("target margin per device for --fit option, default: %zu", params.fit_params_target/(1024*1024)),
|
|
2218
|
+
[](common_params & params, int value) {
|
|
2219
|
+
params.fit_params_target = value * size_t(1024*1024);
|
|
2220
|
+
}
|
|
2221
|
+
).set_env("LLAMA_ARG_FIT_TARGET"));
|
|
2222
|
+
add_opt(common_arg(
|
|
2223
|
+
{ "-fitc", "--fit-ctx" }, "N",
|
|
2224
|
+
string_format("minimum ctx size that can be set by --fit option, default: %" PRIu32, params.fit_params_min_ctx),
|
|
2225
|
+
[](common_params & params, int value) {
|
|
2226
|
+
params.fit_params_min_ctx = value;
|
|
2227
|
+
}
|
|
2228
|
+
).set_env("LLAMA_ARG_FIT_CTX"));
|
|
2040
2229
|
add_opt(common_arg(
|
|
2041
2230
|
{"--check-tensors"},
|
|
2042
2231
|
string_format("check model tensor data for invalid values (default: %s)", params.check_tensors ? "true" : "false"),
|
|
@@ -2045,51 +2234,96 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
2045
2234
|
}
|
|
2046
2235
|
));
|
|
2047
2236
|
add_opt(common_arg(
|
|
2048
|
-
{"--override-kv"}, "KEY=TYPE:VALUE",
|
|
2049
|
-
"advanced option to override model metadata by key.
|
|
2050
|
-
"types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false",
|
|
2237
|
+
{"--override-kv"}, "KEY=TYPE:VALUE,...",
|
|
2238
|
+
"advanced option to override model metadata by key. to specify multiple overrides, either use comma-separated or repeat this argument.\n"
|
|
2239
|
+
"types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false,tokenizer.ggml.add_eos_token=bool:false",
|
|
2051
2240
|
[](common_params & params, const std::string & value) {
|
|
2052
|
-
|
|
2053
|
-
|
|
2241
|
+
std::vector<std::string> kv_overrides;
|
|
2242
|
+
|
|
2243
|
+
std::string current;
|
|
2244
|
+
bool escaping = false;
|
|
2245
|
+
|
|
2246
|
+
for (const char c : value) {
|
|
2247
|
+
if (escaping) {
|
|
2248
|
+
current.push_back(c);
|
|
2249
|
+
escaping = false;
|
|
2250
|
+
} else if (c == '\\') {
|
|
2251
|
+
escaping = true;
|
|
2252
|
+
} else if (c == ',') {
|
|
2253
|
+
kv_overrides.push_back(current);
|
|
2254
|
+
current.clear();
|
|
2255
|
+
} else {
|
|
2256
|
+
current.push_back(c);
|
|
2257
|
+
}
|
|
2258
|
+
}
|
|
2259
|
+
|
|
2260
|
+
if (escaping) {
|
|
2261
|
+
current.push_back('\\');
|
|
2262
|
+
}
|
|
2263
|
+
|
|
2264
|
+
kv_overrides.push_back(current);
|
|
2265
|
+
|
|
2266
|
+
for (const auto & kv_override : kv_overrides) {
|
|
2267
|
+
if (!string_parse_kv_override(kv_override.c_str(), params.kv_overrides)) {
|
|
2268
|
+
throw std::runtime_error(string_format("error: Invalid type for KV override: %s\n", kv_override.c_str()));
|
|
2269
|
+
}
|
|
2054
2270
|
}
|
|
2055
2271
|
}
|
|
2056
2272
|
));
|
|
2057
2273
|
add_opt(common_arg(
|
|
2274
|
+
{"--op-offload"},
|
|
2058
2275
|
{"--no-op-offload"},
|
|
2059
|
-
string_format("
|
|
2060
|
-
[](common_params & params) {
|
|
2061
|
-
params.no_op_offload =
|
|
2276
|
+
string_format("whether to offload host tensor operations to device (default: %s)", params.no_op_offload ? "false" : "true"),
|
|
2277
|
+
[](common_params & params, bool value) {
|
|
2278
|
+
params.no_op_offload = !value;
|
|
2062
2279
|
}
|
|
2063
2280
|
));
|
|
2064
2281
|
add_opt(common_arg(
|
|
2065
2282
|
{"--lora"}, "FNAME",
|
|
2066
|
-
"path to LoRA adapter (
|
|
2283
|
+
"path to LoRA adapter (use comma-separated values to load multiple adapters)",
|
|
2067
2284
|
[](common_params & params, const std::string & value) {
|
|
2068
|
-
|
|
2285
|
+
for (const auto & item : string_split<std::string>(value, ',')) {
|
|
2286
|
+
params.lora_adapters.push_back({ item, 1.0, "", "", nullptr });
|
|
2287
|
+
}
|
|
2069
2288
|
}
|
|
2070
2289
|
// we define this arg on both COMMON and EXPORT_LORA, so when showing help message of export-lora, it will be categorized as "example-specific" arg
|
|
2071
2290
|
).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}));
|
|
2072
2291
|
add_opt(common_arg(
|
|
2073
|
-
{"--lora-scaled"}, "FNAME
|
|
2074
|
-
"path to LoRA adapter with user defined scaling (
|
|
2075
|
-
|
|
2076
|
-
|
|
2292
|
+
{"--lora-scaled"}, "FNAME:SCALE,...",
|
|
2293
|
+
"path to LoRA adapter with user defined scaling (format: FNAME:SCALE,...)\n"
|
|
2294
|
+
"note: use comma-separated values",
|
|
2295
|
+
[](common_params & params, const std::string & value) {
|
|
2296
|
+
for (const auto & item : string_split<std::string>(value, ',')) {
|
|
2297
|
+
auto parts = string_split<std::string>(item, ':');
|
|
2298
|
+
if (parts.size() != 2) {
|
|
2299
|
+
throw std::invalid_argument("lora-scaled format: FNAME:SCALE");
|
|
2300
|
+
}
|
|
2301
|
+
params.lora_adapters.push_back({ parts[0], std::stof(parts[1]), "", "", nullptr });
|
|
2302
|
+
}
|
|
2077
2303
|
}
|
|
2078
2304
|
// we define this arg on both COMMON and EXPORT_LORA, so when showing help message of export-lora, it will be categorized as "example-specific" arg
|
|
2079
2305
|
).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}));
|
|
2080
2306
|
add_opt(common_arg(
|
|
2081
2307
|
{"--control-vector"}, "FNAME",
|
|
2082
|
-
"add a control vector\nnote:
|
|
2308
|
+
"add a control vector\nnote: use comma-separated values to add multiple control vectors",
|
|
2083
2309
|
[](common_params & params, const std::string & value) {
|
|
2084
|
-
|
|
2310
|
+
for (const auto & item : string_split<std::string>(value, ',')) {
|
|
2311
|
+
params.control_vectors.push_back({ 1.0f, item, });
|
|
2312
|
+
}
|
|
2085
2313
|
}
|
|
2086
2314
|
));
|
|
2087
2315
|
add_opt(common_arg(
|
|
2088
|
-
{"--control-vector-scaled"}, "FNAME
|
|
2316
|
+
{"--control-vector-scaled"}, "FNAME:SCALE,...",
|
|
2089
2317
|
"add a control vector with user defined scaling SCALE\n"
|
|
2090
|
-
"note:
|
|
2091
|
-
[](common_params & params, const std::string &
|
|
2092
|
-
|
|
2318
|
+
"note: use comma-separated values (format: FNAME:SCALE,...)",
|
|
2319
|
+
[](common_params & params, const std::string & value) {
|
|
2320
|
+
for (const auto & item : string_split<std::string>(value, ',')) {
|
|
2321
|
+
auto parts = string_split<std::string>(item, ':');
|
|
2322
|
+
if (parts.size() != 2) {
|
|
2323
|
+
throw std::invalid_argument("control-vector-scaled format: FNAME:SCALE");
|
|
2324
|
+
}
|
|
2325
|
+
params.control_vectors.push_back({ std::stof(parts[1]), parts[0] });
|
|
2326
|
+
}
|
|
2093
2327
|
}
|
|
2094
2328
|
));
|
|
2095
2329
|
add_opt(common_arg(
|
|
@@ -2179,13 +2413,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
2179
2413
|
).set_env("HF_TOKEN"));
|
|
2180
2414
|
add_opt(common_arg(
|
|
2181
2415
|
{"--context-file"}, "FNAME",
|
|
2182
|
-
"file to load context from (
|
|
2416
|
+
"file to load context from (use comma-separated values to specify multiple files)",
|
|
2183
2417
|
[](common_params & params, const std::string & value) {
|
|
2184
|
-
std::
|
|
2185
|
-
|
|
2186
|
-
|
|
2418
|
+
for (const auto & item : string_split<std::string>(value, ',')) {
|
|
2419
|
+
std::ifstream file(item, std::ios::binary);
|
|
2420
|
+
if (!file) {
|
|
2421
|
+
throw std::runtime_error(string_format("error: failed to open file '%s'\n", item.c_str()));
|
|
2422
|
+
}
|
|
2423
|
+
params.context_files.push_back(item);
|
|
2187
2424
|
}
|
|
2188
|
-
params.context_files.push_back(value);
|
|
2189
2425
|
}
|
|
2190
2426
|
).set_examples({LLAMA_EXAMPLE_RETRIEVAL}));
|
|
2191
2427
|
add_opt(common_arg(
|
|
@@ -2254,10 +2490,11 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
2254
2490
|
}
|
|
2255
2491
|
).set_examples({LLAMA_EXAMPLE_IMATRIX}));
|
|
2256
2492
|
add_opt(common_arg(
|
|
2493
|
+
{"--ppl"},
|
|
2257
2494
|
{"--no-ppl"},
|
|
2258
|
-
string_format("
|
|
2259
|
-
[](common_params & params) {
|
|
2260
|
-
params.compute_ppl =
|
|
2495
|
+
string_format("whether to compute perplexity (default: %s)", params.compute_ppl ? "true" : "false"),
|
|
2496
|
+
[](common_params & params, bool value) {
|
|
2497
|
+
params.compute_ppl = value;
|
|
2261
2498
|
}
|
|
2262
2499
|
).set_examples({LLAMA_EXAMPLE_IMATRIX}));
|
|
2263
2500
|
add_opt(common_arg(
|
|
@@ -2376,12 +2613,27 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     }
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_API_PREFIX"));
     add_opt(common_arg(
+        {"--webui-config"}, "JSON",
+        "JSON that provides default WebUI settings (overrides WebUI defaults)",
+        [](common_params & params, const std::string & value) {
+            params.webui_config_json = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_WEBUI_CONFIG"));
+    add_opt(common_arg(
+        {"--webui-config-file"}, "PATH",
+        "JSON file that provides default WebUI settings (overrides WebUI defaults)",
+        [](common_params & params, const std::string & value) {
+            params.webui_config_json = read_file(value);
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_WEBUI_CONFIG_FILE"));
+    add_opt(common_arg(
+        {"--webui"},
         {"--no-webui"},
-        string_format("
-        [](common_params & params) {
-            params.webui =
+        string_format("whether to enable the Web UI (default: %s)", params.webui ? "enabled" : "disabled"),
+        [](common_params & params, bool value) {
+            params.webui = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_WEBUI"));
     add_opt(common_arg(
         {"--embedding", "--embeddings"},
         string_format("restrict to only support embedding use case; use only with dedicated embedding models (default: %s)", params.embedding ? "enabled" : "disabled"),
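--webui-config takes the JSON inline, while --webui-config-file passes a file through read_file() into the same params.webui_config_json field. Below is a minimal sketch of reading a whole file into a string, assuming that is essentially what read_file() does here; the path in main is hypothetical.

```cpp
// Sketch only: read an entire file into a std::string, which is presumably what
// read_file() does for --webui-config-file before assigning params.webui_config_json.
#include <cstdio>
#include <fstream>
#include <sstream>
#include <stdexcept>
#include <string>

static std::string slurp_file(const std::string & path) {
    std::ifstream in(path, std::ios::binary);
    if (!in) {
        throw std::runtime_error("failed to open file: " + path);
    }
    std::ostringstream ss;
    ss << in.rdbuf(); // copy the whole stream into the buffer
    return ss.str();
}

int main() {
    try {
        // hypothetical path, for illustration only
        const std::string json = slurp_file("webui-config.json");
        return json.empty() ? 1 : 0;
    } catch (const std::exception & e) {
        std::fprintf(stderr, "%s\n", e.what());
        return 1;
    }
}
```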
@@ -2444,7 +2696,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
                 params.default_template_kwargs[item.key()] = item.value().dump();
             }
         }
-    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_CHAT_TEMPLATE_KWARGS"));
+    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_CHAT_TEMPLATE_KWARGS"));
     add_opt(common_arg(
         {"-to", "--timeout"}, "N",
         string_format("server read/write timeout in seconds (default: %d)", params.timeout_read),
@@ -2486,18 +2738,12 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_PROPS"));
     add_opt(common_arg(
         {"--slots"},
-        string_format("enable slots monitoring endpoint (default: %s)", params.endpoint_slots ? "enabled" : "disabled"),
-        [](common_params & params) {
-            params.endpoint_slots = true;
-        }
-    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_SLOTS"));
-    add_opt(common_arg(
         {"--no-slots"},
-        "
-        [](common_params & params) {
-            params.endpoint_slots =
+        string_format("expose slots monitoring endpoint (default: %s)", params.endpoint_slots ? "enabled" : "disabled"),
+        [](common_params & params, bool value) {
+            params.endpoint_slots = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_SLOTS"));
     add_opt(common_arg(
         {"--slot-save-path"}, "PATH",
         "path to save slot kv cache (default: disabled)",
@@ -2533,6 +2779,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.models_dir = value;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_DIR"));
+    add_opt(common_arg(
+        {"--models-preset"}, "PATH",
+        "path to INI file containing model presets for the router server (default: disabled)",
+        [](common_params & params, const std::string & value) {
+            params.models_preset = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_PRESET"));
     add_opt(common_arg(
         {"--models-max"}, "N",
         string_format("for router server, maximum number of models to load simultaneously (default: %d, 0 = unlimited)", params.models_max),
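--models-preset points the router server at an INI file of model presets; the actual schema is handled by the new common/preset.cpp, which is not part of this excerpt. The sketch below only illustrates generic INI section/key=value reading, and the file name and keys are hypothetical.

```cpp
// Sketch only: read INI-style "[section]" / "key = value" lines.
// The real preset schema lives in common/preset.cpp and is not shown in this diff;
// section and key names here are hypothetical.
#include <cstdio>
#include <fstream>
#include <map>
#include <string>

static std::string trim(const std::string & s) {
    const auto b = s.find_first_not_of(" \t\r\n");
    const auto e = s.find_last_not_of(" \t\r\n");
    return b == std::string::npos ? "" : s.substr(b, e - b + 1);
}

int main() {
    std::map<std::string, std::map<std::string, std::string>> presets;
    std::ifstream in("models-preset.ini"); // hypothetical path
    std::string line, section;
    while (std::getline(in, line)) {
        line = trim(line);
        if (line.empty() || line[0] == ';' || line[0] == '#') {
            continue; // skip blanks and comments
        }
        if (line.front() == '[' && line.back() == ']') {
            section = line.substr(1, line.size() - 2);
        } else if (const auto eq = line.find('='); eq != std::string::npos) {
            presets[section][trim(line.substr(0, eq))] = trim(line.substr(eq + 1));
        }
    }
    std::printf("loaded %zu preset section(s)\n", presets.size());
    return 0;
}
```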
@@ -2541,26 +2794,21 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     }
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_MAX"));
     add_opt(common_arg(
+        {"--models-autoload"},
         {"--no-models-autoload"},
-        "
-        [](common_params & params) {
-            params.models_autoload =
+        string_format("for router server, whether to automatically load models (default: %s)", params.models_autoload ? "enabled" : "disabled"),
+        [](common_params & params, bool value) {
+            params.models_autoload = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_AUTOLOAD"));
     add_opt(common_arg(
         {"--jinja"},
-        string_format("use jinja template for chat (default: %s)\n", params.use_jinja ? "enabled" : "disabled"),
-        [](common_params & params) {
-            params.use_jinja = true;
-        }
-    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_JINJA"));
-    add_opt(common_arg(
         {"--no-jinja"},
-        string_format("
-        [](common_params & params) {
-            params.use_jinja =
+        string_format("whether to use jinja template engine for chat (default: %s)", params.use_jinja ? "enabled" : "disabled"),
+        [](common_params & params, bool value) {
+            params.use_jinja = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_SERVER,
+    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_JINJA"));
     add_opt(common_arg(
         {"--reasoning-format"}, "FORMAT",
         "controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:\n"
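Most options touched in this diff also carry a set_env(...) name (LLAMA_ARG_JINJA, LLAMA_ARG_WEBUI, LLAMA_ARG_ENDPOINT_SLOTS, and so on), so the same setting can be supplied through the environment. The sketch below shows one way a boolean environment fallback could look; how llama.cpp actually interprets the variable's value is not shown in this excerpt, so the truthiness rule here is an assumption.

```cpp
// Sketch only: fall back to an environment variable when a flag was not given.
// The mapping of env values to true/false here is an assumption, not llama.cpp's rule.
#include <cstdio>
#include <cstdlib>
#include <string>

static bool env_flag(const char * name, bool current) {
    const char * v = std::getenv(name);
    if (v == nullptr) {
        return current; // variable unset: keep the existing value
    }
    const std::string s(v);
    return !(s.empty() || s == "0" || s == "false" || s == "off");
}

int main() {
    bool use_jinja = false;                              // default
    use_jinja = env_flag("LLAMA_ARG_JINJA", use_jinja);  // e.g. LLAMA_ARG_JINJA=1
    std::printf("use_jinja = %s\n", use_jinja ? "true" : "false");
    return 0;
}
```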
@@ -2571,7 +2819,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, const std::string & value) {
             params.reasoning_format = common_reasoning_format_from_name(value);
         }
-    ).set_examples({LLAMA_EXAMPLE_SERVER,
+    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_THINK"));
     add_opt(common_arg(
         {"--reasoning-budget"}, "N",
         "controls the amount of thinking allowed; currently only one of: -1 for unrestricted thinking budget, or 0 to disable thinking (default: -1)",
@@ -2579,7 +2827,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             if (value != 0 && value != -1) { throw std::invalid_argument("invalid value"); }
             params.reasoning_budget = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_SERVER,
+    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_THINK_BUDGET"));
     add_opt(common_arg(
         {"--chat-template"}, "JINJA_TEMPLATE",
         string_format(
@@ -2591,7 +2839,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, const std::string & value) {
             params.chat_template = value;
         }
-    ).set_examples({
+    ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_CHAT_TEMPLATE"));
     add_opt(common_arg(
         {"--chat-template-file"}, "JINJA_TEMPLATE_FILE",
         string_format(
@@ -2603,17 +2851,18 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, const std::string & value) {
             params.chat_template = read_file(value);
         }
-    ).set_examples({
+    ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CHAT_TEMPLATE_FILE"));
     add_opt(common_arg(
+        {"--prefill-assistant"},
         {"--no-prefill-assistant"},
         string_format(
             "whether to prefill the assistant's response if the last message is an assistant message (default: prefill enabled)\n"
             "when this flag is set, if the last message is an assistant message then it will be treated as a full message and not prefilled\n"
         ),
-        [](common_params & params) {
-            params.prefill_assistant =
+        [](common_params & params, bool value) {
+            params.prefill_assistant = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_PREFILL_ASSISTANT"));
     add_opt(common_arg(
         {"-sps", "--slot-prompt-similarity"}, "SIMILARITY",
         string_format("how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity),
@@ -2634,7 +2883,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params) {
             params.simple_io = true;
         }
-    ).set_examples({
+    ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}));
     add_opt(common_arg(
         {"--positive-file"}, "FNAME",
         string_format("positive prompts file, one prompt per line (default: '%s')", params.cvector_positive_file.c_str()),
@@ -2717,7 +2966,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         "Set verbosity level to infinity (i.e. log all messages, useful for debugging)",
         [](common_params & params) {
             params.verbosity = INT_MAX;
-            common_log_set_verbosity_thold(INT_MAX);
         }
     ));
     add_opt(common_arg(
@@ -2738,7 +2986,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         "(default: %d)\n", params.verbosity),
         [](common_params & params, int value) {
             params.verbosity = value;
-            common_log_set_verbosity_thold(value);
         }
     ).set_env("LLAMA_LOG_VERBOSITY"));
     add_opt(common_arg(
@@ -2871,14 +3118,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, int value) {
             params.speculative.n_max = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_MAX"));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_DRAFT_MAX"));
     add_opt(common_arg(
         {"--draft-min", "--draft-n-min"}, "N",
         string_format("minimum number of draft tokens to use for speculative decoding (default: %d)", params.speculative.n_min),
         [](common_params & params, int value) {
             params.speculative.n_min = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_MIN"));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_DRAFT_MIN"));
     add_opt(common_arg(
         {"--draft-p-split"}, "P",
         string_format("speculative decoding split probability (default: %.1f)", (double)params.speculative.p_split),
@@ -2892,14 +3139,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, const std::string & value) {
             params.speculative.p_min = std::stof(value);
         }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_P_MIN"));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_DRAFT_P_MIN"));
     add_opt(common_arg(
         {"-cd", "--ctx-size-draft"}, "N",
         string_format("size of the prompt context for the draft model (default: %d, 0 = loaded from model)", params.speculative.n_ctx),
         [](common_params & params, int value) {
             params.speculative.n_ctx = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CTX_SIZE_DRAFT"));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_CTX_SIZE_DRAFT"));
     add_opt(common_arg(
         {"-devd", "--device-draft"}, "<dev1,dev2,..>",
         "comma-separated list of devices to use for offloading the draft model (none = don't offload)\n"
@@ -2907,7 +3154,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, const std::string & value) {
             params.speculative.devices = parse_device_list(value);
         }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
     add_opt(common_arg(
         {"-ngld", "--gpu-layers-draft", "--n-gpu-layers-draft"}, "N",
         "number of layers to store in VRAM for the draft model",
@@ -2919,21 +3166,21 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             fprintf(stderr, "warning: consult docs/build.md for compilation instructions\n");
         }
     }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_N_GPU_LAYERS_DRAFT"));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_N_GPU_LAYERS_DRAFT"));
     add_opt(common_arg(
         {"-md", "--model-draft"}, "FNAME",
         "draft model for speculative decoding (default: unused)",
         [](common_params & params, const std::string & value) {
             params.speculative.model.path = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODEL_DRAFT"));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_MODEL_DRAFT"));
     add_opt(common_arg(
         {"--spec-replace"}, "TARGET", "DRAFT",
         "translate the string in TARGET into DRAFT if the draft model and main model are not compatible",
         [](common_params & params, const std::string & tgt, const std::string & dft) {
             params.speculative.replacements.push_back({ tgt, dft });
         }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
     add_opt(common_arg(
         {"-ctkd", "--cache-type-k-draft"}, "TYPE",
         string_format(
@@ -3197,7 +3444,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.use_jinja = true;
             //params.default_template_kwargs["reasoning_effort"] = "\"high\"";
         }
-    ).set_examples({LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
 
     add_opt(common_arg(
         {"--gpt-oss-120b-default"},
@@ -3216,7 +3463,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.use_jinja = true;
             //params.default_template_kwargs["reasoning_effort"] = "\"high\"";
         }
-    ).set_examples({LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
 
     add_opt(common_arg(
         {"--vision-gemma-4b-default"},
@@ -3227,7 +3474,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.n_ctx = 0;
             params.use_jinja = true;
         }
-    ).set_examples({LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
 
     add_opt(common_arg(
         {"--vision-gemma-12b-default"},
@@ -3238,7 +3485,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.n_ctx = 0;
             params.use_jinja = true;
         }
-    ).set_examples({LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
 
     return ctx_arg;
 }