cui-llama.rn 1.1.2 → 1.1.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/android/src/main/CMakeLists.txt +1 -2
- package/android/src/main/jni.cpp +26 -21
- package/cpp/common.cpp +2028 -1520
- package/cpp/common.h +134 -18
- package/cpp/ggml-aarch64.c +612 -0
- package/cpp/ggml-alloc.h +2 -2
- package/cpp/ggml-backend.c +33 -6
- package/cpp/ggml-backend.h +2 -0
- package/cpp/ggml-common.h +20 -0
- package/cpp/ggml-impl.h +4 -7
- package/cpp/ggml-metal.m +63 -2
- package/cpp/ggml-quants.c +690 -2
- package/cpp/ggml-quants.h +15 -0
- package/cpp/ggml.c +1650 -317
- package/cpp/ggml.h +155 -48
- package/cpp/llama-grammar.cpp +721 -122
- package/cpp/llama-grammar.h +120 -15
- package/cpp/llama-impl.h +132 -1
- package/cpp/llama-sampling.cpp +1361 -356
- package/cpp/llama-sampling.h +20 -48
- package/cpp/llama-vocab.cpp +140 -7
- package/cpp/llama-vocab.h +3 -2
- package/cpp/llama.cpp +810 -307
- package/cpp/llama.h +213 -259
- package/cpp/rn-llama.hpp +17 -14
- package/cpp/sampling.cpp +347 -355
- package/cpp/sampling.h +106 -135
- package/cpp/sgemm.cpp +153 -0
- package/package.json +1 -1
- package/cpp/grammar-parser.cpp +0 -539
- package/cpp/grammar-parser.h +0 -29
package/cpp/common.cpp
CHANGED
@@ -25,6 +25,7 @@
|
|
25
25
|
#include <unordered_map>
|
26
26
|
#include <unordered_set>
|
27
27
|
#include <vector>
|
28
|
+
#include <climits>
|
28
29
|
|
29
30
|
#if defined(__APPLE__) && defined(__MACH__)
|
30
31
|
#include <sys/types.h>
|
@@ -83,41 +84,6 @@ char const *LLAMA_BUILD_TARGET = "unknown";
|
|
83
84
|
|
84
85
|
using json = nlohmann::ordered_json;
|
85
86
|
|
86
|
-
//
|
87
|
-
// Environment variable utils
|
88
|
-
//
|
89
|
-
|
90
|
-
template<typename T>
|
91
|
-
static typename std::enable_if<std::is_same<T, std::string>::value, void>::type
|
92
|
-
get_env(std::string name, T & target) {
|
93
|
-
char * value = std::getenv(name.c_str());
|
94
|
-
target = value ? std::string(value) : target;
|
95
|
-
}
|
96
|
-
|
97
|
-
template<typename T>
|
98
|
-
static typename std::enable_if<!std::is_same<T, bool>::value && std::is_integral<T>::value, void>::type
|
99
|
-
get_env(std::string name, T & target) {
|
100
|
-
char * value = std::getenv(name.c_str());
|
101
|
-
target = value ? std::stoi(value) : target;
|
102
|
-
}
|
103
|
-
|
104
|
-
template<typename T>
|
105
|
-
static typename std::enable_if<std::is_floating_point<T>::value, void>::type
|
106
|
-
get_env(std::string name, T & target) {
|
107
|
-
char * value = std::getenv(name.c_str());
|
108
|
-
target = value ? std::stof(value) : target;
|
109
|
-
}
|
110
|
-
|
111
|
-
template<typename T>
|
112
|
-
static typename std::enable_if<std::is_same<T, bool>::value, void>::type
|
113
|
-
get_env(std::string name, T & target) {
|
114
|
-
char * value = std::getenv(name.c_str());
|
115
|
-
if (value) {
|
116
|
-
std::string val(value);
|
117
|
-
target = val == "1" || val == "true";
|
118
|
-
}
|
119
|
-
}
|
120
|
-
|
121
87
|
//
|
122
88
|
// CPU utils
|
123
89
|
//
|
@@ -257,11 +223,88 @@ int32_t cpu_get_num_math() {
|
|
257
223
|
return cpu_get_num_physical_cores();
|
258
224
|
}
|
259
225
|
|
226
|
+
// Helper for setting process priority
|
227
|
+
|
228
|
+
#if defined(_WIN32)
|
229
|
+
|
230
|
+
bool set_process_priority(enum lm_ggml_sched_priority prio) {
|
231
|
+
if (prio == LM_GGML_SCHED_PRIO_NORMAL) {
|
232
|
+
return true;
|
233
|
+
}
|
234
|
+
|
235
|
+
DWORD p = NORMAL_PRIORITY_CLASS;
|
236
|
+
switch (prio) {
|
237
|
+
case LM_GGML_SCHED_PRIO_NORMAL: p = NORMAL_PRIORITY_CLASS; break;
|
238
|
+
case LM_GGML_SCHED_PRIO_MEDIUM: p = ABOVE_NORMAL_PRIORITY_CLASS; break;
|
239
|
+
case LM_GGML_SCHED_PRIO_HIGH: p = HIGH_PRIORITY_CLASS; break;
|
240
|
+
case LM_GGML_SCHED_PRIO_REALTIME: p = REALTIME_PRIORITY_CLASS; break;
|
241
|
+
}
|
242
|
+
|
243
|
+
if (!SetPriorityClass(GetCurrentProcess(), p)) {
|
244
|
+
fprintf(stderr, "warn: failed to set process priority class %d : (%d)\n", prio, (int) GetLastError());
|
245
|
+
return false;
|
246
|
+
}
|
247
|
+
|
248
|
+
return true;
|
249
|
+
}
|
250
|
+
|
251
|
+
#else // MacOS and POSIX
|
252
|
+
#include <sys/types.h>
|
253
|
+
#include <sys/resource.h>
|
254
|
+
|
255
|
+
bool set_process_priority(enum lm_ggml_sched_priority prio) {
|
256
|
+
if (prio == LM_GGML_SCHED_PRIO_NORMAL) {
|
257
|
+
return true;
|
258
|
+
}
|
259
|
+
|
260
|
+
int p = 0;
|
261
|
+
switch (prio) {
|
262
|
+
case LM_GGML_SCHED_PRIO_NORMAL: p = 0; break;
|
263
|
+
case LM_GGML_SCHED_PRIO_MEDIUM: p = -5; break;
|
264
|
+
case LM_GGML_SCHED_PRIO_HIGH: p = -10; break;
|
265
|
+
case LM_GGML_SCHED_PRIO_REALTIME: p = -20; break;
|
266
|
+
}
|
267
|
+
|
268
|
+
if (!setpriority(PRIO_PROCESS, 0, p)) {
|
269
|
+
fprintf(stderr, "warn: failed to set process priority %d : %s (%d)\n", prio, strerror(errno), errno);
|
270
|
+
return false;
|
271
|
+
}
|
272
|
+
return true;
|
273
|
+
}
|
274
|
+
|
275
|
+
#endif
|
276
|
+
|
260
277
|
//
|
261
278
|
// CLI argument parsing
|
262
279
|
//
|
263
280
|
|
264
|
-
|
281
|
+
#ifdef __GNUC__
|
282
|
+
#ifdef __MINGW32__
|
283
|
+
#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
|
284
|
+
#else
|
285
|
+
#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
|
286
|
+
#endif
|
287
|
+
#else
|
288
|
+
#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...)
|
289
|
+
#endif
|
290
|
+
|
291
|
+
LLAMA_COMMON_ATTRIBUTE_FORMAT(1, 2)
|
292
|
+
static std::string format(const char * fmt, ...) {
|
293
|
+
va_list ap;
|
294
|
+
va_list ap2;
|
295
|
+
va_start(ap, fmt);
|
296
|
+
va_copy(ap2, ap);
|
297
|
+
int size = vsnprintf(NULL, 0, fmt, ap);
|
298
|
+
LM_GGML_ASSERT(size >= 0 && size < INT_MAX); // NOLINT
|
299
|
+
std::vector<char> buf(size + 1);
|
300
|
+
int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
|
301
|
+
LM_GGML_ASSERT(size2 == size);
|
302
|
+
va_end(ap2);
|
303
|
+
va_end(ap);
|
304
|
+
return std::string(buf.data(), size);
|
305
|
+
}
|
306
|
+
|
307
|
+
static void gpt_params_handle_model_default(gpt_params & params) {
|
265
308
|
if (!params.hf_repo.empty()) {
|
266
309
|
// short-hand to avoid specifying --hf-file -> default it to --model
|
267
310
|
if (params.hf_file.empty()) {
|
@@ -283,1538 +326,1990 @@ void gpt_params_handle_model_default(gpt_params & params) {
|
|
283
326
|
}
|
284
327
|
}
|
285
328
|
|
286
|
-
|
287
|
-
|
288
|
-
std::string arg;
|
289
|
-
const std::string arg_prefix = "--";
|
290
|
-
llama_sampling_params & sparams = params.sparams;
|
329
|
+
void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role_model) {
|
330
|
+
int32_t n_set = 0;
|
291
331
|
|
292
|
-
|
293
|
-
|
294
|
-
if (
|
295
|
-
|
296
|
-
}
|
297
|
-
|
298
|
-
throw std::invalid_argument("error: unknown argument: " + arg);
|
299
|
-
}
|
300
|
-
if (invalid_param) {
|
301
|
-
throw std::invalid_argument("error: invalid parameter for argument: " + arg);
|
332
|
+
if (cpuparams.n_threads < 0) {
|
333
|
+
// Assuming everything about cpuparams is invalid
|
334
|
+
if (role_model != nullptr) {
|
335
|
+
cpuparams = *role_model;
|
336
|
+
} else {
|
337
|
+
cpuparams.n_threads = cpu_get_num_math();
|
302
338
|
}
|
303
339
|
}
|
304
340
|
|
305
|
-
|
306
|
-
|
307
|
-
|
308
|
-
|
309
|
-
gpt_params_handle_model_default(params);
|
310
|
-
|
311
|
-
if (params.hf_token.empty()) {
|
312
|
-
get_env("HF_TOKEN", params.hf_token);
|
313
|
-
}
|
314
|
-
|
315
|
-
if (params.escape) {
|
316
|
-
string_process_escapes(params.prompt);
|
317
|
-
string_process_escapes(params.input_prefix);
|
318
|
-
string_process_escapes(params.input_suffix);
|
319
|
-
string_process_escapes(sparams.cfg_negative_prompt);
|
320
|
-
for (auto & antiprompt : params.antiprompt) {
|
321
|
-
string_process_escapes(antiprompt);
|
341
|
+
for (int32_t i = 0; i < LM_GGML_MAX_N_THREADS; i++) {
|
342
|
+
if (cpuparams.cpumask[i]) {
|
343
|
+
n_set++;
|
322
344
|
}
|
323
345
|
}
|
324
346
|
|
325
|
-
if (
|
326
|
-
|
327
|
-
|
347
|
+
if (n_set && n_set < cpuparams.n_threads) {
|
348
|
+
// Not enough set bits, may experience performance issues.
|
349
|
+
fprintf(stderr, "warn: Not enough set bits in CPU mask (%d) to satisfy requested thread count: %d\n", n_set, cpuparams.n_threads);
|
328
350
|
}
|
329
|
-
|
330
|
-
return true;
|
331
351
|
}
|
332
352
|
|
333
|
-
|
334
|
-
|
335
|
-
|
336
|
-
|
337
|
-
get_env("LLAMA_ARG_MODEL_ALIAS", params.model_alias);
|
338
|
-
get_env("LLAMA_ARG_HF_REPO", params.hf_repo);
|
339
|
-
get_env("LLAMA_ARG_HF_FILE", params.hf_file);
|
340
|
-
get_env("LLAMA_ARG_THREADS", params.n_threads);
|
341
|
-
get_env("LLAMA_ARG_CTX_SIZE", params.n_ctx);
|
342
|
-
get_env("LLAMA_ARG_N_PARALLEL", params.n_parallel);
|
343
|
-
get_env("LLAMA_ARG_BATCH", params.n_batch);
|
344
|
-
get_env("LLAMA_ARG_UBATCH", params.n_ubatch);
|
345
|
-
get_env("LLAMA_ARG_N_GPU_LAYERS", params.n_gpu_layers);
|
346
|
-
get_env("LLAMA_ARG_THREADS_HTTP", params.n_threads_http);
|
347
|
-
get_env("LLAMA_ARG_CHAT_TEMPLATE", params.chat_template);
|
348
|
-
get_env("LLAMA_ARG_N_PREDICT", params.n_predict);
|
349
|
-
get_env("LLAMA_ARG_ENDPOINT_METRICS", params.endpoint_metrics);
|
350
|
-
get_env("LLAMA_ARG_ENDPOINT_SLOTS", params.endpoint_slots);
|
351
|
-
get_env("LLAMA_ARG_EMBEDDINGS", params.embedding);
|
352
|
-
get_env("LLAMA_ARG_FLASH_ATTN", params.flash_attn);
|
353
|
-
get_env("LLAMA_ARG_DEFRAG_THOLD", params.defrag_thold);
|
354
|
-
get_env("LLAMA_ARG_CONT_BATCHING", params.cont_batching);
|
355
|
-
get_env("LLAMA_ARG_HOST", params.hostname);
|
356
|
-
get_env("LLAMA_ARG_PORT", params.port);
|
357
|
-
}
|
358
|
-
|
359
|
-
bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
|
360
|
-
const auto params_org = params; // the example can modify the default params
|
353
|
+
bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params, std::vector<llama_arg> & options) {
|
354
|
+
std::string arg;
|
355
|
+
const std::string arg_prefix = "--";
|
356
|
+
gpt_sampler_params & sparams = params.sparams;
|
361
357
|
|
362
|
-
|
363
|
-
|
364
|
-
|
365
|
-
|
366
|
-
return false;
|
358
|
+
std::unordered_map<std::string, llama_arg *> arg_to_options;
|
359
|
+
for (auto & opt : options) {
|
360
|
+
for (const auto & arg : opt.args) {
|
361
|
+
arg_to_options[arg] = &opt;
|
367
362
|
}
|
368
|
-
} catch (const std::invalid_argument & ex) {
|
369
|
-
fprintf(stderr, "%s\n", ex.what());
|
370
|
-
params = params_org;
|
371
|
-
return false;
|
372
363
|
}
|
373
364
|
|
374
|
-
|
375
|
-
|
376
|
-
|
377
|
-
|
378
|
-
|
379
|
-
|
380
|
-
|
381
|
-
|
382
|
-
|
383
|
-
|
384
|
-
|
385
|
-
|
386
|
-
|
387
|
-
|
388
|
-
|
389
|
-
|
390
|
-
|
391
|
-
|
392
|
-
|
393
|
-
params.n_threads = std::stoi(argv[i]);
|
394
|
-
if (params.n_threads <= 0) {
|
395
|
-
params.n_threads = std::thread::hardware_concurrency();
|
396
|
-
}
|
397
|
-
return true;
|
398
|
-
}
|
399
|
-
if (arg == "-tb" || arg == "--threads-batch") {
|
400
|
-
CHECK_ARG
|
401
|
-
params.n_threads_batch = std::stoi(argv[i]);
|
402
|
-
if (params.n_threads_batch <= 0) {
|
403
|
-
params.n_threads_batch = std::thread::hardware_concurrency();
|
404
|
-
}
|
405
|
-
return true;
|
406
|
-
}
|
407
|
-
if (arg == "-td" || arg == "--threads-draft") {
|
408
|
-
CHECK_ARG
|
409
|
-
params.n_threads_draft = std::stoi(argv[i]);
|
410
|
-
if (params.n_threads_draft <= 0) {
|
411
|
-
params.n_threads_draft = std::thread::hardware_concurrency();
|
412
|
-
}
|
413
|
-
return true;
|
414
|
-
}
|
415
|
-
if (arg == "-tbd" || arg == "--threads-batch-draft") {
|
416
|
-
CHECK_ARG
|
417
|
-
params.n_threads_batch_draft = std::stoi(argv[i]);
|
418
|
-
if (params.n_threads_batch_draft <= 0) {
|
419
|
-
params.n_threads_batch_draft = std::thread::hardware_concurrency();
|
420
|
-
}
|
421
|
-
return true;
|
422
|
-
}
|
423
|
-
if (arg == "-p" || arg == "--prompt") {
|
424
|
-
CHECK_ARG
|
425
|
-
params.prompt = argv[i];
|
426
|
-
return true;
|
427
|
-
}
|
428
|
-
if (arg == "-e" || arg == "--escape") {
|
429
|
-
params.escape = true;
|
430
|
-
return true;
|
431
|
-
}
|
432
|
-
if (arg == "--no-escape") {
|
433
|
-
params.escape = false;
|
434
|
-
return true;
|
435
|
-
}
|
436
|
-
if (arg == "--prompt-cache") {
|
437
|
-
CHECK_ARG
|
438
|
-
params.path_prompt_cache = argv[i];
|
439
|
-
return true;
|
440
|
-
}
|
441
|
-
if (arg == "--prompt-cache-all") {
|
442
|
-
params.prompt_cache_all = true;
|
443
|
-
return true;
|
444
|
-
}
|
445
|
-
if (arg == "--prompt-cache-ro") {
|
446
|
-
params.prompt_cache_ro = true;
|
447
|
-
return true;
|
448
|
-
}
|
449
|
-
if (arg == "-bf" || arg == "--binary-file") {
|
450
|
-
CHECK_ARG
|
451
|
-
std::ifstream file(argv[i], std::ios::binary);
|
452
|
-
if (!file) {
|
453
|
-
fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
|
454
|
-
invalid_param = true;
|
455
|
-
return true;
|
456
|
-
}
|
457
|
-
// store the external file name in params
|
458
|
-
params.prompt_file = argv[i];
|
459
|
-
std::ostringstream ss;
|
460
|
-
ss << file.rdbuf();
|
461
|
-
params.prompt = ss.str();
|
462
|
-
fprintf(stderr, "Read %zu bytes from binary file %s\n", params.prompt.size(), argv[i]);
|
463
|
-
return true;
|
464
|
-
}
|
465
|
-
if (arg == "-f" || arg == "--file") {
|
466
|
-
CHECK_ARG
|
467
|
-
std::ifstream file(argv[i]);
|
468
|
-
if (!file) {
|
469
|
-
fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
|
470
|
-
invalid_param = true;
|
471
|
-
return true;
|
472
|
-
}
|
473
|
-
// store the external file name in params
|
474
|
-
params.prompt_file = argv[i];
|
475
|
-
std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.prompt));
|
476
|
-
if (!params.prompt.empty() && params.prompt.back() == '\n') {
|
477
|
-
params.prompt.pop_back();
|
478
|
-
}
|
479
|
-
return true;
|
480
|
-
}
|
481
|
-
if (arg == "--in-file") {
|
482
|
-
CHECK_ARG
|
483
|
-
std::ifstream file(argv[i]);
|
484
|
-
if (!file) {
|
485
|
-
fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
|
486
|
-
invalid_param = true;
|
487
|
-
return true;
|
488
|
-
}
|
489
|
-
params.in_files.push_back(argv[i]);
|
490
|
-
return true;
|
491
|
-
}
|
492
|
-
if (arg == "-n" || arg == "--predict" || arg == "--n-predict") {
|
493
|
-
CHECK_ARG
|
494
|
-
params.n_predict = std::stoi(argv[i]);
|
495
|
-
return true;
|
496
|
-
}
|
497
|
-
if (arg == "--top-k") {
|
498
|
-
CHECK_ARG
|
499
|
-
sparams.top_k = std::stoi(argv[i]);
|
500
|
-
return true;
|
501
|
-
}
|
502
|
-
if (arg == "-c" || arg == "--ctx-size") {
|
503
|
-
CHECK_ARG
|
504
|
-
params.n_ctx = std::stoi(argv[i]);
|
505
|
-
return true;
|
506
|
-
}
|
507
|
-
if (arg == "--grp-attn-n" || arg == "-gan") {
|
508
|
-
CHECK_ARG
|
509
|
-
params.grp_attn_n = std::stoi(argv[i]);
|
510
|
-
return true;
|
511
|
-
}
|
512
|
-
if (arg == "--grp-attn-w" || arg == "-gaw") {
|
513
|
-
CHECK_ARG
|
514
|
-
params.grp_attn_w = std::stoi(argv[i]);
|
515
|
-
return true;
|
516
|
-
}
|
517
|
-
if (arg == "--rope-freq-base") {
|
518
|
-
CHECK_ARG
|
519
|
-
params.rope_freq_base = std::stof(argv[i]);
|
520
|
-
return true;
|
521
|
-
}
|
522
|
-
if (arg == "--rope-freq-scale") {
|
523
|
-
CHECK_ARG
|
524
|
-
params.rope_freq_scale = std::stof(argv[i]);
|
525
|
-
return true;
|
526
|
-
}
|
527
|
-
if (arg == "--rope-scaling") {
|
528
|
-
CHECK_ARG
|
529
|
-
std::string value(argv[i]);
|
530
|
-
/**/ if (value == "none") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_NONE; }
|
531
|
-
else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_LINEAR; }
|
532
|
-
else if (value == "yarn") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_YARN; }
|
533
|
-
else { invalid_param = true; }
|
534
|
-
return true;
|
535
|
-
}
|
536
|
-
if (arg == "--rope-scale") {
|
537
|
-
CHECK_ARG
|
538
|
-
params.rope_freq_scale = 1.0f / std::stof(argv[i]);
|
539
|
-
return true;
|
540
|
-
}
|
541
|
-
if (arg == "--yarn-orig-ctx") {
|
542
|
-
CHECK_ARG
|
543
|
-
params.yarn_orig_ctx = std::stoi(argv[i]);
|
544
|
-
return true;
|
545
|
-
}
|
546
|
-
if (arg == "--yarn-ext-factor") {
|
547
|
-
CHECK_ARG
|
548
|
-
params.yarn_ext_factor = std::stof(argv[i]);
|
549
|
-
return true;
|
550
|
-
}
|
551
|
-
if (arg == "--yarn-attn-factor") {
|
552
|
-
CHECK_ARG
|
553
|
-
params.yarn_attn_factor = std::stof(argv[i]);
|
554
|
-
return true;
|
555
|
-
}
|
556
|
-
if (arg == "--yarn-beta-fast") {
|
557
|
-
CHECK_ARG
|
558
|
-
params.yarn_beta_fast = std::stof(argv[i]);
|
559
|
-
return true;
|
560
|
-
}
|
561
|
-
if (arg == "--yarn-beta-slow") {
|
562
|
-
CHECK_ARG
|
563
|
-
params.yarn_beta_slow = std::stof(argv[i]);
|
564
|
-
return true;
|
565
|
-
}
|
566
|
-
if (arg == "--pooling") {
|
567
|
-
CHECK_ARG
|
568
|
-
std::string value(argv[i]);
|
569
|
-
/**/ if (value == "none") { params.pooling_type = LLAMA_POOLING_TYPE_NONE; }
|
570
|
-
else if (value == "mean") { params.pooling_type = LLAMA_POOLING_TYPE_MEAN; }
|
571
|
-
else if (value == "cls") { params.pooling_type = LLAMA_POOLING_TYPE_CLS; }
|
572
|
-
else if (value == "last") { params.pooling_type = LLAMA_POOLING_TYPE_LAST; }
|
573
|
-
else { invalid_param = true; }
|
574
|
-
return true;
|
575
|
-
}
|
576
|
-
if (arg == "--attention") {
|
577
|
-
CHECK_ARG
|
578
|
-
std::string value(argv[i]);
|
579
|
-
/**/ if (value == "causal") { params.attention_type = LLAMA_ATTENTION_TYPE_CAUSAL; }
|
580
|
-
else if (value == "non-causal") { params.attention_type = LLAMA_ATTENTION_TYPE_NON_CAUSAL; }
|
581
|
-
else { invalid_param = true; }
|
582
|
-
return true;
|
583
|
-
}
|
584
|
-
if (arg == "--defrag-thold" || arg == "-dt") {
|
585
|
-
CHECK_ARG
|
586
|
-
params.defrag_thold = std::stof(argv[i]);
|
587
|
-
return true;
|
588
|
-
}
|
589
|
-
if (arg == "--samplers") {
|
590
|
-
CHECK_ARG
|
591
|
-
const auto sampler_names = string_split(argv[i], ';');
|
592
|
-
sparams.samplers_sequence = llama_sampling_types_from_names(sampler_names, true);
|
593
|
-
return true;
|
594
|
-
}
|
595
|
-
if (arg == "--sampling-seq") {
|
596
|
-
CHECK_ARG
|
597
|
-
sparams.samplers_sequence = llama_sampling_types_from_chars(argv[i]);
|
598
|
-
return true;
|
599
|
-
}
|
600
|
-
if (arg == "--top-p") {
|
601
|
-
CHECK_ARG
|
602
|
-
sparams.top_p = std::stof(argv[i]);
|
603
|
-
return true;
|
604
|
-
}
|
605
|
-
if (arg == "--min-p") {
|
606
|
-
CHECK_ARG
|
607
|
-
sparams.min_p = std::stof(argv[i]);
|
608
|
-
return true;
|
609
|
-
}
|
610
|
-
if (arg == "--temp") {
|
611
|
-
CHECK_ARG
|
612
|
-
sparams.temp = std::stof(argv[i]);
|
613
|
-
sparams.temp = std::max(sparams.temp, 0.0f);
|
614
|
-
return true;
|
615
|
-
}
|
616
|
-
if (arg == "--tfs") {
|
617
|
-
CHECK_ARG
|
618
|
-
sparams.tfs_z = std::stof(argv[i]);
|
619
|
-
return true;
|
620
|
-
}
|
621
|
-
if (arg == "--typical") {
|
622
|
-
CHECK_ARG
|
623
|
-
sparams.typical_p = std::stof(argv[i]);
|
624
|
-
return true;
|
625
|
-
}
|
626
|
-
if (arg == "--repeat-last-n") {
|
627
|
-
CHECK_ARG
|
628
|
-
sparams.penalty_last_n = std::stoi(argv[i]);
|
629
|
-
sparams.n_prev = std::max(sparams.n_prev, sparams.penalty_last_n);
|
630
|
-
return true;
|
631
|
-
}
|
632
|
-
if (arg == "--repeat-penalty") {
|
633
|
-
CHECK_ARG
|
634
|
-
sparams.penalty_repeat = std::stof(argv[i]);
|
635
|
-
return true;
|
636
|
-
}
|
637
|
-
if (arg == "--frequency-penalty") {
|
638
|
-
CHECK_ARG
|
639
|
-
sparams.penalty_freq = std::stof(argv[i]);
|
640
|
-
return true;
|
641
|
-
}
|
642
|
-
if (arg == "--presence-penalty") {
|
643
|
-
CHECK_ARG
|
644
|
-
sparams.penalty_present = std::stof(argv[i]);
|
645
|
-
return true;
|
646
|
-
}
|
647
|
-
if (arg == "--dynatemp-range") {
|
648
|
-
CHECK_ARG
|
649
|
-
sparams.dynatemp_range = std::stof(argv[i]);
|
650
|
-
return true;
|
651
|
-
}
|
652
|
-
if (arg == "--dynatemp-exp") {
|
653
|
-
CHECK_ARG
|
654
|
-
sparams.dynatemp_exponent = std::stof(argv[i]);
|
655
|
-
return true;
|
656
|
-
}
|
657
|
-
if (arg == "--mirostat") {
|
658
|
-
CHECK_ARG
|
659
|
-
sparams.mirostat = std::stoi(argv[i]);
|
660
|
-
return true;
|
661
|
-
}
|
662
|
-
if (arg == "--mirostat-lr") {
|
663
|
-
CHECK_ARG
|
664
|
-
sparams.mirostat_eta = std::stof(argv[i]);
|
665
|
-
return true;
|
666
|
-
}
|
667
|
-
if (arg == "--mirostat-ent") {
|
668
|
-
CHECK_ARG
|
669
|
-
sparams.mirostat_tau = std::stof(argv[i]);
|
670
|
-
return true;
|
671
|
-
}
|
672
|
-
if (arg == "--cfg-negative-prompt") {
|
673
|
-
CHECK_ARG
|
674
|
-
sparams.cfg_negative_prompt = argv[i];
|
675
|
-
return true;
|
676
|
-
}
|
677
|
-
if (arg == "--cfg-negative-prompt-file") {
|
678
|
-
CHECK_ARG
|
679
|
-
std::ifstream file(argv[i]);
|
680
|
-
if (!file) {
|
681
|
-
fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
|
682
|
-
invalid_param = true;
|
683
|
-
return true;
|
684
|
-
}
|
685
|
-
std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(sparams.cfg_negative_prompt));
|
686
|
-
if (!sparams.cfg_negative_prompt.empty() && sparams.cfg_negative_prompt.back() == '\n') {
|
687
|
-
sparams.cfg_negative_prompt.pop_back();
|
688
|
-
}
|
689
|
-
return true;
|
690
|
-
}
|
691
|
-
if (arg == "--cfg-scale") {
|
692
|
-
CHECK_ARG
|
693
|
-
sparams.cfg_scale = std::stof(argv[i]);
|
694
|
-
return true;
|
695
|
-
}
|
696
|
-
if (arg == "-b" || arg == "--batch-size") {
|
697
|
-
CHECK_ARG
|
698
|
-
params.n_batch = std::stoi(argv[i]);
|
699
|
-
return true;
|
700
|
-
}
|
701
|
-
if (arg == "-ub" || arg == "--ubatch-size") {
|
702
|
-
CHECK_ARG
|
703
|
-
params.n_ubatch = std::stoi(argv[i]);
|
704
|
-
return true;
|
705
|
-
}
|
706
|
-
if (arg == "--keep") {
|
707
|
-
CHECK_ARG
|
708
|
-
params.n_keep = std::stoi(argv[i]);
|
709
|
-
return true;
|
710
|
-
}
|
711
|
-
if (arg == "--draft") {
|
712
|
-
CHECK_ARG
|
713
|
-
params.n_draft = std::stoi(argv[i]);
|
714
|
-
return true;
|
715
|
-
}
|
716
|
-
if (arg == "--chunks") {
|
717
|
-
CHECK_ARG
|
718
|
-
params.n_chunks = std::stoi(argv[i]);
|
719
|
-
return true;
|
720
|
-
}
|
721
|
-
if (arg == "-np" || arg == "--parallel") {
|
722
|
-
CHECK_ARG
|
723
|
-
params.n_parallel = std::stoi(argv[i]);
|
724
|
-
return true;
|
725
|
-
}
|
726
|
-
if (arg == "-ns" || arg == "--sequences") {
|
727
|
-
CHECK_ARG
|
728
|
-
params.n_sequences = std::stoi(argv[i]);
|
729
|
-
return true;
|
730
|
-
}
|
731
|
-
if (arg == "--p-split" || arg == "-ps") {
|
732
|
-
CHECK_ARG
|
733
|
-
params.p_split = std::stof(argv[i]);
|
734
|
-
return true;
|
735
|
-
}
|
736
|
-
if (arg == "-m" || arg == "--model") {
|
737
|
-
CHECK_ARG
|
738
|
-
params.model = argv[i];
|
739
|
-
return true;
|
740
|
-
}
|
741
|
-
if (arg == "-md" || arg == "--model-draft") {
|
742
|
-
CHECK_ARG
|
743
|
-
params.model_draft = argv[i];
|
744
|
-
return true;
|
745
|
-
}
|
746
|
-
if (arg == "-a" || arg == "--alias") {
|
747
|
-
CHECK_ARG
|
748
|
-
params.model_alias = argv[i];
|
749
|
-
return true;
|
750
|
-
}
|
751
|
-
if (arg == "-mu" || arg == "--model-url") {
|
752
|
-
CHECK_ARG
|
753
|
-
params.model_url = argv[i];
|
754
|
-
return true;
|
755
|
-
}
|
756
|
-
if (arg == "-hft" || arg == "--hf-token") {
|
757
|
-
if (++i >= argc) {
|
758
|
-
invalid_param = true;
|
759
|
-
return true;
|
760
|
-
}
|
761
|
-
params.hf_token = argv[i];
|
762
|
-
return true;
|
763
|
-
}
|
764
|
-
if (arg == "-hfr" || arg == "--hf-repo") {
|
765
|
-
CHECK_ARG
|
766
|
-
params.hf_repo = argv[i];
|
767
|
-
return true;
|
768
|
-
}
|
769
|
-
if (arg == "-hff" || arg == "--hf-file") {
|
770
|
-
CHECK_ARG
|
771
|
-
params.hf_file = argv[i];
|
772
|
-
return true;
|
773
|
-
}
|
774
|
-
if (arg == "--lora") {
|
775
|
-
CHECK_ARG
|
776
|
-
params.lora_adapters.push_back({
|
777
|
-
std::string(argv[i]),
|
778
|
-
1.0,
|
779
|
-
});
|
780
|
-
return true;
|
781
|
-
}
|
782
|
-
if (arg == "--lora-scaled") {
|
783
|
-
CHECK_ARG
|
784
|
-
std::string lora_adapter = argv[i];
|
785
|
-
CHECK_ARG
|
786
|
-
params.lora_adapters.push_back({
|
787
|
-
lora_adapter,
|
788
|
-
std::stof(argv[i]),
|
789
|
-
});
|
790
|
-
return true;
|
791
|
-
}
|
792
|
-
if (arg == "--lora-init-without-apply") {
|
793
|
-
params.lora_init_without_apply = true;
|
794
|
-
return true;
|
795
|
-
}
|
796
|
-
if (arg == "--control-vector") {
|
797
|
-
CHECK_ARG
|
798
|
-
params.control_vectors.push_back({ 1.0f, argv[i], });
|
799
|
-
return true;
|
800
|
-
}
|
801
|
-
if (arg == "--control-vector-scaled") {
|
802
|
-
CHECK_ARG
|
803
|
-
const char* fname = argv[i];
|
804
|
-
CHECK_ARG
|
805
|
-
params.control_vectors.push_back({ std::stof(argv[i]), fname, });
|
806
|
-
return true;
|
807
|
-
}
|
808
|
-
if (arg == "--control-vector-layer-range") {
|
809
|
-
CHECK_ARG
|
810
|
-
params.control_vector_layer_start = std::stoi(argv[i]);
|
811
|
-
CHECK_ARG
|
812
|
-
params.control_vector_layer_end = std::stoi(argv[i]);
|
813
|
-
return true;
|
814
|
-
}
|
815
|
-
if (arg == "--mmproj") {
|
816
|
-
CHECK_ARG
|
817
|
-
params.mmproj = argv[i];
|
818
|
-
return true;
|
819
|
-
}
|
820
|
-
if (arg == "--image") {
|
821
|
-
CHECK_ARG
|
822
|
-
params.image.emplace_back(argv[i]);
|
823
|
-
return true;
|
824
|
-
}
|
825
|
-
if (arg == "-i" || arg == "--interactive") {
|
826
|
-
params.interactive = true;
|
827
|
-
return true;
|
828
|
-
}
|
829
|
-
if (arg == "-sp" || arg == "--special") {
|
830
|
-
params.special = true;
|
831
|
-
return true;
|
832
|
-
}
|
833
|
-
if (arg == "--embedding" || arg == "--embeddings") {
|
834
|
-
params.embedding = true;
|
835
|
-
return true;
|
836
|
-
}
|
837
|
-
if (arg == "--embd-normalize") {
|
838
|
-
CHECK_ARG
|
839
|
-
params.embd_normalize = std::stoi(argv[i]);
|
840
|
-
return true;
|
841
|
-
}
|
842
|
-
if (arg == "--embd-output-format") {
|
843
|
-
CHECK_ARG
|
844
|
-
params.embd_out = argv[i];
|
845
|
-
return true;
|
846
|
-
}
|
847
|
-
if (arg == "--embd-separator") {
|
848
|
-
CHECK_ARG
|
849
|
-
params.embd_sep = argv[i];
|
850
|
-
return true;
|
851
|
-
}
|
852
|
-
if (arg == "-if" || arg == "--interactive-first") {
|
853
|
-
params.interactive_first = true;
|
854
|
-
return true;
|
855
|
-
}
|
856
|
-
if (arg == "-cnv" || arg == "--conversation") {
|
857
|
-
params.conversation = true;
|
858
|
-
return true;
|
859
|
-
}
|
860
|
-
if (arg == "--infill") {
|
861
|
-
params.infill = true;
|
862
|
-
return true;
|
863
|
-
}
|
864
|
-
if (arg == "-dkvc" || arg == "--dump-kv-cache") {
|
865
|
-
params.dump_kv_cache = true;
|
866
|
-
return true;
|
867
|
-
}
|
868
|
-
if (arg == "-nkvo" || arg == "--no-kv-offload") {
|
869
|
-
params.no_kv_offload = true;
|
870
|
-
return true;
|
871
|
-
}
|
872
|
-
if (arg == "-ctk" || arg == "--cache-type-k") {
|
873
|
-
params.cache_type_k = argv[++i];
|
874
|
-
return true;
|
875
|
-
}
|
876
|
-
if (arg == "-ctv" || arg == "--cache-type-v") {
|
877
|
-
params.cache_type_v = argv[++i];
|
878
|
-
return true;
|
879
|
-
}
|
880
|
-
if (arg == "-mli" || arg == "--multiline-input") {
|
881
|
-
params.multiline_input = true;
|
882
|
-
return true;
|
883
|
-
}
|
884
|
-
if (arg == "--simple-io") {
|
885
|
-
params.simple_io = true;
|
886
|
-
return true;
|
887
|
-
}
|
888
|
-
if (arg == "-cb" || arg == "--cont-batching") {
|
889
|
-
params.cont_batching = true;
|
890
|
-
return true;
|
891
|
-
}
|
892
|
-
if (arg == "-nocb" || arg == "--no-cont-batching") {
|
893
|
-
params.cont_batching = false;
|
894
|
-
return true;
|
895
|
-
}
|
896
|
-
if (arg == "-fa" || arg == "--flash-attn") {
|
897
|
-
params.flash_attn = true;
|
898
|
-
return true;
|
899
|
-
}
|
900
|
-
if (arg == "-co" || arg == "--color") {
|
901
|
-
params.use_color = true;
|
902
|
-
return true;
|
903
|
-
}
|
904
|
-
if (arg == "--mlock") {
|
905
|
-
params.use_mlock = true;
|
906
|
-
return true;
|
907
|
-
}
|
908
|
-
if (arg == "-ngl" || arg == "--gpu-layers" || arg == "--n-gpu-layers") {
|
909
|
-
CHECK_ARG
|
910
|
-
params.n_gpu_layers = std::stoi(argv[i]);
|
911
|
-
if (!llama_supports_gpu_offload()) {
|
912
|
-
fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers option will be ignored\n");
|
913
|
-
fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
|
914
|
-
}
|
915
|
-
return true;
|
916
|
-
}
|
917
|
-
if (arg == "-ngld" || arg == "--gpu-layers-draft" || arg == "--n-gpu-layers-draft") {
|
918
|
-
CHECK_ARG
|
919
|
-
params.n_gpu_layers_draft = std::stoi(argv[i]);
|
920
|
-
if (!llama_supports_gpu_offload()) {
|
921
|
-
fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers-draft option will be ignored\n");
|
922
|
-
fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
|
365
|
+
// handle environment variables
|
366
|
+
for (auto & opt : options) {
|
367
|
+
std::string value;
|
368
|
+
if (opt.get_value_from_env(value)) {
|
369
|
+
try {
|
370
|
+
if (opt.handler_void && (value == "1" || value == "true")) {
|
371
|
+
opt.handler_void(params);
|
372
|
+
}
|
373
|
+
if (opt.handler_int) {
|
374
|
+
opt.handler_int(params, std::stoi(value));
|
375
|
+
}
|
376
|
+
if (opt.handler_string) {
|
377
|
+
opt.handler_string(params, value);
|
378
|
+
continue;
|
379
|
+
}
|
380
|
+
} catch (std::exception & e) {
|
381
|
+
throw std::invalid_argument(format(
|
382
|
+
"error while handling environment variable \"%s\": %s\n\n", opt.env, e.what()));
|
383
|
+
}
|
923
384
|
}
|
924
|
-
return true;
|
925
385
|
}
|
926
|
-
|
927
|
-
|
928
|
-
|
929
|
-
|
930
|
-
|
931
|
-
#endif // LM_GGML_USE_CUDA_SYCL_VULKAN
|
932
|
-
return true;
|
933
|
-
}
|
934
|
-
if (arg == "--split-mode" || arg == "-sm") {
|
935
|
-
CHECK_ARG
|
936
|
-
std::string arg_next = argv[i];
|
937
|
-
if (arg_next == "none") {
|
938
|
-
params.split_mode = LLAMA_SPLIT_MODE_NONE;
|
939
|
-
}
|
940
|
-
else if (arg_next == "layer") {
|
941
|
-
params.split_mode = LLAMA_SPLIT_MODE_LAYER;
|
942
|
-
}
|
943
|
-
else if (arg_next == "row") {
|
944
|
-
#ifdef LM_GGML_USE_SYCL
|
945
|
-
fprintf(stderr, "warning: The split mode value:[row] is not supported by llama.cpp with SYCL. It's developing.\nExit!\n");
|
946
|
-
exit(1);
|
947
|
-
#endif // LM_GGML_USE_SYCL
|
948
|
-
params.split_mode = LLAMA_SPLIT_MODE_ROW;
|
949
|
-
}
|
950
|
-
else {
|
951
|
-
invalid_param = true;
|
952
|
-
return true;
|
386
|
+
|
387
|
+
// handle command line arguments
|
388
|
+
auto check_arg = [&](int i) {
|
389
|
+
if (i+1 >= argc) {
|
390
|
+
throw std::invalid_argument("expected value for argument");
|
953
391
|
}
|
954
|
-
|
955
|
-
fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL/Vulkan. Setting the split mode has no effect.\n");
|
956
|
-
#endif // LM_GGML_USE_CUDA_SYCL_VULKAN
|
957
|
-
return true;
|
958
|
-
}
|
959
|
-
if (arg == "--tensor-split" || arg == "-ts") {
|
960
|
-
CHECK_ARG
|
961
|
-
std::string arg_next = argv[i];
|
392
|
+
};
|
962
393
|
|
963
|
-
|
964
|
-
const std::
|
965
|
-
|
966
|
-
std::
|
967
|
-
if (
|
968
|
-
|
969
|
-
return true;
|
394
|
+
for (int i = 1; i < argc; i++) {
|
395
|
+
const std::string arg_prefix = "--";
|
396
|
+
|
397
|
+
std::string arg = argv[i];
|
398
|
+
if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
|
399
|
+
std::replace(arg.begin(), arg.end(), '_', '-');
|
970
400
|
}
|
971
|
-
|
972
|
-
|
973
|
-
params.tensor_split[i] = std::stof(split_arg[i]);
|
974
|
-
}
|
975
|
-
else {
|
976
|
-
params.tensor_split[i] = 0.0f;
|
977
|
-
}
|
401
|
+
if (arg_to_options.find(arg) == arg_to_options.end()) {
|
402
|
+
throw std::invalid_argument(format("error: invalid argument: %s", arg.c_str()));
|
978
403
|
}
|
979
|
-
|
980
|
-
|
981
|
-
|
982
|
-
return true;
|
983
|
-
}
|
984
|
-
if (arg == "--rpc") {
|
985
|
-
CHECK_ARG
|
986
|
-
params.rpc_servers = argv[i];
|
987
|
-
return true;
|
988
|
-
}
|
989
|
-
if (arg == "--no-mmap") {
|
990
|
-
params.use_mmap = false;
|
991
|
-
return true;
|
992
|
-
}
|
993
|
-
if (arg == "--numa") {
|
994
|
-
CHECK_ARG
|
995
|
-
std::string value(argv[i]);
|
996
|
-
/**/ if (value == "distribute" || value == "") { params.numa = LM_GGML_NUMA_STRATEGY_DISTRIBUTE; }
|
997
|
-
else if (value == "isolate") { params.numa = LM_GGML_NUMA_STRATEGY_ISOLATE; }
|
998
|
-
else if (value == "numactl") { params.numa = LM_GGML_NUMA_STRATEGY_NUMACTL; }
|
999
|
-
else { invalid_param = true; }
|
1000
|
-
return true;
|
1001
|
-
}
|
1002
|
-
if (arg == "-v" || arg == "--verbose") {
|
1003
|
-
params.verbosity = 1;
|
1004
|
-
return true;
|
1005
|
-
}
|
1006
|
-
if (arg == "--verbosity") {
|
1007
|
-
CHECK_ARG
|
1008
|
-
params.verbosity = std::stoi(argv[i]);
|
1009
|
-
return true;
|
1010
|
-
}
|
1011
|
-
if (arg == "--verbose-prompt") {
|
1012
|
-
params.verbose_prompt = true;
|
1013
|
-
return true;
|
1014
|
-
}
|
1015
|
-
if (arg == "--no-display-prompt") {
|
1016
|
-
params.display_prompt = false;
|
1017
|
-
return true;
|
1018
|
-
}
|
1019
|
-
if (arg == "-r" || arg == "--reverse-prompt") {
|
1020
|
-
CHECK_ARG
|
1021
|
-
params.antiprompt.emplace_back(argv[i]);
|
1022
|
-
return true;
|
1023
|
-
}
|
1024
|
-
if (arg == "-ld" || arg == "--logdir") {
|
1025
|
-
CHECK_ARG
|
1026
|
-
params.logdir = argv[i];
|
1027
|
-
|
1028
|
-
if (params.logdir.back() != DIRECTORY_SEPARATOR) {
|
1029
|
-
params.logdir += DIRECTORY_SEPARATOR;
|
404
|
+
auto opt = *arg_to_options[arg];
|
405
|
+
if (opt.has_value_from_env()) {
|
406
|
+
fprintf(stderr, "warn: %s environment variable is set, but will be overwritten by command line argument %s\n", opt.env, arg.c_str());
|
1030
407
|
}
|
1031
|
-
return true;
|
1032
|
-
}
|
1033
|
-
if (arg == "-lcs" || arg == "--lookup-cache-static") {
|
1034
|
-
CHECK_ARG
|
1035
|
-
params.lookup_cache_static = argv[i];
|
1036
|
-
return true;
|
1037
|
-
}
|
1038
|
-
if (arg == "-lcd" || arg == "--lookup-cache-dynamic") {
|
1039
|
-
CHECK_ARG
|
1040
|
-
params.lookup_cache_dynamic = argv[i];
|
1041
|
-
return true;
|
1042
|
-
}
|
1043
|
-
if (arg == "--save-all-logits" || arg == "--kl-divergence-base") {
|
1044
|
-
CHECK_ARG
|
1045
|
-
params.logits_file = argv[i];
|
1046
|
-
return true;
|
1047
|
-
}
|
1048
|
-
if (arg == "--perplexity" || arg == "--all-logits") {
|
1049
|
-
params.logits_all = true;
|
1050
|
-
return true;
|
1051
|
-
}
|
1052
|
-
if (arg == "--ppl-stride") {
|
1053
|
-
CHECK_ARG
|
1054
|
-
params.ppl_stride = std::stoi(argv[i]);
|
1055
|
-
return true;
|
1056
|
-
}
|
1057
|
-
if (arg == "--ppl-output-type") {
|
1058
|
-
CHECK_ARG
|
1059
|
-
params.ppl_output_type = std::stoi(argv[i]);
|
1060
|
-
return true;
|
1061
|
-
}
|
1062
|
-
if (arg == "-ptc" || arg == "--print-token-count") {
|
1063
|
-
CHECK_ARG
|
1064
|
-
params.n_print = std::stoi(argv[i]);
|
1065
|
-
return true;
|
1066
|
-
}
|
1067
|
-
if (arg == "--check-tensors") {
|
1068
|
-
params.check_tensors = true;
|
1069
|
-
return true;
|
1070
|
-
}
|
1071
|
-
if (arg == "--hellaswag") {
|
1072
|
-
params.hellaswag = true;
|
1073
|
-
return true;
|
1074
|
-
}
|
1075
|
-
if (arg == "--hellaswag-tasks") {
|
1076
|
-
CHECK_ARG
|
1077
|
-
params.hellaswag_tasks = std::stoi(argv[i]);
|
1078
|
-
return true;
|
1079
|
-
}
|
1080
|
-
if (arg == "--winogrande") {
|
1081
|
-
params.winogrande = true;
|
1082
|
-
return true;
|
1083
|
-
}
|
1084
|
-
if (arg == "--winogrande-tasks") {
|
1085
|
-
CHECK_ARG
|
1086
|
-
params.winogrande_tasks = std::stoi(argv[i]);
|
1087
|
-
return true;
|
1088
|
-
}
|
1089
|
-
if (arg == "--multiple-choice") {
|
1090
|
-
params.multiple_choice = true;
|
1091
|
-
return true;
|
1092
|
-
}
|
1093
|
-
if (arg == "--multiple-choice-tasks") {
|
1094
|
-
CHECK_ARG
|
1095
|
-
params.multiple_choice_tasks = std::stoi(argv[i]);
|
1096
|
-
return true;
|
1097
|
-
}
|
1098
|
-
if (arg == "--kl-divergence") {
|
1099
|
-
params.kl_divergence = true;
|
1100
|
-
return true;
|
1101
|
-
}
|
1102
|
-
if (arg == "--ignore-eos") {
|
1103
|
-
params.ignore_eos = true;
|
1104
|
-
return true;
|
1105
|
-
}
|
1106
|
-
if (arg == "--penalize-nl") {
|
1107
|
-
sparams.penalize_nl = true;
|
1108
|
-
return true;
|
1109
|
-
}
|
1110
|
-
if (arg == "-l" || arg == "--logit-bias") {
|
1111
|
-
CHECK_ARG
|
1112
|
-
std::stringstream ss(argv[i]);
|
1113
|
-
llama_token key;
|
1114
|
-
char sign;
|
1115
|
-
std::string value_str;
|
1116
408
|
try {
|
1117
|
-
if (
|
1118
|
-
|
409
|
+
if (opt.handler_void) {
|
410
|
+
opt.handler_void(params);
|
411
|
+
continue;
|
1119
412
|
}
|
1120
|
-
|
1121
|
-
|
413
|
+
|
414
|
+
// arg with single value
|
415
|
+
check_arg(i);
|
416
|
+
std::string val = argv[++i];
|
417
|
+
if (opt.handler_int) {
|
418
|
+
opt.handler_int(params, std::stoi(val));
|
419
|
+
continue;
|
1122
420
|
}
|
1123
|
-
|
1124
|
-
|
1125
|
-
|
1126
|
-
|
1127
|
-
|
1128
|
-
|
1129
|
-
|
1130
|
-
|
1131
|
-
|
1132
|
-
|
1133
|
-
|
1134
|
-
if (arg == "--version") {
|
1135
|
-
fprintf(stderr, "version: %d (%s)\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT);
|
1136
|
-
fprintf(stderr, "built with %s for %s\n", LLAMA_COMPILER, LLAMA_BUILD_TARGET);
|
1137
|
-
exit(0);
|
1138
|
-
}
|
1139
|
-
if (arg == "--in-prefix-bos") {
|
1140
|
-
params.input_prefix_bos = true;
|
1141
|
-
params.enable_chat_template = false;
|
1142
|
-
return true;
|
1143
|
-
}
|
1144
|
-
if (arg == "--in-prefix") {
|
1145
|
-
CHECK_ARG
|
1146
|
-
params.input_prefix = argv[i];
|
1147
|
-
params.enable_chat_template = false;
|
1148
|
-
return true;
|
1149
|
-
}
|
1150
|
-
if (arg == "--in-suffix") {
|
1151
|
-
CHECK_ARG
|
1152
|
-
params.input_suffix = argv[i];
|
1153
|
-
params.enable_chat_template = false;
|
1154
|
-
return true;
|
1155
|
-
}
|
1156
|
-
if (arg == "--spm-infill") {
|
1157
|
-
params.spm_infill = true;
|
1158
|
-
return true;
|
1159
|
-
}
|
1160
|
-
if (arg == "--grammar") {
|
1161
|
-
CHECK_ARG
|
1162
|
-
sparams.grammar = argv[i];
|
1163
|
-
return true;
|
1164
|
-
}
|
1165
|
-
if (arg == "--grammar-file") {
|
1166
|
-
CHECK_ARG
|
1167
|
-
std::ifstream file(argv[i]);
|
1168
|
-
if (!file) {
|
1169
|
-
fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
|
1170
|
-
invalid_param = true;
|
1171
|
-
return true;
|
1172
|
-
}
|
1173
|
-
std::copy(
|
1174
|
-
std::istreambuf_iterator<char>(file),
|
1175
|
-
std::istreambuf_iterator<char>(),
|
1176
|
-
std::back_inserter(sparams.grammar)
|
1177
|
-
);
|
1178
|
-
return true;
|
1179
|
-
}
|
1180
|
-
if (arg == "-j" || arg == "--json-schema") {
|
1181
|
-
CHECK_ARG
|
1182
|
-
sparams.grammar = json_schema_to_grammar(json::parse(argv[i]));
|
1183
|
-
return true;
|
1184
|
-
}
|
1185
|
-
if (arg == "--override-kv") {
|
1186
|
-
CHECK_ARG
|
1187
|
-
if (!string_parse_kv_override(argv[i], params.kv_overrides)) {
|
1188
|
-
fprintf(stderr, "error: Invalid type for KV override: %s\n", argv[i]);
|
1189
|
-
invalid_param = true;
|
1190
|
-
return true;
|
1191
|
-
}
|
1192
|
-
return true;
|
1193
|
-
}
|
1194
|
-
if (arg == "--host") {
|
1195
|
-
CHECK_ARG
|
1196
|
-
params.hostname = argv[i];
|
1197
|
-
return true;
|
1198
|
-
}
|
1199
|
-
if (arg == "--port") {
|
1200
|
-
CHECK_ARG
|
1201
|
-
params.port = std::stoi(argv[i]);
|
1202
|
-
return true;
|
1203
|
-
}
|
1204
|
-
if (arg == "--path") {
|
1205
|
-
CHECK_ARG
|
1206
|
-
params.public_path = argv[i];
|
1207
|
-
return true;
|
1208
|
-
}
|
1209
|
-
if (arg == "--api-key") {
|
1210
|
-
CHECK_ARG
|
1211
|
-
params.api_keys.push_back(argv[i]);
|
1212
|
-
return true;
|
1213
|
-
}
|
1214
|
-
if (arg == "--api-key-file") {
|
1215
|
-
CHECK_ARG
|
1216
|
-
std::ifstream key_file(argv[i]);
|
1217
|
-
if (!key_file) {
|
1218
|
-
fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
|
1219
|
-
invalid_param = true;
|
1220
|
-
return true;
|
1221
|
-
}
|
1222
|
-
std::string key;
|
1223
|
-
while (std::getline(key_file, key)) {
|
1224
|
-
if (!key.empty()) {
|
1225
|
-
params.api_keys.push_back(key);
|
421
|
+
if (opt.handler_string) {
|
422
|
+
opt.handler_string(params, val);
|
423
|
+
continue;
|
424
|
+
}
|
425
|
+
|
426
|
+
// arg with 2 values
|
427
|
+
check_arg(i);
|
428
|
+
std::string val2 = argv[++i];
|
429
|
+
if (opt.handler_str_str) {
|
430
|
+
opt.handler_str_str(params, val, val2);
|
431
|
+
continue;
|
1226
432
|
}
|
433
|
+
} catch (std::exception & e) {
|
434
|
+
throw std::invalid_argument(format(
|
435
|
+
"error while handling argument \"%s\": %s\n\n"
|
436
|
+
"usage:\n%s\n\nto show complete usage, run with -h",
|
437
|
+
arg.c_str(), e.what(), arg_to_options[arg]->to_string().c_str()));
|
1227
438
|
}
|
1228
|
-
key_file.close();
|
1229
|
-
return true;
|
1230
439
|
}
|
1231
|
-
|
1232
|
-
|
1233
|
-
|
1234
|
-
|
440
|
+
|
441
|
+
postprocess_cpu_params(params.cpuparams, nullptr);
|
442
|
+
postprocess_cpu_params(params.cpuparams_batch, ¶ms.cpuparams);
|
443
|
+
postprocess_cpu_params(params.draft_cpuparams, ¶ms.cpuparams);
|
444
|
+
postprocess_cpu_params(params.draft_cpuparams_batch, ¶ms.cpuparams_batch);
|
445
|
+
|
446
|
+
if (params.prompt_cache_all && (params.interactive || params.interactive_first)) {
|
447
|
+
throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n");
|
1235
448
|
}
|
1236
|
-
|
1237
|
-
|
1238
|
-
|
1239
|
-
|
449
|
+
|
450
|
+
gpt_params_handle_model_default(params);
|
451
|
+
|
452
|
+
if (params.escape) {
|
453
|
+
string_process_escapes(params.prompt);
|
454
|
+
string_process_escapes(params.input_prefix);
|
455
|
+
string_process_escapes(params.input_suffix);
|
456
|
+
for (auto & antiprompt : params.antiprompt) {
|
457
|
+
string_process_escapes(antiprompt);
|
458
|
+
}
|
1240
459
|
}
|
1241
|
-
|
1242
|
-
|
1243
|
-
params.
|
1244
|
-
params.
|
1245
|
-
return true;
|
460
|
+
|
461
|
+
if (!params.kv_overrides.empty()) {
|
462
|
+
params.kv_overrides.emplace_back();
|
463
|
+
params.kv_overrides.back().key[0] = 0;
|
1246
464
|
}
|
1247
|
-
|
1248
|
-
|
1249
|
-
|
1250
|
-
return true;
|
465
|
+
|
466
|
+
if (sparams.seed == LLAMA_DEFAULT_SEED) {
|
467
|
+
sparams.seed = time(NULL);
|
1251
468
|
}
|
1252
|
-
|
1253
|
-
|
1254
|
-
|
1255
|
-
|
1256
|
-
|
1257
|
-
|
1258
|
-
|
469
|
+
|
470
|
+
return true;
|
471
|
+
}
|
472
|
+
|
473
|
+
bool gpt_params_parse(int argc, char ** argv, gpt_params & params, std::vector<llama_arg> & options) {
|
474
|
+
const auto params_org = params; // the example can modify the default params
|
475
|
+
|
476
|
+
try {
|
477
|
+
if (!gpt_params_parse_ex(argc, argv, params, options)) {
|
478
|
+
params = params_org;
|
479
|
+
return false;
|
1259
480
|
}
|
1260
|
-
|
1261
|
-
|
1262
|
-
|
1263
|
-
|
1264
|
-
|
1265
|
-
|
1266
|
-
params.system_prompt = system_prompt;
|
1267
|
-
return true;
|
1268
|
-
}
|
1269
|
-
if (arg == "--log-format") {
|
1270
|
-
CHECK_ARG
|
1271
|
-
if (std::strcmp(argv[i], "json") == 0) {
|
1272
|
-
params.log_json = true;
|
1273
|
-
} else if (std::strcmp(argv[i], "text") == 0) {
|
1274
|
-
params.log_json = false;
|
1275
|
-
} else {
|
1276
|
-
invalid_param = true;
|
1277
|
-
return true;
|
481
|
+
if (params.usage) {
|
482
|
+
gpt_params_print_usage(params, options);
|
483
|
+
if (params.print_usage) {
|
484
|
+
params.print_usage(argc, argv);
|
485
|
+
}
|
486
|
+
exit(0);
|
1278
487
|
}
|
1279
|
-
|
1280
|
-
|
1281
|
-
|
1282
|
-
|
1283
|
-
return true;
|
488
|
+
} catch (const std::invalid_argument & ex) {
|
489
|
+
fprintf(stderr, "%s\n", ex.what());
|
490
|
+
params = params_org;
|
491
|
+
return false;
|
1284
492
|
}
|
1285
|
-
|
1286
|
-
|
1287
|
-
|
493
|
+
|
494
|
+
return true;
|
495
|
+
}
|
496
|
+
|
497
|
+
bool parse_cpu_range(const std::string & range, bool (&boolmask)[LM_GGML_MAX_N_THREADS]) {
|
498
|
+
size_t dash_loc = range.find('-');
|
499
|
+
if (dash_loc == std::string::npos) {
|
500
|
+
fprintf(stderr, "Format of CPU range is invalid! Expected [<start>]-[<end>].\n");
|
501
|
+
return false;
|
1288
502
|
}
|
1289
|
-
|
1290
|
-
|
1291
|
-
|
1292
|
-
|
1293
|
-
|
1294
|
-
|
503
|
+
|
504
|
+
size_t start_i;
|
505
|
+
size_t end_i;
|
506
|
+
|
507
|
+
if (dash_loc == 0) {
|
508
|
+
start_i = 0;
|
509
|
+
} else {
|
510
|
+
start_i = std::stoull(range.substr(0, dash_loc));
|
511
|
+
if (start_i >= LM_GGML_MAX_N_THREADS) {
|
512
|
+
fprintf(stderr, "Start index out of bounds!\n");
|
513
|
+
return false;
|
1295
514
|
}
|
1296
|
-
return true;
|
1297
515
|
}
|
1298
|
-
|
1299
|
-
|
1300
|
-
|
1301
|
-
|
1302
|
-
|
1303
|
-
|
1304
|
-
|
516
|
+
|
517
|
+
if (dash_loc == range.length() - 1) {
|
518
|
+
end_i = LM_GGML_MAX_N_THREADS - 1;
|
519
|
+
} else {
|
520
|
+
end_i = std::stoull(range.substr(dash_loc + 1));
|
521
|
+
if (end_i >= LM_GGML_MAX_N_THREADS) {
|
522
|
+
fprintf(stderr, "End index out of bounds!\n");
|
523
|
+
return false;
|
1305
524
|
}
|
1306
|
-
params.chat_template = argv[i];
|
1307
|
-
return true;
|
1308
|
-
}
|
1309
|
-
if (arg == "--slot-prompt-similarity" || arg == "-sps") {
|
1310
|
-
CHECK_ARG
|
1311
|
-
params.slot_prompt_similarity = std::stof(argv[i]);
|
1312
|
-
return true;
|
1313
|
-
}
|
1314
|
-
if (arg == "-pps") {
|
1315
|
-
params.is_pp_shared = true;
|
1316
|
-
return true;
|
1317
|
-
}
|
1318
|
-
if (arg == "-npp") {
|
1319
|
-
CHECK_ARG
|
1320
|
-
auto p = string_split<int>(argv[i], split_delim);
|
1321
|
-
params.n_pp.insert(params.n_pp.end(), p.begin(), p.end());
|
1322
|
-
return true;
|
1323
525
|
}
|
1324
|
-
|
1325
|
-
|
1326
|
-
|
1327
|
-
params.n_tg.insert(params.n_tg.end(), p.begin(), p.end());
|
1328
|
-
return true;
|
526
|
+
|
527
|
+
for (size_t i = start_i; i <= end_i; i++) {
|
528
|
+
boolmask[i] = true;
|
1329
529
|
}
|
1330
|
-
|
1331
|
-
|
1332
|
-
|
1333
|
-
|
1334
|
-
|
530
|
+
|
531
|
+
return true;
|
532
|
+
}
|
533
|
+
|
534
|
+
bool parse_cpu_mask(const std::string & mask, bool (&boolmask)[LM_GGML_MAX_N_THREADS]) {
|
535
|
+
// Discard potential 0x prefix
|
536
|
+
size_t start_i = 0;
|
537
|
+
if (mask.length() >= 2 && mask.substr(0, 2) == "0x") {
|
538
|
+
start_i = 2;
|
1335
539
|
}
|
1336
|
-
|
1337
|
-
|
1338
|
-
|
1339
|
-
|
1340
|
-
|
1341
|
-
|
1342
|
-
|
540
|
+
|
541
|
+
size_t num_digits = mask.length() - start_i;
|
542
|
+
if (num_digits > 128) num_digits = 128;
|
543
|
+
|
544
|
+
size_t end_i = num_digits + start_i;
|
545
|
+
|
546
|
+
for (size_t i = start_i, n = (num_digits*4 - 1); i < end_i; i++, n-=4) {
|
547
|
+
char c = mask.at(i);
|
548
|
+
int8_t id = c;
|
549
|
+
|
550
|
+
if ((c >= '0' && c <= '9')) {
|
551
|
+
id -= '0';
|
552
|
+
} else if (c >= 'a' && c <= 'f') {
|
553
|
+
id -= 'a' - 10;
|
554
|
+
} else if (c >= 'A' && c <= 'F') {
|
555
|
+
id -= 'A' - 10;
|
556
|
+
} else {
|
557
|
+
fprintf(stderr, "Invalid hex character '%c' at position %d\n", c, int32_t(i));
|
558
|
+
return false;
|
1343
559
|
}
|
1344
|
-
|
1345
|
-
|
1346
|
-
|
1347
|
-
|
1348
|
-
|
1349
|
-
params.chunk_size = std::stoi(argv[i]);
|
1350
|
-
return true;
|
1351
|
-
}
|
1352
|
-
if (arg == "--chunk-separator") {
|
1353
|
-
CHECK_ARG
|
1354
|
-
params.chunk_separator = argv[i];
|
1355
|
-
return true;
|
1356
|
-
}
|
1357
|
-
if (arg == "--junk") {
|
1358
|
-
CHECK_ARG
|
1359
|
-
params.n_junk = std::stoi(argv[i]);
|
1360
|
-
return true;
|
1361
|
-
}
|
1362
|
-
if (arg == "--pos") {
|
1363
|
-
CHECK_ARG
|
1364
|
-
params.i_pos = std::stoi(argv[i]);
|
1365
|
-
return true;
|
1366
|
-
}
|
1367
|
-
if (arg == "-o" || arg == "--output" || arg == "--output-file") {
|
1368
|
-
CHECK_ARG
|
1369
|
-
params.out_file = argv[i];
|
1370
|
-
params.cvector_outfile = argv[i];
|
1371
|
-
params.lora_outfile = argv[i];
|
1372
|
-
return true;
|
1373
|
-
}
|
1374
|
-
if (arg == "-ofreq" || arg == "--output-frequency") {
|
1375
|
-
CHECK_ARG
|
1376
|
-
params.n_out_freq = std::stoi(argv[i]);
|
1377
|
-
return true;
|
1378
|
-
}
|
1379
|
-
if (arg == "--save-frequency") {
|
1380
|
-
CHECK_ARG
|
1381
|
-
params.n_save_freq = std::stoi(argv[i]);
|
1382
|
-
return true;
|
1383
|
-
}
|
1384
|
-
if (arg == "--process-output") {
|
1385
|
-
params.process_output = true;
|
1386
|
-
return true;
|
1387
|
-
}
|
1388
|
-
if (arg == "--no-ppl") {
|
1389
|
-
params.compute_ppl = false;
|
1390
|
-
return true;
|
1391
|
-
}
|
1392
|
-
if (arg == "--chunk" || arg == "--from-chunk") {
|
1393
|
-
CHECK_ARG
|
1394
|
-
params.i_chunk = std::stoi(argv[i]);
|
1395
|
-
return true;
|
1396
|
-
}
|
1397
|
-
// cvector params
|
1398
|
-
if (arg == "--positive-file") {
|
1399
|
-
CHECK_ARG
|
1400
|
-
params.cvector_positive_file = argv[i];
|
1401
|
-
return true;
|
1402
|
-
}
|
1403
|
-
if (arg == "--negative-file") {
|
1404
|
-
CHECK_ARG
|
1405
|
-
params.cvector_negative_file = argv[i];
|
1406
|
-
return true;
|
1407
|
-
}
|
1408
|
-
if (arg == "--pca-batch") {
|
1409
|
-
CHECK_ARG
|
1410
|
-
params.n_pca_batch = std::stoi(argv[i]);
|
1411
|
-
return true;
|
560
|
+
|
561
|
+
boolmask[ n ] = boolmask[ n ] || ((id & 8) != 0);
|
562
|
+
boolmask[n - 1] = boolmask[n - 1] || ((id & 4) != 0);
|
563
|
+
boolmask[n - 2] = boolmask[n - 2] || ((id & 2) != 0);
|
564
|
+
boolmask[n - 3] = boolmask[n - 3] || ((id & 1) != 0);
|
1412
565
|
}
|
1413
|
-
|
1414
|
-
|
1415
|
-
|
1416
|
-
|
566
|
+
|
567
|
+
return true;
|
568
|
+
}
|
569
|
+
|
570
|
+
static std::vector<std::string> break_str_into_lines(std::string input, size_t max_char_per_line) {
|
571
|
+
std::vector<std::string> result;
|
572
|
+
std::istringstream iss(input);
|
573
|
+
std::string line;
|
574
|
+
auto add_line = [&](const std::string& l) {
|
575
|
+
if (l.length() <= max_char_per_line) {
|
576
|
+
result.push_back(l);
|
577
|
+
} else {
|
578
|
+
std::istringstream line_stream(l);
|
579
|
+
std::string word, current_line;
|
580
|
+
while (line_stream >> word) {
|
581
|
+
if (current_line.length() + !current_line.empty() + word.length() > max_char_per_line) {
|
582
|
+
if (!current_line.empty()) result.push_back(current_line);
|
583
|
+
current_line = word;
|
584
|
+
} else {
|
585
|
+
current_line += (!current_line.empty() ? " " : "") + word;
|
586
|
+
}
|
587
|
+
}
|
588
|
+
if (!current_line.empty()) result.push_back(current_line);
|
589
|
+
}
|
590
|
+
};
|
591
|
+
while (std::getline(iss, line)) {
|
592
|
+
add_line(line);
|
1417
593
|
}
|
1418
|
-
|
1419
|
-
|
1420
|
-
|
1421
|
-
|
1422
|
-
|
1423
|
-
|
1424
|
-
|
594
|
+
return result;
|
595
|
+
}
|
596
|
+
|
597
|
+
std::string llama_arg::to_string() {
|
598
|
+
// params for printing to console
|
599
|
+
const static int n_leading_spaces = 40;
|
600
|
+
const static int n_char_per_line_help = 70; // TODO: detect this based on current console
|
601
|
+
std::string leading_spaces(n_leading_spaces, ' ');
|
602
|
+
|
603
|
+
std::ostringstream ss;
|
604
|
+
for (const auto arg : args) {
|
605
|
+
if (arg == args.front()) {
|
606
|
+
if (args.size() == 1) {
|
607
|
+
ss << arg;
|
608
|
+
} else {
|
609
|
+
// first arg is usually abbreviation, we need padding to make it more beautiful
|
610
|
+
auto tmp = std::string(arg) + ", ";
|
611
|
+
ss << format("%-7s", tmp.c_str());
|
612
|
+
}
|
613
|
+
} else {
|
614
|
+
ss << arg << (arg != args.back() ? ", " : "");
|
615
|
+
}
|
1425
616
|
}
|
1426
|
-
if (
|
1427
|
-
|
1428
|
-
|
617
|
+
if (value_hint) ss << " " << value_hint;
|
618
|
+
if (value_hint_2) ss << " " << value_hint_2;
|
619
|
+
if (ss.tellp() > n_leading_spaces - 3) {
|
620
|
+
// current line is too long, add new line
|
621
|
+
ss << "\n" << leading_spaces;
|
622
|
+
} else {
|
623
|
+
// padding between arg and help, same line
|
624
|
+
ss << std::string(leading_spaces.size() - ss.tellp(), ' ');
|
1429
625
|
}
|
1430
|
-
|
1431
|
-
|
1432
|
-
|
1433
|
-
// Do nothing, log_param_single_parse automatically does it's thing
|
1434
|
-
// and returns if a match was found and parsed.
|
1435
|
-
return true;
|
626
|
+
const auto help_lines = break_str_into_lines(help, n_char_per_line_help);
|
627
|
+
for (const auto & line : help_lines) {
|
628
|
+
ss << (&line == &help_lines.front() ? "" : leading_spaces) << line << "\n";
|
1436
629
|
}
|
1437
|
-
|
1438
|
-
|
1439
|
-
|
1440
|
-
|
1441
|
-
|
1442
|
-
|
1443
|
-
|
1444
|
-
return true;
|
630
|
+
return ss.str();
|
631
|
+
}
|
632
|
+
|
633
|
+
void gpt_params_print_usage(gpt_params & params, std::vector<llama_arg> & options) {
|
634
|
+
auto print_options = [](std::vector<llama_arg *> & options) {
|
635
|
+
for (llama_arg * opt : options) {
|
636
|
+
printf("%s", opt->to_string().c_str());
|
1445
637
|
}
|
1446
|
-
|
1447
|
-
}
|
1448
|
-
// End of Parse args for logging parameters
|
1449
|
-
#endif // LOG_DISABLE_LOGS
|
638
|
+
};
|
1450
639
|
|
1451
|
-
|
640
|
+
std::vector<llama_arg *> common_options;
|
641
|
+
std::vector<llama_arg *> specific_options;
|
642
|
+
for (auto & opt : options) {
|
643
|
+
// in case multiple LLAMA_EXAMPLE_* are set, we prioritize the LLAMA_EXAMPLE_* matching current example
|
644
|
+
if (opt.in_example(params.curr_ex)) {
|
645
|
+
specific_options.push_back(&opt);
|
646
|
+
} else {
|
647
|
+
common_options.push_back(&opt);
|
648
|
+
}
|
649
|
+
}
|
650
|
+
printf("----- common options -----\n\n");
|
651
|
+
print_options(common_options);
|
652
|
+
// TODO: maybe convert enum llama_example to string
|
653
|
+
printf("\n\n----- example-specific options -----\n\n");
|
654
|
+
print_options(specific_options);
|
1452
655
|
}
|
1453
656
|
|
1454
|
-
|
1455
|
-
|
1456
|
-
|
1457
|
-
#else
|
1458
|
-
#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
|
1459
|
-
#endif
|
1460
|
-
#else
|
1461
|
-
#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...)
|
1462
|
-
#endif
|
657
|
+
std::vector<llama_arg> gpt_params_parser_init(gpt_params & params, llama_example ex) {
|
658
|
+
return gpt_params_parser_init(params, ex, nullptr);
|
659
|
+
}
|
1463
660
|
|
1464
|
-
|
1465
|
-
|
661
|
+
std::vector<llama_arg> gpt_params_parser_init(gpt_params & params, llama_example ex, std::function<void(int, char **)> print_usage) {
|
662
|
+
std::vector<llama_arg> options;
|
663
|
+
params.print_usage = print_usage;
|
664
|
+
params.curr_ex = ex;
|
1466
665
|
|
1467
666
|
std::string sampler_type_chars;
|
1468
667
|
std::string sampler_type_names;
|
1469
|
-
for (const auto
|
1470
|
-
sampler_type_chars +=
|
1471
|
-
sampler_type_names +=
|
668
|
+
for (const auto & sampler : params.sparams.samplers) {
|
669
|
+
sampler_type_chars += gpt_sampler_type_to_chr(sampler);
|
670
|
+
sampler_type_names += gpt_sampler_type_to_str(sampler) + ";";
|
1472
671
|
}
|
1473
672
|
sampler_type_names.pop_back();
|
1474
673
|
|
1475
|
-
struct option_info {
|
1476
|
-
LLAMA_COMMON_ATTRIBUTE_FORMAT(4, 5)
|
1477
|
-
option_info(const std::string & tags, const char * args, const char * desc, ...) : tags(tags), args(args), desc(desc) {
|
1478
|
-
va_list args_list;
|
1479
|
-
va_start(args_list, desc);
|
1480
|
-
char buffer[1024];
|
1481
|
-
vsnprintf(buffer, sizeof(buffer), desc, args_list);
|
1482
|
-
va_end(args_list);
|
1483
|
-
this->desc = buffer;
|
1484
|
-
}
|
1485
|
-
|
1486
|
-
option_info(const std::string & grp) : grp(grp) {}
|
1487
674
|
|
1488
|
-
|
1489
|
-
|
1490
|
-
|
1491
|
-
|
675
|
+
/**
|
676
|
+
* filter options by example
|
677
|
+
* rules:
|
678
|
+
* - all examples inherit options from LLAMA_EXAMPLE_COMMON
|
679
|
+
* - if LLAMA_EXAMPLE_* is set (other than COMMON), we only show the option in the corresponding example
|
680
|
+
* - if both {LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_*,} are set, we will prioritize the LLAMA_EXAMPLE_* matching current example
|
681
|
+
*/
|
682
|
+
auto add_opt = [&](llama_arg arg) {
|
683
|
+
if (arg.in_example(ex) || arg.in_example(LLAMA_EXAMPLE_COMMON)) {
|
684
|
+
options.push_back(std::move(arg));
|
685
|
+
}
|
1492
686
|
};
|
1493
687
|
|
1494
|
-
std::vector<option_info> options;
|
1495
|
-
|
1496
|
-
// TODO: filter by tags
|
1497
|
-
|
1498
|
-
options.push_back({ "general" });
|
1499
|
-
options.push_back({ "*", "-h, --help, --usage", "print usage and exit" });
|
1500
|
-
options.push_back({ "*", " --version", "show version and build info" });
|
1501
|
-
options.push_back({ "*", "-v, --verbose", "print verbose information" });
|
1502
|
-
options.push_back({ "*", " --verbosity N", "set specific verbosity level (default: %d)", params.verbosity });
|
1503
|
-
options.push_back({ "*", " --verbose-prompt", "print a verbose prompt before generation (default: %s)", params.verbose_prompt ? "true" : "false" });
|
1504
|
-
options.push_back({ "*", " --no-display-prompt", "don't print prompt at generation (default: %s)", !params.display_prompt ? "true" : "false" });
|
1505
|
-
options.push_back({ "*", "-co, --color", "colorise output to distinguish prompt and user input from generations (default: %s)", params.use_color ? "true" : "false" });
|
1506
|
-
options.push_back({ "*", "-s, --seed SEED", "RNG seed (default: %d, use random seed for < 0)", params.seed });
|
1507
|
-
options.push_back({ "*", "-t, --threads N", "number of threads to use during generation (default: %d)", params.n_threads });
|
1508
|
-
options.push_back({ "*", "-tb, --threads-batch N", "number of threads to use during batch and prompt processing (default: same as --threads)" });
|
1509
|
-
options.push_back({ "speculative", "-td, --threads-draft N", "number of threads to use during generation (default: same as --threads)" });
|
1510
|
-
options.push_back({ "speculative", "-tbd, --threads-batch-draft N",
|
1511
|
-
"number of threads to use during batch and prompt processing (default: same as --threads-draft)" });
|
1512
|
-
options.push_back({ "speculative", " --draft N", "number of tokens to draft for speculative decoding (default: %d)", params.n_draft });
|
1513
|
-
options.push_back({ "speculative", "-ps, --p-split N", "speculative decoding split probability (default: %.1f)", (double)params.p_split });
|
1514
|
-
options.push_back({ "*", "-lcs, --lookup-cache-static FNAME",
|
1515
|
-
"path to static lookup cache to use for lookup decoding (not updated by generation)" });
|
1516
|
-
options.push_back({ "*", "-lcd, --lookup-cache-dynamic FNAME",
|
1517
|
-
"path to dynamic lookup cache to use for lookup decoding (updated by generation)" });
|
1518
|
-
|
1519
|
-
options.push_back({ "*", "-c, --ctx-size N", "size of the prompt context (default: %d, 0 = loaded from model)", params.n_ctx });
|
1520
|
-
options.push_back({ "*", "-n, --predict N", "number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)", params.n_predict });
|
1521
|
-
options.push_back({ "*", "-b, --batch-size N", "logical maximum batch size (default: %d)", params.n_batch });
|
1522
|
-
options.push_back({ "*", "-ub, --ubatch-size N", "physical maximum batch size (default: %d)", params.n_ubatch });
|
1523
|
-
options.push_back({ "*", " --keep N", "number of tokens to keep from the initial prompt (default: %d, -1 = all)", params.n_keep });
|
1524
|
-
options.push_back({ "*", " --chunks N", "max number of chunks to process (default: %d, -1 = all)", params.n_chunks });
|
1525
|
-
options.push_back({ "*", "-fa, --flash-attn", "enable Flash Attention (default: %s)", params.flash_attn ? "enabled" : "disabled" });
|
1526
|
-
options.push_back({ "*", "-p, --prompt PROMPT", "prompt to start generation with\n"
|
1527
|
-
"in conversation mode, this will be used as system prompt\n"
|
1528
|
-
"(default: '%s')", params.prompt.c_str() });
|
1529
|
-
options.push_back({ "*", "-f, --file FNAME", "a file containing the prompt (default: none)" });
|
1530
|
-
options.push_back({ "*", " --in-file FNAME", "an input file (repeat to specify multiple files)" });
|
1531
|
-
options.push_back({ "*", "-bf, --binary-file FNAME", "binary file containing the prompt (default: none)" });
|
1532
|
-
options.push_back({ "*", "-e, --escape", "process escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\) (default: %s)", params.escape ? "true" : "false" });
|
1533
|
-
options.push_back({ "*", " --no-escape", "do not process escape sequences" });
|
1534
|
-
options.push_back({ "main", "-ptc, --print-token-count N", "print token count every N tokens (default: %d)", params.n_print });
|
1535
|
-
options.push_back({ "main", " --prompt-cache FNAME", "file to cache prompt state for faster startup (default: none)" });
|
1536
|
-
options.push_back({ "main", " --prompt-cache-all", "if specified, saves user input and generations to cache as well\n"
|
1537
|
-
"not supported with --interactive or other interactive options" });
|
1538
|
-
options.push_back({ "main", " --prompt-cache-ro", "if specified, uses the prompt cache but does not update it" });
|
1539
|
-
options.push_back({ "main", "-r, --reverse-prompt PROMPT",
|
1540
|
-
"halt generation at PROMPT, return control in interactive mode\n"
|
1541
|
-
"can be specified more than once for multiple prompts" });
|
1542
|
-
options.push_back({ "main", "-sp, --special", "special tokens output enabled (default: %s)", params.special ? "true" : "false" });
|
1543
|
-
options.push_back({ "main", "-cnv, --conversation", "run in conversation mode, does not print special tokens and suffix/prefix\n"
|
1544
|
-
"if suffix/prefix are not specified, default chat template will be used\n"
|
1545
|
-
"(default: %s)", params.conversation ? "true" : "false" });
|
1546
|
-
options.push_back({ "main infill", "-i, --interactive", "run in interactive mode (default: %s)", params.interactive ? "true" : "false" });
|
1547
|
-
options.push_back({ "main infill", "-if, --interactive-first", "run in interactive mode and wait for input right away (default: %s)", params.interactive_first ? "true" : "false" });
|
1548
|
-
options.push_back({ "main infill", "-mli, --multiline-input", "allows you to write or paste multiple lines without ending each in '\\'" });
|
1549
|
-
options.push_back({ "main infill", " --in-prefix-bos", "prefix BOS to user inputs, preceding the `--in-prefix` string" });
|
1550
|
-
options.push_back({ "main infill", " --in-prefix STRING", "string to prefix user inputs with (default: empty)" });
|
1551
|
-
options.push_back({ "main infill", " --in-suffix STRING", "string to suffix after user inputs with (default: empty)" });
|
1552
|
-
options.push_back({ "main", " --no-warmup", "skip warming up the model with an empty run" });
|
1553
|
-
options.push_back({ "server infill",
|
1554
|
-
" --spm-infill", "use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: %s)", params.spm_infill ? "enabled" : "disabled" });
|
1555
|
-
|
1556
|
-
options.push_back({ "sampling" });
|
1557
|
-
options.push_back({ "*", " --samplers SAMPLERS", "samplers that will be used for generation in the order, separated by \';\'\n"
|
1558
|
-
"(default: %s)", sampler_type_names.c_str() });
|
1559
|
-
options.push_back({ "*", " --sampling-seq SEQUENCE",
|
1560
|
-
"simplified sequence for samplers that will be used (default: %s)", sampler_type_chars.c_str() });
|
1561
|
-
options.push_back({ "*", " --ignore-eos", "ignore end of stream token and continue generating (implies --logit-bias EOS-inf)" });
|
1562
|
-
options.push_back({ "*", " --penalize-nl", "penalize newline tokens (default: %s)", sparams.penalize_nl ? "true" : "false" });
|
1563
|
-
options.push_back({ "*", " --temp N", "temperature (default: %.1f)", (double)sparams.temp });
|
1564
|
-
options.push_back({ "*", " --top-k N", "top-k sampling (default: %d, 0 = disabled)", sparams.top_k });
|
1565
|
-
options.push_back({ "*", " --top-p N", "top-p sampling (default: %.1f, 1.0 = disabled)", (double)sparams.top_p });
|
1566
|
-
options.push_back({ "*", " --min-p N", "min-p sampling (default: %.1f, 0.0 = disabled)", (double)sparams.min_p });
|
1567
|
-
options.push_back({ "*", " --tfs N", "tail free sampling, parameter z (default: %.1f, 1.0 = disabled)", (double)sparams.tfs_z });
|
1568
|
-
options.push_back({ "*", " --typical N", "locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)", (double)sparams.typical_p });
|
1569
|
-
options.push_back({ "*", " --repeat-last-n N", "last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)", sparams.penalty_last_n });
|
1570
|
-
options.push_back({ "*", " --repeat-penalty N", "penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)", (double)sparams.penalty_repeat });
|
1571
|
-
options.push_back({ "*", " --presence-penalty N", "repeat alpha presence penalty (default: %.1f, 0.0 = disabled)", (double)sparams.penalty_present });
|
1572
|
-
options.push_back({ "*", " --frequency-penalty N", "repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)", (double)sparams.penalty_freq });
|
1573
|
-
options.push_back({ "*", " --dynatemp-range N", "dynamic temperature range (default: %.1f, 0.0 = disabled)", (double)sparams.dynatemp_range });
|
1574
|
-
options.push_back({ "*", " --dynatemp-exp N", "dynamic temperature exponent (default: %.1f)", (double)sparams.dynatemp_exponent });
|
1575
|
-
options.push_back({ "*", " --mirostat N", "use Mirostat sampling.\n"
|
1576
|
-
"Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n"
|
1577
|
-
"(default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)", sparams.mirostat });
|
1578
|
-
options.push_back({ "*", " --mirostat-lr N", "Mirostat learning rate, parameter eta (default: %.1f)", (double)sparams.mirostat_eta });
|
1579
|
-
options.push_back({ "*", " --mirostat-ent N", "Mirostat target entropy, parameter tau (default: %.1f)", (double)sparams.mirostat_tau });
|
1580
|
-
options.push_back({ "*", " -l TOKEN_ID(+/-)BIAS", "modifies the likelihood of token appearing in the completion,\n"
|
1581
|
-
"i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',\n"
|
1582
|
-
"or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'" });
|
1583
|
-
options.push_back({ "main", " --cfg-negative-prompt PROMPT",
|
1584
|
-
"negative prompt to use for guidance (default: '%s')", sparams.cfg_negative_prompt.c_str() });
|
1585
|
-
options.push_back({ "main", " --cfg-negative-prompt-file FNAME",
|
1586
|
-
"negative prompt file to use for guidance" });
|
1587
|
-
options.push_back({ "main", " --cfg-scale N", "strength of guidance (default: %.1f, 1.0 = disable)", (double)sparams.cfg_scale });
|
1588
|
-
options.push_back({ "main", " --chat-template JINJA_TEMPLATE",
|
1589
|
-
"set custom jinja chat template (default: template taken from model's metadata)\n"
|
1590
|
-
"if suffix/prefix are specified, template will be disabled\n"
|
1591
|
-
"only commonly used templates are accepted:\n"
|
1592
|
-
"https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template" });
|
1593
|
-
options.push_back({ "grammar" });
|
1594
|
-
options.push_back({ "*", " --grammar GRAMMAR", "BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '%s')", sparams.grammar.c_str() });
|
1595
|
-
options.push_back({ "*", " --grammar-file FNAME", "file to read grammar from" });
|
1596
|
-
options.push_back({ "*", "-j, --json-schema SCHEMA",
|
1597
|
-
"JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object\n"
|
1598
|
-
"For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead" });
|
1599
|
-
|
1600
|
-
options.push_back({ "embedding" });
|
1601
|
-
options.push_back({ "embedding", " --pooling {none,mean,cls,last}",
|
1602
|
-
"pooling type for embeddings, use model default if unspecified" });
|
1603
|
-
options.push_back({ "embedding", " --attention {causal,non-causal}",
|
1604
|
-
"attention type for embeddings, use model default if unspecified" });
|
1605
|
-
|
1606
|
-
options.push_back({ "context hacking" });
|
1607
|
-
options.push_back({ "*", " --rope-scaling {none,linear,yarn}",
|
1608
|
-
"RoPE frequency scaling method, defaults to linear unless specified by the model" });
|
1609
|
-
options.push_back({ "*", " --rope-scale N", "RoPE context scaling factor, expands context by a factor of N" });
|
1610
|
-
options.push_back({ "*", " --rope-freq-base N", "RoPE base frequency, used by NTK-aware scaling (default: loaded from model)" });
|
1611
|
-
options.push_back({ "*", " --rope-freq-scale N", "RoPE frequency scaling factor, expands context by a factor of 1/N" });
|
1612
|
-
options.push_back({ "*", " --yarn-orig-ctx N", "YaRN: original context size of model (default: %d = model training context size)", params.yarn_orig_ctx });
|
1613
|
-
options.push_back({ "*", " --yarn-ext-factor N", "YaRN: extrapolation mix factor (default: %.1f, 0.0 = full interpolation)", (double)params.yarn_ext_factor });
|
1614
|
-
options.push_back({ "*", " --yarn-attn-factor N", "YaRN: scale sqrt(t) or attention magnitude (default: %.1f)", (double)params.yarn_attn_factor });
|
1615
|
-
options.push_back({ "*", " --yarn-beta-slow N", "YaRN: high correction dim or alpha (default: %.1f)", (double)params.yarn_beta_slow });
|
1616
|
-
options.push_back({ "*", " --yarn-beta-fast N", "YaRN: low correction dim or beta (default: %.1f)", (double)params.yarn_beta_fast });
|
1617
|
-
options.push_back({ "*", "-gan, --grp-attn-n N", "group-attention factor (default: %d)", params.grp_attn_n });
|
1618
|
-
options.push_back({ "*", "-gaw, --grp-attn-w N", "group-attention width (default: %.1f)", (double)params.grp_attn_w });
|
1619
|
-
options.push_back({ "*", "-dkvc, --dump-kv-cache", "verbose print of the KV cache" });
|
1620
|
-
options.push_back({ "*", "-nkvo, --no-kv-offload", "disable KV offload" });
|
1621
|
-
options.push_back({ "*", "-ctk, --cache-type-k TYPE", "KV cache data type for K (default: %s)", params.cache_type_k.c_str() });
|
1622
|
-
options.push_back({ "*", "-ctv, --cache-type-v TYPE", "KV cache data type for V (default: %s)", params.cache_type_v.c_str() });
|
1623
|
-
|
1624
|
-
options.push_back({ "perplexity" });
|
1625
|
-
options.push_back({ "perplexity", " --all-logits", "return logits for all tokens in the batch (default: %s)", params.logits_all ? "true" : "false" });
|
1626
|
-
options.push_back({ "perplexity", " --hellaswag", "compute HellaSwag score over random tasks from datafile supplied with -f" });
|
1627
|
-
options.push_back({ "perplexity", " --hellaswag-tasks N", "number of tasks to use when computing the HellaSwag score (default: %zu)", params.hellaswag_tasks });
|
1628
|
-
options.push_back({ "perplexity", " --winogrande", "compute Winogrande score over random tasks from datafile supplied with -f" });
|
1629
|
-
options.push_back({ "perplexity", " --winogrande-tasks N", "number of tasks to use when computing the Winogrande score (default: %zu)", params.winogrande_tasks });
|
1630
|
-
options.push_back({ "perplexity", " --multiple-choice", "compute multiple choice score over random tasks from datafile supplied with -f" });
|
1631
|
-
options.push_back({ "perplexity", " --multiple-choice-tasks N",
|
1632
|
-
"number of tasks to use when computing the multiple choice score (default: %zu)", params.multiple_choice_tasks });
|
1633
|
-
options.push_back({ "perplexity", " --kl-divergence", "computes KL-divergence to logits provided via --kl-divergence-base" });
|
1634
|
-
options.push_back({ "perplexity", " --ppl-stride N", "stride for perplexity calculation (default: %d)", params.ppl_stride });
|
1635
|
-
options.push_back({ "perplexity", " --ppl-output-type {0,1}",
|
1636
|
-
"output type for perplexity calculation (default: %d)", params.ppl_output_type });
|
1637
|
-
|
1638
|
-
options.push_back({ "parallel" });
|
1639
|
-
options.push_back({ "*", "-dt, --defrag-thold N", "KV cache defragmentation threshold (default: %.1f, < 0 - disabled)", (double)params.defrag_thold });
|
1640
|
-
options.push_back({ "*", "-np, --parallel N", "number of parallel sequences to decode (default: %d)", params.n_parallel });
|
1641
|
-
options.push_back({ "*", "-ns, --sequences N", "number of sequences to decode (default: %d)", params.n_sequences });
|
1642
|
-
options.push_back({ "*", "-cb, --cont-batching", "enable continuous batching (a.k.a dynamic batching) (default: %s)", params.cont_batching ? "enabled" : "disabled" });
|
1643
|
-
options.push_back({ "*", "-nocb, --no-cont-batching", "disable continuous batching" });
|
1644
|
-
|
1645
|
-
options.push_back({ "multi-modality" });
|
1646
|
-
options.push_back({ "*", " --mmproj FILE", "path to a multimodal projector file for LLaVA. see examples/llava/README.md" });
|
1647
|
-
options.push_back({ "*", " --image FILE", "path to an image file. use with multimodal models. Specify multiple times for batching" });
|
1648
|
-
|
1649
|
-
options.push_back({ "backend" });
|
1650
|
-
options.push_back({ "*", " --rpc SERVERS", "comma separated list of RPC servers" });
|
1651
|
-
|
1652
|
-
if (llama_supports_mlock()) {
|
1653
|
-
options.push_back({ "*", " --mlock", "force system to keep model in RAM rather than swapping or compressing" });
|
1654
|
-
}
|
1655
|
-
if (llama_supports_mmap()) {
|
1656
|
-
options.push_back({ "*", " --no-mmap", "do not memory-map model (slower load but may reduce pageouts if not using mlock)" });
|
1657
|
-
}
|
1658
|
-
options.push_back({ "*", " --numa TYPE", "attempt optimizations that help on some NUMA systems\n"
|
1659
|
-
" - distribute: spread execution evenly over all nodes\n"
|
1660
|
-
" - isolate: only spawn threads on CPUs on the node that execution started on\n"
|
1661
|
-
" - numactl: use the CPU map provided by numactl\n"
|
1662
|
-
"if run without this previously, it is recommended to drop the system page cache before using this\n"
|
1663
|
-
"see https://github.com/ggerganov/llama.cpp/issues/1437" });
|
1664
|
-
|
1665
|
-
if (llama_supports_gpu_offload()) {
|
1666
|
-
options.push_back({ "*", "-ngl, --gpu-layers N",
|
1667
|
-
"number of layers to store in VRAM" });
|
1668
|
-
options.push_back({ "*", "-ngld, --gpu-layers-draft N",
|
1669
|
-
"number of layers to store in VRAM for the draft model" });
|
1670
|
-
options.push_back({ "*", "-sm, --split-mode SPLIT_MODE",
|
1671
|
-
"how to split the model across multiple GPUs, one of:\n"
|
1672
|
-
" - none: use one GPU only\n"
|
1673
|
-
" - layer (default): split layers and KV across GPUs\n"
|
1674
|
-
" - row: split rows across GPUs" });
|
1675
|
-
options.push_back({ "*", "-ts, --tensor-split SPLIT",
|
1676
|
-
"fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1" });
|
1677
|
-
options.push_back({ "*", "-mg, --main-gpu i", "the GPU to use for the model (with split-mode = none),\n"
|
1678
|
-
"or for intermediate results and KV (with split-mode = row) (default: %d)", params.main_gpu });
|
1679
|
-
}
|
1680
|
-
|
1681
|
-
options.push_back({ "model" });
|
1682
|
-
options.push_back({ "*", " --check-tensors", "check model tensor data for invalid values (default: %s)", params.check_tensors ? "true" : "false" });
|
1683
|
-
options.push_back({ "*", " --override-kv KEY=TYPE:VALUE",
|
1684
|
-
"advanced option to override model metadata by key. may be specified multiple times.\n"
|
1685
|
-
"types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false" });
|
1686
|
-
options.push_back({ "*", " --lora FNAME", "apply LoRA adapter (can be repeated to use multiple adapters)" });
|
1687
|
-
options.push_back({ "*", " --lora-scaled FNAME S", "apply LoRA adapter with user defined scaling S (can be repeated to use multiple adapters)" });
|
1688
|
-
options.push_back({ "*", " --control-vector FNAME", "add a control vector\n"
|
1689
|
-
"note: this argument can be repeated to add multiple control vectors" });
|
1690
|
-
options.push_back({ "*", " --control-vector-scaled FNAME SCALE",
|
1691
|
-
"add a control vector with user defined scaling SCALE\n"
|
1692
|
-
"note: this argument can be repeated to add multiple scaled control vectors" });
|
1693
|
-
options.push_back({ "*", " --control-vector-layer-range START END",
|
1694
|
-
"layer range to apply the control vector(s) to, start and end inclusive" });
|
1695
|
-
options.push_back({ "*", "-m, --model FNAME", "model path (default: models/$filename with filename from --hf-file\n"
|
1696
|
-
"or --model-url if set, otherwise %s)", DEFAULT_MODEL_PATH });
|
1697
|
-
options.push_back({ "*", "-md, --model-draft FNAME", "draft model for speculative decoding (default: unused)" });
|
1698
|
-
options.push_back({ "*", "-mu, --model-url MODEL_URL", "model download url (default: unused)" });
|
1699
|
-
options.push_back({ "*", "-hfr, --hf-repo REPO", "Hugging Face model repository (default: unused)" });
|
1700
|
-
options.push_back({ "*", "-hff, --hf-file FILE", "Hugging Face model file (default: unused)" });
|
1701
|
-
options.push_back({ "*", "-hft, --hf-token TOKEN", "Hugging Face access token (default: value from HF_TOKEN environment variable)" });
|
1702
|
-
|
1703
|
-
options.push_back({ "retrieval" });
|
1704
|
-
options.push_back({ "retrieval", " --context-file FNAME", "file to load context from (repeat to specify multiple files)" });
|
1705
|
-
options.push_back({ "retrieval", " --chunk-size N", "minimum length of embedded text chunks (default: %d)", params.chunk_size });
|
1706
|
-
options.push_back({ "retrieval", " --chunk-separator STRING",
|
1707
|
-
"separator between chunks (default: '%s')", params.chunk_separator.c_str() });
|
1708
|
-
|
1709
|
-
options.push_back({ "passkey" });
|
1710
|
-
options.push_back({ "passkey", " --junk N", "number of times to repeat the junk text (default: %d)", params.n_junk });
|
1711
|
-
options.push_back({ "passkey", " --pos N", "position of the passkey in the junk text (default: %d)", params.i_pos });
|
1712
|
-
|
1713
|
-
options.push_back({ "imatrix" });
|
1714
|
-
options.push_back({ "imatrix", "-o, --output FNAME", "output file (default: '%s')", params.out_file.c_str() });
|
1715
|
-
options.push_back({ "imatrix", " --output-frequency N", "output the imatrix every N iterations (default: %d)", params.n_out_freq });
|
1716
|
-
options.push_back({ "imatrix", " --save-frequency N", "save an imatrix copy every N iterations (default: %d)", params.n_save_freq });
|
1717
|
-
options.push_back({ "imatrix", " --process-output", "collect data for the output tensor (default: %s)", params.process_output ? "true" : "false" });
|
1718
|
-
options.push_back({ "imatrix", " --no-ppl", "do not compute perplexity (default: %s)", params.compute_ppl ? "true" : "false" });
|
1719
|
-
options.push_back({ "imatrix", " --chunk N", "start processing the input from chunk N (default: %d)", params.i_chunk });
|
1720
|
-
|
1721
|
-
options.push_back({ "bench" });
|
1722
|
-
options.push_back({ "bench", "-pps", "is the prompt shared across parallel sequences (default: %s)", params.is_pp_shared ? "true" : "false" });
|
1723
|
-
options.push_back({ "bench", "-npp n0,n1,...", "number of prompt tokens" });
|
1724
|
-
options.push_back({ "bench", "-ntg n0,n1,...", "number of text generation tokens" });
|
1725
|
-
options.push_back({ "bench", "-npl n0,n1,...", "number of parallel prompts" });
|
1726
|
-
|
1727
|
-
options.push_back({ "embedding" });
|
1728
|
-
options.push_back({ "embedding", " --embd-normalize", "normalisation for embendings (default: %d) (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)", params.embd_normalize });
|
1729
|
-
options.push_back({ "embedding", " --embd-output-format", "empty = default, \"array\" = [[],[]...], \"json\" = openai style, \"json+\" = same \"json\" + cosine similarity matrix" });
|
1730
|
-
options.push_back({ "embedding", " --embd-separator", "separator of embendings (default \\n) for example \"<#sep#>\"" });
|
1731
|
-
|
1732
|
-
options.push_back({ "server" });
|
1733
|
-
options.push_back({ "server", " --host HOST", "ip address to listen (default: %s)", params.hostname.c_str() });
|
1734
|
-
options.push_back({ "server", " --port PORT", "port to listen (default: %d)", params.port });
|
1735
|
-
options.push_back({ "server", " --path PATH", "path to serve static files from (default: %s)", params.public_path.c_str() });
|
1736
|
-
options.push_back({ "server", " --embedding(s)", "restrict to only support embedding use case; use only with dedicated embedding models (default: %s)", params.embedding ? "enabled" : "disabled" });
|
1737
|
-
options.push_back({ "server", " --api-key KEY", "API key to use for authentication (default: none)" });
|
1738
|
-
options.push_back({ "server", " --api-key-file FNAME", "path to file containing API keys (default: none)" });
|
1739
|
-
options.push_back({ "server", " --ssl-key-file FNAME", "path to file a PEM-encoded SSL private key" });
|
1740
|
-
options.push_back({ "server", " --ssl-cert-file FNAME", "path to file a PEM-encoded SSL certificate" });
|
1741
|
-
options.push_back({ "server", " --timeout N", "server read/write timeout in seconds (default: %d)", params.timeout_read });
|
1742
|
-
options.push_back({ "server", " --threads-http N", "number of threads used to process HTTP requests (default: %d)", params.n_threads_http });
|
1743
|
-
options.push_back({ "server", " --system-prompt-file FNAME",
|
1744
|
-
"set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications" });
|
1745
|
-
options.push_back({ "server", " --log-format {text,json}",
|
1746
|
-
"log output format: json or text (default: json)" });
|
1747
|
-
options.push_back({ "server", " --metrics", "enable prometheus compatible metrics endpoint (default: %s)", params.endpoint_metrics ? "enabled" : "disabled" });
|
1748
|
-
options.push_back({ "server", " --no-slots", "disables slots monitoring endpoint (default: %s)", params.endpoint_slots ? "enabled" : "disabled" });
|
1749
|
-
options.push_back({ "server", " --slot-save-path PATH", "path to save slot kv cache (default: disabled)" });
|
1750
|
-
options.push_back({ "server", " --chat-template JINJA_TEMPLATE",
|
1751
|
-
"set custom jinja chat template (default: template taken from model's metadata)\n"
|
1752
|
-
"only commonly used templates are accepted:\n"
|
1753
|
-
"https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template" });
|
1754
|
-
options.push_back({ "server", "-sps, --slot-prompt-similarity SIMILARITY",
|
1755
|
-
"how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity });
|
1756
|
-
options.push_back({ "server", " --lora-init-without-apply", "load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: %s)", params.lora_init_without_apply ? "enabled" : "disabled"});
|
1757
|
-
|
1758
|
-
#ifndef LOG_DISABLE_LOGS
|
1759
|
-
options.push_back({ "logging" });
|
1760
|
-
options.push_back({ "*", " --simple-io", "use basic IO for better compatibility in subprocesses and limited consoles" });
|
1761
|
-
options.push_back({ "*", "-ld, --logdir LOGDIR", "path under which to save YAML logs (no logging if unset)" });
|
1762
|
-
options.push_back({ "logging", " --log-test", "Run simple logging test" });
|
1763
|
-
options.push_back({ "logging", " --log-disable", "Disable trace logs" });
|
1764
|
-
options.push_back({ "logging", " --log-enable", "Enable trace logs" });
|
1765
|
-
options.push_back({ "logging", " --log-file FNAME", "Specify a log filename (without extension)" });
|
1766
|
-
options.push_back({ "logging", " --log-new", "Create a separate new log file on start. "
|
1767
|
-
"Each log file will have unique name: \"<name>.<ID>.log\"" });
|
1768
|
-
options.push_back({ "logging", " --log-append", "Don't truncate the old log file." });
|
1769
|
-
#endif // LOG_DISABLE_LOGS
|
1770
|
-
|
1771
|
-
options.push_back({ "cvector" });
|
1772
|
-
options.push_back({ "cvector", "-o, --output FNAME", "output file (default: '%s')", params.cvector_outfile.c_str() });
|
1773
|
-
options.push_back({ "cvector", " --positive-file FNAME", "positive prompts file, one prompt per line (default: '%s')", params.cvector_positive_file.c_str() });
|
1774
|
-
options.push_back({ "cvector", " --negative-file FNAME", "negative prompts file, one prompt per line (default: '%s')", params.cvector_negative_file.c_str() });
|
1775
|
-
options.push_back({ "cvector", " --pca-batch N", "batch size used for PCA. Larger batch runs faster, but uses more memory (default: %d)", params.n_pca_batch });
|
1776
|
-
options.push_back({ "cvector", " --pca-iter N", "number of iterations used for PCA (default: %d)", params.n_pca_iterations });
|
1777
|
-
options.push_back({ "cvector", " --method {pca,mean}", "dimensionality reduction method to be used (default: pca)" });
|
1778
|
-
|
1779
|
-
options.push_back({ "export-lora" });
|
1780
|
-
options.push_back({ "export-lora", "-m, --model", "model path from which to load base model (default '%s')", params.model.c_str() });
|
1781
|
-
options.push_back({ "export-lora", " --lora FNAME", "path to LoRA adapter (can be repeated to use multiple adapters)" });
|
1782
|
-
options.push_back({ "export-lora", " --lora-scaled FNAME S", "path to LoRA adapter with user defined scaling S (can be repeated to use multiple adapters)" });
|
1783
|
-
options.push_back({ "*", "-t, --threads N", "number of threads to use during computation (default: %d)", params.n_threads });
|
1784
|
-
options.push_back({ "export-lora", "-o, --output FNAME", "output file (default: '%s')", params.lora_outfile.c_str() });
|
1785
688
|
|
1786
|
-
|
1787
|
-
|
1788
|
-
|
1789
|
-
|
1790
|
-
|
1791
|
-
continue;
|
689
|
+
add_opt(llama_arg(
|
690
|
+
{"-h", "--help", "--usage"},
|
691
|
+
"print usage and exit",
|
692
|
+
[](gpt_params & params) {
|
693
|
+
params.usage = true;
|
1792
694
|
}
|
1793
|
-
|
1794
|
-
|
1795
|
-
|
695
|
+
));
|
696
|
+
add_opt(llama_arg(
|
697
|
+
{"--version"},
|
698
|
+
"show version and build info",
|
699
|
+
[](gpt_params &) {
|
700
|
+
fprintf(stderr, "version: %d (%s)\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT);
|
701
|
+
fprintf(stderr, "built with %s for %s\n", LLAMA_COMPILER, LLAMA_BUILD_TARGET);
|
702
|
+
exit(0);
|
703
|
+
}
|
704
|
+
));
|
705
|
+
add_opt(llama_arg(
|
706
|
+
{"-v", "--verbose"},
|
707
|
+
"print verbose information",
|
708
|
+
[](gpt_params & params) {
|
709
|
+
params.verbosity = 1;
|
710
|
+
}
|
711
|
+
));
|
712
|
+
add_opt(llama_arg(
|
713
|
+
{"--verbosity"}, "N",
|
714
|
+
format("set specific verbosity level (default: %d)", params.verbosity),
|
715
|
+
[](gpt_params & params, int value) {
|
716
|
+
params.verbosity = value;
|
717
|
+
}
|
718
|
+
));
|
719
|
+
add_opt(llama_arg(
|
720
|
+
{"--verbose-prompt"},
|
721
|
+
format("print a verbose prompt before generation (default: %s)", params.verbose_prompt ? "true" : "false"),
|
722
|
+
[](gpt_params & params) {
|
723
|
+
params.verbose_prompt = true;
|
724
|
+
}
|
725
|
+
).set_examples({LLAMA_EXAMPLE_MAIN}));
|
726
|
+
add_opt(llama_arg(
|
727
|
+
{"--no-display-prompt"},
|
728
|
+
format("don't print prompt at generation (default: %s)", !params.display_prompt ? "true" : "false"),
|
729
|
+
[](gpt_params & params) {
|
730
|
+
params.display_prompt = false;
|
731
|
+
}
|
732
|
+
).set_examples({LLAMA_EXAMPLE_MAIN}));
|
733
|
+
add_opt(llama_arg(
|
734
|
+
{"-co", "--color"},
|
735
|
+
format("colorise output to distinguish prompt and user input from generations (default: %s)", params.use_color ? "true" : "false"),
|
736
|
+
[](gpt_params & params) {
|
737
|
+
params.use_color = true;
|
738
|
+
}
|
739
|
+
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL}));
|
740
|
+
add_opt(llama_arg(
|
741
|
+
{"-s", "--seed"}, "SEED",
|
742
|
+
format("RNG seed (default: %d, use random seed for < 0)", params.sparams.seed),
|
743
|
+
[](gpt_params & params, const std::string & value) {
|
744
|
+
params.sparams.seed = std::stoul(value);
|
745
|
+
}
|
746
|
+
));
|
747
|
+
add_opt(llama_arg(
|
748
|
+
{"-t", "--threads"}, "N",
|
749
|
+
format("number of threads to use during generation (default: %d)", params.cpuparams.n_threads),
|
750
|
+
[](gpt_params & params, int value) {
|
751
|
+
params.cpuparams.n_threads = value;
|
752
|
+
if (params.cpuparams.n_threads <= 0) {
|
753
|
+
params.cpuparams.n_threads = std::thread::hardware_concurrency();
|
754
|
+
}
|
1796
755
|
}
|
1797
|
-
|
1798
|
-
|
1799
|
-
|
1800
|
-
|
1801
|
-
|
1802
|
-
|
1803
|
-
|
1804
|
-
|
756
|
+
).set_env("LLAMA_ARG_THREADS"));
|
757
|
+
add_opt(llama_arg(
|
758
|
+
{"-tb", "--threads-batch"}, "N",
|
759
|
+
"number of threads to use during batch and prompt processing (default: same as --threads)",
|
760
|
+
[](gpt_params & params, int value) {
|
761
|
+
params.cpuparams_batch.n_threads = value;
|
762
|
+
if (params.cpuparams_batch.n_threads <= 0) {
|
763
|
+
params.cpuparams_batch.n_threads = std::thread::hardware_concurrency();
|
764
|
+
}
|
765
|
+
}
|
766
|
+
));
|
767
|
+
add_opt(llama_arg(
|
768
|
+
{"-td", "--threads-draft"}, "N",
|
769
|
+
"number of threads to use during generation (default: same as --threads)",
|
770
|
+
[](gpt_params & params, int value) {
|
771
|
+
params.draft_cpuparams.n_threads = value;
|
772
|
+
if (params.draft_cpuparams.n_threads <= 0) {
|
773
|
+
params.draft_cpuparams.n_threads = std::thread::hardware_concurrency();
|
774
|
+
}
|
775
|
+
}
|
776
|
+
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
777
|
+
add_opt(llama_arg(
|
778
|
+
{"-tbd", "--threads-batch-draft"}, "N",
|
779
|
+
"number of threads to use during batch and prompt processing (default: same as --threads-draft)",
|
780
|
+
[](gpt_params & params, int value) {
|
781
|
+
params.draft_cpuparams_batch.n_threads = value;
|
782
|
+
if (params.draft_cpuparams_batch.n_threads <= 0) {
|
783
|
+
params.draft_cpuparams_batch.n_threads = std::thread::hardware_concurrency();
|
784
|
+
}
|
785
|
+
}
|
786
|
+
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
787
|
+
add_opt(llama_arg(
|
788
|
+
{"-C", "--cpu-mask"}, "M",
|
789
|
+
"CPU affinity mask: arbitrarily long hex. Complements cpu-range (default: \"\")",
|
790
|
+
[](gpt_params & params, const std::string & mask) {
|
791
|
+
params.cpuparams.mask_valid = true;
|
792
|
+
if (!parse_cpu_mask(mask, params.cpuparams.cpumask)) {
|
793
|
+
throw std::invalid_argument("invalid cpumask");
|
794
|
+
}
|
795
|
+
}
|
796
|
+
));
|
797
|
+
add_opt(llama_arg(
|
798
|
+
{"-Cr", "--cpu-range"}, "lo-hi",
|
799
|
+
"range of CPUs for affinity. Complements --cpu-mask",
|
800
|
+
[](gpt_params & params, const std::string & range) {
|
801
|
+
params.cpuparams.mask_valid = true;
|
802
|
+
if (!parse_cpu_range(range, params.cpuparams.cpumask)) {
|
803
|
+
throw std::invalid_argument("invalid range");
|
804
|
+
}
|
805
|
+
}
|
806
|
+
));
|
807
|
+
add_opt(llama_arg(
|
808
|
+
{"--cpu-strict"}, "<0|1>",
|
809
|
+
format("use strict CPU placement (default: %u)\n", (unsigned) params.cpuparams.strict_cpu),
|
810
|
+
[](gpt_params & params, const std::string & value) {
|
811
|
+
params.cpuparams.strict_cpu = std::stoul(value);
|
812
|
+
}
|
813
|
+
));
|
814
|
+
add_opt(llama_arg(
|
815
|
+
{"--prio"}, "N",
|
816
|
+
format("set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.cpuparams.priority),
|
817
|
+
[](gpt_params & params, int prio) {
|
818
|
+
if (prio < 0 || prio > 3) {
|
819
|
+
throw std::invalid_argument("invalid value");
|
820
|
+
}
|
821
|
+
params.cpuparams.priority = (enum lm_ggml_sched_priority) prio;
|
822
|
+
}
|
823
|
+
));
|
824
|
+
add_opt(llama_arg(
|
825
|
+
{"--poll"}, "<0...100>",
|
826
|
+
format("use polling level to wait for work (0 - no polling, default: %u)\n", (unsigned) params.cpuparams.poll),
|
827
|
+
[](gpt_params & params, const std::string & value) {
|
828
|
+
params.cpuparams.poll = std::stoul(value);
|
829
|
+
}
|
830
|
+
));
|
831
|
+
add_opt(llama_arg(
|
832
|
+
{"-Cb", "--cpu-mask-batch"}, "M",
|
833
|
+
"CPU affinity mask: arbitrarily long hex. Complements cpu-range-batch (default: same as --cpu-mask)",
|
834
|
+
[](gpt_params & params, const std::string & mask) {
|
835
|
+
params.cpuparams_batch.mask_valid = true;
|
836
|
+
if (!parse_cpu_mask(mask, params.cpuparams_batch.cpumask)) {
|
837
|
+
throw std::invalid_argument("invalid cpumask");
|
838
|
+
}
|
839
|
+
}
|
840
|
+
));
|
841
|
+
add_opt(llama_arg(
|
842
|
+
{"-Crb", "--cpu-range-batch"}, "lo-hi",
|
843
|
+
"ranges of CPUs for affinity. Complements --cpu-mask-batch",
|
844
|
+
[](gpt_params & params, const std::string & range) {
|
845
|
+
params.cpuparams_batch.mask_valid = true;
|
846
|
+
if (!parse_cpu_range(range, params.cpuparams_batch.cpumask)) {
|
847
|
+
throw std::invalid_argument("invalid range");
|
848
|
+
}
|
849
|
+
}
|
850
|
+
));
|
851
|
+
add_opt(llama_arg(
|
852
|
+
{"--cpu-strict-batch"}, "<0|1>",
|
853
|
+
"use strict CPU placement (default: same as --cpu-strict)",
|
854
|
+
[](gpt_params & params, int value) {
|
855
|
+
params.cpuparams_batch.strict_cpu = value;
|
856
|
+
}
|
857
|
+
));
|
858
|
+
add_opt(llama_arg(
|
859
|
+
{"--prio-batch"}, "N",
|
860
|
+
format("set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.cpuparams_batch.priority),
|
861
|
+
[](gpt_params & params, int prio) {
|
862
|
+
if (prio < 0 || prio > 3) {
|
863
|
+
throw std::invalid_argument("invalid value");
|
864
|
+
}
|
865
|
+
params.cpuparams_batch.priority = (enum lm_ggml_sched_priority) prio;
|
866
|
+
}
|
867
|
+
));
|
868
|
+
add_opt(llama_arg(
|
869
|
+
{"--poll-batch"}, "<0|1>",
|
870
|
+
"use polling to wait for work (default: same as --poll)",
|
871
|
+
[](gpt_params & params, int value) {
|
872
|
+
params.cpuparams_batch.poll = value;
|
873
|
+
}
|
874
|
+
));
|
875
|
+
add_opt(llama_arg(
|
876
|
+
{"-Cd", "--cpu-mask-draft"}, "M",
|
877
|
+
"Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask)",
|
878
|
+
[](gpt_params & params, const std::string & mask) {
|
879
|
+
params.draft_cpuparams.mask_valid = true;
|
880
|
+
if (!parse_cpu_mask(mask, params.draft_cpuparams.cpumask)) {
|
881
|
+
throw std::invalid_argument("invalid cpumask");
|
882
|
+
}
|
883
|
+
}
|
884
|
+
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
885
|
+
add_opt(llama_arg(
|
886
|
+
{"-Crd", "--cpu-range-draft"}, "lo-hi",
|
887
|
+
"Ranges of CPUs for affinity. Complements --cpu-mask-draft",
|
888
|
+
[](gpt_params & params, const std::string & range) {
|
889
|
+
params.draft_cpuparams.mask_valid = true;
|
890
|
+
if (!parse_cpu_range(range, params.draft_cpuparams.cpumask)) {
|
891
|
+
throw std::invalid_argument("invalid range");
|
892
|
+
}
|
893
|
+
}
|
894
|
+
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
895
|
+
add_opt(llama_arg(
|
896
|
+
{"--cpu-strict-draft"}, "<0|1>",
|
897
|
+
"Use strict CPU placement for draft model (default: same as --cpu-strict)",
|
898
|
+
[](gpt_params & params, int value) {
|
899
|
+
params.draft_cpuparams.strict_cpu = value;
|
900
|
+
}
|
901
|
+
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
902
|
+
add_opt(llama_arg(
|
903
|
+
{"--prio-draft"}, "N",
|
904
|
+
format("set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.draft_cpuparams.priority),
|
905
|
+
[](gpt_params & params, int prio) {
|
906
|
+
if (prio < 0 || prio > 3) {
|
907
|
+
throw std::invalid_argument("invalid value");
|
908
|
+
}
|
909
|
+
params.draft_cpuparams.priority = (enum lm_ggml_sched_priority) prio;
|
910
|
+
}
|
911
|
+
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
912
|
+
add_opt(llama_arg(
|
913
|
+
{"--poll-draft"}, "<0|1>",
|
914
|
+
"Use polling to wait for draft model work (default: same as --poll)",
|
915
|
+
[](gpt_params & params, int value) {
|
916
|
+
params.draft_cpuparams.poll = value;
|
917
|
+
}
|
918
|
+
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
919
|
+
add_opt(llama_arg(
|
920
|
+
{"-Cbd", "--cpu-mask-batch-draft"}, "M",
|
921
|
+
"Draft model CPU affinity mask. Complements cpu-range-batch-draft (default: same as --cpu-mask)",
|
922
|
+
[](gpt_params & params, const std::string & mask) {
|
923
|
+
params.draft_cpuparams_batch.mask_valid = true;
|
924
|
+
if (!parse_cpu_mask(mask, params.draft_cpuparams_batch.cpumask)) {
|
925
|
+
throw std::invalid_argument("invalid cpumask");
|
926
|
+
}
|
927
|
+
}
|
928
|
+
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
929
|
+
add_opt(llama_arg(
|
930
|
+
{"-Crbd", "--cpu-range-batch-draft"}, "lo-hi",
|
931
|
+
"Ranges of CPUs for affinity. Complements --cpu-mask-batch-draft",
|
932
|
+
[](gpt_params & params, const std::string & range) {
|
933
|
+
params.draft_cpuparams_batch.mask_valid = true;
|
934
|
+
if (!parse_cpu_range(range, params.draft_cpuparams_batch.cpumask)) {
|
935
|
+
// parse_cpu_range() rejected the input: report a range error, like the other --cpu-range* handlers
throw std::invalid_argument("invalid range");
|
936
|
+
}
|
937
|
+
}
|
938
|
+
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
939
|
+
add_opt(llama_arg(
|
940
|
+
{"--cpu-strict-batch-draft"}, "<0|1>",
|
941
|
+
"Use strict CPU placement for draft model (default: --cpu-strict-draft)",
|
942
|
+
[](gpt_params & params, int value) {
|
943
|
+
params.draft_cpuparams_batch.strict_cpu = value;
|
944
|
+
}
|
945
|
+
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
946
|
+
add_opt(llama_arg(
|
947
|
+
{"--prio-batch-draft"}, "N",
|
948
|
+
format("set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.draft_cpuparams_batch.priority),
|
949
|
+
[](gpt_params & params, int prio) {
|
950
|
+
if (prio < 0 || prio > 3) {
|
951
|
+
throw std::invalid_argument("invalid value");
|
952
|
+
}
|
953
|
+
params.draft_cpuparams_batch.priority = (enum lm_ggml_sched_priority) prio;
|
954
|
+
}
|
955
|
+
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
956
|
+
add_opt(llama_arg(
|
957
|
+
{"--poll-batch-draft"}, "<0|1>",
|
958
|
+
"Use polling to wait for draft model work (default: --poll-draft)",
|
959
|
+
[](gpt_params & params, int value) {
|
960
|
+
params.draft_cpuparams_batch.poll = value;
|
961
|
+
}
|
962
|
+
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
963
|
+
add_opt(llama_arg(
|
964
|
+
{"--draft"}, "N",
|
965
|
+
format("number of tokens to draft for speculative decoding (default: %d)", params.n_draft),
|
966
|
+
[](gpt_params & params, int value) {
|
967
|
+
params.n_draft = value;
|
968
|
+
}
|
969
|
+
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
970
|
+
add_opt(llama_arg(
|
971
|
+
{"-ps", "--p-split"}, "N",
|
972
|
+
format("speculative decoding split probability (default: %.1f)", (double)params.p_split),
|
973
|
+
[](gpt_params & params, const std::string & value) {
|
974
|
+
params.p_split = std::stof(value);
|
975
|
+
}
|
976
|
+
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
977
|
+
add_opt(llama_arg(
|
978
|
+
{"-lcs", "--lookup-cache-static"}, "FNAME",
|
979
|
+
"path to static lookup cache to use for lookup decoding (not updated by generation)",
|
980
|
+
[](gpt_params & params, const std::string & value) {
|
981
|
+
params.lookup_cache_static = value;
|
982
|
+
}
|
983
|
+
));
|
984
|
+
add_opt(llama_arg(
|
985
|
+
{"-lcd", "--lookup-cache-dynamic"}, "FNAME",
|
986
|
+
"path to dynamic lookup cache to use for lookup decoding (updated by generation)",
|
987
|
+
[](gpt_params & params, const std::string & value) {
|
988
|
+
params.lookup_cache_dynamic = value;
|
989
|
+
}
|
990
|
+
));
|
991
|
+
add_opt(llama_arg(
|
992
|
+
{"-c", "--ctx-size"}, "N",
|
993
|
+
format("size of the prompt context (default: %d, 0 = loaded from model)", params.n_ctx),
|
994
|
+
[](gpt_params & params, int value) {
|
995
|
+
params.n_ctx = value;
|
996
|
+
}
|
997
|
+
).set_env("LLAMA_ARG_CTX_SIZE"));
|
998
|
+
add_opt(llama_arg(
|
999
|
+
{"-n", "--predict", "--n-predict"}, "N",
|
1000
|
+
format("number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)", params.n_predict),
|
1001
|
+
[](gpt_params & params, int value) {
|
1002
|
+
params.n_predict = value;
|
1003
|
+
}
|
1004
|
+
).set_env("LLAMA_ARG_N_PREDICT"));
|
1005
|
+
add_opt(llama_arg(
|
1006
|
+
{"-b", "--batch-size"}, "N",
|
1007
|
+
format("logical maximum batch size (default: %d)", params.n_batch),
|
1008
|
+
[](gpt_params & params, int value) {
|
1009
|
+
params.n_batch = value;
|
1010
|
+
}
|
1011
|
+
).set_env("LLAMA_ARG_BATCH"));
|
1012
|
+
add_opt(llama_arg(
|
1013
|
+
{"-ub", "--ubatch-size"}, "N",
|
1014
|
+
format("physical maximum batch size (default: %d)", params.n_ubatch),
|
1015
|
+
[](gpt_params & params, int value) {
|
1016
|
+
params.n_ubatch = value;
|
1017
|
+
}
|
1018
|
+
).set_env("LLAMA_ARG_UBATCH"));
|
1019
|
+
add_opt(llama_arg(
|
1020
|
+
{"--keep"}, "N",
|
1021
|
+
format("number of tokens to keep from the initial prompt (default: %d, -1 = all)", params.n_keep),
|
1022
|
+
[](gpt_params & params, int value) {
|
1023
|
+
params.n_keep = value;
|
1024
|
+
}
|
1025
|
+
));
|
1026
|
+
add_opt(llama_arg(
|
1027
|
+
{"--chunks"}, "N",
|
1028
|
+
format("max number of chunks to process (default: %d, -1 = all)", params.n_chunks),
|
1029
|
+
[](gpt_params & params, int value) {
|
1030
|
+
params.n_chunks = value;
|
1031
|
+
}
|
1032
|
+
));
|
1033
|
+
add_opt(llama_arg(
|
1034
|
+
{"-fa", "--flash-attn"},
|
1035
|
+
format("enable Flash Attention (default: %s)", params.flash_attn ? "enabled" : "disabled"),
|
1036
|
+
[](gpt_params & params) {
|
1037
|
+
params.flash_attn = true;
|
1038
|
+
}
|
1039
|
+
).set_env("LLAMA_ARG_FLASH_ATTN"));
|
1040
|
+
add_opt(llama_arg(
|
1041
|
+
{"-p", "--prompt"}, "PROMPT",
|
1042
|
+
ex == LLAMA_EXAMPLE_MAIN
|
1043
|
+
? "prompt to start generation with\nif -cnv is set, this will be used as system prompt"
|
1044
|
+
: "prompt to start generation with",
|
1045
|
+
[](gpt_params & params, const std::string & value) {
|
1046
|
+
params.prompt = value;
|
1047
|
+
}
|
1048
|
+
));
|
1049
|
+
add_opt(llama_arg(
|
1050
|
+
{"-f", "--file"}, "FNAME",
|
1051
|
+
"a file containing the prompt (default: none)",
|
1052
|
+
[](gpt_params & params, const std::string & value) {
|
1053
|
+
std::ifstream file(value);
|
1054
|
+
if (!file) {
|
1055
|
+
throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str()));
|
1056
|
+
}
|
1057
|
+
// store the external file name in params
|
1058
|
+
params.prompt_file = value;
|
1059
|
+
std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.prompt));
|
1060
|
+
if (!params.prompt.empty() && params.prompt.back() == '\n') {
|
1061
|
+
params.prompt.pop_back();
|
1062
|
+
}
|
1063
|
+
}
|
1064
|
+
));
|
1065
|
+
add_opt(llama_arg(
|
1066
|
+
{"--in-file"}, "FNAME",
|
1067
|
+
"an input file (repeat to specify multiple files)",
|
1068
|
+
[](gpt_params & params, const std::string & value) {
|
1069
|
+
std::ifstream file(value);
|
1070
|
+
if (!file) {
|
1071
|
+
throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str()));
|
1072
|
+
}
|
1073
|
+
params.in_files.push_back(value);
|
1074
|
+
}
|
1075
|
+
));
|
1076
|
+
add_opt(llama_arg(
|
1077
|
+
{"-bf", "--binary-file"}, "FNAME",
|
1078
|
+
"binary file containing the prompt (default: none)",
|
1079
|
+
[](gpt_params & params, const std::string & value) {
|
1080
|
+
std::ifstream file(value, std::ios::binary);
|
1081
|
+
if (!file) {
|
1082
|
+
throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str()));
|
1083
|
+
}
|
1084
|
+
// store the external file name in params
|
1085
|
+
params.prompt_file = value;
|
1086
|
+
std::ostringstream ss;
|
1087
|
+
ss << file.rdbuf();
|
1088
|
+
params.prompt = ss.str();
|
1089
|
+
fprintf(stderr, "Read %zu bytes from binary file %s\n", params.prompt.size(), value.c_str());
|
1090
|
+
}
|
1091
|
+
));
|
1092
|
+
add_opt(llama_arg(
|
1093
|
+
{"-e", "--escape"},
|
1094
|
+
format("process escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\) (default: %s)", params.escape ? "true" : "false"),
|
1095
|
+
[](gpt_params & params) {
|
1096
|
+
params.escape = true;
|
1097
|
+
}
|
1098
|
+
));
|
1099
|
+
add_opt(llama_arg(
|
1100
|
+
{"--no-escape"},
|
1101
|
+
"do not process escape sequences",
|
1102
|
+
[](gpt_params & params) {
|
1103
|
+
params.escape = false;
|
1104
|
+
}
|
1105
|
+
));
|
1106
|
+
add_opt(llama_arg(
|
1107
|
+
{"-ptc", "--print-token-count"}, "N",
|
1108
|
+
format("print token count every N tokens (default: %d)", params.n_print),
|
1109
|
+
[](gpt_params & params, int value) {
|
1110
|
+
params.n_print = value;
|
1111
|
+
}
|
1112
|
+
).set_examples({LLAMA_EXAMPLE_MAIN}));
|
1113
|
+
add_opt(llama_arg(
|
1114
|
+
{"--prompt-cache"}, "FNAME",
|
1115
|
+
"file to cache prompt state for faster startup (default: none)",
|
1116
|
+
[](gpt_params & params, const std::string & value) {
|
1117
|
+
params.path_prompt_cache = value;
|
1118
|
+
}
|
1119
|
+
).set_examples({LLAMA_EXAMPLE_MAIN}));
|
1120
|
+
add_opt(llama_arg(
|
1121
|
+
{"--prompt-cache-all"},
|
1122
|
+
"if specified, saves user input and generations to cache as well\n",
|
1123
|
+
[](gpt_params & params) {
|
1124
|
+
params.prompt_cache_all = true;
|
1125
|
+
}
|
1126
|
+
).set_examples({LLAMA_EXAMPLE_MAIN}));
|
1127
|
+
add_opt(llama_arg(
|
1128
|
+
{"--prompt-cache-ro"},
|
1129
|
+
"if specified, uses the prompt cache but does not update it",
|
1130
|
+
[](gpt_params & params) {
|
1131
|
+
params.prompt_cache_ro = true;
|
1132
|
+
}
|
1133
|
+
).set_examples({LLAMA_EXAMPLE_MAIN}));
|
1134
|
+
add_opt(llama_arg(
|
1135
|
+
{"-r", "--reverse-prompt"}, "PROMPT",
|
1136
|
+
"halt generation at PROMPT, return control in interactive mode\n",
|
1137
|
+
[](gpt_params & params, const std::string & value) {
|
1138
|
+
params.antiprompt.emplace_back(value);
|
1139
|
+
}
|
1140
|
+
).set_examples({LLAMA_EXAMPLE_MAIN}));
|
1141
|
+
add_opt(llama_arg(
|
1142
|
+
{"-sp", "--special"},
|
1143
|
+
format("special tokens output enabled (default: %s)", params.special ? "true" : "false"),
|
1144
|
+
[](gpt_params & params) {
|
1145
|
+
params.special = true;
|
1146
|
+
}
|
1147
|
+
).set_examples({LLAMA_EXAMPLE_MAIN}));
|
1148
|
+
add_opt(llama_arg(
|
1149
|
+
{"-cnv", "--conversation"},
|
1150
|
+
format(
|
1151
|
+
"run in conversation mode:\n"
|
1152
|
+
"- does not print special tokens and suffix/prefix\n"
|
1153
|
+
"- interactive mode is also enabled\n"
|
1154
|
+
"(default: %s)",
|
1155
|
+
params.conversation ? "true" : "false"
|
1156
|
+
),
|
1157
|
+
[](gpt_params & params) {
|
1158
|
+
params.conversation = true;
|
1159
|
+
}
|
1160
|
+
).set_examples({LLAMA_EXAMPLE_MAIN}));
|
1161
|
+
add_opt(llama_arg(
|
1162
|
+
{"-i", "--interactive"},
|
1163
|
+
format("run in interactive mode (default: %s)", params.interactive ? "true" : "false"),
|
1164
|
+
[](gpt_params & params) {
|
1165
|
+
params.interactive = true;
|
1166
|
+
}
|
1167
|
+
).set_examples({LLAMA_EXAMPLE_MAIN}));
|
1168
|
+
add_opt(llama_arg(
|
1169
|
+
{"-if", "--interactive-first"},
|
1170
|
+
format("run in interactive mode and wait for input right away (default: %s)", params.interactive_first ? "true" : "false"),
|
1171
|
+
[](gpt_params & params) {
|
1172
|
+
params.interactive_first = true;
|
1173
|
+
}
|
1174
|
+
).set_examples({LLAMA_EXAMPLE_MAIN}));
|
1175
|
+
add_opt(llama_arg(
|
1176
|
+
{"-mli", "--multiline-input"},
|
1177
|
+
"allows you to write or paste multiple lines without ending each in '\\'",
|
1178
|
+
[](gpt_params & params) {
|
1179
|
+
params.multiline_input = true;
|
1180
|
+
}
|
1181
|
+
).set_examples({LLAMA_EXAMPLE_MAIN}));
|
1182
|
+
add_opt(llama_arg(
|
1183
|
+
{"--in-prefix-bos"},
|
1184
|
+
"prefix BOS to user inputs, preceding the `--in-prefix` string",
|
1185
|
+
[](gpt_params & params) {
|
1186
|
+
params.input_prefix_bos = true;
|
1187
|
+
params.enable_chat_template = false;
|
1188
|
+
}
|
1189
|
+
).set_examples({LLAMA_EXAMPLE_MAIN}));
|
1190
|
+
add_opt(llama_arg(
|
1191
|
+
{"--in-prefix"}, "STRING",
|
1192
|
+
"string to prefix user inputs with (default: empty)",
|
1193
|
+
[](gpt_params & params, const std::string & value) {
|
1194
|
+
params.input_prefix = value;
|
1195
|
+
params.enable_chat_template = false;
|
1196
|
+
}
|
1197
|
+
).set_examples({LLAMA_EXAMPLE_MAIN}));
|
1198
|
+
add_opt(llama_arg(
|
1199
|
+
{"--in-suffix"}, "STRING",
|
1200
|
+
"string to suffix after user inputs with (default: empty)",
|
1201
|
+
[](gpt_params & params, const std::string & value) {
|
1202
|
+
params.input_suffix = value;
|
1203
|
+
params.enable_chat_template = false;
|
1204
|
+
}
|
1205
|
+
).set_examples({LLAMA_EXAMPLE_MAIN}));
|
1206
|
+
add_opt(llama_arg(
|
1207
|
+
{"--no-warmup"},
|
1208
|
+
"skip warming up the model with an empty run",
|
1209
|
+
[](gpt_params & params) {
|
1210
|
+
params.warmup = false;
|
1211
|
+
}
|
1212
|
+
).set_examples({LLAMA_EXAMPLE_MAIN}));
|
1213
|
+
add_opt(llama_arg(
|
1214
|
+
{"--spm-infill"},
|
1215
|
+
format(
|
1216
|
+
"use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: %s)",
|
1217
|
+
params.spm_infill ? "enabled" : "disabled"
|
1218
|
+
),
|
1219
|
+
[](gpt_params & params) {
|
1220
|
+
params.spm_infill = true;
|
1221
|
+
}
|
1222
|
+
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_INFILL}));
|
1223
|
+
add_opt(llama_arg(
|
1224
|
+
{"--samplers"}, "SAMPLERS",
|
1225
|
+
format("samplers that will be used for generation in the order, separated by \';\'\n(default: %s)", sampler_type_names.c_str()),
|
1226
|
+
[](gpt_params & params, const std::string & value) {
|
1227
|
+
const auto sampler_names = string_split(value, ';');
|
1228
|
+
params.sparams.samplers = gpt_sampler_types_from_names(sampler_names, true);
|
1229
|
+
}
|
1230
|
+
));
|
1231
|
+
add_opt(llama_arg(
|
1232
|
+
{"--sampling-seq"}, "SEQUENCE",
|
1233
|
+
format("simplified sequence for samplers that will be used (default: %s)", sampler_type_chars.c_str()),
|
1234
|
+
[](gpt_params & params, const std::string & value) {
|
1235
|
+
params.sparams.samplers = gpt_sampler_types_from_chars(value);
|
1236
|
+
}
|
1237
|
+
));
|
1238
|
+
add_opt(llama_arg(
|
1239
|
+
{"--ignore-eos"},
|
1240
|
+
"ignore end of stream token and continue generating (implies --logit-bias EOS-inf)",
|
1241
|
+
[](gpt_params & params) {
|
1242
|
+
params.sparams.ignore_eos = true;
|
1243
|
+
}
|
1244
|
+
));
|
1245
|
+
add_opt(llama_arg(
|
1246
|
+
{"--penalize-nl"},
|
1247
|
+
format("penalize newline tokens (default: %s)", params.sparams.penalize_nl ? "true" : "false"),
|
1248
|
+
[](gpt_params & params) {
|
1249
|
+
params.sparams.penalize_nl = true;
|
1250
|
+
}
|
1251
|
+
));
|
1252
|
+
add_opt(llama_arg(
|
1253
|
+
{"--temp"}, "N",
|
1254
|
+
format("temperature (default: %.1f)", (double)params.sparams.temp),
|
1255
|
+
[](gpt_params & params, const std::string & value) {
|
1256
|
+
params.sparams.temp = std::stof(value);
|
1257
|
+
params.sparams.temp = std::max(params.sparams.temp, 0.0f);
|
1258
|
+
}
|
1259
|
+
));
|
1260
|
+
add_opt(llama_arg(
|
1261
|
+
{"--top-k"}, "N",
|
1262
|
+
format("top-k sampling (default: %d, 0 = disabled)", params.sparams.top_k),
|
1263
|
+
[](gpt_params & params, int value) {
|
1264
|
+
params.sparams.top_k = value;
|
1265
|
+
}
|
1266
|
+
));
|
1267
|
+
add_opt(llama_arg(
|
1268
|
+
{"--top-p"}, "N",
|
1269
|
+
format("top-p sampling (default: %.1f, 1.0 = disabled)", (double)params.sparams.top_p),
|
1270
|
+
[](gpt_params & params, const std::string & value) {
|
1271
|
+
params.sparams.top_p = std::stof(value);
|
1272
|
+
}
|
1273
|
+
));
|
1274
|
+
add_opt(llama_arg(
|
1275
|
+
{"--min-p"}, "N",
|
1276
|
+
format("min-p sampling (default: %.1f, 0.0 = disabled)", (double)params.sparams.min_p),
|
1277
|
+
[](gpt_params & params, const std::string & value) {
|
1278
|
+
params.sparams.min_p = std::stof(value);
|
1279
|
+
}
|
1280
|
+
));
|
1281
|
+
add_opt(llama_arg(
|
1282
|
+
{"--tfs"}, "N",
|
1283
|
+
format("tail free sampling, parameter z (default: %.1f, 1.0 = disabled)", (double)params.sparams.tfs_z),
|
1284
|
+
[](gpt_params & params, const std::string & value) {
|
1285
|
+
params.sparams.tfs_z = std::stof(value);
|
1286
|
+
}
|
1287
|
+
));
|
1288
|
+
add_opt(llama_arg(
|
1289
|
+
{"--typical"}, "N",
|
1290
|
+
format("locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)", (double)params.sparams.typ_p),
|
1291
|
+
[](gpt_params & params, const std::string & value) {
|
1292
|
+
params.sparams.typ_p = std::stof(value);
|
1293
|
+
}
|
1294
|
+
));
|
1295
|
+
add_opt(llama_arg(
|
1296
|
+
{"--repeat-last-n"}, "N",
|
1297
|
+
format("last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)", params.sparams.penalty_last_n),
|
1298
|
+
[](gpt_params & params, int value) {
|
1299
|
+
params.sparams.penalty_last_n = value;
|
1300
|
+
params.sparams.n_prev = std::max(params.sparams.n_prev, params.sparams.penalty_last_n);
|
1301
|
+
}
|
1302
|
+
));
|
1303
|
+
add_opt(llama_arg(
|
1304
|
+
{"--repeat-penalty"}, "N",
|
1305
|
+
format("penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)", (double)params.sparams.penalty_repeat),
|
1306
|
+
[](gpt_params & params, const std::string & value) {
|
1307
|
+
params.sparams.penalty_repeat = std::stof(value);
|
1308
|
+
}
|
1309
|
+
));
|
1310
|
+
add_opt(llama_arg(
|
1311
|
+
{"--presence-penalty"}, "N",
|
1312
|
+
format("repeat alpha presence penalty (default: %.1f, 0.0 = disabled)", (double)params.sparams.penalty_present),
|
1313
|
+
[](gpt_params & params, const std::string & value) {
|
1314
|
+
params.sparams.penalty_present = std::stof(value);
|
1315
|
+
}
|
1316
|
+
));
|
1317
|
+
add_opt(llama_arg(
|
1318
|
+
{"--frequency-penalty"}, "N",
|
1319
|
+
format("repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)", (double)params.sparams.penalty_freq),
|
1320
|
+
[](gpt_params & params, const std::string & value) {
|
1321
|
+
params.sparams.penalty_freq = std::stof(value);
|
1322
|
+
}
|
1323
|
+
));
|
1324
|
+
add_opt(llama_arg(
|
1325
|
+
{"--dynatemp-range"}, "N",
|
1326
|
+
format("dynamic temperature range (default: %.1f, 0.0 = disabled)", (double)params.sparams.dynatemp_range),
|
1327
|
+
[](gpt_params & params, const std::string & value) {
|
1328
|
+
params.sparams.dynatemp_range = std::stof(value);
|
1329
|
+
}
|
1330
|
+
));
|
1331
|
+
add_opt(llama_arg(
|
1332
|
+
{"--dynatemp-exp"}, "N",
|
1333
|
+
format("dynamic temperature exponent (default: %.1f)", (double)params.sparams.dynatemp_exponent),
|
1334
|
+
[](gpt_params & params, const std::string & value) {
|
1335
|
+
params.sparams.dynatemp_exponent = std::stof(value);
|
1336
|
+
}
|
1337
|
+
));
|
1338
|
+
add_opt(llama_arg(
|
1339
|
+
{"--mirostat"}, "N",
|
1340
|
+
format("use Mirostat sampling.\nTop K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n"
|
1341
|
+
"(default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)", params.sparams.mirostat),
|
1342
|
+
[](gpt_params & params, int value) {
|
1343
|
+
params.sparams.mirostat = value;
|
1344
|
+
}
|
1345
|
+
));
|
1346
|
+
add_opt(llama_arg(
|
1347
|
+
{"--mirostat-lr"}, "N",
|
1348
|
+
format("Mirostat learning rate, parameter eta (default: %.1f)", (double)params.sparams.mirostat_eta),
|
1349
|
+
[](gpt_params & params, const std::string & value) {
|
1350
|
+
params.sparams.mirostat_eta = std::stof(value);
|
1351
|
+
}
|
1352
|
+
));
|
1353
|
+
add_opt(llama_arg(
|
1354
|
+
{"--mirostat-ent"}, "N",
|
1355
|
+
format("Mirostat target entropy, parameter tau (default: %.1f)", (double)params.sparams.mirostat_tau),
|
1356
|
+
[](gpt_params & params, const std::string & value) {
|
1357
|
+
params.sparams.mirostat_tau = std::stof(value);
|
1358
|
+
}
|
1359
|
+
));
|
1360
|
+
add_opt(llama_arg(
|
1361
|
+
{"-l", "--logit-bias"}, "TOKEN_ID(+/-)BIAS",
|
1362
|
+
"modifies the likelihood of token appearing in the completion,\n"
|
1363
|
+
"i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',\n"
|
1364
|
+
"or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'",
|
1365
|
+
[](gpt_params & params, const std::string & value) {
|
1366
|
+
std::stringstream ss(value);
|
1367
|
+
llama_token key;
|
1368
|
+
char sign;
|
1369
|
+
std::string value_str;
|
1370
|
+
try {
|
1371
|
+
if (ss >> key && ss >> sign && std::getline(ss, value_str) && (sign == '+' || sign == '-')) {
|
1372
|
+
const float bias = std::stof(value_str) * ((sign == '-') ? -1.0f : 1.0f);
|
1373
|
+
params.sparams.logit_bias.push_back({key, bias});
|
1374
|
+
} else {
|
1375
|
+
throw std::invalid_argument("invalid input format");
|
1376
|
+
}
|
1377
|
+
} catch (const std::exception&) {
|
1378
|
+
throw std::invalid_argument("invalid input format");
|
1379
|
+
}
|
1380
|
+
}
|
1381
|
+
));
|
1382
|
+
add_opt(llama_arg(
|
1383
|
+
{"--grammar"}, "GRAMMAR",
|
1384
|
+
format("BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '%s')", params.sparams.grammar.c_str()),
|
1385
|
+
[](gpt_params & params, const std::string & value) {
|
1386
|
+
params.sparams.grammar = value;
|
1387
|
+
}
|
1388
|
+
));
|
1389
|
+
add_opt(llama_arg(
|
1390
|
+
{"--grammar-file"}, "FNAME",
|
1391
|
+
"file to read grammar from",
|
1392
|
+
[](gpt_params & params, const std::string & value) {
|
1393
|
+
std::ifstream file(value);
|
1394
|
+
if (!file) {
|
1395
|
+
throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str()));
|
1396
|
+
}
|
1397
|
+
std::copy(
|
1398
|
+
std::istreambuf_iterator<char>(file),
|
1399
|
+
std::istreambuf_iterator<char>(),
|
1400
|
+
std::back_inserter(params.sparams.grammar)
|
1401
|
+
);
|
1402
|
+
}
|
1403
|
+
));
|
1404
|
+
add_opt(llama_arg(
|
1405
|
+
{"-j", "--json-schema"}, "SCHEMA",
|
1406
|
+
"JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object\nFor schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead",
|
1407
|
+
[](gpt_params & params, const std::string & value) {
|
1408
|
+
params.sparams.grammar = json_schema_to_grammar(json::parse(value));
|
1409
|
+
}
|
1410
|
+
));
|
1411
|
+
add_opt(llama_arg(
|
1412
|
+
{"--pooling"}, "{none,mean,cls,last}",
|
1413
|
+
"pooling type for embeddings, use model default if unspecified",
|
1414
|
+
[](gpt_params & params, const std::string & value) {
|
1415
|
+
/**/ if (value == "none") { params.pooling_type = LLAMA_POOLING_TYPE_NONE; }
|
1416
|
+
else if (value == "mean") { params.pooling_type = LLAMA_POOLING_TYPE_MEAN; }
|
1417
|
+
else if (value == "cls") { params.pooling_type = LLAMA_POOLING_TYPE_CLS; }
|
1418
|
+
else if (value == "last") { params.pooling_type = LLAMA_POOLING_TYPE_LAST; }
|
1419
|
+
else { throw std::invalid_argument("invalid value"); }
|
1420
|
+
}
|
1421
|
+
).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
|
1422
|
+
// Registers `--attention`: sets params.attention_type from the given value.
// Accepted values are "causal" and "non-causal"; any other value is rejected
// with std::invalid_argument. Registered for LLAMA_EXAMPLE_EMBEDDING only.
add_opt(llama_arg(
    // The value hint must list exactly the values the handler accepts.
    {"--attention"}, "{causal,non-causal}",
    "attention type for embeddings, use model default if unspecified",
    [](gpt_params & params, const std::string & value) {
        /**/ if (value == "causal") { params.attention_type = LLAMA_ATTENTION_TYPE_CAUSAL; }
        else if (value == "non-causal") { params.attention_type = LLAMA_ATTENTION_TYPE_NON_CAUSAL; }
        else { throw std::invalid_argument("invalid value"); }
    }
).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
|
1431
|
+
add_opt(llama_arg(
|
1432
|
+
{"--rope-scaling"}, "{none,linear,yarn}",
|
1433
|
+
"RoPE frequency scaling method, defaults to linear unless specified by the model",
|
1434
|
+
[](gpt_params & params, const std::string & value) {
|
1435
|
+
/**/ if (value == "none") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_NONE; }
|
1436
|
+
else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_LINEAR; }
|
1437
|
+
else if (value == "yarn") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_YARN; }
|
1438
|
+
else { throw std::invalid_argument("invalid value"); }
|
1439
|
+
}
|
1440
|
+
));
|
1441
|
+
add_opt(llama_arg(
|
1442
|
+
{"--rope-scale"}, "N",
|
1443
|
+
"RoPE context scaling factor, expands context by a factor of N",
|
1444
|
+
[](gpt_params & params, const std::string & value) {
|
1445
|
+
params.rope_freq_scale = 1.0f / std::stof(value);
|
1446
|
+
}
|
1447
|
+
));
|
1448
|
+
add_opt(llama_arg(
|
1449
|
+
{"--rope-freq-base"}, "N",
|
1450
|
+
"RoPE base frequency, used by NTK-aware scaling (default: loaded from model)",
|
1451
|
+
[](gpt_params & params, const std::string & value) {
|
1452
|
+
params.rope_freq_base = std::stof(value);
|
1453
|
+
}
|
1454
|
+
));
|
1455
|
+
add_opt(llama_arg(
|
1456
|
+
{"--rope-freq-scale"}, "N",
|
1457
|
+
"RoPE frequency scaling factor, expands context by a factor of 1/N",
|
1458
|
+
[](gpt_params & params, const std::string & value) {
|
1459
|
+
params.rope_freq_scale = std::stof(value);
|
1460
|
+
}
|
1461
|
+
));
|
1462
|
+
add_opt(llama_arg(
|
1463
|
+
{"--yarn-orig-ctx"}, "N",
|
1464
|
+
format("YaRN: original context size of model (default: %d = model training context size)", params.yarn_orig_ctx),
|
1465
|
+
[](gpt_params & params, int value) {
|
1466
|
+
params.yarn_orig_ctx = value;
|
1467
|
+
}
|
1468
|
+
));
|
1469
|
+
add_opt(llama_arg(
|
1470
|
+
{"--yarn-ext-factor"}, "N",
|
1471
|
+
format("YaRN: extrapolation mix factor (default: %.1f, 0.0 = full interpolation)", (double)params.yarn_ext_factor),
|
1472
|
+
[](gpt_params & params, const std::string & value) {
|
1473
|
+
params.yarn_ext_factor = std::stof(value);
|
1474
|
+
}
|
1475
|
+
));
|
1476
|
+
add_opt(llama_arg(
|
1477
|
+
{"--yarn-attn-factor"}, "N",
|
1478
|
+
format("YaRN: scale sqrt(t) or attention magnitude (default: %.1f)", (double)params.yarn_attn_factor),
|
1479
|
+
[](gpt_params & params, const std::string & value) {
|
1480
|
+
params.yarn_attn_factor = std::stof(value);
|
1481
|
+
}
|
1482
|
+
));
|
1483
|
+
add_opt(llama_arg(
|
1484
|
+
{"--yarn-beta-slow"}, "N",
|
1485
|
+
format("YaRN: high correction dim or alpha (default: %.1f)", (double)params.yarn_beta_slow),
|
1486
|
+
[](gpt_params & params, const std::string & value) {
|
1487
|
+
params.yarn_beta_slow = std::stof(value);
|
1488
|
+
}
|
1489
|
+
));
|
1490
|
+
add_opt(llama_arg(
|
1491
|
+
{"--yarn-beta-fast"}, "N",
|
1492
|
+
format("YaRN: low correction dim or beta (default: %.1f)", (double)params.yarn_beta_fast),
|
1493
|
+
[](gpt_params & params, const std::string & value) {
|
1494
|
+
params.yarn_beta_fast = std::stof(value);
|
1495
|
+
}
|
1496
|
+
));
|
1497
|
+
add_opt(llama_arg(
|
1498
|
+
{"-gan", "--grp-attn-n"}, "N",
|
1499
|
+
format("group-attention factor (default: %d)", params.grp_attn_n),
|
1500
|
+
[](gpt_params & params, int value) {
|
1501
|
+
params.grp_attn_n = value;
|
1502
|
+
}
|
1503
|
+
));
|
1504
|
+
add_opt(llama_arg(
|
1505
|
+
{"-gaw", "--grp-attn-w"}, "N",
|
1506
|
+
format("group-attention width (default: %.1f)", (double)params.grp_attn_w),
|
1507
|
+
[](gpt_params & params, int value) {
|
1508
|
+
params.grp_attn_w = value;
|
1509
|
+
}
|
1510
|
+
));
|
1511
|
+
add_opt(llama_arg(
|
1512
|
+
{"-dkvc", "--dump-kv-cache"},
|
1513
|
+
"verbose print of the KV cache",
|
1514
|
+
[](gpt_params & params) {
|
1515
|
+
params.dump_kv_cache = true;
|
1516
|
+
}
|
1517
|
+
));
|
1518
|
+
add_opt(llama_arg(
|
1519
|
+
{"-nkvo", "--no-kv-offload"},
|
1520
|
+
"disable KV offload",
|
1521
|
+
[](gpt_params & params) {
|
1522
|
+
params.no_kv_offload = true;
|
1523
|
+
}
|
1524
|
+
));
|
1525
|
+
add_opt(llama_arg(
|
1526
|
+
{"-ctk", "--cache-type-k"}, "TYPE",
|
1527
|
+
format("KV cache data type for K (default: %s)", params.cache_type_k.c_str()),
|
1528
|
+
[](gpt_params & params, const std::string & value) {
|
1529
|
+
// TODO: get the type right here
|
1530
|
+
params.cache_type_k = value;
|
1531
|
+
}
|
1532
|
+
));
|
1533
|
+
add_opt(llama_arg(
|
1534
|
+
{"-ctv", "--cache-type-v"}, "TYPE",
|
1535
|
+
format("KV cache data type for V (default: %s)", params.cache_type_v.c_str()),
|
1536
|
+
[](gpt_params & params, const std::string & value) {
|
1537
|
+
// TODO: get the type right here
|
1538
|
+
params.cache_type_v = value;
|
1539
|
+
}
|
1540
|
+
));
|
1541
|
+
add_opt(llama_arg(
|
1542
|
+
{"--perplexity", "--all-logits"},
|
1543
|
+
format("return logits for all tokens in the batch (default: %s)", params.logits_all ? "true" : "false"),
|
1544
|
+
[](gpt_params & params) {
|
1545
|
+
params.logits_all = true;
|
1546
|
+
}
|
1547
|
+
).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
|
1548
|
+
add_opt(llama_arg(
|
1549
|
+
{"--hellaswag"},
|
1550
|
+
"compute HellaSwag score over random tasks from datafile supplied with -f",
|
1551
|
+
[](gpt_params & params) {
|
1552
|
+
params.hellaswag = true;
|
1553
|
+
}
|
1554
|
+
).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
|
1555
|
+
add_opt(llama_arg(
|
1556
|
+
{"--hellaswag-tasks"}, "N",
|
1557
|
+
format("number of tasks to use when computing the HellaSwag score (default: %zu)", params.hellaswag_tasks),
|
1558
|
+
[](gpt_params & params, int value) {
|
1559
|
+
params.hellaswag_tasks = value;
|
1560
|
+
}
|
1561
|
+
).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
|
1562
|
+
add_opt(llama_arg(
|
1563
|
+
{"--winogrande"},
|
1564
|
+
"compute Winogrande score over random tasks from datafile supplied with -f",
|
1565
|
+
[](gpt_params & params) {
|
1566
|
+
params.winogrande = true;
|
1567
|
+
}
|
1568
|
+
).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
|
1569
|
+
add_opt(llama_arg(
|
1570
|
+
{"--winogrande-tasks"}, "N",
|
1571
|
+
format("number of tasks to use when computing the Winogrande score (default: %zu)", params.winogrande_tasks),
|
1572
|
+
[](gpt_params & params, int value) {
|
1573
|
+
params.winogrande_tasks = value;
|
1574
|
+
}
|
1575
|
+
).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
|
1576
|
+
add_opt(llama_arg(
|
1577
|
+
{"--multiple-choice"},
|
1578
|
+
"compute multiple choice score over random tasks from datafile supplied with -f",
|
1579
|
+
[](gpt_params & params) {
|
1580
|
+
params.multiple_choice = true;
|
1581
|
+
}
|
1582
|
+
).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
|
1583
|
+
add_opt(llama_arg(
|
1584
|
+
{"--multiple-choice-tasks"}, "N",
|
1585
|
+
format("number of tasks to use when computing the multiple choice score (default: %zu)", params.multiple_choice_tasks),
|
1586
|
+
[](gpt_params & params, int value) {
|
1587
|
+
params.multiple_choice_tasks = value;
|
1588
|
+
}
|
1589
|
+
).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
|
1590
|
+
add_opt(llama_arg(
|
1591
|
+
{"--kl-divergence"},
|
1592
|
+
"computes KL-divergence to logits provided via --kl-divergence-base",
|
1593
|
+
[](gpt_params & params) {
|
1594
|
+
params.kl_divergence = true;
|
1595
|
+
}
|
1596
|
+
).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
|
1597
|
+
add_opt(llama_arg(
|
1598
|
+
{"--save-all-logits", "--kl-divergence-base"}, "FNAME",
|
1599
|
+
"set logits file",
|
1600
|
+
[](gpt_params & params, const std::string & value) {
|
1601
|
+
params.logits_file = value;
|
1602
|
+
}
|
1603
|
+
).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
|
1604
|
+
add_opt(llama_arg(
|
1605
|
+
{"--ppl-stride"}, "N",
|
1606
|
+
format("stride for perplexity calculation (default: %d)", params.ppl_stride),
|
1607
|
+
[](gpt_params & params, int value) {
|
1608
|
+
params.ppl_stride = value;
|
1609
|
+
}
|
1610
|
+
).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
|
1611
|
+
add_opt(llama_arg(
|
1612
|
+
{"--ppl-output-type"}, "<0|1>",
|
1613
|
+
format("output type for perplexity calculation (default: %d)", params.ppl_output_type),
|
1614
|
+
[](gpt_params & params, int value) {
|
1615
|
+
params.ppl_output_type = value;
|
1616
|
+
}
|
1617
|
+
).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
|
1618
|
+
add_opt(llama_arg(
|
1619
|
+
{"-dt", "--defrag-thold"}, "N",
|
1620
|
+
format("KV cache defragmentation threshold (default: %.1f, < 0 - disabled)", (double)params.defrag_thold),
|
1621
|
+
[](gpt_params & params, const std::string & value) {
|
1622
|
+
params.defrag_thold = std::stof(value);
|
1623
|
+
}
|
1624
|
+
).set_env("LLAMA_ARG_DEFRAG_THOLD"));
|
1625
|
+
add_opt(llama_arg(
|
1626
|
+
{"-np", "--parallel"}, "N",
|
1627
|
+
format("number of parallel sequences to decode (default: %d)", params.n_parallel),
|
1628
|
+
[](gpt_params & params, int value) {
|
1629
|
+
params.n_parallel = value;
|
1630
|
+
}
|
1631
|
+
));
|
1632
|
+
add_opt(llama_arg(
|
1633
|
+
{"-ns", "--sequences"}, "N",
|
1634
|
+
format("number of sequences to decode (default: %d)", params.n_sequences),
|
1635
|
+
[](gpt_params & params, int value) {
|
1636
|
+
params.n_sequences = value;
|
1637
|
+
}
|
1638
|
+
));
|
1639
|
+
add_opt(llama_arg(
|
1640
|
+
{"-cb", "--cont-batching"},
|
1641
|
+
format("enable continuous batching (a.k.a dynamic batching) (default: %s)", params.cont_batching ? "enabled" : "disabled"),
|
1642
|
+
[](gpt_params & params) {
|
1643
|
+
params.cont_batching = true;
|
1644
|
+
}
|
1645
|
+
).set_env("LLAMA_ARG_CONT_BATCHING"));
|
1646
|
+
add_opt(llama_arg(
|
1647
|
+
{"-nocb", "--no-cont-batching"},
|
1648
|
+
"disable continuous batching",
|
1649
|
+
[](gpt_params & params) {
|
1650
|
+
params.cont_batching = false;
|
1651
|
+
}
|
1652
|
+
).set_env("LLAMA_ARG_NO_CONT_BATCHING"));
|
1653
|
+
add_opt(llama_arg(
|
1654
|
+
{"--mmproj"}, "FILE",
|
1655
|
+
"path to a multimodal projector file for LLaVA. see examples/llava/README.md",
|
1656
|
+
[](gpt_params & params, const std::string & value) {
|
1657
|
+
params.mmproj = value;
|
1658
|
+
}
|
1659
|
+
).set_examples({LLAMA_EXAMPLE_LLAVA}));
|
1660
|
+
add_opt(llama_arg(
|
1661
|
+
{"--image"}, "FILE",
|
1662
|
+
"path to an image file. use with multimodal models. Specify multiple times for batching",
|
1663
|
+
[](gpt_params & params, const std::string & value) {
|
1664
|
+
params.image.emplace_back(value);
|
1665
|
+
}
|
1666
|
+
).set_examples({LLAMA_EXAMPLE_LLAVA}));
|
1667
|
+
#ifdef LM_GGML_USE_RPC
|
1668
|
+
add_opt(llama_arg(
|
1669
|
+
{"--rpc"}, "SERVERS",
|
1670
|
+
"comma separated list of RPC servers",
|
1671
|
+
[](gpt_params & params, const std::string & value) {
|
1672
|
+
params.rpc_servers = value;
|
1673
|
+
}
|
1674
|
+
));
|
1675
|
+
#endif
|
1676
|
+
add_opt(llama_arg(
|
1677
|
+
{"--mlock"},
|
1678
|
+
"force system to keep model in RAM rather than swapping or compressing",
|
1679
|
+
[](gpt_params & params) {
|
1680
|
+
params.use_mlock = true;
|
1681
|
+
}
|
1682
|
+
));
|
1683
|
+
add_opt(llama_arg(
|
1684
|
+
{"--no-mmap"},
|
1685
|
+
"do not memory-map model (slower load but may reduce pageouts if not using mlock)",
|
1686
|
+
[](gpt_params & params) {
|
1687
|
+
params.use_mmap = false;
|
1688
|
+
}
|
1689
|
+
));
|
1690
|
+
add_opt(llama_arg(
|
1691
|
+
{"--numa"}, "TYPE",
|
1692
|
+
"attempt optimizations that help on some NUMA systems\n"
|
1693
|
+
"- distribute: spread execution evenly over all nodes\n"
|
1694
|
+
"- isolate: only spawn threads on CPUs on the node that execution started on\n"
|
1695
|
+
"- numactl: use the CPU map provided by numactl\n"
|
1696
|
+
"if run without this previously, it is recommended to drop the system page cache before using this\n"
|
1697
|
+
"see https://github.com/ggerganov/llama.cpp/issues/1437",
|
1698
|
+
[](gpt_params & params, const std::string & value) {
|
1699
|
+
/**/ if (value == "distribute" || value == "") { params.numa = LM_GGML_NUMA_STRATEGY_DISTRIBUTE; }
|
1700
|
+
else if (value == "isolate") { params.numa = LM_GGML_NUMA_STRATEGY_ISOLATE; }
|
1701
|
+
else if (value == "numactl") { params.numa = LM_GGML_NUMA_STRATEGY_NUMACTL; }
|
1702
|
+
else { throw std::invalid_argument("invalid value"); }
|
1703
|
+
}
|
1704
|
+
));
|
1705
|
+
add_opt(llama_arg(
|
1706
|
+
{"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N",
|
1707
|
+
"number of layers to store in VRAM",
|
1708
|
+
[](gpt_params & params, int value) {
|
1709
|
+
params.n_gpu_layers = value;
|
1710
|
+
if (!llama_supports_gpu_offload()) {
|
1711
|
+
fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers option will be ignored\n");
|
1712
|
+
fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
|
1713
|
+
}
|
1714
|
+
}
|
1715
|
+
).set_env("LLAMA_ARG_N_GPU_LAYERS"));
|
1716
|
+
add_opt(llama_arg(
|
1717
|
+
{"-ngld", "--gpu-layers-draft", "--n-gpu-layers-draft"}, "N",
|
1718
|
+
"number of layers to store in VRAM for the draft model",
|
1719
|
+
[](gpt_params & params, int value) {
|
1720
|
+
params.n_gpu_layers_draft = value;
|
1721
|
+
if (!llama_supports_gpu_offload()) {
|
1722
|
+
fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers-draft option will be ignored\n");
|
1723
|
+
fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
|
1724
|
+
}
|
1725
|
+
}
|
1726
|
+
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
1727
|
+
add_opt(llama_arg(
|
1728
|
+
{"-sm", "--split-mode"}, "{none,layer,row}",
|
1729
|
+
"how to split the model across multiple GPUs, one of:\n"
|
1730
|
+
"- none: use one GPU only\n"
|
1731
|
+
"- layer (default): split layers and KV across GPUs\n"
|
1732
|
+
"- row: split rows across GPUs",
|
1733
|
+
[](gpt_params & params, const std::string & value) {
|
1734
|
+
std::string arg_next = value;
|
1735
|
+
if (arg_next == "none") {
|
1736
|
+
params.split_mode = LLAMA_SPLIT_MODE_NONE;
|
1737
|
+
} else if (arg_next == "layer") {
|
1738
|
+
params.split_mode = LLAMA_SPLIT_MODE_LAYER;
|
1739
|
+
}
|
1740
|
+
else if (arg_next == "row") {
|
1741
|
+
#ifdef LM_GGML_USE_SYCL
|
1742
|
+
fprintf(stderr, "warning: The split mode value:[row] is not supported by llama.cpp with SYCL. It's developing.\nExit!\n");
|
1743
|
+
exit(1);
|
1744
|
+
#endif // LM_GGML_USE_SYCL
|
1745
|
+
params.split_mode = LLAMA_SPLIT_MODE_ROW;
|
1746
|
+
}
|
1747
|
+
else {
|
1748
|
+
throw std::invalid_argument("invalid value");
|
1749
|
+
}
|
1750
|
+
#ifndef LM_GGML_USE_CUDA_SYCL_VULKAN
|
1751
|
+
fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL/Vulkan. Setting the split mode has no effect.\n");
|
1752
|
+
#endif // LM_GGML_USE_CUDA_SYCL_VULKAN
|
1753
|
+
}
|
1754
|
+
));
|
1755
|
+
add_opt(llama_arg(
|
1756
|
+
{"-ts", "--tensor-split"}, "N0,N1,N2,...",
|
1757
|
+
"fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1",
|
1758
|
+
[](gpt_params & params, const std::string & value) {
|
1759
|
+
std::string arg_next = value;
|
1760
|
+
|
1761
|
+
// split string by , and /
|
1762
|
+
const std::regex regex{ R"([,/]+)" };
|
1763
|
+
std::sregex_token_iterator it{ arg_next.begin(), arg_next.end(), regex, -1 };
|
1764
|
+
std::vector<std::string> split_arg{ it, {} };
|
1765
|
+
if (split_arg.size() >= llama_max_devices()) {
|
1766
|
+
throw std::invalid_argument(
|
1767
|
+
format("got %d input configs, but system only has %d devices", (int)split_arg.size(), (int)llama_max_devices())
|
1768
|
+
);
|
1769
|
+
}
|
1770
|
+
for (size_t i = 0; i < llama_max_devices(); ++i) {
|
1771
|
+
if (i < split_arg.size()) {
|
1772
|
+
params.tensor_split[i] = std::stof(split_arg[i]);
|
1773
|
+
} else {
|
1774
|
+
params.tensor_split[i] = 0.0f;
|
1775
|
+
}
|
1776
|
+
}
|
1777
|
+
#ifndef LM_GGML_USE_CUDA_SYCL_VULKAN
|
1778
|
+
fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL/Vulkan. Setting a tensor split has no effect.\n");
|
1779
|
+
#endif // LM_GGML_USE_CUDA_SYCL_VULKAN
|
1805
1780
|
}
|
1781
|
+
));
|
1782
|
+
add_opt(llama_arg(
|
1783
|
+
{"-mg", "--main-gpu"}, "INDEX",
|
1784
|
+
format("the GPU to use for the model (with split-mode = none), or for intermediate results and KV (with split-mode = row) (default: %d)", params.main_gpu),
|
1785
|
+
[](gpt_params & params, int value) {
|
1786
|
+
params.main_gpu = value;
|
1787
|
+
#ifndef LM_GGML_USE_CUDA_SYCL_VULKAN
|
1788
|
+
fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL/Vulkan. Setting the main GPU has no effect.\n");
|
1789
|
+
#endif // LM_GGML_USE_CUDA_SYCL_VULKAN
|
1790
|
+
}
|
1791
|
+
));
|
1792
|
+
add_opt(llama_arg(
|
1793
|
+
{"--check-tensors"},
|
1794
|
+
format("check model tensor data for invalid values (default: %s)", params.check_tensors ? "true" : "false"),
|
1795
|
+
[](gpt_params & params) {
|
1796
|
+
params.check_tensors = true;
|
1797
|
+
}
|
1798
|
+
));
|
1799
|
+
add_opt(llama_arg(
|
1800
|
+
{"--override-kv"}, "KEY=TYPE:VALUE",
|
1801
|
+
"advanced option to override model metadata by key. may be specified multiple times.\n"
|
1802
|
+
"types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false",
|
1803
|
+
[](gpt_params & params, const std::string & value) {
|
1804
|
+
if (!string_parse_kv_override(value.c_str(), params.kv_overrides)) {
|
1805
|
+
throw std::runtime_error(format("error: Invalid type for KV override: %s\n", value.c_str()));
|
1806
|
+
}
|
1807
|
+
}
|
1808
|
+
));
|
1809
|
+
add_opt(llama_arg(
|
1810
|
+
{"--lora"}, "FNAME",
|
1811
|
+
"path to LoRA adapter (can be repeated to use multiple adapters)",
|
1812
|
+
[](gpt_params & params, const std::string & value) {
|
1813
|
+
params.lora_adapters.push_back({ std::string(value), 1.0 });
|
1814
|
+
}
|
1815
|
+
).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}));
|
1816
|
+
add_opt(llama_arg(
|
1817
|
+
{"--lora-scaled"}, "FNAME", "SCALE",
|
1818
|
+
"path to LoRA adapter with user defined scaling (can be repeated to use multiple adapters)",
|
1819
|
+
[](gpt_params & params, const std::string & fname, const std::string & scale) {
|
1820
|
+
params.lora_adapters.push_back({ fname, std::stof(scale) });
|
1821
|
+
}
|
1822
|
+
).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}));
|
1823
|
+
add_opt(llama_arg(
|
1824
|
+
{"--control-vector"}, "FNAME",
|
1825
|
+
"add a control vector\nnote: this argument can be repeated to add multiple control vectors",
|
1826
|
+
[](gpt_params & params, const std::string & value) {
|
1827
|
+
params.control_vectors.push_back({ 1.0f, value, });
|
1828
|
+
}
|
1829
|
+
));
|
1830
|
+
add_opt(llama_arg(
|
1831
|
+
{"--control-vector-scaled"}, "FNAME", "SCALE",
|
1832
|
+
"add a control vector with user defined scaling SCALE\n"
|
1833
|
+
"note: this argument can be repeated to add multiple scaled control vectors",
|
1834
|
+
[](gpt_params & params, const std::string & fname, const std::string & scale) {
|
1835
|
+
params.control_vectors.push_back({ std::stof(scale), fname });
|
1836
|
+
}
|
1837
|
+
));
|
1838
|
+
add_opt(llama_arg(
|
1839
|
+
{"--control-vector-layer-range"}, "START", "END",
|
1840
|
+
"layer range to apply the control vector(s) to, start and end inclusive",
|
1841
|
+
[](gpt_params & params, const std::string & start, const std::string & end) {
|
1842
|
+
params.control_vector_layer_start = std::stoi(start);
|
1843
|
+
params.control_vector_layer_end = std::stoi(end);
|
1844
|
+
}
|
1845
|
+
));
|
1846
|
+
add_opt(llama_arg(
|
1847
|
+
{"-a", "--alias"}, "STRING",
|
1848
|
+
"set alias for model name (to be used by REST API)",
|
1849
|
+
[](gpt_params & params, const std::string & value) {
|
1850
|
+
params.model_alias = value;
|
1851
|
+
}
|
1852
|
+
).set_examples({LLAMA_EXAMPLE_SERVER}));
|
1853
|
+
add_opt(llama_arg(
|
1854
|
+
{"-m", "--model"}, "FNAME",
|
1855
|
+
ex == LLAMA_EXAMPLE_EXPORT_LORA
|
1856
|
+
? std::string("model path from which to load base model")
|
1857
|
+
: format(
|
1858
|
+
"model path (default: `models/$filename` with filename from `--hf-file` "
|
1859
|
+
"or `--model-url` if set, otherwise %s)", DEFAULT_MODEL_PATH
|
1860
|
+
),
|
1861
|
+
[](gpt_params & params, const std::string & value) {
|
1862
|
+
params.model = value;
|
1863
|
+
}
|
1864
|
+
).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}).set_env("LLAMA_ARG_MODEL"));
|
1865
|
+
add_opt(llama_arg(
|
1866
|
+
{"-md", "--model-draft"}, "FNAME",
|
1867
|
+
"draft model for speculative decoding (default: unused)",
|
1868
|
+
[](gpt_params & params, const std::string & value) {
|
1869
|
+
params.model_draft = value;
|
1870
|
+
}
|
1871
|
+
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
1872
|
+
add_opt(llama_arg(
|
1873
|
+
{"-mu", "--model-url"}, "MODEL_URL",
|
1874
|
+
"model download url (default: unused)",
|
1875
|
+
[](gpt_params & params, const std::string & value) {
|
1876
|
+
params.model_url = value;
|
1877
|
+
}
|
1878
|
+
).set_env("LLAMA_ARG_MODEL_URL"));
|
1879
|
+
add_opt(llama_arg(
|
1880
|
+
{"-hfr", "--hf-repo"}, "REPO",
|
1881
|
+
"Hugging Face model repository (default: unused)",
|
1882
|
+
[](gpt_params & params, const std::string & value) {
|
1883
|
+
params.hf_repo = value;
|
1884
|
+
}
|
1885
|
+
).set_env("LLAMA_ARG_HF_REPO"));
|
1886
|
+
add_opt(llama_arg(
|
1887
|
+
{"-hff", "--hf-file"}, "FILE",
|
1888
|
+
"Hugging Face model file (default: unused)",
|
1889
|
+
[](gpt_params & params, const std::string & value) {
|
1890
|
+
params.hf_file = value;
|
1891
|
+
}
|
1892
|
+
).set_env("LLAMA_ARG_HF_FILE"));
|
1893
|
+
add_opt(llama_arg(
|
1894
|
+
{"-hft", "--hf-token"}, "TOKEN",
|
1895
|
+
"Hugging Face access token (default: value from HF_TOKEN environment variable)",
|
1896
|
+
[](gpt_params & params, const std::string & value) {
|
1897
|
+
params.hf_token = value;
|
1898
|
+
}
|
1899
|
+
).set_env("HF_TOKEN"));
|
1900
|
+
add_opt(llama_arg(
|
1901
|
+
{"--context-file"}, "FNAME",
|
1902
|
+
"file to load context from (repeat to specify multiple files)",
|
1903
|
+
[](gpt_params & params, const std::string & value) {
|
1904
|
+
std::ifstream file(value, std::ios::binary);
|
1905
|
+
if (!file) {
|
1906
|
+
throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str()));
|
1907
|
+
}
|
1908
|
+
params.context_files.push_back(value);
|
1909
|
+
}
|
1910
|
+
).set_examples({LLAMA_EXAMPLE_RETRIEVAL}));
|
1911
|
+
add_opt(llama_arg(
|
1912
|
+
{"--chunk-size"}, "N",
|
1913
|
+
format("minimum length of embedded text chunks (default: %d)", params.chunk_size),
|
1914
|
+
[](gpt_params & params, int value) {
|
1915
|
+
params.chunk_size = value;
|
1916
|
+
}
|
1917
|
+
).set_examples({LLAMA_EXAMPLE_RETRIEVAL}));
|
1918
|
+
add_opt(llama_arg(
|
1919
|
+
{"--chunk-separator"}, "STRING",
|
1920
|
+
format("separator between chunks (default: '%s')", params.chunk_separator.c_str()),
|
1921
|
+
[](gpt_params & params, const std::string & value) {
|
1922
|
+
params.chunk_separator = value;
|
1923
|
+
}
|
1924
|
+
).set_examples({LLAMA_EXAMPLE_RETRIEVAL}));
|
1925
|
+
add_opt(llama_arg(
|
1926
|
+
{"--junk"}, "N",
|
1927
|
+
format("number of times to repeat the junk text (default: %d)", params.n_junk),
|
1928
|
+
[](gpt_params & params, int value) {
|
1929
|
+
params.n_junk = value;
|
1930
|
+
}
|
1931
|
+
).set_examples({LLAMA_EXAMPLE_PASSKEY}));
|
1932
|
+
add_opt(llama_arg(
|
1933
|
+
{"--pos"}, "N",
|
1934
|
+
format("position of the passkey in the junk text (default: %d)", params.i_pos),
|
1935
|
+
[](gpt_params & params, int value) {
|
1936
|
+
params.i_pos = value;
|
1937
|
+
}
|
1938
|
+
).set_examples({LLAMA_EXAMPLE_PASSKEY}));
|
1939
|
+
add_opt(llama_arg(
|
1940
|
+
{"-o", "--output", "--output-file"}, "FNAME",
|
1941
|
+
format("output file (default: '%s')",
|
1942
|
+
ex == LLAMA_EXAMPLE_EXPORT_LORA
|
1943
|
+
? params.lora_outfile.c_str()
|
1944
|
+
: ex == LLAMA_EXAMPLE_CVECTOR_GENERATOR
|
1945
|
+
? params.cvector_outfile.c_str()
|
1946
|
+
: params.out_file.c_str()),
|
1947
|
+
[](gpt_params & params, const std::string & value) {
|
1948
|
+
params.out_file = value;
|
1949
|
+
params.cvector_outfile = value;
|
1950
|
+
params.lora_outfile = value;
|
1951
|
+
}
|
1952
|
+
).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA}));
|
1953
|
+
add_opt(llama_arg(
|
1954
|
+
{"-ofreq", "--output-frequency"}, "N",
|
1955
|
+
format("output the imatrix every N iterations (default: %d)", params.n_out_freq),
|
1956
|
+
[](gpt_params & params, int value) {
|
1957
|
+
params.n_out_freq = value;
|
1958
|
+
}
|
1959
|
+
).set_examples({LLAMA_EXAMPLE_IMATRIX}));
|
1960
|
+
add_opt(llama_arg(
|
1961
|
+
{"--save-frequency"}, "N",
|
1962
|
+
format("save an imatrix copy every N iterations (default: %d)", params.n_save_freq),
|
1963
|
+
[](gpt_params & params, int value) {
|
1964
|
+
params.n_save_freq = value;
|
1965
|
+
}
|
1966
|
+
).set_examples({LLAMA_EXAMPLE_IMATRIX}));
|
1967
|
+
add_opt(llama_arg(
|
1968
|
+
{"--process-output"},
|
1969
|
+
format("collect data for the output tensor (default: %s)", params.process_output ? "true" : "false"),
|
1970
|
+
[](gpt_params & params) {
|
1971
|
+
params.process_output = true;
|
1972
|
+
}
|
1973
|
+
).set_examples({LLAMA_EXAMPLE_IMATRIX}));
|
1974
|
+
add_opt(llama_arg(
|
1975
|
+
{"--no-ppl"},
|
1976
|
+
format("do not compute perplexity (default: %s)", params.compute_ppl ? "true" : "false"),
|
1977
|
+
[](gpt_params & params) {
|
1978
|
+
params.compute_ppl = false;
|
1979
|
+
}
|
1980
|
+
).set_examples({LLAMA_EXAMPLE_IMATRIX}));
|
1981
|
+
add_opt(llama_arg(
|
1982
|
+
{"--chunk", "--from-chunk"}, "N",
|
1983
|
+
format("start processing the input from chunk N (default: %d)", params.i_chunk),
|
1984
|
+
[](gpt_params & params, int value) {
|
1985
|
+
params.i_chunk = value;
|
1986
|
+
}
|
1987
|
+
).set_examples({LLAMA_EXAMPLE_IMATRIX}));
|
1988
|
+
add_opt(llama_arg(
|
1989
|
+
{"-pps"},
|
1990
|
+
format("is the prompt shared across parallel sequences (default: %s)", params.is_pp_shared ? "true" : "false"),
|
1991
|
+
[](gpt_params & params) {
|
1992
|
+
params.is_pp_shared = true;
|
1993
|
+
}
|
1994
|
+
).set_examples({LLAMA_EXAMPLE_BENCH}));
|
1995
|
+
add_opt(llama_arg(
|
1996
|
+
{"-npp"}, "n0,n1,...",
|
1997
|
+
"number of prompt tokens",
|
1998
|
+
[](gpt_params & params, const std::string & value) {
|
1999
|
+
auto p = string_split<int>(value, ',');
|
2000
|
+
params.n_pp.insert(params.n_pp.end(), p.begin(), p.end());
|
2001
|
+
}
|
2002
|
+
).set_examples({LLAMA_EXAMPLE_BENCH}));
|
2003
|
+
add_opt(llama_arg(
|
2004
|
+
{"-ntg"}, "n0,n1,...",
|
2005
|
+
"number of text generation tokens",
|
2006
|
+
[](gpt_params & params, const std::string & value) {
|
2007
|
+
auto p = string_split<int>(value, ',');
|
2008
|
+
params.n_tg.insert(params.n_tg.end(), p.begin(), p.end());
|
2009
|
+
}
|
2010
|
+
).set_examples({LLAMA_EXAMPLE_BENCH}));
|
2011
|
+
add_opt(llama_arg(
|
2012
|
+
{"-npl"}, "n0,n1,...",
|
2013
|
+
"number of parallel prompts",
|
2014
|
+
[](gpt_params & params, const std::string & value) {
|
2015
|
+
auto p = string_split<int>(value, ',');
|
2016
|
+
params.n_pl.insert(params.n_pl.end(), p.begin(), p.end());
|
2017
|
+
}
|
2018
|
+
).set_examples({LLAMA_EXAMPLE_BENCH}));
|
2019
|
+
add_opt(llama_arg(
|
2020
|
+
{"--embd-normalize"}, "N",
|
2021
|
+
format("normalisation for embendings (default: %d) (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)", params.embd_normalize),
|
2022
|
+
[](gpt_params & params, int value) {
|
2023
|
+
params.embd_normalize = value;
|
2024
|
+
}
|
2025
|
+
).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
|
2026
|
+
add_opt(llama_arg(
|
2027
|
+
{"--embd-output-format"}, "FORMAT",
|
2028
|
+
"empty = default, \"array\" = [[],[]...], \"json\" = openai style, \"json+\" = same \"json\" + cosine similarity matrix",
|
2029
|
+
[](gpt_params & params, const std::string & value) {
|
2030
|
+
params.embd_out = value;
|
2031
|
+
}
|
2032
|
+
).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
|
2033
|
+
add_opt(llama_arg(
|
2034
|
+
{"--embd-separator"}, "STRING",
|
2035
|
+
"separator of embendings (default \\n) for example \"<#sep#>\"",
|
2036
|
+
[](gpt_params & params, const std::string & value) {
|
2037
|
+
params.embd_sep = value;
|
2038
|
+
}
|
2039
|
+
).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
|
2040
|
+
add_opt(llama_arg(
|
2041
|
+
{"--host"}, "HOST",
|
2042
|
+
format("ip address to listen (default: %s)", params.hostname.c_str()),
|
2043
|
+
[](gpt_params & params, const std::string & value) {
|
2044
|
+
params.hostname = value;
|
2045
|
+
}
|
2046
|
+
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_HOST"));
|
2047
|
+
add_opt(llama_arg(
|
2048
|
+
{"--port"}, "PORT",
|
2049
|
+
format("port to listen (default: %d)", params.port),
|
2050
|
+
[](gpt_params & params, int value) {
|
2051
|
+
params.port = value;
|
2052
|
+
}
|
2053
|
+
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_PORT"));
|
2054
|
+
add_opt(llama_arg(
|
2055
|
+
{"--path"}, "PATH",
|
2056
|
+
format("path to serve static files from (default: %s)", params.public_path.c_str()),
|
2057
|
+
[](gpt_params & params, const std::string & value) {
|
2058
|
+
params.public_path = value;
|
2059
|
+
}
|
2060
|
+
).set_examples({LLAMA_EXAMPLE_SERVER}));
|
2061
|
+
add_opt(llama_arg(
|
2062
|
+
{"--embedding", "--embeddings"},
|
2063
|
+
format("restrict to only support embedding use case; use only with dedicated embedding models (default: %s)", params.embedding ? "enabled" : "disabled"),
|
2064
|
+
[](gpt_params & params) {
|
2065
|
+
params.embedding = true;
|
2066
|
+
}
|
2067
|
+
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_EMBEDDINGS"));
|
2068
|
+
add_opt(llama_arg(
|
2069
|
+
{"--api-key"}, "KEY",
|
2070
|
+
"API key to use for authentication (default: none)",
|
2071
|
+
[](gpt_params & params, const std::string & value) {
|
2072
|
+
params.api_keys.push_back(value);
|
2073
|
+
}
|
2074
|
+
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_API_KEY"));
|
2075
|
+
add_opt(llama_arg(
|
2076
|
+
{"--api-key-file"}, "FNAME",
|
2077
|
+
"path to file containing API keys (default: none)",
|
2078
|
+
[](gpt_params & params, const std::string & value) {
|
2079
|
+
std::ifstream key_file(value);
|
2080
|
+
if (!key_file) {
|
2081
|
+
throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str()));
|
2082
|
+
}
|
2083
|
+
std::string key;
|
2084
|
+
while (std::getline(key_file, key)) {
|
2085
|
+
if (!key.empty()) {
|
2086
|
+
params.api_keys.push_back(key);
|
2087
|
+
}
|
2088
|
+
}
|
2089
|
+
key_file.close();
|
2090
|
+
}
|
2091
|
+
).set_examples({LLAMA_EXAMPLE_SERVER}));
|
2092
|
+
add_opt(llama_arg(
|
2093
|
+
{"--ssl-key-file"}, "FNAME",
|
2094
|
+
"path to file a PEM-encoded SSL private key",
|
2095
|
+
[](gpt_params & params, const std::string & value) {
|
2096
|
+
params.ssl_file_key = value;
|
2097
|
+
}
|
2098
|
+
).set_examples({LLAMA_EXAMPLE_SERVER}));
|
2099
|
+
add_opt(llama_arg(
|
2100
|
+
{"--ssl-cert-file"}, "FNAME",
|
2101
|
+
"path to file a PEM-encoded SSL certificate",
|
2102
|
+
[](gpt_params & params, const std::string & value) {
|
2103
|
+
params.ssl_file_cert = value;
|
2104
|
+
}
|
2105
|
+
).set_examples({LLAMA_EXAMPLE_SERVER}));
|
2106
|
+
add_opt(llama_arg(
|
2107
|
+
{"-to", "--timeout"}, "N",
|
2108
|
+
format("server read/write timeout in seconds (default: %d)", params.timeout_read),
|
2109
|
+
[](gpt_params & params, int value) {
|
2110
|
+
params.timeout_read = value;
|
2111
|
+
params.timeout_write = value;
|
2112
|
+
}
|
2113
|
+
).set_examples({LLAMA_EXAMPLE_SERVER}));
|
2114
|
+
add_opt(llama_arg(
|
2115
|
+
{"--threads-http"}, "N",
|
2116
|
+
format("number of threads used to process HTTP requests (default: %d)", params.n_threads_http),
|
2117
|
+
[](gpt_params & params, int value) {
|
2118
|
+
params.n_threads_http = value;
|
2119
|
+
}
|
2120
|
+
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_THREADS_HTTP"));
|
2121
|
+
add_opt(llama_arg(
|
2122
|
+
{"-spf", "--system-prompt-file"}, "FNAME",
|
2123
|
+
"set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications",
|
2124
|
+
[](gpt_params & params, const std::string & value) {
|
2125
|
+
std::ifstream file(value);
|
2126
|
+
if (!file) {
|
2127
|
+
throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str()));
|
2128
|
+
}
|
2129
|
+
std::string system_prompt;
|
2130
|
+
std::copy(
|
2131
|
+
std::istreambuf_iterator<char>(file),
|
2132
|
+
std::istreambuf_iterator<char>(),
|
2133
|
+
std::back_inserter(system_prompt)
|
2134
|
+
);
|
2135
|
+
params.system_prompt = system_prompt;
|
2136
|
+
}
|
2137
|
+
).set_examples({LLAMA_EXAMPLE_SERVER}));
|
2138
|
+
add_opt(llama_arg(
|
2139
|
+
{"--log-format"}, "{text, json}",
|
2140
|
+
"log output format: json or text (default: json)",
|
2141
|
+
[](gpt_params & params, const std::string & value) {
|
2142
|
+
if (value == "json") {
|
2143
|
+
params.log_json = true;
|
2144
|
+
} else if (value == "text") {
|
2145
|
+
params.log_json = false;
|
2146
|
+
} else {
|
2147
|
+
throw std::invalid_argument("invalid value");
|
2148
|
+
}
|
2149
|
+
}
|
2150
|
+
).set_examples({LLAMA_EXAMPLE_SERVER}));
|
2151
|
+
add_opt(llama_arg(
|
2152
|
+
{"--metrics"},
|
2153
|
+
format("enable prometheus compatible metrics endpoint (default: %s)", params.endpoint_metrics ? "enabled" : "disabled"),
|
2154
|
+
[](gpt_params & params) {
|
2155
|
+
params.endpoint_metrics = true;
|
2156
|
+
}
|
2157
|
+
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_METRICS"));
|
2158
|
+
add_opt(llama_arg(
|
2159
|
+
{"--no-slots"},
|
2160
|
+
format("disables slots monitoring endpoint (default: %s)", params.endpoint_slots ? "enabled" : "disabled"),
|
2161
|
+
[](gpt_params & params) {
|
2162
|
+
params.endpoint_slots = false;
|
2163
|
+
}
|
2164
|
+
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_ENDPOINT_SLOTS"));
|
2165
|
+
add_opt(llama_arg(
|
2166
|
+
{"--slot-save-path"}, "PATH",
|
2167
|
+
"path to save slot kv cache (default: disabled)",
|
2168
|
+
[](gpt_params & params, const std::string & value) {
|
2169
|
+
params.slot_save_path = value;
|
2170
|
+
// if doesn't end with DIRECTORY_SEPARATOR, add it
|
2171
|
+
if (!params.slot_save_path.empty() && params.slot_save_path[params.slot_save_path.size() - 1] != DIRECTORY_SEPARATOR) {
|
2172
|
+
params.slot_save_path += DIRECTORY_SEPARATOR;
|
2173
|
+
}
|
2174
|
+
}
|
2175
|
+
).set_examples({LLAMA_EXAMPLE_SERVER}));
|
2176
|
+
add_opt(llama_arg(
|
2177
|
+
{"--chat-template"}, "JINJA_TEMPLATE",
|
2178
|
+
"set custom jinja chat template (default: template taken from model's metadata)\n"
|
2179
|
+
"if suffix/prefix are specified, template will be disabled\n"
|
2180
|
+
"only commonly used templates are accepted:\nhttps://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template",
|
2181
|
+
[](gpt_params & params, const std::string & value) {
|
2182
|
+
if (!llama_chat_verify_template(value)) {
|
2183
|
+
throw std::runtime_error(format(
|
2184
|
+
"error: the supplied chat template is not supported: %s\n"
|
2185
|
+
"note: llama.cpp does not use jinja parser, we only support commonly used templates\n",
|
2186
|
+
value.c_str()
|
2187
|
+
));
|
2188
|
+
}
|
2189
|
+
params.chat_template = value;
|
2190
|
+
}
|
2191
|
+
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CHAT_TEMPLATE"));
|
2192
|
+
add_opt(llama_arg(
|
2193
|
+
{"-sps", "--slot-prompt-similarity"}, "SIMILARITY",
|
2194
|
+
format("how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity),
|
2195
|
+
[](gpt_params & params, const std::string & value) {
|
2196
|
+
params.slot_prompt_similarity = std::stof(value);
|
2197
|
+
}
|
2198
|
+
).set_examples({LLAMA_EXAMPLE_SERVER}));
|
2199
|
+
add_opt(llama_arg(
|
2200
|
+
{"--lora-init-without-apply"},
|
2201
|
+
format("load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: %s)", params.lora_init_without_apply ? "enabled" : "disabled"),
|
2202
|
+
[](gpt_params & params) {
|
2203
|
+
params.lora_init_without_apply = true;
|
2204
|
+
}
|
2205
|
+
).set_examples({LLAMA_EXAMPLE_SERVER}));
|
2206
|
+
add_opt(llama_arg(
|
2207
|
+
{"--simple-io"},
|
2208
|
+
"use basic IO for better compatibility in subprocesses and limited consoles",
|
2209
|
+
[](gpt_params & params) {
|
2210
|
+
params.simple_io = true;
|
2211
|
+
}
|
2212
|
+
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL}));
|
2213
|
+
add_opt(llama_arg(
|
2214
|
+
{"-ld", "--logdir"}, "LOGDIR",
|
2215
|
+
"path under which to save YAML logs (no logging if unset)",
|
2216
|
+
[](gpt_params & params, const std::string & value) {
|
2217
|
+
params.logdir = value;
|
2218
|
+
|
2219
|
+
if (params.logdir.back() != DIRECTORY_SEPARATOR) {
|
2220
|
+
params.logdir += DIRECTORY_SEPARATOR;
|
2221
|
+
}
|
2222
|
+
}
|
2223
|
+
));
|
2224
|
+
add_opt(llama_arg(
|
2225
|
+
{"--positive-file"}, "FNAME",
|
2226
|
+
format("positive prompts file, one prompt per line (default: '%s')", params.cvector_positive_file.c_str()),
|
2227
|
+
[](gpt_params & params, const std::string & value) {
|
2228
|
+
params.cvector_positive_file = value;
|
2229
|
+
}
|
2230
|
+
).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR}));
|
2231
|
+
add_opt(llama_arg(
|
2232
|
+
{"--negative-file"}, "FNAME",
|
2233
|
+
format("negative prompts file, one prompt per line (default: '%s')", params.cvector_negative_file.c_str()),
|
2234
|
+
[](gpt_params & params, const std::string & value) {
|
2235
|
+
params.cvector_negative_file = value;
|
2236
|
+
}
|
2237
|
+
).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR}));
|
2238
|
+
add_opt(llama_arg(
|
2239
|
+
{"--pca-batch"}, "N",
|
2240
|
+
format("batch size used for PCA. Larger batch runs faster, but uses more memory (default: %d)", params.n_pca_batch),
|
2241
|
+
[](gpt_params & params, int value) {
|
2242
|
+
params.n_pca_batch = value;
|
2243
|
+
}
|
2244
|
+
).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR}));
|
2245
|
+
add_opt(llama_arg(
|
2246
|
+
{"--pca-iter"}, "N",
|
2247
|
+
format("number of iterations used for PCA (default: %d)", params.n_pca_iterations),
|
2248
|
+
[](gpt_params & params, int value) {
|
2249
|
+
params.n_pca_iterations = value;
|
2250
|
+
}
|
2251
|
+
).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR}));
|
2252
|
+
add_opt(llama_arg(
|
2253
|
+
{"--method"}, "{pca, mean}",
|
2254
|
+
"dimensionality reduction method to be used (default: pca)",
|
2255
|
+
[](gpt_params & params, const std::string & value) {
|
2256
|
+
/**/ if (value == "pca") { params.cvector_dimre_method = DIMRE_METHOD_PCA; }
|
2257
|
+
else if (value == "mean") { params.cvector_dimre_method = DIMRE_METHOD_MEAN; }
|
2258
|
+
else { throw std::invalid_argument("invalid value"); }
|
2259
|
+
}
|
2260
|
+
).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR}));
|
2261
|
+
add_opt(llama_arg(
|
2262
|
+
{"--output-format"}, "{md,jsonl}",
|
2263
|
+
"output format for batched-bench results (default: md)",
|
2264
|
+
[](gpt_params & params, const std::string & value) {
|
2265
|
+
/**/ if (value == "jsonl") { params.batched_bench_output_jsonl = true; }
|
2266
|
+
else if (value == "md") { params.batched_bench_output_jsonl = false; }
|
2267
|
+
else { throw std::invalid_argument("invalid value"); } // must throw: previously the exception object was constructed and discarded, so bad values were silently accepted
|
2268
|
+
}
|
2269
|
+
).set_examples({LLAMA_EXAMPLE_BENCH}));
|
2270
|
+
#ifndef LOG_DISABLE_LOGS
|
2271
|
+
// TODO: make this looks less weird
|
2272
|
+
add_opt(llama_arg(
|
2273
|
+
{"--log-test"},
|
2274
|
+
"Log test",
|
2275
|
+
[](gpt_params &) { log_param_single_parse("--log-test"); }
|
2276
|
+
));
|
2277
|
+
add_opt(llama_arg(
|
2278
|
+
{"--log-disable"},
|
2279
|
+
"Log disable",
|
2280
|
+
[](gpt_params &) { log_param_single_parse("--log-disable"); }
|
2281
|
+
));
|
2282
|
+
add_opt(llama_arg(
|
2283
|
+
{"--log-enable"},
|
2284
|
+
"Log enable",
|
2285
|
+
[](gpt_params &) { log_param_single_parse("--log-enable"); }
|
2286
|
+
));
|
2287
|
+
add_opt(llama_arg(
|
2288
|
+
{"--log-new"},
|
2289
|
+
"Log new",
|
2290
|
+
[](gpt_params &) { log_param_single_parse("--log-new"); }
|
2291
|
+
));
|
2292
|
+
add_opt(llama_arg(
|
2293
|
+
{"--log-append"},
|
2294
|
+
"Log append",
|
2295
|
+
[](gpt_params &) { log_param_single_parse("--log-append"); }
|
2296
|
+
));
|
2297
|
+
add_opt(llama_arg(
|
2298
|
+
{"--log-file"}, "FNAME",
|
2299
|
+
"Log file",
|
2300
|
+
[](gpt_params &, const std::string & value) { log_param_pair_parse(false, "--log-file", value); }
|
2301
|
+
));
|
2302
|
+
#endif // LOG_DISABLE_LOGS
|
1806
2303
|
|
1807
|
-
|
1808
|
-
}
|
1809
|
-
printf("\n");
|
2304
|
+
return options;
|
1810
2305
|
}
|
1811
2306
|
|
1812
2307
|
std::string gpt_params_get_system_info(const gpt_params & params) {
|
1813
2308
|
std::ostringstream os;
|
1814
2309
|
|
1815
|
-
os << "system_info: n_threads = " << params.n_threads;
|
1816
|
-
if (params.
|
1817
|
-
os << " (n_threads_batch = " << params.
|
2310
|
+
os << "system_info: n_threads = " << params.cpuparams.n_threads;
|
2311
|
+
if (params.cpuparams_batch.n_threads != -1) {
|
2312
|
+
os << " (n_threads_batch = " << params.cpuparams_batch.n_threads << ")";
|
1818
2313
|
}
|
1819
2314
|
#if defined(_WIN32) && (_WIN32_WINNT >= 0x0601) && !defined(__MINGW64__) // windows 7 and later
|
1820
2315
|
// TODO: windows + arm64 + mingw64
|
@@ -2232,8 +2727,9 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
|
|
2232
2727
|
llama_lora_adapters_apply(lctx, iparams.lora_adapters);
|
2233
2728
|
}
|
2234
2729
|
|
2235
|
-
if (params.ignore_eos) {
|
2236
|
-
|
2730
|
+
if (params.sparams.ignore_eos && llama_token_eos(model) == -1) {
|
2731
|
+
fprintf(stderr, "%s: warning: model does not have an EOS token, ignoring --ignore-eos\n", __func__);
|
2732
|
+
params.sparams.ignore_eos = false;
|
2237
2733
|
}
|
2238
2734
|
|
2239
2735
|
if (params.warmup) {
|
@@ -2243,10 +2739,15 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
|
|
2243
2739
|
llama_token bos = llama_token_bos(model);
|
2244
2740
|
llama_token eos = llama_token_eos(model);
|
2245
2741
|
// some models (e.g. T5) don't have a BOS token
|
2246
|
-
if (bos !=
|
2742
|
+
if (bos != LLAMA_TOKEN_NULL) {
|
2247
2743
|
tmp.push_back(bos);
|
2248
2744
|
}
|
2249
|
-
|
2745
|
+
if (eos != LLAMA_TOKEN_NULL) {
|
2746
|
+
tmp.push_back(eos);
|
2747
|
+
}
|
2748
|
+
if (tmp.empty()) {
|
2749
|
+
tmp.push_back(0);
|
2750
|
+
}
|
2250
2751
|
|
2251
2752
|
if (llama_model_has_encoder(model)) {
|
2252
2753
|
llama_encode(lctx, llama_batch_get_one(tmp.data(), tmp.size(), 0, 0));
|
@@ -2262,7 +2763,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
|
|
2262
2763
|
}
|
2263
2764
|
llama_kv_cache_clear(lctx);
|
2264
2765
|
llama_synchronize(lctx);
|
2265
|
-
|
2766
|
+
llama_perf_reset(lctx, LLAMA_PERF_TYPE_CONTEXT);
|
2266
2767
|
}
|
2267
2768
|
|
2268
2769
|
iparams.model = model;
|
@@ -2339,9 +2840,9 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
|
|
2339
2840
|
cparams.n_seq_max = params.n_parallel;
|
2340
2841
|
cparams.n_batch = params.n_batch;
|
2341
2842
|
cparams.n_ubatch = params.n_ubatch;
|
2342
|
-
cparams.n_threads = params.n_threads;
|
2343
|
-
cparams.n_threads_batch = params.
|
2344
|
-
|
2843
|
+
cparams.n_threads = params.cpuparams.n_threads;
|
2844
|
+
cparams.n_threads_batch = params.cpuparams_batch.n_threads == -1 ?
|
2845
|
+
params.cpuparams.n_threads : params.cpuparams_batch.n_threads;
|
2345
2846
|
cparams.logits_all = params.logits_all;
|
2346
2847
|
cparams.embeddings = params.embedding;
|
2347
2848
|
cparams.rope_scaling_type = params.rope_scaling_type;
|
@@ -2366,6 +2867,22 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
|
|
2366
2867
|
return cparams;
|
2367
2868
|
}
|
2368
2869
|
|
2870
|
+
struct lm_ggml_threadpool_params lm_ggml_threadpool_params_from_cpu_params(const cpu_params & params) {
|
2871
|
+
struct lm_ggml_threadpool_params tpp;
|
2872
|
+
|
2873
|
+
lm_ggml_threadpool_params_init(&tpp, params.n_threads); // setup the defaults
|
2874
|
+
|
2875
|
+
if (params.mask_valid) {
|
2876
|
+
std::memcpy(&tpp.cpumask, &params.cpumask, LM_GGML_MAX_N_THREADS);
|
2877
|
+
}
|
2878
|
+
|
2879
|
+
tpp.prio = params.priority;
|
2880
|
+
tpp.poll = params.poll;
|
2881
|
+
tpp.strict_cpu = params.strict_cpu;
|
2882
|
+
|
2883
|
+
return tpp;
|
2884
|
+
}
|
2885
|
+
|
2369
2886
|
#ifdef LLAMA_USE_CURL
|
2370
2887
|
|
2371
2888
|
static bool starts_with(const std::string & str, const std::string & prefix) {
|
@@ -3211,7 +3728,7 @@ void yaml_dump_string_multiline(FILE * stream, const char * prop_name, const cha
|
|
3211
3728
|
|
3212
3729
|
void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const llama_context * lctx,
|
3213
3730
|
const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc) {
|
3214
|
-
const
|
3731
|
+
const auto & sparams = params.sparams;
|
3215
3732
|
|
3216
3733
|
fprintf(stream, "build_commit: %s\n", LLAMA_COMMIT);
|
3217
3734
|
fprintf(stream, "build_number: %d\n", LLAMA_BUILD_NUMBER);
|
@@ -3262,8 +3779,6 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l
|
|
3262
3779
|
|
3263
3780
|
fprintf(stream, "alias: %s # default: unknown\n", params.model_alias.c_str());
|
3264
3781
|
fprintf(stream, "batch_size: %d # default: 512\n", params.n_batch);
|
3265
|
-
yaml_dump_string_multiline(stream, "cfg_negative_prompt", sparams.cfg_negative_prompt.c_str());
|
3266
|
-
fprintf(stream, "cfg_scale: %f # default: 1.0\n", sparams.cfg_scale);
|
3267
3782
|
fprintf(stream, "chunks: %d # default: -1 (unlimited)\n", params.n_chunks);
|
3268
3783
|
fprintf(stream, "color: %s # default: false\n", params.use_color ? "true" : "false");
|
3269
3784
|
fprintf(stream, "ctx_size: %d # default: 512\n", params.n_ctx);
|
@@ -3274,10 +3789,7 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l
|
|
3274
3789
|
fprintf(stream, "grammar-file: # never logged, see grammar instead. Can still be specified for input.\n");
|
3275
3790
|
fprintf(stream, "hellaswag: %s # default: false\n", params.hellaswag ? "true" : "false");
|
3276
3791
|
fprintf(stream, "hellaswag_tasks: %zu # default: 400\n", params.hellaswag_tasks);
|
3277
|
-
|
3278
|
-
const auto logit_bias_eos = sparams.logit_bias.find(llama_token_eos(llama_get_model(lctx)));
|
3279
|
-
const bool ignore_eos = logit_bias_eos != sparams.logit_bias.end() && logit_bias_eos->second == -INFINITY;
|
3280
|
-
fprintf(stream, "ignore_eos: %s # default: false\n", ignore_eos ? "true" : "false");
|
3792
|
+
fprintf(stream, "ignore_eos: %s # default: false\n", sparams.ignore_eos ? "true" : "false");
|
3281
3793
|
|
3282
3794
|
yaml_dump_string_multiline(stream, "in_prefix", params.input_prefix.c_str());
|
3283
3795
|
fprintf(stream, "in_prefix_bos: %s # default: false\n", params.input_prefix_bos ? "true" : "false");
|
@@ -3288,11 +3800,8 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l
|
|
3288
3800
|
fprintf(stream, "logdir: %s # default: unset (no logging)\n", params.logdir.c_str());
|
3289
3801
|
|
3290
3802
|
fprintf(stream, "logit_bias:\n");
|
3291
|
-
for (
|
3292
|
-
|
3293
|
-
continue;
|
3294
|
-
}
|
3295
|
-
fprintf(stream, " %d: %f", lb.first, lb.second);
|
3803
|
+
for (const auto & logit_bias : sparams.logit_bias) {
|
3804
|
+
fprintf(stream, " %d: %f", logit_bias.token, logit_bias.bias);
|
3296
3805
|
}
|
3297
3806
|
|
3298
3807
|
fprintf(stream, "lora:\n");
|
@@ -3345,7 +3854,6 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l
|
|
3345
3854
|
|
3346
3855
|
fprintf(stream, "rope_freq_base: %f # default: 10000.0\n", params.rope_freq_base);
|
3347
3856
|
fprintf(stream, "rope_freq_scale: %f # default: 1.0\n", params.rope_freq_scale);
|
3348
|
-
fprintf(stream, "seed: %u # default: -1 (random seed)\n", params.seed);
|
3349
3857
|
fprintf(stream, "simple_io: %s # default: false\n", params.simple_io ? "true" : "false");
|
3350
3858
|
fprintf(stream, "cont_batching: %s # default: false\n", params.cont_batching ? "true" : "false");
|
3351
3859
|
fprintf(stream, "flash_attn: %s # default: false\n", params.flash_attn ? "true" : "false");
|
@@ -3355,11 +3863,11 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l
|
|
3355
3863
|
yaml_dump_vector_float(stream, "tensor_split", tensor_split_vector);
|
3356
3864
|
|
3357
3865
|
fprintf(stream, "tfs: %f # default: 1.0\n", sparams.tfs_z);
|
3358
|
-
fprintf(stream, "threads: %d # default: %u\n", params.n_threads, std::thread::hardware_concurrency());
|
3866
|
+
fprintf(stream, "threads: %d # default: %u\n", params.cpuparams.n_threads, std::thread::hardware_concurrency());
|
3359
3867
|
fprintf(stream, "top_k: %d # default: 40\n", sparams.top_k);
|
3360
3868
|
fprintf(stream, "top_p: %f # default: 0.95\n", sparams.top_p);
|
3361
3869
|
fprintf(stream, "min_p: %f # default: 0.0\n", sparams.min_p);
|
3362
|
-
fprintf(stream, "
|
3870
|
+
fprintf(stream, "typ_p: %f # default: 1.0\n", sparams.typ_p);
|
3363
3871
|
fprintf(stream, "verbose_prompt: %s # default: false\n", params.verbose_prompt ? "true" : "false");
|
3364
3872
|
fprintf(stream, "display_prompt: %s # default: true\n", params.display_prompt ? "true" : "false");
|
3365
3873
|
}
|