cui-llama.rn 1.1.4 → 1.1.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/android/src/main/CMakeLists.txt +1 -0
- package/android/src/main/jni.cpp +3 -4
- package/cpp/common.cpp +183 -1990
- package/cpp/common.h +101 -130
- package/cpp/ggml-impl.h +32 -0
- package/cpp/ggml-metal.m +38 -28
- package/cpp/ggml-quants.c +275 -84
- package/cpp/ggml.c +89 -35
- package/cpp/ggml.h +30 -67
- package/cpp/llama-impl.h +1 -0
- package/cpp/llama-sampling.cpp +218 -102
- package/cpp/llama.cpp +599 -120
- package/cpp/llama.h +33 -25
- package/cpp/log.cpp +401 -0
- package/cpp/log.h +85 -703
- package/cpp/rn-llama.hpp +9 -11
- package/cpp/sampling.cpp +12 -9
- package/cpp/sampling.h +4 -56
- package/cpp/sgemm.cpp +38 -0
- package/package.json +1 -1
package/cpp/common.cpp
CHANGED
@@ -3,6 +3,7 @@
|
|
3
3
|
#endif
|
4
4
|
|
5
5
|
#include "common.h"
|
6
|
+
#include "log.h"
|
6
7
|
// Change JSON_ASSERT from assert() to LM_GGML_ASSERT:
|
7
8
|
#define JSON_ASSERT LM_GGML_ASSERT
|
8
9
|
#include "json.hpp"
|
@@ -25,7 +26,7 @@
|
|
25
26
|
#include <unordered_map>
|
26
27
|
#include <unordered_set>
|
27
28
|
#include <vector>
|
28
|
-
#include <
|
29
|
+
#include <thread>
|
29
30
|
|
30
31
|
#if defined(__APPLE__) && defined(__MACH__)
|
31
32
|
#include <sys/types.h>
|
@@ -49,7 +50,6 @@
|
|
49
50
|
#if defined(LLAMA_USE_CURL)
|
50
51
|
#include <curl/curl.h>
|
51
52
|
#include <curl/easy.h>
|
52
|
-
#include <thread>
|
53
53
|
#include <future>
|
54
54
|
#endif
|
55
55
|
|
@@ -63,14 +63,6 @@ char const *LLAMA_BUILD_TARGET = "unknown";
|
|
63
63
|
#pragma warning(disable: 4244 4267) // possible loss of data
|
64
64
|
#endif
|
65
65
|
|
66
|
-
#if (defined(LM_GGML_USE_CUDA) || defined(LM_GGML_USE_SYCL))
|
67
|
-
#define LM_GGML_USE_CUDA_SYCL
|
68
|
-
#endif
|
69
|
-
|
70
|
-
#if (defined(LM_GGML_USE_CUDA) || defined(LM_GGML_USE_SYCL)) || defined(LM_GGML_USE_VULKAN)
|
71
|
-
#define LM_GGML_USE_CUDA_SYCL_VULKAN
|
72
|
-
#endif
|
73
|
-
|
74
66
|
#if defined(LLAMA_USE_CURL)
|
75
67
|
#ifdef __linux__
|
76
68
|
#include <linux/limits.h>
|
@@ -241,7 +233,7 @@ bool set_process_priority(enum lm_ggml_sched_priority prio) {
|
|
241
233
|
}
|
242
234
|
|
243
235
|
if (!SetPriorityClass(GetCurrentProcess(), p)) {
|
244
|
-
|
236
|
+
LOG_WRN("failed to set process priority class %d : (%d)\n", prio, (int) GetLastError());
|
245
237
|
return false;
|
246
238
|
}
|
247
239
|
|
@@ -266,7 +258,7 @@ bool set_process_priority(enum lm_ggml_sched_priority prio) {
|
|
266
258
|
}
|
267
259
|
|
268
260
|
if (!setpriority(PRIO_PROCESS, 0, p)) {
|
269
|
-
|
261
|
+
LOG_WRN("failed to set process priority %d : %s (%d)\n", prio, strerror(errno), errno);
|
270
262
|
return false;
|
271
263
|
}
|
272
264
|
return true;
|
@@ -278,53 +270,6 @@ bool set_process_priority(enum lm_ggml_sched_priority prio) {
|
|
278
270
|
// CLI argument parsing
|
279
271
|
//
|
280
272
|
|
281
|
-
#ifdef __GNUC__
|
282
|
-
#ifdef __MINGW32__
|
283
|
-
#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
|
284
|
-
#else
|
285
|
-
#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
|
286
|
-
#endif
|
287
|
-
#else
|
288
|
-
#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...)
|
289
|
-
#endif
|
290
|
-
|
291
|
-
LLAMA_COMMON_ATTRIBUTE_FORMAT(1, 2)
|
292
|
-
static std::string format(const char * fmt, ...) {
|
293
|
-
va_list ap;
|
294
|
-
va_list ap2;
|
295
|
-
va_start(ap, fmt);
|
296
|
-
va_copy(ap2, ap);
|
297
|
-
int size = vsnprintf(NULL, 0, fmt, ap);
|
298
|
-
LM_GGML_ASSERT(size >= 0 && size < INT_MAX); // NOLINT
|
299
|
-
std::vector<char> buf(size + 1);
|
300
|
-
int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
|
301
|
-
LM_GGML_ASSERT(size2 == size);
|
302
|
-
va_end(ap2);
|
303
|
-
va_end(ap);
|
304
|
-
return std::string(buf.data(), size);
|
305
|
-
}
|
306
|
-
|
307
|
-
static void gpt_params_handle_model_default(gpt_params & params) {
|
308
|
-
if (!params.hf_repo.empty()) {
|
309
|
-
// short-hand to avoid specifying --hf-file -> default it to --model
|
310
|
-
if (params.hf_file.empty()) {
|
311
|
-
if (params.model.empty()) {
|
312
|
-
throw std::invalid_argument("error: --hf-repo requires either --hf-file or --model\n");
|
313
|
-
}
|
314
|
-
params.hf_file = params.model;
|
315
|
-
} else if (params.model.empty()) {
|
316
|
-
params.model = fs_get_cache_file(string_split(params.hf_file, '/').back());
|
317
|
-
}
|
318
|
-
} else if (!params.model_url.empty()) {
|
319
|
-
if (params.model.empty()) {
|
320
|
-
auto f = string_split(params.model_url, '#').front();
|
321
|
-
f = string_split(f, '?').front();
|
322
|
-
params.model = fs_get_cache_file(string_split(f, '/').back());
|
323
|
-
}
|
324
|
-
} else if (params.model.empty()) {
|
325
|
-
params.model = DEFAULT_MODEL_PATH;
|
326
|
-
}
|
327
|
-
}
|
328
273
|
|
329
274
|
void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role_model) {
|
330
275
|
int32_t n_set = 0;
|
@@ -346,158 +291,14 @@ void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role_model)
|
|
346
291
|
|
347
292
|
if (n_set && n_set < cpuparams.n_threads) {
|
348
293
|
// Not enough set bits, may experience performance issues.
|
349
|
-
|
294
|
+
LOG_WRN("Not enough set bits in CPU mask (%d) to satisfy requested thread count: %d\n", n_set, cpuparams.n_threads);
|
350
295
|
}
|
351
296
|
}
|
352
297
|
|
353
|
-
bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params, std::vector<llama_arg> & options) {
|
354
|
-
std::string arg;
|
355
|
-
const std::string arg_prefix = "--";
|
356
|
-
gpt_sampler_params & sparams = params.sparams;
|
357
|
-
|
358
|
-
std::unordered_map<std::string, llama_arg *> arg_to_options;
|
359
|
-
for (auto & opt : options) {
|
360
|
-
for (const auto & arg : opt.args) {
|
361
|
-
arg_to_options[arg] = &opt;
|
362
|
-
}
|
363
|
-
}
|
364
|
-
|
365
|
-
// handle environment variables
|
366
|
-
for (auto & opt : options) {
|
367
|
-
std::string value;
|
368
|
-
if (opt.get_value_from_env(value)) {
|
369
|
-
try {
|
370
|
-
if (opt.handler_void && (value == "1" || value == "true")) {
|
371
|
-
opt.handler_void(params);
|
372
|
-
}
|
373
|
-
if (opt.handler_int) {
|
374
|
-
opt.handler_int(params, std::stoi(value));
|
375
|
-
}
|
376
|
-
if (opt.handler_string) {
|
377
|
-
opt.handler_string(params, value);
|
378
|
-
continue;
|
379
|
-
}
|
380
|
-
} catch (std::exception & e) {
|
381
|
-
throw std::invalid_argument(format(
|
382
|
-
"error while handling environment variable \"%s\": %s\n\n", opt.env, e.what()));
|
383
|
-
}
|
384
|
-
}
|
385
|
-
}
|
386
|
-
|
387
|
-
// handle command line arguments
|
388
|
-
auto check_arg = [&](int i) {
|
389
|
-
if (i+1 >= argc) {
|
390
|
-
throw std::invalid_argument("expected value for argument");
|
391
|
-
}
|
392
|
-
};
|
393
|
-
|
394
|
-
for (int i = 1; i < argc; i++) {
|
395
|
-
const std::string arg_prefix = "--";
|
396
|
-
|
397
|
-
std::string arg = argv[i];
|
398
|
-
if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
|
399
|
-
std::replace(arg.begin(), arg.end(), '_', '-');
|
400
|
-
}
|
401
|
-
if (arg_to_options.find(arg) == arg_to_options.end()) {
|
402
|
-
throw std::invalid_argument(format("error: invalid argument: %s", arg.c_str()));
|
403
|
-
}
|
404
|
-
auto opt = *arg_to_options[arg];
|
405
|
-
if (opt.has_value_from_env()) {
|
406
|
-
fprintf(stderr, "warn: %s environment variable is set, but will be overwritten by command line argument %s\n", opt.env, arg.c_str());
|
407
|
-
}
|
408
|
-
try {
|
409
|
-
if (opt.handler_void) {
|
410
|
-
opt.handler_void(params);
|
411
|
-
continue;
|
412
|
-
}
|
413
|
-
|
414
|
-
// arg with single value
|
415
|
-
check_arg(i);
|
416
|
-
std::string val = argv[++i];
|
417
|
-
if (opt.handler_int) {
|
418
|
-
opt.handler_int(params, std::stoi(val));
|
419
|
-
continue;
|
420
|
-
}
|
421
|
-
if (opt.handler_string) {
|
422
|
-
opt.handler_string(params, val);
|
423
|
-
continue;
|
424
|
-
}
|
425
|
-
|
426
|
-
// arg with 2 values
|
427
|
-
check_arg(i);
|
428
|
-
std::string val2 = argv[++i];
|
429
|
-
if (opt.handler_str_str) {
|
430
|
-
opt.handler_str_str(params, val, val2);
|
431
|
-
continue;
|
432
|
-
}
|
433
|
-
} catch (std::exception & e) {
|
434
|
-
throw std::invalid_argument(format(
|
435
|
-
"error while handling argument \"%s\": %s\n\n"
|
436
|
-
"usage:\n%s\n\nto show complete usage, run with -h",
|
437
|
-
arg.c_str(), e.what(), arg_to_options[arg]->to_string().c_str()));
|
438
|
-
}
|
439
|
-
}
|
440
|
-
|
441
|
-
postprocess_cpu_params(params.cpuparams, nullptr);
|
442
|
-
postprocess_cpu_params(params.cpuparams_batch, &params.cpuparams);
|
443
|
-
postprocess_cpu_params(params.draft_cpuparams, &params.cpuparams);
|
444
|
-
postprocess_cpu_params(params.draft_cpuparams_batch, &params.cpuparams_batch);
|
445
|
-
|
446
|
-
if (params.prompt_cache_all && (params.interactive || params.interactive_first)) {
|
447
|
-
throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n");
|
448
|
-
}
|
449
|
-
|
450
|
-
gpt_params_handle_model_default(params);
|
451
|
-
|
452
|
-
if (params.escape) {
|
453
|
-
string_process_escapes(params.prompt);
|
454
|
-
string_process_escapes(params.input_prefix);
|
455
|
-
string_process_escapes(params.input_suffix);
|
456
|
-
for (auto & antiprompt : params.antiprompt) {
|
457
|
-
string_process_escapes(antiprompt);
|
458
|
-
}
|
459
|
-
}
|
460
|
-
|
461
|
-
if (!params.kv_overrides.empty()) {
|
462
|
-
params.kv_overrides.emplace_back();
|
463
|
-
params.kv_overrides.back().key[0] = 0;
|
464
|
-
}
|
465
|
-
|
466
|
-
if (sparams.seed == LLAMA_DEFAULT_SEED) {
|
467
|
-
sparams.seed = time(NULL);
|
468
|
-
}
|
469
|
-
|
470
|
-
return true;
|
471
|
-
}
|
472
|
-
|
473
|
-
bool gpt_params_parse(int argc, char ** argv, gpt_params & params, std::vector<llama_arg> & options) {
|
474
|
-
const auto params_org = params; // the example can modify the default params
|
475
|
-
|
476
|
-
try {
|
477
|
-
if (!gpt_params_parse_ex(argc, argv, params, options)) {
|
478
|
-
params = params_org;
|
479
|
-
return false;
|
480
|
-
}
|
481
|
-
if (params.usage) {
|
482
|
-
gpt_params_print_usage(params, options);
|
483
|
-
if (params.print_usage) {
|
484
|
-
params.print_usage(argc, argv);
|
485
|
-
}
|
486
|
-
exit(0);
|
487
|
-
}
|
488
|
-
} catch (const std::invalid_argument & ex) {
|
489
|
-
fprintf(stderr, "%s\n", ex.what());
|
490
|
-
params = params_org;
|
491
|
-
return false;
|
492
|
-
}
|
493
|
-
|
494
|
-
return true;
|
495
|
-
}
|
496
|
-
|
497
298
|
bool parse_cpu_range(const std::string & range, bool (&boolmask)[LM_GGML_MAX_N_THREADS]) {
|
498
299
|
size_t dash_loc = range.find('-');
|
499
300
|
if (dash_loc == std::string::npos) {
|
500
|
-
|
301
|
+
LOG_ERR("Format of CPU range is invalid! Expected [<start>]-[<end>].\n");
|
501
302
|
return false;
|
502
303
|
}
|
503
304
|
|
@@ -509,7 +310,7 @@ bool parse_cpu_range(const std::string & range, bool (&boolmask)[LM_GGML_MAX_N_T
|
|
509
310
|
} else {
|
510
311
|
start_i = std::stoull(range.substr(0, dash_loc));
|
511
312
|
if (start_i >= LM_GGML_MAX_N_THREADS) {
|
512
|
-
|
313
|
+
LOG_ERR("Start index out of bounds!\n");
|
513
314
|
return false;
|
514
315
|
}
|
515
316
|
}
|
@@ -519,7 +320,7 @@ bool parse_cpu_range(const std::string & range, bool (&boolmask)[LM_GGML_MAX_N_T
|
|
519
320
|
} else {
|
520
321
|
end_i = std::stoull(range.substr(dash_loc + 1));
|
521
322
|
if (end_i >= LM_GGML_MAX_N_THREADS) {
|
522
|
-
|
323
|
+
LOG_ERR("End index out of bounds!\n");
|
523
324
|
return false;
|
524
325
|
}
|
525
326
|
}
|
@@ -554,7 +355,7 @@ bool parse_cpu_mask(const std::string & mask, bool (&boolmask)[LM_GGML_MAX_N_THR
|
|
554
355
|
} else if (c >= 'A' && c <= 'F') {
|
555
356
|
id -= 'A' - 10;
|
556
357
|
} else {
|
557
|
-
|
358
|
+
LOG_ERR("Invalid hex character '%c' at position %d\n", c, int32_t(i));
|
558
359
|
return false;
|
559
360
|
}
|
560
361
|
|
@@ -567,1741 +368,20 @@ bool parse_cpu_mask(const std::string & mask, bool (&boolmask)[LM_GGML_MAX_N_THR
|
|
567
368
|
return true;
|
568
369
|
}
|
569
370
|
|
570
|
-
|
571
|
-
|
572
|
-
|
573
|
-
|
574
|
-
auto add_line = [&](const std::string& l) {
|
575
|
-
if (l.length() <= max_char_per_line) {
|
576
|
-
result.push_back(l);
|
577
|
-
} else {
|
578
|
-
std::istringstream line_stream(l);
|
579
|
-
std::string word, current_line;
|
580
|
-
while (line_stream >> word) {
|
581
|
-
if (current_line.length() + !current_line.empty() + word.length() > max_char_per_line) {
|
582
|
-
if (!current_line.empty()) result.push_back(current_line);
|
583
|
-
current_line = word;
|
584
|
-
} else {
|
585
|
-
current_line += (!current_line.empty() ? " " : "") + word;
|
586
|
-
}
|
587
|
-
}
|
588
|
-
if (!current_line.empty()) result.push_back(current_line);
|
371
|
+
void gpt_init() {
|
372
|
+
llama_log_set([](lm_ggml_log_level level, const char * text, void * /*user_data*/) {
|
373
|
+
if (LOG_DEFAULT_LLAMA <= gpt_log_verbosity_thold) {
|
374
|
+
gpt_log_add(gpt_log_main(), level, "%s", text);
|
589
375
|
}
|
590
|
-
};
|
591
|
-
while (std::getline(iss, line)) {
|
592
|
-
add_line(line);
|
593
|
-
}
|
594
|
-
return result;
|
595
|
-
}
|
596
|
-
|
597
|
-
std::string llama_arg::to_string() {
|
598
|
-
// params for printing to console
|
599
|
-
const static int n_leading_spaces = 40;
|
600
|
-
const static int n_char_per_line_help = 70; // TODO: detect this based on current console
|
601
|
-
std::string leading_spaces(n_leading_spaces, ' ');
|
602
|
-
|
603
|
-
std::ostringstream ss;
|
604
|
-
for (const auto arg : args) {
|
605
|
-
if (arg == args.front()) {
|
606
|
-
if (args.size() == 1) {
|
607
|
-
ss << arg;
|
608
|
-
} else {
|
609
|
-
// first arg is usually abbreviation, we need padding to make it more beautiful
|
610
|
-
auto tmp = std::string(arg) + ", ";
|
611
|
-
ss << format("%-7s", tmp.c_str());
|
612
|
-
}
|
613
|
-
} else {
|
614
|
-
ss << arg << (arg != args.back() ? ", " : "");
|
615
|
-
}
|
616
|
-
}
|
617
|
-
if (value_hint) ss << " " << value_hint;
|
618
|
-
if (value_hint_2) ss << " " << value_hint_2;
|
619
|
-
if (ss.tellp() > n_leading_spaces - 3) {
|
620
|
-
// current line is too long, add new line
|
621
|
-
ss << "\n" << leading_spaces;
|
622
|
-
} else {
|
623
|
-
// padding between arg and help, same line
|
624
|
-
ss << std::string(leading_spaces.size() - ss.tellp(), ' ');
|
625
|
-
}
|
626
|
-
const auto help_lines = break_str_into_lines(help, n_char_per_line_help);
|
627
|
-
for (const auto & line : help_lines) {
|
628
|
-
ss << (&line == &help_lines.front() ? "" : leading_spaces) << line << "\n";
|
629
|
-
}
|
630
|
-
return ss.str();
|
631
|
-
}
|
632
|
-
|
633
|
-
void gpt_params_print_usage(gpt_params & params, std::vector<llama_arg> & options) {
|
634
|
-
auto print_options = [](std::vector<llama_arg *> & options) {
|
635
|
-
for (llama_arg * opt : options) {
|
636
|
-
printf("%s", opt->to_string().c_str());
|
637
|
-
}
|
638
|
-
};
|
639
|
-
|
640
|
-
std::vector<llama_arg *> common_options;
|
641
|
-
std::vector<llama_arg *> specific_options;
|
642
|
-
for (auto & opt : options) {
|
643
|
-
// in case multiple LLAMA_EXAMPLE_* are set, we prioritize the LLAMA_EXAMPLE_* matching current example
|
644
|
-
if (opt.in_example(params.curr_ex)) {
|
645
|
-
specific_options.push_back(&opt);
|
646
|
-
} else {
|
647
|
-
common_options.push_back(&opt);
|
648
|
-
}
|
649
|
-
}
|
650
|
-
printf("----- common options -----\n\n");
|
651
|
-
print_options(common_options);
|
652
|
-
// TODO: maybe convert enum llama_example to string
|
653
|
-
printf("\n\n----- example-specific options -----\n\n");
|
654
|
-
print_options(specific_options);
|
655
|
-
}
|
656
|
-
|
657
|
-
std::vector<llama_arg> gpt_params_parser_init(gpt_params & params, llama_example ex) {
|
658
|
-
return gpt_params_parser_init(params, ex, nullptr);
|
659
|
-
}
|
660
|
-
|
661
|
-
std::vector<llama_arg> gpt_params_parser_init(gpt_params & params, llama_example ex, std::function<void(int, char **)> print_usage) {
|
662
|
-
std::vector<llama_arg> options;
|
663
|
-
params.print_usage = print_usage;
|
664
|
-
params.curr_ex = ex;
|
665
|
-
|
666
|
-
std::string sampler_type_chars;
|
667
|
-
std::string sampler_type_names;
|
668
|
-
for (const auto & sampler : params.sparams.samplers) {
|
669
|
-
sampler_type_chars += gpt_sampler_type_to_chr(sampler);
|
670
|
-
sampler_type_names += gpt_sampler_type_to_str(sampler) + ";";
|
671
|
-
}
|
672
|
-
sampler_type_names.pop_back();
|
673
|
-
|
674
|
-
|
675
|
-
/**
|
676
|
-
* filter options by example
|
677
|
-
* rules:
|
678
|
-
* - all examples inherit options from LLAMA_EXAMPLE_COMMON
|
679
|
-
* - if LLAMA_EXAMPLE_* is set (other than COMMON), we only show the option in the corresponding example
|
680
|
-
* - if both {LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_*,} are set, we will prioritize the LLAMA_EXAMPLE_* matching current example
|
681
|
-
*/
|
682
|
-
auto add_opt = [&](llama_arg arg) {
|
683
|
-
if (arg.in_example(ex) || arg.in_example(LLAMA_EXAMPLE_COMMON)) {
|
684
|
-
options.push_back(std::move(arg));
|
685
|
-
}
|
686
|
-
};
|
687
|
-
|
376
|
+
}, NULL);
|
688
377
|
|
689
|
-
|
690
|
-
|
691
|
-
|
692
|
-
|
693
|
-
params.usage = true;
|
694
|
-
}
|
695
|
-
));
|
696
|
-
add_opt(llama_arg(
|
697
|
-
{"--version"},
|
698
|
-
"show version and build info",
|
699
|
-
[](gpt_params &) {
|
700
|
-
fprintf(stderr, "version: %d (%s)\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT);
|
701
|
-
fprintf(stderr, "built with %s for %s\n", LLAMA_COMPILER, LLAMA_BUILD_TARGET);
|
702
|
-
exit(0);
|
703
|
-
}
|
704
|
-
));
|
705
|
-
add_opt(llama_arg(
|
706
|
-
{"-v", "--verbose"},
|
707
|
-
"print verbose information",
|
708
|
-
[](gpt_params & params) {
|
709
|
-
params.verbosity = 1;
|
710
|
-
}
|
711
|
-
));
|
712
|
-
add_opt(llama_arg(
|
713
|
-
{"--verbosity"}, "N",
|
714
|
-
format("set specific verbosity level (default: %d)", params.verbosity),
|
715
|
-
[](gpt_params & params, int value) {
|
716
|
-
params.verbosity = value;
|
717
|
-
}
|
718
|
-
));
|
719
|
-
add_opt(llama_arg(
|
720
|
-
{"--verbose-prompt"},
|
721
|
-
format("print a verbose prompt before generation (default: %s)", params.verbose_prompt ? "true" : "false"),
|
722
|
-
[](gpt_params & params) {
|
723
|
-
params.verbose_prompt = true;
|
724
|
-
}
|
725
|
-
).set_examples({LLAMA_EXAMPLE_MAIN}));
|
726
|
-
add_opt(llama_arg(
|
727
|
-
{"--no-display-prompt"},
|
728
|
-
format("don't print prompt at generation (default: %s)", !params.display_prompt ? "true" : "false"),
|
729
|
-
[](gpt_params & params) {
|
730
|
-
params.display_prompt = false;
|
731
|
-
}
|
732
|
-
).set_examples({LLAMA_EXAMPLE_MAIN}));
|
733
|
-
add_opt(llama_arg(
|
734
|
-
{"-co", "--color"},
|
735
|
-
format("colorise output to distinguish prompt and user input from generations (default: %s)", params.use_color ? "true" : "false"),
|
736
|
-
[](gpt_params & params) {
|
737
|
-
params.use_color = true;
|
738
|
-
}
|
739
|
-
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL}));
|
740
|
-
add_opt(llama_arg(
|
741
|
-
{"-s", "--seed"}, "SEED",
|
742
|
-
format("RNG seed (default: %d, use random seed for < 0)", params.sparams.seed),
|
743
|
-
[](gpt_params & params, const std::string & value) {
|
744
|
-
params.sparams.seed = std::stoul(value);
|
745
|
-
}
|
746
|
-
));
|
747
|
-
add_opt(llama_arg(
|
748
|
-
{"-t", "--threads"}, "N",
|
749
|
-
format("number of threads to use during generation (default: %d)", params.cpuparams.n_threads),
|
750
|
-
[](gpt_params & params, int value) {
|
751
|
-
params.cpuparams.n_threads = value;
|
752
|
-
if (params.cpuparams.n_threads <= 0) {
|
753
|
-
params.cpuparams.n_threads = std::thread::hardware_concurrency();
|
754
|
-
}
|
755
|
-
}
|
756
|
-
).set_env("LLAMA_ARG_THREADS"));
|
757
|
-
add_opt(llama_arg(
|
758
|
-
{"-tb", "--threads-batch"}, "N",
|
759
|
-
"number of threads to use during batch and prompt processing (default: same as --threads)",
|
760
|
-
[](gpt_params & params, int value) {
|
761
|
-
params.cpuparams_batch.n_threads = value;
|
762
|
-
if (params.cpuparams_batch.n_threads <= 0) {
|
763
|
-
params.cpuparams_batch.n_threads = std::thread::hardware_concurrency();
|
764
|
-
}
|
765
|
-
}
|
766
|
-
));
|
767
|
-
add_opt(llama_arg(
|
768
|
-
{"-td", "--threads-draft"}, "N",
|
769
|
-
"number of threads to use during generation (default: same as --threads)",
|
770
|
-
[](gpt_params & params, int value) {
|
771
|
-
params.draft_cpuparams.n_threads = value;
|
772
|
-
if (params.draft_cpuparams.n_threads <= 0) {
|
773
|
-
params.draft_cpuparams.n_threads = std::thread::hardware_concurrency();
|
774
|
-
}
|
775
|
-
}
|
776
|
-
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
777
|
-
add_opt(llama_arg(
|
778
|
-
{"-tbd", "--threads-batch-draft"}, "N",
|
779
|
-
"number of threads to use during batch and prompt processing (default: same as --threads-draft)",
|
780
|
-
[](gpt_params & params, int value) {
|
781
|
-
params.draft_cpuparams_batch.n_threads = value;
|
782
|
-
if (params.draft_cpuparams_batch.n_threads <= 0) {
|
783
|
-
params.draft_cpuparams_batch.n_threads = std::thread::hardware_concurrency();
|
784
|
-
}
|
785
|
-
}
|
786
|
-
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
787
|
-
add_opt(llama_arg(
|
788
|
-
{"-C", "--cpu-mask"}, "M",
|
789
|
-
"CPU affinity mask: arbitrarily long hex. Complements cpu-range (default: \"\")",
|
790
|
-
[](gpt_params & params, const std::string & mask) {
|
791
|
-
params.cpuparams.mask_valid = true;
|
792
|
-
if (!parse_cpu_mask(mask, params.cpuparams.cpumask)) {
|
793
|
-
throw std::invalid_argument("invalid cpumask");
|
794
|
-
}
|
795
|
-
}
|
796
|
-
));
|
797
|
-
add_opt(llama_arg(
|
798
|
-
{"-Cr", "--cpu-range"}, "lo-hi",
|
799
|
-
"range of CPUs for affinity. Complements --cpu-mask",
|
800
|
-
[](gpt_params & params, const std::string & range) {
|
801
|
-
params.cpuparams.mask_valid = true;
|
802
|
-
if (!parse_cpu_range(range, params.cpuparams.cpumask)) {
|
803
|
-
throw std::invalid_argument("invalid range");
|
804
|
-
}
|
805
|
-
}
|
806
|
-
));
|
807
|
-
add_opt(llama_arg(
|
808
|
-
{"--cpu-strict"}, "<0|1>",
|
809
|
-
format("use strict CPU placement (default: %u)\n", (unsigned) params.cpuparams.strict_cpu),
|
810
|
-
[](gpt_params & params, const std::string & value) {
|
811
|
-
params.cpuparams.strict_cpu = std::stoul(value);
|
812
|
-
}
|
813
|
-
));
|
814
|
-
add_opt(llama_arg(
|
815
|
-
{"--prio"}, "N",
|
816
|
-
format("set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.cpuparams.priority),
|
817
|
-
[](gpt_params & params, int prio) {
|
818
|
-
if (prio < 0 || prio > 3) {
|
819
|
-
throw std::invalid_argument("invalid value");
|
820
|
-
}
|
821
|
-
params.cpuparams.priority = (enum lm_ggml_sched_priority) prio;
|
822
|
-
}
|
823
|
-
));
|
824
|
-
add_opt(llama_arg(
|
825
|
-
{"--poll"}, "<0...100>",
|
826
|
-
format("use polling level to wait for work (0 - no polling, default: %u)\n", (unsigned) params.cpuparams.poll),
|
827
|
-
[](gpt_params & params, const std::string & value) {
|
828
|
-
params.cpuparams.poll = std::stoul(value);
|
829
|
-
}
|
830
|
-
));
|
831
|
-
add_opt(llama_arg(
|
832
|
-
{"-Cb", "--cpu-mask-batch"}, "M",
|
833
|
-
"CPU affinity mask: arbitrarily long hex. Complements cpu-range-batch (default: same as --cpu-mask)",
|
834
|
-
[](gpt_params & params, const std::string & mask) {
|
835
|
-
params.cpuparams_batch.mask_valid = true;
|
836
|
-
if (!parse_cpu_mask(mask, params.cpuparams_batch.cpumask)) {
|
837
|
-
throw std::invalid_argument("invalid cpumask");
|
838
|
-
}
|
839
|
-
}
|
840
|
-
));
|
841
|
-
add_opt(llama_arg(
|
842
|
-
{"-Crb", "--cpu-range-batch"}, "lo-hi",
|
843
|
-
"ranges of CPUs for affinity. Complements --cpu-mask-batch",
|
844
|
-
[](gpt_params & params, const std::string & range) {
|
845
|
-
params.cpuparams_batch.mask_valid = true;
|
846
|
-
if (!parse_cpu_range(range, params.cpuparams_batch.cpumask)) {
|
847
|
-
throw std::invalid_argument("invalid range");
|
848
|
-
}
|
849
|
-
}
|
850
|
-
));
|
851
|
-
add_opt(llama_arg(
|
852
|
-
{"--cpu-strict-batch"}, "<0|1>",
|
853
|
-
"use strict CPU placement (default: same as --cpu-strict)",
|
854
|
-
[](gpt_params & params, int value) {
|
855
|
-
params.cpuparams_batch.strict_cpu = value;
|
856
|
-
}
|
857
|
-
));
|
858
|
-
add_opt(llama_arg(
|
859
|
-
{"--prio-batch"}, "N",
|
860
|
-
format("set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.cpuparams_batch.priority),
|
861
|
-
[](gpt_params & params, int prio) {
|
862
|
-
if (prio < 0 || prio > 3) {
|
863
|
-
throw std::invalid_argument("invalid value");
|
864
|
-
}
|
865
|
-
params.cpuparams_batch.priority = (enum lm_ggml_sched_priority) prio;
|
866
|
-
}
|
867
|
-
));
|
868
|
-
add_opt(llama_arg(
|
869
|
-
{"--poll-batch"}, "<0|1>",
|
870
|
-
"use polling to wait for work (default: same as --poll)",
|
871
|
-
[](gpt_params & params, int value) {
|
872
|
-
params.cpuparams_batch.poll = value;
|
873
|
-
}
|
874
|
-
));
|
875
|
-
add_opt(llama_arg(
|
876
|
-
{"-Cd", "--cpu-mask-draft"}, "M",
|
877
|
-
"Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask)",
|
878
|
-
[](gpt_params & params, const std::string & mask) {
|
879
|
-
params.draft_cpuparams.mask_valid = true;
|
880
|
-
if (!parse_cpu_mask(mask, params.draft_cpuparams.cpumask)) {
|
881
|
-
throw std::invalid_argument("invalid cpumask");
|
882
|
-
}
|
883
|
-
}
|
884
|
-
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
885
|
-
add_opt(llama_arg(
|
886
|
-
{"-Crd", "--cpu-range-draft"}, "lo-hi",
|
887
|
-
"Ranges of CPUs for affinity. Complements --cpu-mask-draft",
|
888
|
-
[](gpt_params & params, const std::string & range) {
|
889
|
-
params.draft_cpuparams.mask_valid = true;
|
890
|
-
if (!parse_cpu_range(range, params.draft_cpuparams.cpumask)) {
|
891
|
-
throw std::invalid_argument("invalid range");
|
892
|
-
}
|
893
|
-
}
|
894
|
-
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
895
|
-
add_opt(llama_arg(
|
896
|
-
{"--cpu-strict-draft"}, "<0|1>",
|
897
|
-
"Use strict CPU placement for draft model (default: same as --cpu-strict)",
|
898
|
-
[](gpt_params & params, int value) {
|
899
|
-
params.draft_cpuparams.strict_cpu = value;
|
900
|
-
}
|
901
|
-
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
902
|
-
add_opt(llama_arg(
|
903
|
-
{"--prio-draft"}, "N",
|
904
|
-
format("set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.draft_cpuparams.priority),
|
905
|
-
[](gpt_params & params, int prio) {
|
906
|
-
if (prio < 0 || prio > 3) {
|
907
|
-
throw std::invalid_argument("invalid value");
|
908
|
-
}
|
909
|
-
params.draft_cpuparams.priority = (enum lm_ggml_sched_priority) prio;
|
910
|
-
}
|
911
|
-
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
912
|
-
add_opt(llama_arg(
|
913
|
-
{"--poll-draft"}, "<0|1>",
|
914
|
-
"Use polling to wait for draft model work (default: same as --poll])",
|
915
|
-
[](gpt_params & params, int value) {
|
916
|
-
params.draft_cpuparams.poll = value;
|
917
|
-
}
|
918
|
-
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
919
|
-
add_opt(llama_arg(
|
920
|
-
{"-Cbd", "--cpu-mask-batch-draft"}, "M",
|
921
|
-
"Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask)",
|
922
|
-
[](gpt_params & params, const std::string & mask) {
|
923
|
-
params.draft_cpuparams_batch.mask_valid = true;
|
924
|
-
if (!parse_cpu_mask(mask, params.draft_cpuparams_batch.cpumask)) {
|
925
|
-
throw std::invalid_argument("invalid cpumask");
|
926
|
-
}
|
927
|
-
}
|
928
|
-
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
929
|
-
add_opt(llama_arg(
|
930
|
-
{"-Crbd", "--cpu-range-batch-draft"}, "lo-hi",
|
931
|
-
"Ranges of CPUs for affinity. Complements --cpu-mask-draft-batch)",
|
932
|
-
[](gpt_params & params, const std::string & range) {
|
933
|
-
params.draft_cpuparams_batch.mask_valid = true;
|
934
|
-
if (!parse_cpu_range(range, params.draft_cpuparams_batch.cpumask)) {
|
935
|
-
throw std::invalid_argument("invalid cpumask");
|
936
|
-
}
|
937
|
-
}
|
938
|
-
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
939
|
-
add_opt(llama_arg(
|
940
|
-
{"--cpu-strict-batch-draft"}, "<0|1>",
|
941
|
-
"Use strict CPU placement for draft model (default: --cpu-strict-draft)",
|
942
|
-
[](gpt_params & params, int value) {
|
943
|
-
params.draft_cpuparams_batch.strict_cpu = value;
|
944
|
-
}
|
945
|
-
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
946
|
-
add_opt(llama_arg(
|
947
|
-
{"--prio-batch-draft"}, "N",
|
948
|
-
format("set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.draft_cpuparams_batch.priority),
|
949
|
-
[](gpt_params & params, int prio) {
|
950
|
-
if (prio < 0 || prio > 3) {
|
951
|
-
throw std::invalid_argument("invalid value");
|
952
|
-
}
|
953
|
-
params.draft_cpuparams_batch.priority = (enum lm_ggml_sched_priority) prio;
|
954
|
-
}
|
955
|
-
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
956
|
-
add_opt(llama_arg(
|
957
|
-
{"--poll-batch-draft"}, "<0|1>",
|
958
|
-
"Use polling to wait for draft model work (default: --poll-draft)",
|
959
|
-
[](gpt_params & params, int value) {
|
960
|
-
params.draft_cpuparams_batch.poll = value;
|
961
|
-
}
|
962
|
-
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
963
|
-
add_opt(llama_arg(
|
964
|
-
{"--draft"}, "N",
|
965
|
-
format("number of tokens to draft for speculative decoding (default: %d)", params.n_draft),
|
966
|
-
[](gpt_params & params, int value) {
|
967
|
-
params.n_draft = value;
|
968
|
-
}
|
969
|
-
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
970
|
-
add_opt(llama_arg(
|
971
|
-
{"-ps", "--p-split"}, "N",
|
972
|
-
format("speculative decoding split probability (default: %.1f)", (double)params.p_split),
|
973
|
-
[](gpt_params & params, const std::string & value) {
|
974
|
-
params.p_split = std::stof(value);
|
975
|
-
}
|
976
|
-
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
977
|
-
add_opt(llama_arg(
|
978
|
-
{"-lcs", "--lookup-cache-static"}, "FNAME",
|
979
|
-
"path to static lookup cache to use for lookup decoding (not updated by generation)",
|
980
|
-
[](gpt_params & params, const std::string & value) {
|
981
|
-
params.lookup_cache_static = value;
|
982
|
-
}
|
983
|
-
));
|
984
|
-
add_opt(llama_arg(
|
985
|
-
{"-lcd", "--lookup-cache-dynamic"}, "FNAME",
|
986
|
-
"path to dynamic lookup cache to use for lookup decoding (updated by generation)",
|
987
|
-
[](gpt_params & params, const std::string & value) {
|
988
|
-
params.lookup_cache_dynamic = value;
|
989
|
-
}
|
990
|
-
));
|
991
|
-
add_opt(llama_arg(
|
992
|
-
{"-c", "--ctx-size"}, "N",
|
993
|
-
format("size of the prompt context (default: %d, 0 = loaded from model)", params.n_ctx),
|
994
|
-
[](gpt_params & params, int value) {
|
995
|
-
params.n_ctx = value;
|
996
|
-
}
|
997
|
-
).set_env("LLAMA_ARG_CTX_SIZE"));
|
998
|
-
add_opt(llama_arg(
|
999
|
-
{"-n", "--predict", "--n-predict"}, "N",
|
1000
|
-
format("number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)", params.n_predict),
|
1001
|
-
[](gpt_params & params, int value) {
|
1002
|
-
params.n_predict = value;
|
1003
|
-
}
|
1004
|
-
).set_env("LLAMA_ARG_N_PREDICT"));
|
1005
|
-
add_opt(llama_arg(
|
1006
|
-
{"-b", "--batch-size"}, "N",
|
1007
|
-
format("logical maximum batch size (default: %d)", params.n_batch),
|
1008
|
-
[](gpt_params & params, int value) {
|
1009
|
-
params.n_batch = value;
|
1010
|
-
}
|
1011
|
-
).set_env("LLAMA_ARG_BATCH"));
|
1012
|
-
add_opt(llama_arg(
|
1013
|
-
{"-ub", "--ubatch-size"}, "N",
|
1014
|
-
format("physical maximum batch size (default: %d)", params.n_ubatch),
|
1015
|
-
[](gpt_params & params, int value) {
|
1016
|
-
params.n_ubatch = value;
|
1017
|
-
}
|
1018
|
-
).set_env("LLAMA_ARG_UBATCH"));
|
1019
|
-
add_opt(llama_arg(
|
1020
|
-
{"--keep"}, "N",
|
1021
|
-
format("number of tokens to keep from the initial prompt (default: %d, -1 = all)", params.n_keep),
|
1022
|
-
[](gpt_params & params, int value) {
|
1023
|
-
params.n_keep = value;
|
1024
|
-
}
|
1025
|
-
));
|
1026
|
-
add_opt(llama_arg(
|
1027
|
-
{"--chunks"}, "N",
|
1028
|
-
format("max number of chunks to process (default: %d, -1 = all)", params.n_chunks),
|
1029
|
-
[](gpt_params & params, int value) {
|
1030
|
-
params.n_chunks = value;
|
1031
|
-
}
|
1032
|
-
));
|
1033
|
-
add_opt(llama_arg(
|
1034
|
-
{"-fa", "--flash-attn"},
|
1035
|
-
format("enable Flash Attention (default: %s)", params.flash_attn ? "enabled" : "disabled"),
|
1036
|
-
[](gpt_params & params) {
|
1037
|
-
params.flash_attn = true;
|
1038
|
-
}
|
1039
|
-
).set_env("LLAMA_ARG_FLASH_ATTN"));
|
1040
|
-
add_opt(llama_arg(
|
1041
|
-
{"-p", "--prompt"}, "PROMPT",
|
1042
|
-
ex == LLAMA_EXAMPLE_MAIN
|
1043
|
-
? "prompt to start generation with\nif -cnv is set, this will be used as system prompt"
|
1044
|
-
: "prompt to start generation with",
|
1045
|
-
[](gpt_params & params, const std::string & value) {
|
1046
|
-
params.prompt = value;
|
1047
|
-
}
|
1048
|
-
));
|
1049
|
-
add_opt(llama_arg(
|
1050
|
-
{"-f", "--file"}, "FNAME",
|
1051
|
-
"a file containing the prompt (default: none)",
|
1052
|
-
[](gpt_params & params, const std::string & value) {
|
1053
|
-
std::ifstream file(value);
|
1054
|
-
if (!file) {
|
1055
|
-
throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str()));
|
1056
|
-
}
|
1057
|
-
// store the external file name in params
|
1058
|
-
params.prompt_file = value;
|
1059
|
-
std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.prompt));
|
1060
|
-
if (!params.prompt.empty() && params.prompt.back() == '\n') {
|
1061
|
-
params.prompt.pop_back();
|
1062
|
-
}
|
1063
|
-
}
|
1064
|
-
));
|
1065
|
-
add_opt(llama_arg(
|
1066
|
-
{"--in-file"}, "FNAME",
|
1067
|
-
"an input file (repeat to specify multiple files)",
|
1068
|
-
[](gpt_params & params, const std::string & value) {
|
1069
|
-
std::ifstream file(value);
|
1070
|
-
if (!file) {
|
1071
|
-
throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str()));
|
1072
|
-
}
|
1073
|
-
params.in_files.push_back(value);
|
1074
|
-
}
|
1075
|
-
));
|
1076
|
-
add_opt(llama_arg(
|
1077
|
-
{"-bf", "--binary-file"}, "FNAME",
|
1078
|
-
"binary file containing the prompt (default: none)",
|
1079
|
-
[](gpt_params & params, const std::string & value) {
|
1080
|
-
std::ifstream file(value, std::ios::binary);
|
1081
|
-
if (!file) {
|
1082
|
-
throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str()));
|
1083
|
-
}
|
1084
|
-
// store the external file name in params
|
1085
|
-
params.prompt_file = value;
|
1086
|
-
std::ostringstream ss;
|
1087
|
-
ss << file.rdbuf();
|
1088
|
-
params.prompt = ss.str();
|
1089
|
-
fprintf(stderr, "Read %zu bytes from binary file %s\n", params.prompt.size(), value.c_str());
|
1090
|
-
}
|
1091
|
-
));
|
1092
|
-
add_opt(llama_arg(
|
1093
|
-
{"-e", "--escape"},
|
1094
|
-
format("process escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\) (default: %s)", params.escape ? "true" : "false"),
|
1095
|
-
[](gpt_params & params) {
|
1096
|
-
params.escape = true;
|
1097
|
-
}
|
1098
|
-
));
|
1099
|
-
add_opt(llama_arg(
|
1100
|
-
{"--no-escape"},
|
1101
|
-
"do not process escape sequences",
|
1102
|
-
[](gpt_params & params) {
|
1103
|
-
params.escape = false;
|
1104
|
-
}
|
1105
|
-
));
|
1106
|
-
add_opt(llama_arg(
|
1107
|
-
{"-ptc", "--print-token-count"}, "N",
|
1108
|
-
format("print token count every N tokens (default: %d)", params.n_print),
|
1109
|
-
[](gpt_params & params, int value) {
|
1110
|
-
params.n_print = value;
|
1111
|
-
}
|
1112
|
-
).set_examples({LLAMA_EXAMPLE_MAIN}));
|
1113
|
-
add_opt(llama_arg(
|
1114
|
-
{"--prompt-cache"}, "FNAME",
|
1115
|
-
"file to cache prompt state for faster startup (default: none)",
|
1116
|
-
[](gpt_params & params, const std::string & value) {
|
1117
|
-
params.path_prompt_cache = value;
|
1118
|
-
}
|
1119
|
-
).set_examples({LLAMA_EXAMPLE_MAIN}));
|
1120
|
-
add_opt(llama_arg(
|
1121
|
-
{"--prompt-cache-all"},
|
1122
|
-
"if specified, saves user input and generations to cache as well\n",
|
1123
|
-
[](gpt_params & params) {
|
1124
|
-
params.prompt_cache_all = true;
|
1125
|
-
}
|
1126
|
-
).set_examples({LLAMA_EXAMPLE_MAIN}));
|
1127
|
-
add_opt(llama_arg(
|
1128
|
-
{"--prompt-cache-ro"},
|
1129
|
-
"if specified, uses the prompt cache but does not update it",
|
1130
|
-
[](gpt_params & params) {
|
1131
|
-
params.prompt_cache_ro = true;
|
1132
|
-
}
|
1133
|
-
).set_examples({LLAMA_EXAMPLE_MAIN}));
|
1134
|
-
add_opt(llama_arg(
|
1135
|
-
{"-r", "--reverse-prompt"}, "PROMPT",
|
1136
|
-
"halt generation at PROMPT, return control in interactive mode\n",
|
1137
|
-
[](gpt_params & params, const std::string & value) {
|
1138
|
-
params.antiprompt.emplace_back(value);
|
1139
|
-
}
|
1140
|
-
).set_examples({LLAMA_EXAMPLE_MAIN}));
|
1141
|
-
add_opt(llama_arg(
|
1142
|
-
{"-sp", "--special"},
|
1143
|
-
format("special tokens output enabled (default: %s)", params.special ? "true" : "false"),
|
1144
|
-
[](gpt_params & params) {
|
1145
|
-
params.special = true;
|
1146
|
-
}
|
1147
|
-
).set_examples({LLAMA_EXAMPLE_MAIN}));
|
1148
|
-
add_opt(llama_arg(
|
1149
|
-
{"-cnv", "--conversation"},
|
1150
|
-
format(
|
1151
|
-
"run in conversation mode:\n"
|
1152
|
-
"- does not print special tokens and suffix/prefix\n"
|
1153
|
-
"- interactive mode is also enabled\n"
|
1154
|
-
"(default: %s)",
|
1155
|
-
params.conversation ? "true" : "false"
|
1156
|
-
),
|
1157
|
-
[](gpt_params & params) {
|
1158
|
-
params.conversation = true;
|
1159
|
-
}
|
1160
|
-
).set_examples({LLAMA_EXAMPLE_MAIN}));
|
1161
|
-
add_opt(llama_arg(
|
1162
|
-
{"-i", "--interactive"},
|
1163
|
-
format("run in interactive mode (default: %s)", params.interactive ? "true" : "false"),
|
1164
|
-
[](gpt_params & params) {
|
1165
|
-
params.interactive = true;
|
1166
|
-
}
|
1167
|
-
).set_examples({LLAMA_EXAMPLE_MAIN}));
|
1168
|
-
add_opt(llama_arg(
|
1169
|
-
{"-if", "--interactive-first"},
|
1170
|
-
format("run in interactive mode and wait for input right away (default: %s)", params.interactive_first ? "true" : "false"),
|
1171
|
-
[](gpt_params & params) {
|
1172
|
-
params.interactive_first = true;
|
1173
|
-
}
|
1174
|
-
).set_examples({LLAMA_EXAMPLE_MAIN}));
|
1175
|
-
add_opt(llama_arg(
|
1176
|
-
{"-mli", "--multiline-input"},
|
1177
|
-
"allows you to write or paste multiple lines without ending each in '\\'",
|
1178
|
-
[](gpt_params & params) {
|
1179
|
-
params.multiline_input = true;
|
1180
|
-
}
|
1181
|
-
).set_examples({LLAMA_EXAMPLE_MAIN}));
|
1182
|
-
add_opt(llama_arg(
|
1183
|
-
{"--in-prefix-bos"},
|
1184
|
-
"prefix BOS to user inputs, preceding the `--in-prefix` string",
|
1185
|
-
[](gpt_params & params) {
|
1186
|
-
params.input_prefix_bos = true;
|
1187
|
-
params.enable_chat_template = false;
|
1188
|
-
}
|
1189
|
-
).set_examples({LLAMA_EXAMPLE_MAIN}));
|
1190
|
-
add_opt(llama_arg(
|
1191
|
-
{"--in-prefix"}, "STRING",
|
1192
|
-
"string to prefix user inputs with (default: empty)",
|
1193
|
-
[](gpt_params & params, const std::string & value) {
|
1194
|
-
params.input_prefix = value;
|
1195
|
-
params.enable_chat_template = false;
|
1196
|
-
}
|
1197
|
-
).set_examples({LLAMA_EXAMPLE_MAIN}));
|
1198
|
-
add_opt(llama_arg(
|
1199
|
-
{"--in-suffix"}, "STRING",
|
1200
|
-
"string to suffix after user inputs with (default: empty)",
|
1201
|
-
[](gpt_params & params, const std::string & value) {
|
1202
|
-
params.input_suffix = value;
|
1203
|
-
params.enable_chat_template = false;
|
1204
|
-
}
|
1205
|
-
).set_examples({LLAMA_EXAMPLE_MAIN}));
|
1206
|
-
add_opt(llama_arg(
|
1207
|
-
{"--no-warmup"},
|
1208
|
-
"skip warming up the model with an empty run",
|
1209
|
-
[](gpt_params & params) {
|
1210
|
-
params.warmup = false;
|
1211
|
-
}
|
1212
|
-
).set_examples({LLAMA_EXAMPLE_MAIN}));
|
1213
|
-
add_opt(llama_arg(
|
1214
|
-
{"--spm-infill"},
|
1215
|
-
format(
|
1216
|
-
"use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: %s)",
|
1217
|
-
params.spm_infill ? "enabled" : "disabled"
|
1218
|
-
),
|
1219
|
-
[](gpt_params & params) {
|
1220
|
-
params.spm_infill = true;
|
1221
|
-
}
|
1222
|
-
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_INFILL}));
|
1223
|
-
add_opt(llama_arg(
|
1224
|
-
{"--samplers"}, "SAMPLERS",
|
1225
|
-
format("samplers that will be used for generation in the order, separated by \';\'\n(default: %s)", sampler_type_names.c_str()),
|
1226
|
-
[](gpt_params & params, const std::string & value) {
|
1227
|
-
const auto sampler_names = string_split(value, ';');
|
1228
|
-
params.sparams.samplers = gpt_sampler_types_from_names(sampler_names, true);
|
1229
|
-
}
|
1230
|
-
));
|
1231
|
-
add_opt(llama_arg(
|
1232
|
-
{"--sampling-seq"}, "SEQUENCE",
|
1233
|
-
format("simplified sequence for samplers that will be used (default: %s)", sampler_type_chars.c_str()),
|
1234
|
-
[](gpt_params & params, const std::string & value) {
|
1235
|
-
params.sparams.samplers = gpt_sampler_types_from_chars(value);
|
1236
|
-
}
|
1237
|
-
));
|
1238
|
-
add_opt(llama_arg(
|
1239
|
-
{"--ignore-eos"},
|
1240
|
-
"ignore end of stream token and continue generating (implies --logit-bias EOS-inf)",
|
1241
|
-
[](gpt_params & params) {
|
1242
|
-
params.sparams.ignore_eos = true;
|
1243
|
-
}
|
1244
|
-
));
|
1245
|
-
add_opt(llama_arg(
|
1246
|
-
{"--penalize-nl"},
|
1247
|
-
format("penalize newline tokens (default: %s)", params.sparams.penalize_nl ? "true" : "false"),
|
1248
|
-
[](gpt_params & params) {
|
1249
|
-
params.sparams.penalize_nl = true;
|
1250
|
-
}
|
1251
|
-
));
|
1252
|
-
add_opt(llama_arg(
|
1253
|
-
{"--temp"}, "N",
|
1254
|
-
format("temperature (default: %.1f)", (double)params.sparams.temp),
|
1255
|
-
[](gpt_params & params, const std::string & value) {
|
1256
|
-
params.sparams.temp = std::stof(value);
|
1257
|
-
params.sparams.temp = std::max(params.sparams.temp, 0.0f);
|
1258
|
-
}
|
1259
|
-
));
|
1260
|
-
add_opt(llama_arg(
|
1261
|
-
{"--top-k"}, "N",
|
1262
|
-
format("top-k sampling (default: %d, 0 = disabled)", params.sparams.top_k),
|
1263
|
-
[](gpt_params & params, int value) {
|
1264
|
-
params.sparams.top_k = value;
|
1265
|
-
}
|
1266
|
-
));
|
1267
|
-
add_opt(llama_arg(
|
1268
|
-
{"--top-p"}, "N",
|
1269
|
-
format("top-p sampling (default: %.1f, 1.0 = disabled)", (double)params.sparams.top_p),
|
1270
|
-
[](gpt_params & params, const std::string & value) {
|
1271
|
-
params.sparams.top_p = std::stof(value);
|
1272
|
-
}
|
1273
|
-
));
|
1274
|
-
add_opt(llama_arg(
|
1275
|
-
{"--min-p"}, "N",
|
1276
|
-
format("min-p sampling (default: %.1f, 0.0 = disabled)", (double)params.sparams.min_p),
|
1277
|
-
[](gpt_params & params, const std::string & value) {
|
1278
|
-
params.sparams.min_p = std::stof(value);
|
1279
|
-
}
|
1280
|
-
));
|
1281
|
-
add_opt(llama_arg(
|
1282
|
-
{"--tfs"}, "N",
|
1283
|
-
format("tail free sampling, parameter z (default: %.1f, 1.0 = disabled)", (double)params.sparams.tfs_z),
|
1284
|
-
[](gpt_params & params, const std::string & value) {
|
1285
|
-
params.sparams.tfs_z = std::stof(value);
|
1286
|
-
}
|
1287
|
-
));
|
1288
|
-
add_opt(llama_arg(
|
1289
|
-
{"--typical"}, "N",
|
1290
|
-
format("locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)", (double)params.sparams.typ_p),
|
1291
|
-
[](gpt_params & params, const std::string & value) {
|
1292
|
-
params.sparams.typ_p = std::stof(value);
|
1293
|
-
}
|
1294
|
-
));
|
1295
|
-
add_opt(llama_arg(
|
1296
|
-
{"--repeat-last-n"}, "N",
|
1297
|
-
format("last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)", params.sparams.penalty_last_n),
|
1298
|
-
[](gpt_params & params, int value) {
|
1299
|
-
params.sparams.penalty_last_n = value;
|
1300
|
-
params.sparams.n_prev = std::max(params.sparams.n_prev, params.sparams.penalty_last_n);
|
1301
|
-
}
|
1302
|
-
));
|
1303
|
-
add_opt(llama_arg(
|
1304
|
-
{"--repeat-penalty"}, "N",
|
1305
|
-
format("penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)", (double)params.sparams.penalty_repeat),
|
1306
|
-
[](gpt_params & params, const std::string & value) {
|
1307
|
-
params.sparams.penalty_repeat = std::stof(value);
|
1308
|
-
}
|
1309
|
-
));
|
1310
|
-
add_opt(llama_arg(
|
1311
|
-
{"--presence-penalty"}, "N",
|
1312
|
-
format("repeat alpha presence penalty (default: %.1f, 0.0 = disabled)", (double)params.sparams.penalty_present),
|
1313
|
-
[](gpt_params & params, const std::string & value) {
|
1314
|
-
params.sparams.penalty_present = std::stof(value);
|
1315
|
-
}
|
1316
|
-
));
|
1317
|
-
add_opt(llama_arg(
|
1318
|
-
{"--frequency-penalty"}, "N",
|
1319
|
-
format("repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)", (double)params.sparams.penalty_freq),
|
1320
|
-
[](gpt_params & params, const std::string & value) {
|
1321
|
-
params.sparams.penalty_freq = std::stof(value);
|
1322
|
-
}
|
1323
|
-
));
|
1324
|
-
add_opt(llama_arg(
|
1325
|
-
{"--dynatemp-range"}, "N",
|
1326
|
-
format("dynamic temperature range (default: %.1f, 0.0 = disabled)", (double)params.sparams.dynatemp_range),
|
1327
|
-
[](gpt_params & params, const std::string & value) {
|
1328
|
-
params.sparams.dynatemp_range = std::stof(value);
|
1329
|
-
}
|
1330
|
-
));
|
1331
|
-
add_opt(llama_arg(
|
1332
|
-
{"--dynatemp-exp"}, "N",
|
1333
|
-
format("dynamic temperature exponent (default: %.1f)", (double)params.sparams.dynatemp_exponent),
|
1334
|
-
[](gpt_params & params, const std::string & value) {
|
1335
|
-
params.sparams.dynatemp_exponent = std::stof(value);
|
1336
|
-
}
|
1337
|
-
));
|
1338
|
-
add_opt(llama_arg(
|
1339
|
-
{"--mirostat"}, "N",
|
1340
|
-
format("use Mirostat sampling.\nTop K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n"
|
1341
|
-
"(default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)", params.sparams.mirostat),
|
1342
|
-
[](gpt_params & params, int value) {
|
1343
|
-
params.sparams.mirostat = value;
|
1344
|
-
}
|
1345
|
-
));
|
1346
|
-
add_opt(llama_arg(
|
1347
|
-
{"--mirostat-lr"}, "N",
|
1348
|
-
format("Mirostat learning rate, parameter eta (default: %.1f)", (double)params.sparams.mirostat_eta),
|
1349
|
-
[](gpt_params & params, const std::string & value) {
|
1350
|
-
params.sparams.mirostat_eta = std::stof(value);
|
1351
|
-
}
|
1352
|
-
));
|
1353
|
-
add_opt(llama_arg(
|
1354
|
-
{"--mirostat-ent"}, "N",
|
1355
|
-
format("Mirostat target entropy, parameter tau (default: %.1f)", (double)params.sparams.mirostat_tau),
|
1356
|
-
[](gpt_params & params, const std::string & value) {
|
1357
|
-
params.sparams.mirostat_tau = std::stof(value);
|
1358
|
-
}
|
1359
|
-
));
|
1360
|
-
add_opt(llama_arg(
|
1361
|
-
{"-l", "--logit-bias"}, "TOKEN_ID(+/-)BIAS",
|
1362
|
-
"modifies the likelihood of token appearing in the completion,\n"
|
1363
|
-
"i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',\n"
|
1364
|
-
"or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'",
|
1365
|
-
[](gpt_params & params, const std::string & value) {
|
1366
|
-
std::stringstream ss(value);
|
1367
|
-
llama_token key;
|
1368
|
-
char sign;
|
1369
|
-
std::string value_str;
|
1370
|
-
try {
|
1371
|
-
if (ss >> key && ss >> sign && std::getline(ss, value_str) && (sign == '+' || sign == '-')) {
|
1372
|
-
const float bias = std::stof(value_str) * ((sign == '-') ? -1.0f : 1.0f);
|
1373
|
-
params.sparams.logit_bias.push_back({key, bias});
|
1374
|
-
} else {
|
1375
|
-
throw std::invalid_argument("invalid input format");
|
1376
|
-
}
|
1377
|
-
} catch (const std::exception&) {
|
1378
|
-
throw std::invalid_argument("invalid input format");
|
1379
|
-
}
|
1380
|
-
}
|
1381
|
-
));
|
1382
|
-
add_opt(llama_arg(
|
1383
|
-
{"--grammar"}, "GRAMMAR",
|
1384
|
-
format("BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '%s')", params.sparams.grammar.c_str()),
|
1385
|
-
[](gpt_params & params, const std::string & value) {
|
1386
|
-
params.sparams.grammar = value;
|
1387
|
-
}
|
1388
|
-
));
|
1389
|
-
add_opt(llama_arg(
|
1390
|
-
{"--grammar-file"}, "FNAME",
|
1391
|
-
"file to read grammar from",
|
1392
|
-
[](gpt_params & params, const std::string & value) {
|
1393
|
-
std::ifstream file(value);
|
1394
|
-
if (!file) {
|
1395
|
-
throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str()));
|
1396
|
-
}
|
1397
|
-
std::copy(
|
1398
|
-
std::istreambuf_iterator<char>(file),
|
1399
|
-
std::istreambuf_iterator<char>(),
|
1400
|
-
std::back_inserter(params.sparams.grammar)
|
1401
|
-
);
|
1402
|
-
}
|
1403
|
-
));
|
1404
|
-
add_opt(llama_arg(
|
1405
|
-
{"-j", "--json-schema"}, "SCHEMA",
|
1406
|
-
"JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object\nFor schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead",
|
1407
|
-
[](gpt_params & params, const std::string & value) {
|
1408
|
-
params.sparams.grammar = json_schema_to_grammar(json::parse(value));
|
1409
|
-
}
|
1410
|
-
));
|
1411
|
-
add_opt(llama_arg(
|
1412
|
-
{"--pooling"}, "{none,mean,cls,last}",
|
1413
|
-
"pooling type for embeddings, use model default if unspecified",
|
1414
|
-
[](gpt_params & params, const std::string & value) {
|
1415
|
-
/**/ if (value == "none") { params.pooling_type = LLAMA_POOLING_TYPE_NONE; }
|
1416
|
-
else if (value == "mean") { params.pooling_type = LLAMA_POOLING_TYPE_MEAN; }
|
1417
|
-
else if (value == "cls") { params.pooling_type = LLAMA_POOLING_TYPE_CLS; }
|
1418
|
-
else if (value == "last") { params.pooling_type = LLAMA_POOLING_TYPE_LAST; }
|
1419
|
-
else { throw std::invalid_argument("invalid value"); }
|
1420
|
-
}
|
1421
|
-
).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
|
1422
|
-
add_opt(llama_arg(
|
1423
|
-
{"--attention"}, "{causal,non,causal}",
|
1424
|
-
"attention type for embeddings, use model default if unspecified",
|
1425
|
-
[](gpt_params & params, const std::string & value) {
|
1426
|
-
/**/ if (value == "causal") { params.attention_type = LLAMA_ATTENTION_TYPE_CAUSAL; }
|
1427
|
-
else if (value == "non-causal") { params.attention_type = LLAMA_ATTENTION_TYPE_NON_CAUSAL; }
|
1428
|
-
else { throw std::invalid_argument("invalid value"); }
|
1429
|
-
}
|
1430
|
-
).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
|
1431
|
-
add_opt(llama_arg(
|
1432
|
-
{"--rope-scaling"}, "{none,linear,yarn}",
|
1433
|
-
"RoPE frequency scaling method, defaults to linear unless specified by the model",
|
1434
|
-
[](gpt_params & params, const std::string & value) {
|
1435
|
-
/**/ if (value == "none") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_NONE; }
|
1436
|
-
else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_LINEAR; }
|
1437
|
-
else if (value == "yarn") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_YARN; }
|
1438
|
-
else { throw std::invalid_argument("invalid value"); }
|
1439
|
-
}
|
1440
|
-
));
|
1441
|
-
add_opt(llama_arg(
|
1442
|
-
{"--rope-scale"}, "N",
|
1443
|
-
"RoPE context scaling factor, expands context by a factor of N",
|
1444
|
-
[](gpt_params & params, const std::string & value) {
|
1445
|
-
params.rope_freq_scale = 1.0f / std::stof(value);
|
1446
|
-
}
|
1447
|
-
));
|
1448
|
-
add_opt(llama_arg(
|
1449
|
-
{"--rope-freq-base"}, "N",
|
1450
|
-
"RoPE base frequency, used by NTK-aware scaling (default: loaded from model)",
|
1451
|
-
[](gpt_params & params, const std::string & value) {
|
1452
|
-
params.rope_freq_base = std::stof(value);
|
1453
|
-
}
|
1454
|
-
));
|
1455
|
-
add_opt(llama_arg(
|
1456
|
-
{"--rope-freq-scale"}, "N",
|
1457
|
-
"RoPE frequency scaling factor, expands context by a factor of 1/N",
|
1458
|
-
[](gpt_params & params, const std::string & value) {
|
1459
|
-
params.rope_freq_scale = std::stof(value);
|
1460
|
-
}
|
1461
|
-
));
|
1462
|
-
add_opt(llama_arg(
|
1463
|
-
{"--yarn-orig-ctx"}, "N",
|
1464
|
-
format("YaRN: original context size of model (default: %d = model training context size)", params.yarn_orig_ctx),
|
1465
|
-
[](gpt_params & params, int value) {
|
1466
|
-
params.yarn_orig_ctx = value;
|
1467
|
-
}
|
1468
|
-
));
|
1469
|
-
add_opt(llama_arg(
|
1470
|
-
{"--yarn-ext-factor"}, "N",
|
1471
|
-
format("YaRN: extrapolation mix factor (default: %.1f, 0.0 = full interpolation)", (double)params.yarn_ext_factor),
|
1472
|
-
[](gpt_params & params, const std::string & value) {
|
1473
|
-
params.yarn_ext_factor = std::stof(value);
|
1474
|
-
}
|
1475
|
-
));
|
1476
|
-
add_opt(llama_arg(
|
1477
|
-
{"--yarn-attn-factor"}, "N",
|
1478
|
-
format("YaRN: scale sqrt(t) or attention magnitude (default: %.1f)", (double)params.yarn_attn_factor),
|
1479
|
-
[](gpt_params & params, const std::string & value) {
|
1480
|
-
params.yarn_attn_factor = std::stof(value);
|
1481
|
-
}
|
1482
|
-
));
|
1483
|
-
add_opt(llama_arg(
|
1484
|
-
{"--yarn-beta-slow"}, "N",
|
1485
|
-
format("YaRN: high correction dim or alpha (default: %.1f)", (double)params.yarn_beta_slow),
|
1486
|
-
[](gpt_params & params, const std::string & value) {
|
1487
|
-
params.yarn_beta_slow = std::stof(value);
|
1488
|
-
}
|
1489
|
-
));
|
1490
|
-
add_opt(llama_arg(
|
1491
|
-
{"--yarn-beta-fast"}, "N",
|
1492
|
-
format("YaRN: low correction dim or beta (default: %.1f)", (double)params.yarn_beta_fast),
|
1493
|
-
[](gpt_params & params, const std::string & value) {
|
1494
|
-
params.yarn_beta_fast = std::stof(value);
|
1495
|
-
}
|
1496
|
-
));
|
1497
|
-
add_opt(llama_arg(
|
1498
|
-
{"-gan", "--grp-attn-n"}, "N",
|
1499
|
-
format("group-attention factor (default: %d)", params.grp_attn_n),
|
1500
|
-
[](gpt_params & params, int value) {
|
1501
|
-
params.grp_attn_n = value;
|
1502
|
-
}
|
1503
|
-
));
|
1504
|
-
add_opt(llama_arg(
|
1505
|
-
{"-gaw", "--grp-attn-w"}, "N",
|
1506
|
-
format("group-attention width (default: %.1f)", (double)params.grp_attn_w),
|
1507
|
-
[](gpt_params & params, int value) {
|
1508
|
-
params.grp_attn_w = value;
|
1509
|
-
}
|
1510
|
-
));
|
1511
|
-
add_opt(llama_arg(
|
1512
|
-
{"-dkvc", "--dump-kv-cache"},
|
1513
|
-
"verbose print of the KV cache",
|
1514
|
-
[](gpt_params & params) {
|
1515
|
-
params.dump_kv_cache = true;
|
1516
|
-
}
|
1517
|
-
));
|
1518
|
-
add_opt(llama_arg(
|
1519
|
-
{"-nkvo", "--no-kv-offload"},
|
1520
|
-
"disable KV offload",
|
1521
|
-
[](gpt_params & params) {
|
1522
|
-
params.no_kv_offload = true;
|
1523
|
-
}
|
1524
|
-
));
|
1525
|
-
add_opt(llama_arg(
|
1526
|
-
{"-ctk", "--cache-type-k"}, "TYPE",
|
1527
|
-
format("KV cache data type for K (default: %s)", params.cache_type_k.c_str()),
|
1528
|
-
[](gpt_params & params, const std::string & value) {
|
1529
|
-
// TODO: get the type right here
|
1530
|
-
params.cache_type_k = value;
|
1531
|
-
}
|
1532
|
-
));
|
1533
|
-
add_opt(llama_arg(
|
1534
|
-
{"-ctv", "--cache-type-v"}, "TYPE",
|
1535
|
-
format("KV cache data type for V (default: %s)", params.cache_type_v.c_str()),
|
1536
|
-
[](gpt_params & params, const std::string & value) {
|
1537
|
-
// TODO: get the type right here
|
1538
|
-
params.cache_type_v = value;
|
1539
|
-
}
|
1540
|
-
));
|
1541
|
-
add_opt(llama_arg(
|
1542
|
-
{"--perplexity", "--all-logits"},
|
1543
|
-
format("return logits for all tokens in the batch (default: %s)", params.logits_all ? "true" : "false"),
|
1544
|
-
[](gpt_params & params) {
|
1545
|
-
params.logits_all = true;
|
1546
|
-
}
|
1547
|
-
).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
|
1548
|
-
add_opt(llama_arg(
|
1549
|
-
{"--hellaswag"},
|
1550
|
-
"compute HellaSwag score over random tasks from datafile supplied with -f",
|
1551
|
-
[](gpt_params & params) {
|
1552
|
-
params.hellaswag = true;
|
1553
|
-
}
|
1554
|
-
).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
|
1555
|
-
add_opt(llama_arg(
|
1556
|
-
{"--hellaswag-tasks"}, "N",
|
1557
|
-
format("number of tasks to use when computing the HellaSwag score (default: %zu)", params.hellaswag_tasks),
|
1558
|
-
[](gpt_params & params, int value) {
|
1559
|
-
params.hellaswag_tasks = value;
|
1560
|
-
}
|
1561
|
-
).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
|
1562
|
-
add_opt(llama_arg(
|
1563
|
-
{"--winogrande"},
|
1564
|
-
"compute Winogrande score over random tasks from datafile supplied with -f",
|
1565
|
-
[](gpt_params & params) {
|
1566
|
-
params.winogrande = true;
|
1567
|
-
}
|
1568
|
-
).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
|
1569
|
-
add_opt(llama_arg(
|
1570
|
-
{"--winogrande-tasks"}, "N",
|
1571
|
-
format("number of tasks to use when computing the Winogrande score (default: %zu)", params.winogrande_tasks),
|
1572
|
-
[](gpt_params & params, int value) {
|
1573
|
-
params.winogrande_tasks = value;
|
1574
|
-
}
|
1575
|
-
).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
|
1576
|
-
add_opt(llama_arg(
|
1577
|
-
{"--multiple-choice"},
|
1578
|
-
"compute multiple choice score over random tasks from datafile supplied with -f",
|
1579
|
-
[](gpt_params & params) {
|
1580
|
-
params.multiple_choice = true;
|
1581
|
-
}
|
1582
|
-
).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
|
1583
|
-
add_opt(llama_arg(
|
1584
|
-
{"--multiple-choice-tasks"}, "N",
|
1585
|
-
format("number of tasks to use when computing the multiple choice score (default: %zu)", params.multiple_choice_tasks),
|
1586
|
-
[](gpt_params & params, int value) {
|
1587
|
-
params.multiple_choice_tasks = value;
|
1588
|
-
}
|
1589
|
-
).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
|
1590
|
-
add_opt(llama_arg(
|
1591
|
-
{"--kl-divergence"},
|
1592
|
-
"computes KL-divergence to logits provided via --kl-divergence-base",
|
1593
|
-
[](gpt_params & params) {
|
1594
|
-
params.kl_divergence = true;
|
1595
|
-
}
|
1596
|
-
).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
|
1597
|
-
add_opt(llama_arg(
|
1598
|
-
{"--save-all-logits", "--kl-divergence-base"}, "FNAME",
|
1599
|
-
"set logits file",
|
1600
|
-
[](gpt_params & params, const std::string & value) {
|
1601
|
-
params.logits_file = value;
|
1602
|
-
}
|
1603
|
-
).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
|
1604
|
-
add_opt(llama_arg(
|
1605
|
-
{"--ppl-stride"}, "N",
|
1606
|
-
format("stride for perplexity calculation (default: %d)", params.ppl_stride),
|
1607
|
-
[](gpt_params & params, int value) {
|
1608
|
-
params.ppl_stride = value;
|
1609
|
-
}
|
1610
|
-
).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
|
1611
|
-
add_opt(llama_arg(
|
1612
|
-
{"--ppl-output-type"}, "<0|1>",
|
1613
|
-
format("output type for perplexity calculation (default: %d)", params.ppl_output_type),
|
1614
|
-
[](gpt_params & params, int value) {
|
1615
|
-
params.ppl_output_type = value;
|
1616
|
-
}
|
1617
|
-
).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
|
1618
|
-
add_opt(llama_arg(
|
1619
|
-
{"-dt", "--defrag-thold"}, "N",
|
1620
|
-
format("KV cache defragmentation threshold (default: %.1f, < 0 - disabled)", (double)params.defrag_thold),
|
1621
|
-
[](gpt_params & params, const std::string & value) {
|
1622
|
-
params.defrag_thold = std::stof(value);
|
1623
|
-
}
|
1624
|
-
).set_env("LLAMA_ARG_DEFRAG_THOLD"));
|
1625
|
-
add_opt(llama_arg(
|
1626
|
-
{"-np", "--parallel"}, "N",
|
1627
|
-
format("number of parallel sequences to decode (default: %d)", params.n_parallel),
|
1628
|
-
[](gpt_params & params, int value) {
|
1629
|
-
params.n_parallel = value;
|
1630
|
-
}
|
1631
|
-
));
|
1632
|
-
add_opt(llama_arg(
|
1633
|
-
{"-ns", "--sequences"}, "N",
|
1634
|
-
format("number of sequences to decode (default: %d)", params.n_sequences),
|
1635
|
-
[](gpt_params & params, int value) {
|
1636
|
-
params.n_sequences = value;
|
1637
|
-
}
|
1638
|
-
));
|
1639
|
-
add_opt(llama_arg(
|
1640
|
-
{"-cb", "--cont-batching"},
|
1641
|
-
format("enable continuous batching (a.k.a dynamic batching) (default: %s)", params.cont_batching ? "enabled" : "disabled"),
|
1642
|
-
[](gpt_params & params) {
|
1643
|
-
params.cont_batching = true;
|
1644
|
-
}
|
1645
|
-
).set_env("LLAMA_ARG_CONT_BATCHING"));
|
1646
|
-
add_opt(llama_arg(
|
1647
|
-
{"-nocb", "--no-cont-batching"},
|
1648
|
-
"disable continuous batching",
|
1649
|
-
[](gpt_params & params) {
|
1650
|
-
params.cont_batching = false;
|
1651
|
-
}
|
1652
|
-
).set_env("LLAMA_ARG_NO_CONT_BATCHING"));
|
1653
|
-
add_opt(llama_arg(
|
1654
|
-
{"--mmproj"}, "FILE",
|
1655
|
-
"path to a multimodal projector file for LLaVA. see examples/llava/README.md",
|
1656
|
-
[](gpt_params & params, const std::string & value) {
|
1657
|
-
params.mmproj = value;
|
1658
|
-
}
|
1659
|
-
).set_examples({LLAMA_EXAMPLE_LLAVA}));
|
1660
|
-
add_opt(llama_arg(
|
1661
|
-
{"--image"}, "FILE",
|
1662
|
-
"path to an image file. use with multimodal models. Specify multiple times for batching",
|
1663
|
-
[](gpt_params & params, const std::string & value) {
|
1664
|
-
params.image.emplace_back(value);
|
1665
|
-
}
|
1666
|
-
).set_examples({LLAMA_EXAMPLE_LLAVA}));
|
1667
|
-
#ifdef LM_GGML_USE_RPC
|
1668
|
-
add_opt(llama_arg(
|
1669
|
-
{"--rpc"}, "SERVERS",
|
1670
|
-
"comma separated list of RPC servers",
|
1671
|
-
[](gpt_params & params, const std::string & value) {
|
1672
|
-
params.rpc_servers = value;
|
1673
|
-
}
|
1674
|
-
));
|
378
|
+
#ifdef NDEBUG
|
379
|
+
const char * build_type = "";
|
380
|
+
#else
|
381
|
+
const char * build_type = " (debug)";
|
1675
382
|
#endif
|
1676
|
-
|
1677
|
-
|
1678
|
-
"force system to keep model in RAM rather than swapping or compressing",
|
1679
|
-
[](gpt_params & params) {
|
1680
|
-
params.use_mlock = true;
|
1681
|
-
}
|
1682
|
-
));
|
1683
|
-
add_opt(llama_arg(
|
1684
|
-
{"--no-mmap"},
|
1685
|
-
"do not memory-map model (slower load but may reduce pageouts if not using mlock)",
|
1686
|
-
[](gpt_params & params) {
|
1687
|
-
params.use_mmap = false;
|
1688
|
-
}
|
1689
|
-
));
|
1690
|
-
add_opt(llama_arg(
|
1691
|
-
{"--numa"}, "TYPE",
|
1692
|
-
"attempt optimizations that help on some NUMA systems\n"
|
1693
|
-
"- distribute: spread execution evenly over all nodes\n"
|
1694
|
-
"- isolate: only spawn threads on CPUs on the node that execution started on\n"
|
1695
|
-
"- numactl: use the CPU map provided by numactl\n"
|
1696
|
-
"if run without this previously, it is recommended to drop the system page cache before using this\n"
|
1697
|
-
"see https://github.com/ggerganov/llama.cpp/issues/1437",
|
1698
|
-
[](gpt_params & params, const std::string & value) {
|
1699
|
-
/**/ if (value == "distribute" || value == "") { params.numa = LM_GGML_NUMA_STRATEGY_DISTRIBUTE; }
|
1700
|
-
else if (value == "isolate") { params.numa = LM_GGML_NUMA_STRATEGY_ISOLATE; }
|
1701
|
-
else if (value == "numactl") { params.numa = LM_GGML_NUMA_STRATEGY_NUMACTL; }
|
1702
|
-
else { throw std::invalid_argument("invalid value"); }
|
1703
|
-
}
|
1704
|
-
));
|
1705
|
-
add_opt(llama_arg(
|
1706
|
-
{"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N",
|
1707
|
-
"number of layers to store in VRAM",
|
1708
|
-
[](gpt_params & params, int value) {
|
1709
|
-
params.n_gpu_layers = value;
|
1710
|
-
if (!llama_supports_gpu_offload()) {
|
1711
|
-
fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers option will be ignored\n");
|
1712
|
-
fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
|
1713
|
-
}
|
1714
|
-
}
|
1715
|
-
).set_env("LLAMA_ARG_N_GPU_LAYERS"));
|
1716
|
-
add_opt(llama_arg(
|
1717
|
-
{"-ngld", "--gpu-layers-draft", "--n-gpu-layers-draft"}, "N",
|
1718
|
-
"number of layers to store in VRAM for the draft model",
|
1719
|
-
[](gpt_params & params, int value) {
|
1720
|
-
params.n_gpu_layers_draft = value;
|
1721
|
-
if (!llama_supports_gpu_offload()) {
|
1722
|
-
fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers-draft option will be ignored\n");
|
1723
|
-
fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
|
1724
|
-
}
|
1725
|
-
}
|
1726
|
-
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
1727
|
-
add_opt(llama_arg(
|
1728
|
-
{"-sm", "--split-mode"}, "{none,layer,row}",
|
1729
|
-
"how to split the model across multiple GPUs, one of:\n"
|
1730
|
-
"- none: use one GPU only\n"
|
1731
|
-
"- layer (default): split layers and KV across GPUs\n"
|
1732
|
-
"- row: split rows across GPUs",
|
1733
|
-
[](gpt_params & params, const std::string & value) {
|
1734
|
-
std::string arg_next = value;
|
1735
|
-
if (arg_next == "none") {
|
1736
|
-
params.split_mode = LLAMA_SPLIT_MODE_NONE;
|
1737
|
-
} else if (arg_next == "layer") {
|
1738
|
-
params.split_mode = LLAMA_SPLIT_MODE_LAYER;
|
1739
|
-
}
|
1740
|
-
else if (arg_next == "row") {
|
1741
|
-
#ifdef LM_GGML_USE_SYCL
|
1742
|
-
fprintf(stderr, "warning: The split mode value:[row] is not supported by llama.cpp with SYCL. It's developing.\nExit!\n");
|
1743
|
-
exit(1);
|
1744
|
-
#endif // LM_GGML_USE_SYCL
|
1745
|
-
params.split_mode = LLAMA_SPLIT_MODE_ROW;
|
1746
|
-
}
|
1747
|
-
else {
|
1748
|
-
throw std::invalid_argument("invalid value");
|
1749
|
-
}
|
1750
|
-
#ifndef LM_GGML_USE_CUDA_SYCL_VULKAN
|
1751
|
-
fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL/Vulkan. Setting the split mode has no effect.\n");
|
1752
|
-
#endif // LM_GGML_USE_CUDA_SYCL_VULKAN
|
1753
|
-
}
|
1754
|
-
));
|
1755
|
-
add_opt(llama_arg(
|
1756
|
-
{"-ts", "--tensor-split"}, "N0,N1,N2,...",
|
1757
|
-
"fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1",
|
1758
|
-
[](gpt_params & params, const std::string & value) {
|
1759
|
-
std::string arg_next = value;
|
1760
|
-
|
1761
|
-
// split string by , and /
|
1762
|
-
const std::regex regex{ R"([,/]+)" };
|
1763
|
-
std::sregex_token_iterator it{ arg_next.begin(), arg_next.end(), regex, -1 };
|
1764
|
-
std::vector<std::string> split_arg{ it, {} };
|
1765
|
-
if (split_arg.size() >= llama_max_devices()) {
|
1766
|
-
throw std::invalid_argument(
|
1767
|
-
format("got %d input configs, but system only has %d devices", (int)split_arg.size(), (int)llama_max_devices())
|
1768
|
-
);
|
1769
|
-
}
|
1770
|
-
for (size_t i = 0; i < llama_max_devices(); ++i) {
|
1771
|
-
if (i < split_arg.size()) {
|
1772
|
-
params.tensor_split[i] = std::stof(split_arg[i]);
|
1773
|
-
} else {
|
1774
|
-
params.tensor_split[i] = 0.0f;
|
1775
|
-
}
|
1776
|
-
}
|
1777
|
-
#ifndef LM_GGML_USE_CUDA_SYCL_VULKAN
|
1778
|
-
fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL/Vulkan. Setting a tensor split has no effect.\n");
|
1779
|
-
#endif // LM_GGML_USE_CUDA_SYCL_VULKAN
|
1780
|
-
}
|
1781
|
-
));
|
1782
|
-
add_opt(llama_arg(
|
1783
|
-
{"-mg", "--main-gpu"}, "INDEX",
|
1784
|
-
format("the GPU to use for the model (with split-mode = none), or for intermediate results and KV (with split-mode = row) (default: %d)", params.main_gpu),
|
1785
|
-
[](gpt_params & params, int value) {
|
1786
|
-
params.main_gpu = value;
|
1787
|
-
#ifndef LM_GGML_USE_CUDA_SYCL_VULKAN
|
1788
|
-
fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL/Vulkan. Setting the main GPU has no effect.\n");
|
1789
|
-
#endif // LM_GGML_USE_CUDA_SYCL_VULKAN
|
1790
|
-
}
|
1791
|
-
));
|
1792
|
-
add_opt(llama_arg(
|
1793
|
-
{"--check-tensors"},
|
1794
|
-
format("check model tensor data for invalid values (default: %s)", params.check_tensors ? "true" : "false"),
|
1795
|
-
[](gpt_params & params) {
|
1796
|
-
params.check_tensors = true;
|
1797
|
-
}
|
1798
|
-
));
|
1799
|
-
add_opt(llama_arg(
|
1800
|
-
{"--override-kv"}, "KEY=TYPE:VALUE",
|
1801
|
-
"advanced option to override model metadata by key. may be specified multiple times.\n"
|
1802
|
-
"types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false",
|
1803
|
-
[](gpt_params & params, const std::string & value) {
|
1804
|
-
if (!string_parse_kv_override(value.c_str(), params.kv_overrides)) {
|
1805
|
-
throw std::runtime_error(format("error: Invalid type for KV override: %s\n", value.c_str()));
|
1806
|
-
}
|
1807
|
-
}
|
1808
|
-
));
|
1809
|
-
add_opt(llama_arg(
|
1810
|
-
{"--lora"}, "FNAME",
|
1811
|
-
"path to LoRA adapter (can be repeated to use multiple adapters)",
|
1812
|
-
[](gpt_params & params, const std::string & value) {
|
1813
|
-
params.lora_adapters.push_back({ std::string(value), 1.0 });
|
1814
|
-
}
|
1815
|
-
).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}));
|
1816
|
-
add_opt(llama_arg(
|
1817
|
-
{"--lora-scaled"}, "FNAME", "SCALE",
|
1818
|
-
"path to LoRA adapter with user defined scaling (can be repeated to use multiple adapters)",
|
1819
|
-
[](gpt_params & params, const std::string & fname, const std::string & scale) {
|
1820
|
-
params.lora_adapters.push_back({ fname, std::stof(scale) });
|
1821
|
-
}
|
1822
|
-
).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}));
|
1823
|
-
add_opt(llama_arg(
|
1824
|
-
{"--control-vector"}, "FNAME",
|
1825
|
-
"add a control vector\nnote: this argument can be repeated to add multiple control vectors",
|
1826
|
-
[](gpt_params & params, const std::string & value) {
|
1827
|
-
params.control_vectors.push_back({ 1.0f, value, });
|
1828
|
-
}
|
1829
|
-
));
|
1830
|
-
add_opt(llama_arg(
|
1831
|
-
{"--control-vector-scaled"}, "FNAME", "SCALE",
|
1832
|
-
"add a control vector with user defined scaling SCALE\n"
|
1833
|
-
"note: this argument can be repeated to add multiple scaled control vectors",
|
1834
|
-
[](gpt_params & params, const std::string & fname, const std::string & scale) {
|
1835
|
-
params.control_vectors.push_back({ std::stof(scale), fname });
|
1836
|
-
}
|
1837
|
-
));
|
1838
|
-
add_opt(llama_arg(
|
1839
|
-
{"--control-vector-layer-range"}, "START", "END",
|
1840
|
-
"layer range to apply the control vector(s) to, start and end inclusive",
|
1841
|
-
[](gpt_params & params, const std::string & start, const std::string & end) {
|
1842
|
-
params.control_vector_layer_start = std::stoi(start);
|
1843
|
-
params.control_vector_layer_end = std::stoi(end);
|
1844
|
-
}
|
1845
|
-
));
|
1846
|
-
add_opt(llama_arg(
|
1847
|
-
{"-a", "--alias"}, "STRING",
|
1848
|
-
"set alias for model name (to be used by REST API)",
|
1849
|
-
[](gpt_params & params, const std::string & value) {
|
1850
|
-
params.model_alias = value;
|
1851
|
-
}
|
1852
|
-
).set_examples({LLAMA_EXAMPLE_SERVER}));
|
1853
|
-
add_opt(llama_arg(
|
1854
|
-
{"-m", "--model"}, "FNAME",
|
1855
|
-
ex == LLAMA_EXAMPLE_EXPORT_LORA
|
1856
|
-
? std::string("model path from which to load base model")
|
1857
|
-
: format(
|
1858
|
-
"model path (default: `models/$filename` with filename from `--hf-file` "
|
1859
|
-
"or `--model-url` if set, otherwise %s)", DEFAULT_MODEL_PATH
|
1860
|
-
),
|
1861
|
-
[](gpt_params & params, const std::string & value) {
|
1862
|
-
params.model = value;
|
1863
|
-
}
|
1864
|
-
).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}).set_env("LLAMA_ARG_MODEL"));
|
1865
|
-
add_opt(llama_arg(
|
1866
|
-
{"-md", "--model-draft"}, "FNAME",
|
1867
|
-
"draft model for speculative decoding (default: unused)",
|
1868
|
-
[](gpt_params & params, const std::string & value) {
|
1869
|
-
params.model_draft = value;
|
1870
|
-
}
|
1871
|
-
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
1872
|
-
add_opt(llama_arg(
|
1873
|
-
{"-mu", "--model-url"}, "MODEL_URL",
|
1874
|
-
"model download url (default: unused)",
|
1875
|
-
[](gpt_params & params, const std::string & value) {
|
1876
|
-
params.model_url = value;
|
1877
|
-
}
|
1878
|
-
).set_env("LLAMA_ARG_MODEL_URL"));
|
1879
|
-
add_opt(llama_arg(
|
1880
|
-
{"-hfr", "--hf-repo"}, "REPO",
|
1881
|
-
"Hugging Face model repository (default: unused)",
|
1882
|
-
[](gpt_params & params, const std::string & value) {
|
1883
|
-
params.hf_repo = value;
|
1884
|
-
}
|
1885
|
-
).set_env("LLAMA_ARG_HF_REPO"));
|
1886
|
-
add_opt(llama_arg(
|
1887
|
-
{"-hff", "--hf-file"}, "FILE",
|
1888
|
-
"Hugging Face model file (default: unused)",
|
1889
|
-
[](gpt_params & params, const std::string & value) {
|
1890
|
-
params.hf_file = value;
|
1891
|
-
}
|
1892
|
-
).set_env("LLAMA_ARG_HF_FILE"));
|
1893
|
-
add_opt(llama_arg(
|
1894
|
-
{"-hft", "--hf-token"}, "TOKEN",
|
1895
|
-
"Hugging Face access token (default: value from HF_TOKEN environment variable)",
|
1896
|
-
[](gpt_params & params, const std::string & value) {
|
1897
|
-
params.hf_token = value;
|
1898
|
-
}
|
1899
|
-
).set_env("HF_TOKEN"));
|
1900
|
-
add_opt(llama_arg(
|
1901
|
-
{"--context-file"}, "FNAME",
|
1902
|
-
"file to load context from (repeat to specify multiple files)",
|
1903
|
-
[](gpt_params & params, const std::string & value) {
|
1904
|
-
std::ifstream file(value, std::ios::binary);
|
1905
|
-
if (!file) {
|
1906
|
-
throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str()));
|
1907
|
-
}
|
1908
|
-
params.context_files.push_back(value);
|
1909
|
-
}
|
1910
|
-
).set_examples({LLAMA_EXAMPLE_RETRIEVAL}));
|
1911
|
-
add_opt(llama_arg(
|
1912
|
-
{"--chunk-size"}, "N",
|
1913
|
-
format("minimum length of embedded text chunks (default: %d)", params.chunk_size),
|
1914
|
-
[](gpt_params & params, int value) {
|
1915
|
-
params.chunk_size = value;
|
1916
|
-
}
|
1917
|
-
).set_examples({LLAMA_EXAMPLE_RETRIEVAL}));
|
1918
|
-
add_opt(llama_arg(
|
1919
|
-
{"--chunk-separator"}, "STRING",
|
1920
|
-
format("separator between chunks (default: '%s')", params.chunk_separator.c_str()),
|
1921
|
-
[](gpt_params & params, const std::string & value) {
|
1922
|
-
params.chunk_separator = value;
|
1923
|
-
}
|
1924
|
-
).set_examples({LLAMA_EXAMPLE_RETRIEVAL}));
|
1925
|
-
add_opt(llama_arg(
|
1926
|
-
{"--junk"}, "N",
|
1927
|
-
format("number of times to repeat the junk text (default: %d)", params.n_junk),
|
1928
|
-
[](gpt_params & params, int value) {
|
1929
|
-
params.n_junk = value;
|
1930
|
-
}
|
1931
|
-
).set_examples({LLAMA_EXAMPLE_PASSKEY}));
|
1932
|
-
add_opt(llama_arg(
|
1933
|
-
{"--pos"}, "N",
|
1934
|
-
format("position of the passkey in the junk text (default: %d)", params.i_pos),
|
1935
|
-
[](gpt_params & params, int value) {
|
1936
|
-
params.i_pos = value;
|
1937
|
-
}
|
1938
|
-
).set_examples({LLAMA_EXAMPLE_PASSKEY}));
|
1939
|
-
add_opt(llama_arg(
|
1940
|
-
{"-o", "--output", "--output-file"}, "FNAME",
|
1941
|
-
format("output file (default: '%s')",
|
1942
|
-
ex == LLAMA_EXAMPLE_EXPORT_LORA
|
1943
|
-
? params.lora_outfile.c_str()
|
1944
|
-
: ex == LLAMA_EXAMPLE_CVECTOR_GENERATOR
|
1945
|
-
? params.cvector_outfile.c_str()
|
1946
|
-
: params.out_file.c_str()),
|
1947
|
-
[](gpt_params & params, const std::string & value) {
|
1948
|
-
params.out_file = value;
|
1949
|
-
params.cvector_outfile = value;
|
1950
|
-
params.lora_outfile = value;
|
1951
|
-
}
|
1952
|
-
).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA}));
|
1953
|
-
add_opt(llama_arg(
|
1954
|
-
{"-ofreq", "--output-frequency"}, "N",
|
1955
|
-
format("output the imatrix every N iterations (default: %d)", params.n_out_freq),
|
1956
|
-
[](gpt_params & params, int value) {
|
1957
|
-
params.n_out_freq = value;
|
1958
|
-
}
|
1959
|
-
).set_examples({LLAMA_EXAMPLE_IMATRIX}));
|
1960
|
-
add_opt(llama_arg(
|
1961
|
-
{"--save-frequency"}, "N",
|
1962
|
-
format("save an imatrix copy every N iterations (default: %d)", params.n_save_freq),
|
1963
|
-
[](gpt_params & params, int value) {
|
1964
|
-
params.n_save_freq = value;
|
1965
|
-
}
|
1966
|
-
).set_examples({LLAMA_EXAMPLE_IMATRIX}));
|
1967
|
-
add_opt(llama_arg(
|
1968
|
-
{"--process-output"},
|
1969
|
-
format("collect data for the output tensor (default: %s)", params.process_output ? "true" : "false"),
|
1970
|
-
[](gpt_params & params) {
|
1971
|
-
params.process_output = true;
|
1972
|
-
}
|
1973
|
-
).set_examples({LLAMA_EXAMPLE_IMATRIX}));
|
1974
|
-
add_opt(llama_arg(
|
1975
|
-
{"--no-ppl"},
|
1976
|
-
format("do not compute perplexity (default: %s)", params.compute_ppl ? "true" : "false"),
|
1977
|
-
[](gpt_params & params) {
|
1978
|
-
params.compute_ppl = false;
|
1979
|
-
}
|
1980
|
-
).set_examples({LLAMA_EXAMPLE_IMATRIX}));
|
1981
|
-
add_opt(llama_arg(
|
1982
|
-
{"--chunk", "--from-chunk"}, "N",
|
1983
|
-
format("start processing the input from chunk N (default: %d)", params.i_chunk),
|
1984
|
-
[](gpt_params & params, int value) {
|
1985
|
-
params.i_chunk = value;
|
1986
|
-
}
|
1987
|
-
).set_examples({LLAMA_EXAMPLE_IMATRIX}));
|
1988
|
-
add_opt(llama_arg(
|
1989
|
-
{"-pps"},
|
1990
|
-
format("is the prompt shared across parallel sequences (default: %s)", params.is_pp_shared ? "true" : "false"),
|
1991
|
-
[](gpt_params & params) {
|
1992
|
-
params.is_pp_shared = true;
|
1993
|
-
}
|
1994
|
-
).set_examples({LLAMA_EXAMPLE_BENCH}));
|
1995
|
-
add_opt(llama_arg(
|
1996
|
-
{"-npp"}, "n0,n1,...",
|
1997
|
-
"number of prompt tokens",
|
1998
|
-
[](gpt_params & params, const std::string & value) {
|
1999
|
-
auto p = string_split<int>(value, ',');
|
2000
|
-
params.n_pp.insert(params.n_pp.end(), p.begin(), p.end());
|
2001
|
-
}
|
2002
|
-
).set_examples({LLAMA_EXAMPLE_BENCH}));
|
2003
|
-
add_opt(llama_arg(
|
2004
|
-
{"-ntg"}, "n0,n1,...",
|
2005
|
-
"number of text generation tokens",
|
2006
|
-
[](gpt_params & params, const std::string & value) {
|
2007
|
-
auto p = string_split<int>(value, ',');
|
2008
|
-
params.n_tg.insert(params.n_tg.end(), p.begin(), p.end());
|
2009
|
-
}
|
2010
|
-
).set_examples({LLAMA_EXAMPLE_BENCH}));
|
2011
|
-
add_opt(llama_arg(
|
2012
|
-
{"-npl"}, "n0,n1,...",
|
2013
|
-
"number of parallel prompts",
|
2014
|
-
[](gpt_params & params, const std::string & value) {
|
2015
|
-
auto p = string_split<int>(value, ',');
|
2016
|
-
params.n_pl.insert(params.n_pl.end(), p.begin(), p.end());
|
2017
|
-
}
|
2018
|
-
).set_examples({LLAMA_EXAMPLE_BENCH}));
|
2019
|
-
add_opt(llama_arg(
|
2020
|
-
{"--embd-normalize"}, "N",
|
2021
|
-
format("normalisation for embendings (default: %d) (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)", params.embd_normalize),
|
2022
|
-
[](gpt_params & params, int value) {
|
2023
|
-
params.embd_normalize = value;
|
2024
|
-
}
|
2025
|
-
).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
|
2026
|
-
add_opt(llama_arg(
|
2027
|
-
{"--embd-output-format"}, "FORMAT",
|
2028
|
-
"empty = default, \"array\" = [[],[]...], \"json\" = openai style, \"json+\" = same \"json\" + cosine similarity matrix",
|
2029
|
-
[](gpt_params & params, const std::string & value) {
|
2030
|
-
params.embd_out = value;
|
2031
|
-
}
|
2032
|
-
).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
|
2033
|
-
add_opt(llama_arg(
|
2034
|
-
{"--embd-separator"}, "STRING",
|
2035
|
-
"separator of embendings (default \\n) for example \"<#sep#>\"",
|
2036
|
-
[](gpt_params & params, const std::string & value) {
|
2037
|
-
params.embd_sep = value;
|
2038
|
-
}
|
2039
|
-
).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
|
2040
|
-
add_opt(llama_arg(
|
2041
|
-
{"--host"}, "HOST",
|
2042
|
-
format("ip address to listen (default: %s)", params.hostname.c_str()),
|
2043
|
-
[](gpt_params & params, const std::string & value) {
|
2044
|
-
params.hostname = value;
|
2045
|
-
}
|
2046
|
-
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_HOST"));
|
2047
|
-
add_opt(llama_arg(
|
2048
|
-
{"--port"}, "PORT",
|
2049
|
-
format("port to listen (default: %d)", params.port),
|
2050
|
-
[](gpt_params & params, int value) {
|
2051
|
-
params.port = value;
|
2052
|
-
}
|
2053
|
-
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_PORT"));
|
2054
|
-
add_opt(llama_arg(
|
2055
|
-
{"--path"}, "PATH",
|
2056
|
-
format("path to serve static files from (default: %s)", params.public_path.c_str()),
|
2057
|
-
[](gpt_params & params, const std::string & value) {
|
2058
|
-
params.public_path = value;
|
2059
|
-
}
|
2060
|
-
).set_examples({LLAMA_EXAMPLE_SERVER}));
|
2061
|
-
add_opt(llama_arg(
|
2062
|
-
{"--embedding", "--embeddings"},
|
2063
|
-
format("restrict to only support embedding use case; use only with dedicated embedding models (default: %s)", params.embedding ? "enabled" : "disabled"),
|
2064
|
-
[](gpt_params & params) {
|
2065
|
-
params.embedding = true;
|
2066
|
-
}
|
2067
|
-
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_EMBEDDINGS"));
|
2068
|
-
add_opt(llama_arg(
|
2069
|
-
{"--api-key"}, "KEY",
|
2070
|
-
"API key to use for authentication (default: none)",
|
2071
|
-
[](gpt_params & params, const std::string & value) {
|
2072
|
-
params.api_keys.push_back(value);
|
2073
|
-
}
|
2074
|
-
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_API_KEY"));
|
2075
|
-
add_opt(llama_arg(
|
2076
|
-
{"--api-key-file"}, "FNAME",
|
2077
|
-
"path to file containing API keys (default: none)",
|
2078
|
-
[](gpt_params & params, const std::string & value) {
|
2079
|
-
std::ifstream key_file(value);
|
2080
|
-
if (!key_file) {
|
2081
|
-
throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str()));
|
2082
|
-
}
|
2083
|
-
std::string key;
|
2084
|
-
while (std::getline(key_file, key)) {
|
2085
|
-
if (!key.empty()) {
|
2086
|
-
params.api_keys.push_back(key);
|
2087
|
-
}
|
2088
|
-
}
|
2089
|
-
key_file.close();
|
2090
|
-
}
|
2091
|
-
).set_examples({LLAMA_EXAMPLE_SERVER}));
|
2092
|
-
add_opt(llama_arg(
|
2093
|
-
{"--ssl-key-file"}, "FNAME",
|
2094
|
-
"path to file a PEM-encoded SSL private key",
|
2095
|
-
[](gpt_params & params, const std::string & value) {
|
2096
|
-
params.ssl_file_key = value;
|
2097
|
-
}
|
2098
|
-
).set_examples({LLAMA_EXAMPLE_SERVER}));
|
2099
|
-
add_opt(llama_arg(
|
2100
|
-
{"--ssl-cert-file"}, "FNAME",
|
2101
|
-
"path to file a PEM-encoded SSL certificate",
|
2102
|
-
[](gpt_params & params, const std::string & value) {
|
2103
|
-
params.ssl_file_cert = value;
|
2104
|
-
}
|
2105
|
-
).set_examples({LLAMA_EXAMPLE_SERVER}));
|
2106
|
-
add_opt(llama_arg(
|
2107
|
-
{"-to", "--timeout"}, "N",
|
2108
|
-
format("server read/write timeout in seconds (default: %d)", params.timeout_read),
|
2109
|
-
[](gpt_params & params, int value) {
|
2110
|
-
params.timeout_read = value;
|
2111
|
-
params.timeout_write = value;
|
2112
|
-
}
|
2113
|
-
).set_examples({LLAMA_EXAMPLE_SERVER}));
|
2114
|
-
add_opt(llama_arg(
|
2115
|
-
{"--threads-http"}, "N",
|
2116
|
-
format("number of threads used to process HTTP requests (default: %d)", params.n_threads_http),
|
2117
|
-
[](gpt_params & params, int value) {
|
2118
|
-
params.n_threads_http = value;
|
2119
|
-
}
|
2120
|
-
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_THREADS_HTTP"));
|
2121
|
-
add_opt(llama_arg(
|
2122
|
-
{"-spf", "--system-prompt-file"}, "FNAME",
|
2123
|
-
"set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications",
|
2124
|
-
[](gpt_params & params, const std::string & value) {
|
2125
|
-
std::ifstream file(value);
|
2126
|
-
if (!file) {
|
2127
|
-
throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str()));
|
2128
|
-
}
|
2129
|
-
std::string system_prompt;
|
2130
|
-
std::copy(
|
2131
|
-
std::istreambuf_iterator<char>(file),
|
2132
|
-
std::istreambuf_iterator<char>(),
|
2133
|
-
std::back_inserter(system_prompt)
|
2134
|
-
);
|
2135
|
-
params.system_prompt = system_prompt;
|
2136
|
-
}
|
2137
|
-
).set_examples({LLAMA_EXAMPLE_SERVER}));
|
2138
|
-
add_opt(llama_arg(
|
2139
|
-
{"--log-format"}, "{text, json}",
|
2140
|
-
"log output format: json or text (default: json)",
|
2141
|
-
[](gpt_params & params, const std::string & value) {
|
2142
|
-
if (value == "json") {
|
2143
|
-
params.log_json = true;
|
2144
|
-
} else if (value == "text") {
|
2145
|
-
params.log_json = false;
|
2146
|
-
} else {
|
2147
|
-
throw std::invalid_argument("invalid value");
|
2148
|
-
}
|
2149
|
-
}
|
2150
|
-
).set_examples({LLAMA_EXAMPLE_SERVER}));
|
2151
|
-
add_opt(llama_arg(
|
2152
|
-
{"--metrics"},
|
2153
|
-
format("enable prometheus compatible metrics endpoint (default: %s)", params.endpoint_metrics ? "enabled" : "disabled"),
|
2154
|
-
[](gpt_params & params) {
|
2155
|
-
params.endpoint_metrics = true;
|
2156
|
-
}
|
2157
|
-
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_METRICS"));
|
2158
|
-
add_opt(llama_arg(
|
2159
|
-
{"--no-slots"},
|
2160
|
-
format("disables slots monitoring endpoint (default: %s)", params.endpoint_slots ? "enabled" : "disabled"),
|
2161
|
-
[](gpt_params & params) {
|
2162
|
-
params.endpoint_slots = false;
|
2163
|
-
}
|
2164
|
-
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_ENDPOINT_SLOTS"));
|
2165
|
-
add_opt(llama_arg(
|
2166
|
-
{"--slot-save-path"}, "PATH",
|
2167
|
-
"path to save slot kv cache (default: disabled)",
|
2168
|
-
[](gpt_params & params, const std::string & value) {
|
2169
|
-
params.slot_save_path = value;
|
2170
|
-
// if doesn't end with DIRECTORY_SEPARATOR, add it
|
2171
|
-
if (!params.slot_save_path.empty() && params.slot_save_path[params.slot_save_path.size() - 1] != DIRECTORY_SEPARATOR) {
|
2172
|
-
params.slot_save_path += DIRECTORY_SEPARATOR;
|
2173
|
-
}
|
2174
|
-
}
|
2175
|
-
).set_examples({LLAMA_EXAMPLE_SERVER}));
|
2176
|
-
add_opt(llama_arg(
|
2177
|
-
{"--chat-template"}, "JINJA_TEMPLATE",
|
2178
|
-
"set custom jinja chat template (default: template taken from model's metadata)\n"
|
2179
|
-
"if suffix/prefix are specified, template will be disabled\n"
|
2180
|
-
"only commonly used templates are accepted:\nhttps://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template",
|
2181
|
-
[](gpt_params & params, const std::string & value) {
|
2182
|
-
if (!llama_chat_verify_template(value)) {
|
2183
|
-
throw std::runtime_error(format(
|
2184
|
-
"error: the supplied chat template is not supported: %s\n"
|
2185
|
-
"note: llama.cpp does not use jinja parser, we only support commonly used templates\n",
|
2186
|
-
value.c_str()
|
2187
|
-
));
|
2188
|
-
}
|
2189
|
-
params.chat_template = value;
|
2190
|
-
}
|
2191
|
-
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CHAT_TEMPLATE"));
|
2192
|
-
add_opt(llama_arg(
|
2193
|
-
{"-sps", "--slot-prompt-similarity"}, "SIMILARITY",
|
2194
|
-
format("how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity),
|
2195
|
-
[](gpt_params & params, const std::string & value) {
|
2196
|
-
params.slot_prompt_similarity = std::stof(value);
|
2197
|
-
}
|
2198
|
-
).set_examples({LLAMA_EXAMPLE_SERVER}));
|
2199
|
-
add_opt(llama_arg(
|
2200
|
-
{"--lora-init-without-apply"},
|
2201
|
-
format("load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: %s)", params.lora_init_without_apply ? "enabled" : "disabled"),
|
2202
|
-
[](gpt_params & params) {
|
2203
|
-
params.lora_init_without_apply = true;
|
2204
|
-
}
|
2205
|
-
).set_examples({LLAMA_EXAMPLE_SERVER}));
|
2206
|
-
add_opt(llama_arg(
|
2207
|
-
{"--simple-io"},
|
2208
|
-
"use basic IO for better compatibility in subprocesses and limited consoles",
|
2209
|
-
[](gpt_params & params) {
|
2210
|
-
params.simple_io = true;
|
2211
|
-
}
|
2212
|
-
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL}));
|
2213
|
-
add_opt(llama_arg(
|
2214
|
-
{"-ld", "--logdir"}, "LOGDIR",
|
2215
|
-
"path under which to save YAML logs (no logging if unset)",
|
2216
|
-
[](gpt_params & params, const std::string & value) {
|
2217
|
-
params.logdir = value;
|
2218
|
-
|
2219
|
-
if (params.logdir.back() != DIRECTORY_SEPARATOR) {
|
2220
|
-
params.logdir += DIRECTORY_SEPARATOR;
|
2221
|
-
}
|
2222
|
-
}
|
2223
|
-
));
|
2224
|
-
add_opt(llama_arg(
|
2225
|
-
{"--positive-file"}, "FNAME",
|
2226
|
-
format("positive prompts file, one prompt per line (default: '%s')", params.cvector_positive_file.c_str()),
|
2227
|
-
[](gpt_params & params, const std::string & value) {
|
2228
|
-
params.cvector_positive_file = value;
|
2229
|
-
}
|
2230
|
-
).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR}));
|
2231
|
-
add_opt(llama_arg(
|
2232
|
-
{"--negative-file"}, "FNAME",
|
2233
|
-
format("negative prompts file, one prompt per line (default: '%s')", params.cvector_negative_file.c_str()),
|
2234
|
-
[](gpt_params & params, const std::string & value) {
|
2235
|
-
params.cvector_negative_file = value;
|
2236
|
-
}
|
2237
|
-
).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR}));
|
2238
|
-
add_opt(llama_arg(
|
2239
|
-
{"--pca-batch"}, "N",
|
2240
|
-
format("batch size used for PCA. Larger batch runs faster, but uses more memory (default: %d)", params.n_pca_batch),
|
2241
|
-
[](gpt_params & params, int value) {
|
2242
|
-
params.n_pca_batch = value;
|
2243
|
-
}
|
2244
|
-
).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR}));
|
2245
|
-
add_opt(llama_arg(
|
2246
|
-
{"--pca-iter"}, "N",
|
2247
|
-
format("number of iterations used for PCA (default: %d)", params.n_pca_iterations),
|
2248
|
-
[](gpt_params & params, int value) {
|
2249
|
-
params.n_pca_iterations = value;
|
2250
|
-
}
|
2251
|
-
).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR}));
|
2252
|
-
add_opt(llama_arg(
|
2253
|
-
{"--method"}, "{pca, mean}",
|
2254
|
-
"dimensionality reduction method to be used (default: pca)",
|
2255
|
-
[](gpt_params & params, const std::string & value) {
|
2256
|
-
/**/ if (value == "pca") { params.cvector_dimre_method = DIMRE_METHOD_PCA; }
|
2257
|
-
else if (value == "mean") { params.cvector_dimre_method = DIMRE_METHOD_MEAN; }
|
2258
|
-
else { throw std::invalid_argument("invalid value"); }
|
2259
|
-
}
|
2260
|
-
).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR}));
|
2261
|
-
// --output-format: selects how batched-bench results are printed.
// Accepts exactly "md" or "jsonl"; any other value raises std::invalid_argument.
add_opt(llama_arg(
    {"--output-format"}, "{md,jsonl}",
    "output format for batched-bench results (default: md)",
    [](gpt_params & params, const std::string & value) {
        /**/ if (value == "jsonl") { params.batched_bench_output_jsonl = true; }
        else if (value == "md")    { params.batched_bench_output_jsonl = false; }
        // The exception has to be thrown, not merely constructed; without the
        // `throw` an unknown format would be accepted silently.
        else                       { throw std::invalid_argument("invalid value"); }
    }
).set_examples({LLAMA_EXAMPLE_BENCH}));
|
2270
|
-
#ifndef LOG_DISABLE_LOGS
|
2271
|
-
// TODO: make this looks less weird
|
2272
|
-
add_opt(llama_arg(
|
2273
|
-
{"--log-test"},
|
2274
|
-
"Log test",
|
2275
|
-
[](gpt_params &) { log_param_single_parse("--log-test"); }
|
2276
|
-
));
|
2277
|
-
add_opt(llama_arg(
|
2278
|
-
{"--log-disable"},
|
2279
|
-
"Log disable",
|
2280
|
-
[](gpt_params &) { log_param_single_parse("--log-disable"); }
|
2281
|
-
));
|
2282
|
-
add_opt(llama_arg(
|
2283
|
-
{"--log-enable"},
|
2284
|
-
"Log enable",
|
2285
|
-
[](gpt_params &) { log_param_single_parse("--log-enable"); }
|
2286
|
-
));
|
2287
|
-
add_opt(llama_arg(
|
2288
|
-
{"--log-new"},
|
2289
|
-
"Log new",
|
2290
|
-
[](gpt_params &) { log_param_single_parse("--log-new"); }
|
2291
|
-
));
|
2292
|
-
add_opt(llama_arg(
|
2293
|
-
{"--log-append"},
|
2294
|
-
"Log append",
|
2295
|
-
[](gpt_params &) { log_param_single_parse("--log-append"); }
|
2296
|
-
));
|
2297
|
-
add_opt(llama_arg(
|
2298
|
-
{"--log-file"}, "FNAME",
|
2299
|
-
"Log file",
|
2300
|
-
[](gpt_params &, const std::string & value) { log_param_pair_parse(false, "--log-file", value); }
|
2301
|
-
));
|
2302
|
-
#endif // LOG_DISABLE_LOGS
|
2303
|
-
|
2304
|
-
return options;
|
383
|
+
|
384
|
+
LOG_INF("build: %d (%s) with %s for %s%s\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT, LLAMA_COMPILER, LLAMA_BUILD_TARGET, build_type);
|
2305
385
|
}
|
2306
386
|
|
2307
387
|
std::string gpt_params_get_system_info(const gpt_params & params) {
|
@@ -2384,6 +464,94 @@ void string_replace_all(std::string & s, const std::string & search, const std::
|
|
2384
464
|
s = std::move(builder);
|
2385
465
|
}
|
2386
466
|
|
467
|
+
// Render a boolean as the literal text "true" or "false".
std::string string_from(bool value) {
    if (value) {
        return "true";
    }
    return "false";
}
|
470
|
+
|
471
|
+
// Format a list of integers as "[ a, b, c ]".
// Elements are separated by ", "; an empty list produces "[  ]".
std::string string_from(const std::vector<int> & values) {
    std::stringstream out;
    out << "[ ";

    for (size_t idx = 0; idx < values.size(); ++idx) {
        // the separator goes in front of every element except the first
        if (idx != 0) {
            out << ", ";
        }
        out << std::to_string(values[idx]);
    }

    out << " ]";
    return out.str();
}
|
488
|
+
|
489
|
+
std::string string_from(const struct llama_context * ctx, const std::vector<llama_token> & tokens) {
|
490
|
+
std::stringstream buf;
|
491
|
+
|
492
|
+
buf << "[ ";
|
493
|
+
|
494
|
+
bool first = true;
|
495
|
+
for (const auto & token : tokens) {
|
496
|
+
if (!first) {
|
497
|
+
buf << ", ";
|
498
|
+
} else {
|
499
|
+
first = false;
|
500
|
+
}
|
501
|
+
|
502
|
+
auto detokenized = llama_token_to_piece(ctx, token);
|
503
|
+
|
504
|
+
detokenized.erase(
|
505
|
+
std::remove_if(
|
506
|
+
detokenized.begin(),
|
507
|
+
detokenized.end(),
|
508
|
+
[](const unsigned char c) { return !std::isprint(c); }),
|
509
|
+
detokenized.end());
|
510
|
+
|
511
|
+
buf << "'" << detokenized << "'"
|
512
|
+
<< ":" << std::to_string(token);
|
513
|
+
}
|
514
|
+
|
515
|
+
buf << " ]";
|
516
|
+
|
517
|
+
return buf.str();
|
518
|
+
}
|
519
|
+
|
520
|
+
std::string string_from(const struct llama_context * ctx, const struct llama_batch & batch) {
|
521
|
+
std::stringstream buf;
|
522
|
+
|
523
|
+
buf << "[ ";
|
524
|
+
|
525
|
+
bool first = true;
|
526
|
+
for (int i = 0; i < batch.n_tokens; ++i) {
|
527
|
+
if (!first) {
|
528
|
+
buf << ", ";
|
529
|
+
} else {
|
530
|
+
first = false;
|
531
|
+
}
|
532
|
+
|
533
|
+
auto detokenized = llama_token_to_piece(ctx, batch.token[i]);
|
534
|
+
|
535
|
+
detokenized.erase(
|
536
|
+
std::remove_if(
|
537
|
+
detokenized.begin(),
|
538
|
+
detokenized.end(),
|
539
|
+
[](const unsigned char c) { return !std::isprint(c); }),
|
540
|
+
detokenized.end());
|
541
|
+
|
542
|
+
buf << "\n" << std::to_string(i)
|
543
|
+
<< ":token '" << detokenized << "'"
|
544
|
+
<< ":pos " << std::to_string(batch.pos[i])
|
545
|
+
<< ":n_seq_id " << std::to_string(batch.n_seq_id[i])
|
546
|
+
<< ":seq_id " << std::to_string(batch.seq_id[i][0])
|
547
|
+
<< ":logits " << std::to_string(batch.logits[i]);
|
548
|
+
}
|
549
|
+
|
550
|
+
buf << " ]";
|
551
|
+
|
552
|
+
return buf.str();
|
553
|
+
}
|
554
|
+
|
2387
555
|
void string_process_escapes(std::string & input) {
|
2388
556
|
std::size_t input_len = input.length();
|
2389
557
|
std::size_t output_idx = 0;
|
@@ -2424,7 +592,7 @@ void string_process_escapes(std::string & input) {
|
|
2424
592
|
bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides) {
|
2425
593
|
const char * sep = strchr(data, '=');
|
2426
594
|
if (sep == nullptr || sep - data >= 128) {
|
2427
|
-
|
595
|
+
LOG_ERR("%s: malformed KV override '%s'\n", __func__, data);
|
2428
596
|
return false;
|
2429
597
|
}
|
2430
598
|
llama_model_kv_override kvo;
|
@@ -2447,20 +615,20 @@ bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_over
|
|
2447
615
|
} else if (std::strcmp(sep, "false") == 0) {
|
2448
616
|
kvo.val_bool = false;
|
2449
617
|
} else {
|
2450
|
-
|
618
|
+
LOG_ERR("%s: invalid boolean value for KV override '%s'\n", __func__, data);
|
2451
619
|
return false;
|
2452
620
|
}
|
2453
621
|
} else if (strncmp(sep, "str:", 4) == 0) {
|
2454
622
|
sep += 4;
|
2455
623
|
kvo.tag = LLAMA_KV_OVERRIDE_TYPE_STR;
|
2456
624
|
if (strlen(sep) > 127) {
|
2457
|
-
|
625
|
+
LOG_ERR("%s: malformed KV override '%s', value cannot exceed 127 chars\n", __func__, data);
|
2458
626
|
return false;
|
2459
627
|
}
|
2460
628
|
strncpy(kvo.val_str, sep, 127);
|
2461
629
|
kvo.val_str[127] = '\0';
|
2462
630
|
} else {
|
2463
|
-
|
631
|
+
LOG_ERR("%s: invalid type for KV override '%s'\n", __func__, data);
|
2464
632
|
return false;
|
2465
633
|
}
|
2466
634
|
overrides.emplace_back(std::move(kvo));
|
@@ -2672,7 +840,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
|
|
2672
840
|
}
|
2673
841
|
|
2674
842
|
if (model == NULL) {
|
2675
|
-
|
843
|
+
LOG_ERR("%s: failed to load model '%s'\n", __func__, params.model.c_str());
|
2676
844
|
return iparams;
|
2677
845
|
}
|
2678
846
|
|
@@ -2680,7 +848,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
|
|
2680
848
|
|
2681
849
|
llama_context * lctx = llama_new_context_with_model(model, cparams);
|
2682
850
|
if (lctx == NULL) {
|
2683
|
-
|
851
|
+
LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.c_str());
|
2684
852
|
llama_free_model(model);
|
2685
853
|
return iparams;
|
2686
854
|
}
|
@@ -2716,7 +884,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
|
|
2716
884
|
loaded_la.scale = la.scale;
|
2717
885
|
loaded_la.adapter = llama_lora_adapter_init(model, la.path.c_str());
|
2718
886
|
if (loaded_la.adapter == nullptr) {
|
2719
|
-
|
887
|
+
LOG_ERR("%s: failed to apply lora adapter '%s'\n", __func__, la.path.c_str());
|
2720
888
|
llama_free(lctx);
|
2721
889
|
llama_free_model(model);
|
2722
890
|
return iparams;
|
@@ -2728,12 +896,12 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
|
|
2728
896
|
}
|
2729
897
|
|
2730
898
|
if (params.sparams.ignore_eos && llama_token_eos(model) == -1) {
|
2731
|
-
|
899
|
+
LOG_WRN("%s: warning: model does not have an EOS token, ignoring --ignore-eos\n", __func__);
|
2732
900
|
params.sparams.ignore_eos = false;
|
2733
901
|
}
|
2734
902
|
|
2735
903
|
if (params.warmup) {
|
2736
|
-
|
904
|
+
LOG_WRN("%s: warming up the model with an empty run - please wait ... (--no-warmup to disable)\n", __func__);
|
2737
905
|
|
2738
906
|
std::vector<llama_token> tmp;
|
2739
907
|
llama_token bos = llama_token_bos(model);
|
@@ -2763,7 +931,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
|
|
2763
931
|
}
|
2764
932
|
llama_kv_cache_clear(lctx);
|
2765
933
|
llama_synchronize(lctx);
|
2766
|
-
|
934
|
+
llama_perf_context_reset(lctx);
|
2767
935
|
}
|
2768
936
|
|
2769
937
|
iparams.model = model;
|
@@ -2860,6 +1028,7 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
|
|
2860
1028
|
cparams.cb_eval_user_data = params.cb_eval_user_data;
|
2861
1029
|
cparams.offload_kqv = !params.no_kv_offload;
|
2862
1030
|
cparams.flash_attn = params.flash_attn;
|
1031
|
+
cparams.no_perf = params.no_perf;
|
2863
1032
|
|
2864
1033
|
cparams.type_k = kv_cache_type_from_str(params.cache_type_k);
|
2865
1034
|
cparams.type_v = kv_cache_type_from_str(params.cache_type_v);
|
@@ -2885,17 +1054,44 @@ struct lm_ggml_threadpool_params lm_ggml_threadpool_params_from_cpu_params(const
|
|
2885
1054
|
|
2886
1055
|
#ifdef LLAMA_USE_CURL
|
2887
1056
|
|
1057
|
+
#define CURL_MAX_RETRY 3
|
1058
|
+
#define CURL_RETRY_DELAY_SECONDS 2
|
1059
|
+
|
1060
|
+
|
2888
1061
|
static bool starts_with(const std::string & str, const std::string & prefix) {
    // Stand-in until C++20's std::string::starts_with can be used:
    // compare the leading prefix.size() characters of str against prefix.
    return str.compare(0, prefix.size(), prefix) == 0;
}
|
2892
1065
|
|
1066
|
+
static bool curl_perform_with_retry(const std::string& url, CURL* curl, int max_attempts, int retry_delay_seconds) {
|
1067
|
+
int remaining_attempts = max_attempts;
|
1068
|
+
|
1069
|
+
while (remaining_attempts > 0) {
|
1070
|
+
LOG_INF("%s: Trying to download from %s (attempt %d of %d)...\n", __func__ , url.c_str(), max_attempts - remaining_attempts + 1, max_attempts);
|
1071
|
+
|
1072
|
+
CURLcode res = curl_easy_perform(curl);
|
1073
|
+
if (res == CURLE_OK) {
|
1074
|
+
return true;
|
1075
|
+
}
|
1076
|
+
|
1077
|
+
int exponential_backoff_delay = std::pow(retry_delay_seconds, max_attempts - remaining_attempts) * 1000;
|
1078
|
+
LOG_WRN("%s: curl_easy_perform() failed: %s, retrying after %d milliseconds...\n", __func__, curl_easy_strerror(res), exponential_backoff_delay);
|
1079
|
+
|
1080
|
+
remaining_attempts--;
|
1081
|
+
std::this_thread::sleep_for(std::chrono::milliseconds(exponential_backoff_delay));
|
1082
|
+
}
|
1083
|
+
|
1084
|
+
LOG_ERR("%s: curl_easy_perform() failed after %d attempts\n", __func__, max_attempts);
|
1085
|
+
|
1086
|
+
return false;
|
1087
|
+
}
|
1088
|
+
|
2893
1089
|
static bool llama_download_file(const std::string & url, const std::string & path, const std::string & hf_token) {
|
2894
1090
|
|
2895
1091
|
// Initialize libcurl
|
2896
1092
|
std::unique_ptr<CURL, decltype(&curl_easy_cleanup)> curl(curl_easy_init(), &curl_easy_cleanup);
|
2897
1093
|
if (!curl) {
|
2898
|
-
|
1094
|
+
LOG_ERR("%s: error initializing libcurl\n", __func__);
|
2899
1095
|
return false;
|
2900
1096
|
}
|
2901
1097
|
|
@@ -2936,11 +1132,11 @@ static bool llama_download_file(const std::string & url, const std::string & pat
|
|
2936
1132
|
if (metadata_in.good()) {
|
2937
1133
|
try {
|
2938
1134
|
metadata_in >> metadata;
|
2939
|
-
|
1135
|
+
LOG_INF("%s: previous metadata file found %s: %s\n", __func__, metadata_path.c_str(), metadata.dump().c_str());
|
2940
1136
|
if (metadata.contains("url") && metadata.at("url").is_string()) {
|
2941
1137
|
auto previous_url = metadata.at("url").get<std::string>();
|
2942
1138
|
if (previous_url != url) {
|
2943
|
-
|
1139
|
+
LOG_ERR("%s: Model URL mismatch: %s != %s\n", __func__, url.c_str(), previous_url.c_str());
|
2944
1140
|
return false;
|
2945
1141
|
}
|
2946
1142
|
}
|
@@ -2951,12 +1147,12 @@ static bool llama_download_file(const std::string & url, const std::string & pat
|
|
2951
1147
|
last_modified = metadata.at("lastModified");
|
2952
1148
|
}
|
2953
1149
|
} catch (const nlohmann::json::exception & e) {
|
2954
|
-
|
1150
|
+
LOG_ERR("%s: error reading metadata file %s: %s\n", __func__, metadata_path.c_str(), e.what());
|
2955
1151
|
return false;
|
2956
1152
|
}
|
2957
1153
|
}
|
2958
1154
|
} else {
|
2959
|
-
|
1155
|
+
LOG_INF("%s: no previous model file found %s\n", __func__, path.c_str());
|
2960
1156
|
}
|
2961
1157
|
|
2962
1158
|
// Send a HEAD request to retrieve the etag and last-modified headers
|
@@ -2993,9 +1189,8 @@ static bool llama_download_file(const std::string & url, const std::string & pat
|
|
2993
1189
|
curl_easy_setopt(curl.get(), CURLOPT_HEADERFUNCTION, static_cast<CURLOPT_HEADERFUNCTION_PTR>(header_callback));
|
2994
1190
|
curl_easy_setopt(curl.get(), CURLOPT_HEADERDATA, &headers);
|
2995
1191
|
|
2996
|
-
|
2997
|
-
if (
|
2998
|
-
fprintf(stderr, "%s: curl_easy_perform() failed: %s\n", __func__, curl_easy_strerror(res));
|
1192
|
+
bool was_perform_successful = curl_perform_with_retry(url, curl.get(), CURL_MAX_RETRY, CURL_RETRY_DELAY_SECONDS);
|
1193
|
+
if (!was_perform_successful) {
|
2999
1194
|
return false;
|
3000
1195
|
}
|
3001
1196
|
|
@@ -3005,26 +1200,26 @@ static bool llama_download_file(const std::string & url, const std::string & pat
|
|
3005
1200
|
// HEAD not supported, we don't know if the file has changed
|
3006
1201
|
// force trigger downloading
|
3007
1202
|
force_download = true;
|
3008
|
-
|
1203
|
+
LOG_ERR("%s: HEAD invalid http status code received: %ld\n", __func__, http_code);
|
3009
1204
|
}
|
3010
1205
|
}
|
3011
1206
|
|
3012
1207
|
bool should_download = !file_exists || force_download;
|
3013
1208
|
if (!should_download) {
|
3014
1209
|
if (!etag.empty() && etag != headers.etag) {
|
3015
|
-
|
1210
|
+
LOG_WRN("%s: ETag header is different (%s != %s): triggering a new download\n", __func__, etag.c_str(), headers.etag.c_str());
|
3016
1211
|
should_download = true;
|
3017
1212
|
} else if (!last_modified.empty() && last_modified != headers.last_modified) {
|
3018
|
-
|
1213
|
+
LOG_WRN("%s: Last-Modified header is different (%s != %s): triggering a new download\n", __func__, last_modified.c_str(), headers.last_modified.c_str());
|
3019
1214
|
should_download = true;
|
3020
1215
|
}
|
3021
1216
|
}
|
3022
1217
|
if (should_download) {
|
3023
1218
|
std::string path_temporary = path + ".downloadInProgress";
|
3024
1219
|
if (file_exists) {
|
3025
|
-
|
1220
|
+
LOG_WRN("%s: deleting previous downloaded file: %s\n", __func__, path.c_str());
|
3026
1221
|
if (remove(path.c_str()) != 0) {
|
3027
|
-
|
1222
|
+
LOG_ERR("%s: unable to delete file: %s\n", __func__, path.c_str());
|
3028
1223
|
return false;
|
3029
1224
|
}
|
3030
1225
|
}
|
@@ -3039,7 +1234,7 @@ static bool llama_download_file(const std::string & url, const std::string & pat
|
|
3039
1234
|
|
3040
1235
|
std::unique_ptr<FILE, FILE_deleter> outfile(fopen(path_temporary.c_str(), "wb"));
|
3041
1236
|
if (!outfile) {
|
3042
|
-
|
1237
|
+
LOG_ERR("%s: error opening local file for writing: %s\n", __func__, path.c_str());
|
3043
1238
|
return false;
|
3044
1239
|
}
|
3045
1240
|
|
@@ -3070,18 +1265,17 @@ static bool llama_download_file(const std::string & url, const std::string & pat
|
|
3070
1265
|
};
|
3071
1266
|
|
3072
1267
|
// start the download
|
3073
|
-
|
3074
|
-
|
3075
|
-
|
3076
|
-
if (
|
3077
|
-
fprintf(stderr, "%s: curl_easy_perform() failed: %s\n", __func__, curl_easy_strerror(res));
|
1268
|
+
LOG_INF("%s: trying to download model from %s to %s (server_etag:%s, server_last_modified:%s)...\n", __func__,
|
1269
|
+
llama_download_hide_password_in_url(url).c_str(), path.c_str(), headers.etag.c_str(), headers.last_modified.c_str());
|
1270
|
+
bool was_perform_successful = curl_perform_with_retry(url, curl.get(), CURL_MAX_RETRY, CURL_RETRY_DELAY_SECONDS);
|
1271
|
+
if (!was_perform_successful) {
|
3078
1272
|
return false;
|
3079
1273
|
}
|
3080
1274
|
|
3081
1275
|
long http_code = 0;
|
3082
1276
|
curl_easy_getinfo (curl.get(), CURLINFO_RESPONSE_CODE, &http_code);
|
3083
1277
|
if (http_code < 200 || http_code >= 400) {
|
3084
|
-
|
1278
|
+
LOG_ERR("%s: invalid http status code received: %ld\n", __func__, http_code);
|
3085
1279
|
return false;
|
3086
1280
|
}
|
3087
1281
|
|
@@ -3095,10 +1289,10 @@ static bool llama_download_file(const std::string & url, const std::string & pat
|
|
3095
1289
|
{"lastModified", headers.last_modified}
|
3096
1290
|
});
|
3097
1291
|
std::ofstream(metadata_path) << metadata.dump(4);
|
3098
|
-
|
1292
|
+
LOG_INF("%s: file metadata saved: %s\n", __func__, metadata_path.c_str());
|
3099
1293
|
|
3100
1294
|
if (rename(path_temporary.c_str(), path.c_str()) != 0) {
|
3101
|
-
|
1295
|
+
LOG_ERR("%s: unable to rename file: %s to %s\n", __func__, path_temporary.c_str(), path.c_str());
|
3102
1296
|
return false;
|
3103
1297
|
}
|
3104
1298
|
}
|
@@ -3113,7 +1307,7 @@ struct llama_model * llama_load_model_from_url(
|
|
3113
1307
|
const struct llama_model_params & params) {
|
3114
1308
|
// Basic validation of the model_url
|
3115
1309
|
if (!model_url || strlen(model_url) == 0) {
|
3116
|
-
|
1310
|
+
LOG_ERR("%s: invalid model_url\n", __func__);
|
3117
1311
|
return NULL;
|
3118
1312
|
}
|
3119
1313
|
|
@@ -3130,7 +1324,7 @@ struct llama_model * llama_load_model_from_url(
|
|
3130
1324
|
};
|
3131
1325
|
auto * ctx_gguf = lm_gguf_init_from_file(path_model, lm_gguf_params);
|
3132
1326
|
if (!ctx_gguf) {
|
3133
|
-
|
1327
|
+
LOG_ERR("\n%s: failed to load input GGUF from %s\n", __func__, path_model);
|
3134
1328
|
return NULL;
|
3135
1329
|
}
|
3136
1330
|
|
@@ -3150,14 +1344,12 @@ struct llama_model * llama_load_model_from_url(
|
|
3150
1344
|
// and extract split URL and PATH prefixes
|
3151
1345
|
{
|
3152
1346
|
if (!llama_split_prefix(split_prefix, sizeof(split_prefix), path_model, 0, n_split)) {
|
3153
|
-
|
3154
|
-
" n_split=%d\n", __func__, path_model, n_split);
|
1347
|
+
LOG_ERR("\n%s: unexpected model file name: %s n_split=%d\n", __func__, path_model, n_split);
|
3155
1348
|
return NULL;
|
3156
1349
|
}
|
3157
1350
|
|
3158
1351
|
if (!llama_split_prefix(split_url_prefix, sizeof(split_url_prefix), model_url, 0, n_split)) {
|
3159
|
-
|
3160
|
-
" n_split=%d\n", __func__, model_url, n_split);
|
1352
|
+
LOG_ERR("\n%s: unexpected model url: %s n_split=%d\n", __func__, model_url, n_split);
|
3161
1353
|
return NULL;
|
3162
1354
|
}
|
3163
1355
|
}
|
@@ -3217,7 +1409,7 @@ struct llama_model * llama_load_model_from_url(
|
|
3217
1409
|
const char * /*path_model*/,
|
3218
1410
|
const char * /*hf_token*/,
|
3219
1411
|
const struct llama_model_params & /*params*/) {
|
3220
|
-
|
1412
|
+
LOG_WRN("%s: llama.cpp built without libcurl, downloading from an url not supported.\n", __func__);
|
3221
1413
|
return nullptr;
|
3222
1414
|
}
|
3223
1415
|
|
@@ -3227,7 +1419,7 @@ struct llama_model * llama_load_model_from_hf(
|
|
3227
1419
|
const char * /*path_model*/,
|
3228
1420
|
const char * /*hf_token*/,
|
3229
1421
|
const struct llama_model_params & /*params*/) {
|
3230
|
-
|
1422
|
+
LOG_WRN("%s: llama.cpp built without libcurl, downloading from Hugging Face not supported.\n", __func__);
|
3231
1423
|
return nullptr;
|
3232
1424
|
}
|
3233
1425
|
|
@@ -3555,13 +1747,13 @@ static llama_control_vector_data llama_control_vector_load_one(const llama_contr
|
|
3555
1747
|
};
|
3556
1748
|
struct lm_gguf_context * ctx_gguf = lm_gguf_init_from_file(load_info.fname.c_str(), meta_lm_gguf_params);
|
3557
1749
|
if (!ctx_gguf) {
|
3558
|
-
|
1750
|
+
LOG_ERR("%s: failed to load control vector file from %s\n", __func__, load_info.fname.c_str());
|
3559
1751
|
return result;
|
3560
1752
|
}
|
3561
1753
|
|
3562
1754
|
int32_t n_tensors = lm_gguf_get_n_tensors(ctx_gguf);
|
3563
1755
|
if (n_tensors == 0) {
|
3564
|
-
|
1756
|
+
LOG_WRN("%s: no direction tensors found in %s\n", __func__, load_info.fname.c_str());
|
3565
1757
|
}
|
3566
1758
|
|
3567
1759
|
for (int i = 0; i < n_tensors; i++) {
|
@@ -3579,23 +1771,23 @@ static llama_control_vector_data llama_control_vector_load_one(const llama_contr
|
|
3579
1771
|
}
|
3580
1772
|
}
|
3581
1773
|
if (layer_idx < 0) {
|
3582
|
-
|
1774
|
+
LOG_ERR("%s: invalid/unparsable direction tensor layer index in %s\n", __func__, load_info.fname.c_str());
|
3583
1775
|
result.n_embd = -1;
|
3584
1776
|
break;
|
3585
1777
|
} else if (layer_idx == 0) {
|
3586
|
-
|
1778
|
+
LOG_ERR("%s: invalid (zero) direction tensor layer index in %s\n", __func__, load_info.fname.c_str());
|
3587
1779
|
result.n_embd = -1;
|
3588
1780
|
break;
|
3589
1781
|
}
|
3590
1782
|
|
3591
1783
|
struct lm_ggml_tensor * tensor = lm_ggml_get_tensor(ctx, name.c_str());
|
3592
1784
|
if (tensor->type != LM_GGML_TYPE_F32) {
|
3593
|
-
|
1785
|
+
LOG_ERR("%s: invalid (non-F32) direction tensor type in %s\n", __func__, load_info.fname.c_str());
|
3594
1786
|
result.n_embd = -1;
|
3595
1787
|
break;
|
3596
1788
|
}
|
3597
1789
|
if (lm_ggml_n_dims(tensor) != 1) {
|
3598
|
-
|
1790
|
+
LOG_ERR("%s: invalid (non-1D) direction tensor shape in %s\n", __func__, load_info.fname.c_str());
|
3599
1791
|
result.n_embd = -1;
|
3600
1792
|
break;
|
3601
1793
|
}
|
@@ -3603,7 +1795,7 @@ static llama_control_vector_data llama_control_vector_load_one(const llama_contr
|
|
3603
1795
|
if (result.n_embd == -1) {
|
3604
1796
|
result.n_embd = lm_ggml_nelements(tensor);
|
3605
1797
|
} else if (lm_ggml_nelements(tensor) != result.n_embd) {
|
3606
|
-
|
1798
|
+
LOG_ERR("%s: direction tensor in %s does not match previous dimensions\n", __func__, load_info.fname.c_str());
|
3607
1799
|
result.n_embd = -1;
|
3608
1800
|
break;
|
3609
1801
|
}
|
@@ -3620,7 +1812,7 @@ static llama_control_vector_data llama_control_vector_load_one(const llama_contr
|
|
3620
1812
|
}
|
3621
1813
|
|
3622
1814
|
if (result.n_embd == -1) {
|
3623
|
-
|
1815
|
+
LOG_WRN("%s: skipping %s due to invalid direction tensors\n", __func__, load_info.fname.c_str());
|
3624
1816
|
result.data.clear();
|
3625
1817
|
}
|
3626
1818
|
|
@@ -3641,7 +1833,7 @@ llama_control_vector_data llama_control_vector_load(const std::vector<llama_cont
|
|
3641
1833
|
break;
|
3642
1834
|
}
|
3643
1835
|
if (result.n_embd != -1 && result.n_embd != cur.n_embd) {
|
3644
|
-
|
1836
|
+
LOG_ERR("%s: control vectors in %s does not match previous dimensions\n", __func__, info.fname.c_str());
|
3645
1837
|
result.n_embd = -1;
|
3646
1838
|
break;
|
3647
1839
|
}
|
@@ -3657,7 +1849,7 @@ llama_control_vector_data llama_control_vector_load(const std::vector<llama_cont
|
|
3657
1849
|
}
|
3658
1850
|
|
3659
1851
|
if (result.n_embd == -1) {
|
3660
|
-
|
1852
|
+
LOG_ERR("%s: no valid control vector files passed\n", __func__);
|
3661
1853
|
result.data.clear();
|
3662
1854
|
}
|
3663
1855
|
|
@@ -3748,6 +1940,7 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l
|
|
3748
1940
|
fprintf(stream, "cpu_has_sve: %s\n", lm_ggml_cpu_has_sve() ? "true" : "false");
|
3749
1941
|
fprintf(stream, "cpu_has_f16c: %s\n", lm_ggml_cpu_has_f16c() ? "true" : "false");
|
3750
1942
|
fprintf(stream, "cpu_has_fp16_va: %s\n", lm_ggml_cpu_has_fp16_va() ? "true" : "false");
|
1943
|
+
fprintf(stream, "cpu_has_riscv_v: %s\n", lm_ggml_cpu_has_riscv_v() ? "true" : "false");
|
3751
1944
|
fprintf(stream, "cpu_has_wasm_simd: %s\n", lm_ggml_cpu_has_wasm_simd() ? "true" : "false");
|
3752
1945
|
fprintf(stream, "cpu_has_blas: %s\n", lm_ggml_cpu_has_blas() ? "true" : "false");
|
3753
1946
|
fprintf(stream, "cpu_has_sse3: %s\n", lm_ggml_cpu_has_sse3() ? "true" : "false");
|