cui-llama.rn 1.1.2 → 1.1.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/android/src/main/CMakeLists.txt +1 -2
- package/android/src/main/jni.cpp +26 -21
- package/cpp/common.cpp +181 -1584
- package/cpp/common.h +131 -52
- package/cpp/ggml-aarch64.c +612 -0
- package/cpp/ggml-alloc.h +2 -2
- package/cpp/ggml-backend.c +33 -6
- package/cpp/ggml-backend.h +2 -0
- package/cpp/ggml-common.h +20 -0
- package/cpp/ggml-impl.h +36 -7
- package/cpp/ggml-metal.m +68 -8
- package/cpp/ggml-quants.c +932 -50
- package/cpp/ggml-quants.h +15 -0
- package/cpp/ggml.c +1712 -325
- package/cpp/ggml.h +169 -100
- package/cpp/llama-grammar.cpp +721 -122
- package/cpp/llama-grammar.h +120 -15
- package/cpp/llama-impl.h +132 -1
- package/cpp/llama-sampling.cpp +1483 -354
- package/cpp/llama-sampling.h +20 -48
- package/cpp/llama-vocab.cpp +140 -7
- package/cpp/llama-vocab.h +3 -2
- package/cpp/llama.cpp +824 -327
- package/cpp/llama.h +235 -256
- package/cpp/rn-llama.hpp +18 -14
- package/cpp/sampling.cpp +353 -354
- package/cpp/sampling.h +62 -143
- package/cpp/sgemm.cpp +153 -0
- package/package.json +1 -1
- package/cpp/grammar-parser.cpp +0 -539
- package/cpp/grammar-parser.h +0 -29
package/cpp/common.cpp
CHANGED
@@ -62,14 +62,6 @@ char const *LLAMA_BUILD_TARGET = "unknown";
|
|
62
62
|
#pragma warning(disable: 4244 4267) // possible loss of data
|
63
63
|
#endif
|
64
64
|
|
65
|
-
#if (defined(LM_GGML_USE_CUDA) || defined(LM_GGML_USE_SYCL))
|
66
|
-
#define LM_GGML_USE_CUDA_SYCL
|
67
|
-
#endif
|
68
|
-
|
69
|
-
#if (defined(LM_GGML_USE_CUDA) || defined(LM_GGML_USE_SYCL)) || defined(LM_GGML_USE_VULKAN)
|
70
|
-
#define LM_GGML_USE_CUDA_SYCL_VULKAN
|
71
|
-
#endif
|
72
|
-
|
73
65
|
#if defined(LLAMA_USE_CURL)
|
74
66
|
#ifdef __linux__
|
75
67
|
#include <linux/limits.h>
|
@@ -83,41 +75,6 @@ char const *LLAMA_BUILD_TARGET = "unknown";
|
|
83
75
|
|
84
76
|
using json = nlohmann::ordered_json;
|
85
77
|
|
86
|
-
//
|
87
|
-
// Environment variable utils
|
88
|
-
//
|
89
|
-
|
90
|
-
template<typename T>
|
91
|
-
static typename std::enable_if<std::is_same<T, std::string>::value, void>::type
|
92
|
-
get_env(std::string name, T & target) {
|
93
|
-
char * value = std::getenv(name.c_str());
|
94
|
-
target = value ? std::string(value) : target;
|
95
|
-
}
|
96
|
-
|
97
|
-
template<typename T>
|
98
|
-
static typename std::enable_if<!std::is_same<T, bool>::value && std::is_integral<T>::value, void>::type
|
99
|
-
get_env(std::string name, T & target) {
|
100
|
-
char * value = std::getenv(name.c_str());
|
101
|
-
target = value ? std::stoi(value) : target;
|
102
|
-
}
|
103
|
-
|
104
|
-
template<typename T>
|
105
|
-
static typename std::enable_if<std::is_floating_point<T>::value, void>::type
|
106
|
-
get_env(std::string name, T & target) {
|
107
|
-
char * value = std::getenv(name.c_str());
|
108
|
-
target = value ? std::stof(value) : target;
|
109
|
-
}
|
110
|
-
|
111
|
-
template<typename T>
|
112
|
-
static typename std::enable_if<std::is_same<T, bool>::value, void>::type
|
113
|
-
get_env(std::string name, T & target) {
|
114
|
-
char * value = std::getenv(name.c_str());
|
115
|
-
if (value) {
|
116
|
-
std::string val(value);
|
117
|
-
target = val == "1" || val == "true";
|
118
|
-
}
|
119
|
-
}
|
120
|
-
|
121
78
|
//
|
122
79
|
// CPU utils
|
123
80
|
//
|
@@ -257,1564 +214,165 @@ int32_t cpu_get_num_math() {
|
|
257
214
|
return cpu_get_num_physical_cores();
|
258
215
|
}
|
259
216
|
|
260
|
-
//
|
261
|
-
// CLI argument parsing
|
262
|
-
//
|
263
|
-
|
264
|
-
void gpt_params_handle_model_default(gpt_params & params) {
|
265
|
-
if (!params.hf_repo.empty()) {
|
266
|
-
// short-hand to avoid specifying --hf-file -> default it to --model
|
267
|
-
if (params.hf_file.empty()) {
|
268
|
-
if (params.model.empty()) {
|
269
|
-
throw std::invalid_argument("error: --hf-repo requires either --hf-file or --model\n");
|
270
|
-
}
|
271
|
-
params.hf_file = params.model;
|
272
|
-
} else if (params.model.empty()) {
|
273
|
-
params.model = fs_get_cache_file(string_split(params.hf_file, '/').back());
|
274
|
-
}
|
275
|
-
} else if (!params.model_url.empty()) {
|
276
|
-
if (params.model.empty()) {
|
277
|
-
auto f = string_split(params.model_url, '#').front();
|
278
|
-
f = string_split(f, '?').front();
|
279
|
-
params.model = fs_get_cache_file(string_split(f, '/').back());
|
280
|
-
}
|
281
|
-
} else if (params.model.empty()) {
|
282
|
-
params.model = DEFAULT_MODEL_PATH;
|
283
|
-
}
|
284
|
-
}
|
285
|
-
|
286
|
-
bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
|
287
|
-
bool invalid_param = false;
|
288
|
-
std::string arg;
|
289
|
-
const std::string arg_prefix = "--";
|
290
|
-
llama_sampling_params & sparams = params.sparams;
|
291
|
-
|
292
|
-
for (int i = 1; i < argc; i++) {
|
293
|
-
arg = argv[i];
|
294
|
-
if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
|
295
|
-
std::replace(arg.begin(), arg.end(), '_', '-');
|
296
|
-
}
|
297
|
-
if (!gpt_params_find_arg(argc, argv, arg, params, i, invalid_param)) {
|
298
|
-
throw std::invalid_argument("error: unknown argument: " + arg);
|
299
|
-
}
|
300
|
-
if (invalid_param) {
|
301
|
-
throw std::invalid_argument("error: invalid parameter for argument: " + arg);
|
302
|
-
}
|
303
|
-
}
|
304
|
-
|
305
|
-
if (params.prompt_cache_all && (params.interactive || params.interactive_first)) {
|
306
|
-
throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n");
|
307
|
-
}
|
217
|
+
// Helper for setting process priority
|
308
218
|
|
309
|
-
|
219
|
+
#if defined(_WIN32)
|
310
220
|
|
311
|
-
|
312
|
-
|
221
|
+
bool set_process_priority(enum lm_ggml_sched_priority prio) {
|
222
|
+
if (prio == LM_GGML_SCHED_PRIO_NORMAL) {
|
223
|
+
return true;
|
313
224
|
}
|
314
225
|
|
315
|
-
|
316
|
-
|
317
|
-
|
318
|
-
|
319
|
-
|
320
|
-
|
321
|
-
string_process_escapes(antiprompt);
|
322
|
-
}
|
226
|
+
DWORD p = NORMAL_PRIORITY_CLASS;
|
227
|
+
switch (prio) {
|
228
|
+
case LM_GGML_SCHED_PRIO_NORMAL: p = NORMAL_PRIORITY_CLASS; break;
|
229
|
+
case LM_GGML_SCHED_PRIO_MEDIUM: p = ABOVE_NORMAL_PRIORITY_CLASS; break;
|
230
|
+
case LM_GGML_SCHED_PRIO_HIGH: p = HIGH_PRIORITY_CLASS; break;
|
231
|
+
case LM_GGML_SCHED_PRIO_REALTIME: p = REALTIME_PRIORITY_CLASS; break;
|
323
232
|
}
|
324
233
|
|
325
|
-
if (!
|
326
|
-
|
327
|
-
|
234
|
+
if (!SetPriorityClass(GetCurrentProcess(), p)) {
|
235
|
+
fprintf(stderr, "warn: failed to set process priority class %d : (%d)\n", prio, (int) GetLastError());
|
236
|
+
return false;
|
328
237
|
}
|
329
238
|
|
330
239
|
return true;
|
331
240
|
}
|
332
241
|
|
333
|
-
|
334
|
-
|
335
|
-
|
336
|
-
get_env("LLAMA_ARG_MODEL_URL", params.model_url);
|
337
|
-
get_env("LLAMA_ARG_MODEL_ALIAS", params.model_alias);
|
338
|
-
get_env("LLAMA_ARG_HF_REPO", params.hf_repo);
|
339
|
-
get_env("LLAMA_ARG_HF_FILE", params.hf_file);
|
340
|
-
get_env("LLAMA_ARG_THREADS", params.n_threads);
|
341
|
-
get_env("LLAMA_ARG_CTX_SIZE", params.n_ctx);
|
342
|
-
get_env("LLAMA_ARG_N_PARALLEL", params.n_parallel);
|
343
|
-
get_env("LLAMA_ARG_BATCH", params.n_batch);
|
344
|
-
get_env("LLAMA_ARG_UBATCH", params.n_ubatch);
|
345
|
-
get_env("LLAMA_ARG_N_GPU_LAYERS", params.n_gpu_layers);
|
346
|
-
get_env("LLAMA_ARG_THREADS_HTTP", params.n_threads_http);
|
347
|
-
get_env("LLAMA_ARG_CHAT_TEMPLATE", params.chat_template);
|
348
|
-
get_env("LLAMA_ARG_N_PREDICT", params.n_predict);
|
349
|
-
get_env("LLAMA_ARG_ENDPOINT_METRICS", params.endpoint_metrics);
|
350
|
-
get_env("LLAMA_ARG_ENDPOINT_SLOTS", params.endpoint_slots);
|
351
|
-
get_env("LLAMA_ARG_EMBEDDINGS", params.embedding);
|
352
|
-
get_env("LLAMA_ARG_FLASH_ATTN", params.flash_attn);
|
353
|
-
get_env("LLAMA_ARG_DEFRAG_THOLD", params.defrag_thold);
|
354
|
-
get_env("LLAMA_ARG_CONT_BATCHING", params.cont_batching);
|
355
|
-
get_env("LLAMA_ARG_HOST", params.hostname);
|
356
|
-
get_env("LLAMA_ARG_PORT", params.port);
|
357
|
-
}
|
242
|
+
#else // MacOS and POSIX
|
243
|
+
#include <sys/types.h>
|
244
|
+
#include <sys/resource.h>
|
358
245
|
|
359
|
-
bool
|
360
|
-
|
246
|
+
bool set_process_priority(enum lm_ggml_sched_priority prio) {
|
247
|
+
if (prio == LM_GGML_SCHED_PRIO_NORMAL) {
|
248
|
+
return true;
|
249
|
+
}
|
361
250
|
|
362
|
-
|
363
|
-
|
364
|
-
|
365
|
-
|
366
|
-
|
367
|
-
|
368
|
-
} catch (const std::invalid_argument & ex) {
|
369
|
-
fprintf(stderr, "%s\n", ex.what());
|
370
|
-
params = params_org;
|
371
|
-
return false;
|
251
|
+
int p = 0;
|
252
|
+
switch (prio) {
|
253
|
+
case LM_GGML_SCHED_PRIO_NORMAL: p = 0; break;
|
254
|
+
case LM_GGML_SCHED_PRIO_MEDIUM: p = -5; break;
|
255
|
+
case LM_GGML_SCHED_PRIO_HIGH: p = -10; break;
|
256
|
+
case LM_GGML_SCHED_PRIO_REALTIME: p = -20; break;
|
372
257
|
}
|
373
258
|
|
259
|
+
if (!setpriority(PRIO_PROCESS, 0, p)) {
|
260
|
+
fprintf(stderr, "warn: failed to set process priority %d : %s (%d)\n", prio, strerror(errno), errno);
|
261
|
+
return false;
|
262
|
+
}
|
374
263
|
return true;
|
375
264
|
}
|
376
265
|
|
377
|
-
#
|
266
|
+
#endif
|
378
267
|
|
379
|
-
|
380
|
-
|
268
|
+
//
|
269
|
+
// CLI argument parsing
|
270
|
+
//
|
381
271
|
|
382
|
-
llama_sampling_params & sparams = params.sparams;
|
383
272
|
|
384
|
-
|
385
|
-
|
386
|
-
// TODO: this is temporary, in the future the sampling state will be moved fully to llama_sampling_context.
|
387
|
-
params.seed = std::stoul(argv[i]);
|
388
|
-
sparams.seed = std::stoul(argv[i]);
|
389
|
-
return true;
|
390
|
-
}
|
391
|
-
if (arg == "-t" || arg == "--threads") {
|
392
|
-
CHECK_ARG
|
393
|
-
params.n_threads = std::stoi(argv[i]);
|
394
|
-
if (params.n_threads <= 0) {
|
395
|
-
params.n_threads = std::thread::hardware_concurrency();
|
396
|
-
}
|
397
|
-
return true;
|
398
|
-
}
|
399
|
-
if (arg == "-tb" || arg == "--threads-batch") {
|
400
|
-
CHECK_ARG
|
401
|
-
params.n_threads_batch = std::stoi(argv[i]);
|
402
|
-
if (params.n_threads_batch <= 0) {
|
403
|
-
params.n_threads_batch = std::thread::hardware_concurrency();
|
404
|
-
}
|
405
|
-
return true;
|
406
|
-
}
|
407
|
-
if (arg == "-td" || arg == "--threads-draft") {
|
408
|
-
CHECK_ARG
|
409
|
-
params.n_threads_draft = std::stoi(argv[i]);
|
410
|
-
if (params.n_threads_draft <= 0) {
|
411
|
-
params.n_threads_draft = std::thread::hardware_concurrency();
|
412
|
-
}
|
413
|
-
return true;
|
414
|
-
}
|
415
|
-
if (arg == "-tbd" || arg == "--threads-batch-draft") {
|
416
|
-
CHECK_ARG
|
417
|
-
params.n_threads_batch_draft = std::stoi(argv[i]);
|
418
|
-
if (params.n_threads_batch_draft <= 0) {
|
419
|
-
params.n_threads_batch_draft = std::thread::hardware_concurrency();
|
420
|
-
}
|
421
|
-
return true;
|
422
|
-
}
|
423
|
-
if (arg == "-p" || arg == "--prompt") {
|
424
|
-
CHECK_ARG
|
425
|
-
params.prompt = argv[i];
|
426
|
-
return true;
|
427
|
-
}
|
428
|
-
if (arg == "-e" || arg == "--escape") {
|
429
|
-
params.escape = true;
|
430
|
-
return true;
|
431
|
-
}
|
432
|
-
if (arg == "--no-escape") {
|
433
|
-
params.escape = false;
|
434
|
-
return true;
|
435
|
-
}
|
436
|
-
if (arg == "--prompt-cache") {
|
437
|
-
CHECK_ARG
|
438
|
-
params.path_prompt_cache = argv[i];
|
439
|
-
return true;
|
440
|
-
}
|
441
|
-
if (arg == "--prompt-cache-all") {
|
442
|
-
params.prompt_cache_all = true;
|
443
|
-
return true;
|
444
|
-
}
|
445
|
-
if (arg == "--prompt-cache-ro") {
|
446
|
-
params.prompt_cache_ro = true;
|
447
|
-
return true;
|
448
|
-
}
|
449
|
-
if (arg == "-bf" || arg == "--binary-file") {
|
450
|
-
CHECK_ARG
|
451
|
-
std::ifstream file(argv[i], std::ios::binary);
|
452
|
-
if (!file) {
|
453
|
-
fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
|
454
|
-
invalid_param = true;
|
455
|
-
return true;
|
456
|
-
}
|
457
|
-
// store the external file name in params
|
458
|
-
params.prompt_file = argv[i];
|
459
|
-
std::ostringstream ss;
|
460
|
-
ss << file.rdbuf();
|
461
|
-
params.prompt = ss.str();
|
462
|
-
fprintf(stderr, "Read %zu bytes from binary file %s\n", params.prompt.size(), argv[i]);
|
463
|
-
return true;
|
464
|
-
}
|
465
|
-
if (arg == "-f" || arg == "--file") {
|
466
|
-
CHECK_ARG
|
467
|
-
std::ifstream file(argv[i]);
|
468
|
-
if (!file) {
|
469
|
-
fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
|
470
|
-
invalid_param = true;
|
471
|
-
return true;
|
472
|
-
}
|
473
|
-
// store the external file name in params
|
474
|
-
params.prompt_file = argv[i];
|
475
|
-
std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.prompt));
|
476
|
-
if (!params.prompt.empty() && params.prompt.back() == '\n') {
|
477
|
-
params.prompt.pop_back();
|
478
|
-
}
|
479
|
-
return true;
|
480
|
-
}
|
481
|
-
if (arg == "--in-file") {
|
482
|
-
CHECK_ARG
|
483
|
-
std::ifstream file(argv[i]);
|
484
|
-
if (!file) {
|
485
|
-
fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
|
486
|
-
invalid_param = true;
|
487
|
-
return true;
|
488
|
-
}
|
489
|
-
params.in_files.push_back(argv[i]);
|
490
|
-
return true;
|
491
|
-
}
|
492
|
-
if (arg == "-n" || arg == "--predict" || arg == "--n-predict") {
|
493
|
-
CHECK_ARG
|
494
|
-
params.n_predict = std::stoi(argv[i]);
|
495
|
-
return true;
|
496
|
-
}
|
497
|
-
if (arg == "--top-k") {
|
498
|
-
CHECK_ARG
|
499
|
-
sparams.top_k = std::stoi(argv[i]);
|
500
|
-
return true;
|
501
|
-
}
|
502
|
-
if (arg == "-c" || arg == "--ctx-size") {
|
503
|
-
CHECK_ARG
|
504
|
-
params.n_ctx = std::stoi(argv[i]);
|
505
|
-
return true;
|
506
|
-
}
|
507
|
-
if (arg == "--grp-attn-n" || arg == "-gan") {
|
508
|
-
CHECK_ARG
|
509
|
-
params.grp_attn_n = std::stoi(argv[i]);
|
510
|
-
return true;
|
511
|
-
}
|
512
|
-
if (arg == "--grp-attn-w" || arg == "-gaw") {
|
513
|
-
CHECK_ARG
|
514
|
-
params.grp_attn_w = std::stoi(argv[i]);
|
515
|
-
return true;
|
516
|
-
}
|
517
|
-
if (arg == "--rope-freq-base") {
|
518
|
-
CHECK_ARG
|
519
|
-
params.rope_freq_base = std::stof(argv[i]);
|
520
|
-
return true;
|
521
|
-
}
|
522
|
-
if (arg == "--rope-freq-scale") {
|
523
|
-
CHECK_ARG
|
524
|
-
params.rope_freq_scale = std::stof(argv[i]);
|
525
|
-
return true;
|
526
|
-
}
|
527
|
-
if (arg == "--rope-scaling") {
|
528
|
-
CHECK_ARG
|
529
|
-
std::string value(argv[i]);
|
530
|
-
/**/ if (value == "none") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_NONE; }
|
531
|
-
else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_LINEAR; }
|
532
|
-
else if (value == "yarn") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_YARN; }
|
533
|
-
else { invalid_param = true; }
|
534
|
-
return true;
|
535
|
-
}
|
536
|
-
if (arg == "--rope-scale") {
|
537
|
-
CHECK_ARG
|
538
|
-
params.rope_freq_scale = 1.0f / std::stof(argv[i]);
|
539
|
-
return true;
|
540
|
-
}
|
541
|
-
if (arg == "--yarn-orig-ctx") {
|
542
|
-
CHECK_ARG
|
543
|
-
params.yarn_orig_ctx = std::stoi(argv[i]);
|
544
|
-
return true;
|
545
|
-
}
|
546
|
-
if (arg == "--yarn-ext-factor") {
|
547
|
-
CHECK_ARG
|
548
|
-
params.yarn_ext_factor = std::stof(argv[i]);
|
549
|
-
return true;
|
550
|
-
}
|
551
|
-
if (arg == "--yarn-attn-factor") {
|
552
|
-
CHECK_ARG
|
553
|
-
params.yarn_attn_factor = std::stof(argv[i]);
|
554
|
-
return true;
|
555
|
-
}
|
556
|
-
if (arg == "--yarn-beta-fast") {
|
557
|
-
CHECK_ARG
|
558
|
-
params.yarn_beta_fast = std::stof(argv[i]);
|
559
|
-
return true;
|
560
|
-
}
|
561
|
-
if (arg == "--yarn-beta-slow") {
|
562
|
-
CHECK_ARG
|
563
|
-
params.yarn_beta_slow = std::stof(argv[i]);
|
564
|
-
return true;
|
565
|
-
}
|
566
|
-
if (arg == "--pooling") {
|
567
|
-
CHECK_ARG
|
568
|
-
std::string value(argv[i]);
|
569
|
-
/**/ if (value == "none") { params.pooling_type = LLAMA_POOLING_TYPE_NONE; }
|
570
|
-
else if (value == "mean") { params.pooling_type = LLAMA_POOLING_TYPE_MEAN; }
|
571
|
-
else if (value == "cls") { params.pooling_type = LLAMA_POOLING_TYPE_CLS; }
|
572
|
-
else if (value == "last") { params.pooling_type = LLAMA_POOLING_TYPE_LAST; }
|
573
|
-
else { invalid_param = true; }
|
574
|
-
return true;
|
575
|
-
}
|
576
|
-
if (arg == "--attention") {
|
577
|
-
CHECK_ARG
|
578
|
-
std::string value(argv[i]);
|
579
|
-
/**/ if (value == "causal") { params.attention_type = LLAMA_ATTENTION_TYPE_CAUSAL; }
|
580
|
-
else if (value == "non-causal") { params.attention_type = LLAMA_ATTENTION_TYPE_NON_CAUSAL; }
|
581
|
-
else { invalid_param = true; }
|
582
|
-
return true;
|
583
|
-
}
|
584
|
-
if (arg == "--defrag-thold" || arg == "-dt") {
|
585
|
-
CHECK_ARG
|
586
|
-
params.defrag_thold = std::stof(argv[i]);
|
587
|
-
return true;
|
588
|
-
}
|
589
|
-
if (arg == "--samplers") {
|
590
|
-
CHECK_ARG
|
591
|
-
const auto sampler_names = string_split(argv[i], ';');
|
592
|
-
sparams.samplers_sequence = llama_sampling_types_from_names(sampler_names, true);
|
593
|
-
return true;
|
594
|
-
}
|
595
|
-
if (arg == "--sampling-seq") {
|
596
|
-
CHECK_ARG
|
597
|
-
sparams.samplers_sequence = llama_sampling_types_from_chars(argv[i]);
|
598
|
-
return true;
|
599
|
-
}
|
600
|
-
if (arg == "--top-p") {
|
601
|
-
CHECK_ARG
|
602
|
-
sparams.top_p = std::stof(argv[i]);
|
603
|
-
return true;
|
604
|
-
}
|
605
|
-
if (arg == "--min-p") {
|
606
|
-
CHECK_ARG
|
607
|
-
sparams.min_p = std::stof(argv[i]);
|
608
|
-
return true;
|
609
|
-
}
|
610
|
-
if (arg == "--temp") {
|
611
|
-
CHECK_ARG
|
612
|
-
sparams.temp = std::stof(argv[i]);
|
613
|
-
sparams.temp = std::max(sparams.temp, 0.0f);
|
614
|
-
return true;
|
615
|
-
}
|
616
|
-
if (arg == "--tfs") {
|
617
|
-
CHECK_ARG
|
618
|
-
sparams.tfs_z = std::stof(argv[i]);
|
619
|
-
return true;
|
620
|
-
}
|
621
|
-
if (arg == "--typical") {
|
622
|
-
CHECK_ARG
|
623
|
-
sparams.typical_p = std::stof(argv[i]);
|
624
|
-
return true;
|
625
|
-
}
|
626
|
-
if (arg == "--repeat-last-n") {
|
627
|
-
CHECK_ARG
|
628
|
-
sparams.penalty_last_n = std::stoi(argv[i]);
|
629
|
-
sparams.n_prev = std::max(sparams.n_prev, sparams.penalty_last_n);
|
630
|
-
return true;
|
631
|
-
}
|
632
|
-
if (arg == "--repeat-penalty") {
|
633
|
-
CHECK_ARG
|
634
|
-
sparams.penalty_repeat = std::stof(argv[i]);
|
635
|
-
return true;
|
636
|
-
}
|
637
|
-
if (arg == "--frequency-penalty") {
|
638
|
-
CHECK_ARG
|
639
|
-
sparams.penalty_freq = std::stof(argv[i]);
|
640
|
-
return true;
|
641
|
-
}
|
642
|
-
if (arg == "--presence-penalty") {
|
643
|
-
CHECK_ARG
|
644
|
-
sparams.penalty_present = std::stof(argv[i]);
|
645
|
-
return true;
|
646
|
-
}
|
647
|
-
if (arg == "--dynatemp-range") {
|
648
|
-
CHECK_ARG
|
649
|
-
sparams.dynatemp_range = std::stof(argv[i]);
|
650
|
-
return true;
|
651
|
-
}
|
652
|
-
if (arg == "--dynatemp-exp") {
|
653
|
-
CHECK_ARG
|
654
|
-
sparams.dynatemp_exponent = std::stof(argv[i]);
|
655
|
-
return true;
|
656
|
-
}
|
657
|
-
if (arg == "--mirostat") {
|
658
|
-
CHECK_ARG
|
659
|
-
sparams.mirostat = std::stoi(argv[i]);
|
660
|
-
return true;
|
661
|
-
}
|
662
|
-
if (arg == "--mirostat-lr") {
|
663
|
-
CHECK_ARG
|
664
|
-
sparams.mirostat_eta = std::stof(argv[i]);
|
665
|
-
return true;
|
666
|
-
}
|
667
|
-
if (arg == "--mirostat-ent") {
|
668
|
-
CHECK_ARG
|
669
|
-
sparams.mirostat_tau = std::stof(argv[i]);
|
670
|
-
return true;
|
671
|
-
}
|
672
|
-
if (arg == "--cfg-negative-prompt") {
|
673
|
-
CHECK_ARG
|
674
|
-
sparams.cfg_negative_prompt = argv[i];
|
675
|
-
return true;
|
676
|
-
}
|
677
|
-
if (arg == "--cfg-negative-prompt-file") {
|
678
|
-
CHECK_ARG
|
679
|
-
std::ifstream file(argv[i]);
|
680
|
-
if (!file) {
|
681
|
-
fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
|
682
|
-
invalid_param = true;
|
683
|
-
return true;
|
684
|
-
}
|
685
|
-
std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(sparams.cfg_negative_prompt));
|
686
|
-
if (!sparams.cfg_negative_prompt.empty() && sparams.cfg_negative_prompt.back() == '\n') {
|
687
|
-
sparams.cfg_negative_prompt.pop_back();
|
688
|
-
}
|
689
|
-
return true;
|
690
|
-
}
|
691
|
-
if (arg == "--cfg-scale") {
|
692
|
-
CHECK_ARG
|
693
|
-
sparams.cfg_scale = std::stof(argv[i]);
|
694
|
-
return true;
|
695
|
-
}
|
696
|
-
if (arg == "-b" || arg == "--batch-size") {
|
697
|
-
CHECK_ARG
|
698
|
-
params.n_batch = std::stoi(argv[i]);
|
699
|
-
return true;
|
700
|
-
}
|
701
|
-
if (arg == "-ub" || arg == "--ubatch-size") {
|
702
|
-
CHECK_ARG
|
703
|
-
params.n_ubatch = std::stoi(argv[i]);
|
704
|
-
return true;
|
705
|
-
}
|
706
|
-
if (arg == "--keep") {
|
707
|
-
CHECK_ARG
|
708
|
-
params.n_keep = std::stoi(argv[i]);
|
709
|
-
return true;
|
710
|
-
}
|
711
|
-
if (arg == "--draft") {
|
712
|
-
CHECK_ARG
|
713
|
-
params.n_draft = std::stoi(argv[i]);
|
714
|
-
return true;
|
715
|
-
}
|
716
|
-
if (arg == "--chunks") {
|
717
|
-
CHECK_ARG
|
718
|
-
params.n_chunks = std::stoi(argv[i]);
|
719
|
-
return true;
|
720
|
-
}
|
721
|
-
if (arg == "-np" || arg == "--parallel") {
|
722
|
-
CHECK_ARG
|
723
|
-
params.n_parallel = std::stoi(argv[i]);
|
724
|
-
return true;
|
725
|
-
}
|
726
|
-
if (arg == "-ns" || arg == "--sequences") {
|
727
|
-
CHECK_ARG
|
728
|
-
params.n_sequences = std::stoi(argv[i]);
|
729
|
-
return true;
|
730
|
-
}
|
731
|
-
if (arg == "--p-split" || arg == "-ps") {
|
732
|
-
CHECK_ARG
|
733
|
-
params.p_split = std::stof(argv[i]);
|
734
|
-
return true;
|
735
|
-
}
|
736
|
-
if (arg == "-m" || arg == "--model") {
|
737
|
-
CHECK_ARG
|
738
|
-
params.model = argv[i];
|
739
|
-
return true;
|
740
|
-
}
|
741
|
-
if (arg == "-md" || arg == "--model-draft") {
|
742
|
-
CHECK_ARG
|
743
|
-
params.model_draft = argv[i];
|
744
|
-
return true;
|
745
|
-
}
|
746
|
-
if (arg == "-a" || arg == "--alias") {
|
747
|
-
CHECK_ARG
|
748
|
-
params.model_alias = argv[i];
|
749
|
-
return true;
|
750
|
-
}
|
751
|
-
if (arg == "-mu" || arg == "--model-url") {
|
752
|
-
CHECK_ARG
|
753
|
-
params.model_url = argv[i];
|
754
|
-
return true;
|
755
|
-
}
|
756
|
-
if (arg == "-hft" || arg == "--hf-token") {
|
757
|
-
if (++i >= argc) {
|
758
|
-
invalid_param = true;
|
759
|
-
return true;
|
760
|
-
}
|
761
|
-
params.hf_token = argv[i];
|
762
|
-
return true;
|
763
|
-
}
|
764
|
-
if (arg == "-hfr" || arg == "--hf-repo") {
|
765
|
-
CHECK_ARG
|
766
|
-
params.hf_repo = argv[i];
|
767
|
-
return true;
|
768
|
-
}
|
769
|
-
if (arg == "-hff" || arg == "--hf-file") {
|
770
|
-
CHECK_ARG
|
771
|
-
params.hf_file = argv[i];
|
772
|
-
return true;
|
773
|
-
}
|
774
|
-
if (arg == "--lora") {
|
775
|
-
CHECK_ARG
|
776
|
-
params.lora_adapters.push_back({
|
777
|
-
std::string(argv[i]),
|
778
|
-
1.0,
|
779
|
-
});
|
780
|
-
return true;
|
781
|
-
}
|
782
|
-
if (arg == "--lora-scaled") {
|
783
|
-
CHECK_ARG
|
784
|
-
std::string lora_adapter = argv[i];
|
785
|
-
CHECK_ARG
|
786
|
-
params.lora_adapters.push_back({
|
787
|
-
lora_adapter,
|
788
|
-
std::stof(argv[i]),
|
789
|
-
});
|
790
|
-
return true;
|
791
|
-
}
|
792
|
-
if (arg == "--lora-init-without-apply") {
|
793
|
-
params.lora_init_without_apply = true;
|
794
|
-
return true;
|
795
|
-
}
|
796
|
-
if (arg == "--control-vector") {
|
797
|
-
CHECK_ARG
|
798
|
-
params.control_vectors.push_back({ 1.0f, argv[i], });
|
799
|
-
return true;
|
800
|
-
}
|
801
|
-
if (arg == "--control-vector-scaled") {
|
802
|
-
CHECK_ARG
|
803
|
-
const char* fname = argv[i];
|
804
|
-
CHECK_ARG
|
805
|
-
params.control_vectors.push_back({ std::stof(argv[i]), fname, });
|
806
|
-
return true;
|
807
|
-
}
|
808
|
-
if (arg == "--control-vector-layer-range") {
|
809
|
-
CHECK_ARG
|
810
|
-
params.control_vector_layer_start = std::stoi(argv[i]);
|
811
|
-
CHECK_ARG
|
812
|
-
params.control_vector_layer_end = std::stoi(argv[i]);
|
813
|
-
return true;
|
814
|
-
}
|
815
|
-
if (arg == "--mmproj") {
|
816
|
-
CHECK_ARG
|
817
|
-
params.mmproj = argv[i];
|
818
|
-
return true;
|
819
|
-
}
|
820
|
-
if (arg == "--image") {
|
821
|
-
CHECK_ARG
|
822
|
-
params.image.emplace_back(argv[i]);
|
823
|
-
return true;
|
824
|
-
}
|
825
|
-
if (arg == "-i" || arg == "--interactive") {
|
826
|
-
params.interactive = true;
|
827
|
-
return true;
|
828
|
-
}
|
829
|
-
if (arg == "-sp" || arg == "--special") {
|
830
|
-
params.special = true;
|
831
|
-
return true;
|
832
|
-
}
|
833
|
-
if (arg == "--embedding" || arg == "--embeddings") {
|
834
|
-
params.embedding = true;
|
835
|
-
return true;
|
836
|
-
}
|
837
|
-
if (arg == "--embd-normalize") {
|
838
|
-
CHECK_ARG
|
839
|
-
params.embd_normalize = std::stoi(argv[i]);
|
840
|
-
return true;
|
841
|
-
}
|
842
|
-
if (arg == "--embd-output-format") {
|
843
|
-
CHECK_ARG
|
844
|
-
params.embd_out = argv[i];
|
845
|
-
return true;
|
846
|
-
}
|
847
|
-
if (arg == "--embd-separator") {
|
848
|
-
CHECK_ARG
|
849
|
-
params.embd_sep = argv[i];
|
850
|
-
return true;
|
851
|
-
}
|
852
|
-
if (arg == "-if" || arg == "--interactive-first") {
|
853
|
-
params.interactive_first = true;
|
854
|
-
return true;
|
855
|
-
}
|
856
|
-
if (arg == "-cnv" || arg == "--conversation") {
|
857
|
-
params.conversation = true;
|
858
|
-
return true;
|
859
|
-
}
|
860
|
-
if (arg == "--infill") {
|
861
|
-
params.infill = true;
|
862
|
-
return true;
|
863
|
-
}
|
864
|
-
if (arg == "-dkvc" || arg == "--dump-kv-cache") {
|
865
|
-
params.dump_kv_cache = true;
|
866
|
-
return true;
|
867
|
-
}
|
868
|
-
if (arg == "-nkvo" || arg == "--no-kv-offload") {
|
869
|
-
params.no_kv_offload = true;
|
870
|
-
return true;
|
871
|
-
}
|
872
|
-
if (arg == "-ctk" || arg == "--cache-type-k") {
|
873
|
-
params.cache_type_k = argv[++i];
|
874
|
-
return true;
|
875
|
-
}
|
876
|
-
if (arg == "-ctv" || arg == "--cache-type-v") {
|
877
|
-
params.cache_type_v = argv[++i];
|
878
|
-
return true;
|
879
|
-
}
|
880
|
-
if (arg == "-mli" || arg == "--multiline-input") {
|
881
|
-
params.multiline_input = true;
|
882
|
-
return true;
|
883
|
-
}
|
884
|
-
if (arg == "--simple-io") {
|
885
|
-
params.simple_io = true;
|
886
|
-
return true;
|
887
|
-
}
|
888
|
-
if (arg == "-cb" || arg == "--cont-batching") {
|
889
|
-
params.cont_batching = true;
|
890
|
-
return true;
|
891
|
-
}
|
892
|
-
if (arg == "-nocb" || arg == "--no-cont-batching") {
|
893
|
-
params.cont_batching = false;
|
894
|
-
return true;
|
895
|
-
}
|
896
|
-
if (arg == "-fa" || arg == "--flash-attn") {
|
897
|
-
params.flash_attn = true;
|
898
|
-
return true;
|
899
|
-
}
|
900
|
-
if (arg == "-co" || arg == "--color") {
|
901
|
-
params.use_color = true;
|
902
|
-
return true;
|
903
|
-
}
|
904
|
-
if (arg == "--mlock") {
|
905
|
-
params.use_mlock = true;
|
906
|
-
return true;
|
907
|
-
}
|
908
|
-
if (arg == "-ngl" || arg == "--gpu-layers" || arg == "--n-gpu-layers") {
|
909
|
-
CHECK_ARG
|
910
|
-
params.n_gpu_layers = std::stoi(argv[i]);
|
911
|
-
if (!llama_supports_gpu_offload()) {
|
912
|
-
fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers option will be ignored\n");
|
913
|
-
fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
|
914
|
-
}
|
915
|
-
return true;
|
916
|
-
}
|
917
|
-
if (arg == "-ngld" || arg == "--gpu-layers-draft" || arg == "--n-gpu-layers-draft") {
|
918
|
-
CHECK_ARG
|
919
|
-
params.n_gpu_layers_draft = std::stoi(argv[i]);
|
920
|
-
if (!llama_supports_gpu_offload()) {
|
921
|
-
fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers-draft option will be ignored\n");
|
922
|
-
fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
|
923
|
-
}
|
924
|
-
return true;
|
925
|
-
}
|
926
|
-
if (arg == "--main-gpu" || arg == "-mg") {
|
927
|
-
CHECK_ARG
|
928
|
-
params.main_gpu = std::stoi(argv[i]);
|
929
|
-
#ifndef LM_GGML_USE_CUDA_SYCL_VULKAN
|
930
|
-
fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL/Vulkan. Setting the main GPU has no effect.\n");
|
931
|
-
#endif // LM_GGML_USE_CUDA_SYCL_VULKAN
|
932
|
-
return true;
|
933
|
-
}
|
934
|
-
if (arg == "--split-mode" || arg == "-sm") {
|
935
|
-
CHECK_ARG
|
936
|
-
std::string arg_next = argv[i];
|
937
|
-
if (arg_next == "none") {
|
938
|
-
params.split_mode = LLAMA_SPLIT_MODE_NONE;
|
939
|
-
}
|
940
|
-
else if (arg_next == "layer") {
|
941
|
-
params.split_mode = LLAMA_SPLIT_MODE_LAYER;
|
942
|
-
}
|
943
|
-
else if (arg_next == "row") {
|
944
|
-
#ifdef LM_GGML_USE_SYCL
|
945
|
-
fprintf(stderr, "warning: The split mode value:[row] is not supported by llama.cpp with SYCL. It's developing.\nExit!\n");
|
946
|
-
exit(1);
|
947
|
-
#endif // LM_GGML_USE_SYCL
|
948
|
-
params.split_mode = LLAMA_SPLIT_MODE_ROW;
|
949
|
-
}
|
950
|
-
else {
|
951
|
-
invalid_param = true;
|
952
|
-
return true;
|
953
|
-
}
|
954
|
-
#ifndef LM_GGML_USE_CUDA_SYCL_VULKAN
|
955
|
-
fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL/Vulkan. Setting the split mode has no effect.\n");
|
956
|
-
#endif // LM_GGML_USE_CUDA_SYCL_VULKAN
|
957
|
-
return true;
|
958
|
-
}
|
959
|
-
if (arg == "--tensor-split" || arg == "-ts") {
|
960
|
-
CHECK_ARG
|
961
|
-
std::string arg_next = argv[i];
|
273
|
+
void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role_model) {
|
274
|
+
int32_t n_set = 0;
|
962
275
|
|
963
|
-
|
964
|
-
|
965
|
-
|
966
|
-
|
967
|
-
|
968
|
-
|
969
|
-
return true;
|
970
|
-
}
|
971
|
-
for (size_t i = 0; i < llama_max_devices(); ++i) {
|
972
|
-
if (i < split_arg.size()) {
|
973
|
-
params.tensor_split[i] = std::stof(split_arg[i]);
|
974
|
-
}
|
975
|
-
else {
|
976
|
-
params.tensor_split[i] = 0.0f;
|
977
|
-
}
|
276
|
+
if (cpuparams.n_threads < 0) {
|
277
|
+
// Assuming everything about cpuparams is invalid
|
278
|
+
if (role_model != nullptr) {
|
279
|
+
cpuparams = *role_model;
|
280
|
+
} else {
|
281
|
+
cpuparams.n_threads = cpu_get_num_math();
|
978
282
|
}
|
979
|
-
#ifndef LM_GGML_USE_CUDA_SYCL_VULKAN
|
980
|
-
fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL/Vulkan. Setting a tensor split has no effect.\n");
|
981
|
-
#endif // LM_GGML_USE_CUDA_SYCL_VULKAN
|
982
|
-
return true;
|
983
|
-
}
|
984
|
-
if (arg == "--rpc") {
|
985
|
-
CHECK_ARG
|
986
|
-
params.rpc_servers = argv[i];
|
987
|
-
return true;
|
988
|
-
}
|
989
|
-
if (arg == "--no-mmap") {
|
990
|
-
params.use_mmap = false;
|
991
|
-
return true;
|
992
|
-
}
|
993
|
-
if (arg == "--numa") {
|
994
|
-
CHECK_ARG
|
995
|
-
std::string value(argv[i]);
|
996
|
-
/**/ if (value == "distribute" || value == "") { params.numa = LM_GGML_NUMA_STRATEGY_DISTRIBUTE; }
|
997
|
-
else if (value == "isolate") { params.numa = LM_GGML_NUMA_STRATEGY_ISOLATE; }
|
998
|
-
else if (value == "numactl") { params.numa = LM_GGML_NUMA_STRATEGY_NUMACTL; }
|
999
|
-
else { invalid_param = true; }
|
1000
|
-
return true;
|
1001
|
-
}
|
1002
|
-
if (arg == "-v" || arg == "--verbose") {
|
1003
|
-
params.verbosity = 1;
|
1004
|
-
return true;
|
1005
|
-
}
|
1006
|
-
if (arg == "--verbosity") {
|
1007
|
-
CHECK_ARG
|
1008
|
-
params.verbosity = std::stoi(argv[i]);
|
1009
|
-
return true;
|
1010
|
-
}
|
1011
|
-
if (arg == "--verbose-prompt") {
|
1012
|
-
params.verbose_prompt = true;
|
1013
|
-
return true;
|
1014
|
-
}
|
1015
|
-
if (arg == "--no-display-prompt") {
|
1016
|
-
params.display_prompt = false;
|
1017
|
-
return true;
|
1018
|
-
}
|
1019
|
-
if (arg == "-r" || arg == "--reverse-prompt") {
|
1020
|
-
CHECK_ARG
|
1021
|
-
params.antiprompt.emplace_back(argv[i]);
|
1022
|
-
return true;
|
1023
283
|
}
|
1024
|
-
if (arg == "-ld" || arg == "--logdir") {
|
1025
|
-
CHECK_ARG
|
1026
|
-
params.logdir = argv[i];
|
1027
284
|
|
1028
|
-
|
1029
|
-
|
1030
|
-
|
1031
|
-
return true;
|
1032
|
-
}
|
1033
|
-
if (arg == "-lcs" || arg == "--lookup-cache-static") {
|
1034
|
-
CHECK_ARG
|
1035
|
-
params.lookup_cache_static = argv[i];
|
1036
|
-
return true;
|
1037
|
-
}
|
1038
|
-
if (arg == "-lcd" || arg == "--lookup-cache-dynamic") {
|
1039
|
-
CHECK_ARG
|
1040
|
-
params.lookup_cache_dynamic = argv[i];
|
1041
|
-
return true;
|
1042
|
-
}
|
1043
|
-
if (arg == "--save-all-logits" || arg == "--kl-divergence-base") {
|
1044
|
-
CHECK_ARG
|
1045
|
-
params.logits_file = argv[i];
|
1046
|
-
return true;
|
1047
|
-
}
|
1048
|
-
if (arg == "--perplexity" || arg == "--all-logits") {
|
1049
|
-
params.logits_all = true;
|
1050
|
-
return true;
|
1051
|
-
}
|
1052
|
-
if (arg == "--ppl-stride") {
|
1053
|
-
CHECK_ARG
|
1054
|
-
params.ppl_stride = std::stoi(argv[i]);
|
1055
|
-
return true;
|
1056
|
-
}
|
1057
|
-
if (arg == "--ppl-output-type") {
|
1058
|
-
CHECK_ARG
|
1059
|
-
params.ppl_output_type = std::stoi(argv[i]);
|
1060
|
-
return true;
|
1061
|
-
}
|
1062
|
-
if (arg == "-ptc" || arg == "--print-token-count") {
|
1063
|
-
CHECK_ARG
|
1064
|
-
params.n_print = std::stoi(argv[i]);
|
1065
|
-
return true;
|
1066
|
-
}
|
1067
|
-
if (arg == "--check-tensors") {
|
1068
|
-
params.check_tensors = true;
|
1069
|
-
return true;
|
1070
|
-
}
|
1071
|
-
if (arg == "--hellaswag") {
|
1072
|
-
params.hellaswag = true;
|
1073
|
-
return true;
|
1074
|
-
}
|
1075
|
-
if (arg == "--hellaswag-tasks") {
|
1076
|
-
CHECK_ARG
|
1077
|
-
params.hellaswag_tasks = std::stoi(argv[i]);
|
1078
|
-
return true;
|
1079
|
-
}
|
1080
|
-
if (arg == "--winogrande") {
|
1081
|
-
params.winogrande = true;
|
1082
|
-
return true;
|
1083
|
-
}
|
1084
|
-
if (arg == "--winogrande-tasks") {
|
1085
|
-
CHECK_ARG
|
1086
|
-
params.winogrande_tasks = std::stoi(argv[i]);
|
1087
|
-
return true;
|
1088
|
-
}
|
1089
|
-
if (arg == "--multiple-choice") {
|
1090
|
-
params.multiple_choice = true;
|
1091
|
-
return true;
|
1092
|
-
}
|
1093
|
-
if (arg == "--multiple-choice-tasks") {
|
1094
|
-
CHECK_ARG
|
1095
|
-
params.multiple_choice_tasks = std::stoi(argv[i]);
|
1096
|
-
return true;
|
1097
|
-
}
|
1098
|
-
if (arg == "--kl-divergence") {
|
1099
|
-
params.kl_divergence = true;
|
1100
|
-
return true;
|
1101
|
-
}
|
1102
|
-
if (arg == "--ignore-eos") {
|
1103
|
-
params.ignore_eos = true;
|
1104
|
-
return true;
|
1105
|
-
}
|
1106
|
-
if (arg == "--penalize-nl") {
|
1107
|
-
sparams.penalize_nl = true;
|
1108
|
-
return true;
|
1109
|
-
}
|
1110
|
-
if (arg == "-l" || arg == "--logit-bias") {
|
1111
|
-
CHECK_ARG
|
1112
|
-
std::stringstream ss(argv[i]);
|
1113
|
-
llama_token key;
|
1114
|
-
char sign;
|
1115
|
-
std::string value_str;
|
1116
|
-
try {
|
1117
|
-
if (ss >> key && ss >> sign && std::getline(ss, value_str) && (sign == '+' || sign == '-')) {
|
1118
|
-
sparams.logit_bias[key] = std::stof(value_str) * ((sign == '-') ? -1.0f : 1.0f);
|
1119
|
-
}
|
1120
|
-
else {
|
1121
|
-
throw std::exception();
|
1122
|
-
}
|
1123
|
-
}
|
1124
|
-
catch (const std::exception&) {
|
1125
|
-
invalid_param = true;
|
1126
|
-
return true;
|
1127
|
-
}
|
1128
|
-
return true;
|
1129
|
-
}
|
1130
|
-
if (arg == "-h" || arg == "--help" || arg == "--usage" ) {
|
1131
|
-
params.usage = true;
|
1132
|
-
return true;
|
1133
|
-
}
|
1134
|
-
if (arg == "--version") {
|
1135
|
-
fprintf(stderr, "version: %d (%s)\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT);
|
1136
|
-
fprintf(stderr, "built with %s for %s\n", LLAMA_COMPILER, LLAMA_BUILD_TARGET);
|
1137
|
-
exit(0);
|
1138
|
-
}
|
1139
|
-
if (arg == "--in-prefix-bos") {
|
1140
|
-
params.input_prefix_bos = true;
|
1141
|
-
params.enable_chat_template = false;
|
1142
|
-
return true;
|
1143
|
-
}
|
1144
|
-
if (arg == "--in-prefix") {
|
1145
|
-
CHECK_ARG
|
1146
|
-
params.input_prefix = argv[i];
|
1147
|
-
params.enable_chat_template = false;
|
1148
|
-
return true;
|
1149
|
-
}
|
1150
|
-
if (arg == "--in-suffix") {
|
1151
|
-
CHECK_ARG
|
1152
|
-
params.input_suffix = argv[i];
|
1153
|
-
params.enable_chat_template = false;
|
1154
|
-
return true;
|
1155
|
-
}
|
1156
|
-
if (arg == "--spm-infill") {
|
1157
|
-
params.spm_infill = true;
|
1158
|
-
return true;
|
1159
|
-
}
|
1160
|
-
if (arg == "--grammar") {
|
1161
|
-
CHECK_ARG
|
1162
|
-
sparams.grammar = argv[i];
|
1163
|
-
return true;
|
1164
|
-
}
|
1165
|
-
if (arg == "--grammar-file") {
|
1166
|
-
CHECK_ARG
|
1167
|
-
std::ifstream file(argv[i]);
|
1168
|
-
if (!file) {
|
1169
|
-
fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
|
1170
|
-
invalid_param = true;
|
1171
|
-
return true;
|
1172
|
-
}
|
1173
|
-
std::copy(
|
1174
|
-
std::istreambuf_iterator<char>(file),
|
1175
|
-
std::istreambuf_iterator<char>(),
|
1176
|
-
std::back_inserter(sparams.grammar)
|
1177
|
-
);
|
1178
|
-
return true;
|
1179
|
-
}
|
1180
|
-
if (arg == "-j" || arg == "--json-schema") {
|
1181
|
-
CHECK_ARG
|
1182
|
-
sparams.grammar = json_schema_to_grammar(json::parse(argv[i]));
|
1183
|
-
return true;
|
1184
|
-
}
|
1185
|
-
if (arg == "--override-kv") {
|
1186
|
-
CHECK_ARG
|
1187
|
-
if (!string_parse_kv_override(argv[i], params.kv_overrides)) {
|
1188
|
-
fprintf(stderr, "error: Invalid type for KV override: %s\n", argv[i]);
|
1189
|
-
invalid_param = true;
|
1190
|
-
return true;
|
285
|
+
for (int32_t i = 0; i < LM_GGML_MAX_N_THREADS; i++) {
|
286
|
+
if (cpuparams.cpumask[i]) {
|
287
|
+
n_set++;
|
1191
288
|
}
|
1192
|
-
return true;
|
1193
289
|
}
|
1194
|
-
|
1195
|
-
|
1196
|
-
|
1197
|
-
|
1198
|
-
}
|
1199
|
-
if (arg == "--port") {
|
1200
|
-
CHECK_ARG
|
1201
|
-
params.port = std::stoi(argv[i]);
|
1202
|
-
return true;
|
1203
|
-
}
|
1204
|
-
if (arg == "--path") {
|
1205
|
-
CHECK_ARG
|
1206
|
-
params.public_path = argv[i];
|
1207
|
-
return true;
|
1208
|
-
}
|
1209
|
-
if (arg == "--api-key") {
|
1210
|
-
CHECK_ARG
|
1211
|
-
params.api_keys.push_back(argv[i]);
|
1212
|
-
return true;
|
1213
|
-
}
|
1214
|
-
if (arg == "--api-key-file") {
|
1215
|
-
CHECK_ARG
|
1216
|
-
std::ifstream key_file(argv[i]);
|
1217
|
-
if (!key_file) {
|
1218
|
-
fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
|
1219
|
-
invalid_param = true;
|
1220
|
-
return true;
|
1221
|
-
}
|
1222
|
-
std::string key;
|
1223
|
-
while (std::getline(key_file, key)) {
|
1224
|
-
if (!key.empty()) {
|
1225
|
-
params.api_keys.push_back(key);
|
1226
|
-
}
|
1227
|
-
}
|
1228
|
-
key_file.close();
|
1229
|
-
return true;
|
1230
|
-
}
|
1231
|
-
if (arg == "--ssl-key-file") {
|
1232
|
-
CHECK_ARG
|
1233
|
-
params.ssl_file_key = argv[i];
|
1234
|
-
return true;
|
1235
|
-
}
|
1236
|
-
if (arg == "--ssl-cert-file") {
|
1237
|
-
CHECK_ARG
|
1238
|
-
params.ssl_file_cert = argv[i];
|
1239
|
-
return true;
|
1240
|
-
}
|
1241
|
-
if (arg == "--timeout" || arg == "-to") {
|
1242
|
-
CHECK_ARG
|
1243
|
-
params.timeout_read = std::stoi(argv[i]);
|
1244
|
-
params.timeout_write = std::stoi(argv[i]);
|
1245
|
-
return true;
|
1246
|
-
}
|
1247
|
-
if (arg == "--threads-http") {
|
1248
|
-
CHECK_ARG
|
1249
|
-
params.n_threads_http = std::stoi(argv[i]);
|
1250
|
-
return true;
|
1251
|
-
}
|
1252
|
-
if (arg == "-spf" || arg == "--system-prompt-file") {
|
1253
|
-
CHECK_ARG
|
1254
|
-
std::ifstream file(argv[i]);
|
1255
|
-
if (!file) {
|
1256
|
-
fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
|
1257
|
-
invalid_param = true;
|
1258
|
-
return true;
|
1259
|
-
}
|
1260
|
-
std::string system_prompt;
|
1261
|
-
std::copy(
|
1262
|
-
std::istreambuf_iterator<char>(file),
|
1263
|
-
std::istreambuf_iterator<char>(),
|
1264
|
-
std::back_inserter(system_prompt)
|
1265
|
-
);
|
1266
|
-
params.system_prompt = system_prompt;
|
1267
|
-
return true;
|
1268
|
-
}
|
1269
|
-
if (arg == "--log-format") {
|
1270
|
-
CHECK_ARG
|
1271
|
-
if (std::strcmp(argv[i], "json") == 0) {
|
1272
|
-
params.log_json = true;
|
1273
|
-
} else if (std::strcmp(argv[i], "text") == 0) {
|
1274
|
-
params.log_json = false;
|
1275
|
-
} else {
|
1276
|
-
invalid_param = true;
|
1277
|
-
return true;
|
1278
|
-
}
|
1279
|
-
return true;
|
1280
|
-
}
|
1281
|
-
if (arg == "--no-slots") {
|
1282
|
-
params.endpoint_slots = false;
|
1283
|
-
return true;
|
1284
|
-
}
|
1285
|
-
if (arg == "--metrics") {
|
1286
|
-
params.endpoint_metrics = true;
|
1287
|
-
return true;
|
290
|
+
|
291
|
+
if (n_set && n_set < cpuparams.n_threads) {
|
292
|
+
// Not enough set bits, may experience performance issues.
|
293
|
+
fprintf(stderr, "warn: Not enough set bits in CPU mask (%d) to satisfy requested thread count: %d\n", n_set, cpuparams.n_threads);
|
1288
294
|
}
|
1289
|
-
|
1290
|
-
|
1291
|
-
|
1292
|
-
|
1293
|
-
|
1294
|
-
|
1295
|
-
|
1296
|
-
return true;
|
295
|
+
}
|
296
|
+
|
297
|
+
bool parse_cpu_range(const std::string & range, bool (&boolmask)[LM_GGML_MAX_N_THREADS]) {
|
298
|
+
size_t dash_loc = range.find('-');
|
299
|
+
if (dash_loc == std::string::npos) {
|
300
|
+
fprintf(stderr, "Format of CPU range is invalid! Expected [<start>]-[<end>].\n");
|
301
|
+
return false;
|
1297
302
|
}
|
1298
|
-
|
1299
|
-
|
1300
|
-
|
1301
|
-
|
1302
|
-
|
1303
|
-
|
1304
|
-
|
303
|
+
|
304
|
+
size_t start_i;
|
305
|
+
size_t end_i;
|
306
|
+
|
307
|
+
if (dash_loc == 0) {
|
308
|
+
start_i = 0;
|
309
|
+
} else {
|
310
|
+
start_i = std::stoull(range.substr(0, dash_loc));
|
311
|
+
if (start_i >= LM_GGML_MAX_N_THREADS) {
|
312
|
+
fprintf(stderr, "Start index out of bounds!\n");
|
313
|
+
return false;
|
1305
314
|
}
|
1306
|
-
params.chat_template = argv[i];
|
1307
|
-
return true;
|
1308
315
|
}
|
1309
|
-
|
1310
|
-
|
1311
|
-
|
1312
|
-
|
1313
|
-
|
1314
|
-
|
1315
|
-
|
1316
|
-
|
1317
|
-
}
|
1318
|
-
if (arg == "-npp") {
|
1319
|
-
CHECK_ARG
|
1320
|
-
auto p = string_split<int>(argv[i], split_delim);
|
1321
|
-
params.n_pp.insert(params.n_pp.end(), p.begin(), p.end());
|
1322
|
-
return true;
|
1323
|
-
}
|
1324
|
-
if (arg == "-ntg") {
|
1325
|
-
CHECK_ARG
|
1326
|
-
auto p = string_split<int>(argv[i], split_delim);
|
1327
|
-
params.n_tg.insert(params.n_tg.end(), p.begin(), p.end());
|
1328
|
-
return true;
|
1329
|
-
}
|
1330
|
-
if (arg == "-npl") {
|
1331
|
-
CHECK_ARG
|
1332
|
-
auto p = string_split<int>(argv[i], split_delim);
|
1333
|
-
params.n_pl.insert(params.n_pl.end(), p.begin(), p.end());
|
1334
|
-
return true;
|
1335
|
-
}
|
1336
|
-
if (arg == "--context-file") {
|
1337
|
-
CHECK_ARG
|
1338
|
-
std::ifstream file(argv[i], std::ios::binary);
|
1339
|
-
if (!file) {
|
1340
|
-
fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
|
1341
|
-
invalid_param = true;
|
1342
|
-
return true;
|
316
|
+
|
317
|
+
if (dash_loc == range.length() - 1) {
|
318
|
+
end_i = LM_GGML_MAX_N_THREADS - 1;
|
319
|
+
} else {
|
320
|
+
end_i = std::stoull(range.substr(dash_loc + 1));
|
321
|
+
if (end_i >= LM_GGML_MAX_N_THREADS) {
|
322
|
+
fprintf(stderr, "End index out of bounds!\n");
|
323
|
+
return false;
|
1343
324
|
}
|
1344
|
-
params.context_files.push_back(argv[i]);
|
1345
|
-
return true;
|
1346
|
-
}
|
1347
|
-
if (arg == "--chunk-size") {
|
1348
|
-
CHECK_ARG
|
1349
|
-
params.chunk_size = std::stoi(argv[i]);
|
1350
|
-
return true;
|
1351
|
-
}
|
1352
|
-
if (arg == "--chunk-separator") {
|
1353
|
-
CHECK_ARG
|
1354
|
-
params.chunk_separator = argv[i];
|
1355
|
-
return true;
|
1356
|
-
}
|
1357
|
-
if (arg == "--junk") {
|
1358
|
-
CHECK_ARG
|
1359
|
-
params.n_junk = std::stoi(argv[i]);
|
1360
|
-
return true;
|
1361
|
-
}
|
1362
|
-
if (arg == "--pos") {
|
1363
|
-
CHECK_ARG
|
1364
|
-
params.i_pos = std::stoi(argv[i]);
|
1365
|
-
return true;
|
1366
|
-
}
|
1367
|
-
if (arg == "-o" || arg == "--output" || arg == "--output-file") {
|
1368
|
-
CHECK_ARG
|
1369
|
-
params.out_file = argv[i];
|
1370
|
-
params.cvector_outfile = argv[i];
|
1371
|
-
params.lora_outfile = argv[i];
|
1372
|
-
return true;
|
1373
|
-
}
|
1374
|
-
if (arg == "-ofreq" || arg == "--output-frequency") {
|
1375
|
-
CHECK_ARG
|
1376
|
-
params.n_out_freq = std::stoi(argv[i]);
|
1377
|
-
return true;
|
1378
|
-
}
|
1379
|
-
if (arg == "--save-frequency") {
|
1380
|
-
CHECK_ARG
|
1381
|
-
params.n_save_freq = std::stoi(argv[i]);
|
1382
|
-
return true;
|
1383
|
-
}
|
1384
|
-
if (arg == "--process-output") {
|
1385
|
-
params.process_output = true;
|
1386
|
-
return true;
|
1387
325
|
}
|
1388
|
-
|
1389
|
-
|
1390
|
-
|
1391
|
-
}
|
1392
|
-
if (arg == "--chunk" || arg == "--from-chunk") {
|
1393
|
-
CHECK_ARG
|
1394
|
-
params.i_chunk = std::stoi(argv[i]);
|
1395
|
-
return true;
|
1396
|
-
}
|
1397
|
-
// cvector params
|
1398
|
-
if (arg == "--positive-file") {
|
1399
|
-
CHECK_ARG
|
1400
|
-
params.cvector_positive_file = argv[i];
|
1401
|
-
return true;
|
1402
|
-
}
|
1403
|
-
if (arg == "--negative-file") {
|
1404
|
-
CHECK_ARG
|
1405
|
-
params.cvector_negative_file = argv[i];
|
1406
|
-
return true;
|
1407
|
-
}
|
1408
|
-
if (arg == "--pca-batch") {
|
1409
|
-
CHECK_ARG
|
1410
|
-
params.n_pca_batch = std::stoi(argv[i]);
|
1411
|
-
return true;
|
1412
|
-
}
|
1413
|
-
if (arg == "--pca-iter") {
|
1414
|
-
CHECK_ARG
|
1415
|
-
params.n_pca_iterations = std::stoi(argv[i]);
|
1416
|
-
return true;
|
1417
|
-
}
|
1418
|
-
if (arg == "--method") {
|
1419
|
-
CHECK_ARG
|
1420
|
-
std::string value(argv[i]);
|
1421
|
-
/**/ if (value == "pca") { params.cvector_dimre_method = DIMRE_METHOD_PCA; }
|
1422
|
-
else if (value == "mean") { params.cvector_dimre_method = DIMRE_METHOD_MEAN; }
|
1423
|
-
else { invalid_param = true; }
|
1424
|
-
return true;
|
1425
|
-
}
|
1426
|
-
if (arg == "--no-warmup") {
|
1427
|
-
params.warmup = false;
|
1428
|
-
return true;
|
1429
|
-
}
|
1430
|
-
#ifndef LOG_DISABLE_LOGS
|
1431
|
-
// Parse args for logging parameters
|
1432
|
-
if (log_param_single_parse(argv[i])) {
|
1433
|
-
// Do nothing, log_param_single_parse automatically does it's thing
|
1434
|
-
// and returns if a match was found and parsed.
|
1435
|
-
return true;
|
1436
|
-
}
|
1437
|
-
if (log_param_pair_parse( /*check_but_dont_parse*/ true, argv[i])) {
|
1438
|
-
// We have a matching known parameter requiring an argument,
|
1439
|
-
// now we need to check if there is anything after this argv
|
1440
|
-
// and flag invalid_param or parse it.
|
1441
|
-
CHECK_ARG
|
1442
|
-
if (!log_param_pair_parse( /*check_but_dont_parse*/ false, argv[i - 1], argv[i])) {
|
1443
|
-
invalid_param = true;
|
1444
|
-
return true;
|
1445
|
-
}
|
1446
|
-
return true;
|
326
|
+
|
327
|
+
for (size_t i = start_i; i <= end_i; i++) {
|
328
|
+
boolmask[i] = true;
|
1447
329
|
}
|
1448
|
-
// End of Parse args for logging parameters
|
1449
|
-
#endif // LOG_DISABLE_LOGS
|
1450
330
|
|
1451
|
-
return
|
331
|
+
return true;
|
1452
332
|
}
|
1453
333
|
|
1454
|
-
|
1455
|
-
|
1456
|
-
|
1457
|
-
|
1458
|
-
|
1459
|
-
|
1460
|
-
#else
|
1461
|
-
#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...)
|
1462
|
-
#endif
|
334
|
+
bool parse_cpu_mask(const std::string & mask, bool (&boolmask)[LM_GGML_MAX_N_THREADS]) {
|
335
|
+
// Discard potential 0x prefix
|
336
|
+
size_t start_i = 0;
|
337
|
+
if (mask.length() >= 2 && mask.substr(0, 2) == "0x") {
|
338
|
+
start_i = 2;
|
339
|
+
}
|
1463
340
|
|
1464
|
-
|
1465
|
-
|
341
|
+
size_t num_digits = mask.length() - start_i;
|
342
|
+
if (num_digits > 128) num_digits = 128;
|
1466
343
|
|
1467
|
-
|
1468
|
-
std::string sampler_type_names;
|
1469
|
-
for (const auto sampler_type : sparams.samplers_sequence) {
|
1470
|
-
sampler_type_chars += static_cast<char>(sampler_type);
|
1471
|
-
sampler_type_names += llama_sampling_type_to_str(sampler_type) + ";";
|
1472
|
-
}
|
1473
|
-
sampler_type_names.pop_back();
|
344
|
+
size_t end_i = num_digits + start_i;
|
1474
345
|
|
1475
|
-
|
1476
|
-
|
1477
|
-
|
1478
|
-
va_list args_list;
|
1479
|
-
va_start(args_list, desc);
|
1480
|
-
char buffer[1024];
|
1481
|
-
vsnprintf(buffer, sizeof(buffer), desc, args_list);
|
1482
|
-
va_end(args_list);
|
1483
|
-
this->desc = buffer;
|
1484
|
-
}
|
346
|
+
for (size_t i = start_i, n = (num_digits*4 - 1); i < end_i; i++, n-=4) {
|
347
|
+
char c = mask.at(i);
|
348
|
+
int8_t id = c;
|
1485
349
|
|
1486
|
-
|
350
|
+
if ((c >= '0' && c <= '9')) {
|
351
|
+
id -= '0';
|
352
|
+
} else if (c >= 'a' && c <= 'f') {
|
353
|
+
id -= 'a' - 10;
|
354
|
+
} else if (c >= 'A' && c <= 'F') {
|
355
|
+
id -= 'A' - 10;
|
356
|
+
} else {
|
357
|
+
fprintf(stderr, "Invalid hex character '%c' at position %d\n", c, int32_t(i));
|
358
|
+
return false;
|
359
|
+
}
|
1487
360
|
|
1488
|
-
|
1489
|
-
|
1490
|
-
|
1491
|
-
|
1492
|
-
}
|
361
|
+
boolmask[ n ] = boolmask[ n ] || ((id & 8) != 0);
|
362
|
+
boolmask[n - 1] = boolmask[n - 1] || ((id & 4) != 0);
|
363
|
+
boolmask[n - 2] = boolmask[n - 2] || ((id & 2) != 0);
|
364
|
+
boolmask[n - 3] = boolmask[n - 3] || ((id & 1) != 0);
|
365
|
+
}
|
1493
366
|
|
1494
|
-
|
1495
|
-
|
1496
|
-
// TODO: filter by tags
|
1497
|
-
|
1498
|
-
options.push_back({ "general" });
|
1499
|
-
options.push_back({ "*", "-h, --help, --usage", "print usage and exit" });
|
1500
|
-
options.push_back({ "*", " --version", "show version and build info" });
|
1501
|
-
options.push_back({ "*", "-v, --verbose", "print verbose information" });
|
1502
|
-
options.push_back({ "*", " --verbosity N", "set specific verbosity level (default: %d)", params.verbosity });
|
1503
|
-
options.push_back({ "*", " --verbose-prompt", "print a verbose prompt before generation (default: %s)", params.verbose_prompt ? "true" : "false" });
|
1504
|
-
options.push_back({ "*", " --no-display-prompt", "don't print prompt at generation (default: %s)", !params.display_prompt ? "true" : "false" });
|
1505
|
-
options.push_back({ "*", "-co, --color", "colorise output to distinguish prompt and user input from generations (default: %s)", params.use_color ? "true" : "false" });
|
1506
|
-
options.push_back({ "*", "-s, --seed SEED", "RNG seed (default: %d, use random seed for < 0)", params.seed });
|
1507
|
-
options.push_back({ "*", "-t, --threads N", "number of threads to use during generation (default: %d)", params.n_threads });
|
1508
|
-
options.push_back({ "*", "-tb, --threads-batch N", "number of threads to use during batch and prompt processing (default: same as --threads)" });
|
1509
|
-
options.push_back({ "speculative", "-td, --threads-draft N", "number of threads to use during generation (default: same as --threads)" });
|
1510
|
-
options.push_back({ "speculative", "-tbd, --threads-batch-draft N",
|
1511
|
-
"number of threads to use during batch and prompt processing (default: same as --threads-draft)" });
|
1512
|
-
options.push_back({ "speculative", " --draft N", "number of tokens to draft for speculative decoding (default: %d)", params.n_draft });
|
1513
|
-
options.push_back({ "speculative", "-ps, --p-split N", "speculative decoding split probability (default: %.1f)", (double)params.p_split });
|
1514
|
-
options.push_back({ "*", "-lcs, --lookup-cache-static FNAME",
|
1515
|
-
"path to static lookup cache to use for lookup decoding (not updated by generation)" });
|
1516
|
-
options.push_back({ "*", "-lcd, --lookup-cache-dynamic FNAME",
|
1517
|
-
"path to dynamic lookup cache to use for lookup decoding (updated by generation)" });
|
1518
|
-
|
1519
|
-
options.push_back({ "*", "-c, --ctx-size N", "size of the prompt context (default: %d, 0 = loaded from model)", params.n_ctx });
|
1520
|
-
options.push_back({ "*", "-n, --predict N", "number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)", params.n_predict });
|
1521
|
-
options.push_back({ "*", "-b, --batch-size N", "logical maximum batch size (default: %d)", params.n_batch });
|
1522
|
-
options.push_back({ "*", "-ub, --ubatch-size N", "physical maximum batch size (default: %d)", params.n_ubatch });
|
1523
|
-
options.push_back({ "*", " --keep N", "number of tokens to keep from the initial prompt (default: %d, -1 = all)", params.n_keep });
|
1524
|
-
options.push_back({ "*", " --chunks N", "max number of chunks to process (default: %d, -1 = all)", params.n_chunks });
|
1525
|
-
options.push_back({ "*", "-fa, --flash-attn", "enable Flash Attention (default: %s)", params.flash_attn ? "enabled" : "disabled" });
|
1526
|
-
options.push_back({ "*", "-p, --prompt PROMPT", "prompt to start generation with\n"
|
1527
|
-
"in conversation mode, this will be used as system prompt\n"
|
1528
|
-
"(default: '%s')", params.prompt.c_str() });
|
1529
|
-
options.push_back({ "*", "-f, --file FNAME", "a file containing the prompt (default: none)" });
|
1530
|
-
options.push_back({ "*", " --in-file FNAME", "an input file (repeat to specify multiple files)" });
|
1531
|
-
options.push_back({ "*", "-bf, --binary-file FNAME", "binary file containing the prompt (default: none)" });
|
1532
|
-
options.push_back({ "*", "-e, --escape", "process escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\) (default: %s)", params.escape ? "true" : "false" });
|
1533
|
-
options.push_back({ "*", " --no-escape", "do not process escape sequences" });
|
1534
|
-
options.push_back({ "main", "-ptc, --print-token-count N", "print token count every N tokens (default: %d)", params.n_print });
|
1535
|
-
options.push_back({ "main", " --prompt-cache FNAME", "file to cache prompt state for faster startup (default: none)" });
|
1536
|
-
options.push_back({ "main", " --prompt-cache-all", "if specified, saves user input and generations to cache as well\n"
|
1537
|
-
"not supported with --interactive or other interactive options" });
|
1538
|
-
options.push_back({ "main", " --prompt-cache-ro", "if specified, uses the prompt cache but does not update it" });
|
1539
|
-
options.push_back({ "main", "-r, --reverse-prompt PROMPT",
|
1540
|
-
"halt generation at PROMPT, return control in interactive mode\n"
|
1541
|
-
"can be specified more than once for multiple prompts" });
|
1542
|
-
options.push_back({ "main", "-sp, --special", "special tokens output enabled (default: %s)", params.special ? "true" : "false" });
|
1543
|
-
options.push_back({ "main", "-cnv, --conversation", "run in conversation mode, does not print special tokens and suffix/prefix\n"
|
1544
|
-
"if suffix/prefix are not specified, default chat template will be used\n"
|
1545
|
-
"(default: %s)", params.conversation ? "true" : "false" });
|
1546
|
-
options.push_back({ "main infill", "-i, --interactive", "run in interactive mode (default: %s)", params.interactive ? "true" : "false" });
|
1547
|
-
options.push_back({ "main infill", "-if, --interactive-first", "run in interactive mode and wait for input right away (default: %s)", params.interactive_first ? "true" : "false" });
|
1548
|
-
options.push_back({ "main infill", "-mli, --multiline-input", "allows you to write or paste multiple lines without ending each in '\\'" });
|
1549
|
-
options.push_back({ "main infill", " --in-prefix-bos", "prefix BOS to user inputs, preceding the `--in-prefix` string" });
|
1550
|
-
options.push_back({ "main infill", " --in-prefix STRING", "string to prefix user inputs with (default: empty)" });
|
1551
|
-
options.push_back({ "main infill", " --in-suffix STRING", "string to suffix after user inputs with (default: empty)" });
|
1552
|
-
options.push_back({ "main", " --no-warmup", "skip warming up the model with an empty run" });
|
1553
|
-
options.push_back({ "server infill",
|
1554
|
-
" --spm-infill", "use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: %s)", params.spm_infill ? "enabled" : "disabled" });
|
1555
|
-
|
1556
|
-
options.push_back({ "sampling" });
|
1557
|
-
options.push_back({ "*", " --samplers SAMPLERS", "samplers that will be used for generation in the order, separated by \';\'\n"
|
1558
|
-
"(default: %s)", sampler_type_names.c_str() });
|
1559
|
-
options.push_back({ "*", " --sampling-seq SEQUENCE",
|
1560
|
-
"simplified sequence for samplers that will be used (default: %s)", sampler_type_chars.c_str() });
|
1561
|
-
options.push_back({ "*", " --ignore-eos", "ignore end of stream token and continue generating (implies --logit-bias EOS-inf)" });
|
1562
|
-
options.push_back({ "*", " --penalize-nl", "penalize newline tokens (default: %s)", sparams.penalize_nl ? "true" : "false" });
|
1563
|
-
options.push_back({ "*", " --temp N", "temperature (default: %.1f)", (double)sparams.temp });
|
1564
|
-
options.push_back({ "*", " --top-k N", "top-k sampling (default: %d, 0 = disabled)", sparams.top_k });
|
1565
|
-
options.push_back({ "*", " --top-p N", "top-p sampling (default: %.1f, 1.0 = disabled)", (double)sparams.top_p });
|
1566
|
-
options.push_back({ "*", " --min-p N", "min-p sampling (default: %.1f, 0.0 = disabled)", (double)sparams.min_p });
|
1567
|
-
options.push_back({ "*", " --tfs N", "tail free sampling, parameter z (default: %.1f, 1.0 = disabled)", (double)sparams.tfs_z });
|
1568
|
-
options.push_back({ "*", " --typical N", "locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)", (double)sparams.typical_p });
|
1569
|
-
options.push_back({ "*", " --repeat-last-n N", "last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)", sparams.penalty_last_n });
|
1570
|
-
options.push_back({ "*", " --repeat-penalty N", "penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)", (double)sparams.penalty_repeat });
|
1571
|
-
options.push_back({ "*", " --presence-penalty N", "repeat alpha presence penalty (default: %.1f, 0.0 = disabled)", (double)sparams.penalty_present });
|
1572
|
-
options.push_back({ "*", " --frequency-penalty N", "repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)", (double)sparams.penalty_freq });
|
1573
|
-
options.push_back({ "*", " --dynatemp-range N", "dynamic temperature range (default: %.1f, 0.0 = disabled)", (double)sparams.dynatemp_range });
|
1574
|
-
options.push_back({ "*", " --dynatemp-exp N", "dynamic temperature exponent (default: %.1f)", (double)sparams.dynatemp_exponent });
|
1575
|
-
options.push_back({ "*", " --mirostat N", "use Mirostat sampling.\n"
|
1576
|
-
"Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n"
|
1577
|
-
"(default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)", sparams.mirostat });
|
1578
|
-
options.push_back({ "*", " --mirostat-lr N", "Mirostat learning rate, parameter eta (default: %.1f)", (double)sparams.mirostat_eta });
|
1579
|
-
options.push_back({ "*", " --mirostat-ent N", "Mirostat target entropy, parameter tau (default: %.1f)", (double)sparams.mirostat_tau });
|
1580
|
-
options.push_back({ "*", " -l TOKEN_ID(+/-)BIAS", "modifies the likelihood of token appearing in the completion,\n"
|
1581
|
-
"i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',\n"
|
1582
|
-
"or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'" });
|
1583
|
-
options.push_back({ "main", " --cfg-negative-prompt PROMPT",
|
1584
|
-
"negative prompt to use for guidance (default: '%s')", sparams.cfg_negative_prompt.c_str() });
|
1585
|
-
options.push_back({ "main", " --cfg-negative-prompt-file FNAME",
|
1586
|
-
"negative prompt file to use for guidance" });
|
1587
|
-
options.push_back({ "main", " --cfg-scale N", "strength of guidance (default: %.1f, 1.0 = disable)", (double)sparams.cfg_scale });
|
1588
|
-
options.push_back({ "main", " --chat-template JINJA_TEMPLATE",
|
1589
|
-
"set custom jinja chat template (default: template taken from model's metadata)\n"
|
1590
|
-
"if suffix/prefix are specified, template will be disabled\n"
|
1591
|
-
"only commonly used templates are accepted:\n"
|
1592
|
-
"https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template" });
|
1593
|
-
options.push_back({ "grammar" });
|
1594
|
-
options.push_back({ "*", " --grammar GRAMMAR", "BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '%s')", sparams.grammar.c_str() });
|
1595
|
-
options.push_back({ "*", " --grammar-file FNAME", "file to read grammar from" });
|
1596
|
-
options.push_back({ "*", "-j, --json-schema SCHEMA",
|
1597
|
-
"JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object\n"
|
1598
|
-
"For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead" });
|
1599
|
-
|
1600
|
-
options.push_back({ "embedding" });
|
1601
|
-
options.push_back({ "embedding", " --pooling {none,mean,cls,last}",
|
1602
|
-
"pooling type for embeddings, use model default if unspecified" });
|
1603
|
-
options.push_back({ "embedding", " --attention {causal,non-causal}",
|
1604
|
-
"attention type for embeddings, use model default if unspecified" });
|
1605
|
-
|
1606
|
-
options.push_back({ "context hacking" });
|
1607
|
-
options.push_back({ "*", " --rope-scaling {none,linear,yarn}",
|
1608
|
-
"RoPE frequency scaling method, defaults to linear unless specified by the model" });
|
1609
|
-
options.push_back({ "*", " --rope-scale N", "RoPE context scaling factor, expands context by a factor of N" });
|
1610
|
-
options.push_back({ "*", " --rope-freq-base N", "RoPE base frequency, used by NTK-aware scaling (default: loaded from model)" });
|
1611
|
-
options.push_back({ "*", " --rope-freq-scale N", "RoPE frequency scaling factor, expands context by a factor of 1/N" });
|
1612
|
-
options.push_back({ "*", " --yarn-orig-ctx N", "YaRN: original context size of model (default: %d = model training context size)", params.yarn_orig_ctx });
|
1613
|
-
options.push_back({ "*", " --yarn-ext-factor N", "YaRN: extrapolation mix factor (default: %.1f, 0.0 = full interpolation)", (double)params.yarn_ext_factor });
|
1614
|
-
options.push_back({ "*", " --yarn-attn-factor N", "YaRN: scale sqrt(t) or attention magnitude (default: %.1f)", (double)params.yarn_attn_factor });
|
1615
|
-
options.push_back({ "*", " --yarn-beta-slow N", "YaRN: high correction dim or alpha (default: %.1f)", (double)params.yarn_beta_slow });
|
1616
|
-
options.push_back({ "*", " --yarn-beta-fast N", "YaRN: low correction dim or beta (default: %.1f)", (double)params.yarn_beta_fast });
|
1617
|
-
options.push_back({ "*", "-gan, --grp-attn-n N", "group-attention factor (default: %d)", params.grp_attn_n });
|
1618
|
-
options.push_back({ "*", "-gaw, --grp-attn-w N", "group-attention width (default: %.1f)", (double)params.grp_attn_w });
|
1619
|
-
options.push_back({ "*", "-dkvc, --dump-kv-cache", "verbose print of the KV cache" });
|
1620
|
-
options.push_back({ "*", "-nkvo, --no-kv-offload", "disable KV offload" });
|
1621
|
-
options.push_back({ "*", "-ctk, --cache-type-k TYPE", "KV cache data type for K (default: %s)", params.cache_type_k.c_str() });
|
1622
|
-
options.push_back({ "*", "-ctv, --cache-type-v TYPE", "KV cache data type for V (default: %s)", params.cache_type_v.c_str() });
|
1623
|
-
|
1624
|
-
options.push_back({ "perplexity" });
|
1625
|
-
options.push_back({ "perplexity", " --all-logits", "return logits for all tokens in the batch (default: %s)", params.logits_all ? "true" : "false" });
|
1626
|
-
options.push_back({ "perplexity", " --hellaswag", "compute HellaSwag score over random tasks from datafile supplied with -f" });
|
1627
|
-
options.push_back({ "perplexity", " --hellaswag-tasks N", "number of tasks to use when computing the HellaSwag score (default: %zu)", params.hellaswag_tasks });
|
1628
|
-
options.push_back({ "perplexity", " --winogrande", "compute Winogrande score over random tasks from datafile supplied with -f" });
|
1629
|
-
options.push_back({ "perplexity", " --winogrande-tasks N", "number of tasks to use when computing the Winogrande score (default: %zu)", params.winogrande_tasks });
|
1630
|
-
options.push_back({ "perplexity", " --multiple-choice", "compute multiple choice score over random tasks from datafile supplied with -f" });
|
1631
|
-
options.push_back({ "perplexity", " --multiple-choice-tasks N",
|
1632
|
-
"number of tasks to use when computing the multiple choice score (default: %zu)", params.multiple_choice_tasks });
|
1633
|
-
options.push_back({ "perplexity", " --kl-divergence", "computes KL-divergence to logits provided via --kl-divergence-base" });
|
1634
|
-
options.push_back({ "perplexity", " --ppl-stride N", "stride for perplexity calculation (default: %d)", params.ppl_stride });
|
1635
|
-
options.push_back({ "perplexity", " --ppl-output-type {0,1}",
|
1636
|
-
"output type for perplexity calculation (default: %d)", params.ppl_output_type });
|
1637
|
-
|
1638
|
-
options.push_back({ "parallel" });
|
1639
|
-
options.push_back({ "*", "-dt, --defrag-thold N", "KV cache defragmentation threshold (default: %.1f, < 0 - disabled)", (double)params.defrag_thold });
|
1640
|
-
options.push_back({ "*", "-np, --parallel N", "number of parallel sequences to decode (default: %d)", params.n_parallel });
|
1641
|
-
options.push_back({ "*", "-ns, --sequences N", "number of sequences to decode (default: %d)", params.n_sequences });
|
1642
|
-
options.push_back({ "*", "-cb, --cont-batching", "enable continuous batching (a.k.a dynamic batching) (default: %s)", params.cont_batching ? "enabled" : "disabled" });
|
1643
|
-
options.push_back({ "*", "-nocb, --no-cont-batching", "disable continuous batching" });
|
1644
|
-
|
1645
|
-
options.push_back({ "multi-modality" });
|
1646
|
-
options.push_back({ "*", " --mmproj FILE", "path to a multimodal projector file for LLaVA. see examples/llava/README.md" });
|
1647
|
-
options.push_back({ "*", " --image FILE", "path to an image file. use with multimodal models. Specify multiple times for batching" });
|
1648
|
-
|
1649
|
-
options.push_back({ "backend" });
|
1650
|
-
options.push_back({ "*", " --rpc SERVERS", "comma separated list of RPC servers" });
|
1651
|
-
|
1652
|
-
if (llama_supports_mlock()) {
|
1653
|
-
options.push_back({ "*", " --mlock", "force system to keep model in RAM rather than swapping or compressing" });
|
1654
|
-
}
|
1655
|
-
if (llama_supports_mmap()) {
|
1656
|
-
options.push_back({ "*", " --no-mmap", "do not memory-map model (slower load but may reduce pageouts if not using mlock)" });
|
1657
|
-
}
|
1658
|
-
options.push_back({ "*", " --numa TYPE", "attempt optimizations that help on some NUMA systems\n"
|
1659
|
-
" - distribute: spread execution evenly over all nodes\n"
|
1660
|
-
" - isolate: only spawn threads on CPUs on the node that execution started on\n"
|
1661
|
-
" - numactl: use the CPU map provided by numactl\n"
|
1662
|
-
"if run without this previously, it is recommended to drop the system page cache before using this\n"
|
1663
|
-
"see https://github.com/ggerganov/llama.cpp/issues/1437" });
|
1664
|
-
|
1665
|
-
if (llama_supports_gpu_offload()) {
|
1666
|
-
options.push_back({ "*", "-ngl, --gpu-layers N",
|
1667
|
-
"number of layers to store in VRAM" });
|
1668
|
-
options.push_back({ "*", "-ngld, --gpu-layers-draft N",
|
1669
|
-
"number of layers to store in VRAM for the draft model" });
|
1670
|
-
options.push_back({ "*", "-sm, --split-mode SPLIT_MODE",
|
1671
|
-
"how to split the model across multiple GPUs, one of:\n"
|
1672
|
-
" - none: use one GPU only\n"
|
1673
|
-
" - layer (default): split layers and KV across GPUs\n"
|
1674
|
-
" - row: split rows across GPUs" });
|
1675
|
-
options.push_back({ "*", "-ts, --tensor-split SPLIT",
|
1676
|
-
"fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1" });
|
1677
|
-
options.push_back({ "*", "-mg, --main-gpu i", "the GPU to use for the model (with split-mode = none),\n"
|
1678
|
-
"or for intermediate results and KV (with split-mode = row) (default: %d)", params.main_gpu });
|
1679
|
-
}
|
1680
|
-
|
1681
|
-
options.push_back({ "model" });
|
1682
|
-
options.push_back({ "*", " --check-tensors", "check model tensor data for invalid values (default: %s)", params.check_tensors ? "true" : "false" });
|
1683
|
-
options.push_back({ "*", " --override-kv KEY=TYPE:VALUE",
|
1684
|
-
"advanced option to override model metadata by key. may be specified multiple times.\n"
|
1685
|
-
"types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false" });
|
1686
|
-
options.push_back({ "*", " --lora FNAME", "apply LoRA adapter (can be repeated to use multiple adapters)" });
|
1687
|
-
options.push_back({ "*", " --lora-scaled FNAME S", "apply LoRA adapter with user defined scaling S (can be repeated to use multiple adapters)" });
|
1688
|
-
options.push_back({ "*", " --control-vector FNAME", "add a control vector\n"
|
1689
|
-
"note: this argument can be repeated to add multiple control vectors" });
|
1690
|
-
options.push_back({ "*", " --control-vector-scaled FNAME SCALE",
|
1691
|
-
"add a control vector with user defined scaling SCALE\n"
|
1692
|
-
"note: this argument can be repeated to add multiple scaled control vectors" });
|
1693
|
-
options.push_back({ "*", " --control-vector-layer-range START END",
|
1694
|
-
"layer range to apply the control vector(s) to, start and end inclusive" });
|
1695
|
-
options.push_back({ "*", "-m, --model FNAME", "model path (default: models/$filename with filename from --hf-file\n"
|
1696
|
-
"or --model-url if set, otherwise %s)", DEFAULT_MODEL_PATH });
|
1697
|
-
options.push_back({ "*", "-md, --model-draft FNAME", "draft model for speculative decoding (default: unused)" });
|
1698
|
-
options.push_back({ "*", "-mu, --model-url MODEL_URL", "model download url (default: unused)" });
|
1699
|
-
options.push_back({ "*", "-hfr, --hf-repo REPO", "Hugging Face model repository (default: unused)" });
|
1700
|
-
options.push_back({ "*", "-hff, --hf-file FILE", "Hugging Face model file (default: unused)" });
|
1701
|
-
options.push_back({ "*", "-hft, --hf-token TOKEN", "Hugging Face access token (default: value from HF_TOKEN environment variable)" });
|
1702
|
-
|
1703
|
-
options.push_back({ "retrieval" });
|
1704
|
-
options.push_back({ "retrieval", " --context-file FNAME", "file to load context from (repeat to specify multiple files)" });
|
1705
|
-
options.push_back({ "retrieval", " --chunk-size N", "minimum length of embedded text chunks (default: %d)", params.chunk_size });
|
1706
|
-
options.push_back({ "retrieval", " --chunk-separator STRING",
|
1707
|
-
"separator between chunks (default: '%s')", params.chunk_separator.c_str() });
|
1708
|
-
|
1709
|
-
options.push_back({ "passkey" });
|
1710
|
-
options.push_back({ "passkey", " --junk N", "number of times to repeat the junk text (default: %d)", params.n_junk });
|
1711
|
-
options.push_back({ "passkey", " --pos N", "position of the passkey in the junk text (default: %d)", params.i_pos });
|
1712
|
-
|
1713
|
-
options.push_back({ "imatrix" });
|
1714
|
-
options.push_back({ "imatrix", "-o, --output FNAME", "output file (default: '%s')", params.out_file.c_str() });
|
1715
|
-
options.push_back({ "imatrix", " --output-frequency N", "output the imatrix every N iterations (default: %d)", params.n_out_freq });
|
1716
|
-
options.push_back({ "imatrix", " --save-frequency N", "save an imatrix copy every N iterations (default: %d)", params.n_save_freq });
|
1717
|
-
options.push_back({ "imatrix", " --process-output", "collect data for the output tensor (default: %s)", params.process_output ? "true" : "false" });
|
1718
|
-
options.push_back({ "imatrix", " --no-ppl", "do not compute perplexity (default: %s)", params.compute_ppl ? "true" : "false" });
|
1719
|
-
options.push_back({ "imatrix", " --chunk N", "start processing the input from chunk N (default: %d)", params.i_chunk });
|
1720
|
-
|
1721
|
-
options.push_back({ "bench" });
|
1722
|
-
options.push_back({ "bench", "-pps", "is the prompt shared across parallel sequences (default: %s)", params.is_pp_shared ? "true" : "false" });
|
1723
|
-
options.push_back({ "bench", "-npp n0,n1,...", "number of prompt tokens" });
|
1724
|
-
options.push_back({ "bench", "-ntg n0,n1,...", "number of text generation tokens" });
|
1725
|
-
options.push_back({ "bench", "-npl n0,n1,...", "number of parallel prompts" });
|
1726
|
-
|
1727
|
-
options.push_back({ "embedding" });
|
1728
|
-
options.push_back({ "embedding", " --embd-normalize", "normalisation for embendings (default: %d) (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)", params.embd_normalize });
|
1729
|
-
options.push_back({ "embedding", " --embd-output-format", "empty = default, \"array\" = [[],[]...], \"json\" = openai style, \"json+\" = same \"json\" + cosine similarity matrix" });
|
1730
|
-
options.push_back({ "embedding", " --embd-separator", "separator of embendings (default \\n) for example \"<#sep#>\"" });
|
1731
|
-
|
1732
|
-
options.push_back({ "server" });
|
1733
|
-
options.push_back({ "server", " --host HOST", "ip address to listen (default: %s)", params.hostname.c_str() });
|
1734
|
-
options.push_back({ "server", " --port PORT", "port to listen (default: %d)", params.port });
|
1735
|
-
options.push_back({ "server", " --path PATH", "path to serve static files from (default: %s)", params.public_path.c_str() });
|
1736
|
-
options.push_back({ "server", " --embedding(s)", "restrict to only support embedding use case; use only with dedicated embedding models (default: %s)", params.embedding ? "enabled" : "disabled" });
|
1737
|
-
options.push_back({ "server", " --api-key KEY", "API key to use for authentication (default: none)" });
|
1738
|
-
options.push_back({ "server", " --api-key-file FNAME", "path to file containing API keys (default: none)" });
|
1739
|
-
options.push_back({ "server", " --ssl-key-file FNAME", "path to file a PEM-encoded SSL private key" });
|
1740
|
-
options.push_back({ "server", " --ssl-cert-file FNAME", "path to file a PEM-encoded SSL certificate" });
|
1741
|
-
options.push_back({ "server", " --timeout N", "server read/write timeout in seconds (default: %d)", params.timeout_read });
|
1742
|
-
options.push_back({ "server", " --threads-http N", "number of threads used to process HTTP requests (default: %d)", params.n_threads_http });
|
1743
|
-
options.push_back({ "server", " --system-prompt-file FNAME",
|
1744
|
-
"set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications" });
|
1745
|
-
options.push_back({ "server", " --log-format {text,json}",
|
1746
|
-
"log output format: json or text (default: json)" });
|
1747
|
-
options.push_back({ "server", " --metrics", "enable prometheus compatible metrics endpoint (default: %s)", params.endpoint_metrics ? "enabled" : "disabled" });
|
1748
|
-
options.push_back({ "server", " --no-slots", "disables slots monitoring endpoint (default: %s)", params.endpoint_slots ? "enabled" : "disabled" });
|
1749
|
-
options.push_back({ "server", " --slot-save-path PATH", "path to save slot kv cache (default: disabled)" });
|
1750
|
-
options.push_back({ "server", " --chat-template JINJA_TEMPLATE",
|
1751
|
-
"set custom jinja chat template (default: template taken from model's metadata)\n"
|
1752
|
-
"only commonly used templates are accepted:\n"
|
1753
|
-
"https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template" });
|
1754
|
-
options.push_back({ "server", "-sps, --slot-prompt-similarity SIMILARITY",
|
1755
|
-
"how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity });
|
1756
|
-
options.push_back({ "server", " --lora-init-without-apply", "load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: %s)", params.lora_init_without_apply ? "enabled" : "disabled"});
|
1757
|
-
|
1758
|
-
#ifndef LOG_DISABLE_LOGS
|
1759
|
-
options.push_back({ "logging" });
|
1760
|
-
options.push_back({ "*", " --simple-io", "use basic IO for better compatibility in subprocesses and limited consoles" });
|
1761
|
-
options.push_back({ "*", "-ld, --logdir LOGDIR", "path under which to save YAML logs (no logging if unset)" });
|
1762
|
-
options.push_back({ "logging", " --log-test", "Run simple logging test" });
|
1763
|
-
options.push_back({ "logging", " --log-disable", "Disable trace logs" });
|
1764
|
-
options.push_back({ "logging", " --log-enable", "Enable trace logs" });
|
1765
|
-
options.push_back({ "logging", " --log-file FNAME", "Specify a log filename (without extension)" });
|
1766
|
-
options.push_back({ "logging", " --log-new", "Create a separate new log file on start. "
|
1767
|
-
"Each log file will have unique name: \"<name>.<ID>.log\"" });
|
1768
|
-
options.push_back({ "logging", " --log-append", "Don't truncate the old log file." });
|
1769
|
-
#endif // LOG_DISABLE_LOGS
|
1770
|
-
|
1771
|
-
options.push_back({ "cvector" });
|
1772
|
-
options.push_back({ "cvector", "-o, --output FNAME", "output file (default: '%s')", params.cvector_outfile.c_str() });
|
1773
|
-
options.push_back({ "cvector", " --positive-file FNAME", "positive prompts file, one prompt per line (default: '%s')", params.cvector_positive_file.c_str() });
|
1774
|
-
options.push_back({ "cvector", " --negative-file FNAME", "negative prompts file, one prompt per line (default: '%s')", params.cvector_negative_file.c_str() });
|
1775
|
-
options.push_back({ "cvector", " --pca-batch N", "batch size used for PCA. Larger batch runs faster, but uses more memory (default: %d)", params.n_pca_batch });
|
1776
|
-
options.push_back({ "cvector", " --pca-iter N", "number of iterations used for PCA (default: %d)", params.n_pca_iterations });
|
1777
|
-
options.push_back({ "cvector", " --method {pca,mean}", "dimensionality reduction method to be used (default: pca)" });
|
1778
|
-
|
1779
|
-
options.push_back({ "export-lora" });
|
1780
|
-
options.push_back({ "export-lora", "-m, --model", "model path from which to load base model (default '%s')", params.model.c_str() });
|
1781
|
-
options.push_back({ "export-lora", " --lora FNAME", "path to LoRA adapter (can be repeated to use multiple adapters)" });
|
1782
|
-
options.push_back({ "export-lora", " --lora-scaled FNAME S", "path to LoRA adapter with user defined scaling S (can be repeated to use multiple adapters)" });
|
1783
|
-
options.push_back({ "*", "-t, --threads N", "number of threads to use during computation (default: %d)", params.n_threads });
|
1784
|
-
options.push_back({ "export-lora", "-o, --output FNAME", "output file (default: '%s')", params.lora_outfile.c_str() });
|
1785
|
-
|
1786
|
-
printf("usage: %s [options]\n", argv[0]);
|
1787
|
-
|
1788
|
-
for (const auto & o : options) {
|
1789
|
-
if (!o.grp.empty()) {
|
1790
|
-
printf("\n%s:\n\n", o.grp.c_str());
|
1791
|
-
continue;
|
1792
|
-
}
|
1793
|
-
printf(" %-32s", o.args.c_str());
|
1794
|
-
if (o.args.length() > 30) {
|
1795
|
-
printf("\n%34s", "");
|
1796
|
-
}
|
1797
|
-
|
1798
|
-
const auto desc = o.desc;
|
1799
|
-
size_t start = 0;
|
1800
|
-
size_t end = desc.find('\n');
|
1801
|
-
while (end != std::string::npos) {
|
1802
|
-
printf("%s\n%34s", desc.substr(start, end - start).c_str(), "");
|
1803
|
-
start = end + 1;
|
1804
|
-
end = desc.find('\n', start);
|
1805
|
-
}
|
1806
|
-
|
1807
|
-
printf("%s\n", desc.substr(start).c_str());
|
1808
|
-
}
|
1809
|
-
printf("\n");
|
367
|
+
return true;
|
1810
368
|
}
|
1811
369
|
|
1812
370
|
std::string gpt_params_get_system_info(const gpt_params & params) {
|
1813
371
|
std::ostringstream os;
|
1814
372
|
|
1815
|
-
os << "system_info: n_threads = " << params.n_threads;
|
1816
|
-
if (params.n_threads_batch != -1) {
|
1817
|
-
os << " (n_threads_batch = " << params.n_threads_batch << ")";
|
373
|
+
os << "system_info: n_threads = " << params.cpuparams.n_threads;
|
374
|
+
if (params.cpuparams_batch.n_threads != -1) {
|
375
|
+
os << " (n_threads_batch = " << params.cpuparams_batch.n_threads << ")";
|
1818
376
|
}
|
1819
377
|
#if defined(_WIN32) && (_WIN32_WINNT >= 0x0601) && !defined(__MINGW64__) // windows 7 and later
|
1820
378
|
// TODO: windows + arm64 + mingw64
|
@@ -2232,8 +790,9 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
|
|
2232
790
|
llama_lora_adapters_apply(lctx, iparams.lora_adapters);
|
2233
791
|
}
|
2234
792
|
|
2235
|
-
if (params.ignore_eos) {
|
2236
|
-
|
793
|
+
if (params.sparams.ignore_eos && llama_token_eos(model) == -1) {
|
794
|
+
fprintf(stderr, "%s: warning: model does not have an EOS token, ignoring --ignore-eos\n", __func__);
|
795
|
+
params.sparams.ignore_eos = false;
|
2237
796
|
}
|
2238
797
|
|
2239
798
|
if (params.warmup) {
|
@@ -2243,10 +802,15 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
|
|
2243
802
|
llama_token bos = llama_token_bos(model);
|
2244
803
|
llama_token eos = llama_token_eos(model);
|
2245
804
|
// some models (e.g. T5) don't have a BOS token
|
2246
|
-
if (bos != -1) {
|
805
|
+
if (bos != LLAMA_TOKEN_NULL) {
|
2247
806
|
tmp.push_back(bos);
|
2248
807
|
}
|
2249
|
-
|
808
|
+
if (eos != LLAMA_TOKEN_NULL) {
|
809
|
+
tmp.push_back(eos);
|
810
|
+
}
|
811
|
+
if (tmp.empty()) {
|
812
|
+
tmp.push_back(0);
|
813
|
+
}
|
2250
814
|
|
2251
815
|
if (llama_model_has_encoder(model)) {
|
2252
816
|
llama_encode(lctx, llama_batch_get_one(tmp.data(), tmp.size(), 0, 0));
|
@@ -2262,7 +826,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
|
|
2262
826
|
}
|
2263
827
|
llama_kv_cache_clear(lctx);
|
2264
828
|
llama_synchronize(lctx);
|
2265
|
-
|
829
|
+
llama_perf_context_reset(lctx);
|
2266
830
|
}
|
2267
831
|
|
2268
832
|
iparams.model = model;
|
@@ -2339,9 +903,9 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
|
|
2339
903
|
cparams.n_seq_max = params.n_parallel;
|
2340
904
|
cparams.n_batch = params.n_batch;
|
2341
905
|
cparams.n_ubatch = params.n_ubatch;
|
2342
|
-
cparams.n_threads = params.n_threads;
|
2343
|
-
cparams.n_threads_batch = params.
|
2344
|
-
|
906
|
+
cparams.n_threads = params.cpuparams.n_threads;
|
907
|
+
cparams.n_threads_batch = params.cpuparams_batch.n_threads == -1 ?
|
908
|
+
params.cpuparams.n_threads : params.cpuparams_batch.n_threads;
|
2345
909
|
cparams.logits_all = params.logits_all;
|
2346
910
|
cparams.embeddings = params.embedding;
|
2347
911
|
cparams.rope_scaling_type = params.rope_scaling_type;
|
@@ -2359,6 +923,7 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
|
|
2359
923
|
cparams.cb_eval_user_data = params.cb_eval_user_data;
|
2360
924
|
cparams.offload_kqv = !params.no_kv_offload;
|
2361
925
|
cparams.flash_attn = params.flash_attn;
|
926
|
+
cparams.no_perf = params.no_perf;
|
2362
927
|
|
2363
928
|
cparams.type_k = kv_cache_type_from_str(params.cache_type_k);
|
2364
929
|
cparams.type_v = kv_cache_type_from_str(params.cache_type_v);
|
@@ -2366,13 +931,55 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
|
|
2366
931
|
return cparams;
|
2367
932
|
}
|
2368
933
|
|
934
|
+
struct lm_ggml_threadpool_params lm_ggml_threadpool_params_from_cpu_params(const cpu_params & params) {
|
935
|
+
struct lm_ggml_threadpool_params tpp;
|
936
|
+
|
937
|
+
lm_ggml_threadpool_params_init(&tpp, params.n_threads); // setup the defaults
|
938
|
+
|
939
|
+
if (params.mask_valid) {
|
940
|
+
std::memcpy(&tpp.cpumask, ¶ms.cpumask, LM_GGML_MAX_N_THREADS);
|
941
|
+
}
|
942
|
+
|
943
|
+
tpp.prio = params.priority;
|
944
|
+
tpp.poll = params.poll;
|
945
|
+
tpp.strict_cpu = params.strict_cpu;
|
946
|
+
|
947
|
+
return tpp;
|
948
|
+
}
|
949
|
+
|
2369
950
|
#ifdef LLAMA_USE_CURL
|
2370
951
|
|
952
|
+
#define CURL_MAX_RETRY 3
|
953
|
+
#define CURL_RETRY_DELAY_SECONDS 2
|
954
|
+
|
955
|
+
|
2371
956
|
// Returns true when `str` begins with `prefix` (an empty prefix always matches).
// Stand-in until C++20's std::string::starts_with can be used.
static bool starts_with(const std::string & str, const std::string & prefix) {
    // compare() clamps the length to what is left in `str`, so a `str`
    // shorter than `prefix` compares unequal instead of reading out of range.
    const int diff = str.compare(0, prefix.size(), prefix);
    return diff == 0;
}
|
2375
960
|
|
961
|
+
static bool curl_perform_with_retry(const std::string& url, CURL* curl, int max_attempts, int retry_delay_seconds) {
|
962
|
+
int remaining_attempts = max_attempts;
|
963
|
+
|
964
|
+
while (remaining_attempts > 0) {
|
965
|
+
fprintf(stderr, "%s: Trying to download from %s (attempt %d of %d)...\n", __func__ , url.c_str(), max_attempts - remaining_attempts + 1, max_attempts);
|
966
|
+
|
967
|
+
CURLcode res = curl_easy_perform(curl);
|
968
|
+
if (res == CURLE_OK) {
|
969
|
+
return true;
|
970
|
+
}
|
971
|
+
|
972
|
+
int exponential_backoff_delay = std::pow(retry_delay_seconds, max_attempts - remaining_attempts) * 1000;
|
973
|
+
fprintf(stderr, "%s: curl_easy_perform() failed: %s, retrying after %d milliseconds...\n", __func__, curl_easy_strerror(res), exponential_backoff_delay);
|
974
|
+
|
975
|
+
remaining_attempts--;
|
976
|
+
std::this_thread::sleep_for(std::chrono::milliseconds(exponential_backoff_delay));
|
977
|
+
}
|
978
|
+
|
979
|
+
fprintf(stderr, "%s: curl_easy_perform() failed after %d attempts\n", __func__, max_attempts);
|
980
|
+
return false;
|
981
|
+
}
|
982
|
+
|
2376
983
|
static bool llama_download_file(const std::string & url, const std::string & path, const std::string & hf_token) {
|
2377
984
|
|
2378
985
|
// Initialize libcurl
|
@@ -2476,9 +1083,8 @@ static bool llama_download_file(const std::string & url, const std::string & pat
|
|
2476
1083
|
curl_easy_setopt(curl.get(), CURLOPT_HEADERFUNCTION, static_cast<CURLOPT_HEADERFUNCTION_PTR>(header_callback));
|
2477
1084
|
curl_easy_setopt(curl.get(), CURLOPT_HEADERDATA, &headers);
|
2478
1085
|
|
2479
|
-
|
2480
|
-
if (
|
2481
|
-
fprintf(stderr, "%s: curl_easy_perform() failed: %s\n", __func__, curl_easy_strerror(res));
|
1086
|
+
bool was_perform_successful = curl_perform_with_retry(url, curl.get(), CURL_MAX_RETRY, CURL_RETRY_DELAY_SECONDS);
|
1087
|
+
if (!was_perform_successful) {
|
2482
1088
|
return false;
|
2483
1089
|
}
|
2484
1090
|
|
@@ -2553,11 +1159,10 @@ static bool llama_download_file(const std::string & url, const std::string & pat
|
|
2553
1159
|
};
|
2554
1160
|
|
2555
1161
|
// start the download
|
2556
|
-
fprintf(stderr, "%s:
|
2557
|
-
|
2558
|
-
|
2559
|
-
if (
|
2560
|
-
fprintf(stderr, "%s: curl_easy_perform() failed: %s\n", __func__, curl_easy_strerror(res));
|
1162
|
+
fprintf(stderr, "%s: trying to download model from %s to %s (server_etag:%s, server_last_modified:%s)...\n", __func__,
|
1163
|
+
llama_download_hide_password_in_url(url).c_str(), path.c_str(), headers.etag.c_str(), headers.last_modified.c_str());
|
1164
|
+
bool was_perform_successful = curl_perform_with_retry(url, curl.get(), CURL_MAX_RETRY, CURL_RETRY_DELAY_SECONDS);
|
1165
|
+
if (!was_perform_successful) {
|
2561
1166
|
return false;
|
2562
1167
|
}
|
2563
1168
|
|
@@ -3211,7 +1816,7 @@ void yaml_dump_string_multiline(FILE * stream, const char * prop_name, const cha
|
|
3211
1816
|
|
3212
1817
|
void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const llama_context * lctx,
|
3213
1818
|
const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc) {
|
3214
|
-
const
|
1819
|
+
const auto & sparams = params.sparams;
|
3215
1820
|
|
3216
1821
|
fprintf(stream, "build_commit: %s\n", LLAMA_COMMIT);
|
3217
1822
|
fprintf(stream, "build_number: %d\n", LLAMA_BUILD_NUMBER);
|
@@ -3231,6 +1836,7 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l
|
|
3231
1836
|
fprintf(stream, "cpu_has_sve: %s\n", lm_ggml_cpu_has_sve() ? "true" : "false");
|
3232
1837
|
fprintf(stream, "cpu_has_f16c: %s\n", lm_ggml_cpu_has_f16c() ? "true" : "false");
|
3233
1838
|
fprintf(stream, "cpu_has_fp16_va: %s\n", lm_ggml_cpu_has_fp16_va() ? "true" : "false");
|
1839
|
+
fprintf(stream, "cpu_has_riscv_v: %s\n", lm_ggml_cpu_has_riscv_v() ? "true" : "false");
|
3234
1840
|
fprintf(stream, "cpu_has_wasm_simd: %s\n", lm_ggml_cpu_has_wasm_simd() ? "true" : "false");
|
3235
1841
|
fprintf(stream, "cpu_has_blas: %s\n", lm_ggml_cpu_has_blas() ? "true" : "false");
|
3236
1842
|
fprintf(stream, "cpu_has_sse3: %s\n", lm_ggml_cpu_has_sse3() ? "true" : "false");
|
@@ -3262,8 +1868,6 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l
|
|
3262
1868
|
|
3263
1869
|
fprintf(stream, "alias: %s # default: unknown\n", params.model_alias.c_str());
|
3264
1870
|
fprintf(stream, "batch_size: %d # default: 512\n", params.n_batch);
|
3265
|
-
yaml_dump_string_multiline(stream, "cfg_negative_prompt", sparams.cfg_negative_prompt.c_str());
|
3266
|
-
fprintf(stream, "cfg_scale: %f # default: 1.0\n", sparams.cfg_scale);
|
3267
1871
|
fprintf(stream, "chunks: %d # default: -1 (unlimited)\n", params.n_chunks);
|
3268
1872
|
fprintf(stream, "color: %s # default: false\n", params.use_color ? "true" : "false");
|
3269
1873
|
fprintf(stream, "ctx_size: %d # default: 512\n", params.n_ctx);
|
@@ -3274,10 +1878,7 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l
|
|
3274
1878
|
fprintf(stream, "grammar-file: # never logged, see grammar instead. Can still be specified for input.\n");
|
3275
1879
|
fprintf(stream, "hellaswag: %s # default: false\n", params.hellaswag ? "true" : "false");
|
3276
1880
|
fprintf(stream, "hellaswag_tasks: %zu # default: 400\n", params.hellaswag_tasks);
|
3277
|
-
|
3278
|
-
const auto logit_bias_eos = sparams.logit_bias.find(llama_token_eos(llama_get_model(lctx)));
|
3279
|
-
const bool ignore_eos = logit_bias_eos != sparams.logit_bias.end() && logit_bias_eos->second == -INFINITY;
|
3280
|
-
fprintf(stream, "ignore_eos: %s # default: false\n", ignore_eos ? "true" : "false");
|
1881
|
+
fprintf(stream, "ignore_eos: %s # default: false\n", sparams.ignore_eos ? "true" : "false");
|
3281
1882
|
|
3282
1883
|
yaml_dump_string_multiline(stream, "in_prefix", params.input_prefix.c_str());
|
3283
1884
|
fprintf(stream, "in_prefix_bos: %s # default: false\n", params.input_prefix_bos ? "true" : "false");
|
@@ -3288,11 +1889,8 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l
|
|
3288
1889
|
fprintf(stream, "logdir: %s # default: unset (no logging)\n", params.logdir.c_str());
|
3289
1890
|
|
3290
1891
|
fprintf(stream, "logit_bias:\n");
|
3291
|
-
for (
|
3292
|
-
|
3293
|
-
continue;
|
3294
|
-
}
|
3295
|
-
fprintf(stream, " %d: %f", lb.first, lb.second);
|
1892
|
+
for (const auto & logit_bias : sparams.logit_bias) {
|
1893
|
+
fprintf(stream, " %d: %f", logit_bias.token, logit_bias.bias);
|
3296
1894
|
}
|
3297
1895
|
|
3298
1896
|
fprintf(stream, "lora:\n");
|
@@ -3345,7 +1943,6 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l
|
|
3345
1943
|
|
3346
1944
|
fprintf(stream, "rope_freq_base: %f # default: 10000.0\n", params.rope_freq_base);
|
3347
1945
|
fprintf(stream, "rope_freq_scale: %f # default: 1.0\n", params.rope_freq_scale);
|
3348
|
-
fprintf(stream, "seed: %u # default: -1 (random seed)\n", params.seed);
|
3349
1946
|
fprintf(stream, "simple_io: %s # default: false\n", params.simple_io ? "true" : "false");
|
3350
1947
|
fprintf(stream, "cont_batching: %s # default: false\n", params.cont_batching ? "true" : "false");
|
3351
1948
|
fprintf(stream, "flash_attn: %s # default: false\n", params.flash_attn ? "true" : "false");
|
@@ -3355,11 +1952,11 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l
|
|
3355
1952
|
yaml_dump_vector_float(stream, "tensor_split", tensor_split_vector);
|
3356
1953
|
|
3357
1954
|
fprintf(stream, "tfs: %f # default: 1.0\n", sparams.tfs_z);
|
3358
|
-
fprintf(stream, "threads: %d # default: %u\n", params.n_threads, std::thread::hardware_concurrency());
|
1955
|
+
fprintf(stream, "threads: %d # default: %u\n", params.cpuparams.n_threads, std::thread::hardware_concurrency());
|
3359
1956
|
fprintf(stream, "top_k: %d # default: 40\n", sparams.top_k);
|
3360
1957
|
fprintf(stream, "top_p: %f # default: 0.95\n", sparams.top_p);
|
3361
1958
|
fprintf(stream, "min_p: %f # default: 0.0\n", sparams.min_p);
|
3362
|
-
fprintf(stream, "
|
1959
|
+
fprintf(stream, "typ_p: %f # default: 1.0\n", sparams.typ_p);
|
3363
1960
|
fprintf(stream, "verbose_prompt: %s # default: false\n", params.verbose_prompt ? "true" : "false");
|
3364
1961
|
fprintf(stream, "display_prompt: %s # default: true\n", params.display_prompt ? "true" : "false");
|
3365
1962
|
}
|