@fugood/llama.node 1.4.7 → 1.4.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70)
  1. package/lib/binding.ts +8 -0
  2. package/package.json +15 -15
  3. package/scripts/llama.cpp.patch +23 -24
  4. package/src/LlamaContext.cpp +4 -2
  5. package/src/llama.cpp/common/CMakeLists.txt +2 -0
  6. package/src/llama.cpp/common/arg.cpp +470 -223
  7. package/src/llama.cpp/common/arg.h +43 -2
  8. package/src/llama.cpp/common/chat-peg-parser.cpp +16 -2
  9. package/src/llama.cpp/common/chat.cpp +140 -0
  10. package/src/llama.cpp/common/common.cpp +130 -67
  11. package/src/llama.cpp/common/common.h +44 -17
  12. package/src/llama.cpp/common/console.cpp +98 -18
  13. package/src/llama.cpp/common/console.h +30 -8
  14. package/src/llama.cpp/common/download.cpp +69 -25
  15. package/src/llama.cpp/common/json-schema-to-grammar.cpp +132 -3
  16. package/src/llama.cpp/common/json-schema-to-grammar.h +20 -0
  17. package/src/llama.cpp/common/log.cpp +5 -0
  18. package/src/llama.cpp/common/log.h +1 -0
  19. package/src/llama.cpp/common/peg-parser.cpp +1 -1
  20. package/src/llama.cpp/common/preset.cpp +206 -0
  21. package/src/llama.cpp/common/preset.h +32 -0
  22. package/src/llama.cpp/common/sampling.cpp +67 -54
  23. package/src/llama.cpp/common/sampling.h +8 -0
  24. package/src/llama.cpp/ggml/CMakeLists.txt +4 -0
  25. package/src/llama.cpp/ggml/include/ggml-alloc.h +9 -0
  26. package/src/llama.cpp/ggml/include/ggml-backend.h +1 -0
  27. package/src/llama.cpp/ggml/include/ggml-cpu.h +1 -0
  28. package/src/llama.cpp/ggml/include/ggml.h +7 -8
  29. package/src/llama.cpp/ggml/src/CMakeLists.txt +3 -0
  30. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +4 -0
  31. package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +285 -0
  32. package/src/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +28 -0
  33. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +111 -45
  34. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +4 -0
  35. package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +288 -1
  36. package/src/llama.cpp/ggml/src/ggml-cpu/repack.h +8 -0
  37. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +41 -1
  38. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +125 -22
  39. package/src/llama.cpp/include/llama.h +18 -1
  40. package/src/llama.cpp/src/llama-arch.cpp +1890 -2248
  41. package/src/llama.cpp/src/llama-arch.h +9 -2
  42. package/src/llama.cpp/src/llama-batch.cpp +12 -2
  43. package/src/llama.cpp/src/llama-batch.h +4 -2
  44. package/src/llama.cpp/src/llama-context.cpp +93 -23
  45. package/src/llama.cpp/src/llama-context.h +8 -2
  46. package/src/llama.cpp/src/llama-graph.cpp +84 -16
  47. package/src/llama.cpp/src/llama-graph.h +17 -4
  48. package/src/llama.cpp/src/llama-hparams.cpp +6 -0
  49. package/src/llama.cpp/src/llama-hparams.h +5 -1
  50. package/src/llama.cpp/src/llama-impl.cpp +4 -0
  51. package/src/llama.cpp/src/llama-kv-cache.cpp +90 -42
  52. package/src/llama.cpp/src/llama-kv-cache.h +19 -2
  53. package/src/llama.cpp/src/llama-memory-hybrid.cpp +1 -1
  54. package/src/llama.cpp/src/llama-mmap.cpp +123 -28
  55. package/src/llama.cpp/src/llama-mmap.h +5 -1
  56. package/src/llama.cpp/src/llama-model-loader.cpp +58 -13
  57. package/src/llama.cpp/src/llama-model-loader.h +2 -0
  58. package/src/llama.cpp/src/llama-model.cpp +110 -49
  59. package/src/llama.cpp/src/llama-model.h +1 -0
  60. package/src/llama.cpp/src/llama-quant.cpp +1 -1
  61. package/src/llama.cpp/src/llama-sampling.cpp +16 -0
  62. package/src/llama.cpp/src/llama-vocab.cpp +2 -1
  63. package/src/llama.cpp/src/llama.cpp +665 -1
  64. package/src/llama.cpp/src/models/deepseek2.cpp +9 -5
  65. package/src/llama.cpp/src/models/glm4-moe.cpp +28 -11
  66. package/src/llama.cpp/src/models/glm4.cpp +27 -4
  67. package/src/llama.cpp/src/models/models.h +5 -5
  68. package/src/llama.cpp/src/models/nemotron-h.cpp +35 -6
  69. package/src/llama.cpp/src/models/qwen2.cpp +12 -3
  70. package/src/llama.cpp/src/models/qwen3next.cpp +81 -266
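Most of the churn below is in `package/src/llama.cpp/common/arg.cpp` (see the hunks that follow): boolean options are now registered as paired positive/negative flags (`args` / `args_neg`) dispatched through a single `handler_bool`, legacy `LLAMA_ARG_NO_*` environment variables are still honored as a falsey fallback, and repeatable options move to comma-separated values. The following is a minimal, self-contained sketch of that paired-flag idea only; `toy_arg`, `resolve_flag`, and the truthy-value list are invented for illustration and are not the llama.cpp `common_arg` API.

```cpp
// Illustrative sketch: a from-scratch reimplementation of the paired
// positive/negative flag pattern visible in the arg.cpp diff below.
// The names here (toy_arg, resolve_flag) are hypothetical, not the real API.
#include <cstdlib>
#include <iostream>
#include <string>
#include <vector>

struct toy_arg {
    std::vector<std::string> args;      // e.g. {"--mmap"}
    std::vector<std::string> args_neg;  // e.g. {"--no-mmap"}
    std::string env;                    // e.g. "LLAMA_ARG_MMAP"
};

// Returns true/false if the flag was set on the command line or via env,
// otherwise returns the provided default.
static bool resolve_flag(const toy_arg & def, const std::vector<std::string> & argv, bool def_value) {
    // command line wins: the positive form sets true, the negative form sets false
    for (const auto & a : argv) {
        for (const auto & pos : def.args)     if (a == pos) return true;
        for (const auto & neg : def.args_neg) if (a == neg) return false;
    }
    // environment fallback: a legacy LLAMA_ARG_NO_* variable is treated as "false",
    // otherwise LLAMA_ARG_* is parsed as a truthy/falsey value
    std::string neg_env = def.env;
    const std::string pre = "LLAMA_ARG_";
    if (neg_env.rfind(pre, 0) == 0) neg_env.insert(pre.size(), "NO_");
    if (std::getenv(neg_env.c_str())) return false;
    if (const char * v = std::getenv(def.env.c_str())) {
        const std::string s = v;
        return s == "on" || s == "enabled" || s == "true" || s == "1";
    }
    return def_value;
}

int main(int argc, char ** argv) {
    const toy_arg mmap_arg = { {"--mmap"}, {"--no-mmap"}, "LLAMA_ARG_MMAP" };
    const std::vector<std::string> args(argv + 1, argv + argc);
    std::cout << "use_mmap = " << std::boolalpha << resolve_flag(mmap_arg, args, /*default*/ true) << "\n";
}
```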
@@ -20,6 +20,7 @@
  #include <nlohmann/json.hpp>

  #include <algorithm>
+ #include <cinttypes>
  #include <climits>
  #include <cstdarg>
  #include <fstream>
@@ -47,10 +48,12 @@
  #define LLAMA_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083

  using json = nlohmann::ordered_json;
+ using namespace common_arg_utils;

  static std::initializer_list<enum llama_example> mmproj_examples = {
  LLAMA_EXAMPLE_MTMD,
  LLAMA_EXAMPLE_SERVER,
+ LLAMA_EXAMPLE_CLI,
  };

  static std::string read_file(const std::string & fname) {
@@ -63,6 +66,15 @@ static std::string read_file(const std::string & fname) {
  return content;
  }

+ static const std::vector<common_arg> & get_common_arg_defs() {
+ static const std::vector<common_arg> options = [] {
+ common_params params;
+ auto ctx = common_params_parser_init(params, LLAMA_EXAMPLE_SERVER, nullptr);
+ return ctx.options;
+ }();
+ return options;
+ }
+
  common_arg & common_arg::set_examples(std::initializer_list<enum llama_example> examples) {
  this->examples = examples;
  return *this;
@@ -94,6 +106,16 @@ bool common_arg::is_exclude(enum llama_example ex) {

  bool common_arg::get_value_from_env(std::string & output) const {
  if (env == nullptr) return false;
+ if (!args_neg.empty()) {
+ // for compatibility, we need to check LLAMA_ARG_NO_ env as well
+ std::string neg_env = env;
+ string_replace_all(neg_env, "LLAMA_ARG_", "LLAMA_ARG_NO_");
+ char * neg_value = std::getenv(neg_env.c_str());
+ if (neg_value) {
+ output = "0"; // falsey
+ return true;
+ }
+ }
  char * value = std::getenv(env);
  if (value) {
  output = value;
@@ -103,6 +125,14 @@ bool common_arg::get_value_from_env(std::string & output) const {
  }

  bool common_arg::has_value_from_env() const {
+ if (env != nullptr && !args_neg.empty()) {
+ // for compatibility, we need to check LLAMA_ARG_NO_ env as well
+ std::string neg_env = env;
+ string_replace_all(neg_env, "LLAMA_ARG_", "LLAMA_ARG_NO_");
+ if (std::getenv(neg_env.c_str())) {
+ return true;
+ }
+ }
  return env != nullptr && std::getenv(env);
  }

@@ -133,16 +163,17 @@ static std::vector<std::string> break_str_into_lines(std::string input, size_t m
  return result;
  }

- std::string common_arg::to_string() {
+ std::string common_arg::to_string() const {
  // params for printing to console
  const static int n_leading_spaces = 40;
  const static int n_char_per_line_help = 70; // TODO: detect this based on current console
  std::string leading_spaces(n_leading_spaces, ' ');

  std::ostringstream ss;
- for (const auto arg : args) {
- if (arg == args.front()) {
- if (args.size() == 1) {
+ auto all_args = get_args(); // also contains args_neg
+ for (const auto & arg : all_args) {
+ if (arg == all_args.front()) {
+ if (all_args.size() == 1) {
  ss << arg;
  } else {
  // first arg is usually abbreviation, we need padding to make it more beautiful
@@ -151,7 +182,7 @@ std::string common_arg::to_string() {
  ss << tmp << spaces;
  }
  } else {
- ss << arg << (arg != args.back() ? ", " : "");
+ ss << arg << (arg != all_args.back() ? ", " : "");
  }
  }
  if (value_hint) ss << " " << value_hint;
@@ -170,6 +201,31 @@ std::string common_arg::to_string() {
  return ss.str();
  }

+ std::vector<std::string> common_arg::get_args() const {
+ std::vector<std::string> result;
+ for (const auto & arg : args) {
+ result.push_back(std::string(arg));
+ }
+ for (const auto & arg : args_neg) {
+ result.push_back(std::string(arg));
+ }
+ return result;
+ }
+
+ std::vector<std::string> common_arg::get_env() const {
+ std::vector<std::string> result;
+ if (env) {
+ result.push_back(std::string(env));
+ }
+ if (!args_neg.empty() && env) {
+ // for compatibility, we need to add LLAMA_ARG_NO_ variant
+ std::string neg_env = env;
+ string_replace_all(neg_env, "LLAMA_ARG_", "LLAMA_ARG_NO_");
+ result.push_back(neg_env);
+ }
+ return result;
+ }
+
  //
  // utils
  //
@@ -305,6 +361,16 @@ static std::string get_all_kv_cache_types() {
  return msg.str();
  }

+ static bool parse_bool_value(const std::string & value) {
+ if (is_truthy(value)) {
+ return true;
+ } else if (is_falsey(value)) {
+ return false;
+ } else {
+ throw std::invalid_argument("invalid boolean value");
+ }
+ }
+
  //
  // CLI argument parsing functions
  //
@@ -312,10 +378,13 @@ static std::string get_all_kv_cache_types() {
  static bool common_params_parse_ex(int argc, char ** argv, common_params_context & ctx_arg) {
  common_params & params = ctx_arg.params;

- std::unordered_map<std::string, common_arg *> arg_to_options;
+ std::unordered_map<std::string, std::pair<common_arg *, bool>> arg_to_options;
  for (auto & opt : ctx_arg.options) {
  for (const auto & arg : opt.args) {
- arg_to_options[arg] = &opt;
+ arg_to_options[arg] = {&opt, /* is_positive */ true};
+ }
+ for (const auto & arg : opt.args_neg) {
+ arg_to_options[arg] = {&opt, /* is_positive */ false};
  }
  }

@@ -324,12 +393,15 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
  std::string value;
  if (opt.get_value_from_env(value)) {
  try {
- if (opt.handler_void && (value == "1" || value == "true")) {
+ if (opt.handler_void && is_truthy(value)) {
  opt.handler_void(params);
  }
  if (opt.handler_int) {
  opt.handler_int(params, std::stoi(value));
  }
+ if (opt.handler_bool) {
+ opt.handler_bool(params, parse_bool_value(value));
+ }
  if (opt.handler_string) {
  opt.handler_string(params, value);
  continue;
@@ -348,6 +420,8 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
  }
  };

+ std::set<std::string> seen_args;
+
  for (int i = 1; i < argc; i++) {
  const std::string arg_prefix = "--";

@@ -358,7 +432,12 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
  if (arg_to_options.find(arg) == arg_to_options.end()) {
  throw std::invalid_argument(string_format("error: invalid argument: %s", arg.c_str()));
  }
- auto opt = *arg_to_options[arg];
+ if (!seen_args.insert(arg).second) {
+ LOG_WRN("DEPRECATED: argument '%s' specified multiple times, use comma-separated values instead (only last value will be used)\n", arg.c_str());
+ }
+ auto & tmp = arg_to_options[arg];
+ auto opt = *tmp.first;
+ bool is_positive = tmp.second;
  if (opt.has_value_from_env()) {
  fprintf(stderr, "warn: %s environment variable is set, but will be overwritten by command line argument %s\n", opt.env, arg.c_str());
  }
@@ -367,6 +446,10 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
  opt.handler_void(params);
  continue;
  }
+ if (opt.handler_bool) {
+ opt.handler_bool(params, is_positive);
+ continue;
+ }

  // arg with single value
  check_arg(i);
@@ -391,7 +474,7 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
  throw std::invalid_argument(string_format(
  "error while handling argument \"%s\": %s\n\n"
  "usage:\n%s\n\nto show complete usage, run with -h",
- arg.c_str(), e.what(), arg_to_options[arg]->to_string().c_str()));
+ arg.c_str(), e.what(), opt.to_string().c_str()));
  }
  }

@@ -427,7 +510,7 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context

  // model is required (except for server)
  // TODO @ngxson : maybe show a list of available models in CLI in this case
- if (params.model.path.empty() && ctx_arg.ex != LLAMA_EXAMPLE_SERVER && !params.usage) {
+ if (params.model.path.empty() && ctx_arg.ex != LLAMA_EXAMPLE_SERVER && !params.usage && !params.completion) {
  throw std::invalid_argument("error: --model is required\n");
  }

@@ -452,7 +535,9 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
  params.kv_overrides.back().key[0] = 0;
  }

- if (!params.tensor_buft_overrides.empty()) {
+ // pad tensor_buft_overrides for llama_params_fit:
+ const size_t ntbo = llama_max_tensor_buft_overrides();
+ while (params.tensor_buft_overrides.size() < ntbo) {
  params.tensor_buft_overrides.push_back({nullptr, nullptr});
  }

@@ -468,6 +553,8 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
  ));
  }

+ common_log_set_verbosity_thold(params.verbosity);
+
  return true;
  }

@@ -560,6 +647,7 @@ static void common_params_print_completion(common_params_context & ctx_arg) {
  "llama-batched-bench",
  "llama-bench",
  "llama-cli",
+ "llama-completion",
  "llama-convert-llama2c-to-ggml",
  "llama-cvector-generator",
  "llama-embedding",
@@ -644,6 +732,61 @@ static void add_rpc_devices(const std::string & servers) {
  }
  }

+ bool common_params_to_map(int argc, char ** argv, llama_example ex, std::map<common_arg, std::string> & out_map) {
+ common_params dummy_params;
+ common_params_context ctx_arg = common_params_parser_init(dummy_params, ex, nullptr);
+
+ std::unordered_map<std::string, common_arg *> arg_to_options;
+ for (auto & opt : ctx_arg.options) {
+ for (const auto & arg : opt.args) {
+ arg_to_options[arg] = &opt;
+ }
+ for (const auto & arg : opt.args_neg) {
+ arg_to_options[arg] = &opt;
+ }
+ }
+
+ // TODO @ngxson : find a way to deduplicate this code
+
+ // handle command line arguments
+ auto check_arg = [&](int i) {
+ if (i+1 >= argc) {
+ throw std::invalid_argument("expected value for argument");
+ }
+ };
+
+ std::set<std::string> seen_args;
+
+ for (int i = 1; i < argc; i++) {
+ const std::string arg_prefix = "--";
+
+ std::string arg = argv[i];
+ if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
+ std::replace(arg.begin(), arg.end(), '_', '-');
+ }
+ if (arg_to_options.find(arg) == arg_to_options.end()) {
+ throw std::invalid_argument(string_format("error: invalid argument: %s", arg.c_str()));
+ }
+ if (!seen_args.insert(arg).second) {
+ LOG_WRN("DEPRECATED: argument '%s' specified multiple times, use comma-separated values instead (only last value will be used)\n", arg.c_str());
+ }
+ auto opt = *arg_to_options[arg];
+ std::string val;
+ if (opt.value_hint != nullptr) {
+ // arg with single value
+ check_arg(i);
+ val = argv[++i];
+ }
+ if (opt.value_hint_2 != nullptr) {
+ // TODO: support arg with 2 values
+ throw std::invalid_argument("error: argument with 2 values is not yet supported\n");
+ }
+ out_map[opt] = val;
+ }
+
+ return true;
+ }
+
  bool common_params_parse(int argc, char ** argv, common_params & params, llama_example ex, void(*print_usage)(int, char **)) {
  auto ctx_arg = common_params_parser_init(params, ex, print_usage);
  const common_params params_org = ctx_arg.params; // the example can modify the default params
@@ -689,23 +832,30 @@ static std::string list_builtin_chat_templates() {
  return msg.str();
  }

- static bool is_truthy(const std::string & value) {
- return value == "on" || value == "enabled" || value == "1";
+ bool common_arg_utils::is_truthy(const std::string & value) {
+ return value == "on" || value == "enabled" || value == "true" || value == "1";
  }

- static bool is_falsey(const std::string & value) {
- return value == "off" || value == "disabled" || value == "0";
+ bool common_arg_utils::is_falsey(const std::string & value) {
+ return value == "off" || value == "disabled" || value == "false" || value == "0";
  }

- static bool is_autoy(const std::string & value) {
+ bool common_arg_utils::is_autoy(const std::string & value) {
  return value == "auto" || value == "-1";
  }

  common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **)) {
- // default values specific to example
- // note: we place it here instead of inside server.cpp to allow llama-gen-docs to pick it up
- if (ex == LLAMA_EXAMPLE_SERVER) {
- params.use_jinja = true;
+ // per-example default params
+ // we define here to make sure it's included in llama-gen-docs
+ if (ex == LLAMA_EXAMPLE_COMPLETION) {
+ params.use_jinja = false; // disable jinja by default
+
+ } else if (ex == LLAMA_EXAMPLE_MTMD) {
+ params.use_jinja = false; // disable jinja by default
+ params.sampling.temp = 0.2; // lower temp by default for better quality
+
+ } else if (ex == LLAMA_EXAMPLE_SERVER) {
+ params.n_parallel = -1; // auto by default
  }

  params.use_color = tty_can_use_colors();
@@ -723,7 +873,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  sampler_type_chars += common_sampler_type_to_chr(sampler);
  sampler_type_names += common_sampler_type_to_str(sampler) + ";";
  }
- sampler_type_names.pop_back();
+ if (!sampler_type_names.empty()) {
+ sampler_type_names.pop_back(); // remove last semicolon
+ }


  /**
@@ -785,12 +937,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  }
  ));
  add_opt(common_arg(
+ {"--display-prompt"},
  {"--no-display-prompt"},
- string_format("don't print prompt at generation (default: %s)", !params.display_prompt ? "true" : "false"),
- [](common_params & params) {
- params.display_prompt = false;
+ string_format("whether to print prompt at generation (default: %s)", params.display_prompt ? "true" : "false"),
+ [](common_params & params, bool value) {
+ params.display_prompt = value;
  }
- ).set_examples({LLAMA_EXAMPLE_MAIN}));
+ ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}));
  add_opt(common_arg(
  {"-co", "--color"}, "[on|off|auto]",
  "Colorize output to distinguish prompt and user input from generations ('on', 'off', or 'auto', default: 'auto')\n"
@@ -807,7 +960,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  string_format("error: unknown value for --color: '%s'\n", value.c_str()));
  }
  }
- ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP}));
+ ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP}));
  add_opt(common_arg(
  {"-t", "--threads"}, "N",
  string_format("number of CPU threads to use during generation (default: %d)", params.cpuparams.n_threads),
@@ -940,7 +1093,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  add_opt(common_arg(
  {"-n", "--predict", "--n-predict"}, "N",
  string_format(
- ex == LLAMA_EXAMPLE_MAIN
+ ex == LLAMA_EXAMPLE_COMPLETION
  ? "number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)"
  : "number of tokens to predict (default: %d, -1 = infinity)",
  params.n_predict),
@@ -979,42 +1132,35 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  ).set_env("LLAMA_ARG_SWA_FULL"));
  add_opt(common_arg(
  {"--ctx-checkpoints", "--swa-checkpoints"}, "N",
- string_format("max number of context checkpoints to create per slot (default: %d)\n"
+ string_format("max number of context checkpoints to create per slot (default: %d)"
  "[(more info)](https://github.com/ggml-org/llama.cpp/pull/15293)", params.n_ctx_checkpoints),
  [](common_params & params, int value) {
  params.n_ctx_checkpoints = value;
  }
- ).set_env("LLAMA_ARG_CTX_CHECKPOINTS").set_examples({LLAMA_EXAMPLE_SERVER}));
+ ).set_env("LLAMA_ARG_CTX_CHECKPOINTS").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
  add_opt(common_arg(
  {"--cache-ram", "-cram"}, "N",
- string_format("set the maximum cache size in MiB (default: %d, -1 - no limit, 0 - disable)\n"
+ string_format("set the maximum cache size in MiB (default: %d, -1 - no limit, 0 - disable)"
  "[(more info)](https://github.com/ggml-org/llama.cpp/pull/16391)", params.cache_ram_mib),
  [](common_params & params, int value) {
  params.cache_ram_mib = value;
  }
- ).set_env("LLAMA_ARG_CACHE_RAM").set_examples({LLAMA_EXAMPLE_SERVER}));
+ ).set_env("LLAMA_ARG_CACHE_RAM").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
  add_opt(common_arg(
  {"--kv-unified", "-kvu"},
- string_format("use single unified KV buffer for the KV cache of all sequences (default: %s)\n"
- "[(more info)](https://github.com/ggml-org/llama.cpp/pull/14363)", params.kv_unified ? "true" : "false"),
+ "use single unified KV buffer shared across all sequences (default: enabled if number of slots is auto)",
  [](common_params & params) {
  params.kv_unified = true;
  }
- ).set_env("LLAMA_ARG_KV_UNIFIED"));
- add_opt(common_arg(
- {"--no-context-shift"},
- string_format("disables context shift on infinite text generation (default: %s)", params.ctx_shift ? "disabled" : "enabled"),
- [](common_params & params) {
- params.ctx_shift = false;
- }
- ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY}).set_env("LLAMA_ARG_NO_CONTEXT_SHIFT"));
+ ).set_env("LLAMA_ARG_KV_UNIFIED").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_PERPLEXITY}));
  add_opt(common_arg(
  {"--context-shift"},
- string_format("enables context shift on infinite text generation (default: %s)", params.ctx_shift ? "enabled" : "disabled"),
- [](common_params & params) {
- params.ctx_shift = true;
+ {"--no-context-shift"},
+ string_format("whether to use context shift on infinite text generation (default: %s)", params.ctx_shift ? "enabled" : "disabled"),
+ [](common_params & params, bool value) {
+ params.ctx_shift = value;
  }
- ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY}).set_env("LLAMA_ARG_CONTEXT_SHIFT"));
+ ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY}).set_env("LLAMA_ARG_CONTEXT_SHIFT"));
  add_opt(common_arg(
  {"--chunks"}, "N",
  string_format("max number of chunks to process (default: %d, -1 = all)", params.n_chunks),
@@ -1050,15 +1196,24 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  [](common_params & params, const std::string & value) {
  params.system_prompt = value;
  }
- ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_DIFFUSION}));
+ ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_DIFFUSION, LLAMA_EXAMPLE_MTMD}));
  add_opt(common_arg(
+ {"--perf"},
  {"--no-perf"},
- string_format("disable internal libllama performance timings (default: %s)", params.no_perf ? "true" : "false"),
- [](common_params & params) {
- params.no_perf = true;
- params.sampling.no_perf = true;
+ string_format("whether to enable internal libllama performance timings (default: %s)", params.no_perf ? "true" : "false"),
+ [](common_params & params, bool value) {
+ params.no_perf = !value;
+ params.sampling.no_perf = !value;
  }
- ).set_env("LLAMA_ARG_NO_PERF"));
+ ).set_env("LLAMA_ARG_PERF"));
+ add_opt(common_arg(
+ {"--show-timings"},
+ {"--no-show-timings"},
+ string_format("whether to show timing information after each response (default: %s)", params.show_timings ? "true" : "false"),
+ [](common_params & params, bool value) {
+ params.show_timings = value;
+ }
+ ).set_examples({LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_SHOW_TIMINGS"));
  add_opt(common_arg(
  {"-f", "--file"}, "FNAME",
  "a file containing the prompt (default: none)",
@@ -1080,16 +1235,18 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  params.system_prompt.pop_back();
  }
  }
- ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_DIFFUSION}));
+ ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_DIFFUSION}));
  add_opt(common_arg(
  {"--in-file"}, "FNAME",
- "an input file (repeat to specify multiple files)",
+ "an input file (use comma-separated values to specify multiple files)",
  [](common_params & params, const std::string & value) {
- std::ifstream file(value);
- if (!file) {
- throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str()));
+ for (const auto & item : string_split<std::string>(value, ',')) {
+ std::ifstream file(item);
+ if (!file) {
+ throw std::runtime_error(string_format("error: failed to open file '%s'\n", item.c_str()));
+ }
+ params.in_files.push_back(item);
  }
- params.in_files.push_back(value);
  }
  ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
  add_opt(common_arg(
@@ -1110,16 +1267,10 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  ).set_excludes({LLAMA_EXAMPLE_SERVER}));
  add_opt(common_arg(
  {"-e", "--escape"},
- string_format("process escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\) (default: %s)", params.escape ? "true" : "false"),
- [](common_params & params) {
- params.escape = true;
- }
- ));
- add_opt(common_arg(
  {"--no-escape"},
- "do not process escape sequences",
- [](common_params & params) {
- params.escape = false;
+ string_format("whether to process escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\) (default: %s)", params.escape ? "true" : "false"),
+ [](common_params & params, bool value) {
+ params.escape = value;
  }
  ));
  add_opt(common_arg(
@@ -1128,59 +1279,53 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  [](common_params & params, int value) {
  params.n_print = value;
  }
- ).set_examples({LLAMA_EXAMPLE_MAIN}));
+ ).set_examples({LLAMA_EXAMPLE_COMPLETION}));
  add_opt(common_arg(
  {"--prompt-cache"}, "FNAME",
  "file to cache prompt state for faster startup (default: none)",
  [](common_params & params, const std::string & value) {
  params.path_prompt_cache = value;
  }
- ).set_examples({LLAMA_EXAMPLE_MAIN}));
+ ).set_examples({LLAMA_EXAMPLE_COMPLETION}));
  add_opt(common_arg(
  {"--prompt-cache-all"},
  "if specified, saves user input and generations to cache as well\n",
  [](common_params & params) {
  params.prompt_cache_all = true;
  }
- ).set_examples({LLAMA_EXAMPLE_MAIN}));
+ ).set_examples({LLAMA_EXAMPLE_COMPLETION}));
  add_opt(common_arg(
  {"--prompt-cache-ro"},
  "if specified, uses the prompt cache but does not update it",
  [](common_params & params) {
  params.prompt_cache_ro = true;
  }
- ).set_examples({LLAMA_EXAMPLE_MAIN}));
+ ).set_examples({LLAMA_EXAMPLE_COMPLETION}));
  add_opt(common_arg(
  {"-r", "--reverse-prompt"}, "PROMPT",
  "halt generation at PROMPT, return control in interactive mode\n",
  [](common_params & params, const std::string & value) {
  params.antiprompt.emplace_back(value);
  }
- ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}));
+ ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER}));
  add_opt(common_arg(
  {"-sp", "--special"},
  string_format("special tokens output enabled (default: %s)", params.special ? "true" : "false"),
  [](common_params & params) {
  params.special = true;
  }
- ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}));
+ ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER}));
  add_opt(common_arg(
  {"-cnv", "--conversation"},
- "run in conversation mode:\n"
+ {"-no-cnv", "--no-conversation"},
+ "whether to run in conversation mode:\n"
  "- does not print special tokens and suffix/prefix\n"
  "- interactive mode is also enabled\n"
  "(default: auto enabled if chat template is available)",
- [](common_params & params) {
- params.conversation_mode = COMMON_CONVERSATION_MODE_ENABLED;
+ [](common_params & params, bool value) {
+ params.conversation_mode = value ? COMMON_CONVERSATION_MODE_ENABLED : COMMON_CONVERSATION_MODE_DISABLED;
  }
- ).set_examples({LLAMA_EXAMPLE_MAIN}));
- add_opt(common_arg(
- {"-no-cnv", "--no-conversation"},
- "force disable conversation mode (default: false)",
- [](common_params & params) {
- params.conversation_mode = COMMON_CONVERSATION_MODE_DISABLED;
- }
- ).set_examples({LLAMA_EXAMPLE_MAIN}));
+ ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}));
  add_opt(common_arg(
  {"-st", "--single-turn"},
  "run conversation for a single turn only, then exit when done\n"
@@ -1189,28 +1334,28 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  [](common_params & params) {
  params.single_turn = true;
  }
- ).set_examples({LLAMA_EXAMPLE_MAIN}));
+ ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}));
  add_opt(common_arg(
  {"-i", "--interactive"},
  string_format("run in interactive mode (default: %s)", params.interactive ? "true" : "false"),
  [](common_params & params) {
  params.interactive = true;
  }
- ).set_examples({LLAMA_EXAMPLE_MAIN}));
+ ).set_examples({LLAMA_EXAMPLE_COMPLETION}));
  add_opt(common_arg(
  {"-if", "--interactive-first"},
  string_format("run in interactive mode and wait for input right away (default: %s)", params.interactive_first ? "true" : "false"),
  [](common_params & params) {
  params.interactive_first = true;
  }
- ).set_examples({LLAMA_EXAMPLE_MAIN}));
+ ).set_examples({LLAMA_EXAMPLE_COMPLETION}));
  add_opt(common_arg(
  {"-mli", "--multiline-input"},
  "allows you to write or paste multiple lines without ending each in '\\'",
  [](common_params & params) {
  params.multiline_input = true;
  }
- ).set_examples({LLAMA_EXAMPLE_MAIN}));
+ ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}));
  add_opt(common_arg(
  {"--in-prefix-bos"},
  "prefix BOS to user inputs, preceding the `--in-prefix` string",
@@ -1218,7 +1363,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  params.input_prefix_bos = true;
  params.enable_chat_template = false;
  }
- ).set_examples({LLAMA_EXAMPLE_MAIN}));
+ ).set_examples({LLAMA_EXAMPLE_COMPLETION}));
  add_opt(common_arg(
  {"--in-prefix"}, "STRING",
  "string to prefix user inputs with (default: empty)",
@@ -1226,7 +1371,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  params.input_prefix = value;
  params.enable_chat_template = false;
  }
- ).set_examples({LLAMA_EXAMPLE_MAIN}));
+ ).set_examples({LLAMA_EXAMPLE_COMPLETION}));
  add_opt(common_arg(
  {"--in-suffix"}, "STRING",
  "string to suffix after user inputs with (default: empty)",
@@ -1234,14 +1379,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  params.input_suffix = value;
  params.enable_chat_template = false;
  }
- ).set_examples({LLAMA_EXAMPLE_MAIN}));
+ ).set_examples({LLAMA_EXAMPLE_COMPLETION}));
  add_opt(common_arg(
+ {"--warmup"},
  {"--no-warmup"},
- "skip warming up the model with an empty run",
- [](common_params & params) {
- params.warmup = false;
+ string_format("whether to perform warmup with an empty run (default: %s)", params.warmup ? "enabled" : "disabled"),
+ [](common_params & params, bool value) {
+ params.warmup = value;
  }
- ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MTMD, LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_PERPLEXITY}));
+ ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MTMD, LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_PERPLEXITY}));
  add_opt(common_arg(
  {"--spm-infill"},
  string_format(
@@ -1298,7 +1444,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  params.sampling.top_k = value;
  params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_TOP_K;
  }
- ).set_sparam());
+ ).set_sparam().set_env("LLAMA_ARG_TOP_K"));
  add_opt(common_arg(
  {"--top-p"}, "N",
  string_format("top-p sampling (default: %.1f, 1.0 = disabled)", (double)params.sampling.top_p),
@@ -1632,28 +1778,30 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  [](common_params & params, int value) {
  params.grp_attn_n = value;
  }
- ).set_env("LLAMA_ARG_GRP_ATTN_N").set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_PASSKEY}));
+ ).set_env("LLAMA_ARG_GRP_ATTN_N").set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_PASSKEY}));
  add_opt(common_arg(
  {"-gaw", "--grp-attn-w"}, "N",
  string_format("group-attention width (default: %d)", params.grp_attn_w),
  [](common_params & params, int value) {
  params.grp_attn_w = value;
  }
- ).set_env("LLAMA_ARG_GRP_ATTN_W").set_examples({LLAMA_EXAMPLE_MAIN}));
+ ).set_env("LLAMA_ARG_GRP_ATTN_W").set_examples({LLAMA_EXAMPLE_COMPLETION}));
  add_opt(common_arg(
+ {"-kvo", "--kv-offload"},
  {"-nkvo", "--no-kv-offload"},
- "disable KV offload",
- [](common_params & params) {
- params.no_kv_offload = true;
+ string_format("whether to enable KV cache offloading (default: %s)", params.no_kv_offload ? "disabled" : "enabled"),
+ [](common_params & params, bool value) {
+ params.no_kv_offload = !value;
  }
- ).set_env("LLAMA_ARG_NO_KV_OFFLOAD"));
+ ).set_env("LLAMA_ARG_KV_OFFLOAD"));
  add_opt(common_arg(
+ {"--repack"},
  {"-nr", "--no-repack"},
- "disable weight repacking",
- [](common_params & params) {
- params.no_extra_bufts = true;
+ string_format("whether to enable weight repacking (default: %s)", params.no_extra_bufts ? "disabled" : "enabled"),
+ [](common_params & params, bool value) {
+ params.no_extra_bufts = !value;
  }
- ).set_env("LLAMA_ARG_NO_REPACK"));
+ ).set_env("LLAMA_ARG_REPACK"));
  add_opt(common_arg(
  {"--no-host"},
  "bypass host buffer allowing extra buffers to be used",
@@ -1766,13 +1914,27 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  LOG_WRN("DEPRECATED: --defrag-thold is deprecated and no longer necessary to specify\n");
  }
  ).set_env("LLAMA_ARG_DEFRAG_THOLD"));
- add_opt(common_arg(
- {"-np", "--parallel"}, "N",
- string_format("number of parallel sequences to decode (default: %d)", params.n_parallel),
- [](common_params & params, int value) {
- params.n_parallel = value;
- }
- ).set_env("LLAMA_ARG_N_PARALLEL"));
+ if (ex == LLAMA_EXAMPLE_SERVER) {
+ // this is to make sure this option appears in the server-specific section of the help message
+ add_opt(common_arg(
+ {"-np", "--parallel"}, "N",
+ string_format("number of server slots (default: %d, -1 = auto)", params.n_parallel),
+ [](common_params & params, int value) {
+ if (value == 0) {
+ throw std::invalid_argument("error: invalid value for n_parallel\n");
+ }
+ params.n_parallel = value;
+ }
+ ).set_env("LLAMA_ARG_N_PARALLEL").set_examples({LLAMA_EXAMPLE_SERVER}));
+ } else {
+ add_opt(common_arg(
+ {"-np", "--parallel"}, "N",
+ string_format("number of parallel sequences to decode (default: %d)", params.n_parallel),
+ [](common_params & params, int value) {
+ params.n_parallel = value;
+ }
+ ).set_env("LLAMA_ARG_N_PARALLEL"));
+ }
  add_opt(common_arg(
  {"-ns", "--sequences"}, "N",
  string_format("number of sequences to decode (default: %d)", params.n_sequences),
@@ -1782,20 +1944,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  ).set_examples({LLAMA_EXAMPLE_PARALLEL}));
  add_opt(common_arg(
  {"-cb", "--cont-batching"},
- string_format("enable continuous batching (a.k.a dynamic batching) (default: %s)", params.cont_batching ? "enabled" : "disabled"),
- [](common_params & params) {
- params.cont_batching = true;
- }
- ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CONT_BATCHING"));
- add_opt(common_arg(
  {"-nocb", "--no-cont-batching"},
- "disable continuous batching",
- [](common_params & params) {
- params.cont_batching = false;
+ string_format("whether to enable continuous batching (a.k.a dynamic batching) (default: %s)", params.cont_batching ? "enabled" : "disabled"),
+ [](common_params & params, bool value) {
+ params.cont_batching = value;
  }
- ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_CONT_BATCHING"));
+ ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CONT_BATCHING"));
  add_opt(common_arg(
- {"--mmproj"}, "FILE",
+ {"-mm", "--mmproj"}, "FILE",
  "path to a multimodal projector file. see tools/mtmd/README.md\n"
  "note: if -hf is used, this argument can be omitted",
  [](common_params & params, const std::string & value) {
@@ -1803,33 +1959,37 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  }
  ).set_examples(mmproj_examples).set_env("LLAMA_ARG_MMPROJ"));
  add_opt(common_arg(
- {"--mmproj-url"}, "URL",
+ {"-mmu", "--mmproj-url"}, "URL",
  "URL to a multimodal projector file. see tools/mtmd/README.md",
  [](common_params & params, const std::string & value) {
  params.mmproj.url = value;
  }
  ).set_examples(mmproj_examples).set_env("LLAMA_ARG_MMPROJ_URL"));
  add_opt(common_arg(
- {"--no-mmproj"},
- "explicitly disable multimodal projector, useful when using -hf",
- [](common_params & params) {
- params.no_mmproj = true;
+ {"--mmproj-auto"},
+ {"--no-mmproj", "--no-mmproj-auto"},
+ string_format("whether to use multimodal projector file (if available), useful when using -hf (default: %s)", params.no_mmproj ? "disabled" : "enabled"),
+ [](common_params & params, bool value) {
+ params.no_mmproj = !value;
  }
- ).set_examples(mmproj_examples).set_env("LLAMA_ARG_NO_MMPROJ"));
+ ).set_examples(mmproj_examples).set_env("LLAMA_ARG_MMPROJ_AUTO"));
  add_opt(common_arg(
+ {"--mmproj-offload"},
  {"--no-mmproj-offload"},
- "do not offload multimodal projector to GPU",
- [](common_params & params) {
- params.mmproj_use_gpu = false;
+ string_format("whether to enable GPU offloading for multimodal projector (default: %s)", params.mmproj_use_gpu ? "enabled" : "disabled"),
+ [](common_params & params, bool value) {
+ params.mmproj_use_gpu = value;
  }
- ).set_examples(mmproj_examples).set_env("LLAMA_ARG_NO_MMPROJ_OFFLOAD"));
+ ).set_examples(mmproj_examples).set_env("LLAMA_ARG_MMPROJ_OFFLOAD"));
  add_opt(common_arg(
  {"--image", "--audio"}, "FILE",
- "path to an image or audio file. use with multimodal models, can be repeated if you have multiple files\n",
+ "path to an image or audio file. use with multimodal models, use comma-separated values for multiple files\n",
  [](common_params & params, const std::string & value) {
- params.image.emplace_back(value);
+ for (const auto & item : string_split<std::string>(value, ',')) {
+ params.image.emplace_back(item);
+ }
  }
- ).set_examples({LLAMA_EXAMPLE_MTMD}));
+ ).set_examples({LLAMA_EXAMPLE_MTMD, LLAMA_EXAMPLE_CLI}));
  add_opt(common_arg(
  {"--image-min-tokens"}, "N",
  "minimum number of tokens each image can take, only used by vision models with dynamic resolution (default: read from model)",
@@ -1862,12 +2022,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  }
  ).set_env("LLAMA_ARG_MLOCK"));
  add_opt(common_arg(
+ {"--mmap"},
  {"--no-mmap"},
- "do not memory-map model (slower load but may reduce pageouts if not using mlock)",
- [](common_params & params) {
- params.use_mmap = false;
+ string_format("whether to memory-map model (if disabled, slower load but may reduce pageouts if not using mlock) (default: %s)", params.use_mmap ? "enabled" : "disabled"),
+ [](common_params & params, bool value) {
+ params.use_mmap = value;
  }
- ).set_env("LLAMA_ARG_NO_MMAP"));
+ ).set_env("LLAMA_ARG_MMAP"));
  add_opt(common_arg(
  {"--numa"}, "TYPE",
  "attempt optimizations that help on some NUMA systems\n"
@@ -1922,7 +2083,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  "override tensor buffer type for draft model", [](common_params & params, const std::string & value) {
  parse_tensor_buffer_overrides(value, params.speculative.tensor_buft_overrides);
  }
- ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
+ ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
  add_opt(common_arg(
  {"--cpu-moe", "-cmoe"},
  "keep all Mixture of Experts (MoE) weights in the CPU",
@@ -1951,7 +2112,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  [](common_params & params) {
  params.speculative.tensor_buft_overrides.push_back(llm_ffn_exps_cpu_override());
  }
- ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CPU_MOE_DRAFT"));
+ ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_CPU_MOE_DRAFT"));
  add_opt(common_arg(
  {"--n-cpu-moe-draft", "-ncmoed"}, "N",
  "keep the Mixture of Experts (MoE) weights of the first N layers in the CPU for the draft model",
@@ -1965,7 +2126,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  params.speculative.tensor_buft_overrides.push_back({buft_overrides_draft.back().c_str(), ggml_backend_cpu_buffer_type()});
  }
  }
- ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_N_CPU_MOE_DRAFT"));
+ ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_N_CPU_MOE_DRAFT"));
  add_opt(common_arg(
  {"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N",
  string_format("max. number of layers to store in VRAM (default: %d)", params.n_gpu_layers),
@@ -2037,6 +2198,34 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  }
  }
  ).set_env("LLAMA_ARG_MAIN_GPU"));
+ add_opt(common_arg(
+ { "-fit", "--fit" }, "[on|off]",
+ string_format("whether to adjust unset arguments to fit in device memory ('on' or 'off', default: '%s')", params.fit_params ? "on" : "off"),
+ [](common_params & params, const std::string & value) {
+ if (is_truthy(value)) {
+ params.fit_params = true;
+ } else if (is_falsey(value)) {
+ params.fit_params = false;
+ } else {
+ throw std::runtime_error(
+ string_format("error: unkown value for --fit: '%s'\n", value.c_str()));
+ }
+ }
+ ).set_env("LLAMA_ARG_FIT"));
+ add_opt(common_arg(
+ { "-fitt", "--fit-target" }, "MiB",
+ string_format("target margin per device for --fit option, default: %zu", params.fit_params_target/(1024*1024)),
+ [](common_params & params, int value) {
+ params.fit_params_target = value * size_t(1024*1024);
+ }
+ ).set_env("LLAMA_ARG_FIT_TARGET"));
+ add_opt(common_arg(
+ { "-fitc", "--fit-ctx" }, "N",
+ string_format("minimum ctx size that can be set by --fit option, default: %" PRIu32, params.fit_params_min_ctx),
+ [](common_params & params, int value) {
+ params.fit_params_min_ctx = value;
+ }
+ ).set_env("LLAMA_ARG_FIT_CTX"));
  add_opt(common_arg(
  {"--check-tensors"},
  string_format("check model tensor data for invalid values (default: %s)", params.check_tensors ? "true" : "false"),
@@ -2045,51 +2234,96 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  }
  ));
  add_opt(common_arg(
- {"--override-kv"}, "KEY=TYPE:VALUE",
- "advanced option to override model metadata by key. may be specified multiple times.\n"
- "types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false",
+ {"--override-kv"}, "KEY=TYPE:VALUE,...",
+ "advanced option to override model metadata by key. to specify multiple overrides, either use comma-separated or repeat this argument.\n"
+ "types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false,tokenizer.ggml.add_eos_token=bool:false",
  [](common_params & params, const std::string & value) {
- if (!string_parse_kv_override(value.c_str(), params.kv_overrides)) {
- throw std::runtime_error(string_format("error: Invalid type for KV override: %s\n", value.c_str()));
+ std::vector<std::string> kv_overrides;
+
+ std::string current;
+ bool escaping = false;
+
+ for (const char c : value) {
+ if (escaping) {
+ current.push_back(c);
+ escaping = false;
+ } else if (c == '\\') {
+ escaping = true;
+ } else if (c == ',') {
+ kv_overrides.push_back(current);
+ current.clear();
+ } else {
+ current.push_back(c);
+ }
+ }
+
+ if (escaping) {
+ current.push_back('\\');
+ }
+
+ kv_overrides.push_back(current);
+
+ for (const auto & kv_override : kv_overrides) {
+ if (!string_parse_kv_override(kv_override.c_str(), params.kv_overrides)) {
+ throw std::runtime_error(string_format("error: Invalid type for KV override: %s\n", kv_override.c_str()));
+ }
  }
  }
  ));
  add_opt(common_arg(
+ {"--op-offload"},
  {"--no-op-offload"},
- string_format("disable offloading host tensor operations to device (default: %s)", params.no_op_offload ? "true" : "false"),
- [](common_params & params) {
- params.no_op_offload = true;
+ string_format("whether to offload host tensor operations to device (default: %s)", params.no_op_offload ? "false" : "true"),
+ [](common_params & params, bool value) {
+ params.no_op_offload = !value;
  }
  ));
  add_opt(common_arg(
  {"--lora"}, "FNAME",
- "path to LoRA adapter (can be repeated to use multiple adapters)",
+ "path to LoRA adapter (use comma-separated values to load multiple adapters)",
  [](common_params & params, const std::string & value) {
- params.lora_adapters.push_back({ std::string(value), 1.0, "", "", nullptr });
+ for (const auto & item : string_split<std::string>(value, ',')) {
+ params.lora_adapters.push_back({ item, 1.0, "", "", nullptr });
+ }
  }
  // we define this arg on both COMMON and EXPORT_LORA, so when showing help message of export-lora, it will be categorized as "example-specific" arg
  ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}));
  add_opt(common_arg(
- {"--lora-scaled"}, "FNAME", "SCALE",
- "path to LoRA adapter with user defined scaling (can be repeated to use multiple adapters)",
- [](common_params & params, const std::string & fname, const std::string & scale) {
- params.lora_adapters.push_back({ fname, std::stof(scale), "", "", nullptr });
+ {"--lora-scaled"}, "FNAME:SCALE,...",
+ "path to LoRA adapter with user defined scaling (format: FNAME:SCALE,...)\n"
+ "note: use comma-separated values",
+ [](common_params & params, const std::string & value) {
+ for (const auto & item : string_split<std::string>(value, ',')) {
+ auto parts = string_split<std::string>(item, ':');
+ if (parts.size() != 2) {
+ throw std::invalid_argument("lora-scaled format: FNAME:SCALE");
+ }
+ params.lora_adapters.push_back({ parts[0], std::stof(parts[1]), "", "", nullptr });
+ }
  }
  // we define this arg on both COMMON and EXPORT_LORA, so when showing help message of export-lora, it will be categorized as "example-specific" arg
  ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}));
  add_opt(common_arg(
  {"--control-vector"}, "FNAME",
- "add a control vector\nnote: this argument can be repeated to add multiple control vectors",
+ "add a control vector\nnote: use comma-separated values to add multiple control vectors",
  [](common_params & params, const std::string & value) {
- params.control_vectors.push_back({ 1.0f, value, });
+ for (const auto & item : string_split<std::string>(value, ',')) {
+ params.control_vectors.push_back({ 1.0f, item, });
+ }
  }
  ));
  add_opt(common_arg(
- {"--control-vector-scaled"}, "FNAME", "SCALE",
+ {"--control-vector-scaled"}, "FNAME:SCALE,...",
  "add a control vector with user defined scaling SCALE\n"
- "note: this argument can be repeated to add multiple scaled control vectors",
- [](common_params & params, const std::string & fname, const std::string & scale) {
- params.control_vectors.push_back({ std::stof(scale), fname });
+ "note: use comma-separated values (format: FNAME:SCALE,...)",
+ [](common_params & params, const std::string & value) {
+ for (const auto & item : string_split<std::string>(value, ',')) {
+ auto parts = string_split<std::string>(item, ':');
+ if (parts.size() != 2) {
+ throw std::invalid_argument("control-vector-scaled format: FNAME:SCALE");
+ }
+ params.control_vectors.push_back({ std::stof(parts[1]), parts[0] });
+ }
  }
  ));
  add_opt(common_arg(
@@ -2179,13 +2413,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  ).set_env("HF_TOKEN"));
  add_opt(common_arg(
  {"--context-file"}, "FNAME",
- "file to load context from (repeat to specify multiple files)",
+ "file to load context from (use comma-separated values to specify multiple files)",
  [](common_params & params, const std::string & value) {
- std::ifstream file(value, std::ios::binary);
- if (!file) {
- throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str()));
+ for (const auto & item : string_split<std::string>(value, ',')) {
+ std::ifstream file(item, std::ios::binary);
+ if (!file) {
+ throw std::runtime_error(string_format("error: failed to open file '%s'\n", item.c_str()));
+ }
+ params.context_files.push_back(item);
  }
- params.context_files.push_back(value);
  }
  ).set_examples({LLAMA_EXAMPLE_RETRIEVAL}));
  add_opt(common_arg(
@@ -2254,10 +2490,11 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  }
  ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
  add_opt(common_arg(
+ {"--ppl"},
  {"--no-ppl"},
- string_format("do not compute perplexity (default: %s)", params.compute_ppl ? "true" : "false"),
- [](common_params & params) {
- params.compute_ppl = false;
+ string_format("whether to compute perplexity (default: %s)", params.compute_ppl ? "true" : "false"),
+ [](common_params & params, bool value) {
+ params.compute_ppl = value;
  }
  ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
  add_opt(common_arg(
@@ -2376,12 +2613,27 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  }
  ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_API_PREFIX"));
  add_opt(common_arg(
+ {"--webui-config"}, "JSON",
+ "JSON that provides default WebUI settings (overrides WebUI defaults)",
+ [](common_params & params, const std::string & value) {
+ params.webui_config_json = value;
+ }
+ ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_WEBUI_CONFIG"));
+ add_opt(common_arg(
+ {"--webui-config-file"}, "PATH",
+ "JSON file that provides default WebUI settings (overrides WebUI defaults)",
+ [](common_params & params, const std::string & value) {
+ params.webui_config_json = read_file(value);
+ }
+ ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_WEBUI_CONFIG_FILE"));
+ add_opt(common_arg(
+ {"--webui"},
  {"--no-webui"},
- string_format("Disable the Web UI (default: %s)", params.webui ? "enabled" : "disabled"),
- [](common_params & params) {
- params.webui = false;
+ string_format("whether to enable the Web UI (default: %s)", params.webui ? "enabled" : "disabled"),
+ [](common_params & params, bool value) {
+ params.webui = value;
  }
- ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_WEBUI"));
+ ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_WEBUI"));
  add_opt(common_arg(
  {"--embedding", "--embeddings"},
  string_format("restrict to only support embedding use case; use only with dedicated embedding models (default: %s)", params.embedding ? "enabled" : "disabled"),
@@ -2444,7 +2696,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  params.default_template_kwargs[item.key()] = item.value().dump();
  }
  }
- ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_CHAT_TEMPLATE_KWARGS"));
+ ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_CHAT_TEMPLATE_KWARGS"));
  add_opt(common_arg(
  {"-to", "--timeout"}, "N",
  string_format("server read/write timeout in seconds (default: %d)", params.timeout_read),
@@ -2486,18 +2738,12 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_PROPS"));
  add_opt(common_arg(
  {"--slots"},
- string_format("enable slots monitoring endpoint (default: %s)", params.endpoint_slots ? "enabled" : "disabled"),
- [](common_params & params) {
- params.endpoint_slots = true;
- }
- ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_SLOTS"));
- add_opt(common_arg(
  {"--no-slots"},
- "disables slots monitoring endpoint",
- [](common_params & params) {
- params.endpoint_slots = false;
+ string_format("expose slots monitoring endpoint (default: %s)", params.endpoint_slots ? "enabled" : "disabled"),
+ [](common_params & params, bool value) {
+ params.endpoint_slots = value;
  }
- ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_ENDPOINT_SLOTS"));
+ ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_SLOTS"));
  add_opt(common_arg(
  {"--slot-save-path"}, "PATH",
  "path to save slot kv cache (default: disabled)",
@@ -2533,6 +2779,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  params.models_dir = value;
  }
  ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_DIR"));
+ add_opt(common_arg(
+ {"--models-preset"}, "PATH",
+ "path to INI file containing model presets for the router server (default: disabled)",
+ [](common_params & params, const std::string & value) {
+ params.models_preset = value;
+ }
+ ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_PRESET"));
  add_opt(common_arg(
  {"--models-max"}, "N",
  string_format("for router server, maximum number of models to load simultaneously (default: %d, 0 = unlimited)", params.models_max),
@@ -2541,26 +2794,21 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  }
  ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_MAX"));
  add_opt(common_arg(
+ {"--models-autoload"},
  {"--no-models-autoload"},
- "disables automatic loading of models (default: enabled)",
- [](common_params & params) {
- params.models_autoload = false;
+ string_format("for router server, whether to automatically load models (default: %s)", params.models_autoload ? "enabled" : "disabled"),
+ [](common_params & params, bool value) {
+ params.models_autoload = value;
  }
- ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_MODELS_AUTOLOAD"));
+ ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_AUTOLOAD"));
  add_opt(common_arg(
  {"--jinja"},
- string_format("use jinja template for chat (default: %s)\n", params.use_jinja ? "enabled" : "disabled"),
- [](common_params & params) {
- params.use_jinja = true;
- }
- ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_JINJA"));
- add_opt(common_arg(
  {"--no-jinja"},
- string_format("disable jinja template for chat (default: %s)\n", params.use_jinja ? "enabled" : "disabled"),
- [](common_params & params) {
- params.use_jinja = false;
+ string_format("whether to use jinja template engine for chat (default: %s)", params.use_jinja ? "enabled" : "disabled"),
+ [](common_params & params, bool value) {
+ params.use_jinja = value;
  }
- ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_NO_JINJA"));
+ ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_JINJA"));
  add_opt(common_arg(
  {"--reasoning-format"}, "FORMAT",
  "controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:\n"
@@ -2571,7 +2819,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  [](common_params & params, const std::string & value) {
  params.reasoning_format = common_reasoning_format_from_name(value);
  }
- ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_THINK"));
+ ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_THINK"));
  add_opt(common_arg(
  {"--reasoning-budget"}, "N",
  "controls the amount of thinking allowed; currently only one of: -1 for unrestricted thinking budget, or 0 to disable thinking (default: -1)",
@@ -2579,7 +2827,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  if (value != 0 && value != -1) { throw std::invalid_argument("invalid value"); }
  params.reasoning_budget = value;
  }
- ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_THINK_BUDGET"));
+ ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_THINK_BUDGET"));
  add_opt(common_arg(
  {"--chat-template"}, "JINJA_TEMPLATE",
  string_format(
@@ -2591,7 +2839,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  [](common_params & params, const std::string & value) {
  params.chat_template = value;
  }
- ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_CHAT_TEMPLATE"));
+ ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_CHAT_TEMPLATE"));
  add_opt(common_arg(
  {"--chat-template-file"}, "JINJA_TEMPLATE_FILE",
  string_format(
@@ -2603,17 +2851,18 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  [](common_params & params, const std::string & value) {
  params.chat_template = read_file(value);
  }
- ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CHAT_TEMPLATE_FILE"));
+ ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CHAT_TEMPLATE_FILE"));
  add_opt(common_arg(
+ {"--prefill-assistant"},
  {"--no-prefill-assistant"},
  string_format(
  "whether to prefill the assistant's response if the last message is an assistant message (default: prefill enabled)\n"
  "when this flag is set, if the last message is an assistant message then it will be treated as a full message and not prefilled\n"
  ),
- [](common_params & params) {
- params.prefill_assistant = false;
+ [](common_params & params, bool value) {
+ params.prefill_assistant = value;
  }
- ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_PREFILL_ASSISTANT"));
+ ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_PREFILL_ASSISTANT"));
  add_opt(common_arg(
  {"-sps", "--slot-prompt-similarity"}, "SIMILARITY",
  string_format("how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity),
@@ -2634,7 +2883,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  [](common_params & params) {
  params.simple_io = true;
  }
- ).set_examples({LLAMA_EXAMPLE_MAIN}));
+ ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}));
  add_opt(common_arg(
  {"--positive-file"}, "FNAME",
  string_format("positive prompts file, one prompt per line (default: '%s')", params.cvector_positive_file.c_str()),
@@ -2717,7 +2966,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  "Set verbosity level to infinity (i.e. log all messages, useful for debugging)",
  [](common_params & params) {
  params.verbosity = INT_MAX;
- common_log_set_verbosity_thold(INT_MAX);
  }
  ));
  add_opt(common_arg(
@@ -2738,7 +2986,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  "(default: %d)\n", params.verbosity),
  [](common_params & params, int value) {
  params.verbosity = value;
- common_log_set_verbosity_thold(value);
  }
  ).set_env("LLAMA_LOG_VERBOSITY"));
  add_opt(common_arg(
@@ -2871,14 +3118,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  [](common_params & params, int value) {
  params.speculative.n_max = value;
  }
- ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_MAX"));
+ ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_DRAFT_MAX"));
  add_opt(common_arg(
  {"--draft-min", "--draft-n-min"}, "N",
  string_format("minimum number of draft tokens to use for speculative decoding (default: %d)", params.speculative.n_min),
  [](common_params & params, int value) {
  params.speculative.n_min = value;
  }
- ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_MIN"));
+ ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_DRAFT_MIN"));
  add_opt(common_arg(
  {"--draft-p-split"}, "P",
  string_format("speculative decoding split probability (default: %.1f)", (double)params.speculative.p_split),
@@ -2892,14 +3139,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  [](common_params & params, const std::string & value) {
  params.speculative.p_min = std::stof(value);
  }
- ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_P_MIN"));
+ ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_DRAFT_P_MIN"));
  add_opt(common_arg(
  {"-cd", "--ctx-size-draft"}, "N",
  string_format("size of the prompt context for the draft model (default: %d, 0 = loaded from model)", params.speculative.n_ctx),
  [](common_params & params, int value) {
  params.speculative.n_ctx = value;
  }
- ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CTX_SIZE_DRAFT"));
+ ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_CTX_SIZE_DRAFT"));
  add_opt(common_arg(
  {"-devd", "--device-draft"}, "<dev1,dev2,..>",
  "comma-separated list of devices to use for offloading the draft model (none = don't offload)\n"
@@ -2907,7 +3154,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  [](common_params & params, const std::string & value) {
  params.speculative.devices = parse_device_list(value);
  }
- ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
+ ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
  add_opt(common_arg(
  {"-ngld", "--gpu-layers-draft", "--n-gpu-layers-draft"}, "N",
  "number of layers to store in VRAM for the draft model",
@@ -2919,21 +3166,21 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  fprintf(stderr, "warning: consult docs/build.md for compilation instructions\n");
  }
  }
- ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_N_GPU_LAYERS_DRAFT"));
+ ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_N_GPU_LAYERS_DRAFT"));
  add_opt(common_arg(
  {"-md", "--model-draft"}, "FNAME",
  "draft model for speculative decoding (default: unused)",
  [](common_params & params, const std::string & value) {
  params.speculative.model.path = value;
  }
- ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODEL_DRAFT"));
+ ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_MODEL_DRAFT"));
  add_opt(common_arg(
  {"--spec-replace"}, "TARGET", "DRAFT",
  "translate the string in TARGET into DRAFT if the draft model and main model are not compatible",
  [](common_params & params, const std::string & tgt, const std::string & dft) {
  params.speculative.replacements.push_back({ tgt, dft });
  }
- ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
+ ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
  add_opt(common_arg(
  {"-ctkd", "--cache-type-k-draft"}, "TYPE",
  string_format(
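Reviewer note: the speculative-decoding options above (--draft-max, --draft-min, --draft-p-min, -cd/--ctx-size-draft, -devd, -ngld, -md/--model-draft, --spec-replace) are now also registered for LLAMA_EXAMPLE_CLI. The struct below is a reduced stand-in for the params.speculative fields these handlers assign, to show how the values relate; it is not the real common_params_speculative definition and the defaults are placeholders.

// Reduced stand-in for the speculative-decoding fields touched in the hunks above.
#include <string>
#include <utility>
#include <vector>

struct speculative_params_sketch {
    int n_max = 16;          // --draft-max: max draft tokens per step (placeholder default)
    int n_min = 0;           // --draft-min: min draft tokens to use (placeholder default)
    float p_min = 0.75f;     // --draft-p-min (placeholder default)
    int n_ctx = 0;           // -cd/--ctx-size-draft, 0 = taken from the draft model
    std::string model_path;  // -md/--model-draft
    std::vector<std::pair<std::string, std::string>> replacements; // --spec-replace TARGET DRAFT
};

int main() {
    speculative_params_sketch spec;
    spec.n_max = 12;                      // as if set by --draft-max 12
    spec.n_min = 2;                       // as if set by --draft-min 2
    spec.model_path = "draft-model.gguf"; // hypothetical path for -md
    spec.replacements.push_back({ "target-token", "draft-token" }); // hypothetical --spec-replace pair
    return spec.n_max >= spec.n_min ? 0 : 1;
}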
@@ -3197,7 +3444,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  params.use_jinja = true;
  //params.default_template_kwargs["reasoning_effort"] = "\"high\"";
  }
- ).set_examples({LLAMA_EXAMPLE_SERVER}));
+ ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));

  add_opt(common_arg(
  {"--gpt-oss-120b-default"},
@@ -3216,7 +3463,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  params.use_jinja = true;
  //params.default_template_kwargs["reasoning_effort"] = "\"high\"";
  }
- ).set_examples({LLAMA_EXAMPLE_SERVER}));
+ ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));

  add_opt(common_arg(
  {"--vision-gemma-4b-default"},
@@ -3227,7 +3474,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  params.n_ctx = 0;
  params.use_jinja = true;
  }
- ).set_examples({LLAMA_EXAMPLE_SERVER}));
+ ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));

  add_opt(common_arg(
  {"--vision-gemma-12b-default"},
@@ -3238,7 +3485,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  params.n_ctx = 0;
  params.use_jinja = true;
  }
- ).set_examples({LLAMA_EXAMPLE_SERVER}));
+ ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));

  return ctx_arg;
  }