@fugood/llama.node 1.4.7 → 1.4.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63)
  1. package/lib/binding.ts +8 -0
  2. package/package.json +15 -15
  3. package/scripts/llama.cpp.patch +22 -23
  4. package/src/LlamaContext.cpp +2 -2
  5. package/src/llama.cpp/common/CMakeLists.txt +2 -0
  6. package/src/llama.cpp/common/arg.cpp +364 -193
  7. package/src/llama.cpp/common/arg.h +43 -2
  8. package/src/llama.cpp/common/chat-peg-parser.cpp +16 -2
  9. package/src/llama.cpp/common/chat.cpp +140 -0
  10. package/src/llama.cpp/common/common.cpp +130 -67
  11. package/src/llama.cpp/common/common.h +40 -16
  12. package/src/llama.cpp/common/console.cpp +98 -18
  13. package/src/llama.cpp/common/console.h +30 -8
  14. package/src/llama.cpp/common/download.cpp +69 -25
  15. package/src/llama.cpp/common/json-schema-to-grammar.cpp +132 -3
  16. package/src/llama.cpp/common/json-schema-to-grammar.h +20 -0
  17. package/src/llama.cpp/common/log.cpp +5 -0
  18. package/src/llama.cpp/common/log.h +1 -0
  19. package/src/llama.cpp/common/peg-parser.cpp +1 -1
  20. package/src/llama.cpp/common/preset.cpp +206 -0
  21. package/src/llama.cpp/common/preset.h +32 -0
  22. package/src/llama.cpp/common/sampling.cpp +91 -92
  23. package/src/llama.cpp/common/sampling.h +11 -6
  24. package/src/llama.cpp/common/speculative.cpp +1 -1
  25. package/src/llama.cpp/ggml/CMakeLists.txt +4 -0
  26. package/src/llama.cpp/ggml/include/ggml-alloc.h +9 -0
  27. package/src/llama.cpp/ggml/include/ggml-backend.h +1 -0
  28. package/src/llama.cpp/ggml/include/ggml-cpu.h +1 -0
  29. package/src/llama.cpp/ggml/include/ggml.h +7 -8
  30. package/src/llama.cpp/ggml/src/CMakeLists.txt +3 -0
  31. package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +2 -0
  32. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +60 -39
  33. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +4 -0
  34. package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +2 -1
  35. package/src/llama.cpp/include/llama.h +18 -1
  36. package/src/llama.cpp/src/llama-arch.cpp +1890 -2248
  37. package/src/llama.cpp/src/llama-arch.h +9 -2
  38. package/src/llama.cpp/src/llama-batch.cpp +12 -2
  39. package/src/llama.cpp/src/llama-batch.h +4 -2
  40. package/src/llama.cpp/src/llama-context.cpp +93 -23
  41. package/src/llama.cpp/src/llama-context.h +8 -2
  42. package/src/llama.cpp/src/llama-graph.cpp +84 -16
  43. package/src/llama.cpp/src/llama-graph.h +17 -4
  44. package/src/llama.cpp/src/llama-hparams.cpp +6 -0
  45. package/src/llama.cpp/src/llama-hparams.h +5 -1
  46. package/src/llama.cpp/src/llama-impl.cpp +4 -0
  47. package/src/llama.cpp/src/llama-kv-cache.cpp +90 -42
  48. package/src/llama.cpp/src/llama-kv-cache.h +19 -2
  49. package/src/llama.cpp/src/llama-memory-hybrid.cpp +1 -1
  50. package/src/llama.cpp/src/llama-model-loader.cpp +2 -0
  51. package/src/llama.cpp/src/llama-model-loader.h +2 -0
  52. package/src/llama.cpp/src/llama-model.cpp +103 -44
  53. package/src/llama.cpp/src/llama-model.h +1 -0
  54. package/src/llama.cpp/src/llama-quant.cpp +1 -1
  55. package/src/llama.cpp/src/llama-vocab.cpp +2 -1
  56. package/src/llama.cpp/src/llama.cpp +675 -1
  57. package/src/llama.cpp/src/models/deepseek2.cpp +9 -5
  58. package/src/llama.cpp/src/models/glm4-moe.cpp +28 -11
  59. package/src/llama.cpp/src/models/glm4.cpp +27 -4
  60. package/src/llama.cpp/src/models/models.h +5 -5
  61. package/src/llama.cpp/src/models/nemotron-h.cpp +35 -6
  62. package/src/llama.cpp/src/models/qwen2.cpp +12 -3
  63. package/src/llama.cpp/src/models/qwen3next.cpp +81 -266
@@ -20,6 +20,7 @@
 #include <nlohmann/json.hpp>
 
 #include <algorithm>
+#include <cinttypes>
 #include <climits>
 #include <cstdarg>
 #include <fstream>
@@ -47,10 +48,12 @@
 #define LLAMA_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083
 
 using json = nlohmann::ordered_json;
+using namespace common_arg_utils;
 
 static std::initializer_list<enum llama_example> mmproj_examples = {
     LLAMA_EXAMPLE_MTMD,
     LLAMA_EXAMPLE_SERVER,
+    LLAMA_EXAMPLE_CLI,
 };
 
 static std::string read_file(const std::string & fname) {
@@ -63,6 +66,15 @@ static std::string read_file(const std::string & fname) {
     return content;
 }
 
+static const std::vector<common_arg> & get_common_arg_defs() {
+    static const std::vector<common_arg> options = [] {
+        common_params params;
+        auto ctx = common_params_parser_init(params, LLAMA_EXAMPLE_SERVER, nullptr);
+        return ctx.options;
+    }();
+    return options;
+}
+
 common_arg & common_arg::set_examples(std::initializer_list<enum llama_example> examples) {
     this->examples = examples;
     return *this;
@@ -94,6 +106,16 @@ bool common_arg::is_exclude(enum llama_example ex) {
 
 bool common_arg::get_value_from_env(std::string & output) const {
     if (env == nullptr) return false;
+    if (!args_neg.empty()) {
+        // for compatibility, we need to check LLAMA_ARG_NO_ env as well
+        std::string neg_env = env;
+        string_replace_all(neg_env, "LLAMA_ARG_", "LLAMA_ARG_NO_");
+        char * neg_value = std::getenv(neg_env.c_str());
+        if (neg_value) {
+            output = "0"; // falsey
+            return true;
+        }
+    }
     char * value = std::getenv(env);
     if (value) {
         output = value;
@@ -103,6 +125,14 @@ bool common_arg::get_value_from_env(std::string & output) const {
 }
 
 bool common_arg::has_value_from_env() const {
+    if (env != nullptr && !args_neg.empty()) {
+        // for compatibility, we need to check LLAMA_ARG_NO_ env as well
+        std::string neg_env = env;
+        string_replace_all(neg_env, "LLAMA_ARG_", "LLAMA_ARG_NO_");
+        if (std::getenv(neg_env.c_str())) {
+            return true;
+        }
+    }
     return env != nullptr && std::getenv(env);
 }
 
@@ -133,16 +163,17 @@ static std::vector<std::string> break_str_into_lines(std::string input, size_t m
     return result;
 }
 
-std::string common_arg::to_string() {
+std::string common_arg::to_string() const {
     // params for printing to console
     const static int n_leading_spaces = 40;
     const static int n_char_per_line_help = 70; // TODO: detect this based on current console
     std::string leading_spaces(n_leading_spaces, ' ');
 
     std::ostringstream ss;
-    for (const auto arg : args) {
-        if (arg == args.front()) {
-            if (args.size() == 1) {
+    auto all_args = get_args(); // also contains args_neg
+    for (const auto & arg : all_args) {
+        if (arg == all_args.front()) {
+            if (all_args.size() == 1) {
                 ss << arg;
             } else {
                 // first arg is usually abbreviation, we need padding to make it more beautiful
@@ -151,7 +182,7 @@ std::string common_arg::to_string() {
                 ss << tmp << spaces;
             }
         } else {
-            ss << arg << (arg != args.back() ? ", " : "");
+            ss << arg << (arg != all_args.back() ? ", " : "");
         }
     }
     if (value_hint) ss << " " << value_hint;
@@ -170,6 +201,31 @@ std::string common_arg::to_string() {
     return ss.str();
 }
 
+std::vector<std::string> common_arg::get_args() const {
+    std::vector<std::string> result;
+    for (const auto & arg : args) {
+        result.push_back(std::string(arg));
+    }
+    for (const auto & arg : args_neg) {
+        result.push_back(std::string(arg));
+    }
+    return result;
+}
+
+std::vector<std::string> common_arg::get_env() const {
+    std::vector<std::string> result;
+    if (env) {
+        result.push_back(std::string(env));
+    }
+    if (!args_neg.empty() && env) {
+        // for compatibility, we need to add LLAMA_ARG_NO_ variant
+        std::string neg_env = env;
+        string_replace_all(neg_env, "LLAMA_ARG_", "LLAMA_ARG_NO_");
+        result.push_back(neg_env);
+    }
+    return result;
+}
+
 //
 // utils
 //
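Note: the hunks above give boolean options an optional negated spelling (args_neg) and, for compatibility, also consult a LLAMA_ARG_NO_-prefixed environment variable. A minimal standalone sketch of that lookup order follows; LLAMA_ARG_FOO / LLAMA_ARG_NO_FOO are hypothetical names used only for illustration, not options defined in this file.

    #include <cstdlib>
    #include <iostream>
    #include <string>

    // Mirror of the rewrite done in get_value_from_env()/get_env(): "LLAMA_ARG_" -> "LLAMA_ARG_NO_".
    static std::string negated_env_name(std::string env) {
        const std::string from = "LLAMA_ARG_", to = "LLAMA_ARG_NO_";
        const auto pos = env.find(from);
        if (pos != std::string::npos) {
            env.replace(pos, from.size(), to);
        }
        return env;
    }

    int main() {
        // If LLAMA_ARG_NO_FOO is set, the option resolves to a falsey value ("0");
        // otherwise the positive LLAMA_ARG_FOO is consulted as before.
        const std::string env = "LLAMA_ARG_FOO"; // hypothetical option env
        std::string value;
        if (std::getenv(negated_env_name(env).c_str())) {
            value = "0"; // falsey, same as the fallback in get_value_from_env()
        } else if (const char * v = std::getenv(env.c_str())) {
            value = v;
        }
        std::cout << "resolved value: '" << value << "'\n";
    }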
@@ -305,6 +361,16 @@ static std::string get_all_kv_cache_types() {
     return msg.str();
 }
 
+static bool parse_bool_value(const std::string & value) {
+    if (is_truthy(value)) {
+        return true;
+    } else if (is_falsey(value)) {
+        return false;
+    } else {
+        throw std::invalid_argument("invalid boolean value");
+    }
+}
+
 //
 // CLI argument parsing functions
 //
@@ -312,10 +378,13 @@ static std::string get_all_kv_cache_types() {
 static bool common_params_parse_ex(int argc, char ** argv, common_params_context & ctx_arg) {
     common_params & params = ctx_arg.params;
 
-    std::unordered_map<std::string, common_arg *> arg_to_options;
+    std::unordered_map<std::string, std::pair<common_arg *, bool>> arg_to_options;
     for (auto & opt : ctx_arg.options) {
         for (const auto & arg : opt.args) {
-            arg_to_options[arg] = &opt;
+            arg_to_options[arg] = {&opt, /* is_positive */ true};
+        }
+        for (const auto & arg : opt.args_neg) {
+            arg_to_options[arg] = {&opt, /* is_positive */ false};
         }
     }
 
@@ -324,12 +393,15 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
         std::string value;
         if (opt.get_value_from_env(value)) {
             try {
-                if (opt.handler_void && (value == "1" || value == "true")) {
+                if (opt.handler_void && is_truthy(value)) {
                     opt.handler_void(params);
                 }
                 if (opt.handler_int) {
                     opt.handler_int(params, std::stoi(value));
                 }
+                if (opt.handler_bool) {
+                    opt.handler_bool(params, parse_bool_value(value));
+                }
                 if (opt.handler_string) {
                     opt.handler_string(params, value);
                     continue;
@@ -358,7 +430,9 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
         if (arg_to_options.find(arg) == arg_to_options.end()) {
             throw std::invalid_argument(string_format("error: invalid argument: %s", arg.c_str()));
         }
-        auto opt = *arg_to_options[arg];
+        auto & tmp = arg_to_options[arg];
+        auto opt = *tmp.first;
+        bool is_positive = tmp.second;
         if (opt.has_value_from_env()) {
             fprintf(stderr, "warn: %s environment variable is set, but will be overwritten by command line argument %s\n", opt.env, arg.c_str());
         }
@@ -367,6 +441,10 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
             opt.handler_void(params);
             continue;
         }
+        if (opt.handler_bool) {
+            opt.handler_bool(params, is_positive);
+            continue;
+        }
 
         // arg with single value
        check_arg(i);
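Note: with the dispatch added above, one option owns both spellings and the spelling that was typed determines the boolean handed to handler_bool. A rough sketch of the same pattern with a hypothetical --foo/--no-foo pair (not an option defined in this file):

    #include <functional>
    #include <iostream>
    #include <string>
    #include <unordered_map>
    #include <utility>

    struct toy_params { bool foo = true; };

    int main() {
        // handler_bool receives true for the positive spelling and false for the negative one,
        // mirroring the {&opt, is_positive} lookup added in common_params_parse_ex().
        std::function<void(toy_params &, bool)> handler_bool =
            [](toy_params & p, bool value) { p.foo = value; };

        std::unordered_map<std::string, std::pair<std::function<void(toy_params &, bool)>, bool>> arg_to_options = {
            {"--foo",    {handler_bool, /* is_positive */ true}},
            {"--no-foo", {handler_bool, /* is_positive */ false}},
        };

        toy_params params;
        const std::string arg = "--no-foo"; // as if typed on the command line
        const auto & [handler, is_positive] = arg_to_options.at(arg);
        handler(params, is_positive);
        std::cout << "params.foo = " << std::boolalpha << params.foo << "\n"; // prints: false
    }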
@@ -391,7 +469,7 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
             throw std::invalid_argument(string_format(
                 "error while handling argument \"%s\": %s\n\n"
                 "usage:\n%s\n\nto show complete usage, run with -h",
-                arg.c_str(), e.what(), arg_to_options[arg]->to_string().c_str()));
+                arg.c_str(), e.what(), opt.to_string().c_str()));
         }
     }
 
@@ -427,7 +505,7 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
 
     // model is required (except for server)
     // TODO @ngxson : maybe show a list of available models in CLI in this case
-    if (params.model.path.empty() && ctx_arg.ex != LLAMA_EXAMPLE_SERVER && !params.usage) {
+    if (params.model.path.empty() && ctx_arg.ex != LLAMA_EXAMPLE_SERVER && !params.usage && !params.completion) {
         throw std::invalid_argument("error: --model is required\n");
     }
 
@@ -452,7 +530,9 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
         params.kv_overrides.back().key[0] = 0;
     }
 
-    if (!params.tensor_buft_overrides.empty()) {
+    // pad tensor_buft_overrides for llama_params_fit:
+    const size_t ntbo = llama_max_tensor_buft_overrides();
+    while (params.tensor_buft_overrides.size() < ntbo) {
         params.tensor_buft_overrides.push_back({nullptr, nullptr});
     }
 
@@ -468,6 +548,8 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
         ));
     }
 
+    common_log_set_verbosity_thold(params.verbosity);
+
     return true;
 }
 
@@ -560,6 +642,7 @@ static void common_params_print_completion(common_params_context & ctx_arg) {
         "llama-batched-bench",
         "llama-bench",
         "llama-cli",
+        "llama-completion",
         "llama-convert-llama2c-to-ggml",
         "llama-cvector-generator",
         "llama-embedding",
@@ -644,6 +727,56 @@ static void add_rpc_devices(const std::string & servers) {
     }
 }
 
+bool common_params_to_map(int argc, char ** argv, llama_example ex, std::map<common_arg, std::string> & out_map) {
+    common_params dummy_params;
+    common_params_context ctx_arg = common_params_parser_init(dummy_params, ex, nullptr);
+
+    std::unordered_map<std::string, common_arg *> arg_to_options;
+    for (auto & opt : ctx_arg.options) {
+        for (const auto & arg : opt.args) {
+            arg_to_options[arg] = &opt;
+        }
+        for (const auto & arg : opt.args_neg) {
+            arg_to_options[arg] = &opt;
+        }
+    }
+
+    // TODO @ngxson : find a way to deduplicate this code
+
+    // handle command line arguments
+    auto check_arg = [&](int i) {
+        if (i+1 >= argc) {
+            throw std::invalid_argument("expected value for argument");
+        }
+    };
+
+    for (int i = 1; i < argc; i++) {
+        const std::string arg_prefix = "--";
+
+        std::string arg = argv[i];
+        if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
+            std::replace(arg.begin(), arg.end(), '_', '-');
+        }
+        if (arg_to_options.find(arg) == arg_to_options.end()) {
+            throw std::invalid_argument(string_format("error: invalid argument: %s", arg.c_str()));
+        }
+        auto opt = *arg_to_options[arg];
+        std::string val;
+        if (opt.value_hint != nullptr) {
+            // arg with single value
+            check_arg(i);
+            val = argv[++i];
+        }
+        if (opt.value_hint_2 != nullptr) {
+            // TODO: support arg with 2 values
+            throw std::invalid_argument("error: argument with 2 values is not yet supported\n");
+        }
+        out_map[opt] = val;
+    }
+
+    return true;
+}
+
 bool common_params_parse(int argc, char ** argv, common_params & params, llama_example ex, void(*print_usage)(int, char **)) {
     auto ctx_arg = common_params_parser_init(params, ex, print_usage);
     const common_params params_org = ctx_arg.params; // the example can modify the default params
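Note: a hedged sketch of how the new common_params_to_map() helper might be driven; the include paths and the printed field access below are assumptions, not taken from this package.

    #include <cstdio>
    #include <exception>
    #include <map>
    #include <string>

    #include "arg.h"    // assumed include for common_arg / common_params_to_map
    #include "common.h"

    int main(int argc, char ** argv) {
        std::map<common_arg, std::string> opts;
        try {
            // Re-uses the option table of the chosen example to pair each CLI token
            // with its definition, without filling in a common_params instance.
            common_params_to_map(argc, argv, LLAMA_EXAMPLE_SERVER, opts);
        } catch (const std::exception & e) {
            fprintf(stderr, "%s\n", e.what());
            return 1;
        }
        for (const auto & [opt, value] : opts) {
            printf("%s => '%s'\n", opt.args.front(), value.c_str());
        }
        return 0;
    }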
@@ -689,23 +822,30 @@ static std::string list_builtin_chat_templates() {
     return msg.str();
 }
 
-static bool is_truthy(const std::string & value) {
-    return value == "on" || value == "enabled" || value == "1";
+bool common_arg_utils::is_truthy(const std::string & value) {
+    return value == "on" || value == "enabled" || value == "true" || value == "1";
 }
 
-static bool is_falsey(const std::string & value) {
-    return value == "off" || value == "disabled" || value == "0";
+bool common_arg_utils::is_falsey(const std::string & value) {
+    return value == "off" || value == "disabled" || value == "false" || value == "0";
 }
 
-static bool is_autoy(const std::string & value) {
+bool common_arg_utils::is_autoy(const std::string & value) {
     return value == "auto" || value == "-1";
 }
 
 common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **)) {
-    // default values specific to example
-    // note: we place it here instead of inside server.cpp to allow llama-gen-docs to pick it up
-    if (ex == LLAMA_EXAMPLE_SERVER) {
-        params.use_jinja = true;
+    // per-example default params
+    // we define here to make sure it's included in llama-gen-docs
+    if (ex == LLAMA_EXAMPLE_COMPLETION) {
+        params.use_jinja = false; // disable jinja by default
+
+    } else if (ex == LLAMA_EXAMPLE_MTMD) {
+        params.use_jinja = false; // disable jinja by default
+        params.sampling.temp = 0.2; // lower temp by default for better quality
+
+    } else if (ex == LLAMA_EXAMPLE_SERVER) {
+        params.n_parallel = -1; // auto by default
     }
 
     params.use_color = tty_can_use_colors();
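Note: is_truthy()/is_falsey() are now exposed via common_arg_utils and additionally accept "true"/"false". A small self-contained illustration of the accepted spellings (re-implemented locally for demonstration, not the library code):

    #include <iostream>
    #include <string>

    // Same value sets as common_arg_utils::is_truthy / is_falsey after this change.
    static bool is_truthy(const std::string & v) {
        return v == "on" || v == "enabled" || v == "true" || v == "1";
    }
    static bool is_falsey(const std::string & v) {
        return v == "off" || v == "disabled" || v == "false" || v == "0";
    }

    int main() {
        for (const std::string v : {"on", "true", "1", "off", "false", "0", "maybe"}) {
            std::cout << v << ": truthy=" << is_truthy(v) << " falsey=" << is_falsey(v) << "\n";
        }
    }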
@@ -785,12 +925,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ));
     add_opt(common_arg(
+        {"--display-prompt"},
         {"--no-display-prompt"},
-        string_format("don't print prompt at generation (default: %s)", !params.display_prompt ? "true" : "false"),
-        [](common_params & params) {
-            params.display_prompt = false;
+        string_format("whether to print prompt at generation (default: %s)", params.display_prompt ? "true" : "false"),
+        [](common_params & params, bool value) {
+            params.display_prompt = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN}));
+    ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}));
     add_opt(common_arg(
         {"-co", "--color"}, "[on|off|auto]",
         "Colorize output to distinguish prompt and user input from generations ('on', 'off', or 'auto', default: 'auto')\n"
@@ -807,7 +948,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
                     string_format("error: unknown value for --color: '%s'\n", value.c_str()));
             }
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP}));
+    ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP}));
     add_opt(common_arg(
         {"-t", "--threads"}, "N",
         string_format("number of CPU threads to use during generation (default: %d)", params.cpuparams.n_threads),
@@ -940,7 +1081,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     add_opt(common_arg(
         {"-n", "--predict", "--n-predict"}, "N",
         string_format(
-            ex == LLAMA_EXAMPLE_MAIN
+            ex == LLAMA_EXAMPLE_COMPLETION
                 ? "number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)"
                 : "number of tokens to predict (default: %d, -1 = infinity)",
             params.n_predict),
@@ -979,42 +1120,35 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_env("LLAMA_ARG_SWA_FULL"));
     add_opt(common_arg(
         {"--ctx-checkpoints", "--swa-checkpoints"}, "N",
-        string_format("max number of context checkpoints to create per slot (default: %d)\n"
+        string_format("max number of context checkpoints to create per slot (default: %d)"
            "[(more info)](https://github.com/ggml-org/llama.cpp/pull/15293)", params.n_ctx_checkpoints),
        [](common_params & params, int value) {
            params.n_ctx_checkpoints = value;
        }
-    ).set_env("LLAMA_ARG_CTX_CHECKPOINTS").set_examples({LLAMA_EXAMPLE_SERVER}));
+    ).set_env("LLAMA_ARG_CTX_CHECKPOINTS").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
     add_opt(common_arg(
         {"--cache-ram", "-cram"}, "N",
-        string_format("set the maximum cache size in MiB (default: %d, -1 - no limit, 0 - disable)\n"
+        string_format("set the maximum cache size in MiB (default: %d, -1 - no limit, 0 - disable)"
            "[(more info)](https://github.com/ggml-org/llama.cpp/pull/16391)", params.cache_ram_mib),
        [](common_params & params, int value) {
            params.cache_ram_mib = value;
        }
-    ).set_env("LLAMA_ARG_CACHE_RAM").set_examples({LLAMA_EXAMPLE_SERVER}));
+    ).set_env("LLAMA_ARG_CACHE_RAM").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
     add_opt(common_arg(
         {"--kv-unified", "-kvu"},
-        string_format("use single unified KV buffer for the KV cache of all sequences (default: %s)\n"
-            "[(more info)](https://github.com/ggml-org/llama.cpp/pull/14363)", params.kv_unified ? "true" : "false"),
+        "use single unified KV buffer shared across all sequences (default: enabled if number of slots is auto)",
        [](common_params & params) {
            params.kv_unified = true;
        }
-    ).set_env("LLAMA_ARG_KV_UNIFIED"));
-    add_opt(common_arg(
-        {"--no-context-shift"},
-        string_format("disables context shift on infinite text generation (default: %s)", params.ctx_shift ? "disabled" : "enabled"),
-        [](common_params & params) {
-            params.ctx_shift = false;
-        }
-    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY}).set_env("LLAMA_ARG_NO_CONTEXT_SHIFT"));
+    ).set_env("LLAMA_ARG_KV_UNIFIED").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_PERPLEXITY}));
     add_opt(common_arg(
         {"--context-shift"},
-        string_format("enables context shift on infinite text generation (default: %s)", params.ctx_shift ? "enabled" : "disabled"),
-        [](common_params & params) {
-            params.ctx_shift = true;
+        {"--no-context-shift"},
+        string_format("whether to use context shift on infinite text generation (default: %s)", params.ctx_shift ? "enabled" : "disabled"),
+        [](common_params & params, bool value) {
+            params.ctx_shift = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY}).set_env("LLAMA_ARG_CONTEXT_SHIFT"));
+    ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY}).set_env("LLAMA_ARG_CONTEXT_SHIFT"));
     add_opt(common_arg(
         {"--chunks"}, "N",
         string_format("max number of chunks to process (default: %d, -1 = all)", params.n_chunks),
@@ -1050,15 +1184,24 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, const std::string & value) {
             params.system_prompt = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_DIFFUSION}));
+    ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_DIFFUSION}));
     add_opt(common_arg(
+        {"--perf"},
         {"--no-perf"},
-        string_format("disable internal libllama performance timings (default: %s)", params.no_perf ? "true" : "false"),
-        [](common_params & params) {
-            params.no_perf = true;
-            params.sampling.no_perf = true;
+        string_format("whether to enable internal libllama performance timings (default: %s)", params.no_perf ? "true" : "false"),
+        [](common_params & params, bool value) {
+            params.no_perf = !value;
+            params.sampling.no_perf = !value;
         }
-    ).set_env("LLAMA_ARG_NO_PERF"));
+    ).set_env("LLAMA_ARG_PERF"));
+    add_opt(common_arg(
+        {"--show-timings"},
+        {"--no-show-timings"},
+        string_format("whether to show timing information after each response (default: %s)", params.show_timings ? "true" : "false"),
+        [](common_params & params, bool value) {
+            params.show_timings = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_SHOW_TIMINGS"));
     add_opt(common_arg(
         {"-f", "--file"}, "FNAME",
         "a file containing the prompt (default: none)",
@@ -1080,7 +1223,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
                 params.system_prompt.pop_back();
             }
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_DIFFUSION}));
+    ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_DIFFUSION}));
     add_opt(common_arg(
         {"--in-file"}, "FNAME",
         "an input file (repeat to specify multiple files)",
@@ -1110,16 +1253,10 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_excludes({LLAMA_EXAMPLE_SERVER}));
     add_opt(common_arg(
         {"-e", "--escape"},
-        string_format("process escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\) (default: %s)", params.escape ? "true" : "false"),
-        [](common_params & params) {
-            params.escape = true;
-        }
-    ));
-    add_opt(common_arg(
         {"--no-escape"},
-        "do not process escape sequences",
-        [](common_params & params) {
-            params.escape = false;
+        string_format("whether to process escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\) (default: %s)", params.escape ? "true" : "false"),
+        [](common_params & params, bool value) {
+            params.escape = value;
         }
     ));
     add_opt(common_arg(
@@ -1128,59 +1265,53 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, int value) {
             params.n_print = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN}));
+    ).set_examples({LLAMA_EXAMPLE_COMPLETION}));
     add_opt(common_arg(
         {"--prompt-cache"}, "FNAME",
         "file to cache prompt state for faster startup (default: none)",
         [](common_params & params, const std::string & value) {
             params.path_prompt_cache = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN}));
+    ).set_examples({LLAMA_EXAMPLE_COMPLETION}));
     add_opt(common_arg(
         {"--prompt-cache-all"},
         "if specified, saves user input and generations to cache as well\n",
         [](common_params & params) {
             params.prompt_cache_all = true;
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN}));
+    ).set_examples({LLAMA_EXAMPLE_COMPLETION}));
     add_opt(common_arg(
         {"--prompt-cache-ro"},
         "if specified, uses the prompt cache but does not update it",
         [](common_params & params) {
             params.prompt_cache_ro = true;
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN}));
+    ).set_examples({LLAMA_EXAMPLE_COMPLETION}));
     add_opt(common_arg(
         {"-r", "--reverse-prompt"}, "PROMPT",
         "halt generation at PROMPT, return control in interactive mode\n",
         [](common_params & params, const std::string & value) {
             params.antiprompt.emplace_back(value);
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER}));
     add_opt(common_arg(
         {"-sp", "--special"},
         string_format("special tokens output enabled (default: %s)", params.special ? "true" : "false"),
         [](common_params & params) {
             params.special = true;
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER}));
     add_opt(common_arg(
         {"-cnv", "--conversation"},
-        "run in conversation mode:\n"
+        {"-no-cnv", "--no-conversation"},
+        "whether to run in conversation mode:\n"
         "- does not print special tokens and suffix/prefix\n"
         "- interactive mode is also enabled\n"
         "(default: auto enabled if chat template is available)",
-        [](common_params & params) {
-            params.conversation_mode = COMMON_CONVERSATION_MODE_ENABLED;
-        }
-    ).set_examples({LLAMA_EXAMPLE_MAIN}));
-    add_opt(common_arg(
-        {"-no-cnv", "--no-conversation"},
-        "force disable conversation mode (default: false)",
-        [](common_params & params) {
-            params.conversation_mode = COMMON_CONVERSATION_MODE_DISABLED;
+        [](common_params & params, bool value) {
+            params.conversation_mode = value ? COMMON_CONVERSATION_MODE_ENABLED : COMMON_CONVERSATION_MODE_DISABLED;
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN}));
+    ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}));
     add_opt(common_arg(
         {"-st", "--single-turn"},
         "run conversation for a single turn only, then exit when done\n"
@@ -1189,28 +1320,28 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params) {
             params.single_turn = true;
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN}));
+    ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}));
     add_opt(common_arg(
         {"-i", "--interactive"},
         string_format("run in interactive mode (default: %s)", params.interactive ? "true" : "false"),
         [](common_params & params) {
             params.interactive = true;
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN}));
+    ).set_examples({LLAMA_EXAMPLE_COMPLETION}));
     add_opt(common_arg(
         {"-if", "--interactive-first"},
         string_format("run in interactive mode and wait for input right away (default: %s)", params.interactive_first ? "true" : "false"),
         [](common_params & params) {
             params.interactive_first = true;
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN}));
+    ).set_examples({LLAMA_EXAMPLE_COMPLETION}));
     add_opt(common_arg(
         {"-mli", "--multiline-input"},
         "allows you to write or paste multiple lines without ending each in '\\'",
         [](common_params & params) {
             params.multiline_input = true;
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN}));
+    ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}));
     add_opt(common_arg(
         {"--in-prefix-bos"},
         "prefix BOS to user inputs, preceding the `--in-prefix` string",
@@ -1218,7 +1349,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.input_prefix_bos = true;
             params.enable_chat_template = false;
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN}));
+    ).set_examples({LLAMA_EXAMPLE_COMPLETION}));
     add_opt(common_arg(
         {"--in-prefix"}, "STRING",
         "string to prefix user inputs with (default: empty)",
@@ -1226,7 +1357,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.input_prefix = value;
             params.enable_chat_template = false;
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN}));
+    ).set_examples({LLAMA_EXAMPLE_COMPLETION}));
     add_opt(common_arg(
         {"--in-suffix"}, "STRING",
         "string to suffix after user inputs with (default: empty)",
@@ -1234,14 +1365,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.input_suffix = value;
             params.enable_chat_template = false;
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN}));
+    ).set_examples({LLAMA_EXAMPLE_COMPLETION}));
     add_opt(common_arg(
+        {"--warmup"},
         {"--no-warmup"},
-        "skip warming up the model with an empty run",
-        [](common_params & params) {
-            params.warmup = false;
+        string_format("whether to perform warmup with an empty run (default: %s)", params.warmup ? "enabled" : "disabled"),
+        [](common_params & params, bool value) {
+            params.warmup = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MTMD, LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_PERPLEXITY}));
+    ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MTMD, LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_PERPLEXITY}));
     add_opt(common_arg(
         {"--spm-infill"},
         string_format(
@@ -1298,7 +1430,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.sampling.top_k = value;
             params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_TOP_K;
         }
-    ).set_sparam());
+    ).set_sparam().set_env("LLAMA_ARG_TOP_K"));
     add_opt(common_arg(
         {"--top-p"}, "N",
         string_format("top-p sampling (default: %.1f, 1.0 = disabled)", (double)params.sampling.top_p),
@@ -1632,28 +1764,30 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, int value) {
             params.grp_attn_n = value;
         }
-    ).set_env("LLAMA_ARG_GRP_ATTN_N").set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_PASSKEY}));
+    ).set_env("LLAMA_ARG_GRP_ATTN_N").set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_PASSKEY}));
     add_opt(common_arg(
         {"-gaw", "--grp-attn-w"}, "N",
         string_format("group-attention width (default: %d)", params.grp_attn_w),
         [](common_params & params, int value) {
             params.grp_attn_w = value;
         }
-    ).set_env("LLAMA_ARG_GRP_ATTN_W").set_examples({LLAMA_EXAMPLE_MAIN}));
+    ).set_env("LLAMA_ARG_GRP_ATTN_W").set_examples({LLAMA_EXAMPLE_COMPLETION}));
     add_opt(common_arg(
+        {"-kvo", "--kv-offload"},
         {"-nkvo", "--no-kv-offload"},
-        "disable KV offload",
-        [](common_params & params) {
-            params.no_kv_offload = true;
+        string_format("whether to enable KV cache offloading (default: %s)", params.no_kv_offload ? "disabled" : "enabled"),
+        [](common_params & params, bool value) {
+            params.no_kv_offload = !value;
         }
-    ).set_env("LLAMA_ARG_NO_KV_OFFLOAD"));
+    ).set_env("LLAMA_ARG_KV_OFFLOAD"));
     add_opt(common_arg(
+        {"--repack"},
         {"-nr", "--no-repack"},
-        "disable weight repacking",
-        [](common_params & params) {
-            params.no_extra_bufts = true;
+        string_format("whether to enable weight repacking (default: %s)", params.no_extra_bufts ? "disabled" : "enabled"),
+        [](common_params & params, bool value) {
+            params.no_extra_bufts = !value;
         }
-    ).set_env("LLAMA_ARG_NO_REPACK"));
+    ).set_env("LLAMA_ARG_REPACK"));
     add_opt(common_arg(
         {"--no-host"},
         "bypass host buffer allowing extra buffers to be used",
@@ -1766,13 +1900,27 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             LOG_WRN("DEPRECATED: --defrag-thold is deprecated and no longer necessary to specify\n");
         }
     ).set_env("LLAMA_ARG_DEFRAG_THOLD"));
-    add_opt(common_arg(
-        {"-np", "--parallel"}, "N",
-        string_format("number of parallel sequences to decode (default: %d)", params.n_parallel),
-        [](common_params & params, int value) {
-            params.n_parallel = value;
-        }
-    ).set_env("LLAMA_ARG_N_PARALLEL"));
+    if (ex == LLAMA_EXAMPLE_SERVER) {
+        // this is to make sure this option appears in the server-specific section of the help message
+        add_opt(common_arg(
+            {"-np", "--parallel"}, "N",
+            string_format("number of server slots (default: %d, -1 = auto)", params.n_parallel),
+            [](common_params & params, int value) {
+                if (value == 0) {
+                    throw std::invalid_argument("error: invalid value for n_parallel\n");
+                }
+                params.n_parallel = value;
+            }
+        ).set_env("LLAMA_ARG_N_PARALLEL").set_examples({LLAMA_EXAMPLE_SERVER}));
+    } else {
+        add_opt(common_arg(
+            {"-np", "--parallel"}, "N",
+            string_format("number of parallel sequences to decode (default: %d)", params.n_parallel),
+            [](common_params & params, int value) {
+                params.n_parallel = value;
+            }
+        ).set_env("LLAMA_ARG_N_PARALLEL"));
+    }
     add_opt(common_arg(
         {"-ns", "--sequences"}, "N",
         string_format("number of sequences to decode (default: %d)", params.n_sequences),
@@ -1782,20 +1930,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_examples({LLAMA_EXAMPLE_PARALLEL}));
     add_opt(common_arg(
         {"-cb", "--cont-batching"},
-        string_format("enable continuous batching (a.k.a dynamic batching) (default: %s)", params.cont_batching ? "enabled" : "disabled"),
-        [](common_params & params) {
-            params.cont_batching = true;
-        }
-    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CONT_BATCHING"));
-    add_opt(common_arg(
         {"-nocb", "--no-cont-batching"},
-        "disable continuous batching",
-        [](common_params & params) {
-            params.cont_batching = false;
+        string_format("whether to enable continuous batching (a.k.a dynamic batching) (default: %s)", params.cont_batching ? "enabled" : "disabled"),
+        [](common_params & params, bool value) {
+            params.cont_batching = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_CONT_BATCHING"));
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CONT_BATCHING"));
     add_opt(common_arg(
-        {"--mmproj"}, "FILE",
+        {"-mm", "--mmproj"}, "FILE",
         "path to a multimodal projector file. see tools/mtmd/README.md\n"
         "note: if -hf is used, this argument can be omitted",
         [](common_params & params, const std::string & value) {
@@ -1803,33 +1945,35 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_examples(mmproj_examples).set_env("LLAMA_ARG_MMPROJ"));
     add_opt(common_arg(
-        {"--mmproj-url"}, "URL",
+        {"-mmu", "--mmproj-url"}, "URL",
         "URL to a multimodal projector file. see tools/mtmd/README.md",
         [](common_params & params, const std::string & value) {
             params.mmproj.url = value;
         }
     ).set_examples(mmproj_examples).set_env("LLAMA_ARG_MMPROJ_URL"));
     add_opt(common_arg(
-        {"--no-mmproj"},
-        "explicitly disable multimodal projector, useful when using -hf",
-        [](common_params & params) {
-            params.no_mmproj = true;
+        {"--mmproj-auto"},
+        {"--no-mmproj", "--no-mmproj-auto"},
+        string_format("whether to use multimodal projector file (if available), useful when using -hf (default: %s)", params.no_mmproj ? "disabled" : "enabled"),
+        [](common_params & params, bool value) {
+            params.no_mmproj = !value;
         }
-    ).set_examples(mmproj_examples).set_env("LLAMA_ARG_NO_MMPROJ"));
+    ).set_examples(mmproj_examples).set_env("LLAMA_ARG_MMPROJ_AUTO"));
     add_opt(common_arg(
+        {"--mmproj-offload"},
         {"--no-mmproj-offload"},
-        "do not offload multimodal projector to GPU",
-        [](common_params & params) {
-            params.mmproj_use_gpu = false;
+        string_format("whether to enable GPU offloading for multimodal projector (default: %s)", params.mmproj_use_gpu ? "enabled" : "disabled"),
+        [](common_params & params, bool value) {
+            params.mmproj_use_gpu = value;
         }
-    ).set_examples(mmproj_examples).set_env("LLAMA_ARG_NO_MMPROJ_OFFLOAD"));
+    ).set_examples(mmproj_examples).set_env("LLAMA_ARG_MMPROJ_OFFLOAD"));
     add_opt(common_arg(
         {"--image", "--audio"}, "FILE",
         "path to an image or audio file. use with multimodal models, can be repeated if you have multiple files\n",
         [](common_params & params, const std::string & value) {
             params.image.emplace_back(value);
         }
-    ).set_examples({LLAMA_EXAMPLE_MTMD}));
+    ).set_examples({LLAMA_EXAMPLE_MTMD, LLAMA_EXAMPLE_CLI}));
     add_opt(common_arg(
         {"--image-min-tokens"}, "N",
         "minimum number of tokens each image can take, only used by vision models with dynamic resolution (default: read from model)",
@@ -1862,12 +2006,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_env("LLAMA_ARG_MLOCK"));
     add_opt(common_arg(
+        {"--mmap"},
         {"--no-mmap"},
-        "do not memory-map model (slower load but may reduce pageouts if not using mlock)",
-        [](common_params & params) {
-            params.use_mmap = false;
+        string_format("whether to memory-map model (if disabled, slower load but may reduce pageouts if not using mlock) (default: %s)", params.use_mmap ? "enabled" : "disabled"),
+        [](common_params & params, bool value) {
+            params.use_mmap = value;
         }
-    ).set_env("LLAMA_ARG_NO_MMAP"));
+    ).set_env("LLAMA_ARG_MMAP"));
     add_opt(common_arg(
         {"--numa"}, "TYPE",
         "attempt optimizations that help on some NUMA systems\n"
@@ -1922,7 +2067,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         "override tensor buffer type for draft model", [](common_params & params, const std::string & value) {
             parse_tensor_buffer_overrides(value, params.speculative.tensor_buft_overrides);
         }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
     add_opt(common_arg(
         {"--cpu-moe", "-cmoe"},
         "keep all Mixture of Experts (MoE) weights in the CPU",
@@ -1951,7 +2096,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params) {
             params.speculative.tensor_buft_overrides.push_back(llm_ffn_exps_cpu_override());
         }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CPU_MOE_DRAFT"));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_CPU_MOE_DRAFT"));
     add_opt(common_arg(
         {"--n-cpu-moe-draft", "-ncmoed"}, "N",
         "keep the Mixture of Experts (MoE) weights of the first N layers in the CPU for the draft model",
@@ -1965,7 +2110,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
                 params.speculative.tensor_buft_overrides.push_back({buft_overrides_draft.back().c_str(), ggml_backend_cpu_buffer_type()});
             }
         }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_N_CPU_MOE_DRAFT"));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_N_CPU_MOE_DRAFT"));
     add_opt(common_arg(
         {"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N",
         string_format("max. number of layers to store in VRAM (default: %d)", params.n_gpu_layers),
@@ -2037,6 +2182,34 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            }
        }
    ).set_env("LLAMA_ARG_MAIN_GPU"));
+    add_opt(common_arg(
+        { "-fit", "--fit" }, "[on|off]",
+        string_format("whether to adjust unset arguments to fit in device memory ('on' or 'off', default: '%s')", params.fit_params ? "on" : "off"),
+        [](common_params & params, const std::string & value) {
+            if (is_truthy(value)) {
+                params.fit_params = true;
+            } else if (is_falsey(value)) {
+                params.fit_params = false;
+            } else {
+                throw std::runtime_error(
+                    string_format("error: unkown value for --fit: '%s'\n", value.c_str()));
+            }
+        }
+    ).set_env("LLAMA_ARG_FIT"));
+    add_opt(common_arg(
+        { "-fitt", "--fit-target" }, "MiB",
+        string_format("target margin per device for --fit option, default: %zu", params.fit_params_target/(1024*1024)),
+        [](common_params & params, int value) {
+            params.fit_params_target = value * size_t(1024*1024);
+        }
+    ).set_env("LLAMA_ARG_FIT_TARGET"));
+    add_opt(common_arg(
+        { "-fitc", "--fit-ctx" }, "N",
+        string_format("minimum ctx size that can be set by --fit option, default: %" PRIu32, params.fit_params_min_ctx),
+        [](common_params & params, int value) {
+            params.fit_params_min_ctx = value;
+        }
+    ).set_env("LLAMA_ARG_FIT_CTX"));
     add_opt(common_arg(
         {"--check-tensors"},
         string_format("check model tensor data for invalid values (default: %s)", params.check_tensors ? "true" : "false"),
@@ -2055,10 +2228,11 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ));
     add_opt(common_arg(
+        {"--op-offload"},
         {"--no-op-offload"},
-        string_format("disable offloading host tensor operations to device (default: %s)", params.no_op_offload ? "true" : "false"),
-        [](common_params & params) {
-            params.no_op_offload = true;
+        string_format("whether to offload host tensor operations to device (default: %s)", params.no_op_offload ? "false" : "true"),
+        [](common_params & params, bool value) {
+            params.no_op_offload = !value;
         }
     ));
     add_opt(common_arg(
@@ -2254,10 +2428,11 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
     add_opt(common_arg(
+        {"--ppl"},
         {"--no-ppl"},
-        string_format("do not compute perplexity (default: %s)", params.compute_ppl ? "true" : "false"),
-        [](common_params & params) {
-            params.compute_ppl = false;
+        string_format("whether to compute perplexity (default: %s)", params.compute_ppl ? "true" : "false"),
+        [](common_params & params, bool value) {
+            params.compute_ppl = value;
         }
     ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
     add_opt(common_arg(
@@ -2376,12 +2551,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_API_PREFIX"));
     add_opt(common_arg(
+        {"--webui"},
         {"--no-webui"},
-        string_format("Disable the Web UI (default: %s)", params.webui ? "enabled" : "disabled"),
-        [](common_params & params) {
-            params.webui = false;
+        string_format("whether to enable the Web UI (default: %s)", params.webui ? "enabled" : "disabled"),
+        [](common_params & params, bool value) {
+            params.webui = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_WEBUI"));
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_WEBUI"));
     add_opt(common_arg(
         {"--embedding", "--embeddings"},
         string_format("restrict to only support embedding use case; use only with dedicated embedding models (default: %s)", params.embedding ? "enabled" : "disabled"),
@@ -2444,7 +2620,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
                 params.default_template_kwargs[item.key()] = item.value().dump();
             }
         }
-    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_CHAT_TEMPLATE_KWARGS"));
+    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_CHAT_TEMPLATE_KWARGS"));
     add_opt(common_arg(
         {"-to", "--timeout"}, "N",
         string_format("server read/write timeout in seconds (default: %d)", params.timeout_read),
@@ -2486,18 +2662,12 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_PROPS"));
     add_opt(common_arg(
         {"--slots"},
-        string_format("enable slots monitoring endpoint (default: %s)", params.endpoint_slots ? "enabled" : "disabled"),
-        [](common_params & params) {
-            params.endpoint_slots = true;
-        }
-    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_SLOTS"));
-    add_opt(common_arg(
         {"--no-slots"},
-        "disables slots monitoring endpoint",
-        [](common_params & params) {
-            params.endpoint_slots = false;
+        string_format("expose slots monitoring endpoint (default: %s)", params.endpoint_slots ? "enabled" : "disabled"),
+        [](common_params & params, bool value) {
+            params.endpoint_slots = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_ENDPOINT_SLOTS"));
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_SLOTS"));
     add_opt(common_arg(
         {"--slot-save-path"}, "PATH",
         "path to save slot kv cache (default: disabled)",
@@ -2533,6 +2703,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.models_dir = value;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_DIR"));
+    add_opt(common_arg(
+        {"--models-preset"}, "PATH",
+        "path to INI file containing model presets for the router server (default: disabled)",
+        [](common_params & params, const std::string & value) {
+            params.models_preset = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_PRESET"));
     add_opt(common_arg(
         {"--models-max"}, "N",
         string_format("for router server, maximum number of models to load simultaneously (default: %d, 0 = unlimited)", params.models_max),
@@ -2541,26 +2718,21 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_MAX"));
     add_opt(common_arg(
+        {"--models-autoload"},
         {"--no-models-autoload"},
-        "disables automatic loading of models (default: enabled)",
-        [](common_params & params) {
-            params.models_autoload = false;
+        string_format("for router server, whether to automatically load models (default: %s)", params.models_autoload ? "enabled" : "disabled"),
+        [](common_params & params, bool value) {
+            params.models_autoload = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_MODELS_AUTOLOAD"));
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_AUTOLOAD"));
     add_opt(common_arg(
         {"--jinja"},
-        string_format("use jinja template for chat (default: %s)\n", params.use_jinja ? "enabled" : "disabled"),
-        [](common_params & params) {
-            params.use_jinja = true;
-        }
-    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_JINJA"));
-    add_opt(common_arg(
         {"--no-jinja"},
-        string_format("disable jinja template for chat (default: %s)\n", params.use_jinja ? "enabled" : "disabled"),
-        [](common_params & params) {
-            params.use_jinja = false;
+        string_format("whether to use jinja template engine for chat (default: %s)", params.use_jinja ? "enabled" : "disabled"),
+        [](common_params & params, bool value) {
+            params.use_jinja = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_NO_JINJA"));
+    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_JINJA"));
     add_opt(common_arg(
         {"--reasoning-format"}, "FORMAT",
         "controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:\n"
@@ -2571,7 +2743,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, const std::string & value) {
             params.reasoning_format = common_reasoning_format_from_name(value);
         }
-    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_THINK"));
+    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_THINK"));
     add_opt(common_arg(
         {"--reasoning-budget"}, "N",
         "controls the amount of thinking allowed; currently only one of: -1 for unrestricted thinking budget, or 0 to disable thinking (default: -1)",
@@ -2579,7 +2751,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             if (value != 0 && value != -1) { throw std::invalid_argument("invalid value"); }
             params.reasoning_budget = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_THINK_BUDGET"));
+    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_THINK_BUDGET"));
     add_opt(common_arg(
         {"--chat-template"}, "JINJA_TEMPLATE",
         string_format(
@@ -2591,7 +2763,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, const std::string & value) {
             params.chat_template = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_CHAT_TEMPLATE"));
+    ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_CHAT_TEMPLATE"));
     add_opt(common_arg(
         {"--chat-template-file"}, "JINJA_TEMPLATE_FILE",
         string_format(
@@ -2603,17 +2775,18 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, const std::string & value) {
             params.chat_template = read_file(value);
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CHAT_TEMPLATE_FILE"));
+    ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CHAT_TEMPLATE_FILE"));
     add_opt(common_arg(
+        {"--prefill-assistant"},
         {"--no-prefill-assistant"},
         string_format(
             "whether to prefill the assistant's response if the last message is an assistant message (default: prefill enabled)\n"
             "when this flag is set, if the last message is an assistant message then it will be treated as a full message and not prefilled\n"
         ),
-        [](common_params & params) {
-            params.prefill_assistant = false;
+        [](common_params & params, bool value) {
+            params.prefill_assistant = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_PREFILL_ASSISTANT"));
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_PREFILL_ASSISTANT"));
     add_opt(common_arg(
         {"-sps", "--slot-prompt-similarity"}, "SIMILARITY",
         string_format("how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity),
@@ -2634,7 +2807,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params) {
             params.simple_io = true;
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN}));
+    ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}));
     add_opt(common_arg(
         {"--positive-file"}, "FNAME",
         string_format("positive prompts file, one prompt per line (default: '%s')", params.cvector_positive_file.c_str()),
@@ -2717,7 +2890,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         "Set verbosity level to infinity (i.e. log all messages, useful for debugging)",
         [](common_params & params) {
             params.verbosity = INT_MAX;
-            common_log_set_verbosity_thold(INT_MAX);
         }
     ));
     add_opt(common_arg(
@@ -2738,7 +2910,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             "(default: %d)\n", params.verbosity),
         [](common_params & params, int value) {
             params.verbosity = value;
-            common_log_set_verbosity_thold(value);
         }
     ).set_env("LLAMA_LOG_VERBOSITY"));
     add_opt(common_arg(
@@ -2871,14 +3042,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, int value) {
             params.speculative.n_max = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_MAX"));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_DRAFT_MAX"));
     add_opt(common_arg(
         {"--draft-min", "--draft-n-min"}, "N",
         string_format("minimum number of draft tokens to use for speculative decoding (default: %d)", params.speculative.n_min),
         [](common_params & params, int value) {
             params.speculative.n_min = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_MIN"));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_DRAFT_MIN"));
     add_opt(common_arg(
         {"--draft-p-split"}, "P",
         string_format("speculative decoding split probability (default: %.1f)", (double)params.speculative.p_split),
@@ -2892,14 +3063,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, const std::string & value) {
             params.speculative.p_min = std::stof(value);
         }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_P_MIN"));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_DRAFT_P_MIN"));
     add_opt(common_arg(
         {"-cd", "--ctx-size-draft"}, "N",
         string_format("size of the prompt context for the draft model (default: %d, 0 = loaded from model)", params.speculative.n_ctx),
         [](common_params & params, int value) {
             params.speculative.n_ctx = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CTX_SIZE_DRAFT"));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_CTX_SIZE_DRAFT"));
     add_opt(common_arg(
         {"-devd", "--device-draft"}, "<dev1,dev2,..>",
         "comma-separated list of devices to use for offloading the draft model (none = don't offload)\n"
@@ -2907,7 +3078,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, const std::string & value) {
             params.speculative.devices = parse_device_list(value);
         }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
     add_opt(common_arg(
         {"-ngld", "--gpu-layers-draft", "--n-gpu-layers-draft"}, "N",
         "number of layers to store in VRAM for the draft model",
@@ -2919,21 +3090,21 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
                 fprintf(stderr, "warning: consult docs/build.md for compilation instructions\n");
             }
         }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_N_GPU_LAYERS_DRAFT"));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_N_GPU_LAYERS_DRAFT"));
     add_opt(common_arg(
         {"-md", "--model-draft"}, "FNAME",
         "draft model for speculative decoding (default: unused)",
         [](common_params & params, const std::string & value) {
             params.speculative.model.path = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODEL_DRAFT"));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_MODEL_DRAFT"));
     add_opt(common_arg(
         {"--spec-replace"}, "TARGET", "DRAFT",
         "translate the string in TARGET into DRAFT if the draft model and main model are not compatible",
         [](common_params & params, const std::string & tgt, const std::string & dft) {
             params.speculative.replacements.push_back({ tgt, dft });
         }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
     add_opt(common_arg(
         {"-ctkd", "--cache-type-k-draft"}, "TYPE",
         string_format(
@@ -3197,7 +3368,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.use_jinja = true;
             //params.default_template_kwargs["reasoning_effort"] = "\"high\"";
         }
-    ).set_examples({LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
 
     add_opt(common_arg(
         {"--gpt-oss-120b-default"},
@@ -3216,7 +3387,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.use_jinja = true;
             //params.default_template_kwargs["reasoning_effort"] = "\"high\"";
         }
-    ).set_examples({LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
 
     add_opt(common_arg(
         {"--vision-gemma-4b-default"},
@@ -3227,7 +3398,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.n_ctx = 0;
             params.use_jinja = true;
         }
-    ).set_examples({LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
 
     add_opt(common_arg(
         {"--vision-gemma-12b-default"},
@@ -3238,7 +3409,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.n_ctx = 0;
             params.use_jinja = true;
         }
-    ).set_examples({LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
 
     return ctx_arg;
 }