@fugood/llama.node 0.3.2 → 0.3.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (190)
  1. package/CMakeLists.txt +2 -0
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  8. package/bin/win32/arm64/llama-node.node +0 -0
  9. package/bin/win32/arm64/node.lib +0 -0
  10. package/bin/win32/x64/llama-node.node +0 -0
  11. package/bin/win32/x64/node.lib +0 -0
  12. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  13. package/bin/win32-vulkan/arm64/node.lib +0 -0
  14. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/x64/node.lib +0 -0
  16. package/package.json +1 -1
  17. package/src/DetokenizeWorker.cpp +1 -1
  18. package/src/EmbeddingWorker.cpp +2 -2
  19. package/src/LlamaCompletionWorker.cpp +8 -8
  20. package/src/LlamaCompletionWorker.h +2 -2
  21. package/src/LlamaContext.cpp +8 -9
  22. package/src/TokenizeWorker.cpp +1 -1
  23. package/src/common.hpp +4 -4
  24. package/src/llama.cpp/.github/workflows/build.yml +43 -9
  25. package/src/llama.cpp/.github/workflows/docker.yml +3 -0
  26. package/src/llama.cpp/CMakeLists.txt +7 -4
  27. package/src/llama.cpp/cmake/arm64-apple-clang.cmake +16 -0
  28. package/src/llama.cpp/common/CMakeLists.txt +0 -2
  29. package/src/llama.cpp/common/arg.cpp +642 -607
  30. package/src/llama.cpp/common/arg.h +22 -22
  31. package/src/llama.cpp/common/common.cpp +79 -281
  32. package/src/llama.cpp/common/common.h +130 -100
  33. package/src/llama.cpp/common/json-schema-to-grammar.cpp +1 -1
  34. package/src/llama.cpp/common/log.cpp +50 -50
  35. package/src/llama.cpp/common/log.h +18 -18
  36. package/src/llama.cpp/common/ngram-cache.cpp +36 -36
  37. package/src/llama.cpp/common/ngram-cache.h +19 -19
  38. package/src/llama.cpp/common/sampling.cpp +116 -108
  39. package/src/llama.cpp/common/sampling.h +20 -20
  40. package/src/llama.cpp/docs/build.md +37 -17
  41. package/src/llama.cpp/examples/CMakeLists.txt +1 -1
  42. package/src/llama.cpp/examples/batched/batched.cpp +14 -14
  43. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +10 -11
  44. package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +1 -1
  45. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +9 -9
  46. package/src/llama.cpp/examples/embedding/embedding.cpp +12 -12
  47. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +8 -8
  48. package/src/llama.cpp/examples/export-lora/export-lora.cpp +5 -5
  49. package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +7 -7
  50. package/src/llama.cpp/examples/gritlm/gritlm.cpp +18 -18
  51. package/src/llama.cpp/examples/imatrix/imatrix.cpp +20 -11
  52. package/src/llama.cpp/examples/infill/infill.cpp +40 -86
  53. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +42 -151
  54. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
  55. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +11 -14
  56. package/src/llama.cpp/examples/llava/clip.cpp +1 -0
  57. package/src/llama.cpp/examples/llava/llava-cli.cpp +23 -23
  58. package/src/llama.cpp/examples/llava/llava.cpp +37 -3
  59. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +21 -21
  60. package/src/llama.cpp/examples/lookahead/lookahead.cpp +26 -26
  61. package/src/llama.cpp/examples/lookup/lookup-create.cpp +7 -7
  62. package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
  63. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +14 -14
  64. package/src/llama.cpp/examples/lookup/lookup.cpp +29 -29
  65. package/src/llama.cpp/examples/main/main.cpp +64 -109
  66. package/src/llama.cpp/examples/parallel/parallel.cpp +18 -19
  67. package/src/llama.cpp/examples/passkey/passkey.cpp +14 -14
  68. package/src/llama.cpp/examples/perplexity/perplexity.cpp +99 -120
  69. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +10 -9
  70. package/src/llama.cpp/examples/retrieval/retrieval.cpp +13 -13
  71. package/src/llama.cpp/examples/rpc/rpc-server.cpp +3 -1
  72. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +34 -17
  73. package/src/llama.cpp/examples/server/CMakeLists.txt +4 -13
  74. package/src/llama.cpp/examples/server/server.cpp +553 -691
  75. package/src/llama.cpp/examples/server/utils.hpp +312 -25
  76. package/src/llama.cpp/examples/simple/CMakeLists.txt +1 -1
  77. package/src/llama.cpp/examples/simple/simple.cpp +128 -96
  78. package/src/llama.cpp/examples/simple-chat/CMakeLists.txt +5 -0
  79. package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +197 -0
  80. package/src/llama.cpp/examples/speculative/speculative.cpp +54 -51
  81. package/src/llama.cpp/examples/tokenize/tokenize.cpp +2 -2
  82. package/src/llama.cpp/ggml/CMakeLists.txt +15 -9
  83. package/src/llama.cpp/ggml/include/ggml-amx.h +25 -0
  84. package/src/llama.cpp/ggml/include/ggml-backend.h +46 -33
  85. package/src/llama.cpp/ggml/include/ggml-blas.h +5 -3
  86. package/src/llama.cpp/ggml/include/ggml-cann.h +9 -7
  87. package/src/llama.cpp/ggml/include/ggml-cpp.h +38 -0
  88. package/src/llama.cpp/ggml/include/ggml-cpu.h +177 -0
  89. package/src/llama.cpp/ggml/include/ggml-cuda.h +12 -12
  90. package/src/llama.cpp/ggml/include/ggml-kompute.h +7 -3
  91. package/src/llama.cpp/ggml/include/ggml-metal.h +11 -7
  92. package/src/llama.cpp/ggml/include/ggml-opt.h +216 -0
  93. package/src/llama.cpp/ggml/include/ggml-rpc.h +9 -5
  94. package/src/llama.cpp/ggml/include/ggml-sycl.h +18 -11
  95. package/src/llama.cpp/ggml/include/ggml-vulkan.h +10 -8
  96. package/src/llama.cpp/ggml/include/ggml.h +53 -393
  97. package/src/llama.cpp/ggml/src/CMakeLists.txt +66 -1149
  98. package/src/llama.cpp/ggml/src/ggml-aarch64.c +46 -3126
  99. package/src/llama.cpp/ggml/src/ggml-aarch64.h +0 -20
  100. package/src/llama.cpp/ggml/src/ggml-alloc.c +23 -27
  101. package/src/llama.cpp/ggml/src/ggml-amx/CMakeLists.txt +107 -0
  102. package/src/llama.cpp/ggml/src/ggml-amx/common.h +94 -0
  103. package/src/llama.cpp/ggml/src/ggml-amx/ggml-amx.cpp +446 -0
  104. package/src/llama.cpp/ggml/src/ggml-amx/mmq.cpp +2510 -0
  105. package/src/llama.cpp/ggml/src/ggml-amx/mmq.h +17 -0
  106. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +6 -25
  107. package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +195 -0
  108. package/src/llama.cpp/ggml/src/ggml-backend.cpp +303 -864
  109. package/src/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +91 -0
  110. package/src/llama.cpp/ggml/src/{ggml-blas.cpp → ggml-blas/ggml-blas.cpp} +213 -65
  111. package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +46 -0
  112. package/src/llama.cpp/ggml/src/{ggml-cann.cpp → ggml-cann/ggml-cann.cpp} +255 -149
  113. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +261 -0
  114. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.c +3560 -0
  115. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +30 -0
  116. package/src/llama.cpp/ggml/src/{ggml-cpu-impl.h → ggml-cpu/ggml-cpu-impl.h} +0 -243
  117. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +10822 -0
  118. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.h +63 -0
  119. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +13970 -0
  120. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +663 -0
  121. package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.cpp +667 -1
  122. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +155 -0
  123. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +106 -0
  124. package/src/llama.cpp/ggml/src/ggml-impl.h +366 -16
  125. package/src/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +162 -0
  126. package/src/llama.cpp/ggml/src/{ggml-kompute.cpp → ggml-kompute/ggml-kompute.cpp} +238 -72
  127. package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +108 -0
  128. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +249 -0
  129. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +100 -0
  130. package/src/llama.cpp/ggml/src/ggml-opt.cpp +867 -0
  131. package/src/llama.cpp/ggml/src/ggml-quants.c +187 -10692
  132. package/src/llama.cpp/ggml/src/ggml-quants.h +78 -125
  133. package/src/llama.cpp/ggml/src/ggml-rpc/CMakeLists.txt +11 -0
  134. package/src/llama.cpp/ggml/src/{ggml-rpc.cpp → ggml-rpc/ggml-rpc.cpp} +475 -300
  135. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +81 -0
  136. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +3 -0
  137. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +40 -0
  138. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +258 -0
  139. package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +1 -0
  140. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +2 -22
  141. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +1011 -0
  142. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +76 -0
  143. package/src/llama.cpp/ggml/src/{ggml-sycl.cpp → ggml-sycl/ggml-sycl.cpp} +3584 -4142
  144. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +69 -67
  145. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +3 -3
  146. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +56 -0
  147. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.hpp +11 -0
  148. package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +6 -0
  149. package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +4 -4
  150. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +138 -0
  151. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +10 -0
  152. package/src/llama.cpp/ggml/src/ggml-threading.cpp +12 -0
  153. package/src/llama.cpp/ggml/src/ggml-threading.h +12 -0
  154. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +78 -0
  155. package/src/llama.cpp/ggml/src/{ggml-vulkan.cpp → ggml-vulkan/ggml-vulkan.cpp} +555 -623
  156. package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/vulkan-shaders-gen.cpp +125 -206
  157. package/src/llama.cpp/ggml/src/ggml.c +4032 -19890
  158. package/src/llama.cpp/include/llama.h +67 -33
  159. package/src/llama.cpp/pocs/vdot/q8dot.cpp +4 -3
  160. package/src/llama.cpp/pocs/vdot/vdot.cpp +8 -7
  161. package/src/llama.cpp/src/CMakeLists.txt +2 -1
  162. package/src/llama.cpp/src/llama-sampling.cpp +745 -105
  163. package/src/llama.cpp/src/llama-sampling.h +21 -2
  164. package/src/llama.cpp/src/llama-vocab.cpp +49 -9
  165. package/src/llama.cpp/src/llama-vocab.h +35 -11
  166. package/src/llama.cpp/src/llama.cpp +2636 -2406
  167. package/src/llama.cpp/src/unicode-data.cpp +2 -2
  168. package/src/llama.cpp/tests/CMakeLists.txt +1 -2
  169. package/src/llama.cpp/tests/test-arg-parser.cpp +14 -14
  170. package/src/llama.cpp/tests/test-backend-ops.cpp +185 -60
  171. package/src/llama.cpp/tests/test-barrier.cpp +1 -0
  172. package/src/llama.cpp/tests/test-chat-template.cpp +9 -5
  173. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +17 -4
  174. package/src/llama.cpp/tests/test-log.cpp +2 -2
  175. package/src/llama.cpp/tests/test-opt.cpp +853 -142
  176. package/src/llama.cpp/tests/test-quantize-fns.cpp +22 -19
  177. package/src/llama.cpp/tests/test-quantize-perf.cpp +16 -14
  178. package/src/llama.cpp/tests/test-rope.cpp +1 -0
  179. package/src/llama.cpp/tests/test-sampling.cpp +162 -137
  180. package/src/llama.cpp/tests/test-tokenizer-0.cpp +7 -7
  181. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +5 -5
  182. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +5 -5
  183. package/src/llama.cpp/common/train.cpp +0 -1515
  184. package/src/llama.cpp/common/train.h +0 -233
  185. package/src/llama.cpp/examples/baby-llama/CMakeLists.txt +0 -5
  186. package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +0 -1639
  187. package/src/llama.cpp/tests/test-grad0.cpp +0 -1683
  188. /package/src/llama.cpp/ggml/{cmake → src/ggml-cpu/cmake}/FindSIMD.cmake +0 -0
  189. /package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.h +0 -0
  190. /package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/CMakeLists.txt +0 -0
@@ -17,27 +17,27 @@
 
  using json = nlohmann::ordered_json;
 
- llama_arg & llama_arg::set_examples(std::initializer_list<enum llama_example> examples) {
+ common_arg & common_arg::set_examples(std::initializer_list<enum llama_example> examples) {
  this->examples = std::move(examples);
  return *this;
  }
 
- llama_arg & llama_arg::set_env(const char * env) {
+ common_arg & common_arg::set_env(const char * env) {
  help = help + "\n(env: " + env + ")";
  this->env = env;
  return *this;
  }
 
- llama_arg & llama_arg::set_sparam() {
+ common_arg & common_arg::set_sparam() {
  is_sparam = true;
  return *this;
  }
 
- bool llama_arg::in_example(enum llama_example ex) {
+ bool common_arg::in_example(enum llama_example ex) {
  return examples.find(ex) != examples.end();
  }
 
- bool llama_arg::get_value_from_env(std::string & output) {
+ bool common_arg::get_value_from_env(std::string & output) {
  if (env == nullptr) return false;
  char * value = std::getenv(env);
  if (value) {
@@ -47,7 +47,7 @@ bool llama_arg::get_value_from_env(std::string & output) {
  return false;
  }
 
- bool llama_arg::has_value_from_env() {
+ bool common_arg::has_value_from_env() {
  return env != nullptr && std::getenv(env);
  }
 
@@ -78,7 +78,7 @@ static std::vector<std::string> break_str_into_lines(std::string input, size_t m
  return result;
  }
 
- std::string llama_arg::to_string() {
+ std::string common_arg::to_string() {
  // params for printing to console
  const static int n_leading_spaces = 40;
  const static int n_char_per_line_help = 70; // TODO: detect this based on current console
@@ -119,33 +119,7 @@ std::string llama_arg::to_string() {
  // utils
  //
 
- #ifdef __GNUC__
- #ifdef __MINGW32__
- #define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
- #else
- #define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
- #endif
- #else
- #define LLAMA_COMMON_ATTRIBUTE_FORMAT(...)
- #endif
-
- LLAMA_COMMON_ATTRIBUTE_FORMAT(1, 2)
- static std::string format(const char * fmt, ...) {
- va_list ap;
- va_list ap2;
- va_start(ap, fmt);
- va_copy(ap2, ap);
- int size = vsnprintf(NULL, 0, fmt, ap);
- GGML_ASSERT(size >= 0 && size < INT_MAX); // NOLINT
- std::vector<char> buf(size + 1);
- int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
- GGML_ASSERT(size2 == size);
- va_end(ap2);
- va_end(ap);
- return std::string(buf.data(), size);
- }
-
- static void gpt_params_handle_model_default(gpt_params & params) {
+ static void common_params_handle_model_default(common_params & params) {
  if (!params.hf_repo.empty()) {
  // short-hand to avoid specifying --hf-file -> default it to --model
  if (params.hf_file.empty()) {
@@ -154,13 +128,13 @@ static void gpt_params_handle_model_default(gpt_params & params) {
  }
  params.hf_file = params.model;
  } else if (params.model.empty()) {
- params.model = fs_get_cache_file(string_split(params.hf_file, '/').back());
+ params.model = fs_get_cache_file(string_split<std::string>(params.hf_file, '/').back());
  }
  } else if (!params.model_url.empty()) {
  if (params.model.empty()) {
- auto f = string_split(params.model_url, '#').front();
- f = string_split(f, '?').front();
- params.model = fs_get_cache_file(string_split(f, '/').back());
+ auto f = string_split<std::string>(params.model_url, '#').front();
+ f = string_split<std::string>(f, '?').front();
+ params.model = fs_get_cache_file(string_split<std::string>(f, '/').back());
  }
  } else if (params.model.empty()) {
  params.model = DEFAULT_MODEL_PATH;
@@ -171,12 +145,12 @@ static void gpt_params_handle_model_default(gpt_params & params) {
  // CLI argument parsing functions
  //
 
- static bool gpt_params_parse_ex(int argc, char ** argv, gpt_params_context & ctx_arg) {
+ static bool common_params_parse_ex(int argc, char ** argv, common_params_context & ctx_arg) {
  std::string arg;
  const std::string arg_prefix = "--";
- gpt_params & params = ctx_arg.params;
+ common_params & params = ctx_arg.params;
 
- std::unordered_map<std::string, llama_arg *> arg_to_options;
+ std::unordered_map<std::string, common_arg *> arg_to_options;
  for (auto & opt : ctx_arg.options) {
  for (const auto & arg : opt.args) {
  arg_to_options[arg] = &opt;
@@ -199,7 +173,7 @@ static bool gpt_params_parse_ex(int argc, char ** argv, gpt_params_context & ctx
  continue;
  }
  } catch (std::exception & e) {
- throw std::invalid_argument(format(
+ throw std::invalid_argument(string_format(
  "error while handling environment variable \"%s\": %s\n\n", opt.env, e.what()));
  }
  }
@@ -220,7 +194,7 @@ static bool gpt_params_parse_ex(int argc, char ** argv, gpt_params_context & ctx
  std::replace(arg.begin(), arg.end(), '_', '-');
  }
  if (arg_to_options.find(arg) == arg_to_options.end()) {
- throw std::invalid_argument(format("error: invalid argument: %s", arg.c_str()));
+ throw std::invalid_argument(string_format("error: invalid argument: %s", arg.c_str()));
  }
  auto opt = *arg_to_options[arg];
  if (opt.has_value_from_env()) {
@@ -252,7 +226,7 @@ static bool gpt_params_parse_ex(int argc, char ** argv, gpt_params_context & ctx
  continue;
  }
  } catch (std::exception & e) {
- throw std::invalid_argument(format(
+ throw std::invalid_argument(string_format(
  "error while handling argument \"%s\": %s\n\n"
  "usage:\n%s\n\nto show complete usage, run with -h",
  arg.c_str(), e.what(), arg_to_options[arg]->to_string().c_str()));
@@ -268,7 +242,7 @@ static bool gpt_params_parse_ex(int argc, char ** argv, gpt_params_context & ctx
  throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n");
  }
 
- gpt_params_handle_model_default(params);
+ common_params_handle_model_default(params);
 
  if (params.escape) {
  string_process_escapes(params.prompt);
@@ -277,6 +251,9 @@ static bool gpt_params_parse_ex(int argc, char ** argv, gpt_params_context & ctx
  for (auto & antiprompt : params.antiprompt) {
  string_process_escapes(antiprompt);
  }
+ for (auto & seq_breaker : params.sparams.dry_sequence_breakers) {
+ string_process_escapes(seq_breaker);
+ }
  }
 
  if (!params.kv_overrides.empty()) {
@@ -291,16 +268,16 @@ static bool gpt_params_parse_ex(int argc, char ** argv, gpt_params_context & ctx
  return true;
  }
 
- static void gpt_params_print_usage(gpt_params_context & ctx_arg) {
- auto print_options = [](std::vector<llama_arg *> & options) {
- for (llama_arg * opt : options) {
+ static void common_params_print_usage(common_params_context & ctx_arg) {
+ auto print_options = [](std::vector<common_arg *> & options) {
+ for (common_arg * opt : options) {
  printf("%s", opt->to_string().c_str());
  }
  };
 
- std::vector<llama_arg *> common_options;
- std::vector<llama_arg *> sparam_options;
- std::vector<llama_arg *> specific_options;
+ std::vector<common_arg *> common_options;
+ std::vector<common_arg *> sparam_options;
+ std::vector<common_arg *> specific_options;
  for (auto & opt : ctx_arg.options) {
  // in case multiple LLAMA_EXAMPLE_* are set, we prioritize the LLAMA_EXAMPLE_* matching current example
  if (opt.is_sparam) {
@@ -320,17 +297,17 @@ static void gpt_params_print_usage(gpt_params_context & ctx_arg) {
  print_options(specific_options);
  }
 
- bool gpt_params_parse(int argc, char ** argv, gpt_params & params, llama_example ex, void(*print_usage)(int, char **)) {
- auto ctx_arg = gpt_params_parser_init(params, ex, print_usage);
- const gpt_params params_org = ctx_arg.params; // the example can modify the default params
+ bool common_params_parse(int argc, char ** argv, common_params & params, llama_example ex, void(*print_usage)(int, char **)) {
+ auto ctx_arg = common_params_parser_init(params, ex, print_usage);
+ const common_params params_org = ctx_arg.params; // the example can modify the default params
 
  try {
- if (!gpt_params_parse_ex(argc, argv, ctx_arg)) {
+ if (!common_params_parse_ex(argc, argv, ctx_arg)) {
  ctx_arg.params = params_org;
  return false;
  }
  if (ctx_arg.params.usage) {
- gpt_params_print_usage(ctx_arg);
+ common_params_print_usage(ctx_arg);
  if (ctx_arg.print_usage) {
  ctx_arg.print_usage(argc, argv);
  }
@@ -345,16 +322,16 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params, llama_example
  return true;
  }
 
- gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, void(*print_usage)(int, char **)) {
- gpt_params_context ctx_arg(params);
+ common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **)) {
+ common_params_context ctx_arg(params);
  ctx_arg.print_usage = print_usage;
  ctx_arg.ex = ex;
 
  std::string sampler_type_chars;
  std::string sampler_type_names;
  for (const auto & sampler : params.sparams.samplers) {
- sampler_type_chars += gpt_sampler_type_to_chr(sampler);
- sampler_type_names += gpt_sampler_type_to_str(sampler) + ";";
+ sampler_type_chars += common_sampler_type_to_chr(sampler);
+ sampler_type_names += common_sampler_type_to_str(sampler) + ";";
  }
  sampler_type_names.pop_back();
 
@@ -366,374 +343,374 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
  * - if LLAMA_EXAMPLE_* is set (other than COMMON), we only show the option in the corresponding example
  * - if both {LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_*,} are set, we will prioritize the LLAMA_EXAMPLE_* matching current example
  */
- auto add_opt = [&](llama_arg arg) {
+ auto add_opt = [&](common_arg arg) {
  if (arg.in_example(ex) || arg.in_example(LLAMA_EXAMPLE_COMMON)) {
  ctx_arg.options.push_back(std::move(arg));
  }
  };
 
 
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"-h", "--help", "--usage"},
  "print usage and exit",
- [](gpt_params & params) {
+ [](common_params & params) {
  params.usage = true;
  }
  ));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"--version"},
  "show version and build info",
- [](gpt_params &) {
+ [](common_params &) {
  fprintf(stderr, "version: %d (%s)\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT);
  fprintf(stderr, "built with %s for %s\n", LLAMA_COMPILER, LLAMA_BUILD_TARGET);
  exit(0);
  }
  ));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"--verbose-prompt"},
- format("print a verbose prompt before generation (default: %s)", params.verbose_prompt ? "true" : "false"),
- [](gpt_params & params) {
+ string_format("print a verbose prompt before generation (default: %s)", params.verbose_prompt ? "true" : "false"),
+ [](common_params & params) {
  params.verbose_prompt = true;
  }
  ));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"--no-display-prompt"},
- format("don't print prompt at generation (default: %s)", !params.display_prompt ? "true" : "false"),
- [](gpt_params & params) {
+ string_format("don't print prompt at generation (default: %s)", !params.display_prompt ? "true" : "false"),
+ [](common_params & params) {
  params.display_prompt = false;
  }
  ).set_examples({LLAMA_EXAMPLE_MAIN}));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"-co", "--color"},
- format("colorise output to distinguish prompt and user input from generations (default: %s)", params.use_color ? "true" : "false"),
- [](gpt_params & params) {
+ string_format("colorise output to distinguish prompt and user input from generations (default: %s)", params.use_color ? "true" : "false"),
+ [](common_params & params) {
  params.use_color = true;
  }
  ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL, LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP}));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"-t", "--threads"}, "N",
- format("number of threads to use during generation (default: %d)", params.cpuparams.n_threads),
- [](gpt_params & params, int value) {
+ string_format("number of threads to use during generation (default: %d)", params.cpuparams.n_threads),
+ [](common_params & params, int value) {
  params.cpuparams.n_threads = value;
  if (params.cpuparams.n_threads <= 0) {
  params.cpuparams.n_threads = std::thread::hardware_concurrency();
  }
  }
  ).set_env("LLAMA_ARG_THREADS"));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"-tb", "--threads-batch"}, "N",
  "number of threads to use during batch and prompt processing (default: same as --threads)",
- [](gpt_params & params, int value) {
+ [](common_params & params, int value) {
  params.cpuparams_batch.n_threads = value;
  if (params.cpuparams_batch.n_threads <= 0) {
  params.cpuparams_batch.n_threads = std::thread::hardware_concurrency();
  }
  }
  ));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"-td", "--threads-draft"}, "N",
  "number of threads to use during generation (default: same as --threads)",
- [](gpt_params & params, int value) {
+ [](common_params & params, int value) {
  params.draft_cpuparams.n_threads = value;
  if (params.draft_cpuparams.n_threads <= 0) {
  params.draft_cpuparams.n_threads = std::thread::hardware_concurrency();
  }
  }
  ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"-tbd", "--threads-batch-draft"}, "N",
  "number of threads to use during batch and prompt processing (default: same as --threads-draft)",
- [](gpt_params & params, int value) {
+ [](common_params & params, int value) {
  params.draft_cpuparams_batch.n_threads = value;
  if (params.draft_cpuparams_batch.n_threads <= 0) {
  params.draft_cpuparams_batch.n_threads = std::thread::hardware_concurrency();
  }
  }
  ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"-C", "--cpu-mask"}, "M",
  "CPU affinity mask: arbitrarily long hex. Complements cpu-range (default: \"\")",
- [](gpt_params & params, const std::string & mask) {
+ [](common_params & params, const std::string & mask) {
  params.cpuparams.mask_valid = true;
  if (!parse_cpu_mask(mask, params.cpuparams.cpumask)) {
  throw std::invalid_argument("invalid cpumask");
  }
  }
  ));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"-Cr", "--cpu-range"}, "lo-hi",
  "range of CPUs for affinity. Complements --cpu-mask",
- [](gpt_params & params, const std::string & range) {
+ [](common_params & params, const std::string & range) {
  params.cpuparams.mask_valid = true;
  if (!parse_cpu_range(range, params.cpuparams.cpumask)) {
  throw std::invalid_argument("invalid range");
  }
  }
  ));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"--cpu-strict"}, "<0|1>",
- format("use strict CPU placement (default: %u)\n", (unsigned) params.cpuparams.strict_cpu),
- [](gpt_params & params, const std::string & value) {
+ string_format("use strict CPU placement (default: %u)\n", (unsigned) params.cpuparams.strict_cpu),
+ [](common_params & params, const std::string & value) {
  params.cpuparams.strict_cpu = std::stoul(value);
  }
  ));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"--prio"}, "N",
- format("set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.cpuparams.priority),
- [](gpt_params & params, int prio) {
+ string_format("set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.cpuparams.priority),
+ [](common_params & params, int prio) {
  if (prio < 0 || prio > 3) {
  throw std::invalid_argument("invalid value");
  }
  params.cpuparams.priority = (enum ggml_sched_priority) prio;
  }
  ));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"--poll"}, "<0...100>",
- format("use polling level to wait for work (0 - no polling, default: %u)\n", (unsigned) params.cpuparams.poll),
- [](gpt_params & params, const std::string & value) {
+ string_format("use polling level to wait for work (0 - no polling, default: %u)\n", (unsigned) params.cpuparams.poll),
+ [](common_params & params, const std::string & value) {
  params.cpuparams.poll = std::stoul(value);
  }
  ));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"-Cb", "--cpu-mask-batch"}, "M",
  "CPU affinity mask: arbitrarily long hex. Complements cpu-range-batch (default: same as --cpu-mask)",
- [](gpt_params & params, const std::string & mask) {
+ [](common_params & params, const std::string & mask) {
  params.cpuparams_batch.mask_valid = true;
  if (!parse_cpu_mask(mask, params.cpuparams_batch.cpumask)) {
  throw std::invalid_argument("invalid cpumask");
  }
  }
  ));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"-Crb", "--cpu-range-batch"}, "lo-hi",
  "ranges of CPUs for affinity. Complements --cpu-mask-batch",
- [](gpt_params & params, const std::string & range) {
+ [](common_params & params, const std::string & range) {
  params.cpuparams_batch.mask_valid = true;
  if (!parse_cpu_range(range, params.cpuparams_batch.cpumask)) {
  throw std::invalid_argument("invalid range");
  }
  }
  ));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"--cpu-strict-batch"}, "<0|1>",
  "use strict CPU placement (default: same as --cpu-strict)",
- [](gpt_params & params, int value) {
+ [](common_params & params, int value) {
  params.cpuparams_batch.strict_cpu = value;
  }
  ));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"--prio-batch"}, "N",
- format("set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.cpuparams_batch.priority),
- [](gpt_params & params, int prio) {
+ string_format("set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.cpuparams_batch.priority),
+ [](common_params & params, int prio) {
  if (prio < 0 || prio > 3) {
  throw std::invalid_argument("invalid value");
  }
  params.cpuparams_batch.priority = (enum ggml_sched_priority) prio;
  }
  ));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"--poll-batch"}, "<0|1>",
  "use polling to wait for work (default: same as --poll)",
- [](gpt_params & params, int value) {
+ [](common_params & params, int value) {
  params.cpuparams_batch.poll = value;
  }
  ));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"-Cd", "--cpu-mask-draft"}, "M",
  "Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask)",
- [](gpt_params & params, const std::string & mask) {
+ [](common_params & params, const std::string & mask) {
  params.draft_cpuparams.mask_valid = true;
  if (!parse_cpu_mask(mask, params.draft_cpuparams.cpumask)) {
  throw std::invalid_argument("invalid cpumask");
  }
  }
  ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"-Crd", "--cpu-range-draft"}, "lo-hi",
  "Ranges of CPUs for affinity. Complements --cpu-mask-draft",
- [](gpt_params & params, const std::string & range) {
+ [](common_params & params, const std::string & range) {
  params.draft_cpuparams.mask_valid = true;
  if (!parse_cpu_range(range, params.draft_cpuparams.cpumask)) {
  throw std::invalid_argument("invalid range");
  }
  }
  ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"--cpu-strict-draft"}, "<0|1>",
  "Use strict CPU placement for draft model (default: same as --cpu-strict)",
- [](gpt_params & params, int value) {
+ [](common_params & params, int value) {
  params.draft_cpuparams.strict_cpu = value;
  }
  ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"--prio-draft"}, "N",
- format("set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.draft_cpuparams.priority),
- [](gpt_params & params, int prio) {
+ string_format("set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.draft_cpuparams.priority),
+ [](common_params & params, int prio) {
  if (prio < 0 || prio > 3) {
  throw std::invalid_argument("invalid value");
  }
  params.draft_cpuparams.priority = (enum ggml_sched_priority) prio;
  }
  ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"--poll-draft"}, "<0|1>",
  "Use polling to wait for draft model work (default: same as --poll])",
- [](gpt_params & params, int value) {
+ [](common_params & params, int value) {
  params.draft_cpuparams.poll = value;
  }
  ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"-Cbd", "--cpu-mask-batch-draft"}, "M",
  "Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask)",
- [](gpt_params & params, const std::string & mask) {
+ [](common_params & params, const std::string & mask) {
  params.draft_cpuparams_batch.mask_valid = true;
  if (!parse_cpu_mask(mask, params.draft_cpuparams_batch.cpumask)) {
  throw std::invalid_argument("invalid cpumask");
  }
  }
  ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"-Crbd", "--cpu-range-batch-draft"}, "lo-hi",
  "Ranges of CPUs for affinity. Complements --cpu-mask-draft-batch)",
- [](gpt_params & params, const std::string & range) {
+ [](common_params & params, const std::string & range) {
  params.draft_cpuparams_batch.mask_valid = true;
  if (!parse_cpu_range(range, params.draft_cpuparams_batch.cpumask)) {
  throw std::invalid_argument("invalid cpumask");
  }
  }
  ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"--cpu-strict-batch-draft"}, "<0|1>",
  "Use strict CPU placement for draft model (default: --cpu-strict-draft)",
- [](gpt_params & params, int value) {
+ [](common_params & params, int value) {
  params.draft_cpuparams_batch.strict_cpu = value;
  }
  ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"--prio-batch-draft"}, "N",
- format("set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.draft_cpuparams_batch.priority),
- [](gpt_params & params, int prio) {
+ string_format("set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.draft_cpuparams_batch.priority),
+ [](common_params & params, int prio) {
  if (prio < 0 || prio > 3) {
  throw std::invalid_argument("invalid value");
  }
  params.draft_cpuparams_batch.priority = (enum ggml_sched_priority) prio;
  }
  ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"--poll-batch-draft"}, "<0|1>",
  "Use polling to wait for draft model work (default: --poll-draft)",
- [](gpt_params & params, int value) {
+ [](common_params & params, int value) {
  params.draft_cpuparams_batch.poll = value;
  }
  ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"--draft"}, "N",
- format("number of tokens to draft for speculative decoding (default: %d)", params.n_draft),
- [](gpt_params & params, int value) {
+ string_format("number of tokens to draft for speculative decoding (default: %d)", params.n_draft),
+ [](common_params & params, int value) {
  params.n_draft = value;
  }
  ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP}));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"-ps", "--p-split"}, "N",
- format("speculative decoding split probability (default: %.1f)", (double)params.p_split),
- [](gpt_params & params, const std::string & value) {
+ string_format("speculative decoding split probability (default: %.1f)", (double)params.p_split),
+ [](common_params & params, const std::string & value) {
  params.p_split = std::stof(value);
  }
  ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"-lcs", "--lookup-cache-static"}, "FNAME",
  "path to static lookup cache to use for lookup decoding (not updated by generation)",
- [](gpt_params & params, const std::string & value) {
+ [](common_params & params, const std::string & value) {
  params.lookup_cache_static = value;
  }
  ).set_examples({LLAMA_EXAMPLE_LOOKUP}));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"-lcd", "--lookup-cache-dynamic"}, "FNAME",
  "path to dynamic lookup cache to use for lookup decoding (updated by generation)",
- [](gpt_params & params, const std::string & value) {
+ [](common_params & params, const std::string & value) {
  params.lookup_cache_dynamic = value;
  }
  ).set_examples({LLAMA_EXAMPLE_LOOKUP}));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"-c", "--ctx-size"}, "N",
- format("size of the prompt context (default: %d, 0 = loaded from model)", params.n_ctx),
- [](gpt_params & params, int value) {
+ string_format("size of the prompt context (default: %d, 0 = loaded from model)", params.n_ctx),
+ [](common_params & params, int value) {
  params.n_ctx = value;
  }
  ).set_env("LLAMA_ARG_CTX_SIZE"));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"-n", "--predict", "--n-predict"}, "N",
- format("number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)", params.n_predict),
- [](gpt_params & params, int value) {
+ string_format("number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)", params.n_predict),
+ [](common_params & params, int value) {
  params.n_predict = value;
  }
  ).set_env("LLAMA_ARG_N_PREDICT"));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"-b", "--batch-size"}, "N",
- format("logical maximum batch size (default: %d)", params.n_batch),
- [](gpt_params & params, int value) {
+ string_format("logical maximum batch size (default: %d)", params.n_batch),
+ [](common_params & params, int value) {
  params.n_batch = value;
  }
  ).set_env("LLAMA_ARG_BATCH"));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"-ub", "--ubatch-size"}, "N",
- format("physical maximum batch size (default: %d)", params.n_ubatch),
- [](gpt_params & params, int value) {
+ string_format("physical maximum batch size (default: %d)", params.n_ubatch),
+ [](common_params & params, int value) {
  params.n_ubatch = value;
  }
  ).set_env("LLAMA_ARG_UBATCH"));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"--keep"}, "N",
- format("number of tokens to keep from the initial prompt (default: %d, -1 = all)", params.n_keep),
- [](gpt_params & params, int value) {
+ string_format("number of tokens to keep from the initial prompt (default: %d, -1 = all)", params.n_keep),
+ [](common_params & params, int value) {
  params.n_keep = value;
  }
  ));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"--no-context-shift"},
- format("disables context shift on inifinite text generation (default: %s)", params.ctx_shift ? "disabled" : "enabled"),
- [](gpt_params & params) {
+ string_format("disables context shift on inifinite text generation (default: %s)", params.ctx_shift ? "disabled" : "enabled"),
+ [](common_params & params) {
  params.ctx_shift = false;
  }
  ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_CONTEXT_SHIFT"));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"--chunks"}, "N",
- format("max number of chunks to process (default: %d, -1 = all)", params.n_chunks),
- [](gpt_params & params, int value) {
+ string_format("max number of chunks to process (default: %d, -1 = all)", params.n_chunks),
+ [](common_params & params, int value) {
  params.n_chunks = value;
  }
  ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_RETRIEVAL}));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"-fa", "--flash-attn"},
- format("enable Flash Attention (default: %s)", params.flash_attn ? "enabled" : "disabled"),
- [](gpt_params & params) {
+ string_format("enable Flash Attention (default: %s)", params.flash_attn ? "enabled" : "disabled"),
+ [](common_params & params) {
  params.flash_attn = true;
  }
  ).set_env("LLAMA_ARG_FLASH_ATTN"));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"-p", "--prompt"}, "PROMPT",
  ex == LLAMA_EXAMPLE_MAIN
  ? "prompt to start generation with\nif -cnv is set, this will be used as system prompt"
  : "prompt to start generation with",
- [](gpt_params & params, const std::string & value) {
+ [](common_params & params, const std::string & value) {
  params.prompt = value;
  }
  ));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"--no-perf"},
- format("disable internal libllama performance timings (default: %s)", params.no_perf ? "true" : "false"),
- [](gpt_params & params) {
+ string_format("disable internal libllama performance timings (default: %s)", params.no_perf ? "true" : "false"),
+ [](common_params & params) {
  params.no_perf = true;
  params.sparams.no_perf = true;
  }
  ).set_env("LLAMA_ARG_NO_PERF"));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"-f", "--file"}, "FNAME",
  "a file containing the prompt (default: none)",
- [](gpt_params & params, const std::string & value) {
+ [](common_params & params, const std::string & value) {
  std::ifstream file(value);
  if (!file) {
- throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str()));
+ throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str()));
  }
  // store the external file name in params
  params.prompt_file = value;
@@ -743,24 +720,24 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
  }
  }
  ));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"--in-file"}, "FNAME",
  "an input file (repeat to specify multiple files)",
- [](gpt_params & params, const std::string & value) {
+ [](common_params & params, const std::string & value) {
  std::ifstream file(value);
  if (!file) {
- throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str()));
+ throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str()));
  }
  params.in_files.push_back(value);
  }
  ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"-bf", "--binary-file"}, "FNAME",
  "binary file containing the prompt (default: none)",
- [](gpt_params & params, const std::string & value) {
+ [](common_params & params, const std::string & value) {
  std::ifstream file(value, std::ios::binary);
  if (!file) {
- throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str()));
+ throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str()));
  }
  // store the external file name in params
  params.prompt_file = value;
@@ -770,287 +747,352 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
  fprintf(stderr, "Read %zu bytes from binary file %s\n", params.prompt.size(), value.c_str());
  }
  ));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"-e", "--escape"},
- format("process escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\) (default: %s)", params.escape ? "true" : "false"),
- [](gpt_params & params) {
+ string_format("process escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\) (default: %s)", params.escape ? "true" : "false"),
+ [](common_params & params) {
  params.escape = true;
  }
  ));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"--no-escape"},
  "do not process escape sequences",
- [](gpt_params & params) {
+ [](common_params & params) {
  params.escape = false;
  }
  ));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"-ptc", "--print-token-count"}, "N",
- format("print token count every N tokens (default: %d)", params.n_print),
- [](gpt_params & params, int value) {
+ string_format("print token count every N tokens (default: %d)", params.n_print),
+ [](common_params & params, int value) {
  params.n_print = value;
  }
  ).set_examples({LLAMA_EXAMPLE_MAIN}));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"--prompt-cache"}, "FNAME",
  "file to cache prompt state for faster startup (default: none)",
- [](gpt_params & params, const std::string & value) {
+ [](common_params & params, const std::string & value) {
  params.path_prompt_cache = value;
  }
  ).set_examples({LLAMA_EXAMPLE_MAIN}));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"--prompt-cache-all"},
  "if specified, saves user input and generations to cache as well\n",
- [](gpt_params & params) {
+ [](common_params & params) {
  params.prompt_cache_all = true;
  }
  ).set_examples({LLAMA_EXAMPLE_MAIN}));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"--prompt-cache-ro"},
  "if specified, uses the prompt cache but does not update it",
- [](gpt_params & params) {
+ [](common_params & params) {
  params.prompt_cache_ro = true;
  }
  ).set_examples({LLAMA_EXAMPLE_MAIN}));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"-r", "--reverse-prompt"}, "PROMPT",
  "halt generation at PROMPT, return control in interactive mode\n",
- [](gpt_params & params, const std::string & value) {
+ [](common_params & params, const std::string & value) {
  params.antiprompt.emplace_back(value);
  }
  ).set_examples({LLAMA_EXAMPLE_MAIN}));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"-sp", "--special"},
- format("special tokens output enabled (default: %s)", params.special ? "true" : "false"),
- [](gpt_params & params) {
+ string_format("special tokens output enabled (default: %s)", params.special ? "true" : "false"),
+ [](common_params & params) {
  params.special = true;
  }
  ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"-cnv", "--conversation"},
- format(
+ string_format(
  "run in conversation mode:\n"
  "- does not print special tokens and suffix/prefix\n"
  "- interactive mode is also enabled\n"
  "(default: %s)",
  params.conversation ? "true" : "false"
  ),
- [](gpt_params & params) {
+ [](common_params & params) {
  params.conversation = true;
  }
  ).set_examples({LLAMA_EXAMPLE_MAIN}));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"-i", "--interactive"},
- format("run in interactive mode (default: %s)", params.interactive ? "true" : "false"),
- [](gpt_params & params) {
+ string_format("run in interactive mode (default: %s)", params.interactive ? "true" : "false"),
+ [](common_params & params) {
  params.interactive = true;
  }
  ).set_examples({LLAMA_EXAMPLE_MAIN}));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"-if", "--interactive-first"},
- format("run in interactive mode and wait for input right away (default: %s)", params.interactive_first ? "true" : "false"),
- [](gpt_params & params) {
+ string_format("run in interactive mode and wait for input right away (default: %s)", params.interactive_first ? "true" : "false"),
+ [](common_params & params) {
  params.interactive_first = true;
  }
  ).set_examples({LLAMA_EXAMPLE_MAIN}));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"-mli", "--multiline-input"},
  "allows you to write or paste multiple lines without ending each in '\\'",
- [](gpt_params & params) {
+ [](common_params & params) {
  params.multiline_input = true;
  }
  ).set_examples({LLAMA_EXAMPLE_MAIN}));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"--in-prefix-bos"},
  "prefix BOS to user inputs, preceding the `--in-prefix` string",
- [](gpt_params & params) {
+ [](common_params & params) {
  params.input_prefix_bos = true;
  params.enable_chat_template = false;
  }
  ).set_examples({LLAMA_EXAMPLE_MAIN}));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"--in-prefix"}, "STRING",
  "string to prefix user inputs with (default: empty)",
- [](gpt_params & params, const std::string & value) {
+ [](common_params & params, const std::string & value) {
  params.input_prefix = value;
  params.enable_chat_template = false;
  }
  ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL}));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"--in-suffix"}, "STRING",
  "string to suffix after user inputs with (default: empty)",
- [](gpt_params & params, const std::string & value) {
+ [](common_params & params, const std::string & value) {
  params.input_suffix = value;
  params.enable_chat_template = false;
  }
  ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL}));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"--no-warmup"},
  "skip warming up the model with an empty run",
- [](gpt_params & params) {
+ [](common_params & params) {
  params.warmup = false;
  }
  ).set_examples({LLAMA_EXAMPLE_MAIN}));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"--spm-infill"},
- format(
+ string_format(
  "use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: %s)",
  params.spm_infill ? "enabled" : "disabled"
  ),
- [](gpt_params & params) {
+ [](common_params & params) {
  params.spm_infill = true;
  }
  ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_INFILL}));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"--samplers"}, "SAMPLERS",
- format("samplers that will be used for generation in the order, separated by \';\'\n(default: %s)", sampler_type_names.c_str()),
- [](gpt_params & params, const std::string & value) {
- const auto sampler_names = string_split(value, ';');
- params.sparams.samplers = gpt_sampler_types_from_names(sampler_names, true);
+ string_format("samplers that will be used for generation in the order, separated by \';\'\n(default: %s)", sampler_type_names.c_str()),
+ [](common_params & params, const std::string & value) {
+ const auto sampler_names = string_split<std::string>(value, ';');
+ params.sparams.samplers = common_sampler_types_from_names(sampler_names, true);
  }
  ).set_sparam());
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"-s", "--seed"}, "SEED",
- format("RNG seed (default: %d, use random seed for %d)", params.sparams.seed, LLAMA_DEFAULT_SEED),
- [](gpt_params & params, const std::string & value) {
+ string_format("RNG seed (default: %d, use random seed for %d)", params.sparams.seed, LLAMA_DEFAULT_SEED),
+ [](common_params & params, const std::string & value) {
  params.sparams.seed = std::stoul(value);
  }
  ).set_sparam());
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"--sampling-seq"}, "SEQUENCE",
- format("simplified sequence for samplers that will be used (default: %s)", sampler_type_chars.c_str()),
- [](gpt_params & params, const std::string & value) {
- params.sparams.samplers = gpt_sampler_types_from_chars(value);
+ string_format("simplified sequence for samplers that will be used (default: %s)", sampler_type_chars.c_str()),
+ [](common_params & params, const std::string & value) {
+ params.sparams.samplers = common_sampler_types_from_chars(value);
  }
  ).set_sparam());
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"--ignore-eos"},
  "ignore end of stream token and continue generating (implies --logit-bias EOS-inf)",
- [](gpt_params & params) {
+ [](common_params & params) {
  params.sparams.ignore_eos = true;
  }
  ).set_sparam());
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"--penalize-nl"},
- format("penalize newline tokens (default: %s)", params.sparams.penalize_nl ? "true" : "false"),
- [](gpt_params & params) {
+ string_format("penalize newline tokens (default: %s)", params.sparams.penalize_nl ? "true" : "false"),
+ [](common_params & params) {
  params.sparams.penalize_nl = true;
  }
  ).set_sparam());
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"--temp"}, "N",
- format("temperature (default: %.1f)", (double)params.sparams.temp),
- [](gpt_params & params, const std::string & value) {
+ string_format("temperature (default: %.1f)", (double)params.sparams.temp),
+ [](common_params & params, const std::string & value) {
  params.sparams.temp = std::stof(value);
  params.sparams.temp = std::max(params.sparams.temp, 0.0f);
  }
  ).set_sparam());
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"--top-k"}, "N",
- format("top-k sampling (default: %d, 0 = disabled)", params.sparams.top_k),
- [](gpt_params & params, int value) {
+ string_format("top-k sampling (default: %d, 0 = disabled)", params.sparams.top_k),
+ [](common_params & params, int value) {
  params.sparams.top_k = value;
  }
  ).set_sparam());
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"--top-p"}, "N",
- format("top-p sampling (default: %.1f, 1.0 = disabled)", (double)params.sparams.top_p),
- [](gpt_params & params, const std::string & value) {
+ string_format("top-p sampling (default: %.1f, 1.0 = disabled)", (double)params.sparams.top_p),
+ [](common_params & params, const std::string & value) {
  params.sparams.top_p = std::stof(value);
  }
  ).set_sparam());
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"--min-p"}, "N",
- format("min-p sampling (default: %.1f, 0.0 = disabled)", (double)params.sparams.min_p),
- [](gpt_params & params, const std::string & value) {
+ string_format("min-p sampling (default: %.1f, 0.0 = disabled)", (double)params.sparams.min_p),
+ [](common_params & params, const std::string & value) {
  params.sparams.min_p = std::stof(value);
  }
  ).set_sparam());
- add_opt(llama_arg(
- {"--tfs"}, "N",
- format("tail free sampling, parameter z (default: %.1f, 1.0 = disabled)", (double)params.sparams.tfs_z),
- [](gpt_params & params, const std::string & value) {
- params.sparams.tfs_z = std::stof(value);
+ add_opt(common_arg(
+ {"--xtc-probability"}, "N",
+ string_format("xtc probability (default: %.1f, 0.0 = disabled)", (double)params.sparams.xtc_probability),
+ [](common_params & params, const std::string & value) {
+ params.sparams.xtc_probability = std::stof(value);
  }
  ).set_sparam());
- add_opt(llama_arg(
+ add_opt(common_arg(
+ {"--xtc-threshold"}, "N",
+ string_format("xtc threshold (default: %.1f, 1.0 = disabled)", (double)params.sparams.xtc_threshold),
+ [](common_params & params, const std::string & value) {
+ params.sparams.xtc_threshold = std::stof(value);
+ }
+ ).set_sparam());
+ add_opt(common_arg(
  {"--typical"}, "N",
- format("locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)", (double)params.sparams.typ_p),
- [](gpt_params & params, const std::string & value) {
+ string_format("locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)", (double)params.sparams.typ_p),
+ [](common_params & params, const std::string & value) {
  params.sparams.typ_p = std::stof(value);
  }
  ).set_sparam());
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"--repeat-last-n"}, "N",
- format("last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)", params.sparams.penalty_last_n),
986
- [](gpt_params & params, int value) {
969
+ string_format("last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)", params.sparams.penalty_last_n),
970
+ [](common_params & params, int value) {
987
971
  params.sparams.penalty_last_n = value;
988
972
  params.sparams.n_prev = std::max(params.sparams.n_prev, params.sparams.penalty_last_n);
989
973
  }
990
974
  ).set_sparam());
991
- add_opt(llama_arg(
975
+ add_opt(common_arg(
992
976
  {"--repeat-penalty"}, "N",
993
- format("penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)", (double)params.sparams.penalty_repeat),
994
- [](gpt_params & params, const std::string & value) {
977
+ string_format("penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)", (double)params.sparams.penalty_repeat),
978
+ [](common_params & params, const std::string & value) {
995
979
  params.sparams.penalty_repeat = std::stof(value);
996
980
  }
997
981
  ).set_sparam());
998
- add_opt(llama_arg(
982
+ add_opt(common_arg(
999
983
  {"--presence-penalty"}, "N",
1000
- format("repeat alpha presence penalty (default: %.1f, 0.0 = disabled)", (double)params.sparams.penalty_present),
1001
- [](gpt_params & params, const std::string & value) {
984
+ string_format("repeat alpha presence penalty (default: %.1f, 0.0 = disabled)", (double)params.sparams.penalty_present),
985
+ [](common_params & params, const std::string & value) {
1002
986
  params.sparams.penalty_present = std::stof(value);
1003
987
  }
1004
988
  ).set_sparam());
1005
- add_opt(llama_arg(
989
+ add_opt(common_arg(
1006
990
  {"--frequency-penalty"}, "N",
1007
- format("repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)", (double)params.sparams.penalty_freq),
1008
- [](gpt_params & params, const std::string & value) {
991
+ string_format("repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)", (double)params.sparams.penalty_freq),
992
+ [](common_params & params, const std::string & value) {
1009
993
  params.sparams.penalty_freq = std::stof(value);
1010
994
  }
1011
995
  ).set_sparam());
1012
- add_opt(llama_arg(
996
+ add_opt(common_arg(
997
+ {"--dry-multiplier"}, "N",
998
+ string_format("set DRY sampling multiplier (default: %.1f, 0.0 = disabled)", (double)params.sparams.dry_multiplier),
999
+ [](common_params & params, const std::string & value) {
1000
+ params.sparams.dry_multiplier = std::stof(value);
1001
+ }
1002
+ ).set_sparam());
1003
+ add_opt(common_arg(
1004
+ {"--dry-base"}, "N",
1005
+ string_format("set DRY sampling base value (default: %.2f)", (double)params.sparams.dry_base),
1006
+ [](common_params & params, const std::string & value) {
1007
+ float potential_base = std::stof(value);
1008
+ if (potential_base >= 1.0f)
1009
+ {
1010
+ params.sparams.dry_base = potential_base;
1011
+ }
1012
+ }
1013
+ ).set_sparam());
1014
+ add_opt(common_arg(
1015
+ {"--dry-allowed-length"}, "N",
1016
+ string_format("set allowed length for DRY sampling (default: %d)", params.sparams.dry_allowed_length),
1017
+ [](common_params & params, int value) {
1018
+ params.sparams.dry_allowed_length = value;
1019
+ }
1020
+ ).set_sparam());
1021
+ add_opt(common_arg(
1022
+ {"--dry-penalty-last-n"}, "N",
1023
+ string_format("set DRY penalty for the last n tokens (default: %d, 0 = disable, -1 = context size)", params.sparams.dry_penalty_last_n),
1024
+ [](common_params & params, int value) {
1025
+ params.sparams.dry_penalty_last_n = value;
1026
+ }
1027
+ ).set_sparam());
1028
+ add_opt(common_arg(
1029
+ {"--dry-sequence-breaker"}, "STRING",
1030
+ string_format("add sequence breaker for DRY sampling, clearing out default breakers (%s) in the process; use \"none\" to not use any sequence breakers\n",
1031
+ params.sparams.dry_sequence_breakers.empty() ? "none" :
1032
+ std::accumulate(std::next(params.sparams.dry_sequence_breakers.begin()),
1033
+ params.sparams.dry_sequence_breakers.end(),
1034
+ std::string("'") + (params.sparams.dry_sequence_breakers[0] == "\n" ? "\\n" : params.sparams.dry_sequence_breakers[0]) + "'",
1035
+ [](const std::string& a, const std::string& b) {
1036
+ std::string formatted_b = (b == "\n") ? "\\n" : b;
1037
+ return a + ", '" + formatted_b + "'";
1038
+ }).c_str()),
1039
+ [](common_params & params, const std::string & value) {
1040
+ static bool defaults_cleared = false;
1041
+
1042
+ if (!defaults_cleared) {
1043
+ params.sparams.dry_sequence_breakers.clear();
1044
+ defaults_cleared = true;
1045
+ }
1046
+
1047
+ if (value == "none") {
1048
+ params.sparams.dry_sequence_breakers.clear();
1049
+ } else {
1050
+ params.sparams.dry_sequence_breakers.emplace_back(value);
1051
+ }
1052
+ }
1053
+ ).set_sparam());
1054
+ add_opt(common_arg(
1013
1055
  {"--dynatemp-range"}, "N",
1014
- format("dynamic temperature range (default: %.1f, 0.0 = disabled)", (double)params.sparams.dynatemp_range),
1015
- [](gpt_params & params, const std::string & value) {
1056
+ string_format("dynamic temperature range (default: %.1f, 0.0 = disabled)", (double)params.sparams.dynatemp_range),
1057
+ [](common_params & params, const std::string & value) {
1016
1058
  params.sparams.dynatemp_range = std::stof(value);
1017
1059
  }
1018
1060
  ).set_sparam());
1019
- add_opt(llama_arg(
1061
+ add_opt(common_arg(
1020
1062
  {"--dynatemp-exp"}, "N",
1021
- format("dynamic temperature exponent (default: %.1f)", (double)params.sparams.dynatemp_exponent),
1022
- [](gpt_params & params, const std::string & value) {
1063
+ string_format("dynamic temperature exponent (default: %.1f)", (double)params.sparams.dynatemp_exponent),
1064
+ [](common_params & params, const std::string & value) {
1023
1065
  params.sparams.dynatemp_exponent = std::stof(value);
1024
1066
  }
1025
1067
  ).set_sparam());
1026
- add_opt(llama_arg(
1068
+ add_opt(common_arg(
1027
1069
  {"--mirostat"}, "N",
1028
- format("use Mirostat sampling.\nTop K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n"
1070
+ string_format("use Mirostat sampling.\nTop K, Nucleus and Locally Typical samplers are ignored if used.\n"
1029
1071
  "(default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)", params.sparams.mirostat),
1030
- [](gpt_params & params, int value) {
1072
+ [](common_params & params, int value) {
1031
1073
  params.sparams.mirostat = value;
1032
1074
  }
1033
1075
  ).set_sparam());
1034
- add_opt(llama_arg(
1076
+ add_opt(common_arg(
1035
1077
  {"--mirostat-lr"}, "N",
1036
- format("Mirostat learning rate, parameter eta (default: %.1f)", (double)params.sparams.mirostat_eta),
1037
- [](gpt_params & params, const std::string & value) {
1078
+ string_format("Mirostat learning rate, parameter eta (default: %.1f)", (double)params.sparams.mirostat_eta),
1079
+ [](common_params & params, const std::string & value) {
1038
1080
  params.sparams.mirostat_eta = std::stof(value);
1039
1081
  }
1040
1082
  ).set_sparam());
1041
- add_opt(llama_arg(
1083
+ add_opt(common_arg(
1042
1084
  {"--mirostat-ent"}, "N",
1043
- format("Mirostat target entropy, parameter tau (default: %.1f)", (double)params.sparams.mirostat_tau),
1044
- [](gpt_params & params, const std::string & value) {
1085
+ string_format("Mirostat target entropy, parameter tau (default: %.1f)", (double)params.sparams.mirostat_tau),
1086
+ [](common_params & params, const std::string & value) {
1045
1087
  params.sparams.mirostat_tau = std::stof(value);
1046
1088
  }
1047
1089
  ).set_sparam());
1048
- add_opt(llama_arg(
1090
+ add_opt(common_arg(
1049
1091
  {"-l", "--logit-bias"}, "TOKEN_ID(+/-)BIAS",
1050
1092
  "modifies the likelihood of token appearing in the completion,\n"
1051
1093
  "i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',\n"
1052
1094
  "or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'",
1053
- [](gpt_params & params, const std::string & value) {
1095
+ [](common_params & params, const std::string & value) {
1054
1096
  std::stringstream ss(value);
1055
1097
  llama_token key;
1056
1098
  char sign;
@@ -1067,20 +1109,20 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
  }
  }
  ).set_sparam());
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"--grammar"}, "GRAMMAR",
- format("BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '%s')", params.sparams.grammar.c_str()),
- [](gpt_params & params, const std::string & value) {
+ string_format("BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '%s')", params.sparams.grammar.c_str()),
+ [](common_params & params, const std::string & value) {
  params.sparams.grammar = value;
  }
  ).set_sparam());
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"--grammar-file"}, "FNAME",
  "file to read grammar from",
- [](gpt_params & params, const std::string & value) {
+ [](common_params & params, const std::string & value) {
  std::ifstream file(value);
  if (!file) {
- throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str()));
+ throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str()));
  }
  std::copy(
  std::istreambuf_iterator<char>(file),
@@ -1089,17 +1131,17 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
  );
  }
  ).set_sparam());
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"-j", "--json-schema"}, "SCHEMA",
  "JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object\nFor schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead",
- [](gpt_params & params, const std::string & value) {
+ [](common_params & params, const std::string & value) {
  params.sparams.grammar = json_schema_to_grammar(json::parse(value));
  }
  ).set_sparam());
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"--pooling"}, "{none,mean,cls,last,rank}",
  "pooling type for embeddings, use model default if unspecified",
- [](gpt_params & params, const std::string & value) {
+ [](common_params & params, const std::string & value) {
  /**/ if (value == "none") { params.pooling_type = LLAMA_POOLING_TYPE_NONE; }
  else if (value == "mean") { params.pooling_type = LLAMA_POOLING_TYPE_MEAN; }
  else if (value == "cls") { params.pooling_type = LLAMA_POOLING_TYPE_CLS; }
@@ -1108,275 +1150,275 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
1108
1150
  else { throw std::invalid_argument("invalid value"); }
1109
1151
  }
1110
1152
  ).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_POOLING"));
1111
- add_opt(llama_arg(
1112
- {"--attention"}, "{causal,non,causal}",
1153
+ add_opt(common_arg(
1154
+ {"--attention"}, "{causal,non-causal}",
1113
1155
  "attention type for embeddings, use model default if unspecified",
1114
- [](gpt_params & params, const std::string & value) {
1156
+ [](common_params & params, const std::string & value) {
1115
1157
  /**/ if (value == "causal") { params.attention_type = LLAMA_ATTENTION_TYPE_CAUSAL; }
1116
1158
  else if (value == "non-causal") { params.attention_type = LLAMA_ATTENTION_TYPE_NON_CAUSAL; }
1117
1159
  else { throw std::invalid_argument("invalid value"); }
1118
1160
  }
1119
1161
  ).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
1120
- add_opt(llama_arg(
1162
+ add_opt(common_arg(
1121
1163
  {"--rope-scaling"}, "{none,linear,yarn}",
1122
1164
  "RoPE frequency scaling method, defaults to linear unless specified by the model",
1123
- [](gpt_params & params, const std::string & value) {
1165
+ [](common_params & params, const std::string & value) {
1124
1166
  /**/ if (value == "none") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_NONE; }
1125
1167
  else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_LINEAR; }
1126
1168
  else if (value == "yarn") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_YARN; }
1127
1169
  else { throw std::invalid_argument("invalid value"); }
1128
1170
  }
1129
1171
  ).set_env("LLAMA_ARG_ROPE_SCALING_TYPE"));
1130
- add_opt(llama_arg(
1172
+ add_opt(common_arg(
1131
1173
  {"--rope-scale"}, "N",
1132
1174
  "RoPE context scaling factor, expands context by a factor of N",
1133
- [](gpt_params & params, const std::string & value) {
1175
+ [](common_params & params, const std::string & value) {
1134
1176
  params.rope_freq_scale = 1.0f / std::stof(value);
1135
1177
  }
1136
1178
  ).set_env("LLAMA_ARG_ROPE_SCALE"));
1137
- add_opt(llama_arg(
1179
+ add_opt(common_arg(
1138
1180
  {"--rope-freq-base"}, "N",
1139
1181
  "RoPE base frequency, used by NTK-aware scaling (default: loaded from model)",
1140
- [](gpt_params & params, const std::string & value) {
1182
+ [](common_params & params, const std::string & value) {
1141
1183
  params.rope_freq_base = std::stof(value);
1142
1184
  }
1143
1185
  ).set_env("LLAMA_ARG_ROPE_FREQ_BASE"));
1144
- add_opt(llama_arg(
1186
+ add_opt(common_arg(
1145
1187
  {"--rope-freq-scale"}, "N",
1146
1188
  "RoPE frequency scaling factor, expands context by a factor of 1/N",
1147
- [](gpt_params & params, const std::string & value) {
1189
+ [](common_params & params, const std::string & value) {
1148
1190
  params.rope_freq_scale = std::stof(value);
1149
1191
  }
1150
1192
  ).set_env("LLAMA_ARG_ROPE_FREQ_SCALE"));
1151
- add_opt(llama_arg(
1193
+ add_opt(common_arg(
1152
1194
  {"--yarn-orig-ctx"}, "N",
1153
- format("YaRN: original context size of model (default: %d = model training context size)", params.yarn_orig_ctx),
1154
- [](gpt_params & params, int value) {
1195
+ string_format("YaRN: original context size of model (default: %d = model training context size)", params.yarn_orig_ctx),
1196
+ [](common_params & params, int value) {
1155
1197
  params.yarn_orig_ctx = value;
1156
1198
  }
1157
1199
  ).set_env("LLAMA_ARG_YARN_ORIG_CTX"));
1158
- add_opt(llama_arg(
1200
+ add_opt(common_arg(
1159
1201
  {"--yarn-ext-factor"}, "N",
1160
- format("YaRN: extrapolation mix factor (default: %.1f, 0.0 = full interpolation)", (double)params.yarn_ext_factor),
1161
- [](gpt_params & params, const std::string & value) {
1202
+ string_format("YaRN: extrapolation mix factor (default: %.1f, 0.0 = full interpolation)", (double)params.yarn_ext_factor),
1203
+ [](common_params & params, const std::string & value) {
1162
1204
  params.yarn_ext_factor = std::stof(value);
1163
1205
  }
1164
1206
  ).set_env("LLAMA_ARG_YARN_EXT_FACTOR"));
1165
- add_opt(llama_arg(
1207
+ add_opt(common_arg(
1166
1208
  {"--yarn-attn-factor"}, "N",
1167
- format("YaRN: scale sqrt(t) or attention magnitude (default: %.1f)", (double)params.yarn_attn_factor),
1168
- [](gpt_params & params, const std::string & value) {
1209
+ string_format("YaRN: scale sqrt(t) or attention magnitude (default: %.1f)", (double)params.yarn_attn_factor),
1210
+ [](common_params & params, const std::string & value) {
1169
1211
  params.yarn_attn_factor = std::stof(value);
1170
1212
  }
1171
1213
  ).set_env("LLAMA_ARG_YARN_ATTN_FACTOR"));
1172
- add_opt(llama_arg(
1214
+ add_opt(common_arg(
1173
1215
  {"--yarn-beta-slow"}, "N",
1174
- format("YaRN: high correction dim or alpha (default: %.1f)", (double)params.yarn_beta_slow),
1175
- [](gpt_params & params, const std::string & value) {
1216
+ string_format("YaRN: high correction dim or alpha (default: %.1f)", (double)params.yarn_beta_slow),
1217
+ [](common_params & params, const std::string & value) {
1176
1218
  params.yarn_beta_slow = std::stof(value);
1177
1219
  }
1178
1220
  ).set_env("LLAMA_ARG_YARN_BETA_SLOW"));
1179
- add_opt(llama_arg(
1221
+ add_opt(common_arg(
1180
1222
  {"--yarn-beta-fast"}, "N",
1181
- format("YaRN: low correction dim or beta (default: %.1f)", (double)params.yarn_beta_fast),
1182
- [](gpt_params & params, const std::string & value) {
1223
+ string_format("YaRN: low correction dim or beta (default: %.1f)", (double)params.yarn_beta_fast),
1224
+ [](common_params & params, const std::string & value) {
1183
1225
  params.yarn_beta_fast = std::stof(value);
1184
1226
  }
1185
1227
  ).set_env("LLAMA_ARG_YARN_BETA_FAST"));
1186
- add_opt(llama_arg(
1228
+ add_opt(common_arg(
1187
1229
  {"-gan", "--grp-attn-n"}, "N",
1188
- format("group-attention factor (default: %d)", params.grp_attn_n),
1189
- [](gpt_params & params, int value) {
1230
+ string_format("group-attention factor (default: %d)", params.grp_attn_n),
1231
+ [](common_params & params, int value) {
1190
1232
  params.grp_attn_n = value;
1191
1233
  }
1192
- ).set_env("LLAMA_ARG_GRP_ATTN_N"));
1193
- add_opt(llama_arg(
1234
+ ).set_env("LLAMA_ARG_GRP_ATTN_N").set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_PASSKEY}));
1235
+ add_opt(common_arg(
1194
1236
  {"-gaw", "--grp-attn-w"}, "N",
1195
- format("group-attention width (default: %.1f)", (double)params.grp_attn_w),
1196
- [](gpt_params & params, int value) {
1237
+ string_format("group-attention width (default: %d)", params.grp_attn_w),
1238
+ [](common_params & params, int value) {
1197
1239
  params.grp_attn_w = value;
1198
1240
  }
1199
- ).set_env("LLAMA_ARG_GRP_ATTN_W"));
1200
- add_opt(llama_arg(
1241
+ ).set_env("LLAMA_ARG_GRP_ATTN_W").set_examples({LLAMA_EXAMPLE_MAIN}));
1242
+ add_opt(common_arg(
1201
1243
  {"-dkvc", "--dump-kv-cache"},
1202
1244
  "verbose print of the KV cache",
1203
- [](gpt_params & params) {
1245
+ [](common_params & params) {
1204
1246
  params.dump_kv_cache = true;
1205
1247
  }
1206
1248
  ));
1207
- add_opt(llama_arg(
1249
+ add_opt(common_arg(
1208
1250
  {"-nkvo", "--no-kv-offload"},
1209
1251
  "disable KV offload",
1210
- [](gpt_params & params) {
1252
+ [](common_params & params) {
1211
1253
  params.no_kv_offload = true;
1212
1254
  }
1213
1255
  ).set_env("LLAMA_ARG_NO_KV_OFFLOAD"));
1214
- add_opt(llama_arg(
1256
+ add_opt(common_arg(
1215
1257
  {"-ctk", "--cache-type-k"}, "TYPE",
1216
- format("KV cache data type for K (default: %s)", params.cache_type_k.c_str()),
1217
- [](gpt_params & params, const std::string & value) {
1258
+ string_format("KV cache data type for K (default: %s)", params.cache_type_k.c_str()),
1259
+ [](common_params & params, const std::string & value) {
1218
1260
  // TODO: get the type right here
1219
1261
  params.cache_type_k = value;
1220
1262
  }
1221
1263
  ).set_env("LLAMA_ARG_CACHE_TYPE_K"));
1222
- add_opt(llama_arg(
1264
+ add_opt(common_arg(
1223
1265
  {"-ctv", "--cache-type-v"}, "TYPE",
1224
- format("KV cache data type for V (default: %s)", params.cache_type_v.c_str()),
1225
- [](gpt_params & params, const std::string & value) {
1266
+ string_format("KV cache data type for V (default: %s)", params.cache_type_v.c_str()),
1267
+ [](common_params & params, const std::string & value) {
1226
1268
  // TODO: get the type right here
1227
1269
  params.cache_type_v = value;
1228
1270
  }
1229
1271
  ).set_env("LLAMA_ARG_CACHE_TYPE_V"));
1230
- add_opt(llama_arg(
1272
+ add_opt(common_arg(
1231
1273
  {"--perplexity", "--all-logits"},
1232
- format("return logits for all tokens in the batch (default: %s)", params.logits_all ? "true" : "false"),
1233
- [](gpt_params & params) {
1274
+ string_format("return logits for all tokens in the batch (default: %s)", params.logits_all ? "true" : "false"),
1275
+ [](common_params & params) {
1234
1276
  params.logits_all = true;
1235
1277
  }
1236
1278
  ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
1237
- add_opt(llama_arg(
1279
+ add_opt(common_arg(
1238
1280
  {"--hellaswag"},
1239
1281
  "compute HellaSwag score over random tasks from datafile supplied with -f",
1240
- [](gpt_params & params) {
1282
+ [](common_params & params) {
1241
1283
  params.hellaswag = true;
1242
1284
  }
1243
1285
  ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
1244
- add_opt(llama_arg(
1286
+ add_opt(common_arg(
1245
1287
  {"--hellaswag-tasks"}, "N",
1246
- format("number of tasks to use when computing the HellaSwag score (default: %zu)", params.hellaswag_tasks),
1247
- [](gpt_params & params, int value) {
1288
+ string_format("number of tasks to use when computing the HellaSwag score (default: %zu)", params.hellaswag_tasks),
1289
+ [](common_params & params, int value) {
1248
1290
  params.hellaswag_tasks = value;
1249
1291
  }
1250
1292
  ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
1251
- add_opt(llama_arg(
1293
+ add_opt(common_arg(
1252
1294
  {"--winogrande"},
1253
1295
  "compute Winogrande score over random tasks from datafile supplied with -f",
1254
- [](gpt_params & params) {
1296
+ [](common_params & params) {
1255
1297
  params.winogrande = true;
1256
1298
  }
1257
1299
  ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
1258
- add_opt(llama_arg(
1300
+ add_opt(common_arg(
1259
1301
  {"--winogrande-tasks"}, "N",
1260
- format("number of tasks to use when computing the Winogrande score (default: %zu)", params.winogrande_tasks),
1261
- [](gpt_params & params, int value) {
1302
+ string_format("number of tasks to use when computing the Winogrande score (default: %zu)", params.winogrande_tasks),
1303
+ [](common_params & params, int value) {
1262
1304
  params.winogrande_tasks = value;
1263
1305
  }
1264
1306
  ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
1265
- add_opt(llama_arg(
1307
+ add_opt(common_arg(
1266
1308
  {"--multiple-choice"},
1267
1309
  "compute multiple choice score over random tasks from datafile supplied with -f",
1268
- [](gpt_params & params) {
1310
+ [](common_params & params) {
1269
1311
  params.multiple_choice = true;
1270
1312
  }
1271
1313
  ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
1272
- add_opt(llama_arg(
1314
+ add_opt(common_arg(
1273
1315
  {"--multiple-choice-tasks"}, "N",
1274
- format("number of tasks to use when computing the multiple choice score (default: %zu)", params.multiple_choice_tasks),
1275
- [](gpt_params & params, int value) {
1316
+ string_format("number of tasks to use when computing the multiple choice score (default: %zu)", params.multiple_choice_tasks),
1317
+ [](common_params & params, int value) {
1276
1318
  params.multiple_choice_tasks = value;
1277
1319
  }
1278
1320
  ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
1279
- add_opt(llama_arg(
1321
+ add_opt(common_arg(
1280
1322
  {"--kl-divergence"},
1281
1323
  "computes KL-divergence to logits provided via --kl-divergence-base",
1282
- [](gpt_params & params) {
1324
+ [](common_params & params) {
1283
1325
  params.kl_divergence = true;
1284
1326
  }
1285
1327
  ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
1286
- add_opt(llama_arg(
1328
+ add_opt(common_arg(
1287
1329
  {"--save-all-logits", "--kl-divergence-base"}, "FNAME",
1288
1330
  "set logits file",
1289
- [](gpt_params & params, const std::string & value) {
1331
+ [](common_params & params, const std::string & value) {
1290
1332
  params.logits_file = value;
1291
1333
  }
1292
1334
  ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
1293
- add_opt(llama_arg(
1335
+ add_opt(common_arg(
1294
1336
  {"--ppl-stride"}, "N",
1295
- format("stride for perplexity calculation (default: %d)", params.ppl_stride),
1296
- [](gpt_params & params, int value) {
1337
+ string_format("stride for perplexity calculation (default: %d)", params.ppl_stride),
1338
+ [](common_params & params, int value) {
1297
1339
  params.ppl_stride = value;
1298
1340
  }
1299
1341
  ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
1300
- add_opt(llama_arg(
1342
+ add_opt(common_arg(
1301
1343
  {"--ppl-output-type"}, "<0|1>",
1302
- format("output type for perplexity calculation (default: %d)", params.ppl_output_type),
1303
- [](gpt_params & params, int value) {
1344
+ string_format("output type for perplexity calculation (default: %d)", params.ppl_output_type),
1345
+ [](common_params & params, int value) {
1304
1346
  params.ppl_output_type = value;
1305
1347
  }
1306
1348
  ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
1307
- add_opt(llama_arg(
1349
+ add_opt(common_arg(
1308
1350
  {"-dt", "--defrag-thold"}, "N",
1309
- format("KV cache defragmentation threshold (default: %.1f, < 0 - disabled)", (double)params.defrag_thold),
1310
- [](gpt_params & params, const std::string & value) {
1351
+ string_format("KV cache defragmentation threshold (default: %.1f, < 0 - disabled)", (double)params.defrag_thold),
1352
+ [](common_params & params, const std::string & value) {
1311
1353
  params.defrag_thold = std::stof(value);
1312
1354
  }
1313
1355
  ).set_env("LLAMA_ARG_DEFRAG_THOLD"));
1314
- add_opt(llama_arg(
1356
+ add_opt(common_arg(
1315
1357
  {"-np", "--parallel"}, "N",
1316
- format("number of parallel sequences to decode (default: %d)", params.n_parallel),
1317
- [](gpt_params & params, int value) {
1358
+ string_format("number of parallel sequences to decode (default: %d)", params.n_parallel),
1359
+ [](common_params & params, int value) {
1318
1360
  params.n_parallel = value;
1319
1361
  }
1320
1362
  ).set_env("LLAMA_ARG_N_PARALLEL"));
1321
- add_opt(llama_arg(
1363
+ add_opt(common_arg(
1322
1364
  {"-ns", "--sequences"}, "N",
1323
- format("number of sequences to decode (default: %d)", params.n_sequences),
1324
- [](gpt_params & params, int value) {
1365
+ string_format("number of sequences to decode (default: %d)", params.n_sequences),
1366
+ [](common_params & params, int value) {
1325
1367
  params.n_sequences = value;
1326
1368
  }
1327
1369
  ).set_examples({LLAMA_EXAMPLE_PARALLEL}));
1328
- add_opt(llama_arg(
1370
+ add_opt(common_arg(
1329
1371
  {"-cb", "--cont-batching"},
1330
- format("enable continuous batching (a.k.a dynamic batching) (default: %s)", params.cont_batching ? "enabled" : "disabled"),
1331
- [](gpt_params & params) {
1372
+ string_format("enable continuous batching (a.k.a dynamic batching) (default: %s)", params.cont_batching ? "enabled" : "disabled"),
1373
+ [](common_params & params) {
1332
1374
  params.cont_batching = true;
1333
1375
  }
1334
1376
  ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CONT_BATCHING"));
1335
- add_opt(llama_arg(
1377
+ add_opt(common_arg(
1336
1378
  {"-nocb", "--no-cont-batching"},
1337
1379
  "disable continuous batching",
1338
- [](gpt_params & params) {
1380
+ [](common_params & params) {
1339
1381
  params.cont_batching = false;
1340
1382
  }
1341
1383
  ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_CONT_BATCHING"));
1342
- add_opt(llama_arg(
1384
+ add_opt(common_arg(
1343
1385
  {"--mmproj"}, "FILE",
1344
1386
  "path to a multimodal projector file for LLaVA. see examples/llava/README.md",
1345
- [](gpt_params & params, const std::string & value) {
1387
+ [](common_params & params, const std::string & value) {
1346
1388
  params.mmproj = value;
1347
1389
  }
1348
1390
  ).set_examples({LLAMA_EXAMPLE_LLAVA}));
1349
- add_opt(llama_arg(
1391
+ add_opt(common_arg(
1350
1392
  {"--image"}, "FILE",
1351
1393
  "path to an image file. use with multimodal models. Specify multiple times for batching",
1352
- [](gpt_params & params, const std::string & value) {
1394
+ [](common_params & params, const std::string & value) {
1353
1395
  params.image.emplace_back(value);
1354
1396
  }
1355
1397
  ).set_examples({LLAMA_EXAMPLE_LLAVA}));
1356
- #ifdef GGML_USE_RPC
1357
- add_opt(llama_arg(
1358
- {"--rpc"}, "SERVERS",
1359
- "comma separated list of RPC servers",
1360
- [](gpt_params & params, const std::string & value) {
1361
- params.rpc_servers = value;
1362
- }
1363
- ).set_env("LLAMA_ARG_RPC"));
1364
- #endif
1365
- add_opt(llama_arg(
1398
+ if (llama_supports_rpc()) {
1399
+ add_opt(common_arg(
1400
+ {"--rpc"}, "SERVERS",
1401
+ "comma separated list of RPC servers",
1402
+ [](common_params & params, const std::string & value) {
1403
+ params.rpc_servers = value;
1404
+ }
1405
+ ).set_env("LLAMA_ARG_RPC"));
1406
+ }
1407
+ add_opt(common_arg(
1366
1408
  {"--mlock"},
1367
1409
  "force system to keep model in RAM rather than swapping or compressing",
1368
- [](gpt_params & params) {
1410
+ [](common_params & params) {
1369
1411
  params.use_mlock = true;
1370
1412
  }
1371
1413
  ).set_env("LLAMA_ARG_MLOCK"));
1372
- add_opt(llama_arg(
1414
+ add_opt(common_arg(
1373
1415
  {"--no-mmap"},
1374
1416
  "do not memory-map model (slower load but may reduce pageouts if not using mlock)",
1375
- [](gpt_params & params) {
1417
+ [](common_params & params) {
1376
1418
  params.use_mmap = false;
1377
1419
  }
1378
1420
  ).set_env("LLAMA_ARG_NO_MMAP"));
1379
- add_opt(llama_arg(
1421
+ add_opt(common_arg(
1380
1422
  {"--numa"}, "TYPE",
1381
1423
  "attempt optimizations that help on some NUMA systems\n"
1382
1424
  "- distribute: spread execution evenly over all nodes\n"
@@ -1384,17 +1426,17 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
  "- numactl: use the CPU map provided by numactl\n"
  "if run without this previously, it is recommended to drop the system page cache before using this\n"
  "see https://github.com/ggerganov/llama.cpp/issues/1437",
- [](gpt_params & params, const std::string & value) {
+ [](common_params & params, const std::string & value) {
  /**/ if (value == "distribute" || value == "") { params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; }
  else if (value == "isolate") { params.numa = GGML_NUMA_STRATEGY_ISOLATE; }
  else if (value == "numactl") { params.numa = GGML_NUMA_STRATEGY_NUMACTL; }
  else { throw std::invalid_argument("invalid value"); }
  }
  ).set_env("LLAMA_ARG_NUMA"));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N",
  "number of layers to store in VRAM",
- [](gpt_params & params, int value) {
+ [](common_params & params, int value) {
  params.n_gpu_layers = value;
  if (!llama_supports_gpu_offload()) {
  fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers option will be ignored\n");
@@ -1402,10 +1444,10 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
  }
  }
  ).set_env("LLAMA_ARG_N_GPU_LAYERS"));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"-ngld", "--gpu-layers-draft", "--n-gpu-layers-draft"}, "N",
  "number of layers to store in VRAM for the draft model",
- [](gpt_params & params, int value) {
+ [](common_params & params, int value) {
  params.n_gpu_layers_draft = value;
  if (!llama_supports_gpu_offload()) {
  fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers-draft option will be ignored\n");
@@ -1413,13 +1455,13 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
  }
  }
  ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"-sm", "--split-mode"}, "{none,layer,row}",
  "how to split the model across multiple GPUs, one of:\n"
  "- none: use one GPU only\n"
  "- layer (default): split layers and KV across GPUs\n"
  "- row: split rows across GPUs",
- [](gpt_params & params, const std::string & value) {
+ [](common_params & params, const std::string & value) {
  std::string arg_next = value;
  if (arg_next == "none") {
  params.split_mode = LLAMA_SPLIT_MODE_NONE;
@@ -1439,10 +1481,10 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
  }
  }
  ).set_env("LLAMA_ARG_SPLIT_MODE"));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"-ts", "--tensor-split"}, "N0,N1,N2,...",
  "fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1",
- [](gpt_params & params, const std::string & value) {
+ [](common_params & params, const std::string & value) {
  std::string arg_next = value;

  // split string by , and /
@@ -1451,7 +1493,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
  std::vector<std::string> split_arg{ it, {} };
  if (split_arg.size() >= llama_max_devices()) {
  throw std::invalid_argument(
- format("got %d input configs, but system only has %d devices", (int)split_arg.size(), (int)llama_max_devices())
+ string_format("got %d input configs, but system only has %d devices", (int)split_arg.size(), (int)llama_max_devices())
  );
  }
  for (size_t i = 0; i < llama_max_devices(); ++i) {
@@ -1466,315 +1508,315 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
1466
1508
  }
1467
1509
  }
1468
1510
  ).set_env("LLAMA_ARG_TENSOR_SPLIT"));
1469
- add_opt(llama_arg(
1511
+ add_opt(common_arg(
1470
1512
  {"-mg", "--main-gpu"}, "INDEX",
1471
- format("the GPU to use for the model (with split-mode = none), or for intermediate results and KV (with split-mode = row) (default: %d)", params.main_gpu),
1472
- [](gpt_params & params, int value) {
1513
+ string_format("the GPU to use for the model (with split-mode = none), or for intermediate results and KV (with split-mode = row) (default: %d)", params.main_gpu),
1514
+ [](common_params & params, int value) {
1473
1515
  params.main_gpu = value;
1474
1516
  if (!llama_supports_gpu_offload()) {
1475
1517
  fprintf(stderr, "warning: llama.cpp was compiled without support for GPU offload. Setting the main GPU has no effect.\n");
1476
1518
  }
1477
1519
  }
1478
1520
  ).set_env("LLAMA_ARG_MAIN_GPU"));
1479
- add_opt(llama_arg(
1521
+ add_opt(common_arg(
1480
1522
  {"--check-tensors"},
1481
- format("check model tensor data for invalid values (default: %s)", params.check_tensors ? "true" : "false"),
1482
- [](gpt_params & params) {
1523
+ string_format("check model tensor data for invalid values (default: %s)", params.check_tensors ? "true" : "false"),
1524
+ [](common_params & params) {
1483
1525
  params.check_tensors = true;
1484
1526
  }
1485
1527
  ));
1486
- add_opt(llama_arg(
1528
+ add_opt(common_arg(
1487
1529
  {"--override-kv"}, "KEY=TYPE:VALUE",
1488
1530
  "advanced option to override model metadata by key. may be specified multiple times.\n"
1489
1531
  "types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false",
1490
- [](gpt_params & params, const std::string & value) {
1532
+ [](common_params & params, const std::string & value) {
1491
1533
  if (!string_parse_kv_override(value.c_str(), params.kv_overrides)) {
1492
- throw std::runtime_error(format("error: Invalid type for KV override: %s\n", value.c_str()));
1534
+ throw std::runtime_error(string_format("error: Invalid type for KV override: %s\n", value.c_str()));
1493
1535
  }
1494
1536
  }
1495
1537
  ));
1496
- add_opt(llama_arg(
1538
+ add_opt(common_arg(
1497
1539
  {"--lora"}, "FNAME",
1498
1540
  "path to LoRA adapter (can be repeated to use multiple adapters)",
1499
- [](gpt_params & params, const std::string & value) {
1541
+ [](common_params & params, const std::string & value) {
1500
1542
  params.lora_adapters.push_back({ std::string(value), 1.0 });
1501
1543
  }
1502
1544
  // we define this arg on both COMMON and EXPORT_LORA, so when showing help message of export-lora, it will be categorized as "example-specific" arg
1503
1545
  ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}));
1504
- add_opt(llama_arg(
1546
+ add_opt(common_arg(
1505
1547
  {"--lora-scaled"}, "FNAME", "SCALE",
1506
1548
  "path to LoRA adapter with user defined scaling (can be repeated to use multiple adapters)",
1507
- [](gpt_params & params, const std::string & fname, const std::string & scale) {
1549
+ [](common_params & params, const std::string & fname, const std::string & scale) {
1508
1550
  params.lora_adapters.push_back({ fname, std::stof(scale) });
1509
1551
  }
1510
1552
  // we define this arg on both COMMON and EXPORT_LORA, so when showing help message of export-lora, it will be categorized as "example-specific" arg
1511
1553
  ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}));
1512
- add_opt(llama_arg(
1554
+ add_opt(common_arg(
1513
1555
  {"--control-vector"}, "FNAME",
1514
1556
  "add a control vector\nnote: this argument can be repeated to add multiple control vectors",
1515
- [](gpt_params & params, const std::string & value) {
1557
+ [](common_params & params, const std::string & value) {
1516
1558
  params.control_vectors.push_back({ 1.0f, value, });
1517
1559
  }
1518
1560
  ));
1519
- add_opt(llama_arg(
1561
+ add_opt(common_arg(
1520
1562
  {"--control-vector-scaled"}, "FNAME", "SCALE",
1521
1563
  "add a control vector with user defined scaling SCALE\n"
1522
1564
  "note: this argument can be repeated to add multiple scaled control vectors",
1523
- [](gpt_params & params, const std::string & fname, const std::string & scale) {
1565
+ [](common_params & params, const std::string & fname, const std::string & scale) {
1524
1566
  params.control_vectors.push_back({ std::stof(scale), fname });
1525
1567
  }
1526
1568
  ));
1527
- add_opt(llama_arg(
1569
+ add_opt(common_arg(
1528
1570
  {"--control-vector-layer-range"}, "START", "END",
1529
1571
  "layer range to apply the control vector(s) to, start and end inclusive",
1530
- [](gpt_params & params, const std::string & start, const std::string & end) {
1572
+ [](common_params & params, const std::string & start, const std::string & end) {
1531
1573
  params.control_vector_layer_start = std::stoi(start);
1532
1574
  params.control_vector_layer_end = std::stoi(end);
1533
1575
  }
1534
1576
  ));
1535
- add_opt(llama_arg(
1577
+ add_opt(common_arg(
1536
1578
  {"-a", "--alias"}, "STRING",
1537
1579
  "set alias for model name (to be used by REST API)",
1538
- [](gpt_params & params, const std::string & value) {
1580
+ [](common_params & params, const std::string & value) {
1539
1581
  params.model_alias = value;
1540
1582
  }
1541
1583
  ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ALIAS"));
1542
- add_opt(llama_arg(
1584
+ add_opt(common_arg(
1543
1585
  {"-m", "--model"}, "FNAME",
1544
1586
  ex == LLAMA_EXAMPLE_EXPORT_LORA
1545
1587
  ? std::string("model path from which to load base model")
1546
- : format(
1588
+ : string_format(
1547
1589
  "model path (default: `models/$filename` with filename from `--hf-file` "
1548
1590
  "or `--model-url` if set, otherwise %s)", DEFAULT_MODEL_PATH
1549
1591
  ),
1550
- [](gpt_params & params, const std::string & value) {
1592
+ [](common_params & params, const std::string & value) {
1551
1593
  params.model = value;
1552
1594
  }
1553
1595
  ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}).set_env("LLAMA_ARG_MODEL"));
1554
- add_opt(llama_arg(
1596
+ add_opt(common_arg(
1555
1597
  {"-md", "--model-draft"}, "FNAME",
1556
1598
  "draft model for speculative decoding (default: unused)",
1557
- [](gpt_params & params, const std::string & value) {
1599
+ [](common_params & params, const std::string & value) {
1558
1600
  params.model_draft = value;
1559
1601
  }
1560
1602
  ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
1561
- add_opt(llama_arg(
1603
+ add_opt(common_arg(
1562
1604
  {"-mu", "--model-url"}, "MODEL_URL",
1563
1605
  "model download url (default: unused)",
1564
- [](gpt_params & params, const std::string & value) {
1606
+ [](common_params & params, const std::string & value) {
1565
1607
  params.model_url = value;
1566
1608
  }
1567
1609
  ).set_env("LLAMA_ARG_MODEL_URL"));
1568
- add_opt(llama_arg(
1610
+ add_opt(common_arg(
1569
1611
  {"-hfr", "--hf-repo"}, "REPO",
1570
1612
  "Hugging Face model repository (default: unused)",
1571
- [](gpt_params & params, const std::string & value) {
1613
+ [](common_params & params, const std::string & value) {
1572
1614
  params.hf_repo = value;
1573
1615
  }
1574
1616
  ).set_env("LLAMA_ARG_HF_REPO"));
1575
- add_opt(llama_arg(
1617
+ add_opt(common_arg(
1576
1618
  {"-hff", "--hf-file"}, "FILE",
1577
1619
  "Hugging Face model file (default: unused)",
1578
- [](gpt_params & params, const std::string & value) {
1620
+ [](common_params & params, const std::string & value) {
1579
1621
  params.hf_file = value;
1580
1622
  }
1581
1623
  ).set_env("LLAMA_ARG_HF_FILE"));
1582
- add_opt(llama_arg(
1624
+ add_opt(common_arg(
1583
1625
  {"-hft", "--hf-token"}, "TOKEN",
1584
1626
  "Hugging Face access token (default: value from HF_TOKEN environment variable)",
1585
- [](gpt_params & params, const std::string & value) {
1627
+ [](common_params & params, const std::string & value) {
1586
1628
  params.hf_token = value;
1587
1629
  }
1588
1630
  ).set_env("HF_TOKEN"));
1589
- add_opt(llama_arg(
1631
+ add_opt(common_arg(
1590
1632
  {"--context-file"}, "FNAME",
1591
1633
  "file to load context from (repeat to specify multiple files)",
1592
- [](gpt_params & params, const std::string & value) {
1634
+ [](common_params & params, const std::string & value) {
1593
1635
  std::ifstream file(value, std::ios::binary);
1594
1636
  if (!file) {
1595
- throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str()));
1637
+ throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str()));
1596
1638
  }
1597
1639
  params.context_files.push_back(value);
1598
1640
  }
1599
1641
  ).set_examples({LLAMA_EXAMPLE_RETRIEVAL}));
1600
- add_opt(llama_arg(
1642
+ add_opt(common_arg(
1601
1643
  {"--chunk-size"}, "N",
1602
- format("minimum length of embedded text chunks (default: %d)", params.chunk_size),
1603
- [](gpt_params & params, int value) {
1644
+ string_format("minimum length of embedded text chunks (default: %d)", params.chunk_size),
1645
+ [](common_params & params, int value) {
1604
1646
  params.chunk_size = value;
1605
1647
  }
1606
1648
  ).set_examples({LLAMA_EXAMPLE_RETRIEVAL}));
1607
- add_opt(llama_arg(
1649
+ add_opt(common_arg(
1608
1650
  {"--chunk-separator"}, "STRING",
1609
- format("separator between chunks (default: '%s')", params.chunk_separator.c_str()),
1610
- [](gpt_params & params, const std::string & value) {
1651
+ string_format("separator between chunks (default: '%s')", params.chunk_separator.c_str()),
1652
+ [](common_params & params, const std::string & value) {
1611
1653
  params.chunk_separator = value;
1612
1654
  }
1613
1655
  ).set_examples({LLAMA_EXAMPLE_RETRIEVAL}));
1614
- add_opt(llama_arg(
1656
+ add_opt(common_arg(
1615
1657
  {"--junk"}, "N",
1616
- format("number of times to repeat the junk text (default: %d)", params.n_junk),
1617
- [](gpt_params & params, int value) {
1658
+ string_format("number of times to repeat the junk text (default: %d)", params.n_junk),
1659
+ [](common_params & params, int value) {
1618
1660
  params.n_junk = value;
1619
1661
  }
1620
1662
  ).set_examples({LLAMA_EXAMPLE_PASSKEY}));
1621
- add_opt(llama_arg(
1663
+ add_opt(common_arg(
1622
1664
  {"--pos"}, "N",
1623
- format("position of the passkey in the junk text (default: %d)", params.i_pos),
1624
- [](gpt_params & params, int value) {
1665
+ string_format("position of the passkey in the junk text (default: %d)", params.i_pos),
1666
+ [](common_params & params, int value) {
1625
1667
  params.i_pos = value;
1626
1668
  }
1627
1669
  ).set_examples({LLAMA_EXAMPLE_PASSKEY}));
1628
- add_opt(llama_arg(
1670
+ add_opt(common_arg(
1629
1671
  {"-o", "--output", "--output-file"}, "FNAME",
1630
- format("output file (default: '%s')",
1672
+ string_format("output file (default: '%s')",
1631
1673
  ex == LLAMA_EXAMPLE_EXPORT_LORA
1632
1674
  ? params.lora_outfile.c_str()
1633
1675
  : ex == LLAMA_EXAMPLE_CVECTOR_GENERATOR
1634
1676
  ? params.cvector_outfile.c_str()
1635
1677
  : params.out_file.c_str()),
1636
- [](gpt_params & params, const std::string & value) {
1678
+ [](common_params & params, const std::string & value) {
1637
1679
  params.out_file = value;
1638
1680
  params.cvector_outfile = value;
1639
1681
  params.lora_outfile = value;
1640
1682
  }
1641
1683
  ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA}));
1642
- add_opt(llama_arg(
1684
+ add_opt(common_arg(
1643
1685
  {"-ofreq", "--output-frequency"}, "N",
1644
- format("output the imatrix every N iterations (default: %d)", params.n_out_freq),
1645
- [](gpt_params & params, int value) {
1686
+ string_format("output the imatrix every N iterations (default: %d)", params.n_out_freq),
1687
+ [](common_params & params, int value) {
1646
1688
  params.n_out_freq = value;
1647
1689
  }
1648
1690
  ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
1649
- add_opt(llama_arg(
1691
+ add_opt(common_arg(
1650
1692
  {"--save-frequency"}, "N",
1651
- format("save an imatrix copy every N iterations (default: %d)", params.n_save_freq),
1652
- [](gpt_params & params, int value) {
1693
+ string_format("save an imatrix copy every N iterations (default: %d)", params.n_save_freq),
1694
+ [](common_params & params, int value) {
1653
1695
  params.n_save_freq = value;
1654
1696
  }
1655
1697
  ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
1656
- add_opt(llama_arg(
1698
+ add_opt(common_arg(
1657
1699
  {"--process-output"},
1658
- format("collect data for the output tensor (default: %s)", params.process_output ? "true" : "false"),
1659
- [](gpt_params & params) {
1700
+ string_format("collect data for the output tensor (default: %s)", params.process_output ? "true" : "false"),
1701
+ [](common_params & params) {
1660
1702
  params.process_output = true;
1661
1703
  }
1662
1704
  ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
1663
- add_opt(llama_arg(
1705
+ add_opt(common_arg(
1664
1706
  {"--no-ppl"},
1665
- format("do not compute perplexity (default: %s)", params.compute_ppl ? "true" : "false"),
1666
- [](gpt_params & params) {
1707
+ string_format("do not compute perplexity (default: %s)", params.compute_ppl ? "true" : "false"),
1708
+ [](common_params & params) {
1667
1709
  params.compute_ppl = false;
1668
1710
  }
1669
1711
  ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
1670
- add_opt(llama_arg(
1712
+ add_opt(common_arg(
1671
1713
  {"--chunk", "--from-chunk"}, "N",
1672
- format("start processing the input from chunk N (default: %d)", params.i_chunk),
1673
- [](gpt_params & params, int value) {
1714
+ string_format("start processing the input from chunk N (default: %d)", params.i_chunk),
1715
+ [](common_params & params, int value) {
1674
1716
  params.i_chunk = value;
1675
1717
  }
1676
1718
  ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
1677
- add_opt(llama_arg(
1719
+ add_opt(common_arg(
1678
1720
  {"-pps"},
1679
- format("is the prompt shared across parallel sequences (default: %s)", params.is_pp_shared ? "true" : "false"),
1680
- [](gpt_params & params) {
1721
+ string_format("is the prompt shared across parallel sequences (default: %s)", params.is_pp_shared ? "true" : "false"),
1722
+ [](common_params & params) {
1681
1723
  params.is_pp_shared = true;
1682
1724
  }
1683
1725
  ).set_examples({LLAMA_EXAMPLE_BENCH}));
1684
- add_opt(llama_arg(
1726
+ add_opt(common_arg(
1685
1727
  {"-npp"}, "n0,n1,...",
1686
1728
  "number of prompt tokens",
1687
- [](gpt_params & params, const std::string & value) {
1729
+ [](common_params & params, const std::string & value) {
1688
1730
  auto p = string_split<int>(value, ',');
1689
1731
  params.n_pp.insert(params.n_pp.end(), p.begin(), p.end());
1690
1732
  }
1691
1733
  ).set_examples({LLAMA_EXAMPLE_BENCH}));
1692
- add_opt(llama_arg(
1734
+ add_opt(common_arg(
1693
1735
  {"-ntg"}, "n0,n1,...",
1694
1736
  "number of text generation tokens",
1695
- [](gpt_params & params, const std::string & value) {
1737
+ [](common_params & params, const std::string & value) {
1696
1738
  auto p = string_split<int>(value, ',');
1697
1739
  params.n_tg.insert(params.n_tg.end(), p.begin(), p.end());
1698
1740
  }
1699
1741
  ).set_examples({LLAMA_EXAMPLE_BENCH}));
1700
- add_opt(llama_arg(
1742
+ add_opt(common_arg(
1701
1743
  {"-npl"}, "n0,n1,...",
1702
1744
  "number of parallel prompts",
1703
- [](gpt_params & params, const std::string & value) {
1745
+ [](common_params & params, const std::string & value) {
1704
1746
  auto p = string_split<int>(value, ',');
1705
1747
  params.n_pl.insert(params.n_pl.end(), p.begin(), p.end());
1706
1748
  }
1707
1749
  ).set_examples({LLAMA_EXAMPLE_BENCH}));
1708
- add_opt(llama_arg(
1750
+ add_opt(common_arg(
1709
1751
  {"--embd-normalize"}, "N",
1710
- format("normalisation for embendings (default: %d) (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)", params.embd_normalize),
1711
- [](gpt_params & params, int value) {
1752
+ string_format("normalisation for embeddings (default: %d) (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)", params.embd_normalize),
1753
+ [](common_params & params, int value) {
1712
1754
  params.embd_normalize = value;
1713
1755
  }
1714
1756
  ).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
1715
- add_opt(llama_arg(
1757
+ add_opt(common_arg(
1716
1758
  {"--embd-output-format"}, "FORMAT",
1717
1759
  "empty = default, \"array\" = [[],[]...], \"json\" = openai style, \"json+\" = same \"json\" + cosine similarity matrix",
1718
- [](gpt_params & params, const std::string & value) {
1760
+ [](common_params & params, const std::string & value) {
1719
1761
  params.embd_out = value;
1720
1762
  }
1721
1763
  ).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
1722
- add_opt(llama_arg(
1764
+    add_opt(common_arg(
         {"--embd-separator"}, "STRING",
-        "separator of embendings (default \\n) for example \"<#sep#>\"",
-        [](gpt_params & params, const std::string & value) {
+        "separator of embeddings (default \\n) for example \"<#sep#>\"",
+        [](common_params & params, const std::string & value) {
             params.embd_sep = value;
         }
     ).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"--host"}, "HOST",
-        format("ip address to listen (default: %s)", params.hostname.c_str()),
-        [](gpt_params & params, const std::string & value) {
+        string_format("ip address to listen (default: %s)", params.hostname.c_str()),
+        [](common_params & params, const std::string & value) {
             params.hostname = value;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_HOST"));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"--port"}, "PORT",
-        format("port to listen (default: %d)", params.port),
-        [](gpt_params & params, int value) {
+        string_format("port to listen (default: %d)", params.port),
+        [](common_params & params, int value) {
             params.port = value;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_PORT"));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"--path"}, "PATH",
-        format("path to serve static files from (default: %s)", params.public_path.c_str()),
-        [](gpt_params & params, const std::string & value) {
+        string_format("path to serve static files from (default: %s)", params.public_path.c_str()),
+        [](common_params & params, const std::string & value) {
             params.public_path = value;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_STATIC_PATH"));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"--embedding", "--embeddings"},
-        format("restrict to only support embedding use case; use only with dedicated embedding models (default: %s)", params.embedding ? "enabled" : "disabled"),
-        [](gpt_params & params) {
+        string_format("restrict to only support embedding use case; use only with dedicated embedding models (default: %s)", params.embedding ? "enabled" : "disabled"),
+        [](common_params & params) {
             params.embedding = true;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_EMBEDDINGS"));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"--reranking", "--rerank"},
-        format("enable reranking endpoint on server (default: %s)", params.reranking ? "enabled" : "disabled"),
-        [](gpt_params & params) {
+        string_format("enable reranking endpoint on server (default: %s)", params.reranking ? "enabled" : "disabled"),
+        [](common_params & params) {
             params.reranking = true;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_RERANKING"));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"--api-key"}, "KEY",
         "API key to use for authentication (default: none)",
-        [](gpt_params & params, const std::string & value) {
+        [](common_params & params, const std::string & value) {
             params.api_keys.push_back(value);
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_API_KEY"));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"--api-key-file"}, "FNAME",
         "path to file containing API keys (default: none)",
-        [](gpt_params & params, const std::string & value) {
+        [](common_params & params, const std::string & value) {
             std::ifstream key_file(value);
             if (!key_file) {
-                throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str()));
+                throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str()));
             }
             std::string key;
             while (std::getline(key_file, key)) {
@@ -1785,70 +1827,74 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
             key_file.close();
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"--ssl-key-file"}, "FNAME",
         "path to file a PEM-encoded SSL private key",
-        [](gpt_params & params, const std::string & value) {
+        [](common_params & params, const std::string & value) {
             params.ssl_file_key = value;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_SSL_KEY_FILE"));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"--ssl-cert-file"}, "FNAME",
         "path to file a PEM-encoded SSL certificate",
-        [](gpt_params & params, const std::string & value) {
+        [](common_params & params, const std::string & value) {
             params.ssl_file_cert = value;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_SSL_CERT_FILE"));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"-to", "--timeout"}, "N",
-        format("server read/write timeout in seconds (default: %d)", params.timeout_read),
-        [](gpt_params & params, int value) {
+        string_format("server read/write timeout in seconds (default: %d)", params.timeout_read),
+        [](common_params & params, int value) {
             params.timeout_read = value;
             params.timeout_write = value;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_TIMEOUT"));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"--threads-http"}, "N",
-        format("number of threads used to process HTTP requests (default: %d)", params.n_threads_http),
-        [](gpt_params & params, int value) {
+        string_format("number of threads used to process HTTP requests (default: %d)", params.n_threads_http),
+        [](common_params & params, int value) {
             params.n_threads_http = value;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_THREADS_HTTP"));
-    add_opt(llama_arg(
-        {"-spf", "--system-prompt-file"}, "FNAME",
-        "set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications",
-        [](gpt_params & params, const std::string & value) {
-            std::ifstream file(value);
-            if (!file) {
-                throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str()));
-            }
-            std::string system_prompt;
-            std::copy(
-                std::istreambuf_iterator<char>(file),
-                std::istreambuf_iterator<char>(),
-                std::back_inserter(system_prompt)
-            );
-            params.system_prompt = system_prompt;
-        }
-    ).set_examples({LLAMA_EXAMPLE_SERVER}));
-    add_opt(llama_arg(
+    add_opt(common_arg(
+        {"--cache-reuse"}, "N",
+        string_format("min chunk size to attempt reusing from the cache via KV shifting (default: %d)", params.n_cache_reuse),
+        [](common_params & params, int value) {
+            params.n_cache_reuse = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CACHE_REUSE"));
+    add_opt(common_arg(
         {"--metrics"},
-        format("enable prometheus compatible metrics endpoint (default: %s)", params.endpoint_metrics ? "enabled" : "disabled"),
-        [](gpt_params & params) {
+        string_format("enable prometheus compatible metrics endpoint (default: %s)", params.endpoint_metrics ? "enabled" : "disabled"),
+        [](common_params & params) {
             params.endpoint_metrics = true;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_METRICS"));
-    add_opt(llama_arg(
+    add_opt(common_arg(
+        {"--slots"},
+        string_format("enable slots monitoring endpoint (default: %s)", params.endpoint_slots ? "enabled" : "disabled"),
+        [](common_params & params) {
+            params.endpoint_slots = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_SLOTS"));
+    add_opt(common_arg(
+        {"--props"},
+        string_format("enable changing global properties via POST /props (default: %s)", params.endpoint_props ? "enabled" : "disabled"),
+        [](common_params & params) {
+            params.endpoint_props = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_PROPS"));
+    add_opt(common_arg(
         {"--no-slots"},
-        format("disables slots monitoring endpoint (default: %s)", params.endpoint_slots ? "enabled" : "disabled"),
-        [](gpt_params & params) {
+        "disables slots monitoring endpoint",
+        [](common_params & params) {
             params.endpoint_slots = false;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_ENDPOINT_SLOTS"));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"--slot-save-path"}, "PATH",
         "path to save slot kv cache (default: disabled)",
-        [](gpt_params & params, const std::string & value) {
+        [](common_params & params, const std::string & value) {
             params.slot_save_path = value;
             // if doesn't end with DIRECTORY_SEPARATOR, add it
             if (!params.slot_save_path.empty() && params.slot_save_path[params.slot_save_path.size() - 1] != DIRECTORY_SEPARATOR) {
@@ -1856,14 +1902,14 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
             }
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"--chat-template"}, "JINJA_TEMPLATE",
         "set custom jinja chat template (default: template taken from model's metadata)\n"
         "if suffix/prefix are specified, template will be disabled\n"
         "only commonly used templates are accepted:\nhttps://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template",
-        [](gpt_params & params, const std::string & value) {
-            if (!llama_chat_verify_template(value)) {
-                throw std::runtime_error(format(
+        [](common_params & params, const std::string & value) {
+            if (!common_chat_verify_template(value)) {
+                throw std::runtime_error(string_format(
                     "error: the supplied chat template is not supported: %s\n"
                     "note: llama.cpp does not use jinja parser, we only support commonly used templates\n",
                     value.c_str()
@@ -1872,133 +1918,122 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
             params.chat_template = value;
         }
     ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CHAT_TEMPLATE"));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"-sps", "--slot-prompt-similarity"}, "SIMILARITY",
-        format("how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity),
-        [](gpt_params & params, const std::string & value) {
+        string_format("how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity),
+        [](common_params & params, const std::string & value) {
             params.slot_prompt_similarity = std::stof(value);
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"--lora-init-without-apply"},
-        format("load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: %s)", params.lora_init_without_apply ? "enabled" : "disabled"),
-        [](gpt_params & params) {
+        string_format("load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: %s)", params.lora_init_without_apply ? "enabled" : "disabled"),
+        [](common_params & params) {
            params.lora_init_without_apply = true;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"--simple-io"},
         "use basic IO for better compatibility in subprocesses and limited consoles",
-        [](gpt_params & params) {
+        [](common_params & params) {
             params.simple_io = true;
         }
     ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL}));
-    add_opt(llama_arg(
-        {"-ld", "--logdir"}, "LOGDIR",
-        "path under which to save YAML logs (no logging if unset)",
-        [](gpt_params & params, const std::string & value) {
-            params.logdir = value;
-
-            if (params.logdir.back() != DIRECTORY_SEPARATOR) {
-                params.logdir += DIRECTORY_SEPARATOR;
-            }
-        }
-    ));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"--positive-file"}, "FNAME",
-        format("positive prompts file, one prompt per line (default: '%s')", params.cvector_positive_file.c_str()),
-        [](gpt_params & params, const std::string & value) {
+        string_format("positive prompts file, one prompt per line (default: '%s')", params.cvector_positive_file.c_str()),
+        [](common_params & params, const std::string & value) {
             params.cvector_positive_file = value;
         }
     ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR}));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"--negative-file"}, "FNAME",
-        format("negative prompts file, one prompt per line (default: '%s')", params.cvector_negative_file.c_str()),
-        [](gpt_params & params, const std::string & value) {
+        string_format("negative prompts file, one prompt per line (default: '%s')", params.cvector_negative_file.c_str()),
+        [](common_params & params, const std::string & value) {
            params.cvector_negative_file = value;
         }
     ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR}));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"--pca-batch"}, "N",
-        format("batch size used for PCA. Larger batch runs faster, but uses more memory (default: %d)", params.n_pca_batch),
-        [](gpt_params & params, int value) {
+        string_format("batch size used for PCA. Larger batch runs faster, but uses more memory (default: %d)", params.n_pca_batch),
+        [](common_params & params, int value) {
             params.n_pca_batch = value;
         }
     ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR}));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"--pca-iter"}, "N",
-        format("number of iterations used for PCA (default: %d)", params.n_pca_iterations),
-        [](gpt_params & params, int value) {
+        string_format("number of iterations used for PCA (default: %d)", params.n_pca_iterations),
+        [](common_params & params, int value) {
             params.n_pca_iterations = value;
         }
     ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR}));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"--method"}, "{pca, mean}",
         "dimensionality reduction method to be used (default: pca)",
-        [](gpt_params & params, const std::string & value) {
+        [](common_params & params, const std::string & value) {
             /**/ if (value == "pca") { params.cvector_dimre_method = DIMRE_METHOD_PCA; }
             else if (value == "mean") { params.cvector_dimre_method = DIMRE_METHOD_MEAN; }
             else { throw std::invalid_argument("invalid value"); }
         }
     ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR}));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"--output-format"}, "{md,jsonl}",
         "output format for batched-bench results (default: md)",
-        [](gpt_params & params, const std::string & value) {
+        [](common_params & params, const std::string & value) {
             /**/ if (value == "jsonl") { params.batched_bench_output_jsonl = true; }
             else if (value == "md") { params.batched_bench_output_jsonl = false; }
             else { std::invalid_argument("invalid value"); }
         }
     ).set_examples({LLAMA_EXAMPLE_BENCH}));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"--log-disable"},
         "Log disable",
-        [](gpt_params &) {
-            gpt_log_pause(gpt_log_main());
+        [](common_params &) {
+            common_log_pause(common_log_main());
         }
     ));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"--log-file"}, "FNAME",
         "Log to file",
-        [](gpt_params &, const std::string & value) {
-            gpt_log_set_file(gpt_log_main(), value.c_str());
+        [](common_params &, const std::string & value) {
+            common_log_set_file(common_log_main(), value.c_str());
         }
     ));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"--log-colors"},
         "Enable colored logging",
-        [](gpt_params &) {
-            gpt_log_set_colors(gpt_log_main(), true);
+        [](common_params &) {
+            common_log_set_colors(common_log_main(), true);
         }
     ).set_env("LLAMA_LOG_COLORS"));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"-v", "--verbose", "--log-verbose"},
         "Set verbosity level to infinity (i.e. log all messages, useful for debugging)",
-        [](gpt_params & params) {
+        [](common_params & params) {
             params.verbosity = INT_MAX;
-            gpt_log_set_verbosity_thold(INT_MAX);
+            common_log_set_verbosity_thold(INT_MAX);
         }
     ));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"-lv", "--verbosity", "--log-verbosity"}, "N",
         "Set the verbosity threshold. Messages with a higher verbosity will be ignored.",
-        [](gpt_params & params, int value) {
+        [](common_params & params, int value) {
             params.verbosity = value;
-            gpt_log_set_verbosity_thold(value);
+            common_log_set_verbosity_thold(value);
         }
     ).set_env("LLAMA_LOG_VERBOSITY"));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"--log-prefix"},
         "Enable prefx in log messages",
-        [](gpt_params &) {
-            gpt_log_set_prefix(gpt_log_main(), true);
+        [](common_params &) {
+            common_log_set_prefix(common_log_main(), true);
         }
     ).set_env("LLAMA_LOG_PREFIX"));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"--log-timestamps"},
         "Enable timestamps in log messages",
-        [](gpt_params &) {
-            gpt_log_set_timestamps(gpt_log_main(), true);
+        [](common_params &) {
+            common_log_set_timestamps(common_log_main(), true);
         }
     ).set_env("LLAMA_LOG_TIMESTAMPS"));
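Note: every hunk above follows the same registration pattern that this release renames: a `common_arg` (formerly `llama_arg`) built from the flag names and a help string (now produced with `string_format` instead of `format`), a setter lambda taking `common_params` (formerly `gpt_params`), optionally restricted with `set_examples` and bound to an environment variable with `set_env`. A minimal sketch of that pattern, written as it would appear inside the parser-init function where `add_opt` is in scope; the option `--my-flag`, the field `params.my_flag`, and the variable `LLAMA_ARG_MY_FLAG` are hypothetical illustrations, not symbols added by this release:

    // Sketch only: mirrors the registration pattern shown in the diff above.
    // "--my-flag", params.my_flag and LLAMA_ARG_MY_FLAG are hypothetical.
    add_opt(common_arg(
        {"--my-flag"}, "N",
        string_format("example integer option (default: %d)", params.my_flag),
        [](common_params & params, int value) {
            params.my_flag = value;   // store the parsed value on the shared params struct
        }
    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MY_FLAG"));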