@fugood/llama.node 0.3.3 → 0.3.4

This diff shows the changes between publicly released package versions as they appear in their respective public registries, and is provided for informational purposes only.
Files changed (225)
  1. package/CMakeLists.txt +5 -0
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  8. package/bin/win32/arm64/llama-node.node +0 -0
  9. package/bin/win32/arm64/node.lib +0 -0
  10. package/bin/win32/x64/llama-node.node +0 -0
  11. package/bin/win32/x64/node.lib +0 -0
  12. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  13. package/bin/win32-vulkan/arm64/node.lib +0 -0
  14. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/x64/node.lib +0 -0
  16. package/lib/binding.ts +18 -1
  17. package/package.json +1 -1
  18. package/src/EmbeddingWorker.cpp +15 -5
  19. package/src/EmbeddingWorker.h +2 -1
  20. package/src/LlamaCompletionWorker.cpp +1 -1
  21. package/src/LlamaContext.cpp +81 -18
  22. package/src/LlamaContext.h +2 -0
  23. package/src/llama.cpp/.github/workflows/build.yml +197 -159
  24. package/src/llama.cpp/.github/workflows/docker.yml +5 -8
  25. package/src/llama.cpp/.github/workflows/python-lint.yml +8 -1
  26. package/src/llama.cpp/.github/workflows/server.yml +21 -14
  27. package/src/llama.cpp/CMakeLists.txt +11 -6
  28. package/src/llama.cpp/Sources/llama/llama.h +4 -0
  29. package/src/llama.cpp/cmake/common.cmake +33 -0
  30. package/src/llama.cpp/cmake/x64-windows-llvm.cmake +11 -0
  31. package/src/llama.cpp/common/CMakeLists.txt +6 -2
  32. package/src/llama.cpp/common/arg.cpp +426 -245
  33. package/src/llama.cpp/common/common.cpp +143 -80
  34. package/src/llama.cpp/common/common.h +81 -24
  35. package/src/llama.cpp/common/sampling.cpp +53 -19
  36. package/src/llama.cpp/common/sampling.h +22 -1
  37. package/src/llama.cpp/common/speculative.cpp +274 -0
  38. package/src/llama.cpp/common/speculative.h +28 -0
  39. package/src/llama.cpp/docs/build.md +101 -148
  40. package/src/llama.cpp/examples/CMakeLists.txt +32 -13
  41. package/src/llama.cpp/examples/batched/CMakeLists.txt +1 -1
  42. package/src/llama.cpp/examples/batched/batched.cpp +5 -4
  43. package/src/llama.cpp/examples/batched-bench/CMakeLists.txt +1 -1
  44. package/src/llama.cpp/examples/convert-llama2c-to-ggml/CMakeLists.txt +1 -1
  45. package/src/llama.cpp/examples/cvector-generator/CMakeLists.txt +1 -1
  46. package/src/llama.cpp/examples/deprecation-warning/deprecation-warning.cpp +1 -1
  47. package/src/llama.cpp/examples/embedding/CMakeLists.txt +1 -1
  48. package/src/llama.cpp/examples/eval-callback/CMakeLists.txt +3 -2
  49. package/src/llama.cpp/examples/export-lora/CMakeLists.txt +1 -1
  50. package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +1 -1
  51. package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +4 -7
  52. package/src/llama.cpp/examples/gen-docs/CMakeLists.txt +1 -1
  53. package/src/llama.cpp/examples/gguf/CMakeLists.txt +1 -1
  54. package/src/llama.cpp/examples/gguf-hash/CMakeLists.txt +8 -1
  55. package/src/llama.cpp/examples/gguf-split/CMakeLists.txt +1 -1
  56. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +2 -2
  57. package/src/llama.cpp/examples/gritlm/CMakeLists.txt +1 -1
  58. package/src/llama.cpp/examples/gritlm/gritlm.cpp +1 -1
  59. package/src/llama.cpp/examples/imatrix/CMakeLists.txt +1 -1
  60. package/src/llama.cpp/examples/imatrix/imatrix.cpp +11 -2
  61. package/src/llama.cpp/examples/infill/CMakeLists.txt +1 -1
  62. package/src/llama.cpp/examples/infill/infill.cpp +1 -1
  63. package/src/llama.cpp/examples/llama-bench/CMakeLists.txt +1 -1
  64. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +405 -316
  65. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
  66. package/src/llama.cpp/examples/llava/CMakeLists.txt +10 -3
  67. package/src/llama.cpp/examples/llava/clip.cpp +262 -66
  68. package/src/llama.cpp/examples/llava/clip.h +8 -2
  69. package/src/llama.cpp/examples/llava/llava-cli.cpp +1 -1
  70. package/src/llama.cpp/examples/llava/llava.cpp +46 -19
  71. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +1 -1
  72. package/src/llama.cpp/examples/llava/qwen2vl-cli.cpp +581 -0
  73. package/src/llama.cpp/examples/lookahead/CMakeLists.txt +1 -1
  74. package/src/llama.cpp/examples/lookahead/lookahead.cpp +1 -1
  75. package/src/llama.cpp/examples/lookup/CMakeLists.txt +4 -4
  76. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +2 -1
  77. package/src/llama.cpp/examples/lookup/lookup.cpp +2 -2
  78. package/src/llama.cpp/examples/main/CMakeLists.txt +1 -1
  79. package/src/llama.cpp/examples/main/main.cpp +9 -5
  80. package/src/llama.cpp/examples/main-cmake-pkg/CMakeLists.txt +1 -1
  81. package/src/llama.cpp/examples/parallel/CMakeLists.txt +1 -1
  82. package/src/llama.cpp/examples/parallel/parallel.cpp +1 -1
  83. package/src/llama.cpp/examples/passkey/CMakeLists.txt +1 -1
  84. package/src/llama.cpp/examples/perplexity/CMakeLists.txt +1 -1
  85. package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
  86. package/src/llama.cpp/examples/quantize/quantize.cpp +0 -3
  87. package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +1 -1
  88. package/src/llama.cpp/examples/retrieval/CMakeLists.txt +1 -1
  89. package/src/llama.cpp/examples/retrieval/retrieval.cpp +4 -4
  90. package/src/llama.cpp/examples/run/CMakeLists.txt +5 -0
  91. package/src/llama.cpp/examples/run/run.cpp +911 -0
  92. package/src/llama.cpp/examples/save-load-state/CMakeLists.txt +1 -1
  93. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +4 -4
  94. package/src/llama.cpp/examples/server/CMakeLists.txt +3 -7
  95. package/src/llama.cpp/examples/server/server.cpp +1758 -886
  96. package/src/llama.cpp/examples/server/tests/requirements.txt +2 -2
  97. package/src/llama.cpp/examples/server/utils.hpp +94 -304
  98. package/src/llama.cpp/examples/simple/CMakeLists.txt +1 -1
  99. package/src/llama.cpp/examples/simple/simple.cpp +4 -0
  100. package/src/llama.cpp/examples/simple-chat/CMakeLists.txt +1 -1
  101. package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +3 -0
  102. package/src/llama.cpp/examples/speculative/CMakeLists.txt +1 -1
  103. package/src/llama.cpp/examples/speculative/speculative.cpp +16 -15
  104. package/src/llama.cpp/examples/speculative-simple/CMakeLists.txt +5 -0
  105. package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +265 -0
  106. package/src/llama.cpp/examples/tokenize/CMakeLists.txt +1 -1
  107. package/src/llama.cpp/examples/tokenize/tokenize.cpp +1 -1
  108. package/src/llama.cpp/examples/tts/CMakeLists.txt +5 -0
  109. package/src/llama.cpp/examples/tts/tts.cpp +932 -0
  110. package/src/llama.cpp/ggml/CMakeLists.txt +46 -34
  111. package/src/llama.cpp/ggml/include/ggml-backend.h +16 -0
  112. package/src/llama.cpp/ggml/include/ggml-cpu.h +7 -49
  113. package/src/llama.cpp/ggml/include/ggml-opencl.h +26 -0
  114. package/src/llama.cpp/ggml/include/ggml.h +106 -24
  115. package/src/llama.cpp/ggml/src/CMakeLists.txt +73 -24
  116. package/src/llama.cpp/ggml/src/ggml-alloc.c +0 -1
  117. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +51 -11
  118. package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +379 -22
  119. package/src/llama.cpp/ggml/src/ggml-backend.cpp +4 -4
  120. package/src/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +3 -7
  121. package/src/llama.cpp/ggml/src/ggml-blas/ggml-blas.cpp +5 -2
  122. package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +33 -3
  123. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +456 -111
  124. package/src/llama.cpp/ggml/src/ggml-cann/common.h +6 -3
  125. package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +95 -35
  126. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -5
  127. package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +22 -9
  128. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +24 -13
  129. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +23 -13
  130. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +11 -0
  131. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +10 -0
  132. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +10 -0
  133. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +17 -0
  134. package/src/llama.cpp/ggml/src/ggml-common.h +42 -42
  135. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +288 -213
  136. package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +220 -0
  137. package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.h +8 -0
  138. package/src/llama.cpp/ggml/src/{ggml-amx → ggml-cpu/amx}/common.h +19 -22
  139. package/src/llama.cpp/ggml/src/{ggml-amx → ggml-cpu/amx}/mmq.cpp +93 -92
  140. package/src/llama.cpp/ggml/src/{ggml-amx → ggml-cpu/amx}/mmq.h +2 -9
  141. package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +323 -0
  142. package/src/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-aarch64.c → ggml-cpu-aarch64.cpp} +892 -190
  143. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +2 -24
  144. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-hbm.cpp +55 -0
  145. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-hbm.h +8 -0
  146. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +15 -0
  147. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +38 -25
  148. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-traits.cpp +36 -0
  149. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-traits.h +38 -0
  150. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +552 -399
  151. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +101 -136
  152. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +2 -2
  153. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +7 -10
  154. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +8 -0
  155. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +4 -6
  156. package/src/llama.cpp/ggml/src/ggml-impl.h +32 -11
  157. package/src/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +13 -9
  158. package/src/llama.cpp/ggml/src/ggml-kompute/ggml-kompute.cpp +131 -64
  159. package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +3 -6
  160. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +39 -0
  161. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +14 -7
  162. package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +147 -0
  163. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +4004 -0
  164. package/src/llama.cpp/ggml/src/ggml-opt.cpp +67 -80
  165. package/src/llama.cpp/ggml/src/ggml-quants.c +0 -9
  166. package/src/llama.cpp/ggml/src/ggml-rpc/CMakeLists.txt +3 -5
  167. package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +5 -2
  168. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +13 -10
  169. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +2 -11
  170. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +1 -0
  171. package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +2 -2
  172. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +1 -1
  173. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +5 -5
  174. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +32 -13
  175. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +80 -61
  176. package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +4 -4
  177. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +159 -114
  178. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +3 -2
  179. package/src/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +6 -6
  180. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +6 -20
  181. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +4 -3
  182. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +8 -8
  183. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +4 -3
  184. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +7 -7
  185. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +1 -0
  186. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +4 -1
  187. package/src/llama.cpp/ggml/src/ggml-threading.h +4 -2
  188. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +21 -7
  189. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +1718 -399
  190. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +3 -1
  191. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +105 -31
  192. package/src/llama.cpp/ggml/src/ggml.c +367 -207
  193. package/src/llama.cpp/include/llama-cpp.h +25 -0
  194. package/src/llama.cpp/include/llama.h +26 -19
  195. package/src/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.inp +112 -0
  196. package/src/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.out +46 -0
  197. package/src/llama.cpp/pocs/CMakeLists.txt +3 -1
  198. package/src/llama.cpp/pocs/vdot/CMakeLists.txt +2 -2
  199. package/src/llama.cpp/src/CMakeLists.txt +2 -7
  200. package/src/llama.cpp/src/llama-grammar.cpp +15 -15
  201. package/src/llama.cpp/src/llama-grammar.h +2 -5
  202. package/src/llama.cpp/src/llama-sampling.cpp +35 -90
  203. package/src/llama.cpp/src/llama-vocab.cpp +6 -1
  204. package/src/llama.cpp/src/llama.cpp +1748 -640
  205. package/src/llama.cpp/src/unicode.cpp +62 -51
  206. package/src/llama.cpp/src/unicode.h +9 -10
  207. package/src/llama.cpp/tests/CMakeLists.txt +48 -37
  208. package/src/llama.cpp/tests/test-arg-parser.cpp +2 -2
  209. package/src/llama.cpp/tests/test-backend-ops.cpp +140 -21
  210. package/src/llama.cpp/tests/test-chat-template.cpp +50 -4
  211. package/src/llama.cpp/tests/test-gguf.cpp +1303 -0
  212. package/src/llama.cpp/tests/test-grammar-integration.cpp +3 -6
  213. package/src/llama.cpp/tests/test-llama-grammar.cpp +2 -4
  214. package/src/llama.cpp/tests/test-quantize-fns.cpp +3 -3
  215. package/src/llama.cpp/tests/test-rope.cpp +61 -20
  216. package/src/llama.cpp/tests/test-sampling.cpp +2 -2
  217. package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +0 -72
  218. package/src/llama.cpp/.github/workflows/nix-ci.yml +0 -79
  219. package/src/llama.cpp/.github/workflows/nix-flake-update.yml +0 -22
  220. package/src/llama.cpp/.github/workflows/nix-publish-flake.yml +0 -36
  221. package/src/llama.cpp/ggml/include/ggml-amx.h +0 -25
  222. package/src/llama.cpp/ggml/src/ggml-aarch64.c +0 -129
  223. package/src/llama.cpp/ggml/src/ggml-aarch64.h +0 -19
  224. package/src/llama.cpp/ggml/src/ggml-amx/CMakeLists.txt +0 -107
  225. package/src/llama.cpp/ggml/src/ggml-amx/ggml-amx.cpp +0 -446
@@ -119,26 +119,63 @@ std::string common_arg::to_string() {
 // utils
 //
 
-static void common_params_handle_model_default(common_params & params) {
-    if (!params.hf_repo.empty()) {
+static void common_params_handle_model_default(
+        std::string & model,
+        std::string & model_url,
+        std::string & hf_repo,
+        std::string & hf_file) {
+    if (!hf_repo.empty()) {
         // short-hand to avoid specifying --hf-file -> default it to --model
-        if (params.hf_file.empty()) {
-            if (params.model.empty()) {
+        if (hf_file.empty()) {
+            if (model.empty()) {
                 throw std::invalid_argument("error: --hf-repo requires either --hf-file or --model\n");
             }
-            params.hf_file = params.model;
-        } else if (params.model.empty()) {
-            params.model = fs_get_cache_file(string_split<std::string>(params.hf_file, '/').back());
-        }
-    } else if (!params.model_url.empty()) {
-        if (params.model.empty()) {
-            auto f = string_split<std::string>(params.model_url, '#').front();
+            hf_file = model;
+        } else if (model.empty()) {
+            // this is to avoid different repo having same file name, or same file name in different subdirs
+            std::string filename = hf_repo + "_" + hf_file;
+            // to make sure we don't have any slashes in the filename
+            string_replace_all(filename, "/", "_");
+            model = fs_get_cache_file(filename);
+        }
+    } else if (!model_url.empty()) {
+        if (model.empty()) {
+            auto f = string_split<std::string>(model_url, '#').front();
             f = string_split<std::string>(f, '?').front();
-            params.model = fs_get_cache_file(string_split<std::string>(f, '/').back());
+            model = fs_get_cache_file(string_split<std::string>(f, '/').back());
         }
-    } else if (params.model.empty()) {
-        params.model = DEFAULT_MODEL_PATH;
+    } else if (model.empty()) {
+        model = DEFAULT_MODEL_PATH;
+    }
+}
+
+const std::vector<ggml_type> kv_cache_types = {
+    GGML_TYPE_F32,
+    GGML_TYPE_F16,
+    GGML_TYPE_BF16,
+    GGML_TYPE_Q8_0,
+    GGML_TYPE_Q4_0,
+    GGML_TYPE_Q4_1,
+    GGML_TYPE_IQ4_NL,
+    GGML_TYPE_Q5_0,
+    GGML_TYPE_Q5_1,
+};
+
+static ggml_type kv_cache_type_from_str(const std::string & s) {
+    for (const auto & type : kv_cache_types) {
+        if (ggml_type_name(type) == s) {
+            return type;
+        }
+    }
+    throw std::runtime_error("Unsupported cache type: " + s);
+}
+
+static std::string get_all_kv_cache_types() {
+    std::ostringstream msg;
+    for (const auto & type : kv_cache_types) {
+        msg << ggml_type_name(type) << (&type == &kv_cache_types.back() ? "" : ", ");
     }
+    return msg.str();
 }
 
 //
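Note on the hunk above: the default cache file name is now derived from both the repo and the file name, so identically named GGUFs from different repos no longer collide in the cache, and the helper takes plain string references so the same logic can serve the vocoder model further down. A minimal standalone sketch of the naming rule, where replace_all is a hypothetical stand-in for llama.cpp's string_replace_all:

    #include <iostream>
    #include <string>

    // hypothetical stand-in for string_replace_all() in common.cpp
    static void replace_all(std::string & s, const std::string & from, const std::string & to) {
        for (size_t pos = 0; (pos = s.find(from, pos)) != std::string::npos; pos += to.size()) {
            s.replace(pos, from.size(), to);
        }
    }

    int main() {
        // example values; any repo/file pair is treated the same way
        std::string hf_repo = "example-org/some-model-GGUF";
        std::string hf_file = "model-Q4_K_M.gguf";
        std::string filename = hf_repo + "_" + hf_file; // repo prefix avoids cross-repo collisions
        replace_all(filename, "/", "_");                // cache entries must not contain slashes
        std::cout << filename << "\n";                  // example-org_some-model-GGUF_model-Q4_K_M.gguf
    }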
@@ -233,16 +270,19 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
         }
     }
 
-    postprocess_cpu_params(params.cpuparams, nullptr);
+    postprocess_cpu_params(params.cpuparams,       nullptr);
     postprocess_cpu_params(params.cpuparams_batch, &params.cpuparams);
-    postprocess_cpu_params(params.draft_cpuparams, &params.cpuparams);
-    postprocess_cpu_params(params.draft_cpuparams_batch, &params.cpuparams_batch);
+
+    postprocess_cpu_params(params.speculative.cpuparams,       &params.cpuparams);
+    postprocess_cpu_params(params.speculative.cpuparams_batch, &params.cpuparams_batch);
 
     if (params.prompt_cache_all && (params.interactive || params.interactive_first)) {
         throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n");
     }
 
-    common_params_handle_model_default(params);
+    // TODO: refactor model params in a common struct
+    common_params_handle_model_default(params.model,         params.model_url,         params.hf_repo,         params.hf_file);
+    common_params_handle_model_default(params.vocoder.model, params.vocoder.model_url, params.vocoder.hf_repo, params.vocoder.hf_file);
 
     if (params.escape) {
         string_process_escapes(params.prompt);
@@ -251,7 +291,7 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
         for (auto & antiprompt : params.antiprompt) {
             string_process_escapes(antiprompt);
         }
-        for (auto & seq_breaker : params.sparams.dry_sequence_breakers) {
+        for (auto & seq_breaker : params.sampling.dry_sequence_breakers) {
             string_process_escapes(seq_breaker);
         }
     }
@@ -297,6 +337,27 @@ static void common_params_print_usage(common_params_context & ctx_arg) {
     print_options(specific_options);
 }
 
+static std::vector<ggml_backend_dev_t> parse_device_list(const std::string & value) {
+    std::vector<ggml_backend_dev_t> devices;
+    auto dev_names = string_split<std::string>(value, ',');
+    if (dev_names.empty()) {
+        throw std::invalid_argument("no devices specified");
+    }
+    if (dev_names.size() == 1 && dev_names[0] == "none") {
+        devices.push_back(nullptr);
+    } else {
+        for (const auto & device : dev_names) {
+            auto * dev = ggml_backend_dev_by_name(device.c_str());
+            if (!dev || ggml_backend_dev_type(dev) != GGML_BACKEND_DEVICE_TYPE_GPU) {
+                throw std::invalid_argument(string_format("invalid device: %s", device.c_str()));
+            }
+            devices.push_back(dev);
+        }
+        devices.push_back(nullptr);
+    }
+    return devices;
+}
+
 bool common_params_parse(int argc, char ** argv, common_params & params, llama_example ex, void(*print_usage)(int, char **)) {
     auto ctx_arg = common_params_parser_init(params, ex, print_usage);
     const common_params params_org = ctx_arg.params; // the example can modify the default params
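The contract of parse_device_list above is easy to miss in the diff: the returned vector is always nullptr-terminated (llama.cpp hands device lists to the backend as NULL-terminated arrays), and the special value "none" yields only the terminator, i.e. offload to no device. A self-contained sketch of that contract, with a mocked lookup table standing in for ggml_backend_dev_by_name:

    #include <map>
    #include <sstream>
    #include <stdexcept>
    #include <string>
    #include <vector>

    struct fake_dev { std::string name; };  // stand-in for ggml_backend_dev_t
    static std::map<std::string, fake_dev> g_devs = { {"CUDA0", {"CUDA0"}}, {"CUDA1", {"CUDA1"}} };

    static std::vector<fake_dev *> parse_device_list(const std::string & value) {
        std::vector<fake_dev *> devices;
        std::vector<std::string> names;
        std::stringstream ss(value);
        for (std::string item; std::getline(ss, item, ',');) {
            names.push_back(item);
        }
        if (names.empty()) {
            throw std::invalid_argument("no devices specified");
        }
        if (names.size() == 1 && names[0] == "none") {
            devices.push_back(nullptr);            // "none" -> just the terminator
        } else {
            for (const auto & name : names) {
                auto it = g_devs.find(name);
                if (it == g_devs.end()) {
                    throw std::invalid_argument("invalid device: " + name);
                }
                devices.push_back(&it->second);
            }
            devices.push_back(nullptr);            // sentinel expected by the consumer
        }
        return devices;
    }

    int main() {
        auto devs = parse_device_list("CUDA0,CUDA1"); // two devices + nullptr sentinel
        return devs.size() == 3 ? 0 : 1;
    }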
@@ -322,14 +383,29 @@ bool common_params_parse(int argc, char ** argv, common_params & params, llama_e
     return true;
 }
 
+static std::string list_builtin_chat_templates() {
+    std::vector<const char *> supported_tmpl;
+    int32_t res = llama_chat_builtin_templates(nullptr, 0);
+    supported_tmpl.resize(res);
+    res = llama_chat_builtin_templates(supported_tmpl.data(), supported_tmpl.size());
+    std::ostringstream msg;
+    for (auto & tmpl : supported_tmpl) {
+        msg << tmpl << (&tmpl == &supported_tmpl.back() ? "" : ", ");
+    }
+    return msg.str();
+}
+
 common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **)) {
+    // load dynamic backends
+    ggml_backend_load_all();
+
     common_params_context ctx_arg(params);
     ctx_arg.print_usage = print_usage;
     ctx_arg.ex          = ex;
 
     std::string sampler_type_chars;
     std::string sampler_type_names;
-    for (const auto & sampler : params.sparams.samplers) {
+    for (const auto & sampler : params.sampling.samplers) {
         sampler_type_chars += common_sampler_type_to_chr(sampler);
         sampler_type_names += common_sampler_type_to_str(sampler) + ";";
     }
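llama_chat_builtin_templates, used by the new helper above, follows the common two-call size-query pattern: call once with (nullptr, 0) to learn the count, then again with a caller-sized buffer. A sketch of the same shape using a hypothetical API of my own naming, not the llama.h declaration:

    #include <cstdint>
    #include <vector>

    static const char * k_tmpls[] = { "chatml", "llama2", "llama3" };

    // hypothetical two-call API: returns the count, fills `out` up to `len` if given
    static int32_t builtin_templates(const char ** out, size_t len) {
        const size_t n = sizeof(k_tmpls) / sizeof(k_tmpls[0]);
        for (size_t i = 0; out != nullptr && i < len && i < n; ++i) {
            out[i] = k_tmpls[i];
        }
        return (int32_t) n;
    }

    int main() {
        int32_t res = builtin_templates(nullptr, 0);       // first call: how many?
        std::vector<const char *> tmpl((size_t) res);
        res = builtin_templates(tmpl.data(), tmpl.size()); // second call: fill
        return res == 3 ? 0 : 1;
    }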
@@ -407,26 +483,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             }
         }
     ));
-    add_opt(common_arg(
-        {"-td", "--threads-draft"}, "N",
-        "number of threads to use during generation (default: same as --threads)",
-        [](common_params & params, int value) {
-            params.draft_cpuparams.n_threads = value;
-            if (params.draft_cpuparams.n_threads <= 0) {
-                params.draft_cpuparams.n_threads = std::thread::hardware_concurrency();
-            }
-        }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
-    add_opt(common_arg(
-        {"-tbd", "--threads-batch-draft"}, "N",
-        "number of threads to use during batch and prompt processing (default: same as --threads-draft)",
-        [](common_params & params, int value) {
-            params.draft_cpuparams_batch.n_threads = value;
-            if (params.draft_cpuparams_batch.n_threads <= 0) {
-                params.draft_cpuparams_batch.n_threads = std::thread::hardware_concurrency();
-            }
-        }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
     add_opt(common_arg(
         {"-C", "--cpu-mask"}, "M",
         "CPU affinity mask: arbitrarily long hex. Complements cpu-range (default: \"\")",
@@ -515,108 +571,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.cpuparams_batch.poll = value;
         }
     ));
-    add_opt(common_arg(
-        {"-Cd", "--cpu-mask-draft"}, "M",
-        "Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask)",
-        [](common_params & params, const std::string & mask) {
-            params.draft_cpuparams.mask_valid = true;
-            if (!parse_cpu_mask(mask, params.draft_cpuparams.cpumask)) {
-                throw std::invalid_argument("invalid cpumask");
-            }
-        }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
-    add_opt(common_arg(
-        {"-Crd", "--cpu-range-draft"}, "lo-hi",
-        "Ranges of CPUs for affinity. Complements --cpu-mask-draft",
-        [](common_params & params, const std::string & range) {
-            params.draft_cpuparams.mask_valid = true;
-            if (!parse_cpu_range(range, params.draft_cpuparams.cpumask)) {
-                throw std::invalid_argument("invalid range");
-            }
-        }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
-    add_opt(common_arg(
-        {"--cpu-strict-draft"}, "<0|1>",
-        "Use strict CPU placement for draft model (default: same as --cpu-strict)",
-        [](common_params & params, int value) {
-            params.draft_cpuparams.strict_cpu = value;
-        }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
-    add_opt(common_arg(
-        {"--prio-draft"}, "N",
-        string_format("set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.draft_cpuparams.priority),
-        [](common_params & params, int prio) {
-            if (prio < 0 || prio > 3) {
-                throw std::invalid_argument("invalid value");
-            }
-            params.draft_cpuparams.priority = (enum ggml_sched_priority) prio;
-        }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
-    add_opt(common_arg(
-        {"--poll-draft"}, "<0|1>",
-        "Use polling to wait for draft model work (default: same as --poll])",
-        [](common_params & params, int value) {
-            params.draft_cpuparams.poll = value;
-        }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
-    add_opt(common_arg(
-        {"-Cbd", "--cpu-mask-batch-draft"}, "M",
-        "Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask)",
-        [](common_params & params, const std::string & mask) {
-            params.draft_cpuparams_batch.mask_valid = true;
-            if (!parse_cpu_mask(mask, params.draft_cpuparams_batch.cpumask)) {
-                throw std::invalid_argument("invalid cpumask");
-            }
-        }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
-    add_opt(common_arg(
-        {"-Crbd", "--cpu-range-batch-draft"}, "lo-hi",
-        "Ranges of CPUs for affinity. Complements --cpu-mask-draft-batch)",
-        [](common_params & params, const std::string & range) {
-            params.draft_cpuparams_batch.mask_valid = true;
-            if (!parse_cpu_range(range, params.draft_cpuparams_batch.cpumask)) {
-                throw std::invalid_argument("invalid cpumask");
-            }
-        }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
-    add_opt(common_arg(
-        {"--cpu-strict-batch-draft"}, "<0|1>",
-        "Use strict CPU placement for draft model (default: --cpu-strict-draft)",
-        [](common_params & params, int value) {
-            params.draft_cpuparams_batch.strict_cpu = value;
-        }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
-    add_opt(common_arg(
-        {"--prio-batch-draft"}, "N",
-        string_format("set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.draft_cpuparams_batch.priority),
-        [](common_params & params, int prio) {
-            if (prio < 0 || prio > 3) {
-                throw std::invalid_argument("invalid value");
-            }
-            params.draft_cpuparams_batch.priority = (enum ggml_sched_priority) prio;
-        }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
-    add_opt(common_arg(
-        {"--poll-batch-draft"}, "<0|1>",
-        "Use polling to wait for draft model work (default: --poll-draft)",
-        [](common_params & params, int value) {
-            params.draft_cpuparams_batch.poll = value;
-        }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
-    add_opt(common_arg(
-        {"--draft"}, "N",
-        string_format("number of tokens to draft for speculative decoding (default: %d)", params.n_draft),
-        [](common_params & params, int value) {
-            params.n_draft = value;
-        }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP}));
-    add_opt(common_arg(
-        {"-ps", "--p-split"}, "N",
-        string_format("speculative decoding split probability (default: %.1f)", (double)params.p_split),
-        [](common_params & params, const std::string & value) {
-            params.p_split = std::stof(value);
-        }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
     add_opt(common_arg(
         {"-lcs", "--lookup-cache-static"}, "FNAME",
         "path to static lookup cache to use for lookup decoding (not updated by generation)",
@@ -672,7 +626,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params) {
            params.ctx_shift = false;
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_CONTEXT_SHIFT"));
+    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY}).set_env("LLAMA_ARG_NO_CONTEXT_SHIFT"));
     add_opt(common_arg(
         {"--chunks"}, "N",
         string_format("max number of chunks to process (default: %d, -1 = all)", params.n_chunks),
@@ -701,7 +655,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         string_format("disable internal libllama performance timings (default: %s)", params.no_perf ? "true" : "false"),
         [](common_params & params) {
             params.no_perf = true;
-            params.sparams.no_perf = true;
+            params.sampling.no_perf = true;
         }
     ).set_env("LLAMA_ARG_NO_PERF"));
     add_opt(common_arg(
@@ -867,7 +821,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params) {
             params.warmup = false;
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN}));
+    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}));
     add_opt(common_arg(
         {"--spm-infill"},
         string_format(
@@ -883,155 +837,154 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         string_format("samplers that will be used for generation in the order, separated by \';\'\n(default: %s)", sampler_type_names.c_str()),
         [](common_params & params, const std::string & value) {
             const auto sampler_names = string_split<std::string>(value, ';');
-            params.sparams.samplers = common_sampler_types_from_names(sampler_names, true);
+            params.sampling.samplers = common_sampler_types_from_names(sampler_names, true);
         }
     ).set_sparam());
     add_opt(common_arg(
         {"-s", "--seed"}, "SEED",
-        string_format("RNG seed (default: %d, use random seed for %d)", params.sparams.seed, LLAMA_DEFAULT_SEED),
+        string_format("RNG seed (default: %d, use random seed for %d)", params.sampling.seed, LLAMA_DEFAULT_SEED),
         [](common_params & params, const std::string & value) {
-            params.sparams.seed = std::stoul(value);
+            params.sampling.seed = std::stoul(value);
         }
     ).set_sparam());
     add_opt(common_arg(
-        {"--sampling-seq"}, "SEQUENCE",
+        {"--sampling-seq", "--sampler-seq"}, "SEQUENCE",
         string_format("simplified sequence for samplers that will be used (default: %s)", sampler_type_chars.c_str()),
         [](common_params & params, const std::string & value) {
-            params.sparams.samplers = common_sampler_types_from_chars(value);
+            params.sampling.samplers = common_sampler_types_from_chars(value);
         }
     ).set_sparam());
     add_opt(common_arg(
         {"--ignore-eos"},
         "ignore end of stream token and continue generating (implies --logit-bias EOS-inf)",
         [](common_params & params) {
-            params.sparams.ignore_eos = true;
-        }
-    ).set_sparam());
-    add_opt(common_arg(
-        {"--penalize-nl"},
-        string_format("penalize newline tokens (default: %s)", params.sparams.penalize_nl ? "true" : "false"),
-        [](common_params & params) {
-            params.sparams.penalize_nl = true;
+            params.sampling.ignore_eos = true;
         }
     ).set_sparam());
     add_opt(common_arg(
         {"--temp"}, "N",
-        string_format("temperature (default: %.1f)", (double)params.sparams.temp),
+        string_format("temperature (default: %.1f)", (double)params.sampling.temp),
         [](common_params & params, const std::string & value) {
-            params.sparams.temp = std::stof(value);
-            params.sparams.temp = std::max(params.sparams.temp, 0.0f);
+            params.sampling.temp = std::stof(value);
+            params.sampling.temp = std::max(params.sampling.temp, 0.0f);
         }
     ).set_sparam());
     add_opt(common_arg(
         {"--top-k"}, "N",
-        string_format("top-k sampling (default: %d, 0 = disabled)", params.sparams.top_k),
+        string_format("top-k sampling (default: %d, 0 = disabled)", params.sampling.top_k),
         [](common_params & params, int value) {
-            params.sparams.top_k = value;
+            params.sampling.top_k = value;
         }
     ).set_sparam());
     add_opt(common_arg(
         {"--top-p"}, "N",
-        string_format("top-p sampling (default: %.1f, 1.0 = disabled)", (double)params.sparams.top_p),
+        string_format("top-p sampling (default: %.1f, 1.0 = disabled)", (double)params.sampling.top_p),
         [](common_params & params, const std::string & value) {
-            params.sparams.top_p = std::stof(value);
+            params.sampling.top_p = std::stof(value);
         }
     ).set_sparam());
     add_opt(common_arg(
         {"--min-p"}, "N",
-        string_format("min-p sampling (default: %.1f, 0.0 = disabled)", (double)params.sparams.min_p),
+        string_format("min-p sampling (default: %.1f, 0.0 = disabled)", (double)params.sampling.min_p),
         [](common_params & params, const std::string & value) {
-            params.sparams.min_p = std::stof(value);
+            params.sampling.min_p = std::stof(value);
         }
     ).set_sparam());
     add_opt(common_arg(
         {"--xtc-probability"}, "N",
-        string_format("xtc probability (default: %.1f, 0.0 = disabled)", (double)params.sparams.xtc_probability),
+        string_format("xtc probability (default: %.1f, 0.0 = disabled)", (double)params.sampling.xtc_probability),
         [](common_params & params, const std::string & value) {
-            params.sparams.xtc_probability = std::stof(value);
+            params.sampling.xtc_probability = std::stof(value);
         }
     ).set_sparam());
     add_opt(common_arg(
         {"--xtc-threshold"}, "N",
-        string_format("xtc threshold (default: %.1f, 1.0 = disabled)", (double)params.sparams.xtc_threshold),
+        string_format("xtc threshold (default: %.1f, 1.0 = disabled)", (double)params.sampling.xtc_threshold),
         [](common_params & params, const std::string & value) {
-            params.sparams.xtc_threshold = std::stof(value);
+            params.sampling.xtc_threshold = std::stof(value);
         }
     ).set_sparam());
     add_opt(common_arg(
         {"--typical"}, "N",
-        string_format("locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)", (double)params.sparams.typ_p),
+        string_format("locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)", (double)params.sampling.typ_p),
         [](common_params & params, const std::string & value) {
-            params.sparams.typ_p = std::stof(value);
+            params.sampling.typ_p = std::stof(value);
         }
     ).set_sparam());
     add_opt(common_arg(
         {"--repeat-last-n"}, "N",
-        string_format("last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)", params.sparams.penalty_last_n),
+        string_format("last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)", params.sampling.penalty_last_n),
         [](common_params & params, int value) {
-            params.sparams.penalty_last_n = value;
-            params.sparams.n_prev = std::max(params.sparams.n_prev, params.sparams.penalty_last_n);
+            if (value < -1) {
+                throw std::runtime_error(string_format("error: invalid repeat-last-n = %d\n", value));
+            }
+            params.sampling.penalty_last_n = value;
+            params.sampling.n_prev = std::max(params.sampling.n_prev, params.sampling.penalty_last_n);
         }
     ).set_sparam());
     add_opt(common_arg(
         {"--repeat-penalty"}, "N",
-        string_format("penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)", (double)params.sparams.penalty_repeat),
+        string_format("penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)", (double)params.sampling.penalty_repeat),
         [](common_params & params, const std::string & value) {
-            params.sparams.penalty_repeat = std::stof(value);
+            params.sampling.penalty_repeat = std::stof(value);
         }
     ).set_sparam());
     add_opt(common_arg(
         {"--presence-penalty"}, "N",
-        string_format("repeat alpha presence penalty (default: %.1f, 0.0 = disabled)", (double)params.sparams.penalty_present),
+        string_format("repeat alpha presence penalty (default: %.1f, 0.0 = disabled)", (double)params.sampling.penalty_present),
         [](common_params & params, const std::string & value) {
-            params.sparams.penalty_present = std::stof(value);
+            params.sampling.penalty_present = std::stof(value);
         }
     ).set_sparam());
     add_opt(common_arg(
         {"--frequency-penalty"}, "N",
-        string_format("repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)", (double)params.sparams.penalty_freq),
+        string_format("repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)", (double)params.sampling.penalty_freq),
         [](common_params & params, const std::string & value) {
-            params.sparams.penalty_freq = std::stof(value);
+            params.sampling.penalty_freq = std::stof(value);
         }
     ).set_sparam());
     add_opt(common_arg(
         {"--dry-multiplier"}, "N",
-        string_format("set DRY sampling multiplier (default: %.1f, 0.0 = disabled)", (double)params.sparams.dry_multiplier),
+        string_format("set DRY sampling multiplier (default: %.1f, 0.0 = disabled)", (double)params.sampling.dry_multiplier),
         [](common_params & params, const std::string & value) {
-            params.sparams.dry_multiplier = std::stof(value);
+            params.sampling.dry_multiplier = std::stof(value);
         }
     ).set_sparam());
     add_opt(common_arg(
         {"--dry-base"}, "N",
-        string_format("set DRY sampling base value (default: %.2f)", (double)params.sparams.dry_base),
+        string_format("set DRY sampling base value (default: %.2f)", (double)params.sampling.dry_base),
         [](common_params & params, const std::string & value) {
             float potential_base = std::stof(value);
             if (potential_base >= 1.0f)
             {
-                params.sparams.dry_base = potential_base;
+                params.sampling.dry_base = potential_base;
             }
         }
     ).set_sparam());
     add_opt(common_arg(
         {"--dry-allowed-length"}, "N",
-        string_format("set allowed length for DRY sampling (default: %d)", params.sparams.dry_allowed_length),
+        string_format("set allowed length for DRY sampling (default: %d)", params.sampling.dry_allowed_length),
         [](common_params & params, int value) {
-            params.sparams.dry_allowed_length = value;
+            params.sampling.dry_allowed_length = value;
         }
     ).set_sparam());
     add_opt(common_arg(
         {"--dry-penalty-last-n"}, "N",
-        string_format("set DRY penalty for the last n tokens (default: %d, 0 = disable, -1 = context size)", params.sparams.dry_penalty_last_n),
+        string_format("set DRY penalty for the last n tokens (default: %d, 0 = disable, -1 = context size)", params.sampling.dry_penalty_last_n),
         [](common_params & params, int value) {
-            params.sparams.dry_penalty_last_n = value;
+            if (value < -1) {
+                throw std::runtime_error(string_format("error: invalid dry-penalty-last-n = %d\n", value));
+            }
+            params.sampling.dry_penalty_last_n = value;
         }
     ).set_sparam());
     add_opt(common_arg(
         {"--dry-sequence-breaker"}, "STRING",
         string_format("add sequence breaker for DRY sampling, clearing out default breakers (%s) in the process; use \"none\" to not use any sequence breakers\n",
-            params.sparams.dry_sequence_breakers.empty() ? "none" :
-            std::accumulate(std::next(params.sparams.dry_sequence_breakers.begin()),
-                params.sparams.dry_sequence_breakers.end(),
-                std::string("'") + (params.sparams.dry_sequence_breakers[0] == "\n" ? "\\n" : params.sparams.dry_sequence_breakers[0]) + "'",
+            params.sampling.dry_sequence_breakers.empty() ? "none" :
+            std::accumulate(std::next(params.sampling.dry_sequence_breakers.begin()),
+                params.sampling.dry_sequence_breakers.end(),
+                std::string("'") + (params.sampling.dry_sequence_breakers[0] == "\n" ? "\\n" : params.sampling.dry_sequence_breakers[0]) + "'",
                 [](const std::string& a, const std::string& b) {
                     std::string formatted_b = (b == "\n") ? "\\n" : b;
                     return a + ", '" + formatted_b + "'";
@@ -1040,51 +993,51 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             static bool defaults_cleared = false;
 
             if (!defaults_cleared) {
-                params.sparams.dry_sequence_breakers.clear();
+                params.sampling.dry_sequence_breakers.clear();
                 defaults_cleared = true;
             }
 
             if (value == "none") {
-                params.sparams.dry_sequence_breakers.clear();
+                params.sampling.dry_sequence_breakers.clear();
             } else {
-                params.sparams.dry_sequence_breakers.emplace_back(value);
+                params.sampling.dry_sequence_breakers.emplace_back(value);
             }
         }
     ).set_sparam());
     add_opt(common_arg(
         {"--dynatemp-range"}, "N",
-        string_format("dynamic temperature range (default: %.1f, 0.0 = disabled)", (double)params.sparams.dynatemp_range),
+        string_format("dynamic temperature range (default: %.1f, 0.0 = disabled)", (double)params.sampling.dynatemp_range),
         [](common_params & params, const std::string & value) {
-            params.sparams.dynatemp_range = std::stof(value);
+            params.sampling.dynatemp_range = std::stof(value);
         }
     ).set_sparam());
     add_opt(common_arg(
         {"--dynatemp-exp"}, "N",
-        string_format("dynamic temperature exponent (default: %.1f)", (double)params.sparams.dynatemp_exponent),
+        string_format("dynamic temperature exponent (default: %.1f)", (double)params.sampling.dynatemp_exponent),
         [](common_params & params, const std::string & value) {
-            params.sparams.dynatemp_exponent = std::stof(value);
+            params.sampling.dynatemp_exponent = std::stof(value);
         }
     ).set_sparam());
     add_opt(common_arg(
         {"--mirostat"}, "N",
         string_format("use Mirostat sampling.\nTop K, Nucleus and Locally Typical samplers are ignored if used.\n"
-            "(default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)", params.sparams.mirostat),
+            "(default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)", params.sampling.mirostat),
         [](common_params & params, int value) {
-            params.sparams.mirostat = value;
+            params.sampling.mirostat = value;
         }
     ).set_sparam());
     add_opt(common_arg(
         {"--mirostat-lr"}, "N",
-        string_format("Mirostat learning rate, parameter eta (default: %.1f)", (double)params.sparams.mirostat_eta),
+        string_format("Mirostat learning rate, parameter eta (default: %.1f)", (double)params.sampling.mirostat_eta),
         [](common_params & params, const std::string & value) {
-            params.sparams.mirostat_eta = std::stof(value);
+            params.sampling.mirostat_eta = std::stof(value);
         }
     ).set_sparam());
     add_opt(common_arg(
         {"--mirostat-ent"}, "N",
-        string_format("Mirostat target entropy, parameter tau (default: %.1f)", (double)params.sparams.mirostat_tau),
+        string_format("Mirostat target entropy, parameter tau (default: %.1f)", (double)params.sampling.mirostat_tau),
         [](common_params & params, const std::string & value) {
-            params.sparams.mirostat_tau = std::stof(value);
+            params.sampling.mirostat_tau = std::stof(value);
         }
     ).set_sparam());
     add_opt(common_arg(
@@ -1100,7 +1053,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             try {
                 if (ss >> key && ss >> sign && std::getline(ss, value_str) && (sign == '+' || sign == '-')) {
                     const float bias = std::stof(value_str) * ((sign == '-') ? -1.0f : 1.0f);
-                    params.sparams.logit_bias.push_back({key, bias});
+                    params.sampling.logit_bias.push_back({key, bias});
                 } else {
                     throw std::invalid_argument("invalid input format");
                 }
@@ -1111,9 +1064,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_sparam());
     add_opt(common_arg(
         {"--grammar"}, "GRAMMAR",
-        string_format("BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '%s')", params.sparams.grammar.c_str()),
+        string_format("BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '%s')", params.sampling.grammar.c_str()),
         [](common_params & params, const std::string & value) {
-            params.sparams.grammar = value;
+            params.sampling.grammar = value;
         }
     ).set_sparam());
     add_opt(common_arg(
@@ -1127,7 +1080,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             std::copy(
                 std::istreambuf_iterator<char>(file),
                 std::istreambuf_iterator<char>(),
-                std::back_inserter(params.sparams.grammar)
+                std::back_inserter(params.sampling.grammar)
             );
         }
     ).set_sparam());
@@ -1135,7 +1088,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         {"-j", "--json-schema"}, "SCHEMA",
         "JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object\nFor schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead",
         [](common_params & params, const std::string & value) {
-            params.sparams.grammar = json_schema_to_grammar(json::parse(value));
+            params.sampling.grammar = json_schema_to_grammar(json::parse(value));
         }
     ).set_sparam());
     add_opt(common_arg(
@@ -1255,18 +1208,28 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_env("LLAMA_ARG_NO_KV_OFFLOAD"));
     add_opt(common_arg(
         {"-ctk", "--cache-type-k"}, "TYPE",
-        string_format("KV cache data type for K (default: %s)", params.cache_type_k.c_str()),
+        string_format(
+            "KV cache data type for K\n"
+            "allowed values: %s\n"
+            "(default: %s)",
+            get_all_kv_cache_types().c_str(),
+            ggml_type_name(params.cache_type_k)
+        ),
         [](common_params & params, const std::string & value) {
-            // TODO: get the type right here
-            params.cache_type_k = value;
+            params.cache_type_k = kv_cache_type_from_str(value);
         }
     ).set_env("LLAMA_ARG_CACHE_TYPE_K"));
     add_opt(common_arg(
         {"-ctv", "--cache-type-v"}, "TYPE",
-        string_format("KV cache data type for V (default: %s)", params.cache_type_v.c_str()),
+        string_format(
+            "KV cache data type for V\n"
+            "allowed values: %s\n"
+            "(default: %s)",
+            get_all_kv_cache_types().c_str(),
+            ggml_type_name(params.cache_type_v)
+        ),
         [](common_params & params, const std::string & value) {
-            // TODO: get the type right here
-            params.cache_type_v = value;
+            params.cache_type_v = kv_cache_type_from_str(value);
         }
     ).set_env("LLAMA_ARG_CACHE_TYPE_V"));
     add_opt(common_arg(
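With this hunk, -ctk/-ctv values are validated eagerly: the string must equal ggml_type_name() of one of the whitelisted types, so q8_0 resolves to GGML_TYPE_Q8_0 while a typo such as q8 now fails at parse time instead of being carried around as a raw string. A reduced sketch of that lookup, with a fake three-entry type table standing in for ggml's:

    #include <cassert>
    #include <stdexcept>
    #include <string>

    enum fake_ggml_type { F16, Q8_0, Q4_0 };  // stand-in for ggml_type
    static const char * type_name(fake_ggml_type t) {
        // names copied from ggml_type_name(); table abbreviated for the example
        switch (t) {
            case F16:  return "f16";
            case Q8_0: return "q8_0";
            case Q4_0: return "q4_0";
        }
        return "?";
    }

    static fake_ggml_type cache_type_from_str(const std::string & s) {
        for (fake_ggml_type t : {F16, Q8_0, Q4_0}) {
            if (type_name(t) == s) {
                return t;
            }
        }
        throw std::runtime_error("Unsupported cache type: " + s); // fails fast on typos
    }

    int main() {
        assert(cache_type_from_str("q8_0") == Q8_0); // ok
        try {
            cache_type_from_str("q8");               // previously stored silently as a string
        } catch (const std::runtime_error &) {
            return 0;
        }
        return 1;
    }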
@@ -1433,28 +1396,42 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             else { throw std::invalid_argument("invalid value"); }
         }
     ).set_env("LLAMA_ARG_NUMA"));
+    add_opt(common_arg(
+        {"-dev", "--device"}, "<dev1,dev2,..>",
+        "comma-separated list of devices to use for offloading (none = don't offload)\n"
+        "use --list-devices to see a list of available devices",
+        [](common_params & params, const std::string & value) {
+            params.devices = parse_device_list(value);
+        }
+    ).set_env("LLAMA_ARG_DEVICE"));
+    add_opt(common_arg(
+        {"--list-devices"},
+        "print list of available devices and exit",
+        [](common_params &) {
+            printf("Available devices:\n");
+            for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
+                auto * dev = ggml_backend_dev_get(i);
+                if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_GPU) {
+                    size_t free, total;
+                    ggml_backend_dev_memory(dev, &free, &total);
+                    printf("  %s: %s (%zu MiB, %zu MiB free)\n", ggml_backend_dev_name(dev), ggml_backend_dev_description(dev), total / 1024 / 1024, free / 1024 / 1024);
+                }
+            }
+            exit(0);
+        }
+    ));
     add_opt(common_arg(
         {"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N",
         "number of layers to store in VRAM",
         [](common_params & params, int value) {
             params.n_gpu_layers = value;
             if (!llama_supports_gpu_offload()) {
-                fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers option will be ignored\n");
-                fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
+                fprintf(stderr, "warning: no usable GPU found, --gpu-layers option will be ignored\n");
+                fprintf(stderr, "warning: one possible reason is that llama.cpp was compiled without GPU support\n");
+                fprintf(stderr, "warning: consult docs/build.md for compilation instructions\n");
             }
         }
     ).set_env("LLAMA_ARG_N_GPU_LAYERS"));
-    add_opt(common_arg(
-        {"-ngld", "--gpu-layers-draft", "--n-gpu-layers-draft"}, "N",
-        "number of layers to store in VRAM for the draft model",
-        [](common_params & params, int value) {
-            params.n_gpu_layers_draft = value;
-            if (!llama_supports_gpu_offload()) {
-                fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers-draft option will be ignored\n");
-                fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
-            }
-        }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
     add_opt(common_arg(
         {"-sm", "--split-mode"}, "{none,layer,row}",
         "how to split the model across multiple GPUs, one of:\n"
@@ -1468,10 +1445,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             } else if (arg_next == "layer") {
                 params.split_mode = LLAMA_SPLIT_MODE_LAYER;
             } else if (arg_next == "row") {
-#ifdef GGML_USE_SYCL
-                fprintf(stderr, "warning: The split mode value:[row] is not supported by llama.cpp with SYCL. It's developing.\nExit!\n");
-                exit(1);
-#endif // GGML_USE_SYCL
                 params.split_mode = LLAMA_SPLIT_MODE_ROW;
             } else {
                 throw std::invalid_argument("invalid value");
@@ -1593,13 +1566,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.model = value;
         }
     ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}).set_env("LLAMA_ARG_MODEL"));
-    add_opt(common_arg(
-        {"-md", "--model-draft"}, "FNAME",
-        "draft model for speculative decoding (default: unused)",
-        [](common_params & params, const std::string & value) {
-            params.model_draft = value;
-        }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
     add_opt(common_arg(
         {"-mu", "--model-url"}, "MODEL_URL",
         "model download url (default: unused)",
@@ -1621,6 +1587,20 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.hf_file = value;
         }
     ).set_env("LLAMA_ARG_HF_FILE"));
+    add_opt(common_arg(
+        {"-hfrv", "--hf-repo-v"}, "REPO",
+        "Hugging Face model repository for the vocoder model (default: unused)",
+        [](common_params & params, const std::string & value) {
+            params.vocoder.hf_repo = value;
+        }
+    ).set_env("LLAMA_ARG_HF_REPO_V"));
+    add_opt(common_arg(
+        {"-hffv", "--hf-file-v"}, "FILE",
+        "Hugging Face model file for the vocoder model (default: unused)",
+        [](common_params & params, const std::string & value) {
+            params.vocoder.hf_file = value;
+        }
+    ).set_env("LLAMA_ARG_HF_FILE_V"));
     add_opt(common_arg(
         {"-hft", "--hf-token"}, "TOKEN",
         "Hugging Face access token (default: value from HF_TOKEN environment variable)",
@@ -1789,6 +1769,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.public_path = value;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_STATIC_PATH"));
+    add_opt(common_arg(
+        {"--no-webui"},
+        string_format("Disable the Web UI (default: %s)", params.webui ? "enabled" : "disabled"),
+        [](common_params & params) {
+            params.webui = false;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_WEBUI"));
     add_opt(common_arg(
         {"--embedding", "--embeddings"},
         string_format("restrict to only support embedding use case; use only with dedicated embedding models (default: %s)", params.embedding ? "enabled" : "disabled"),
@@ -1904,9 +1891,11 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_examples({LLAMA_EXAMPLE_SERVER}));
     add_opt(common_arg(
         {"--chat-template"}, "JINJA_TEMPLATE",
-        "set custom jinja chat template (default: template taken from model's metadata)\n"
-        "if suffix/prefix are specified, template will be disabled\n"
-        "only commonly used templates are accepted:\nhttps://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template",
+        string_format(
+            "set custom jinja chat template (default: template taken from model's metadata)\n"
+            "if suffix/prefix are specified, template will be disabled\n"
+            "list of built-in templates:\n%s", list_builtin_chat_templates().c_str()
+        ),
         [](common_params & params, const std::string & value) {
             if (!common_chat_verify_template(value)) {
                 throw std::runtime_error(string_format(
@@ -2037,5 +2026,197 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
2037
2026
  }
2038
2027
  ).set_env("LLAMA_LOG_TIMESTAMPS"));
2039
2028
 
+ // speculative parameters
+ add_opt(common_arg(
+ {"-td", "--threads-draft"}, "N",
+ "number of threads to use during generation (default: same as --threads)",
+ [](common_params & params, int value) {
+ params.speculative.cpuparams.n_threads = value;
+ if (params.speculative.cpuparams.n_threads <= 0) {
+ params.speculative.cpuparams.n_threads = std::thread::hardware_concurrency();
+ }
+ }
+ ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+ add_opt(common_arg(
+ {"-tbd", "--threads-batch-draft"}, "N",
+ "number of threads to use during batch and prompt processing (default: same as --threads-draft)",
+ [](common_params & params, int value) {
+ params.speculative.cpuparams_batch.n_threads = value;
+ if (params.speculative.cpuparams_batch.n_threads <= 0) {
+ params.speculative.cpuparams_batch.n_threads = std::thread::hardware_concurrency();
+ }
+ }
+ ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+ add_opt(common_arg(
+ {"-Cd", "--cpu-mask-draft"}, "M",
+ "Draft model CPU affinity mask. Complements --cpu-range-draft (default: same as --cpu-mask)",
+ [](common_params & params, const std::string & mask) {
+ params.speculative.cpuparams.mask_valid = true;
+ if (!parse_cpu_mask(mask, params.speculative.cpuparams.cpumask)) {
+ throw std::invalid_argument("invalid cpumask");
+ }
+ }
+ ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+ add_opt(common_arg(
+ {"-Crd", "--cpu-range-draft"}, "lo-hi",
+ "Ranges of CPUs for affinity. Complements --cpu-mask-draft",
+ [](common_params & params, const std::string & range) {
+ params.speculative.cpuparams.mask_valid = true;
+ if (!parse_cpu_range(range, params.speculative.cpuparams.cpumask)) {
+ throw std::invalid_argument("invalid range");
+ }
+ }
+ ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+ add_opt(common_arg(
+ {"--cpu-strict-draft"}, "<0|1>",
+ "Use strict CPU placement for draft model (default: same as --cpu-strict)",
+ [](common_params & params, int value) {
+ params.speculative.cpuparams.strict_cpu = value;
+ }
+ ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+ add_opt(common_arg(
+ {"--prio-draft"}, "N",
+ string_format("set draft process/thread priority: 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.speculative.cpuparams.priority),
+ [](common_params & params, int prio) {
+ if (prio < 0 || prio > 3) {
+ throw std::invalid_argument("invalid value");
+ }
+ params.speculative.cpuparams.priority = (enum ggml_sched_priority) prio;
+ }
+ ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+ add_opt(common_arg(
+ {"--poll-draft"}, "<0|1>",
+ "Use polling to wait for draft model work (default: same as --poll)",
+ [](common_params & params, int value) {
+ params.speculative.cpuparams.poll = value;
+ }
+ ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
2094
+ add_opt(common_arg(
2095
+ {"-Cbd", "--cpu-mask-batch-draft"}, "M",
2096
+ "Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask)",
2097
+ [](common_params & params, const std::string & mask) {
2098
+ params.speculative.cpuparams_batch.mask_valid = true;
2099
+ if (!parse_cpu_mask(mask, params.speculative.cpuparams_batch.cpumask)) {
2100
+ throw std::invalid_argument("invalid cpumask");
2101
+ }
2102
+ }
2103
+ ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
2104
+ add_opt(common_arg(
2105
+ {"-Crbd", "--cpu-range-batch-draft"}, "lo-hi",
2106
+ "Ranges of CPUs for affinity. Complements --cpu-mask-draft-batch)",
2107
+ [](common_params & params, const std::string & range) {
2108
+ params.speculative.cpuparams_batch.mask_valid = true;
2109
+ if (!parse_cpu_range(range, params.speculative.cpuparams_batch.cpumask)) {
2110
+ throw std::invalid_argument("invalid cpumask");
2111
+ }
2112
+ }
2113
+ ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
2114
+ add_opt(common_arg(
2115
+ {"--cpu-strict-batch-draft"}, "<0|1>",
2116
+ "Use strict CPU placement for draft model (default: --cpu-strict-draft)",
2117
+ [](common_params & params, int value) {
2118
+ params.speculative.cpuparams_batch.strict_cpu = value;
2119
+ }
2120
+ ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
2121
+ add_opt(common_arg(
2122
+ {"--prio-batch-draft"}, "N",
2123
+ string_format("set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.speculative.cpuparams_batch.priority),
2124
+ [](common_params & params, int prio) {
2125
+ if (prio < 0 || prio > 3) {
2126
+ throw std::invalid_argument("invalid value");
2127
+ }
2128
+ params.speculative.cpuparams_batch.priority = (enum ggml_sched_priority) prio;
2129
+ }
2130
+ ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
2131
+ add_opt(common_arg(
2132
+ {"--poll-batch-draft"}, "<0|1>",
2133
+ "Use polling to wait for draft model work (default: --poll-draft)",
2134
+ [](common_params & params, int value) {
2135
+ params.speculative.cpuparams_batch.poll = value;
2136
+ }
2137
+ ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+ add_opt(common_arg(
+ {"--draft-max", "--draft", "--draft-n"}, "N",
+ string_format("number of tokens to draft for speculative decoding (default: %d)", params.speculative.n_max),
+ [](common_params & params, int value) {
+ params.speculative.n_max = value;
+ }
+ ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_MAX"));
+ add_opt(common_arg(
+ {"--draft-min", "--draft-n-min"}, "N",
+ string_format("minimum number of draft tokens to use for speculative decoding (default: %d)", params.speculative.n_min),
+ [](common_params & params, int value) {
+ params.speculative.n_min = value;
+ }
+ ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_MIN"));
+ add_opt(common_arg(
+ {"--draft-p-split"}, "P",
+ string_format("speculative decoding split probability (default: %.1f)", (double)params.speculative.p_split),
+ [](common_params & params, const std::string & value) {
+ params.speculative.p_split = std::stof(value);
+ }
+ ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}).set_env("LLAMA_ARG_DRAFT_P_SPLIT"));
+ add_opt(common_arg(
+ {"--draft-p-min"}, "P",
+ string_format("minimum speculative decoding probability (greedy) (default: %.1f)", (double)params.speculative.p_min),
+ [](common_params & params, const std::string & value) {
+ params.speculative.p_min = std::stof(value);
+ }
+ ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_P_MIN"));
+ add_opt(common_arg(
+ {"-cd", "--ctx-size-draft"}, "N",
+ string_format("size of the prompt context for the draft model (default: %d, 0 = loaded from model)", params.speculative.n_ctx),
+ [](common_params & params, int value) {
+ params.speculative.n_ctx = value;
+ }
+ ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CTX_SIZE_DRAFT"));
+ add_opt(common_arg(
+ {"-devd", "--device-draft"}, "<dev1,dev2,..>",
+ "comma-separated list of devices to use for offloading the draft model (none = don't offload)\n"
+ "use --list-devices to see a list of available devices",
+ [](common_params & params, const std::string & value) {
+ params.speculative.devices = parse_device_list(value);
+ }
+ ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
+ add_opt(common_arg(
+ {"-ngld", "--gpu-layers-draft", "--n-gpu-layers-draft"}, "N",
+ "number of layers to store in VRAM for the draft model",
+ [](common_params & params, int value) {
+ params.speculative.n_gpu_layers = value;
+ if (!llama_supports_gpu_offload()) {
+ fprintf(stderr, "warning: no usable GPU found, --gpu-layers-draft option will be ignored\n");
+ fprintf(stderr, "warning: one possible reason is that llama.cpp was compiled without GPU support\n");
+ fprintf(stderr, "warning: consult docs/build.md for compilation instructions\n");
+ }
+ }
+ ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_N_GPU_LAYERS_DRAFT"));
+ add_opt(common_arg(
+ {"-md", "--model-draft"}, "FNAME",
+ "draft model for speculative decoding (default: unused)",
+ [](common_params & params, const std::string & value) {
+ params.speculative.model = value;
+ }
+ ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODEL_DRAFT"));
+
+ add_opt(common_arg(
+ {"-mv", "--model-vocoder"}, "FNAME",
+ "vocoder model for audio generation (default: unused)",
+ [](common_params & params, const std::string & value) {
+ params.vocoder.model = value;
+ }
+ ).set_examples({LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_SERVER}));
+
+ // model-specific
+ add_opt(common_arg(
+ {"--tts-oute-default"},
+ string_format("use default OuteTTS models (note: can download weights from the internet)"),
+ [](common_params & params) {
+ params.hf_repo = "OuteAI/OuteTTS-0.2-500M-GGUF";
+ params.hf_file = "OuteTTS-0.2-500M-Q8_0.gguf";
+ params.vocoder.hf_repo = "ggml-org/WavTokenizer";
+ params.vocoder.hf_file = "WavTokenizer-Large-75-F16.gguf";
+ }
+ ).set_examples({LLAMA_EXAMPLE_TTS}));
+
  return ctx_arg;
  }
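
Every option in this hunk is registered through the same builder pattern: a common_arg bundles its flag aliases, a value hint, a help string, and a handler lambda, then chains .set_examples() to scope the option to specific example binaries and .set_env() to allow an environment-variable override (e.g. LLAMA_ARG_DRAFT_MAX). The sketch below is a minimal, self-contained mock of that pattern for illustration only; the params_t and arg_t types and their members are invented stand-ins, not llama.cpp's actual common_params/common_arg definitions.

// Simplified, illustrative mock of the add_opt(common_arg(...).set_env(...))
// registration pattern used in the hunk above; params_t/arg_t are invented
// stand-ins, not llama.cpp's real common_params/common_arg types.
#include <cstdlib>
#include <functional>
#include <iostream>
#include <string>
#include <utility>
#include <vector>

struct params_t {
    int draft_max = 16; // stand-in for params.speculative.n_max
};

struct arg_t {
    std::vector<std::string> flags;                                 // e.g. {"--draft-max", "--draft"}
    std::string help;                                               // printed by --help
    std::function<void(params_t &, const std::string &)> on_value;  // handler lambda
    std::string env;                                                // optional env-var override

    arg_t(std::vector<std::string> f, std::string h,
          std::function<void(params_t &, const std::string &)> fn)
        : flags(std::move(f)), help(std::move(h)), on_value(std::move(fn)) {}

    // Chainable setter, mirroring the role of common_arg::set_env().
    arg_t & set_env(std::string e) { env = std::move(e); return *this; }
};

int main() {
    params_t params;
    std::vector<arg_t> registry;

    // Mirrors the shape of the --draft-max registration added in this diff.
    registry.push_back(
        arg_t({"--draft-max", "--draft", "--draft-n"},
              "number of tokens to draft for speculative decoding",
              [](params_t & p, const std::string & v) { p.draft_max = std::stoi(v); })
            .set_env("LLAMA_ARG_DRAFT_MAX"));

    // Environment variables provide defaults; explicit CLI flags would
    // normally be parsed afterwards and take precedence.
    for (auto & arg : registry) {
        if (!arg.env.empty()) {
            if (const char * v = std::getenv(arg.env.c_str())) {
                arg.on_value(params, v);
            }
        }
    }
    std::cout << "draft_max = " << params.draft_max << "\n";
    return 0;
}

As in the real handlers above (e.g. --prio-draft, --cpu-mask-draft), input validation belongs inside the lambda, which throws std::invalid_argument on out-of-range or unparseable values.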