@fugood/llama.node 0.3.2 → 0.3.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (286)
  1. package/CMakeLists.txt +7 -0
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  8. package/bin/win32/arm64/llama-node.node +0 -0
  9. package/bin/win32/arm64/node.lib +0 -0
  10. package/bin/win32/x64/llama-node.node +0 -0
  11. package/bin/win32/x64/node.lib +0 -0
  12. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  13. package/bin/win32-vulkan/arm64/node.lib +0 -0
  14. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/x64/node.lib +0 -0
  16. package/lib/binding.ts +18 -1
  17. package/package.json +1 -1
  18. package/src/DetokenizeWorker.cpp +1 -1
  19. package/src/EmbeddingWorker.cpp +17 -7
  20. package/src/EmbeddingWorker.h +2 -1
  21. package/src/LlamaCompletionWorker.cpp +8 -8
  22. package/src/LlamaCompletionWorker.h +2 -2
  23. package/src/LlamaContext.cpp +89 -27
  24. package/src/LlamaContext.h +2 -0
  25. package/src/TokenizeWorker.cpp +1 -1
  26. package/src/common.hpp +4 -4
  27. package/src/llama.cpp/.github/workflows/build.yml +240 -168
  28. package/src/llama.cpp/.github/workflows/docker.yml +8 -8
  29. package/src/llama.cpp/.github/workflows/python-lint.yml +8 -1
  30. package/src/llama.cpp/.github/workflows/server.yml +21 -14
  31. package/src/llama.cpp/CMakeLists.txt +14 -6
  32. package/src/llama.cpp/Sources/llama/llama.h +4 -0
  33. package/src/llama.cpp/cmake/arm64-apple-clang.cmake +16 -0
  34. package/src/llama.cpp/cmake/common.cmake +33 -0
  35. package/src/llama.cpp/cmake/x64-windows-llvm.cmake +11 -0
  36. package/src/llama.cpp/common/CMakeLists.txt +6 -4
  37. package/src/llama.cpp/common/arg.cpp +986 -770
  38. package/src/llama.cpp/common/arg.h +22 -22
  39. package/src/llama.cpp/common/common.cpp +212 -351
  40. package/src/llama.cpp/common/common.h +204 -117
  41. package/src/llama.cpp/common/json-schema-to-grammar.cpp +1 -1
  42. package/src/llama.cpp/common/log.cpp +50 -50
  43. package/src/llama.cpp/common/log.h +18 -18
  44. package/src/llama.cpp/common/ngram-cache.cpp +36 -36
  45. package/src/llama.cpp/common/ngram-cache.h +19 -19
  46. package/src/llama.cpp/common/sampling.cpp +163 -121
  47. package/src/llama.cpp/common/sampling.h +41 -20
  48. package/src/llama.cpp/common/speculative.cpp +274 -0
  49. package/src/llama.cpp/common/speculative.h +28 -0
  50. package/src/llama.cpp/docs/build.md +134 -161
  51. package/src/llama.cpp/examples/CMakeLists.txt +33 -14
  52. package/src/llama.cpp/examples/batched/CMakeLists.txt +1 -1
  53. package/src/llama.cpp/examples/batched/batched.cpp +19 -18
  54. package/src/llama.cpp/examples/batched-bench/CMakeLists.txt +1 -1
  55. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +10 -11
  56. package/src/llama.cpp/examples/convert-llama2c-to-ggml/CMakeLists.txt +1 -1
  57. package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +1 -1
  58. package/src/llama.cpp/examples/cvector-generator/CMakeLists.txt +1 -1
  59. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +9 -9
  60. package/src/llama.cpp/examples/deprecation-warning/deprecation-warning.cpp +1 -1
  61. package/src/llama.cpp/examples/embedding/CMakeLists.txt +1 -1
  62. package/src/llama.cpp/examples/embedding/embedding.cpp +12 -12
  63. package/src/llama.cpp/examples/eval-callback/CMakeLists.txt +3 -2
  64. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +8 -8
  65. package/src/llama.cpp/examples/export-lora/CMakeLists.txt +1 -1
  66. package/src/llama.cpp/examples/export-lora/export-lora.cpp +5 -5
  67. package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +1 -1
  68. package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +4 -7
  69. package/src/llama.cpp/examples/gen-docs/CMakeLists.txt +1 -1
  70. package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +7 -7
  71. package/src/llama.cpp/examples/gguf/CMakeLists.txt +1 -1
  72. package/src/llama.cpp/examples/gguf-hash/CMakeLists.txt +8 -1
  73. package/src/llama.cpp/examples/gguf-split/CMakeLists.txt +1 -1
  74. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +2 -2
  75. package/src/llama.cpp/examples/gritlm/CMakeLists.txt +1 -1
  76. package/src/llama.cpp/examples/gritlm/gritlm.cpp +18 -18
  77. package/src/llama.cpp/examples/imatrix/CMakeLists.txt +1 -1
  78. package/src/llama.cpp/examples/imatrix/imatrix.cpp +31 -13
  79. package/src/llama.cpp/examples/infill/CMakeLists.txt +1 -1
  80. package/src/llama.cpp/examples/infill/infill.cpp +41 -87
  81. package/src/llama.cpp/examples/llama-bench/CMakeLists.txt +1 -1
  82. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +439 -459
  83. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +2 -0
  84. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +11 -14
  85. package/src/llama.cpp/examples/llava/CMakeLists.txt +10 -3
  86. package/src/llama.cpp/examples/llava/clip.cpp +263 -66
  87. package/src/llama.cpp/examples/llava/clip.h +8 -2
  88. package/src/llama.cpp/examples/llava/llava-cli.cpp +23 -23
  89. package/src/llama.cpp/examples/llava/llava.cpp +83 -22
  90. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +21 -21
  91. package/src/llama.cpp/examples/llava/qwen2vl-cli.cpp +581 -0
  92. package/src/llama.cpp/examples/lookahead/CMakeLists.txt +1 -1
  93. package/src/llama.cpp/examples/lookahead/lookahead.cpp +26 -26
  94. package/src/llama.cpp/examples/lookup/CMakeLists.txt +4 -4
  95. package/src/llama.cpp/examples/lookup/lookup-create.cpp +7 -7
  96. package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
  97. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +16 -15
  98. package/src/llama.cpp/examples/lookup/lookup.cpp +30 -30
  99. package/src/llama.cpp/examples/main/CMakeLists.txt +1 -1
  100. package/src/llama.cpp/examples/main/main.cpp +73 -114
  101. package/src/llama.cpp/examples/main-cmake-pkg/CMakeLists.txt +1 -1
  102. package/src/llama.cpp/examples/parallel/CMakeLists.txt +1 -1
  103. package/src/llama.cpp/examples/parallel/parallel.cpp +18 -19
  104. package/src/llama.cpp/examples/passkey/CMakeLists.txt +1 -1
  105. package/src/llama.cpp/examples/passkey/passkey.cpp +14 -14
  106. package/src/llama.cpp/examples/perplexity/CMakeLists.txt +1 -1
  107. package/src/llama.cpp/examples/perplexity/perplexity.cpp +99 -120
  108. package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
  109. package/src/llama.cpp/examples/quantize/quantize.cpp +0 -3
  110. package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +1 -1
  111. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +10 -9
  112. package/src/llama.cpp/examples/retrieval/CMakeLists.txt +1 -1
  113. package/src/llama.cpp/examples/retrieval/retrieval.cpp +16 -16
  114. package/src/llama.cpp/examples/rpc/rpc-server.cpp +3 -1
  115. package/src/llama.cpp/examples/run/CMakeLists.txt +5 -0
  116. package/src/llama.cpp/examples/run/run.cpp +911 -0
  117. package/src/llama.cpp/examples/save-load-state/CMakeLists.txt +1 -1
  118. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +38 -21
  119. package/src/llama.cpp/examples/server/CMakeLists.txt +3 -16
  120. package/src/llama.cpp/examples/server/server.cpp +2073 -1339
  121. package/src/llama.cpp/examples/server/tests/requirements.txt +2 -2
  122. package/src/llama.cpp/examples/server/utils.hpp +354 -277
  123. package/src/llama.cpp/examples/simple/CMakeLists.txt +2 -2
  124. package/src/llama.cpp/examples/simple/simple.cpp +130 -94
  125. package/src/llama.cpp/examples/simple-chat/CMakeLists.txt +5 -0
  126. package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +200 -0
  127. package/src/llama.cpp/examples/speculative/CMakeLists.txt +1 -1
  128. package/src/llama.cpp/examples/speculative/speculative.cpp +68 -64
  129. package/src/llama.cpp/examples/speculative-simple/CMakeLists.txt +5 -0
  130. package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +265 -0
  131. package/src/llama.cpp/examples/tokenize/CMakeLists.txt +1 -1
  132. package/src/llama.cpp/examples/tokenize/tokenize.cpp +3 -3
  133. package/src/llama.cpp/examples/tts/CMakeLists.txt +5 -0
  134. package/src/llama.cpp/examples/tts/tts.cpp +932 -0
  135. package/src/llama.cpp/ggml/CMakeLists.txt +54 -36
  136. package/src/llama.cpp/ggml/include/ggml-backend.h +63 -34
  137. package/src/llama.cpp/ggml/include/ggml-blas.h +5 -3
  138. package/src/llama.cpp/ggml/include/ggml-cann.h +9 -7
  139. package/src/llama.cpp/ggml/include/ggml-cpp.h +38 -0
  140. package/src/llama.cpp/ggml/include/ggml-cpu.h +135 -0
  141. package/src/llama.cpp/ggml/include/ggml-cuda.h +12 -12
  142. package/src/llama.cpp/ggml/include/ggml-kompute.h +7 -3
  143. package/src/llama.cpp/ggml/include/ggml-metal.h +11 -7
  144. package/src/llama.cpp/ggml/include/ggml-opencl.h +26 -0
  145. package/src/llama.cpp/ggml/include/ggml-opt.h +216 -0
  146. package/src/llama.cpp/ggml/include/ggml-rpc.h +9 -5
  147. package/src/llama.cpp/ggml/include/ggml-sycl.h +18 -11
  148. package/src/llama.cpp/ggml/include/ggml-vulkan.h +10 -8
  149. package/src/llama.cpp/ggml/include/ggml.h +159 -417
  150. package/src/llama.cpp/ggml/src/CMakeLists.txt +121 -1155
  151. package/src/llama.cpp/ggml/src/ggml-alloc.c +23 -28
  152. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +57 -36
  153. package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +552 -0
  154. package/src/llama.cpp/ggml/src/ggml-backend.cpp +306 -867
  155. package/src/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +87 -0
  156. package/src/llama.cpp/ggml/src/{ggml-blas.cpp → ggml-blas/ggml-blas.cpp} +216 -65
  157. package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +76 -0
  158. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +456 -111
  159. package/src/llama.cpp/ggml/src/ggml-cann/common.h +6 -3
  160. package/src/llama.cpp/ggml/src/{ggml-cann.cpp → ggml-cann/ggml-cann.cpp} +343 -177
  161. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -5
  162. package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +22 -9
  163. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +24 -13
  164. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +23 -13
  165. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +11 -0
  166. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +10 -0
  167. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +10 -0
  168. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +17 -0
  169. package/src/llama.cpp/ggml/src/ggml-common.h +42 -42
  170. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +336 -0
  171. package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +220 -0
  172. package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.h +8 -0
  173. package/src/llama.cpp/ggml/src/ggml-cpu/amx/common.h +91 -0
  174. package/src/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp +2511 -0
  175. package/src/llama.cpp/ggml/src/ggml-cpu/amx/mmq.h +10 -0
  176. package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +323 -0
  177. package/src/llama.cpp/ggml/src/{ggml-aarch64.c → ggml-cpu/ggml-cpu-aarch64.cpp} +1299 -246
  178. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +8 -0
  179. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-hbm.cpp +55 -0
  180. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-hbm.h +8 -0
  181. package/src/llama.cpp/ggml/src/{ggml-cpu-impl.h → ggml-cpu/ggml-cpu-impl.h} +14 -242
  182. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +10835 -0
  183. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.h +63 -0
  184. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-traits.cpp +36 -0
  185. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-traits.h +38 -0
  186. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +14123 -0
  187. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +628 -0
  188. package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.cpp +666 -0
  189. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +152 -0
  190. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +8 -0
  191. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +104 -0
  192. package/src/llama.cpp/ggml/src/ggml-impl.h +393 -22
  193. package/src/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +166 -0
  194. package/src/llama.cpp/ggml/src/{ggml-kompute.cpp → ggml-kompute/ggml-kompute.cpp} +360 -127
  195. package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +105 -0
  196. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +288 -0
  197. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +107 -0
  198. package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +147 -0
  199. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +4004 -0
  200. package/src/llama.cpp/ggml/src/ggml-opt.cpp +854 -0
  201. package/src/llama.cpp/ggml/src/ggml-quants.c +188 -10702
  202. package/src/llama.cpp/ggml/src/ggml-quants.h +78 -125
  203. package/src/llama.cpp/ggml/src/ggml-rpc/CMakeLists.txt +9 -0
  204. package/src/llama.cpp/ggml/src/{ggml-rpc.cpp → ggml-rpc/ggml-rpc.cpp} +478 -300
  205. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +84 -0
  206. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +3 -0
  207. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +36 -5
  208. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +259 -0
  209. package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +3 -2
  210. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +1 -1
  211. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +5 -5
  212. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +34 -35
  213. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +1030 -0
  214. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +76 -0
  215. package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +4 -4
  216. package/src/llama.cpp/ggml/src/{ggml-sycl.cpp → ggml-sycl/ggml-sycl.cpp} +3638 -4151
  217. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +3 -2
  218. package/src/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +6 -6
  219. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +75 -87
  220. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +7 -6
  221. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +56 -0
  222. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.hpp +11 -0
  223. package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +6 -0
  224. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +4 -3
  225. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +7 -7
  226. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +1 -0
  227. package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +4 -4
  228. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +141 -0
  229. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +10 -0
  230. package/src/llama.cpp/ggml/src/ggml-threading.cpp +12 -0
  231. package/src/llama.cpp/ggml/src/ggml-threading.h +14 -0
  232. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +92 -0
  233. package/src/llama.cpp/ggml/src/{ggml-vulkan.cpp → ggml-vulkan/ggml-vulkan.cpp} +2138 -887
  234. package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/CMakeLists.txt +3 -1
  235. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +593 -0
  236. package/src/llama.cpp/ggml/src/ggml.c +4427 -20125
  237. package/src/llama.cpp/include/llama-cpp.h +25 -0
  238. package/src/llama.cpp/include/llama.h +93 -52
  239. package/src/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.inp +112 -0
  240. package/src/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.out +46 -0
  241. package/src/llama.cpp/pocs/CMakeLists.txt +3 -1
  242. package/src/llama.cpp/pocs/vdot/CMakeLists.txt +2 -2
  243. package/src/llama.cpp/pocs/vdot/q8dot.cpp +4 -3
  244. package/src/llama.cpp/pocs/vdot/vdot.cpp +8 -7
  245. package/src/llama.cpp/src/CMakeLists.txt +4 -8
  246. package/src/llama.cpp/src/llama-grammar.cpp +15 -15
  247. package/src/llama.cpp/src/llama-grammar.h +2 -5
  248. package/src/llama.cpp/src/llama-sampling.cpp +779 -194
  249. package/src/llama.cpp/src/llama-sampling.h +21 -2
  250. package/src/llama.cpp/src/llama-vocab.cpp +55 -10
  251. package/src/llama.cpp/src/llama-vocab.h +35 -11
  252. package/src/llama.cpp/src/llama.cpp +4317 -2979
  253. package/src/llama.cpp/src/unicode-data.cpp +2 -2
  254. package/src/llama.cpp/src/unicode.cpp +62 -51
  255. package/src/llama.cpp/src/unicode.h +9 -10
  256. package/src/llama.cpp/tests/CMakeLists.txt +48 -38
  257. package/src/llama.cpp/tests/test-arg-parser.cpp +15 -15
  258. package/src/llama.cpp/tests/test-backend-ops.cpp +324 -80
  259. package/src/llama.cpp/tests/test-barrier.cpp +1 -0
  260. package/src/llama.cpp/tests/test-chat-template.cpp +59 -9
  261. package/src/llama.cpp/tests/test-gguf.cpp +1303 -0
  262. package/src/llama.cpp/tests/test-grammar-integration.cpp +3 -6
  263. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +17 -4
  264. package/src/llama.cpp/tests/test-llama-grammar.cpp +2 -4
  265. package/src/llama.cpp/tests/test-log.cpp +2 -2
  266. package/src/llama.cpp/tests/test-opt.cpp +853 -142
  267. package/src/llama.cpp/tests/test-quantize-fns.cpp +24 -21
  268. package/src/llama.cpp/tests/test-quantize-perf.cpp +16 -14
  269. package/src/llama.cpp/tests/test-rope.cpp +62 -20
  270. package/src/llama.cpp/tests/test-sampling.cpp +163 -138
  271. package/src/llama.cpp/tests/test-tokenizer-0.cpp +7 -7
  272. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +5 -5
  273. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +5 -5
  274. package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +0 -72
  275. package/src/llama.cpp/.github/workflows/nix-ci.yml +0 -79
  276. package/src/llama.cpp/.github/workflows/nix-flake-update.yml +0 -22
  277. package/src/llama.cpp/.github/workflows/nix-publish-flake.yml +0 -36
  278. package/src/llama.cpp/common/train.cpp +0 -1515
  279. package/src/llama.cpp/common/train.h +0 -233
  280. package/src/llama.cpp/examples/baby-llama/CMakeLists.txt +0 -5
  281. package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +0 -1639
  282. package/src/llama.cpp/ggml/src/ggml-aarch64.h +0 -39
  283. package/src/llama.cpp/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp +0 -600
  284. package/src/llama.cpp/tests/test-grad0.cpp +0 -1683
  285. /package/src/llama.cpp/ggml/{cmake → src/ggml-cpu/cmake}/FindSIMD.cmake +0 -0
  286. /package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.h +0 -0
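The hunks below come from the vendored llama.cpp sources and show the renamed common argument-parsing API: llama_arg becomes common_arg, gpt_params becomes common_params, and the local format() helper is replaced by string_format(). A minimal, illustrative usage sketch of the renamed entry points (assuming the vendored common/arg.h and common/common.h headers from this tree; not part of the published diff):

    #include "arg.h"      // common_params_parse (was gpt_params_parse)
    #include "common.h"   // common_params (was gpt_params), llama_example enum
    #include <cstdio>

    int main(int argc, char ** argv) {
        common_params params;
        // common_params_parse replaces gpt_params_parse; returns false on invalid arguments
        if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON, nullptr)) {
            return 1;
        }
        // sampling options moved from params.sparams to params.sampling in this update
        std::printf("ctx size: %d, threads: %d\n", params.n_ctx, params.cpuparams.n_threads);
        return 0;
    }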
@@ -17,27 +17,27 @@

  using json = nlohmann::ordered_json;

- llama_arg & llama_arg::set_examples(std::initializer_list<enum llama_example> examples) {
+ common_arg & common_arg::set_examples(std::initializer_list<enum llama_example> examples) {
  this->examples = std::move(examples);
  return *this;
  }

- llama_arg & llama_arg::set_env(const char * env) {
+ common_arg & common_arg::set_env(const char * env) {
  help = help + "\n(env: " + env + ")";
  this->env = env;
  return *this;
  }

- llama_arg & llama_arg::set_sparam() {
+ common_arg & common_arg::set_sparam() {
  is_sparam = true;
  return *this;
  }

- bool llama_arg::in_example(enum llama_example ex) {
+ bool common_arg::in_example(enum llama_example ex) {
  return examples.find(ex) != examples.end();
  }

- bool llama_arg::get_value_from_env(std::string & output) {
+ bool common_arg::get_value_from_env(std::string & output) {
  if (env == nullptr) return false;
  char * value = std::getenv(env);
  if (value) {
@@ -47,7 +47,7 @@ bool llama_arg::get_value_from_env(std::string & output) {
  return false;
  }

- bool llama_arg::has_value_from_env() {
+ bool common_arg::has_value_from_env() {
  return env != nullptr && std::getenv(env);
  }

@@ -78,7 +78,7 @@ static std::vector<std::string> break_str_into_lines(std::string input, size_t m
  return result;
  }

- std::string llama_arg::to_string() {
+ std::string common_arg::to_string() {
  // params for printing to console
  const static int n_leading_spaces = 40;
  const static int n_char_per_line_help = 70; // TODO: detect this based on current console
@@ -119,64 +119,75 @@ std::string llama_arg::to_string() {
  // utils
  //

- #ifdef __GNUC__
- #ifdef __MINGW32__
- #define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
- #else
- #define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
- #endif
- #else
- #define LLAMA_COMMON_ATTRIBUTE_FORMAT(...)
- #endif
-
- LLAMA_COMMON_ATTRIBUTE_FORMAT(1, 2)
- static std::string format(const char * fmt, ...) {
- va_list ap;
- va_list ap2;
- va_start(ap, fmt);
- va_copy(ap2, ap);
- int size = vsnprintf(NULL, 0, fmt, ap);
- GGML_ASSERT(size >= 0 && size < INT_MAX); // NOLINT
- std::vector<char> buf(size + 1);
- int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
- GGML_ASSERT(size2 == size);
- va_end(ap2);
- va_end(ap);
- return std::string(buf.data(), size);
- }
-
- static void gpt_params_handle_model_default(gpt_params & params) {
- if (!params.hf_repo.empty()) {
+ static void common_params_handle_model_default(
+ std::string & model,
+ std::string & model_url,
+ std::string & hf_repo,
+ std::string & hf_file) {
+ if (!hf_repo.empty()) {
  // short-hand to avoid specifying --hf-file -> default it to --model
- if (params.hf_file.empty()) {
- if (params.model.empty()) {
+ if (hf_file.empty()) {
+ if (model.empty()) {
  throw std::invalid_argument("error: --hf-repo requires either --hf-file or --model\n");
  }
- params.hf_file = params.model;
- } else if (params.model.empty()) {
- params.model = fs_get_cache_file(string_split(params.hf_file, '/').back());
- }
- } else if (!params.model_url.empty()) {
- if (params.model.empty()) {
- auto f = string_split(params.model_url, '#').front();
- f = string_split(f, '?').front();
- params.model = fs_get_cache_file(string_split(f, '/').back());
- }
- } else if (params.model.empty()) {
- params.model = DEFAULT_MODEL_PATH;
+ hf_file = model;
+ } else if (model.empty()) {
+ // this is to avoid different repo having same file name, or same file name in different subdirs
+ std::string filename = hf_repo + "_" + hf_file;
+ // to make sure we don't have any slashes in the filename
+ string_replace_all(filename, "/", "_");
+ model = fs_get_cache_file(filename);
+ }
+ } else if (!model_url.empty()) {
+ if (model.empty()) {
+ auto f = string_split<std::string>(model_url, '#').front();
+ f = string_split<std::string>(f, '?').front();
+ model = fs_get_cache_file(string_split<std::string>(f, '/').back());
+ }
+ } else if (model.empty()) {
+ model = DEFAULT_MODEL_PATH;
+ }
+ }
+
+ const std::vector<ggml_type> kv_cache_types = {
+ GGML_TYPE_F32,
+ GGML_TYPE_F16,
+ GGML_TYPE_BF16,
+ GGML_TYPE_Q8_0,
+ GGML_TYPE_Q4_0,
+ GGML_TYPE_Q4_1,
+ GGML_TYPE_IQ4_NL,
+ GGML_TYPE_Q5_0,
+ GGML_TYPE_Q5_1,
+ };
+
+ static ggml_type kv_cache_type_from_str(const std::string & s) {
+ for (const auto & type : kv_cache_types) {
+ if (ggml_type_name(type) == s) {
+ return type;
+ }
+ }
+ throw std::runtime_error("Unsupported cache type: " + s);
+ }
+
+ static std::string get_all_kv_cache_types() {
+ std::ostringstream msg;
+ for (const auto & type : kv_cache_types) {
+ msg << ggml_type_name(type) << (&type == &kv_cache_types.back() ? "" : ", ");
  }
+ return msg.str();
  }

  //
  // CLI argument parsing functions
  //

- static bool gpt_params_parse_ex(int argc, char ** argv, gpt_params_context & ctx_arg) {
+ static bool common_params_parse_ex(int argc, char ** argv, common_params_context & ctx_arg) {
  std::string arg;
  const std::string arg_prefix = "--";
- gpt_params & params = ctx_arg.params;
+ common_params & params = ctx_arg.params;

- std::unordered_map<std::string, llama_arg *> arg_to_options;
+ std::unordered_map<std::string, common_arg *> arg_to_options;
  for (auto & opt : ctx_arg.options) {
  for (const auto & arg : opt.args) {
  arg_to_options[arg] = &opt;
@@ -199,7 +210,7 @@ static bool gpt_params_parse_ex(int argc, char ** argv, gpt_params_context & ctx
  continue;
  }
  } catch (std::exception & e) {
- throw std::invalid_argument(format(
+ throw std::invalid_argument(string_format(
  "error while handling environment variable \"%s\": %s\n\n", opt.env, e.what()));
  }
  }
@@ -220,7 +231,7 @@ static bool gpt_params_parse_ex(int argc, char ** argv, gpt_params_context & ctx
  std::replace(arg.begin(), arg.end(), '_', '-');
  }
  if (arg_to_options.find(arg) == arg_to_options.end()) {
- throw std::invalid_argument(format("error: invalid argument: %s", arg.c_str()));
+ throw std::invalid_argument(string_format("error: invalid argument: %s", arg.c_str()));
  }
  auto opt = *arg_to_options[arg];
  if (opt.has_value_from_env()) {
@@ -252,23 +263,26 @@ static bool gpt_params_parse_ex(int argc, char ** argv, gpt_params_context & ctx
  continue;
  }
  } catch (std::exception & e) {
- throw std::invalid_argument(format(
+ throw std::invalid_argument(string_format(
  "error while handling argument \"%s\": %s\n\n"
  "usage:\n%s\n\nto show complete usage, run with -h",
  arg.c_str(), e.what(), arg_to_options[arg]->to_string().c_str()));
  }
  }

- postprocess_cpu_params(params.cpuparams, nullptr);
+ postprocess_cpu_params(params.cpuparams, nullptr);
  postprocess_cpu_params(params.cpuparams_batch, &params.cpuparams);
- postprocess_cpu_params(params.draft_cpuparams, &params.cpuparams);
- postprocess_cpu_params(params.draft_cpuparams_batch, &params.cpuparams_batch);
+
+ postprocess_cpu_params(params.speculative.cpuparams, &params.cpuparams);
+ postprocess_cpu_params(params.speculative.cpuparams_batch, &params.cpuparams_batch);

  if (params.prompt_cache_all && (params.interactive || params.interactive_first)) {
  throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n");
  }

- gpt_params_handle_model_default(params);
+ // TODO: refactor model params in a common struct
+ common_params_handle_model_default(params.model, params.model_url, params.hf_repo, params.hf_file);
+ common_params_handle_model_default(params.vocoder.model, params.vocoder.model_url, params.vocoder.hf_repo, params.vocoder.hf_file);

  if (params.escape) {
  string_process_escapes(params.prompt);
@@ -277,6 +291,9 @@ static bool gpt_params_parse_ex(int argc, char ** argv, gpt_params_context & ctx
  for (auto & antiprompt : params.antiprompt) {
  string_process_escapes(antiprompt);
  }
+ for (auto & seq_breaker : params.sampling.dry_sequence_breakers) {
+ string_process_escapes(seq_breaker);
+ }
  }

  if (!params.kv_overrides.empty()) {
@@ -291,16 +308,16 @@ static bool gpt_params_parse_ex(int argc, char ** argv, gpt_params_context & ctx
  return true;
  }

- static void gpt_params_print_usage(gpt_params_context & ctx_arg) {
- auto print_options = [](std::vector<llama_arg *> & options) {
- for (llama_arg * opt : options) {
+ static void common_params_print_usage(common_params_context & ctx_arg) {
+ auto print_options = [](std::vector<common_arg *> & options) {
+ for (common_arg * opt : options) {
  printf("%s", opt->to_string().c_str());
  }
  };

- std::vector<llama_arg *> common_options;
- std::vector<llama_arg *> sparam_options;
- std::vector<llama_arg *> specific_options;
+ std::vector<common_arg *> common_options;
+ std::vector<common_arg *> sparam_options;
+ std::vector<common_arg *> specific_options;
  for (auto & opt : ctx_arg.options) {
  // in case multiple LLAMA_EXAMPLE_* are set, we prioritize the LLAMA_EXAMPLE_* matching current example
  if (opt.is_sparam) {
@@ -320,17 +337,38 @@ static void gpt_params_print_usage(gpt_params_context & ctx_arg) {
  print_options(specific_options);
  }

- bool gpt_params_parse(int argc, char ** argv, gpt_params & params, llama_example ex, void(*print_usage)(int, char **)) {
- auto ctx_arg = gpt_params_parser_init(params, ex, print_usage);
- const gpt_params params_org = ctx_arg.params; // the example can modify the default params
+ static std::vector<ggml_backend_dev_t> parse_device_list(const std::string & value) {
+ std::vector<ggml_backend_dev_t> devices;
+ auto dev_names = string_split<std::string>(value, ',');
+ if (dev_names.empty()) {
+ throw std::invalid_argument("no devices specified");
+ }
+ if (dev_names.size() == 1 && dev_names[0] == "none") {
+ devices.push_back(nullptr);
+ } else {
+ for (const auto & device : dev_names) {
+ auto * dev = ggml_backend_dev_by_name(device.c_str());
+ if (!dev || ggml_backend_dev_type(dev) != GGML_BACKEND_DEVICE_TYPE_GPU) {
+ throw std::invalid_argument(string_format("invalid device: %s", device.c_str()));
+ }
+ devices.push_back(dev);
+ }
+ devices.push_back(nullptr);
+ }
+ return devices;
+ }
+
+ bool common_params_parse(int argc, char ** argv, common_params & params, llama_example ex, void(*print_usage)(int, char **)) {
+ auto ctx_arg = common_params_parser_init(params, ex, print_usage);
+ const common_params params_org = ctx_arg.params; // the example can modify the default params

  try {
- if (!gpt_params_parse_ex(argc, argv, ctx_arg)) {
+ if (!common_params_parse_ex(argc, argv, ctx_arg)) {
  ctx_arg.params = params_org;
  return false;
  }
  if (ctx_arg.params.usage) {
- gpt_params_print_usage(ctx_arg);
+ common_params_print_usage(ctx_arg);
  if (ctx_arg.print_usage) {
  ctx_arg.print_usage(argc, argv);
  }
@@ -345,16 +383,31 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params, llama_example
  return true;
  }

- gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, void(*print_usage)(int, char **)) {
- gpt_params_context ctx_arg(params);
+ static std::string list_builtin_chat_templates() {
+ std::vector<const char *> supported_tmpl;
+ int32_t res = llama_chat_builtin_templates(nullptr, 0);
+ supported_tmpl.resize(res);
+ res = llama_chat_builtin_templates(supported_tmpl.data(), supported_tmpl.size());
+ std::ostringstream msg;
+ for (auto & tmpl : supported_tmpl) {
+ msg << tmpl << (&tmpl == &supported_tmpl.back() ? "" : ", ");
+ }
+ return msg.str();
+ }
+
+ common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **)) {
+ // load dynamic backends
+ ggml_backend_load_all();
+
+ common_params_context ctx_arg(params);
  ctx_arg.print_usage = print_usage;
  ctx_arg.ex = ex;

  std::string sampler_type_chars;
  std::string sampler_type_names;
- for (const auto & sampler : params.sparams.samplers) {
- sampler_type_chars += gpt_sampler_type_to_chr(sampler);
- sampler_type_names += gpt_sampler_type_to_str(sampler) + ";";
+ for (const auto & sampler : params.sampling.samplers) {
+ sampler_type_chars += common_sampler_type_to_chr(sampler);
+ sampler_type_names += common_sampler_type_to_str(sampler) + ";";
  }
  sampler_type_names.pop_back();

@@ -366,374 +419,252 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
  * - if LLAMA_EXAMPLE_* is set (other than COMMON), we only show the option in the corresponding example
  * - if both {LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_*,} are set, we will prioritize the LLAMA_EXAMPLE_* matching current example
  */
- auto add_opt = [&](llama_arg arg) {
+ auto add_opt = [&](common_arg arg) {
  if (arg.in_example(ex) || arg.in_example(LLAMA_EXAMPLE_COMMON)) {
  ctx_arg.options.push_back(std::move(arg));
  }
  };


- add_opt(llama_arg(
+ add_opt(common_arg(
  {"-h", "--help", "--usage"},
  "print usage and exit",
- [](gpt_params & params) {
+ [](common_params & params) {
  params.usage = true;
  }
  ));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"--version"},
  "show version and build info",
- [](gpt_params &) {
+ [](common_params &) {
  fprintf(stderr, "version: %d (%s)\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT);
  fprintf(stderr, "built with %s for %s\n", LLAMA_COMPILER, LLAMA_BUILD_TARGET);
  exit(0);
  }
  ));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"--verbose-prompt"},
- format("print a verbose prompt before generation (default: %s)", params.verbose_prompt ? "true" : "false"),
- [](gpt_params & params) {
+ string_format("print a verbose prompt before generation (default: %s)", params.verbose_prompt ? "true" : "false"),
+ [](common_params & params) {
  params.verbose_prompt = true;
  }
  ));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"--no-display-prompt"},
- format("don't print prompt at generation (default: %s)", !params.display_prompt ? "true" : "false"),
- [](gpt_params & params) {
+ string_format("don't print prompt at generation (default: %s)", !params.display_prompt ? "true" : "false"),
+ [](common_params & params) {
  params.display_prompt = false;
  }
  ).set_examples({LLAMA_EXAMPLE_MAIN}));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"-co", "--color"},
- format("colorise output to distinguish prompt and user input from generations (default: %s)", params.use_color ? "true" : "false"),
- [](gpt_params & params) {
+ string_format("colorise output to distinguish prompt and user input from generations (default: %s)", params.use_color ? "true" : "false"),
+ [](common_params & params) {
  params.use_color = true;
  }
  ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL, LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP}));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"-t", "--threads"}, "N",
- format("number of threads to use during generation (default: %d)", params.cpuparams.n_threads),
- [](gpt_params & params, int value) {
+ string_format("number of threads to use during generation (default: %d)", params.cpuparams.n_threads),
+ [](common_params & params, int value) {
  params.cpuparams.n_threads = value;
  if (params.cpuparams.n_threads <= 0) {
  params.cpuparams.n_threads = std::thread::hardware_concurrency();
  }
  }
  ).set_env("LLAMA_ARG_THREADS"));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"-tb", "--threads-batch"}, "N",
  "number of threads to use during batch and prompt processing (default: same as --threads)",
- [](gpt_params & params, int value) {
+ [](common_params & params, int value) {
  params.cpuparams_batch.n_threads = value;
  if (params.cpuparams_batch.n_threads <= 0) {
  params.cpuparams_batch.n_threads = std::thread::hardware_concurrency();
  }
  }
  ));
- add_opt(llama_arg(
- {"-td", "--threads-draft"}, "N",
- "number of threads to use during generation (default: same as --threads)",
- [](gpt_params & params, int value) {
- params.draft_cpuparams.n_threads = value;
- if (params.draft_cpuparams.n_threads <= 0) {
- params.draft_cpuparams.n_threads = std::thread::hardware_concurrency();
- }
- }
- ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
- add_opt(llama_arg(
- {"-tbd", "--threads-batch-draft"}, "N",
- "number of threads to use during batch and prompt processing (default: same as --threads-draft)",
- [](gpt_params & params, int value) {
- params.draft_cpuparams_batch.n_threads = value;
- if (params.draft_cpuparams_batch.n_threads <= 0) {
- params.draft_cpuparams_batch.n_threads = std::thread::hardware_concurrency();
- }
- }
- ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"-C", "--cpu-mask"}, "M",
  "CPU affinity mask: arbitrarily long hex. Complements cpu-range (default: \"\")",
- [](gpt_params & params, const std::string & mask) {
+ [](common_params & params, const std::string & mask) {
  params.cpuparams.mask_valid = true;
  if (!parse_cpu_mask(mask, params.cpuparams.cpumask)) {
  throw std::invalid_argument("invalid cpumask");
  }
  }
  ));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"-Cr", "--cpu-range"}, "lo-hi",
  "range of CPUs for affinity. Complements --cpu-mask",
- [](gpt_params & params, const std::string & range) {
+ [](common_params & params, const std::string & range) {
  params.cpuparams.mask_valid = true;
  if (!parse_cpu_range(range, params.cpuparams.cpumask)) {
  throw std::invalid_argument("invalid range");
  }
  }
  ));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"--cpu-strict"}, "<0|1>",
- format("use strict CPU placement (default: %u)\n", (unsigned) params.cpuparams.strict_cpu),
- [](gpt_params & params, const std::string & value) {
+ string_format("use strict CPU placement (default: %u)\n", (unsigned) params.cpuparams.strict_cpu),
+ [](common_params & params, const std::string & value) {
  params.cpuparams.strict_cpu = std::stoul(value);
  }
  ));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"--prio"}, "N",
- format("set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.cpuparams.priority),
- [](gpt_params & params, int prio) {
+ string_format("set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.cpuparams.priority),
+ [](common_params & params, int prio) {
  if (prio < 0 || prio > 3) {
  throw std::invalid_argument("invalid value");
  }
  params.cpuparams.priority = (enum ggml_sched_priority) prio;
  }
  ));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"--poll"}, "<0...100>",
- format("use polling level to wait for work (0 - no polling, default: %u)\n", (unsigned) params.cpuparams.poll),
- [](gpt_params & params, const std::string & value) {
+ string_format("use polling level to wait for work (0 - no polling, default: %u)\n", (unsigned) params.cpuparams.poll),
+ [](common_params & params, const std::string & value) {
  params.cpuparams.poll = std::stoul(value);
  }
  ));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"-Cb", "--cpu-mask-batch"}, "M",
  "CPU affinity mask: arbitrarily long hex. Complements cpu-range-batch (default: same as --cpu-mask)",
- [](gpt_params & params, const std::string & mask) {
+ [](common_params & params, const std::string & mask) {
  params.cpuparams_batch.mask_valid = true;
  if (!parse_cpu_mask(mask, params.cpuparams_batch.cpumask)) {
  throw std::invalid_argument("invalid cpumask");
  }
  }
  ));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"-Crb", "--cpu-range-batch"}, "lo-hi",
  "ranges of CPUs for affinity. Complements --cpu-mask-batch",
- [](gpt_params & params, const std::string & range) {
+ [](common_params & params, const std::string & range) {
  params.cpuparams_batch.mask_valid = true;
  if (!parse_cpu_range(range, params.cpuparams_batch.cpumask)) {
  throw std::invalid_argument("invalid range");
  }
  }
  ));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"--cpu-strict-batch"}, "<0|1>",
  "use strict CPU placement (default: same as --cpu-strict)",
- [](gpt_params & params, int value) {
+ [](common_params & params, int value) {
  params.cpuparams_batch.strict_cpu = value;
  }
  ));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"--prio-batch"}, "N",
- format("set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.cpuparams_batch.priority),
- [](gpt_params & params, int prio) {
+ string_format("set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.cpuparams_batch.priority),
+ [](common_params & params, int prio) {
  if (prio < 0 || prio > 3) {
  throw std::invalid_argument("invalid value");
  }
  params.cpuparams_batch.priority = (enum ggml_sched_priority) prio;
  }
  ));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"--poll-batch"}, "<0|1>",
  "use polling to wait for work (default: same as --poll)",
- [](gpt_params & params, int value) {
+ [](common_params & params, int value) {
  params.cpuparams_batch.poll = value;
  }
  ));
- add_opt(llama_arg(
- {"-Cd", "--cpu-mask-draft"}, "M",
- "Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask)",
- [](gpt_params & params, const std::string & mask) {
- params.draft_cpuparams.mask_valid = true;
- if (!parse_cpu_mask(mask, params.draft_cpuparams.cpumask)) {
- throw std::invalid_argument("invalid cpumask");
- }
- }
- ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
- add_opt(llama_arg(
- {"-Crd", "--cpu-range-draft"}, "lo-hi",
- "Ranges of CPUs for affinity. Complements --cpu-mask-draft",
- [](gpt_params & params, const std::string & range) {
- params.draft_cpuparams.mask_valid = true;
- if (!parse_cpu_range(range, params.draft_cpuparams.cpumask)) {
- throw std::invalid_argument("invalid range");
- }
- }
- ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
- add_opt(llama_arg(
- {"--cpu-strict-draft"}, "<0|1>",
- "Use strict CPU placement for draft model (default: same as --cpu-strict)",
- [](gpt_params & params, int value) {
- params.draft_cpuparams.strict_cpu = value;
- }
- ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
- add_opt(llama_arg(
- {"--prio-draft"}, "N",
- format("set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.draft_cpuparams.priority),
- [](gpt_params & params, int prio) {
- if (prio < 0 || prio > 3) {
- throw std::invalid_argument("invalid value");
- }
- params.draft_cpuparams.priority = (enum ggml_sched_priority) prio;
- }
- ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
- add_opt(llama_arg(
- {"--poll-draft"}, "<0|1>",
- "Use polling to wait for draft model work (default: same as --poll])",
- [](gpt_params & params, int value) {
- params.draft_cpuparams.poll = value;
- }
- ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
- add_opt(llama_arg(
- {"-Cbd", "--cpu-mask-batch-draft"}, "M",
- "Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask)",
- [](gpt_params & params, const std::string & mask) {
- params.draft_cpuparams_batch.mask_valid = true;
- if (!parse_cpu_mask(mask, params.draft_cpuparams_batch.cpumask)) {
- throw std::invalid_argument("invalid cpumask");
- }
- }
- ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
- add_opt(llama_arg(
- {"-Crbd", "--cpu-range-batch-draft"}, "lo-hi",
- "Ranges of CPUs for affinity. Complements --cpu-mask-draft-batch)",
- [](gpt_params & params, const std::string & range) {
- params.draft_cpuparams_batch.mask_valid = true;
- if (!parse_cpu_range(range, params.draft_cpuparams_batch.cpumask)) {
- throw std::invalid_argument("invalid cpumask");
- }
- }
- ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
- add_opt(llama_arg(
- {"--cpu-strict-batch-draft"}, "<0|1>",
- "Use strict CPU placement for draft model (default: --cpu-strict-draft)",
- [](gpt_params & params, int value) {
- params.draft_cpuparams_batch.strict_cpu = value;
- }
- ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
- add_opt(llama_arg(
- {"--prio-batch-draft"}, "N",
- format("set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.draft_cpuparams_batch.priority),
- [](gpt_params & params, int prio) {
- if (prio < 0 || prio > 3) {
- throw std::invalid_argument("invalid value");
- }
- params.draft_cpuparams_batch.priority = (enum ggml_sched_priority) prio;
- }
- ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
- add_opt(llama_arg(
- {"--poll-batch-draft"}, "<0|1>",
- "Use polling to wait for draft model work (default: --poll-draft)",
- [](gpt_params & params, int value) {
- params.draft_cpuparams_batch.poll = value;
- }
- ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
- add_opt(llama_arg(
- {"--draft"}, "N",
- format("number of tokens to draft for speculative decoding (default: %d)", params.n_draft),
- [](gpt_params & params, int value) {
- params.n_draft = value;
- }
- ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP}));
- add_opt(llama_arg(
- {"-ps", "--p-split"}, "N",
- format("speculative decoding split probability (default: %.1f)", (double)params.p_split),
- [](gpt_params & params, const std::string & value) {
- params.p_split = std::stof(value);
- }
- ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"-lcs", "--lookup-cache-static"}, "FNAME",
  "path to static lookup cache to use for lookup decoding (not updated by generation)",
- [](gpt_params & params, const std::string & value) {
+ [](common_params & params, const std::string & value) {
  params.lookup_cache_static = value;
  }
  ).set_examples({LLAMA_EXAMPLE_LOOKUP}));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"-lcd", "--lookup-cache-dynamic"}, "FNAME",
  "path to dynamic lookup cache to use for lookup decoding (updated by generation)",
- [](gpt_params & params, const std::string & value) {
+ [](common_params & params, const std::string & value) {
  params.lookup_cache_dynamic = value;
  }
  ).set_examples({LLAMA_EXAMPLE_LOOKUP}));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"-c", "--ctx-size"}, "N",
- format("size of the prompt context (default: %d, 0 = loaded from model)", params.n_ctx),
- [](gpt_params & params, int value) {
+ string_format("size of the prompt context (default: %d, 0 = loaded from model)", params.n_ctx),
+ [](common_params & params, int value) {
  params.n_ctx = value;
  }
  ).set_env("LLAMA_ARG_CTX_SIZE"));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"-n", "--predict", "--n-predict"}, "N",
- format("number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)", params.n_predict),
- [](gpt_params & params, int value) {
+ string_format("number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)", params.n_predict),
+ [](common_params & params, int value) {
  params.n_predict = value;
  }
  ).set_env("LLAMA_ARG_N_PREDICT"));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"-b", "--batch-size"}, "N",
- format("logical maximum batch size (default: %d)", params.n_batch),
- [](gpt_params & params, int value) {
+ string_format("logical maximum batch size (default: %d)", params.n_batch),
+ [](common_params & params, int value) {
  params.n_batch = value;
  }
  ).set_env("LLAMA_ARG_BATCH"));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"-ub", "--ubatch-size"}, "N",
- format("physical maximum batch size (default: %d)", params.n_ubatch),
- [](gpt_params & params, int value) {
+ string_format("physical maximum batch size (default: %d)", params.n_ubatch),
+ [](common_params & params, int value) {
  params.n_ubatch = value;
  }
  ).set_env("LLAMA_ARG_UBATCH"));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"--keep"}, "N",
- format("number of tokens to keep from the initial prompt (default: %d, -1 = all)", params.n_keep),
- [](gpt_params & params, int value) {
+ string_format("number of tokens to keep from the initial prompt (default: %d, -1 = all)", params.n_keep),
+ [](common_params & params, int value) {
  params.n_keep = value;
  }
  ));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"--no-context-shift"},
- format("disables context shift on inifinite text generation (default: %s)", params.ctx_shift ? "disabled" : "enabled"),
- [](gpt_params & params) {
+ string_format("disables context shift on inifinite text generation (default: %s)", params.ctx_shift ? "disabled" : "enabled"),
+ [](common_params & params) {
  params.ctx_shift = false;
  }
- ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_CONTEXT_SHIFT"));
- add_opt(llama_arg(
+ ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY}).set_env("LLAMA_ARG_NO_CONTEXT_SHIFT"));
+ add_opt(common_arg(
  {"--chunks"}, "N",
- format("max number of chunks to process (default: %d, -1 = all)", params.n_chunks),
- [](gpt_params & params, int value) {
+ string_format("max number of chunks to process (default: %d, -1 = all)", params.n_chunks),
+ [](common_params & params, int value) {
  params.n_chunks = value;
  }
  ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_RETRIEVAL}));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"-fa", "--flash-attn"},
- format("enable Flash Attention (default: %s)", params.flash_attn ? "enabled" : "disabled"),
- [](gpt_params & params) {
+ string_format("enable Flash Attention (default: %s)", params.flash_attn ? "enabled" : "disabled"),
+ [](common_params & params) {
  params.flash_attn = true;
  }
  ).set_env("LLAMA_ARG_FLASH_ATTN"));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"-p", "--prompt"}, "PROMPT",
  ex == LLAMA_EXAMPLE_MAIN
  ? "prompt to start generation with\nif -cnv is set, this will be used as system prompt"
  : "prompt to start generation with",
- [](gpt_params & params, const std::string & value) {
+ [](common_params & params, const std::string & value) {
  params.prompt = value;
  }
  ));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"--no-perf"},
- format("disable internal libllama performance timings (default: %s)", params.no_perf ? "true" : "false"),
- [](gpt_params & params) {
+ string_format("disable internal libllama performance timings (default: %s)", params.no_perf ? "true" : "false"),
+ [](common_params & params) {
  params.no_perf = true;
- params.sparams.no_perf = true;
+ params.sampling.no_perf = true;
  }
  ).set_env("LLAMA_ARG_NO_PERF"));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"-f", "--file"}, "FNAME",
  "a file containing the prompt (default: none)",
- [](gpt_params & params, const std::string & value) {
+ [](common_params & params, const std::string & value) {
  std::ifstream file(value);
  if (!file) {
- throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str()));
+ throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str()));
  }
  // store the external file name in params
  params.prompt_file = value;
@@ -743,24 +674,24 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
  }
  }
  ));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"--in-file"}, "FNAME",
  "an input file (repeat to specify multiple files)",
- [](gpt_params & params, const std::string & value) {
+ [](common_params & params, const std::string & value) {
  std::ifstream file(value);
  if (!file) {
- throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str()));
+ throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str()));
  }
  params.in_files.push_back(value);
  }
  ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"-bf", "--binary-file"}, "FNAME",
  "binary file containing the prompt (default: none)",
- [](gpt_params & params, const std::string & value) {
+ [](common_params & params, const std::string & value) {
  std::ifstream file(value, std::ios::binary);
  if (!file) {
- throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str()));
+ throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str()));
  }
  // store the external file name in params
  params.prompt_file = value;
@@ -770,287 +701,351 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
  fprintf(stderr, "Read %zu bytes from binary file %s\n", params.prompt.size(), value.c_str());
  }
  ));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"-e", "--escape"},
- format("process escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\) (default: %s)", params.escape ? "true" : "false"),
- [](gpt_params & params) {
+ string_format("process escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\) (default: %s)", params.escape ? "true" : "false"),
+ [](common_params & params) {
  params.escape = true;
  }
  ));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"--no-escape"},
  "do not process escape sequences",
- [](gpt_params & params) {
+ [](common_params & params) {
  params.escape = false;
  }
  ));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"-ptc", "--print-token-count"}, "N",
- format("print token count every N tokens (default: %d)", params.n_print),
- [](gpt_params & params, int value) {
+ string_format("print token count every N tokens (default: %d)", params.n_print),
+ [](common_params & params, int value) {
  params.n_print = value;
  }
  ).set_examples({LLAMA_EXAMPLE_MAIN}));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"--prompt-cache"}, "FNAME",
  "file to cache prompt state for faster startup (default: none)",
- [](gpt_params & params, const std::string & value) {
+ [](common_params & params, const std::string & value) {
  params.path_prompt_cache = value;
  }
  ).set_examples({LLAMA_EXAMPLE_MAIN}));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"--prompt-cache-all"},
  "if specified, saves user input and generations to cache as well\n",
- [](gpt_params & params) {
+ [](common_params & params) {
  params.prompt_cache_all = true;
  }
  ).set_examples({LLAMA_EXAMPLE_MAIN}));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"--prompt-cache-ro"},
  "if specified, uses the prompt cache but does not update it",
- [](gpt_params & params) {
+ [](common_params & params) {
  params.prompt_cache_ro = true;
  }
  ).set_examples({LLAMA_EXAMPLE_MAIN}));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"-r", "--reverse-prompt"}, "PROMPT",
  "halt generation at PROMPT, return control in interactive mode\n",
- [](gpt_params & params, const std::string & value) {
+ [](common_params & params, const std::string & value) {
  params.antiprompt.emplace_back(value);
  }
  ).set_examples({LLAMA_EXAMPLE_MAIN}));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"-sp", "--special"},
- format("special tokens output enabled (default: %s)", params.special ? "true" : "false"),
- [](gpt_params & params) {
+ string_format("special tokens output enabled (default: %s)", params.special ? "true" : "false"),
+ [](common_params & params) {
  params.special = true;
  }
  ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"-cnv", "--conversation"},
- format(
+ string_format(
  "run in conversation mode:\n"
  "- does not print special tokens and suffix/prefix\n"
  "- interactive mode is also enabled\n"
  "(default: %s)",
  params.conversation ? "true" : "false"
  ),
- [](gpt_params & params) {
+ [](common_params & params) {
  params.conversation = true;
  }
  ).set_examples({LLAMA_EXAMPLE_MAIN}));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"-i", "--interactive"},
- format("run in interactive mode (default: %s)", params.interactive ? "true" : "false"),
- [](gpt_params & params) {
+ string_format("run in interactive mode (default: %s)", params.interactive ? "true" : "false"),
+ [](common_params & params) {
  params.interactive = true;
  }
  ).set_examples({LLAMA_EXAMPLE_MAIN}));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"-if", "--interactive-first"},
- format("run in interactive mode and wait for input right away (default: %s)", params.interactive_first ? "true" : "false"),
- [](gpt_params & params) {
+ string_format("run in interactive mode and wait for input right away (default: %s)", params.interactive_first ? "true" : "false"),
+ [](common_params & params) {
  params.interactive_first = true;
  }
  ).set_examples({LLAMA_EXAMPLE_MAIN}));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"-mli", "--multiline-input"},
  "allows you to write or paste multiple lines without ending each in '\\'",
- [](gpt_params & params) {
+ [](common_params & params) {
  params.multiline_input = true;
  }
  ).set_examples({LLAMA_EXAMPLE_MAIN}));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"--in-prefix-bos"},
  "prefix BOS to user inputs, preceding the `--in-prefix` string",
- [](gpt_params & params) {
+ [](common_params & params) {
  params.input_prefix_bos = true;
  params.enable_chat_template = false;
  }
  ).set_examples({LLAMA_EXAMPLE_MAIN}));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"--in-prefix"}, "STRING",
  "string to prefix user inputs with (default: empty)",
- [](gpt_params & params, const std::string & value) {
+ [](common_params & params, const std::string & value) {
  params.input_prefix = value;
  params.enable_chat_template = false;
  }
  ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL}));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"--in-suffix"}, "STRING",
  "string to suffix after user inputs with (default: empty)",
- [](gpt_params & params, const std::string & value) {
+ [](common_params & params, const std::string & value) {
  params.input_suffix = value;
  params.enable_chat_template = false;
  }
  ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL}));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"--no-warmup"},
  "skip warming up the model with an empty run",
- [](gpt_params & params) {
+ [](common_params & params) {
  params.warmup = false;
  }
- ).set_examples({LLAMA_EXAMPLE_MAIN}));
- add_opt(llama_arg(
+ ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}));
+ add_opt(common_arg(
  {"--spm-infill"},
- format(
+ string_format(
  "use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: %s)",
  params.spm_infill ? "enabled" : "disabled"
  ),
- [](gpt_params & params) {
+ [](common_params & params) {
  params.spm_infill = true;
  }
  ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_INFILL}));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"--samplers"}, "SAMPLERS",
- format("samplers that will be used for generation in the order, separated by \';\'\n(default: %s)", sampler_type_names.c_str()),
- [](gpt_params & params, const std::string & value) {
- const auto sampler_names = string_split(value, ';');
- params.sparams.samplers = gpt_sampler_types_from_names(sampler_names, true);
+ string_format("samplers that will be used for generation in the order, separated by \';\'\n(default: %s)", sampler_type_names.c_str()),
+ [](common_params & params, const std::string & value) {
+ const auto sampler_names = string_split<std::string>(value, ';');
+ params.sampling.samplers = common_sampler_types_from_names(sampler_names, true);
  }
  ).set_sparam());
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"-s", "--seed"}, "SEED",
- format("RNG seed (default: %d, use random seed for %d)", params.sparams.seed, LLAMA_DEFAULT_SEED),
- [](gpt_params & params, const std::string & value) {
- params.sparams.seed = std::stoul(value);
+ string_format("RNG seed (default: %d, use random seed for %d)", params.sampling.seed, LLAMA_DEFAULT_SEED),
+ [](common_params & params, const std::string & value) {
+ params.sampling.seed = std::stoul(value);
  }
  ).set_sparam());
- add_opt(llama_arg(
- {"--sampling-seq"}, "SEQUENCE",
- format("simplified sequence for samplers that will be used (default: %s)", sampler_type_chars.c_str()),
- [](gpt_params & params, const std::string & value) {
- params.sparams.samplers = gpt_sampler_types_from_chars(value);
+ add_opt(common_arg(
+ {"--sampling-seq", "--sampler-seq"}, "SEQUENCE",
+ string_format("simplified sequence for samplers that will be used (default: %s)", sampler_type_chars.c_str()),
+ [](common_params & params, const std::string & value) {
+ params.sampling.samplers = common_sampler_types_from_chars(value);
  }
  ).set_sparam());
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"--ignore-eos"},
  "ignore end of stream token and continue generating (implies --logit-bias EOS-inf)",
- [](gpt_params & params) {
- params.sparams.ignore_eos = true;
- }
- ).set_sparam());
- add_opt(llama_arg(
- {"--penalize-nl"},
- format("penalize newline tokens (default: %s)", params.sparams.penalize_nl ? "true" : "false"),
- [](gpt_params & params) {
- params.sparams.penalize_nl = true;
+ [](common_params & params) {
+ params.sampling.ignore_eos = true;
  }
  ).set_sparam());
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"--temp"}, "N",
- format("temperature (default: %.1f)", (double)params.sparams.temp),
- [](gpt_params & params, const std::string & value) {
- params.sparams.temp = std::stof(value);
- params.sparams.temp = std::max(params.sparams.temp, 0.0f);
+ string_format("temperature (default: %.1f)", (double)params.sampling.temp),
+ [](common_params & params, const std::string & value) {
+ params.sampling.temp = std::stof(value);
+ params.sampling.temp = std::max(params.sampling.temp, 0.0f);
  }
  ).set_sparam());
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"--top-k"}, "N",
- format("top-k sampling (default: %d, 0 = disabled)", params.sparams.top_k),
- [](gpt_params & params, int value) {
- params.sparams.top_k = value;
+ string_format("top-k sampling (default: %d, 0 = disabled)", params.sampling.top_k),
+ [](common_params & params, int value) {
+ params.sampling.top_k = value;
  }
  ).set_sparam());
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"--top-p"}, "N",
- format("top-p sampling (default: %.1f, 1.0 = disabled)", (double)params.sparams.top_p),
- [](gpt_params & params, const std::string & value) {
- params.sparams.top_p = std::stof(value);
+ string_format("top-p sampling (default: %.1f, 1.0 = disabled)", (double)params.sampling.top_p),
+ [](common_params & params, const std::string & value) {
+ params.sampling.top_p = std::stof(value);
  }
  ).set_sparam());
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"--min-p"}, "N",
- format("min-p sampling (default: %.1f, 0.0 = disabled)", (double)params.sparams.min_p),
- [](gpt_params & params, const std::string & value) {
- params.sparams.min_p = std::stof(value);
+ string_format("min-p sampling (default: %.1f, 0.0 = disabled)", (double)params.sampling.min_p),
+ [](common_params & params, const std::string & value) {
+ params.sampling.min_p = std::stof(value);
  }
  ).set_sparam());
- add_opt(llama_arg(
- {"--tfs"}, "N",
- format("tail free sampling, parameter z (default: %.1f, 1.0 = disabled)", (double)params.sparams.tfs_z),
- [](gpt_params & params, const std::string & value) {
- params.sparams.tfs_z = std::stof(value);
+ add_opt(common_arg(
+ {"--xtc-probability"}, "N",
+ string_format("xtc probability (default: %.1f, 0.0 = disabled)", (double)params.sampling.xtc_probability),
+ [](common_params & params, const std::string & value) {
+ params.sampling.xtc_probability = std::stof(value);
  }
  ).set_sparam());
- add_opt(llama_arg(
+ add_opt(common_arg(
+ {"--xtc-threshold"}, "N",
+ string_format("xtc threshold (default: %.1f, 1.0 = disabled)", (double)params.sampling.xtc_threshold),
+ [](common_params & params, const std::string & value) {
+ params.sampling.xtc_threshold = std::stof(value);
+ }
+ ).set_sparam());
+ add_opt(common_arg(
  {"--typical"}, "N",
- format("locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)", (double)params.sparams.typ_p),
- [](gpt_params & params, const std::string & value) {
- params.sparams.typ_p = std::stof(value);
+ string_format("locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)", (double)params.sampling.typ_p),
+ [](common_params & params, const std::string & value) {
+ params.sampling.typ_p = std::stof(value);
  }
  ).set_sparam());
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"--repeat-last-n"}, "N",
- format("last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)", params.sparams.penalty_last_n),
- [](gpt_params & params, int value) {
- params.sparams.penalty_last_n = value;
- params.sparams.n_prev = std::max(params.sparams.n_prev, params.sparams.penalty_last_n);
+ string_format("last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)", params.sampling.penalty_last_n),
+ [](common_params & params, int value) {
+ if (value < -1) {
+ throw std::runtime_error(string_format("error: invalid repeat-last-n = %d\n", value));
+ }
+ params.sampling.penalty_last_n = value;
+ params.sampling.n_prev = std::max(params.sampling.n_prev, params.sampling.penalty_last_n);
  }
  ).set_sparam());
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"--repeat-penalty"}, "N",
- format("penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)", (double)params.sparams.penalty_repeat),
- [](gpt_params & params, const std::string & value) {
- params.sparams.penalty_repeat = std::stof(value);
+ string_format("penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)", (double)params.sampling.penalty_repeat),
+ [](common_params & params, const std::string & value) {
+ params.sampling.penalty_repeat = std::stof(value);
  }
  ).set_sparam());
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"--presence-penalty"}, "N",
- format("repeat alpha presence penalty (default: %.1f, 0.0 = disabled)", (double)params.sparams.penalty_present),
- [](gpt_params & params, const std::string & value) {
- params.sparams.penalty_present = std::stof(value);
+ string_format("repeat alpha presence penalty (default: %.1f, 0.0 = disabled)", (double)params.sampling.penalty_present),
+ [](common_params & params, const std::string & value) {
+ params.sampling.penalty_present = std::stof(value);
  }
  ).set_sparam());
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"--frequency-penalty"}, "N",
- format("repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)", (double)params.sparams.penalty_freq),
- [](gpt_params & params, const std::string & value) {
- params.sparams.penalty_freq = std::stof(value);
+ string_format("repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)", (double)params.sampling.penalty_freq),
+ [](common_params & params, const std::string & value) {
+ params.sampling.penalty_freq = std::stof(value);
+ }
+ ).set_sparam());
+ add_opt(common_arg(
+ {"--dry-multiplier"}, "N",
+ string_format("set DRY sampling multiplier (default: %.1f, 0.0 = disabled)", (double)params.sampling.dry_multiplier),
+ [](common_params & params, const std::string & value) {
+ params.sampling.dry_multiplier = std::stof(value);
+ }
+ ).set_sparam());
+ add_opt(common_arg(
+ {"--dry-base"}, "N",
+ string_format("set DRY sampling base value (default: %.2f)", (double)params.sampling.dry_base),
+ [](common_params & params, const std::string & value) {
+ float potential_base = std::stof(value);
+ if (potential_base >= 1.0f)
+ {
+ params.sampling.dry_base = potential_base;
+ }
+ }
+ ).set_sparam());
+ add_opt(common_arg(
+ {"--dry-allowed-length"}, "N",
+ string_format("set allowed length for DRY sampling (default: %d)", params.sampling.dry_allowed_length),
+ [](common_params & params, int value) {
+ params.sampling.dry_allowed_length = value;
+ }
+ ).set_sparam());
+ add_opt(common_arg(
+ {"--dry-penalty-last-n"}, "N",
+ string_format("set DRY penalty for the last n tokens (default: %d, 0 = disable, -1 = context size)", params.sampling.dry_penalty_last_n),
+ [](common_params & params, int value) {
+ if (value < -1) {
+ throw std::runtime_error(string_format("error: invalid dry-penalty-last-n = %d\n", value));
+ }
+ params.sampling.dry_penalty_last_n = value;
+ }
+ ).set_sparam());
+ add_opt(common_arg(
+ {"--dry-sequence-breaker"}, "STRING",
+ string_format("add sequence breaker for DRY sampling, clearing out default breakers (%s) in the process; use \"none\" to not use any sequence breakers\n",
+ params.sampling.dry_sequence_breakers.empty() ? "none" :
+ std::accumulate(std::next(params.sampling.dry_sequence_breakers.begin()),
+ params.sampling.dry_sequence_breakers.end(),
+ std::string("'") + (params.sampling.dry_sequence_breakers[0] == "\n" ? "\\n" : params.sampling.dry_sequence_breakers[0]) + "'",
+ [](const std::string& a, const std::string& b) {
+ std::string formatted_b = (b == "\n") ? "\\n" : b;
+ return a + ", '" + formatted_b + "'";
+ }).c_str()),
+ [](common_params & params, const std::string & value) {
+ static bool defaults_cleared = false;
+
+ if (!defaults_cleared) {
+ params.sampling.dry_sequence_breakers.clear();
+ defaults_cleared = true;
+ }
+
+ if (value == "none") {
+ params.sampling.dry_sequence_breakers.clear();
+ } else {
+ params.sampling.dry_sequence_breakers.emplace_back(value);
+ }
  }
  ).set_sparam());
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"--dynatemp-range"}, "N",
- format("dynamic temperature range (default: %.1f, 0.0 = disabled)", (double)params.sparams.dynatemp_range),
- [](gpt_params & params, const std::string & value) {
- params.sparams.dynatemp_range = std::stof(value);
+ string_format("dynamic temperature range (default: %.1f, 0.0 = disabled)", (double)params.sampling.dynatemp_range),
+ [](common_params & params, const std::string & value) {
+ params.sampling.dynatemp_range = std::stof(value);
  }
  ).set_sparam());
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"--dynatemp-exp"}, "N",
- format("dynamic temperature exponent (default: %.1f)", (double)params.sparams.dynatemp_exponent),
- [](gpt_params & params, const std::string & value) {
- params.sparams.dynatemp_exponent = std::stof(value);
+ string_format("dynamic temperature exponent (default: %.1f)", (double)params.sampling.dynatemp_exponent),
+ [](common_params & params, const std::string & value) {
+ params.sampling.dynatemp_exponent = std::stof(value);
  }
  ).set_sparam());
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"--mirostat"}, "N",
- format("use Mirostat sampling.\nTop K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n"
- "(default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)", params.sparams.mirostat),
- [](gpt_params & params, int value) {
- params.sparams.mirostat = value;
+ string_format("use Mirostat sampling.\nTop K, Nucleus and Locally Typical samplers are ignored if used.\n"
+ "(default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)", params.sampling.mirostat),
+ [](common_params & params, int value) {
+ params.sampling.mirostat = value;
  }
  ).set_sparam());
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"--mirostat-lr"}, "N",
- format("Mirostat learning rate, parameter eta (default: %.1f)", (double)params.sparams.mirostat_eta),
- [](gpt_params & params, const std::string & value) {
- params.sparams.mirostat_eta = std::stof(value);
+ string_format("Mirostat learning rate, parameter eta (default: %.1f)", (double)params.sampling.mirostat_eta),
+ [](common_params & params, const std::string & value) {
+ params.sampling.mirostat_eta = std::stof(value);
  }
  ).set_sparam());
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"--mirostat-ent"}, "N",
- format("Mirostat target entropy, parameter tau (default: %.1f)", (double)params.sparams.mirostat_tau),
- [](gpt_params & params, const std::string & value) {
- params.sparams.mirostat_tau = std::stof(value);
+ string_format("Mirostat target entropy, parameter tau (default: %.1f)", (double)params.sampling.mirostat_tau),
+ [](common_params & params, const std::string & value) {
+ params.sampling.mirostat_tau = std::stof(value);
  }
  ).set_sparam());
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"-l", "--logit-bias"}, "TOKEN_ID(+/-)BIAS",
  "modifies the likelihood of token appearing in the completion,\n"
  "i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',\n"
  "or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'",
- [](gpt_params & params, const std::string & value) {
+ [](common_params & params, const std::string & value) {
  std::stringstream ss(value);
  llama_token key;
  char sign;
@@ -1058,7 +1053,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
  try {
  if (ss >> key && ss >> sign && std::getline(ss, value_str) && (sign == '+' || sign == '-')) {
  const float bias = std::stof(value_str) * ((sign == '-') ? -1.0f : 1.0f);
- params.sparams.logit_bias.push_back({key, bias});
+ params.sampling.logit_bias.push_back({key, bias});
  } else {
  throw std::invalid_argument("invalid input format");
  }
@@ -1067,39 +1062,39 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
  }
  }
  ).set_sparam());
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"--grammar"}, "GRAMMAR",
- format("BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '%s')", params.sparams.grammar.c_str()),
- [](gpt_params & params, const std::string & value) {
- params.sparams.grammar = value;
+ string_format("BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '%s')", params.sampling.grammar.c_str()),
+ [](common_params & params, const std::string & value) {
+ params.sampling.grammar = value;
  }
  ).set_sparam());
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"--grammar-file"}, "FNAME",
  "file to read grammar from",
- [](gpt_params & params, const std::string & value) {
+ [](common_params & params, const std::string & value) {
  std::ifstream file(value);
  if (!file) {
- throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str()));
+ throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str()));
  }
  std::copy(
  std::istreambuf_iterator<char>(file),
  std::istreambuf_iterator<char>(),
- std::back_inserter(params.sparams.grammar)
+ std::back_inserter(params.sampling.grammar)
  );
  }
  ).set_sparam());
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"-j", "--json-schema"}, "SCHEMA",
  "JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object\nFor schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead",
- [](gpt_params & params, const std::string & value) {
- params.sparams.grammar = json_schema_to_grammar(json::parse(value));
+ [](common_params & params, const std::string & value) {
+ params.sampling.grammar = json_schema_to_grammar(json::parse(value));
  }
  ).set_sparam());
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"--pooling"}, "{none,mean,cls,last,rank}",
  "pooling type for embeddings, use model default if unspecified",
- [](gpt_params & params, const std::string & value) {
+ [](common_params & params, const std::string & value) {
  /**/ if (value == "none") { params.pooling_type = LLAMA_POOLING_TYPE_NONE; }
  else if (value == "mean") { params.pooling_type = LLAMA_POOLING_TYPE_MEAN; }
  else if (value == "cls") { params.pooling_type = LLAMA_POOLING_TYPE_CLS; }
@@ -1108,275 +1103,285 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
  else { throw std::invalid_argument("invalid value"); }
  }
  ).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_POOLING"));
- add_opt(llama_arg(
- {"--attention"}, "{causal,non,causal}",
+ add_opt(common_arg(
+ {"--attention"}, "{causal,non-causal}",
  "attention type for embeddings, use model default if unspecified",
- [](gpt_params & params, const std::string & value) {
+ [](common_params & params, const std::string & value) {
  /**/ if (value == "causal") { params.attention_type = LLAMA_ATTENTION_TYPE_CAUSAL; }
  else if (value == "non-causal") { params.attention_type = LLAMA_ATTENTION_TYPE_NON_CAUSAL; }
  else { throw std::invalid_argument("invalid value"); }
  }
  ).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"--rope-scaling"}, "{none,linear,yarn}",
  "RoPE frequency scaling method, defaults to linear unless specified by the model",
- [](gpt_params & params, const std::string & value) {
+ [](common_params & params, const std::string & value) {
  /**/ if (value == "none") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_NONE; }
  else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_LINEAR; }
  else if (value == "yarn") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_YARN; }
  else { throw std::invalid_argument("invalid value"); }
  }
  ).set_env("LLAMA_ARG_ROPE_SCALING_TYPE"));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"--rope-scale"}, "N",
  "RoPE context scaling factor, expands context by a factor of N",
- [](gpt_params & params, const std::string & value) {
+ [](common_params & params, const std::string & value) {
  params.rope_freq_scale = 1.0f / std::stof(value);
  }
  ).set_env("LLAMA_ARG_ROPE_SCALE"));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"--rope-freq-base"}, "N",
  "RoPE base frequency, used by NTK-aware scaling (default: loaded from model)",
- [](gpt_params & params, const std::string & value) {
+ [](common_params & params, const std::string & value) {
  params.rope_freq_base = std::stof(value);
  }
  ).set_env("LLAMA_ARG_ROPE_FREQ_BASE"));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"--rope-freq-scale"}, "N",
  "RoPE frequency scaling factor, expands context by a factor of 1/N",
- [](gpt_params & params, const std::string & value) {
+ [](common_params & params, const std::string & value) {
  params.rope_freq_scale = std::stof(value);
  }
  ).set_env("LLAMA_ARG_ROPE_FREQ_SCALE"));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"--yarn-orig-ctx"}, "N",
- format("YaRN: original context size of model (default: %d = model training context size)", params.yarn_orig_ctx),
- [](gpt_params & params, int value) {
+ string_format("YaRN: original context size of model (default: %d = model training context size)", params.yarn_orig_ctx),
+ [](common_params & params, int value) {
  params.yarn_orig_ctx = value;
  }
  ).set_env("LLAMA_ARG_YARN_ORIG_CTX"));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"--yarn-ext-factor"}, "N",
- format("YaRN: extrapolation mix factor (default: %.1f, 0.0 = full interpolation)", (double)params.yarn_ext_factor),
- [](gpt_params & params, const std::string & value) {
+ string_format("YaRN: extrapolation mix factor (default: %.1f, 0.0 = full interpolation)", (double)params.yarn_ext_factor),
+ [](common_params & params, const std::string & value) {
  params.yarn_ext_factor = std::stof(value);
  }
  ).set_env("LLAMA_ARG_YARN_EXT_FACTOR"));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"--yarn-attn-factor"}, "N",
- format("YaRN: scale sqrt(t) or attention magnitude (default: %.1f)", (double)params.yarn_attn_factor),
- [](gpt_params & params, const std::string & value) {
+ string_format("YaRN: scale sqrt(t) or attention magnitude (default: %.1f)", (double)params.yarn_attn_factor),
+ [](common_params & params, const std::string & value) {
  params.yarn_attn_factor = std::stof(value);
  }
  ).set_env("LLAMA_ARG_YARN_ATTN_FACTOR"));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"--yarn-beta-slow"}, "N",
- format("YaRN: high correction dim or alpha (default: %.1f)", (double)params.yarn_beta_slow),
- [](gpt_params & params, const std::string & value) {
+ string_format("YaRN: high correction dim or alpha (default: %.1f)", (double)params.yarn_beta_slow),
+ [](common_params & params, const std::string & value) {
  params.yarn_beta_slow = std::stof(value);
  }
  ).set_env("LLAMA_ARG_YARN_BETA_SLOW"));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"--yarn-beta-fast"}, "N",
- format("YaRN: low correction dim or beta (default: %.1f)", (double)params.yarn_beta_fast),
- [](gpt_params & params, const std::string & value) {
+ string_format("YaRN: low correction dim or beta (default: %.1f)", (double)params.yarn_beta_fast),
+ [](common_params & params, const std::string & value) {
  params.yarn_beta_fast = std::stof(value);
  }
  ).set_env("LLAMA_ARG_YARN_BETA_FAST"));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"-gan", "--grp-attn-n"}, "N",
- format("group-attention factor (default: %d)", params.grp_attn_n),
- [](gpt_params & params, int value) {
+ string_format("group-attention factor (default: %d)", params.grp_attn_n),
+ [](common_params & params, int value) {
  params.grp_attn_n = value;
  }
- ).set_env("LLAMA_ARG_GRP_ATTN_N"));
- add_opt(llama_arg(
+ ).set_env("LLAMA_ARG_GRP_ATTN_N").set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_PASSKEY}));
+ add_opt(common_arg(
  {"-gaw", "--grp-attn-w"}, "N",
- format("group-attention width (default: %.1f)", (double)params.grp_attn_w),
- [](gpt_params & params, int value) {
+ string_format("group-attention width (default: %d)", params.grp_attn_w),
+ [](common_params & params, int value) {
  params.grp_attn_w = value;
  }
- ).set_env("LLAMA_ARG_GRP_ATTN_W"));
- add_opt(llama_arg(
+ ).set_env("LLAMA_ARG_GRP_ATTN_W").set_examples({LLAMA_EXAMPLE_MAIN}));
+ add_opt(common_arg(
  {"-dkvc", "--dump-kv-cache"},
  "verbose print of the KV cache",
- [](gpt_params & params) {
+ [](common_params & params) {
  params.dump_kv_cache = true;
  }
  ));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"-nkvo", "--no-kv-offload"},
  "disable KV offload",
- [](gpt_params & params) {
+ [](common_params & params) {
  params.no_kv_offload = true;
  }
  ).set_env("LLAMA_ARG_NO_KV_OFFLOAD"));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"-ctk", "--cache-type-k"}, "TYPE",
- format("KV cache data type for K (default: %s)", params.cache_type_k.c_str()),
- [](gpt_params & params, const std::string & value) {
- // TODO: get the type right here
- params.cache_type_k = value;
+ string_format(
+ "KV cache data type for K\n"
+ "allowed values: %s\n"
+ "(default: %s)",
+ get_all_kv_cache_types().c_str(),
+ ggml_type_name(params.cache_type_k)
+ ),
+ [](common_params & params, const std::string & value) {
+ params.cache_type_k = kv_cache_type_from_str(value);
  }
  ).set_env("LLAMA_ARG_CACHE_TYPE_K"));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"-ctv", "--cache-type-v"}, "TYPE",
- format("KV cache data type for V (default: %s)", params.cache_type_v.c_str()),
- [](gpt_params & params, const std::string & value) {
- // TODO: get the type right here
- params.cache_type_v = value;
+ string_format(
+ "KV cache data type for V\n"
+ "allowed values: %s\n"
+ "(default: %s)",
+ get_all_kv_cache_types().c_str(),
+ ggml_type_name(params.cache_type_v)
+ ),
+ [](common_params & params, const std::string & value) {
+ params.cache_type_v = kv_cache_type_from_str(value);
  }
  ).set_env("LLAMA_ARG_CACHE_TYPE_V"));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"--perplexity", "--all-logits"},
- format("return logits for all tokens in the batch (default: %s)", params.logits_all ? "true" : "false"),
- [](gpt_params & params) {
+ string_format("return logits for all tokens in the batch (default: %s)", params.logits_all ? "true" : "false"),
+ [](common_params & params) {
  params.logits_all = true;
  }
  ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"--hellaswag"},
  "compute HellaSwag score over random tasks from datafile supplied with -f",
- [](gpt_params & params) {
+ [](common_params & params) {
  params.hellaswag = true;
  }
  ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"--hellaswag-tasks"}, "N",
- format("number of tasks to use when computing the HellaSwag score (default: %zu)", params.hellaswag_tasks),
- [](gpt_params & params, int value) {
+ string_format("number of tasks to use when computing the HellaSwag score (default: %zu)", params.hellaswag_tasks),
+ [](common_params & params, int value) {
  params.hellaswag_tasks = value;
  }
  ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"--winogrande"},
  "compute Winogrande score over random tasks from datafile supplied with -f",
- [](gpt_params & params) {
+ [](common_params & params) {
  params.winogrande = true;
  }
  ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"--winogrande-tasks"}, "N",
- format("number of tasks to use when computing the Winogrande score (default: %zu)", params.winogrande_tasks),
- [](gpt_params & params, int value) {
+ string_format("number of tasks to use when computing the Winogrande score (default: %zu)", params.winogrande_tasks),
+ [](common_params & params, int value) {
  params.winogrande_tasks = value;
  }
  ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"--multiple-choice"},
  "compute multiple choice score over random tasks from datafile supplied with -f",
- [](gpt_params & params) {
+ [](common_params & params) {
  params.multiple_choice = true;
  }
  ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"--multiple-choice-tasks"}, "N",
- format("number of tasks to use when computing the multiple choice score (default: %zu)", params.multiple_choice_tasks),
- [](gpt_params & params, int value) {
+ string_format("number of tasks to use when computing the multiple choice score (default: %zu)", params.multiple_choice_tasks),
+ [](common_params & params, int value) {
  params.multiple_choice_tasks = value;
  }
  ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"--kl-divergence"},
  "computes KL-divergence to logits provided via --kl-divergence-base",
- [](gpt_params & params) {
+ [](common_params & params) {
  params.kl_divergence = true;
  }
  ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"--save-all-logits", "--kl-divergence-base"}, "FNAME",
  "set logits file",
- [](gpt_params & params, const std::string & value) {
+ [](common_params & params, const std::string & value) {
  params.logits_file = value;
  }
  ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"--ppl-stride"}, "N",
- format("stride for perplexity calculation (default: %d)", params.ppl_stride),
- [](gpt_params & params, int value) {
+ string_format("stride for perplexity calculation (default: %d)", params.ppl_stride),
+ [](common_params & params, int value) {
  params.ppl_stride = value;
  }
  ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"--ppl-output-type"}, "<0|1>",
- format("output type for perplexity calculation (default: %d)", params.ppl_output_type),
- [](gpt_params & params, int value) {
+ string_format("output type for perplexity calculation (default: %d)", params.ppl_output_type),
+ [](common_params & params, int value) {
  params.ppl_output_type = value;
  }
  ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"-dt", "--defrag-thold"}, "N",
- format("KV cache defragmentation threshold (default: %.1f, < 0 - disabled)", (double)params.defrag_thold),
- [](gpt_params & params, const std::string & value) {
+ string_format("KV cache defragmentation threshold (default: %.1f, < 0 - disabled)", (double)params.defrag_thold),
+ [](common_params & params, const std::string & value) {
  params.defrag_thold = std::stof(value);
  }
  ).set_env("LLAMA_ARG_DEFRAG_THOLD"));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"-np", "--parallel"}, "N",
- format("number of parallel sequences to decode (default: %d)", params.n_parallel),
- [](gpt_params & params, int value) {
+ string_format("number of parallel sequences to decode (default: %d)", params.n_parallel),
+ [](common_params & params, int value) {
  params.n_parallel = value;
  }
  ).set_env("LLAMA_ARG_N_PARALLEL"));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"-ns", "--sequences"}, "N",
- format("number of sequences to decode (default: %d)", params.n_sequences),
- [](gpt_params & params, int value) {
+ string_format("number of sequences to decode (default: %d)", params.n_sequences),
+ [](common_params & params, int value) {
  params.n_sequences = value;
  }
  ).set_examples({LLAMA_EXAMPLE_PARALLEL}));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"-cb", "--cont-batching"},
- format("enable continuous batching (a.k.a dynamic batching) (default: %s)", params.cont_batching ? "enabled" : "disabled"),
- [](gpt_params & params) {
+ string_format("enable continuous batching (a.k.a dynamic batching) (default: %s)", params.cont_batching ? "enabled" : "disabled"),
+ [](common_params & params) {
  params.cont_batching = true;
  }
  ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CONT_BATCHING"));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"-nocb", "--no-cont-batching"},
  "disable continuous batching",
- [](gpt_params & params) {
+ [](common_params & params) {
  params.cont_batching = false;
  }
  ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_CONT_BATCHING"));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"--mmproj"}, "FILE",
  "path to a multimodal projector file for LLaVA. see examples/llava/README.md",
- [](gpt_params & params, const std::string & value) {
+ [](common_params & params, const std::string & value) {
  params.mmproj = value;
  }
  ).set_examples({LLAMA_EXAMPLE_LLAVA}));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"--image"}, "FILE",
  "path to an image file. use with multimodal models. Specify multiple times for batching",
- [](gpt_params & params, const std::string & value) {
+ [](common_params & params, const std::string & value) {
  params.image.emplace_back(value);
  }
  ).set_examples({LLAMA_EXAMPLE_LLAVA}));
- #ifdef GGML_USE_RPC
- add_opt(llama_arg(
- {"--rpc"}, "SERVERS",
- "comma separated list of RPC servers",
- [](gpt_params & params, const std::string & value) {
- params.rpc_servers = value;
- }
- ).set_env("LLAMA_ARG_RPC"));
- #endif
- add_opt(llama_arg(
+ if (llama_supports_rpc()) {
+ add_opt(common_arg(
+ {"--rpc"}, "SERVERS",
+ "comma separated list of RPC servers",
+ [](common_params & params, const std::string & value) {
+ params.rpc_servers = value;
+ }
+ ).set_env("LLAMA_ARG_RPC"));
+ }
+ add_opt(common_arg(
  {"--mlock"},
  "force system to keep model in RAM rather than swapping or compressing",
- [](gpt_params & params) {
+ [](common_params & params) {
  params.use_mlock = true;
  }
  ).set_env("LLAMA_ARG_MLOCK"));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"--no-mmap"},
  "do not memory-map model (slower load but may reduce pageouts if not using mlock)",
- [](gpt_params & params) {
+ [](common_params & params) {
  params.use_mmap = false;
  }
  ).set_env("LLAMA_ARG_NO_MMAP"));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"--numa"}, "TYPE",
  "attempt optimizations that help on some NUMA systems\n"
  "- distribute: spread execution evenly over all nodes\n"
@@ -1384,52 +1389,62 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
  "- numactl: use the CPU map provided by numactl\n"
  "if run without this previously, it is recommended to drop the system page cache before using this\n"
  "see https://github.com/ggerganov/llama.cpp/issues/1437",
- [](gpt_params & params, const std::string & value) {
+ [](common_params & params, const std::string & value) {
  /**/ if (value == "distribute" || value == "") { params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; }
  else if (value == "isolate") { params.numa = GGML_NUMA_STRATEGY_ISOLATE; }
  else if (value == "numactl") { params.numa = GGML_NUMA_STRATEGY_NUMACTL; }
  else { throw std::invalid_argument("invalid value"); }
  }
  ).set_env("LLAMA_ARG_NUMA"));
- add_opt(llama_arg(
+ add_opt(common_arg(
+ {"-dev", "--device"}, "<dev1,dev2,..>",
+ "comma-separated list of devices to use for offloading (none = don't offload)\n"
+ "use --list-devices to see a list of available devices",
+ [](common_params & params, const std::string & value) {
+ params.devices = parse_device_list(value);
+ }
+ ).set_env("LLAMA_ARG_DEVICE"));
+ add_opt(common_arg(
+ {"--list-devices"},
+ "print list of available devices and exit",
+ [](common_params &) {
+ printf("Available devices:\n");
+ for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
+ auto * dev = ggml_backend_dev_get(i);
+ if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_GPU) {
+ size_t free, total;
+ ggml_backend_dev_memory(dev, &free, &total);
+ printf(" %s: %s (%zu MiB, %zu MiB free)\n", ggml_backend_dev_name(dev), ggml_backend_dev_description(dev), total / 1024 / 1024, free / 1024 / 1024);
+ }
+ }
+ exit(0);
+ }
+ ));
+ add_opt(common_arg(
  {"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N",
  "number of layers to store in VRAM",
- [](gpt_params & params, int value) {
+ [](common_params & params, int value) {
  params.n_gpu_layers = value;
  if (!llama_supports_gpu_offload()) {
- fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers option will be ignored\n");
- fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
+ fprintf(stderr, "warning: no usable GPU found, --gpu-layers option will be ignored\n");
+ fprintf(stderr, "warning: one possible reason is that llama.cpp was compiled without GPU support\n");
+ fprintf(stderr, "warning: consult docs/build.md for compilation instructions\n");
  }
  }
  ).set_env("LLAMA_ARG_N_GPU_LAYERS"));
- add_opt(llama_arg(
- {"-ngld", "--gpu-layers-draft", "--n-gpu-layers-draft"}, "N",
- "number of layers to store in VRAM for the draft model",
- [](gpt_params & params, int value) {
- params.n_gpu_layers_draft = value;
- if (!llama_supports_gpu_offload()) {
- fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers-draft option will be ignored\n");
- fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
- }
- }
- ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"-sm", "--split-mode"}, "{none,layer,row}",
  "how to split the model across multiple GPUs, one of:\n"
  "- none: use one GPU only\n"
  "- layer (default): split layers and KV across GPUs\n"
  "- row: split rows across GPUs",
- [](gpt_params & params, const std::string & value) {
+ [](common_params & params, const std::string & value) {
  std::string arg_next = value;
  if (arg_next == "none") {
  params.split_mode = LLAMA_SPLIT_MODE_NONE;
  } else if (arg_next == "layer") {
  params.split_mode = LLAMA_SPLIT_MODE_LAYER;
  } else if (arg_next == "row") {
- #ifdef GGML_USE_SYCL
- fprintf(stderr, "warning: The split mode value:[row] is not supported by llama.cpp with SYCL. It's developing.\nExit!\n");
- exit(1);
- #endif // GGML_USE_SYCL
  params.split_mode = LLAMA_SPLIT_MODE_ROW;
  } else {
  throw std::invalid_argument("invalid value");
@@ -1439,10 +1454,10 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
  }
  }
  ).set_env("LLAMA_ARG_SPLIT_MODE"));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"-ts", "--tensor-split"}, "N0,N1,N2,...",
  "fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1",
- [](gpt_params & params, const std::string & value) {
+ [](common_params & params, const std::string & value) {
  std::string arg_next = value;

  // split string by , and /
@@ -1451,7 +1466,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
  std::vector<std::string> split_arg{ it, {} };
  if (split_arg.size() >= llama_max_devices()) {
  throw std::invalid_argument(
- format("got %d input configs, but system only has %d devices", (int)split_arg.size(), (int)llama_max_devices())
+ string_format("got %d input configs, but system only has %d devices", (int)split_arg.size(), (int)llama_max_devices())
  );
  }
  for (size_t i = 0; i < llama_max_devices(); ++i) {
@@ -1466,315 +1481,329 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
  }
  }
  ).set_env("LLAMA_ARG_TENSOR_SPLIT"));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"-mg", "--main-gpu"}, "INDEX",
- format("the GPU to use for the model (with split-mode = none), or for intermediate results and KV (with split-mode = row) (default: %d)", params.main_gpu),
- [](gpt_params & params, int value) {
+ string_format("the GPU to use for the model (with split-mode = none), or for intermediate results and KV (with split-mode = row) (default: %d)", params.main_gpu),
+ [](common_params & params, int value) {
  params.main_gpu = value;
  if (!llama_supports_gpu_offload()) {
  fprintf(stderr, "warning: llama.cpp was compiled without support for GPU offload. Setting the main GPU has no effect.\n");
  }
  }
  ).set_env("LLAMA_ARG_MAIN_GPU"));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"--check-tensors"},
- format("check model tensor data for invalid values (default: %s)", params.check_tensors ? "true" : "false"),
- [](gpt_params & params) {
+ string_format("check model tensor data for invalid values (default: %s)", params.check_tensors ? "true" : "false"),
+ [](common_params & params) {
  params.check_tensors = true;
  }
  ));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"--override-kv"}, "KEY=TYPE:VALUE",
  "advanced option to override model metadata by key. may be specified multiple times.\n"
  "types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false",
- [](gpt_params & params, const std::string & value) {
+ [](common_params & params, const std::string & value) {
  if (!string_parse_kv_override(value.c_str(), params.kv_overrides)) {
- throw std::runtime_error(format("error: Invalid type for KV override: %s\n", value.c_str()));
+ throw std::runtime_error(string_format("error: Invalid type for KV override: %s\n", value.c_str()));
  }
  }
  ));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"--lora"}, "FNAME",
  "path to LoRA adapter (can be repeated to use multiple adapters)",
- [](gpt_params & params, const std::string & value) {
+ [](common_params & params, const std::string & value) {
  params.lora_adapters.push_back({ std::string(value), 1.0 });
  }
  // we define this arg on both COMMON and EXPORT_LORA, so when showing help message of export-lora, it will be categorized as "example-specific" arg
  ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"--lora-scaled"}, "FNAME", "SCALE",
  "path to LoRA adapter with user defined scaling (can be repeated to use multiple adapters)",
- [](gpt_params & params, const std::string & fname, const std::string & scale) {
+ [](common_params & params, const std::string & fname, const std::string & scale) {
  params.lora_adapters.push_back({ fname, std::stof(scale) });
  }
  // we define this arg on both COMMON and EXPORT_LORA, so when showing help message of export-lora, it will be categorized as "example-specific" arg
  ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"--control-vector"}, "FNAME",
  "add a control vector\nnote: this argument can be repeated to add multiple control vectors",
- [](gpt_params & params, const std::string & value) {
+ [](common_params & params, const std::string & value) {
  params.control_vectors.push_back({ 1.0f, value, });
  }
  ));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"--control-vector-scaled"}, "FNAME", "SCALE",
  "add a control vector with user defined scaling SCALE\n"
  "note: this argument can be repeated to add multiple scaled control vectors",
- [](gpt_params & params, const std::string & fname, const std::string & scale) {
+ [](common_params & params, const std::string & fname, const std::string & scale) {
  params.control_vectors.push_back({ std::stof(scale), fname });
  }
  ));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"--control-vector-layer-range"}, "START", "END",
  "layer range to apply the control vector(s) to, start and end inclusive",
- [](gpt_params & params, const std::string & start, const std::string & end) {
+ [](common_params & params, const std::string & start, const std::string & end) {
  params.control_vector_layer_start = std::stoi(start);
  params.control_vector_layer_end = std::stoi(end);
  }
  ));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"-a", "--alias"}, "STRING",
  "set alias for model name (to be used by REST API)",
- [](gpt_params & params, const std::string & value) {
+ [](common_params & params, const std::string & value) {
  params.model_alias = value;
  }
  ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ALIAS"));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"-m", "--model"}, "FNAME",
  ex == LLAMA_EXAMPLE_EXPORT_LORA
  ? std::string("model path from which to load base model")
- : format(
+ : string_format(
  "model path (default: `models/$filename` with filename from `--hf-file` "
  "or `--model-url` if set, otherwise %s)", DEFAULT_MODEL_PATH
  ),
- [](gpt_params & params, const std::string & value) {
+ [](common_params & params, const std::string & value) {
  params.model = value;
  }
  ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}).set_env("LLAMA_ARG_MODEL"));
  ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}).set_env("LLAMA_ARG_MODEL"));
1554
- add_opt(llama_arg(
1555
- {"-md", "--model-draft"}, "FNAME",
1556
- "draft model for speculative decoding (default: unused)",
1557
- [](gpt_params & params, const std::string & value) {
1558
- params.model_draft = value;
1559
- }
1560
- ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
1561
- add_opt(llama_arg(
1569
+ add_opt(common_arg(
1562
1570
  {"-mu", "--model-url"}, "MODEL_URL",
1563
1571
  "model download url (default: unused)",
1564
- [](gpt_params & params, const std::string & value) {
1572
+ [](common_params & params, const std::string & value) {
1565
1573
  params.model_url = value;
1566
1574
  }
1567
1575
  ).set_env("LLAMA_ARG_MODEL_URL"));
1568
- add_opt(llama_arg(
1576
+ add_opt(common_arg(
1569
1577
  {"-hfr", "--hf-repo"}, "REPO",
1570
1578
  "Hugging Face model repository (default: unused)",
1571
- [](gpt_params & params, const std::string & value) {
1579
+ [](common_params & params, const std::string & value) {
1572
1580
  params.hf_repo = value;
1573
1581
  }
1574
1582
  ).set_env("LLAMA_ARG_HF_REPO"));
1575
- add_opt(llama_arg(
1583
+ add_opt(common_arg(
1576
1584
  {"-hff", "--hf-file"}, "FILE",
1577
1585
  "Hugging Face model file (default: unused)",
1578
- [](gpt_params & params, const std::string & value) {
1586
+ [](common_params & params, const std::string & value) {
1579
1587
  params.hf_file = value;
1580
1588
  }
1581
1589
  ).set_env("LLAMA_ARG_HF_FILE"));
1582
- add_opt(llama_arg(
1590
+ add_opt(common_arg(
1591
+ {"-hfrv", "--hf-repo-v"}, "REPO",
1592
+ "Hugging Face model repository for the vocoder model (default: unused)",
1593
+ [](common_params & params, const std::string & value) {
1594
+ params.vocoder.hf_repo = value;
1595
+ }
1596
+ ).set_env("LLAMA_ARG_HF_REPO_V"));
1597
+ add_opt(common_arg(
1598
+ {"-hffv", "--hf-file-v"}, "FILE",
1599
+ "Hugging Face model file for the vocoder model (default: unused)",
1600
+ [](common_params & params, const std::string & value) {
1601
+ params.vocoder.hf_file = value;
1602
+ }
1603
+ ).set_env("LLAMA_ARG_HF_FILE_V"));
1604
+ add_opt(common_arg(
1583
1605
  {"-hft", "--hf-token"}, "TOKEN",
1584
1606
  "Hugging Face access token (default: value from HF_TOKEN environment variable)",
1585
- [](gpt_params & params, const std::string & value) {
1607
+ [](common_params & params, const std::string & value) {
1586
1608
  params.hf_token = value;
1587
1609
  }
1588
1610
  ).set_env("HF_TOKEN"));
1589
- add_opt(llama_arg(
1611
+ add_opt(common_arg(
1590
1612
  {"--context-file"}, "FNAME",
1591
1613
  "file to load context from (repeat to specify multiple files)",
1592
- [](gpt_params & params, const std::string & value) {
1614
+ [](common_params & params, const std::string & value) {
1593
1615
  std::ifstream file(value, std::ios::binary);
1594
1616
  if (!file) {
1595
- throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str()));
1617
+ throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str()));
1596
1618
  }
1597
1619
  params.context_files.push_back(value);
1598
1620
  }
1599
1621
  ).set_examples({LLAMA_EXAMPLE_RETRIEVAL}));
1600
- add_opt(llama_arg(
1622
+ add_opt(common_arg(
1601
1623
  {"--chunk-size"}, "N",
1602
- format("minimum length of embedded text chunks (default: %d)", params.chunk_size),
1603
- [](gpt_params & params, int value) {
1624
+ string_format("minimum length of embedded text chunks (default: %d)", params.chunk_size),
1625
+ [](common_params & params, int value) {
1604
1626
  params.chunk_size = value;
1605
1627
  }
1606
1628
  ).set_examples({LLAMA_EXAMPLE_RETRIEVAL}));
1607
- add_opt(llama_arg(
1629
+ add_opt(common_arg(
1608
1630
  {"--chunk-separator"}, "STRING",
1609
- format("separator between chunks (default: '%s')", params.chunk_separator.c_str()),
1610
- [](gpt_params & params, const std::string & value) {
1631
+ string_format("separator between chunks (default: '%s')", params.chunk_separator.c_str()),
1632
+ [](common_params & params, const std::string & value) {
1611
1633
  params.chunk_separator = value;
1612
1634
  }
1613
1635
  ).set_examples({LLAMA_EXAMPLE_RETRIEVAL}));
1614
- add_opt(llama_arg(
1636
+ add_opt(common_arg(
1615
1637
  {"--junk"}, "N",
1616
- format("number of times to repeat the junk text (default: %d)", params.n_junk),
1617
- [](gpt_params & params, int value) {
1638
+ string_format("number of times to repeat the junk text (default: %d)", params.n_junk),
1639
+ [](common_params & params, int value) {
1618
1640
  params.n_junk = value;
1619
1641
  }
1620
1642
  ).set_examples({LLAMA_EXAMPLE_PASSKEY}));
1621
- add_opt(llama_arg(
1643
+ add_opt(common_arg(
1622
1644
  {"--pos"}, "N",
1623
- format("position of the passkey in the junk text (default: %d)", params.i_pos),
1624
- [](gpt_params & params, int value) {
1645
+ string_format("position of the passkey in the junk text (default: %d)", params.i_pos),
1646
+ [](common_params & params, int value) {
1625
1647
  params.i_pos = value;
1626
1648
  }
1627
1649
  ).set_examples({LLAMA_EXAMPLE_PASSKEY}));
1628
- add_opt(llama_arg(
1650
+ add_opt(common_arg(
1629
1651
  {"-o", "--output", "--output-file"}, "FNAME",
1630
- format("output file (default: '%s')",
1652
+ string_format("output file (default: '%s')",
1631
1653
  ex == LLAMA_EXAMPLE_EXPORT_LORA
1632
1654
  ? params.lora_outfile.c_str()
1633
1655
  : ex == LLAMA_EXAMPLE_CVECTOR_GENERATOR
1634
1656
  ? params.cvector_outfile.c_str()
1635
1657
  : params.out_file.c_str()),
1636
- [](gpt_params & params, const std::string & value) {
1658
+ [](common_params & params, const std::string & value) {
1637
1659
  params.out_file = value;
1638
1660
  params.cvector_outfile = value;
1639
1661
  params.lora_outfile = value;
1640
1662
  }
1641
1663
  ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA}));
1642
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"-ofreq", "--output-frequency"}, "N",
- format("output the imatrix every N iterations (default: %d)", params.n_out_freq),
- [](gpt_params & params, int value) {
+ string_format("output the imatrix every N iterations (default: %d)", params.n_out_freq),
+ [](common_params & params, int value) {
  params.n_out_freq = value;
  }
  ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"--save-frequency"}, "N",
- format("save an imatrix copy every N iterations (default: %d)", params.n_save_freq),
- [](gpt_params & params, int value) {
+ string_format("save an imatrix copy every N iterations (default: %d)", params.n_save_freq),
+ [](common_params & params, int value) {
  params.n_save_freq = value;
  }
  ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"--process-output"},
- format("collect data for the output tensor (default: %s)", params.process_output ? "true" : "false"),
- [](gpt_params & params) {
+ string_format("collect data for the output tensor (default: %s)", params.process_output ? "true" : "false"),
+ [](common_params & params) {
  params.process_output = true;
  }
  ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"--no-ppl"},
- format("do not compute perplexity (default: %s)", params.compute_ppl ? "true" : "false"),
- [](gpt_params & params) {
+ string_format("do not compute perplexity (default: %s)", params.compute_ppl ? "true" : "false"),
+ [](common_params & params) {
  params.compute_ppl = false;
  }
  ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"--chunk", "--from-chunk"}, "N",
- format("start processing the input from chunk N (default: %d)", params.i_chunk),
- [](gpt_params & params, int value) {
+ string_format("start processing the input from chunk N (default: %d)", params.i_chunk),
+ [](common_params & params, int value) {
  params.i_chunk = value;
  }
  ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"-pps"},
- format("is the prompt shared across parallel sequences (default: %s)", params.is_pp_shared ? "true" : "false"),
- [](gpt_params & params) {
+ string_format("is the prompt shared across parallel sequences (default: %s)", params.is_pp_shared ? "true" : "false"),
+ [](common_params & params) {
  params.is_pp_shared = true;
  }
  ).set_examples({LLAMA_EXAMPLE_BENCH}));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"-npp"}, "n0,n1,...",
  "number of prompt tokens",
- [](gpt_params & params, const std::string & value) {
+ [](common_params & params, const std::string & value) {
  auto p = string_split<int>(value, ',');
  params.n_pp.insert(params.n_pp.end(), p.begin(), p.end());
  }
  ).set_examples({LLAMA_EXAMPLE_BENCH}));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"-ntg"}, "n0,n1,...",
  "number of text generation tokens",
- [](gpt_params & params, const std::string & value) {
+ [](common_params & params, const std::string & value) {
  auto p = string_split<int>(value, ',');
  params.n_tg.insert(params.n_tg.end(), p.begin(), p.end());
  }
  ).set_examples({LLAMA_EXAMPLE_BENCH}));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"-npl"}, "n0,n1,...",
  "number of parallel prompts",
- [](gpt_params & params, const std::string & value) {
+ [](common_params & params, const std::string & value) {
  auto p = string_split<int>(value, ',');
  params.n_pl.insert(params.n_pl.end(), p.begin(), p.end());
  }
  ).set_examples({LLAMA_EXAMPLE_BENCH}));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"--embd-normalize"}, "N",
- format("normalisation for embendings (default: %d) (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)", params.embd_normalize),
- [](gpt_params & params, int value) {
+ string_format("normalisation for embeddings (default: %d) (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)", params.embd_normalize),
+ [](common_params & params, int value) {
  params.embd_normalize = value;
  }
  ).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"--embd-output-format"}, "FORMAT",
  "empty = default, \"array\" = [[],[]...], \"json\" = openai style, \"json+\" = same \"json\" + cosine similarity matrix",
- [](gpt_params & params, const std::string & value) {
+ [](common_params & params, const std::string & value) {
  params.embd_out = value;
  }
  ).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"--embd-separator"}, "STRING",
- "separator of embendings (default \\n) for example \"<#sep#>\"",
- [](gpt_params & params, const std::string & value) {
+ "separator of embeddings (default \\n) for example \"<#sep#>\"",
+ [](common_params & params, const std::string & value) {
  params.embd_sep = value;
  }
  ).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"--host"}, "HOST",
- format("ip address to listen (default: %s)", params.hostname.c_str()),
- [](gpt_params & params, const std::string & value) {
+ string_format("ip address to listen (default: %s)", params.hostname.c_str()),
+ [](common_params & params, const std::string & value) {
  params.hostname = value;
  }
  ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_HOST"));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"--port"}, "PORT",
- format("port to listen (default: %d)", params.port),
- [](gpt_params & params, int value) {
+ string_format("port to listen (default: %d)", params.port),
+ [](common_params & params, int value) {
  params.port = value;
  }
  ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_PORT"));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"--path"}, "PATH",
- format("path to serve static files from (default: %s)", params.public_path.c_str()),
- [](gpt_params & params, const std::string & value) {
+ string_format("path to serve static files from (default: %s)", params.public_path.c_str()),
+ [](common_params & params, const std::string & value) {
  params.public_path = value;
  }
  ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_STATIC_PATH"));
- add_opt(llama_arg(
+ add_opt(common_arg(
+ {"--no-webui"},
+ string_format("Disable the Web UI (default: %s)", params.webui ? "enabled" : "disabled"),
+ [](common_params & params) {
+ params.webui = false;
+ }
+ ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_WEBUI"));
+ add_opt(common_arg(
  {"--embedding", "--embeddings"},
- format("restrict to only support embedding use case; use only with dedicated embedding models (default: %s)", params.embedding ? "enabled" : "disabled"),
- [](gpt_params & params) {
+ string_format("restrict to only support embedding use case; use only with dedicated embedding models (default: %s)", params.embedding ? "enabled" : "disabled"),
+ [](common_params & params) {
  params.embedding = true;
  }
  ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_EMBEDDINGS"));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"--reranking", "--rerank"},
- format("enable reranking endpoint on server (default: %s)", params.reranking ? "enabled" : "disabled"),
- [](gpt_params & params) {
+ string_format("enable reranking endpoint on server (default: %s)", params.reranking ? "enabled" : "disabled"),
+ [](common_params & params) {
  params.reranking = true;
  }
  ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_RERANKING"));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"--api-key"}, "KEY",
  "API key to use for authentication (default: none)",
- [](gpt_params & params, const std::string & value) {
+ [](common_params & params, const std::string & value) {
  params.api_keys.push_back(value);
  }
  ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_API_KEY"));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"--api-key-file"}, "FNAME",
  "path to file containing API keys (default: none)",
- [](gpt_params & params, const std::string & value) {
+ [](common_params & params, const std::string & value) {
  std::ifstream key_file(value);
  if (!key_file) {
- throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str()));
+ throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str()));
  }
  std::string key;
  while (std::getline(key_file, key)) {
@@ -1785,70 +1814,74 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
  key_file.close();
  }
  ).set_examples({LLAMA_EXAMPLE_SERVER}));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"--ssl-key-file"}, "FNAME",
  "path to file a PEM-encoded SSL private key",
- [](gpt_params & params, const std::string & value) {
+ [](common_params & params, const std::string & value) {
  params.ssl_file_key = value;
  }
  ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_SSL_KEY_FILE"));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"--ssl-cert-file"}, "FNAME",
  "path to file a PEM-encoded SSL certificate",
- [](gpt_params & params, const std::string & value) {
+ [](common_params & params, const std::string & value) {
  params.ssl_file_cert = value;
  }
  ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_SSL_CERT_FILE"));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"-to", "--timeout"}, "N",
- format("server read/write timeout in seconds (default: %d)", params.timeout_read),
- [](gpt_params & params, int value) {
+ string_format("server read/write timeout in seconds (default: %d)", params.timeout_read),
+ [](common_params & params, int value) {
  params.timeout_read = value;
  params.timeout_write = value;
  }
  ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_TIMEOUT"));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"--threads-http"}, "N",
- format("number of threads used to process HTTP requests (default: %d)", params.n_threads_http),
- [](gpt_params & params, int value) {
+ string_format("number of threads used to process HTTP requests (default: %d)", params.n_threads_http),
+ [](common_params & params, int value) {
  params.n_threads_http = value;
  }
  ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_THREADS_HTTP"));
- add_opt(llama_arg(
- {"-spf", "--system-prompt-file"}, "FNAME",
- "set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications",
- [](gpt_params & params, const std::string & value) {
- std::ifstream file(value);
- if (!file) {
- throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str()));
- }
- std::string system_prompt;
- std::copy(
- std::istreambuf_iterator<char>(file),
- std::istreambuf_iterator<char>(),
- std::back_inserter(system_prompt)
- );
- params.system_prompt = system_prompt;
- }
- ).set_examples({LLAMA_EXAMPLE_SERVER}));
- add_opt(llama_arg(
+ add_opt(common_arg(
+ {"--cache-reuse"}, "N",
+ string_format("min chunk size to attempt reusing from the cache via KV shifting (default: %d)", params.n_cache_reuse),
+ [](common_params & params, int value) {
+ params.n_cache_reuse = value;
+ }
+ ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CACHE_REUSE"));
+ add_opt(common_arg(
  {"--metrics"},
- format("enable prometheus compatible metrics endpoint (default: %s)", params.endpoint_metrics ? "enabled" : "disabled"),
- [](gpt_params & params) {
+ string_format("enable prometheus compatible metrics endpoint (default: %s)", params.endpoint_metrics ? "enabled" : "disabled"),
+ [](common_params & params) {
  params.endpoint_metrics = true;
  }
  ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_METRICS"));
- add_opt(llama_arg(
+ add_opt(common_arg(
+ {"--slots"},
+ string_format("enable slots monitoring endpoint (default: %s)", params.endpoint_slots ? "enabled" : "disabled"),
+ [](common_params & params) {
+ params.endpoint_slots = true;
+ }
+ ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_SLOTS"));
+ add_opt(common_arg(
+ {"--props"},
+ string_format("enable changing global properties via POST /props (default: %s)", params.endpoint_props ? "enabled" : "disabled"),
+ [](common_params & params) {
+ params.endpoint_props = true;
+ }
+ ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_PROPS"));
+ add_opt(common_arg(
  {"--no-slots"},
- format("disables slots monitoring endpoint (default: %s)", params.endpoint_slots ? "enabled" : "disabled"),
- [](gpt_params & params) {
+ "disables slots monitoring endpoint",
+ [](common_params & params) {
  params.endpoint_slots = false;
  }
  ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_ENDPOINT_SLOTS"));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"--slot-save-path"}, "PATH",
  "path to save slot kv cache (default: disabled)",
- [](gpt_params & params, const std::string & value) {
+ [](common_params & params, const std::string & value) {
  params.slot_save_path = value;
  // if doesn't end with DIRECTORY_SEPARATOR, add it
  if (!params.slot_save_path.empty() && params.slot_save_path[params.slot_save_path.size() - 1] != DIRECTORY_SEPARATOR) {
@@ -1856,14 +1889,16 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
  }
  }
  ).set_examples({LLAMA_EXAMPLE_SERVER}));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"--chat-template"}, "JINJA_TEMPLATE",
- "set custom jinja chat template (default: template taken from model's metadata)\n"
- "if suffix/prefix are specified, template will be disabled\n"
- "only commonly used templates are accepted:\nhttps://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template",
- [](gpt_params & params, const std::string & value) {
- if (!llama_chat_verify_template(value)) {
- throw std::runtime_error(format(
+ string_format(
+ "set custom jinja chat template (default: template taken from model's metadata)\n"
+ "if suffix/prefix are specified, template will be disabled\n"
+ "list of built-in templates:\n%s", list_builtin_chat_templates().c_str()
+ ),
+ [](common_params & params, const std::string & value) {
+ if (!common_chat_verify_template(value)) {
+ throw std::runtime_error(string_format(
  "error: the supplied chat template is not supported: %s\n"
  "note: llama.cpp does not use jinja parser, we only support commonly used templates\n",
  value.c_str()
@@ -1872,135 +1907,316 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
  params.chat_template = value;
  }
  ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CHAT_TEMPLATE"));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"-sps", "--slot-prompt-similarity"}, "SIMILARITY",
- format("how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity),
- [](gpt_params & params, const std::string & value) {
+ string_format("how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity),
+ [](common_params & params, const std::string & value) {
  params.slot_prompt_similarity = std::stof(value);
  }
  ).set_examples({LLAMA_EXAMPLE_SERVER}));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"--lora-init-without-apply"},
- format("load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: %s)", params.lora_init_without_apply ? "enabled" : "disabled"),
- [](gpt_params & params) {
+ string_format("load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: %s)", params.lora_init_without_apply ? "enabled" : "disabled"),
+ [](common_params & params) {
  params.lora_init_without_apply = true;
  }
  ).set_examples({LLAMA_EXAMPLE_SERVER}));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"--simple-io"},
  "use basic IO for better compatibility in subprocesses and limited consoles",
- [](gpt_params & params) {
+ [](common_params & params) {
  params.simple_io = true;
  }
  ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL}));
- add_opt(llama_arg(
- {"-ld", "--logdir"}, "LOGDIR",
- "path under which to save YAML logs (no logging if unset)",
- [](gpt_params & params, const std::string & value) {
- params.logdir = value;
-
- if (params.logdir.back() != DIRECTORY_SEPARATOR) {
- params.logdir += DIRECTORY_SEPARATOR;
- }
- }
- ));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"--positive-file"}, "FNAME",
- format("positive prompts file, one prompt per line (default: '%s')", params.cvector_positive_file.c_str()),
- [](gpt_params & params, const std::string & value) {
+ string_format("positive prompts file, one prompt per line (default: '%s')", params.cvector_positive_file.c_str()),
+ [](common_params & params, const std::string & value) {
  params.cvector_positive_file = value;
  }
  ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR}));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"--negative-file"}, "FNAME",
- format("negative prompts file, one prompt per line (default: '%s')", params.cvector_negative_file.c_str()),
- [](gpt_params & params, const std::string & value) {
+ string_format("negative prompts file, one prompt per line (default: '%s')", params.cvector_negative_file.c_str()),
+ [](common_params & params, const std::string & value) {
  params.cvector_negative_file = value;
  }
  ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR}));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"--pca-batch"}, "N",
- format("batch size used for PCA. Larger batch runs faster, but uses more memory (default: %d)", params.n_pca_batch),
- [](gpt_params & params, int value) {
+ string_format("batch size used for PCA. Larger batch runs faster, but uses more memory (default: %d)", params.n_pca_batch),
+ [](common_params & params, int value) {
  params.n_pca_batch = value;
  }
  ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR}));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"--pca-iter"}, "N",
- format("number of iterations used for PCA (default: %d)", params.n_pca_iterations),
- [](gpt_params & params, int value) {
+ string_format("number of iterations used for PCA (default: %d)", params.n_pca_iterations),
+ [](common_params & params, int value) {
  params.n_pca_iterations = value;
  }
  ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR}));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"--method"}, "{pca, mean}",
  "dimensionality reduction method to be used (default: pca)",
- [](gpt_params & params, const std::string & value) {
+ [](common_params & params, const std::string & value) {
  /**/ if (value == "pca") { params.cvector_dimre_method = DIMRE_METHOD_PCA; }
  else if (value == "mean") { params.cvector_dimre_method = DIMRE_METHOD_MEAN; }
  else { throw std::invalid_argument("invalid value"); }
  }
  ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR}));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"--output-format"}, "{md,jsonl}",
  "output format for batched-bench results (default: md)",
- [](gpt_params & params, const std::string & value) {
+ [](common_params & params, const std::string & value) {
  /**/ if (value == "jsonl") { params.batched_bench_output_jsonl = true; }
  else if (value == "md") { params.batched_bench_output_jsonl = false; }
  else { std::invalid_argument("invalid value"); }
  }
  ).set_examples({LLAMA_EXAMPLE_BENCH}));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"--log-disable"},
  "Log disable",
- [](gpt_params &) {
- gpt_log_pause(gpt_log_main());
+ [](common_params &) {
+ common_log_pause(common_log_main());
  }
  ));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"--log-file"}, "FNAME",
  "Log to file",
- [](gpt_params &, const std::string & value) {
- gpt_log_set_file(gpt_log_main(), value.c_str());
+ [](common_params &, const std::string & value) {
+ common_log_set_file(common_log_main(), value.c_str());
  }
  ));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"--log-colors"},
  "Enable colored logging",
- [](gpt_params &) {
- gpt_log_set_colors(gpt_log_main(), true);
+ [](common_params &) {
+ common_log_set_colors(common_log_main(), true);
  }
  ).set_env("LLAMA_LOG_COLORS"));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"-v", "--verbose", "--log-verbose"},
  "Set verbosity level to infinity (i.e. log all messages, useful for debugging)",
- [](gpt_params & params) {
+ [](common_params & params) {
  params.verbosity = INT_MAX;
- gpt_log_set_verbosity_thold(INT_MAX);
+ common_log_set_verbosity_thold(INT_MAX);
  }
  ));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"-lv", "--verbosity", "--log-verbosity"}, "N",
  "Set the verbosity threshold. Messages with a higher verbosity will be ignored.",
- [](gpt_params & params, int value) {
+ [](common_params & params, int value) {
  params.verbosity = value;
- gpt_log_set_verbosity_thold(value);
+ common_log_set_verbosity_thold(value);
  }
  ).set_env("LLAMA_LOG_VERBOSITY"));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"--log-prefix"},
  "Enable prefx in log messages",
- [](gpt_params &) {
- gpt_log_set_prefix(gpt_log_main(), true);
+ [](common_params &) {
+ common_log_set_prefix(common_log_main(), true);
  }
  ).set_env("LLAMA_LOG_PREFIX"));
- add_opt(llama_arg(
+ add_opt(common_arg(
  {"--log-timestamps"},
  "Enable timestamps in log messages",
- [](gpt_params &) {
- gpt_log_set_timestamps(gpt_log_main(), true);
+ [](common_params &) {
+ common_log_set_timestamps(common_log_main(), true);
  }
  ).set_env("LLAMA_LOG_TIMESTAMPS"));
 
+ // speculative parameters
+ add_opt(common_arg(
+ {"-td", "--threads-draft"}, "N",
+ "number of threads to use during generation (default: same as --threads)",
+ [](common_params & params, int value) {
+ params.speculative.cpuparams.n_threads = value;
+ if (params.speculative.cpuparams.n_threads <= 0) {
+ params.speculative.cpuparams.n_threads = std::thread::hardware_concurrency();
+ }
+ }
+ ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+ add_opt(common_arg(
+ {"-tbd", "--threads-batch-draft"}, "N",
+ "number of threads to use during batch and prompt processing (default: same as --threads-draft)",
+ [](common_params & params, int value) {
+ params.speculative.cpuparams_batch.n_threads = value;
+ if (params.speculative.cpuparams_batch.n_threads <= 0) {
+ params.speculative.cpuparams_batch.n_threads = std::thread::hardware_concurrency();
+ }
+ }
+ ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+ add_opt(common_arg(
+ {"-Cd", "--cpu-mask-draft"}, "M",
+ "Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask)",
+ [](common_params & params, const std::string & mask) {
+ params.speculative.cpuparams.mask_valid = true;
+ if (!parse_cpu_mask(mask, params.speculative.cpuparams.cpumask)) {
+ throw std::invalid_argument("invalid cpumask");
+ }
+ }
+ ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+ add_opt(common_arg(
+ {"-Crd", "--cpu-range-draft"}, "lo-hi",
+ "Ranges of CPUs for affinity. Complements --cpu-mask-draft",
+ [](common_params & params, const std::string & range) {
+ params.speculative.cpuparams.mask_valid = true;
+ if (!parse_cpu_range(range, params.speculative.cpuparams.cpumask)) {
+ throw std::invalid_argument("invalid range");
+ }
+ }
+ ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+ add_opt(common_arg(
+ {"--cpu-strict-draft"}, "<0|1>",
+ "Use strict CPU placement for draft model (default: same as --cpu-strict)",
+ [](common_params & params, int value) {
+ params.speculative.cpuparams.strict_cpu = value;
+ }
+ ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+ add_opt(common_arg(
+ {"--prio-draft"}, "N",
+ string_format("set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.speculative.cpuparams.priority),
+ [](common_params & params, int prio) {
+ if (prio < 0 || prio > 3) {
+ throw std::invalid_argument("invalid value");
+ }
+ params.speculative.cpuparams.priority = (enum ggml_sched_priority) prio;
+ }
+ ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+ add_opt(common_arg(
+ {"--poll-draft"}, "<0|1>",
+ "Use polling to wait for draft model work (default: same as --poll])",
+ [](common_params & params, int value) {
+ params.speculative.cpuparams.poll = value;
+ }
+ ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+ add_opt(common_arg(
+ {"-Cbd", "--cpu-mask-batch-draft"}, "M",
+ "Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask)",
+ [](common_params & params, const std::string & mask) {
+ params.speculative.cpuparams_batch.mask_valid = true;
+ if (!parse_cpu_mask(mask, params.speculative.cpuparams_batch.cpumask)) {
+ throw std::invalid_argument("invalid cpumask");
+ }
+ }
+ ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+ add_opt(common_arg(
+ {"-Crbd", "--cpu-range-batch-draft"}, "lo-hi",
+ "Ranges of CPUs for affinity. Complements --cpu-mask-draft-batch)",
+ [](common_params & params, const std::string & range) {
+ params.speculative.cpuparams_batch.mask_valid = true;
+ if (!parse_cpu_range(range, params.speculative.cpuparams_batch.cpumask)) {
+ throw std::invalid_argument("invalid cpumask");
+ }
+ }
+ ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+ add_opt(common_arg(
+ {"--cpu-strict-batch-draft"}, "<0|1>",
+ "Use strict CPU placement for draft model (default: --cpu-strict-draft)",
+ [](common_params & params, int value) {
+ params.speculative.cpuparams_batch.strict_cpu = value;
+ }
+ ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+ add_opt(common_arg(
+ {"--prio-batch-draft"}, "N",
+ string_format("set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.speculative.cpuparams_batch.priority),
+ [](common_params & params, int prio) {
+ if (prio < 0 || prio > 3) {
+ throw std::invalid_argument("invalid value");
+ }
+ params.speculative.cpuparams_batch.priority = (enum ggml_sched_priority) prio;
+ }
+ ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+ add_opt(common_arg(
+ {"--poll-batch-draft"}, "<0|1>",
+ "Use polling to wait for draft model work (default: --poll-draft)",
+ [](common_params & params, int value) {
+ params.speculative.cpuparams_batch.poll = value;
+ }
+ ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+ add_opt(common_arg(
+ {"--draft-max", "--draft", "--draft-n"}, "N",
+ string_format("number of tokens to draft for speculative decoding (default: %d)", params.speculative.n_max),
+ [](common_params & params, int value) {
+ params.speculative.n_max = value;
+ }
+ ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_MAX"));
+ add_opt(common_arg(
+ {"--draft-min", "--draft-n-min"}, "N",
+ string_format("minimum number of draft tokens to use for speculative decoding (default: %d)", params.speculative.n_min),
+ [](common_params & params, int value) {
+ params.speculative.n_min = value;
+ }
+ ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_MIN"));
+ add_opt(common_arg(
+ {"--draft-p-split"}, "P",
+ string_format("speculative decoding split probability (default: %.1f)", (double)params.speculative.p_split),
+ [](common_params & params, const std::string & value) {
+ params.speculative.p_split = std::stof(value);
+ }
+ ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}).set_env("LLAMA_ARG_DRAFT_P_SPLIT"));
+ add_opt(common_arg(
+ {"--draft-p-min"}, "P",
+ string_format("minimum speculative decoding probability (greedy) (default: %.1f)", (double)params.speculative.p_min),
+ [](common_params & params, const std::string & value) {
+ params.speculative.p_min = std::stof(value);
+ }
+ ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_P_MIN"));
+ add_opt(common_arg(
+ {"-cd", "--ctx-size-draft"}, "N",
+ string_format("size of the prompt context for the draft model (default: %d, 0 = loaded from model)", params.speculative.n_ctx),
+ [](common_params & params, int value) {
+ params.speculative.n_ctx = value;
+ }
+ ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CTX_SIZE_DRAFT"));
+ add_opt(common_arg(
+ {"-devd", "--device-draft"}, "<dev1,dev2,..>",
+ "comma-separated list of devices to use for offloading the draft model (none = don't offload)\n"
+ "use --list-devices to see a list of available devices",
+ [](common_params & params, const std::string & value) {
+ params.speculative.devices = parse_device_list(value);
+ }
+ ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
+ add_opt(common_arg(
+ {"-ngld", "--gpu-layers-draft", "--n-gpu-layers-draft"}, "N",
+ "number of layers to store in VRAM for the draft model",
+ [](common_params & params, int value) {
+ params.speculative.n_gpu_layers = value;
+ if (!llama_supports_gpu_offload()) {
+ fprintf(stderr, "warning: no usable GPU found, --gpu-layers-draft option will be ignored\n");
+ fprintf(stderr, "warning: one possible reason is that llama.cpp was compiled without GPU support\n");
+ fprintf(stderr, "warning: consult docs/build.md for compilation instructions\n");
+ }
+ }
+ ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_N_GPU_LAYERS_DRAFT"));
+ add_opt(common_arg(
+ {"-md", "--model-draft"}, "FNAME",
+ "draft model for speculative decoding (default: unused)",
+ [](common_params & params, const std::string & value) {
+ params.speculative.model = value;
+ }
+ ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODEL_DRAFT"));
+
+ add_opt(common_arg(
+ {"-mv", "--model-vocoder"}, "FNAME",
+ "vocoder model for audio generation (default: unused)",
+ [](common_params & params, const std::string & value) {
+ params.vocoder.model = value;
+ }
+ ).set_examples({LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_SERVER}));
+
+ // model-specific
+ add_opt(common_arg(
+ {"--tts-oute-default"},
+ string_format("use default OuteTTS models (note: can download weights from the internet)"),
+ [](common_params & params) {
+ params.hf_repo = "OuteAI/OuteTTS-0.2-500M-GGUF";
+ params.hf_file = "OuteTTS-0.2-500M-Q8_0.gguf";
+ params.vocoder.hf_repo = "ggml-org/WavTokenizer";
+ params.vocoder.hf_file = "WavTokenizer-Large-75-F16.gguf";
+ }
+ ).set_examples({LLAMA_EXAMPLE_TTS}));
+
  return ctx_arg;
  }
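
Editor's note, not part of the diff: the hunks above rename the bundled argument-parsing helpers (llama_arg -> common_arg, gpt_params -> common_params, format -> string_format, gpt_log_* -> common_log_*) and move draft-model settings such as -md/--model-draft under params.speculative. The sketch below shows what a single option registration looks like after this rename, written in the same style as the registrations in common/arg.cpp; the flag name --my-draft-max and its env var are hypothetical, and the snippet assumes it sits inside the parser-init function where add_opt() is in scope.

// Hypothetical sketch (not in the package): one more registration using the
// renamed API demonstrated above. Assumes add_opt() and the surrounding
// parser-init context from common/arg.cpp are in scope.
add_opt(common_arg(
    {"--my-draft-max"}, "N",                        // illustrative flag, not a real option
    string_format("draft tokens per step (default: %d)", params.speculative.n_max),
    [](common_params & params, int value) {
        params.speculative.n_max = value;           // draft settings now live under params.speculative
    }
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MY_DRAFT_MAX"));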