@fugood/llama.node 0.2.2 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (320)
  1. package/CMakeLists.txt +5 -2
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  8. package/bin/win32/arm64/llama-node.node +0 -0
  9. package/bin/win32/arm64/node.lib +0 -0
  10. package/bin/win32/x64/llama-node.node +0 -0
  11. package/bin/win32/x64/node.lib +0 -0
  12. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  13. package/bin/win32-vulkan/arm64/node.lib +0 -0
  14. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/x64/node.lib +0 -0
  16. package/lib/binding.ts +8 -1
  17. package/package.json +1 -1
  18. package/patches/llama.patch +12 -12
  19. package/src/DetokenizeWorker.cpp +1 -1
  20. package/src/LlamaContext.cpp +33 -1
  21. package/src/LlamaContext.h +1 -0
  22. package/src/LoadSessionWorker.cpp +1 -0
  23. package/src/llama.cpp/.github/workflows/bench.yml +310 -0
  24. package/src/llama.cpp/.github/workflows/build.yml +1315 -0
  25. package/src/llama.cpp/.github/workflows/close-issue.yml +23 -0
  26. package/src/llama.cpp/.github/workflows/docker.yml +116 -0
  27. package/src/llama.cpp/.github/workflows/editorconfig.yml +27 -0
  28. package/src/llama.cpp/.github/workflows/gguf-publish.yml +44 -0
  29. package/src/llama.cpp/.github/workflows/labeler.yml +17 -0
  30. package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +65 -0
  31. package/src/llama.cpp/.github/workflows/nix-ci.yml +72 -0
  32. package/src/llama.cpp/.github/workflows/nix-flake-update.yml +22 -0
  33. package/src/llama.cpp/.github/workflows/nix-publish-flake.yml +36 -0
  34. package/src/llama.cpp/.github/workflows/python-check-requirements.yml +35 -0
  35. package/src/llama.cpp/.github/workflows/python-lint.yml +23 -0
  36. package/src/llama.cpp/.github/workflows/python-type-check.yml +38 -0
  37. package/src/llama.cpp/.github/workflows/server.yml +183 -0
  38. package/src/llama.cpp/CMakeLists.txt +91 -1245
  39. package/src/llama.cpp/cmake/arm64-windows-llvm.cmake +1 -1
  40. package/src/llama.cpp/cmake/build-info.cmake +58 -0
  41. package/src/llama.cpp/cmake/git-vars.cmake +22 -0
  42. package/src/llama.cpp/common/CMakeLists.txt +4 -3
  43. package/src/llama.cpp/common/build-info.cpp.in +4 -0
  44. package/src/llama.cpp/common/common.cpp +1116 -877
  45. package/src/llama.cpp/common/common.h +191 -77
  46. package/src/llama.cpp/common/grammar-parser.cpp +118 -31
  47. package/src/llama.cpp/common/json-schema-to-grammar.cpp +346 -65
  48. package/src/llama.cpp/common/log.h +1 -1
  49. package/src/llama.cpp/common/ngram-cache.h +10 -3
  50. package/src/llama.cpp/common/sampling.cpp +19 -10
  51. package/src/llama.cpp/docs/build.md +353 -0
  52. package/src/llama.cpp/examples/CMakeLists.txt +22 -22
  53. package/src/llama.cpp/examples/baby-llama/CMakeLists.txt +1 -1
  54. package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +6 -6
  55. package/src/llama.cpp/examples/batched/CMakeLists.txt +1 -1
  56. package/src/llama.cpp/examples/batched/batched.cpp +52 -55
  57. package/src/llama.cpp/examples/batched-bench/CMakeLists.txt +1 -1
  58. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +20 -72
  59. package/src/llama.cpp/examples/benchmark/CMakeLists.txt +1 -1
  60. package/src/llama.cpp/examples/chat-13B.bat +57 -0
  61. package/src/llama.cpp/examples/convert-llama2c-to-ggml/CMakeLists.txt +1 -1
  62. package/src/llama.cpp/examples/{finetune → cvector-generator}/CMakeLists.txt +2 -2
  63. package/src/llama.cpp/examples/cvector-generator/completions.txt +582 -0
  64. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +503 -0
  65. package/src/llama.cpp/examples/cvector-generator/mean.hpp +48 -0
  66. package/src/llama.cpp/examples/cvector-generator/negative.txt +4 -0
  67. package/src/llama.cpp/examples/cvector-generator/pca.hpp +325 -0
  68. package/src/llama.cpp/examples/cvector-generator/positive.txt +4 -0
  69. package/src/llama.cpp/examples/deprecation-warning/deprecation-warning.cpp +35 -0
  70. package/src/llama.cpp/examples/embedding/CMakeLists.txt +1 -1
  71. package/src/llama.cpp/examples/embedding/embedding.cpp +94 -46
  72. package/src/llama.cpp/examples/eval-callback/CMakeLists.txt +2 -2
  73. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +4 -6
  74. package/src/llama.cpp/examples/export-lora/CMakeLists.txt +1 -1
  75. package/src/llama.cpp/examples/export-lora/export-lora.cpp +344 -386
  76. package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +2 -2
  77. package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +30 -25
  78. package/src/llama.cpp/examples/gguf/CMakeLists.txt +1 -1
  79. package/src/llama.cpp/examples/gguf/gguf.cpp +5 -0
  80. package/src/llama.cpp/examples/gguf-hash/CMakeLists.txt +15 -0
  81. package/src/llama.cpp/examples/gguf-hash/deps/rotate-bits/rotate-bits.h +46 -0
  82. package/src/llama.cpp/examples/gguf-hash/deps/sha1/sha1.c +295 -0
  83. package/src/llama.cpp/examples/gguf-hash/deps/sha1/sha1.h +52 -0
  84. package/src/llama.cpp/examples/gguf-hash/deps/sha256/sha256.c +221 -0
  85. package/src/llama.cpp/examples/gguf-hash/deps/sha256/sha256.h +24 -0
  86. package/src/llama.cpp/examples/gguf-hash/deps/xxhash/xxhash.c +42 -0
  87. package/src/llama.cpp/examples/gguf-hash/deps/xxhash/xxhash.h +7093 -0
  88. package/src/llama.cpp/examples/gguf-hash/gguf-hash.cpp +693 -0
  89. package/src/llama.cpp/examples/gguf-split/CMakeLists.txt +1 -1
  90. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +3 -3
  91. package/src/llama.cpp/examples/gritlm/CMakeLists.txt +1 -1
  92. package/src/llama.cpp/examples/gritlm/gritlm.cpp +6 -2
  93. package/src/llama.cpp/examples/imatrix/CMakeLists.txt +1 -1
  94. package/src/llama.cpp/examples/imatrix/imatrix.cpp +137 -176
  95. package/src/llama.cpp/examples/infill/CMakeLists.txt +1 -1
  96. package/src/llama.cpp/examples/infill/infill.cpp +38 -153
  97. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +175 -94
  98. package/src/llama.cpp/examples/llama.android/app/build.gradle.kts +65 -0
  99. package/src/llama.cpp/examples/llama.android/build.gradle.kts +6 -0
  100. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +68 -0
  101. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/CMakeLists.txt +11 -7
  102. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +2 -2
  103. package/src/llama.cpp/examples/llama.android/settings.gradle.kts +18 -0
  104. package/src/llama.cpp/examples/llava/CMakeLists.txt +6 -5
  105. package/src/llama.cpp/examples/llava/android/build_64.sh +8 -0
  106. package/src/llama.cpp/examples/llava/clip.cpp +23 -14
  107. package/src/llama.cpp/examples/llava/llava-cli.cpp +8 -6
  108. package/src/llama.cpp/examples/llava/requirements.txt +3 -2
  109. package/src/llama.cpp/examples/lookahead/CMakeLists.txt +1 -1
  110. package/src/llama.cpp/examples/lookahead/lookahead.cpp +2 -1
  111. package/src/llama.cpp/examples/lookup/CMakeLists.txt +4 -4
  112. package/src/llama.cpp/examples/lookup/lookup-create.cpp +2 -0
  113. package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
  114. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +2 -2
  115. package/src/llama.cpp/examples/lookup/lookup.cpp +1 -1
  116. package/src/llama.cpp/examples/main/CMakeLists.txt +1 -1
  117. package/src/llama.cpp/examples/main/main.cpp +98 -75
  118. package/src/llama.cpp/examples/main-cmake-pkg/CMakeLists.txt +4 -5
  119. package/src/llama.cpp/examples/parallel/CMakeLists.txt +1 -1
  120. package/src/llama.cpp/examples/parallel/parallel.cpp +2 -1
  121. package/src/llama.cpp/examples/passkey/CMakeLists.txt +1 -1
  122. package/src/llama.cpp/examples/passkey/passkey.cpp +23 -43
  123. package/src/llama.cpp/examples/perplexity/CMakeLists.txt +1 -1
  124. package/src/llama.cpp/examples/perplexity/perplexity.cpp +13 -10
  125. package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
  126. package/src/llama.cpp/examples/quantize/quantize.cpp +37 -34
  127. package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +1 -1
  128. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +1 -1
  129. package/src/llama.cpp/examples/retrieval/CMakeLists.txt +1 -1
  130. package/src/llama.cpp/examples/retrieval/retrieval.cpp +26 -77
  131. package/src/llama.cpp/examples/save-load-state/CMakeLists.txt +1 -1
  132. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +14 -7
  133. package/src/llama.cpp/examples/server/CMakeLists.txt +26 -2
  134. package/src/llama.cpp/examples/server/server.cpp +274 -671
  135. package/src/llama.cpp/examples/server/tests/requirements.txt +2 -2
  136. package/src/llama.cpp/examples/server/utils.hpp +28 -29
  137. package/src/llama.cpp/examples/simple/CMakeLists.txt +1 -1
  138. package/src/llama.cpp/examples/simple/simple.cpp +21 -29
  139. package/src/llama.cpp/examples/speculative/CMakeLists.txt +1 -1
  140. package/src/llama.cpp/examples/speculative/speculative.cpp +2 -1
  141. package/src/llama.cpp/examples/sycl/CMakeLists.txt +1 -1
  142. package/src/llama.cpp/examples/sycl/build.sh +23 -0
  143. package/src/llama.cpp/examples/sycl/run-llama2.sh +36 -0
  144. package/src/llama.cpp/examples/sycl/win-build-sycl.bat +33 -0
  145. package/src/llama.cpp/examples/sycl/win-run-llama2.bat +9 -0
  146. package/src/llama.cpp/examples/tokenize/CMakeLists.txt +1 -1
  147. package/src/llama.cpp/examples/tokenize/tokenize.cpp +16 -2
  148. package/src/llama.cpp/ggml/CMakeLists.txt +253 -0
  149. package/src/llama.cpp/{cmake → ggml/cmake}/FindSIMD.cmake +6 -6
  150. package/src/llama.cpp/{ggml-backend.h → ggml/include/ggml-backend.h} +22 -17
  151. package/src/llama.cpp/ggml/include/ggml-blas.h +23 -0
  152. package/src/llama.cpp/ggml/include/ggml-cann.h +125 -0
  153. package/src/llama.cpp/{ggml-cuda.h → ggml/include/ggml-cuda.h} +3 -0
  154. package/src/llama.cpp/{ggml-metal.h → ggml/include/ggml-metal.h} +1 -2
  155. package/src/llama.cpp/{ggml-sycl.h → ggml/include/ggml-sycl.h} +3 -10
  156. package/src/llama.cpp/{ggml.h → ggml/include/ggml.h} +80 -85
  157. package/src/llama.cpp/ggml/src/CMakeLists.txt +1329 -0
  158. package/src/llama.cpp/ggml/src/ggml-aarch64.c +2193 -0
  159. package/src/llama.cpp/ggml/src/ggml-aarch64.h +39 -0
  160. package/src/llama.cpp/{ggml-alloc.c → ggml/src/ggml-alloc.c} +100 -49
  161. package/src/llama.cpp/{ggml-backend-impl.h → ggml/src/ggml-backend-impl.h} +20 -8
  162. package/src/llama.cpp/{ggml-backend.c → ggml/src/ggml-backend.c} +307 -167
  163. package/src/llama.cpp/ggml/src/ggml-blas.cpp +367 -0
  164. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +198 -0
  165. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +230 -0
  166. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +2944 -0
  167. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +592 -0
  168. package/src/llama.cpp/ggml/src/ggml-cann/common.h +282 -0
  169. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +32 -0
  170. package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +17 -0
  171. package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +223 -0
  172. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +186 -0
  173. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +180 -0
  174. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +193 -0
  175. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +191 -0
  176. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +208 -0
  177. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +206 -0
  178. package/src/llama.cpp/ggml/src/ggml-cann.cpp +2023 -0
  179. package/src/llama.cpp/{ggml-common.h → ggml/src/ggml-common.h} +41 -7
  180. package/src/llama.cpp/{ggml-impl.h → ggml/src/ggml-impl.h} +113 -9
  181. package/src/llama.cpp/{ggml-kompute.cpp → ggml/src/ggml-kompute.cpp} +33 -18
  182. package/src/llama.cpp/{ggml-quants.c → ggml/src/ggml-quants.c} +1460 -940
  183. package/src/llama.cpp/{ggml-quants.h → ggml/src/ggml-quants.h} +19 -20
  184. package/src/llama.cpp/{ggml-rpc.cpp → ggml/src/ggml-rpc.cpp} +95 -72
  185. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +27 -0
  186. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +53 -0
  187. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +355 -0
  188. package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +195 -0
  189. package/src/llama.cpp/ggml/src/ggml-sycl/concat.hpp +21 -0
  190. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +547 -0
  191. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +27 -0
  192. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +698 -0
  193. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +1023 -0
  194. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.hpp +27 -0
  195. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +3011 -0
  196. package/src/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +3031 -0
  197. package/src/llama.cpp/ggml/src/ggml-sycl/mmq.hpp +33 -0
  198. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +1027 -0
  199. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.hpp +27 -0
  200. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +374 -0
  201. package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +35 -0
  202. package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +66 -0
  203. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +275 -0
  204. package/src/llama.cpp/ggml/src/ggml-sycl/rope.hpp +22 -0
  205. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +251 -0
  206. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.hpp +24 -0
  207. package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +1140 -0
  208. package/src/llama.cpp/ggml/src/ggml-sycl.cpp +5314 -0
  209. package/src/llama.cpp/{ggml-vulkan.cpp → ggml/src/ggml-vulkan.cpp} +1781 -1868
  210. package/src/llama.cpp/{ggml.c → ggml/src/ggml.c} +1245 -2087
  211. package/src/llama.cpp/{sgemm.cpp → ggml/src/llamafile/sgemm.cpp} +21 -24
  212. package/src/llama.cpp/{sgemm.h → ggml/src/llamafile/sgemm.h} +1 -1
  213. package/src/llama.cpp/ggml/src/vulkan-shaders/CMakeLists.txt +5 -0
  214. package/src/llama.cpp/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp +552 -0
  215. package/src/llama.cpp/{llama.h → include/llama.h} +175 -100
  216. package/src/llama.cpp/models/.editorconfig +1 -0
  217. package/src/llama.cpp/models/ggml-vocab-aquila.gguf +0 -0
  218. package/src/llama.cpp/models/ggml-vocab-baichuan.gguf +0 -0
  219. package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf +0 -0
  220. package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf.inp +112 -0
  221. package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf.out +46 -0
  222. package/src/llama.cpp/models/ggml-vocab-command-r.gguf +0 -0
  223. package/src/llama.cpp/models/ggml-vocab-command-r.gguf.inp +112 -0
  224. package/src/llama.cpp/models/ggml-vocab-command-r.gguf.out +46 -0
  225. package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf +0 -0
  226. package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.inp +112 -0
  227. package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.out +46 -0
  228. package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf +0 -0
  229. package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.inp +112 -0
  230. package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.out +46 -0
  231. package/src/llama.cpp/models/ggml-vocab-falcon.gguf +0 -0
  232. package/src/llama.cpp/models/ggml-vocab-falcon.gguf.inp +112 -0
  233. package/src/llama.cpp/models/ggml-vocab-falcon.gguf.out +46 -0
  234. package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf +0 -0
  235. package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf.inp +112 -0
  236. package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf.out +46 -0
  237. package/src/llama.cpp/models/ggml-vocab-gpt-neox.gguf +0 -0
  238. package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf +0 -0
  239. package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf.inp +112 -0
  240. package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf.out +46 -0
  241. package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf +0 -0
  242. package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf.inp +112 -0
  243. package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf.out +46 -0
  244. package/src/llama.cpp/models/ggml-vocab-mpt.gguf +0 -0
  245. package/src/llama.cpp/models/ggml-vocab-mpt.gguf.inp +112 -0
  246. package/src/llama.cpp/models/ggml-vocab-mpt.gguf.out +46 -0
  247. package/src/llama.cpp/models/ggml-vocab-phi-3.gguf +0 -0
  248. package/src/llama.cpp/models/ggml-vocab-phi-3.gguf.inp +112 -0
  249. package/src/llama.cpp/models/ggml-vocab-phi-3.gguf.out +46 -0
  250. package/src/llama.cpp/models/ggml-vocab-qwen2.gguf +0 -0
  251. package/src/llama.cpp/models/ggml-vocab-qwen2.gguf.inp +112 -0
  252. package/src/llama.cpp/models/ggml-vocab-qwen2.gguf.out +46 -0
  253. package/src/llama.cpp/models/ggml-vocab-refact.gguf +0 -0
  254. package/src/llama.cpp/models/ggml-vocab-refact.gguf.inp +112 -0
  255. package/src/llama.cpp/models/ggml-vocab-refact.gguf.out +46 -0
  256. package/src/llama.cpp/models/ggml-vocab-starcoder.gguf +0 -0
  257. package/src/llama.cpp/models/ggml-vocab-starcoder.gguf.inp +112 -0
  258. package/src/llama.cpp/models/ggml-vocab-starcoder.gguf.out +46 -0
  259. package/src/llama.cpp/pocs/vdot/CMakeLists.txt +2 -2
  260. package/src/llama.cpp/requirements/requirements-all.txt +12 -0
  261. package/src/llama.cpp/requirements/requirements-compare-llama-bench.txt +2 -0
  262. package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf.txt +3 -0
  263. package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf_update.txt +3 -0
  264. package/src/llama.cpp/requirements/{requirements-convert.txt → requirements-convert_legacy_llama.txt} +1 -1
  265. package/src/llama.cpp/requirements/requirements-convert_llama_ggml_to_gguf.txt +1 -0
  266. package/src/llama.cpp/requirements/requirements-convert_lora_to_gguf.txt +2 -0
  267. package/src/llama.cpp/requirements/requirements-pydantic.txt +3 -0
  268. package/src/llama.cpp/requirements/requirements-test-tokenizer-random.txt +1 -0
  269. package/src/llama.cpp/requirements.txt +5 -4
  270. package/src/llama.cpp/scripts/build-info.sh +30 -0
  271. package/src/llama.cpp/scripts/install-oneapi.bat +19 -0
  272. package/src/llama.cpp/src/CMakeLists.txt +33 -0
  273. package/src/llama.cpp/src/llama-grammar.cpp +539 -0
  274. package/src/llama.cpp/src/llama-grammar.h +39 -0
  275. package/src/llama.cpp/src/llama-impl.h +26 -0
  276. package/src/llama.cpp/src/llama-sampling.cpp +635 -0
  277. package/src/llama.cpp/src/llama-sampling.h +56 -0
  278. package/src/llama.cpp/src/llama-vocab.cpp +1721 -0
  279. package/src/llama.cpp/src/llama-vocab.h +130 -0
  280. package/src/llama.cpp/{llama.cpp → src/llama.cpp} +5979 -5260
  281. package/src/llama.cpp/{unicode-data.cpp → src/unicode-data.cpp} +851 -802
  282. package/src/llama.cpp/{unicode.cpp → src/unicode.cpp} +52 -30
  283. package/src/llama.cpp/{unicode.h → src/unicode.h} +5 -1
  284. package/src/llama.cpp/tests/CMakeLists.txt +19 -20
  285. package/src/llama.cpp/tests/test-backend-ops.cpp +245 -67
  286. package/src/llama.cpp/tests/test-chat-template.cpp +57 -3
  287. package/src/llama.cpp/tests/test-double-float.cpp +2 -2
  288. package/src/llama.cpp/tests/test-grad0.cpp +2 -2
  289. package/src/llama.cpp/tests/test-grammar-integration.cpp +978 -31
  290. package/src/llama.cpp/tests/test-grammar-parser.cpp +423 -158
  291. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +508 -135
  292. package/src/llama.cpp/tests/test-llama-grammar.cpp +15 -9
  293. package/src/llama.cpp/tests/test-quantize-fns.cpp +1 -1
  294. package/src/llama.cpp/tests/test-quantize-perf.cpp +1 -1
  295. package/src/llama.cpp/tests/test-rope.cpp +3 -4
  296. package/src/llama.cpp/tests/test-sampling.cpp +5 -5
  297. package/src/llama.cpp/tests/test-tokenizer-0.cpp +6 -6
  298. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +20 -15
  299. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +22 -11
  300. package/bin/darwin/arm64/default.metallib +0 -0
  301. package/bin/darwin/x64/default.metallib +0 -0
  302. package/src/llama.cpp/examples/beam-search/CMakeLists.txt +0 -5
  303. package/src/llama.cpp/examples/beam-search/beam-search.cpp +0 -188
  304. package/src/llama.cpp/examples/finetune/finetune.cpp +0 -1862
  305. package/src/llama.cpp/examples/llama.android/llama/CMakeLists.txt +0 -55
  306. package/src/llama.cpp/examples/train-text-from-scratch/CMakeLists.txt +0 -5
  307. package/src/llama.cpp/examples/train-text-from-scratch/train-text-from-scratch.cpp +0 -1253
  308. package/src/llama.cpp/ggml-opencl.cpp +0 -2305
  309. package/src/llama.cpp/ggml-opencl.h +0 -36
  310. package/src/llama.cpp/ggml-sycl.cpp +0 -17340
  311. package/src/llama.cpp/ggml-vulkan-shaders.hpp +0 -81211
  312. package/src/llama.cpp/requirements/requirements-convert-hf-to-gguf-update.txt +0 -2
  313. package/src/llama.cpp/requirements/requirements-convert-hf-to-gguf.txt +0 -2
  314. package/src/llama.cpp/requirements/requirements-convert-llama-ggml-to-gguf.txt +0 -1
  315. package/src/llama.cpp/scripts/gen-build-info-cpp.cmake +0 -24
  316. /package/src/llama.cpp/{ggml-alloc.h → ggml/include/ggml-alloc.h} +0 -0
  317. /package/src/llama.cpp/{ggml-kompute.h → ggml/include/ggml-kompute.h} +0 -0
  318. /package/src/llama.cpp/{ggml-rpc.h → ggml/include/ggml-rpc.h} +0 -0
  319. /package/src/llama.cpp/{ggml-vulkan.h → ggml/include/ggml-vulkan.h} +0 -0
  320. /package/src/llama.cpp/{unicode-data.h → src/unicode-data.h} +0 -0
@@ -1,3 +1,7 @@
+ #if defined(_MSC_VER)
+ #define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING
+ #endif
+
  #include "common.h"
  // Change JSON_ASSERT from assert() to GGML_ASSERT:
  #define JSON_ASSERT GGML_ASSERT
@@ -6,21 +10,21 @@
  #include "llama.h"

  #include <algorithm>
- #include <cassert>
+ #include <cinttypes>
  #include <cmath>
+ #include <codecvt>
+ #include <cstdarg>
  #include <cstring>
  #include <ctime>
  #include <fstream>
- #include <iterator>
  #include <iostream>
+ #include <iterator>
  #include <regex>
  #include <sstream>
  #include <string>
  #include <unordered_map>
  #include <unordered_set>
  #include <vector>
- #include <cinttypes>
- #include <codecvt>

  #if defined(__APPLE__) && defined(__MACH__)
  #include <sys/types.h>
@@ -190,6 +194,12 @@ int32_t cpu_get_num_math() {
  // CLI argument parsing
  //

+ void gpt_params_handle_hf_token(gpt_params & params) {
+ if (params.hf_token.empty() && std::getenv("HF_TOKEN")) {
+ params.hf_token = std::getenv("HF_TOKEN");
+ }
+ }
+
  void gpt_params_handle_model_default(gpt_params & params) {
  if (!params.hf_repo.empty()) {
  // short-hand to avoid specifying --hf-file -> default it to --model
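The new gpt_params_handle_hf_token() above lets an empty --hf-token fall back to the HF_TOKEN environment variable. A minimal standalone sketch of the same fallback logic (illustrative only; resolve_hf_token is not part of the library):

    #include <cstdlib>
    #include <string>

    // Illustrative sketch of the fallback added above: prefer the CLI value,
    // otherwise use the HF_TOKEN environment variable if it is set.
    static std::string resolve_hf_token(const std::string & cli_token) {
        if (!cli_token.empty()) {
            return cli_token;
        }
        const char * env = std::getenv("HF_TOKEN");
        return env ? std::string(env) : std::string();
    }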
@@ -199,19 +209,13 @@ void gpt_params_handle_model_default(gpt_params & params) {
  }
  params.hf_file = params.model;
  } else if (params.model.empty()) {
- std::string cache_directory = fs_get_cache_directory();
- const bool success = fs_create_directory_with_parents(cache_directory);
- if (!success) {
- throw std::runtime_error("failed to create cache directory: " + cache_directory);
- }
- params.model = cache_directory + string_split(params.hf_file, '/').back();
+ params.model = fs_get_cache_file(string_split(params.hf_file, '/').back());
  }
  } else if (!params.model_url.empty()) {
  if (params.model.empty()) {
  auto f = string_split(params.model_url, '#').front();
  f = string_split(f, '?').front();
- f = string_split(f, '/').back();
- params.model = "models/" + f;
+ params.model = fs_get_cache_file(string_split(f, '/').back());
  }
  } else if (params.model.empty()) {
  params.model = DEFAULT_MODEL_PATH;
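Downloaded models now resolve through fs_get_cache_file() instead of an ad-hoc "models/" path. Judging from the removed lines above, the helper presumably behaves roughly like this sketch (an assumption, not copied from the diff; it reuses the fs_get_cache_directory and fs_create_directory_with_parents helpers that appear in the removed code):

    #include "common.h"
    #include <stdexcept>
    #include <string>

    // Assumed shape of fs_get_cache_file(), reconstructed from the logic it
    // replaced: make sure the cache directory exists, then append the name.
    static std::string fs_get_cache_file_sketch(const std::string & filename) {
        std::string cache_directory = fs_get_cache_directory();
        if (!fs_create_directory_with_parents(cache_directory)) {
            throw std::runtime_error("failed to create cache directory: " + cache_directory);
        }
        return cache_directory + filename;
    }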
@@ -237,15 +241,14 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
  }
  }

- if (params.prompt_cache_all &&
- (params.interactive || params.interactive_first ||
- params.instruct)) {
-
+ if (params.prompt_cache_all && (params.interactive || params.interactive_first)) {
  throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n");
  }

  gpt_params_handle_model_default(params);

+ gpt_params_handle_hf_token(params);
+
  if (params.escape) {
  string_process_escapes(params.prompt);
  string_process_escapes(params.input_prefix);
@@ -265,39 +268,39 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
  }

  bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
- bool result = true;
+ const auto params_org = params; // the example can modify the default params
+
  try {
- if (!gpt_params_parse_ex(argc, argv, params)) {
- gpt_params_print_usage(argc, argv, gpt_params());
- exit(0);
+ if (!gpt_params_parse_ex(argc, argv, params) || params.usage) {
+ params = params_org;
+ params.usage = true;
+ return false;
  }
- }
- catch (const std::invalid_argument & ex) {
+ } catch (const std::invalid_argument & ex) {
  fprintf(stderr, "%s\n", ex.what());
- gpt_params_print_usage(argc, argv, gpt_params());
- exit(1);
+ params = params_org;
+ return false;
  }
- return result;
+
+ return true;
  }

+ #define CHECK_ARG if (++i >= argc) { invalid_param = true; return true; }
+
  bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param) {
+ const char split_delim = ',';
+
  llama_sampling_params & sparams = params.sparams;

  if (arg == "-s" || arg == "--seed") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
- // This is temporary, in the future the samplign state will be moved fully to llama_sampling_context.
+ CHECK_ARG
+ // TODO: this is temporary, in the future the sampling state will be moved fully to llama_sampling_context.
  params.seed = std::stoul(argv[i]);
  sparams.seed = std::stoul(argv[i]);
  return true;
  }
  if (arg == "-t" || arg == "--threads") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
  params.n_threads = std::stoi(argv[i]);
  if (params.n_threads <= 0) {
  params.n_threads = std::thread::hardware_concurrency();
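As the hunk above shows, gpt_params_parse() no longer prints usage and calls exit() on bad input: it restores the untouched defaults, sets params.usage, and returns false, leaving the decision to the caller. A hedged sketch of what a caller might now do (the error handling shown is illustrative, not taken from this diff):

    #include "common.h"

    int main(int argc, char ** argv) {
        gpt_params params;
        if (!gpt_params_parse(argc, argv, params)) {
            // params has been reset to the defaults and params.usage is set;
            // printing help and choosing the exit code is now up to the caller.
            gpt_params_print_usage(argc, argv, params);
            return 1;
        }
        // ... run the example with the parsed params ...
        return 0;
    }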
@@ -305,10 +308,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
  return true;
  }
  if (arg == "-tb" || arg == "--threads-batch") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
  params.n_threads_batch = std::stoi(argv[i]);
  if (params.n_threads_batch <= 0) {
  params.n_threads_batch = std::thread::hardware_concurrency();
@@ -316,10 +316,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
  return true;
  }
  if (arg == "-td" || arg == "--threads-draft") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
  params.n_threads_draft = std::stoi(argv[i]);
  if (params.n_threads_draft <= 0) {
  params.n_threads_draft = std::thread::hardware_concurrency();
@@ -327,10 +324,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
  return true;
  }
  if (arg == "-tbd" || arg == "--threads-batch-draft") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
  params.n_threads_batch_draft = std::stoi(argv[i]);
  if (params.n_threads_batch_draft <= 0) {
  params.n_threads_batch_draft = std::thread::hardware_concurrency();
@@ -338,10 +332,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
  return true;
  }
  if (arg == "-p" || arg == "--prompt") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
  params.prompt = argv[i];
  return true;
  }
@@ -349,11 +340,12 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
  params.escape = true;
  return true;
  }
+ if (arg == "--no-escape") {
+ params.escape = false;
+ return true;
+ }
  if (arg == "--prompt-cache") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
  params.path_prompt_cache = argv[i];
  return true;
  }
@@ -366,10 +358,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
  return true;
  }
  if (arg == "-bf" || arg == "--binary-file") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
  std::ifstream file(argv[i], std::ios::binary);
  if (!file) {
  fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
@@ -385,10 +374,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
  return true;
  }
  if (arg == "-f" || arg == "--file") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
  std::ifstream file(argv[i]);
  if (!file) {
  fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
@@ -403,67 +389,54 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
  }
  return true;
  }
- if (arg == "-n" || arg == "--n-predict") {
- if (++i >= argc) {
+ if (arg == "--in-file") {
+ CHECK_ARG
+ std::ifstream file(argv[i]);
+ if (!file) {
+ fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
  invalid_param = true;
  return true;
  }
+ params.in_files.push_back(argv[i]);
+ return true;
+ }
+ if (arg == "-n" || arg == "--predict" || arg == "--n-predict") {
+ CHECK_ARG
  params.n_predict = std::stoi(argv[i]);
  return true;
  }
  if (arg == "--top-k") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
  sparams.top_k = std::stoi(argv[i]);
  return true;
  }
  if (arg == "-c" || arg == "--ctx-size") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
  params.n_ctx = std::stoi(argv[i]);
  return true;
  }
  if (arg == "--grp-attn-n" || arg == "-gan") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
  params.grp_attn_n = std::stoi(argv[i]);
  return true;
  }
  if (arg == "--grp-attn-w" || arg == "-gaw") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
  params.grp_attn_w = std::stoi(argv[i]);
  return true;
  }
  if (arg == "--rope-freq-base") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
  params.rope_freq_base = std::stof(argv[i]);
  return true;
  }
  if (arg == "--rope-freq-scale") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
  params.rope_freq_scale = std::stof(argv[i]);
  return true;
  }
  if (arg == "--rope-scaling") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
  std::string value(argv[i]);
  /**/ if (value == "none") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_NONE; }
  else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_LINEAR; }
@@ -472,217 +445,148 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
  return true;
  }
  if (arg == "--rope-scale") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
  params.rope_freq_scale = 1.0f / std::stof(argv[i]);
  return true;
  }
  if (arg == "--yarn-orig-ctx") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
  params.yarn_orig_ctx = std::stoi(argv[i]);
  return true;
  }
  if (arg == "--yarn-ext-factor") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
  params.yarn_ext_factor = std::stof(argv[i]);
  return true;
  }
  if (arg == "--yarn-attn-factor") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
  params.yarn_attn_factor = std::stof(argv[i]);
  return true;
  }
  if (arg == "--yarn-beta-fast") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
  params.yarn_beta_fast = std::stof(argv[i]);
  return true;
  }
  if (arg == "--yarn-beta-slow") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
  params.yarn_beta_slow = std::stof(argv[i]);
  return true;
  }
  if (arg == "--pooling") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
  std::string value(argv[i]);
  /**/ if (value == "none") { params.pooling_type = LLAMA_POOLING_TYPE_NONE; }
  else if (value == "mean") { params.pooling_type = LLAMA_POOLING_TYPE_MEAN; }
  else if (value == "cls") { params.pooling_type = LLAMA_POOLING_TYPE_CLS; }
+ else if (value == "last") { params.pooling_type = LLAMA_POOLING_TYPE_LAST; }
+ else { invalid_param = true; }
+ return true;
+ }
+ if (arg == "--attention") {
+ CHECK_ARG
+ std::string value(argv[i]);
+ /**/ if (value == "causal") { params.attention_type = LLAMA_ATTENTION_TYPE_CAUSAL; }
+ else if (value == "non-causal") { params.attention_type = LLAMA_ATTENTION_TYPE_NON_CAUSAL; }
  else { invalid_param = true; }
  return true;
  }
  if (arg == "--defrag-thold" || arg == "-dt") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
  params.defrag_thold = std::stof(argv[i]);
  return true;
  }
  if (arg == "--samplers") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
  const auto sampler_names = string_split(argv[i], ';');
  sparams.samplers_sequence = llama_sampling_types_from_names(sampler_names, true);
  return true;
  }
  if (arg == "--sampling-seq") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
  sparams.samplers_sequence = llama_sampling_types_from_chars(argv[i]);
  return true;
  }
  if (arg == "--top-p") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
  sparams.top_p = std::stof(argv[i]);
  return true;
  }
  if (arg == "--min-p") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
  sparams.min_p = std::stof(argv[i]);
  return true;
  }
  if (arg == "--temp") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
  sparams.temp = std::stof(argv[i]);
  sparams.temp = std::max(sparams.temp, 0.0f);
  return true;
  }
  if (arg == "--tfs") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
  sparams.tfs_z = std::stof(argv[i]);
  return true;
  }
  if (arg == "--typical") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
  sparams.typical_p = std::stof(argv[i]);
  return true;
  }
  if (arg == "--repeat-last-n") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
  sparams.penalty_last_n = std::stoi(argv[i]);
  sparams.n_prev = std::max(sparams.n_prev, sparams.penalty_last_n);
  return true;
  }
  if (arg == "--repeat-penalty") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
  sparams.penalty_repeat = std::stof(argv[i]);
  return true;
  }
  if (arg == "--frequency-penalty") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
  sparams.penalty_freq = std::stof(argv[i]);
  return true;
  }
  if (arg == "--presence-penalty") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
  sparams.penalty_present = std::stof(argv[i]);
  return true;
  }
  if (arg == "--dynatemp-range") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
  sparams.dynatemp_range = std::stof(argv[i]);
  return true;
  }
  if (arg == "--dynatemp-exp") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
  sparams.dynatemp_exponent = std::stof(argv[i]);
  return true;
  }
  if (arg == "--mirostat") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
  sparams.mirostat = std::stoi(argv[i]);
  return true;
  }
  if (arg == "--mirostat-lr") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
  sparams.mirostat_eta = std::stof(argv[i]);
  return true;
  }
  if (arg == "--mirostat-ent") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
  sparams.mirostat_tau = std::stof(argv[i]);
  return true;
  }
  if (arg == "--cfg-negative-prompt") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
  sparams.cfg_negative_prompt = argv[i];
  return true;
  }
  if (arg == "--cfg-negative-prompt-file") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
  std::ifstream file(argv[i]);
  if (!file) {
  fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
@@ -696,203 +600,126 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
  return true;
  }
  if (arg == "--cfg-scale") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
  sparams.cfg_scale = std::stof(argv[i]);
  return true;
  }
  if (arg == "-b" || arg == "--batch-size") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
  params.n_batch = std::stoi(argv[i]);
  return true;
  }
  if (arg == "-ub" || arg == "--ubatch-size") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
  params.n_ubatch = std::stoi(argv[i]);
  return true;
  }
  if (arg == "--keep") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
  params.n_keep = std::stoi(argv[i]);
  return true;
  }
  if (arg == "--draft") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
  params.n_draft = std::stoi(argv[i]);
  return true;
  }
  if (arg == "--chunks") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
  params.n_chunks = std::stoi(argv[i]);
  return true;
  }
  if (arg == "-np" || arg == "--parallel") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
  params.n_parallel = std::stoi(argv[i]);
  return true;
  }
  if (arg == "-ns" || arg == "--sequences") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
  params.n_sequences = std::stoi(argv[i]);
  return true;
  }
  if (arg == "--p-split" || arg == "-ps") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
  params.p_split = std::stof(argv[i]);
  return true;
  }
  if (arg == "-m" || arg == "--model") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
  params.model = argv[i];
  return true;
  }
  if (arg == "-md" || arg == "--model-draft") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
  params.model_draft = argv[i];
  return true;
  }
  if (arg == "-a" || arg == "--alias") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
  params.model_alias = argv[i];
  return true;
  }
  if (arg == "-mu" || arg == "--model-url") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
  params.model_url = argv[i];
  return true;
  }
- if (arg == "-hfr" || arg == "--hf-repo") {
+ if (arg == "-hft" || arg == "--hf-token") {
  if (++i >= argc) {
- invalid_param = true;
- return true;
+ invalid_param = true;
+ return true;
  }
+ params.hf_token = argv[i];
+ return true;
+ }
+ if (arg == "-hfr" || arg == "--hf-repo") {
+ CHECK_ARG
  params.hf_repo = argv[i];
  return true;
  }
  if (arg == "-hff" || arg == "--hf-file") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
  params.hf_file = argv[i];
  return true;
  }
  if (arg == "--lora") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
  params.lora_adapter.emplace_back(argv[i], 1.0f);
- params.use_mmap = false;
  return true;
  }
  if (arg == "--lora-scaled") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
  const char* lora_adapter = argv[i];
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
  params.lora_adapter.emplace_back(lora_adapter, std::stof(argv[i]));
- params.use_mmap = false;
- return true;
- }
- if (arg == "--lora-base") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
- params.lora_base = argv[i];
  return true;
  }
  if (arg == "--control-vector") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
  params.control_vectors.push_back({ 1.0f, argv[i], });
  return true;
  }
  if (arg == "--control-vector-scaled") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
  const char* fname = argv[i];
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
  params.control_vectors.push_back({ std::stof(argv[i]), fname, });
  return true;
  }
  if (arg == "--control-vector-layer-range") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
  params.control_vector_layer_start = std::stoi(argv[i]);
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
  params.control_vector_layer_end = std::stoi(argv[i]);
  return true;
  }
  if (arg == "--mmproj") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
  params.mmproj = argv[i];
  return true;
  }
  if (arg == "--image") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
  params.image.emplace_back(argv[i]);
  return true;
  }
@@ -900,32 +727,35 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
  params.interactive = true;
  return true;
  }
- if (arg == "--interactive-specials") {
- params.interactive_specials = true;
- return true;
- }
- if (arg == "--special") {
+ if (arg == "-sp" || arg == "--special") {
  params.special = true;
  return true;
  }
- if (arg == "--embedding") {
+ if (arg == "--embedding" || arg == "--embeddings") {
  params.embedding = true;
  return true;
  }
- if (arg == "--interactive-first") {
- params.interactive_first = true;
+ if (arg == "--embd-normalize") {
+ CHECK_ARG
+ params.embd_normalize = std::stoi(argv[i]);
  return true;
  }
- if (arg == "-ins" || arg == "--instruct") {
- params.instruct = true;
+ if (arg == "--embd-output-format") {
+ CHECK_ARG
+ params.embd_out = argv[i];
  return true;
  }
- if (arg == "-cnv" || arg == "--conversation") {
- params.conversation = true;
+ if (arg == "--embd-separator") {
+ CHECK_ARG
+ params.embd_sep = argv[i];
  return true;
  }
- if (arg == "-cml" || arg == "--chatml") {
- params.chatml = true;
+ if (arg == "-if" || arg == "--interactive-first") {
+ params.interactive_first = true;
+ return true;
+ }
+ if (arg == "-cnv" || arg == "--conversation") {
+ params.conversation = true;
  return true;
  }
  if (arg == "--infill") {
@@ -948,7 +778,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
  params.cache_type_v = argv[++i];
  return true;
  }
- if (arg == "--multiline-input") {
+ if (arg == "-mli" || arg == "--multiline-input") {
  params.multiline_input = true;
  return true;
  }
@@ -960,11 +790,15 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
  params.cont_batching = true;
  return true;
  }
+ if (arg == "-nocb" || arg == "--no-cont-batching") {
+ params.cont_batching = false;
+ return true;
+ }
  if (arg == "-fa" || arg == "--flash-attn") {
  params.flash_attn = true;
  return true;
  }
- if (arg == "--color") {
+ if (arg == "-co" || arg == "--color") {
  params.use_color = true;
  return true;
  }
@@ -972,46 +806,34 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
  params.use_mlock = true;
  return true;
  }
- if (arg == "--gpu-layers" || arg == "-ngl" || arg == "--n-gpu-layers") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ if (arg == "-ngl" || arg == "--gpu-layers" || arg == "--n-gpu-layers") {
+ CHECK_ARG
  params.n_gpu_layers = std::stoi(argv[i]);
  if (!llama_supports_gpu_offload()) {
- fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n");
+ fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers option will be ignored\n");
  fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
  }
  return true;
  }
- if (arg == "--gpu-layers-draft" || arg == "-ngld" || arg == "--n-gpu-layers-draft") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ if (arg == "-ngld" || arg == "--gpu-layers-draft" || arg == "--n-gpu-layers-draft") {
+ CHECK_ARG
  params.n_gpu_layers_draft = std::stoi(argv[i]);
  if (!llama_supports_gpu_offload()) {
- fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers-draft option will be ignored\n");
+ fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers-draft option will be ignored\n");
  fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
  }
  return true;
  }
  if (arg == "--main-gpu" || arg == "-mg") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
  params.main_gpu = std::stoi(argv[i]);
- #ifndef GGML_USE_CUDA_SYCL
- fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL. Setting the main GPU has no effect.\n");
- #endif // GGML_USE_CUDA_SYCL
+ #ifndef GGML_USE_CUDA_SYCL_VULKAN
+ fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL/Vulkan. Setting the main GPU has no effect.\n");
+ #endif // GGML_USE_CUDA_SYCL_VULKAN
  return true;
  }
  if (arg == "--split-mode" || arg == "-sm") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
  std::string arg_next = argv[i];
  if (arg_next == "none") {
  params.split_mode = LLAMA_SPLIT_MODE_NONE;
@@ -1030,16 +852,13 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
  invalid_param = true;
  return true;
  }
- #ifndef GGML_USE_CUDA_SYCL
- fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL. Setting the split mode has no effect.\n");
- #endif // GGML_USE_CUDA_SYCL
+ #ifndef GGML_USE_CUDA_SYCL_VULKAN
+ fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL/Vulkan. Setting the split mode has no effect.\n");
+ #endif // GGML_USE_CUDA_SYCL_VULKAN
  return true;
  }
  if (arg == "--tensor-split" || arg == "-ts") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
  std::string arg_next = argv[i];

  // split string by , and /
@@ -1064,10 +883,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
  return true;
  }
  if (arg == "--rpc") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
  params.rpc_servers = argv[i];
  return true;
  }
@@ -1076,10 +892,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
  return true;
  }
  if (arg == "--numa") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
  std::string value(argv[i]);
  /**/ if (value == "distribute" || value == "") { params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; }
  else if (value == "isolate") { params.numa = GGML_NUMA_STRATEGY_ISOLATE; }
@@ -1087,6 +900,15 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
  else { invalid_param = true; }
  return true;
  }
+ if (arg == "-v" || arg == "--verbose") {
+ params.verbosity = 1;
+ return true;
+ }
+ if (arg == "--verbosity") {
+ CHECK_ARG
+ params.verbosity = std::stoi(argv[i]);
+ return true;
+ }
  if (arg == "--verbose-prompt") {
  params.verbose_prompt = true;
  return true;
@@ -1096,18 +918,12 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
  return true;
  }
  if (arg == "-r" || arg == "--reverse-prompt") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
  params.antiprompt.emplace_back(argv[i]);
  return true;
  }
  if (arg == "-ld" || arg == "--logdir") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
  params.logdir = argv[i];

  if (params.logdir.back() != DIRECTORY_SEPARATOR) {
@@ -1116,209 +932,400 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
1116
932
  return true;
1117
933
  }
1118
934
  if (arg == "-lcs" || arg == "--lookup-cache-static") {
1119
- if (++i >= argc) {
935
+ CHECK_ARG
936
+ params.lookup_cache_static = argv[i];
937
+ return true;
938
+ }
939
+ if (arg == "-lcd" || arg == "--lookup-cache-dynamic") {
940
+ CHECK_ARG
941
+ params.lookup_cache_dynamic = argv[i];
942
+ return true;
943
+ }
944
+ if (arg == "--save-all-logits" || arg == "--kl-divergence-base") {
945
+ CHECK_ARG
946
+ params.logits_file = argv[i];
947
+ return true;
948
+ }
949
+ if (arg == "--perplexity" || arg == "--all-logits") {
950
+ params.logits_all = true;
951
+ return true;
952
+ }
953
+ if (arg == "--ppl-stride") {
954
+ CHECK_ARG
955
+ params.ppl_stride = std::stoi(argv[i]);
956
+ return true;
957
+ }
958
+ if (arg == "--ppl-output-type") {
959
+ CHECK_ARG
960
+ params.ppl_output_type = std::stoi(argv[i]);
961
+ return true;
962
+ }
963
+ if (arg == "-ptc" || arg == "--print-token-count") {
964
+ CHECK_ARG
965
+ params.n_print = std::stoi(argv[i]);
966
+ return true;
967
+ }
968
+ if (arg == "--check-tensors") {
969
+ params.check_tensors = true;
970
+ return true;
971
+ }
972
+ if (arg == "--hellaswag") {
973
+ params.hellaswag = true;
974
+ return true;
975
+ }
976
+ if (arg == "--hellaswag-tasks") {
977
+ CHECK_ARG
978
+ params.hellaswag_tasks = std::stoi(argv[i]);
979
+ return true;
980
+ }
981
+ if (arg == "--winogrande") {
982
+ params.winogrande = true;
983
+ return true;
984
+ }
985
+ if (arg == "--winogrande-tasks") {
986
+ CHECK_ARG
987
+ params.winogrande_tasks = std::stoi(argv[i]);
988
+ return true;
989
+ }
990
+ if (arg == "--multiple-choice") {
991
+ params.multiple_choice = true;
992
+ return true;
993
+ }
994
+ if (arg == "--multiple-choice-tasks") {
995
+ CHECK_ARG
996
+ params.multiple_choice_tasks = std::stoi(argv[i]);
997
+ return true;
998
+ }
999
+ if (arg == "--kl-divergence") {
1000
+ params.kl_divergence = true;
1001
+ return true;
1002
+ }
1003
+ if (arg == "--ignore-eos") {
1004
+ params.ignore_eos = true;
1005
+ return true;
1006
+ }
1007
+ if (arg == "--penalize-nl") {
1008
+ sparams.penalize_nl = true;
1009
+ return true;
1010
+ }
1011
+ if (arg == "-l" || arg == "--logit-bias") {
1012
+ CHECK_ARG
1013
+ std::stringstream ss(argv[i]);
1014
+ llama_token key;
1015
+ char sign;
1016
+ std::string value_str;
1017
+ try {
1018
+ if (ss >> key && ss >> sign && std::getline(ss, value_str) && (sign == '+' || sign == '-')) {
1019
+ sparams.logit_bias[key] = std::stof(value_str) * ((sign == '-') ? -1.0f : 1.0f);
1020
+ }
1021
+ else {
1022
+ throw std::exception();
1023
+ }
1024
+ }
1025
+ catch (const std::exception&) {
1120
1026
  invalid_param = true;
1121
1027
  return true;
1122
1028
  }
1123
- params.lookup_cache_static = argv[i];
1124
1029
  return true;
1125
1030
  }
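The -l / --logit-bias handler above parses values of the form TOKEN_ID(+/-)BIAS with a stringstream: the integer token id is read first, then the sign character, and the remainder of the string becomes the bias magnitude. A minimal standalone sketch of the same parse, using plain int in place of llama_token (which is an integer type):

    #include <cstdio>
    #include <sstream>
    #include <string>

    int main() {
        std::stringstream ss("15043+1");   // as passed via --logit-bias 15043+1
        int         key;                   // token id (llama_token in the real code)
        char        sign;                  // '+' or '-'
        std::string value_str;             // bias magnitude as text
        if (ss >> key && ss >> sign && std::getline(ss, value_str) && (sign == '+' || sign == '-')) {
            const float bias = std::stof(value_str) * ((sign == '-') ? -1.0f : 1.0f);
            std::printf("token %d -> bias %+.1f\n", key, bias);   // prints: token 15043 -> bias +1.0
        }
        return 0;
    }

Anything that does not match this shape (missing sign, non-numeric magnitude) throws or falls into the else branch, lands in the catch block, and flags invalid_param.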
1126
- if (arg == "-lcd" || arg == "--lookup-cache-dynamic") {
1127
- if (++i >= argc) {
1031
+ if (arg == "-h" || arg == "--help" || arg == "--usage" ) {
1032
+ params.usage = true;
1033
+ return true;
1034
+ }
1035
+ if (arg == "--version") {
1036
+ fprintf(stderr, "version: %d (%s)\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT);
1037
+ fprintf(stderr, "built with %s for %s\n", LLAMA_COMPILER, LLAMA_BUILD_TARGET);
1038
+ exit(0);
1039
+ }
1040
+ if (arg == "--in-prefix-bos") {
1041
+ params.input_prefix_bos = true;
1042
+ params.enable_chat_template = false;
1043
+ return true;
1044
+ }
1045
+ if (arg == "--in-prefix") {
1046
+ CHECK_ARG
1047
+ params.input_prefix = argv[i];
1048
+ params.enable_chat_template = false;
1049
+ return true;
1050
+ }
1051
+ if (arg == "--in-suffix") {
1052
+ CHECK_ARG
1053
+ params.input_suffix = argv[i];
1054
+ params.enable_chat_template = false;
1055
+ return true;
1056
+ }
1057
+ if (arg == "--spm-infill") {
1058
+ params.spm_infill = true;
1059
+ return true;
1060
+ }
1061
+ if (arg == "--grammar") {
1062
+ CHECK_ARG
1063
+ sparams.grammar = argv[i];
1064
+ return true;
1065
+ }
1066
+ if (arg == "--grammar-file") {
1067
+ CHECK_ARG
1068
+ std::ifstream file(argv[i]);
1069
+ if (!file) {
1070
+ fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
1128
1071
  invalid_param = true;
1129
1072
  return true;
1130
1073
  }
1131
- params.lookup_cache_dynamic = argv[i];
1074
+ std::copy(
1075
+ std::istreambuf_iterator<char>(file),
1076
+ std::istreambuf_iterator<char>(),
1077
+ std::back_inserter(sparams.grammar)
1078
+ );
1132
1079
  return true;
1133
1080
  }
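--grammar-file slurps the whole file into sparams.grammar by copying from an istreambuf_iterator into a back_inserter; the same whole-file read pattern appears again below for -spf / --system-prompt-file. A self-contained sketch of that idiom (the helper name is illustrative only):

    #include <algorithm>
    #include <fstream>
    #include <iterator>
    #include <string>

    // Read an entire file into a string; returns an empty string if the file
    // cannot be opened (the real code flags invalid_param instead).
    static std::string slurp_file(const std::string & path) {
        std::ifstream file(path);
        std::string   out;
        if (file) {
            std::copy(std::istreambuf_iterator<char>(file),
                      std::istreambuf_iterator<char>(),
                      std::back_inserter(out));
        }
        return out;
    }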
1134
- if (arg == "--save-all-logits" || arg == "--kl-divergence-base") {
1135
- if (++i >= argc) {
1081
+ if (arg == "-j" || arg == "--json-schema") {
1082
+ CHECK_ARG
1083
+ sparams.grammar = json_schema_to_grammar(json::parse(argv[i]));
1084
+ return true;
1085
+ }
1086
+ if (arg == "--override-kv") {
1087
+ CHECK_ARG
1088
+ if (!string_parse_kv_override(argv[i], params.kv_overrides)) {
1089
+ fprintf(stderr, "error: Invalid type for KV override: %s\n", argv[i]);
1136
1090
  invalid_param = true;
1137
1091
  return true;
1138
1092
  }
1139
- params.logits_file = argv[i];
1140
1093
  return true;
1141
1094
  }
1142
- if (arg == "--perplexity" || arg == "--all-logits") {
1143
- params.logits_all = true;
1095
+ if (arg == "--host") {
1096
+ CHECK_ARG
1097
+ params.hostname = argv[i];
1144
1098
  return true;
1145
1099
  }
1146
- if (arg == "--ppl-stride") {
1147
- if (++i >= argc) {
1100
+ if (arg == "--port") {
1101
+ CHECK_ARG
1102
+ params.port = std::stoi(argv[i]);
1103
+ return true;
1104
+ }
1105
+ if (arg == "--path") {
1106
+ CHECK_ARG
1107
+ params.public_path = argv[i];
1108
+ return true;
1109
+ }
1110
+ if (arg == "--api-key") {
1111
+ CHECK_ARG
1112
+ params.api_keys.push_back(argv[i]);
1113
+ return true;
1114
+ }
1115
+ if (arg == "--api-key-file") {
1116
+ CHECK_ARG
1117
+ std::ifstream key_file(argv[i]);
1118
+ if (!key_file) {
1119
+ fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
1148
1120
  invalid_param = true;
1149
1121
  return true;
1150
1122
  }
1151
- params.ppl_stride = std::stoi(argv[i]);
1123
+ std::string key;
1124
+ while (std::getline(key_file, key)) {
1125
+ if (!key.empty()) {
1126
+ params.api_keys.push_back(key);
1127
+ }
1128
+ }
1129
+ key_file.close();
1130
+ return true;
1131
+ }
1132
+ if (arg == "--ssl-key-file") {
1133
+ CHECK_ARG
1134
+ params.ssl_file_key = argv[i];
1135
+ return true;
1136
+ }
1137
+ if (arg == "--ssl-cert-file") {
1138
+ CHECK_ARG
1139
+ params.ssl_file_cert = argv[i];
1140
+ return true;
1141
+ }
1142
+ if (arg == "--timeout" || arg == "-to") {
1143
+ CHECK_ARG
1144
+ params.timeout_read = std::stoi(argv[i]);
1145
+ params.timeout_write = std::stoi(argv[i]);
1146
+ return true;
1147
+ }
1148
+ if (arg == "--threads-http") {
1149
+ CHECK_ARG
1150
+ params.n_threads_http = std::stoi(argv[i]);
1151
+ return true;
1152
+ }
1153
+ if (arg == "-spf" || arg == "--system-prompt-file") {
1154
+ CHECK_ARG
1155
+ std::ifstream file(argv[i]);
1156
+ if (!file) {
1157
+ fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
1158
+ invalid_param = true;
1159
+ return true;
1160
+ }
1161
+ std::string system_prompt;
1162
+ std::copy(
1163
+ std::istreambuf_iterator<char>(file),
1164
+ std::istreambuf_iterator<char>(),
1165
+ std::back_inserter(system_prompt)
1166
+ );
1167
+ params.system_prompt = system_prompt;
1168
+ return true;
1169
+ }
1170
+ if (arg == "--log-format") {
1171
+ CHECK_ARG
1172
+ if (std::strcmp(argv[i], "json") == 0) {
1173
+ params.log_json = true;
1174
+ } else if (std::strcmp(argv[i], "text") == 0) {
1175
+ params.log_json = false;
1176
+ } else {
1177
+ invalid_param = true;
1178
+ return true;
1179
+ }
1180
+ return true;
1181
+ }
1182
+ if (arg == "--no-slots") {
1183
+ params.endpoint_slots = false;
1184
+ return true;
1185
+ }
1186
+ if (arg == "--metrics") {
1187
+ params.endpoint_metrics = true;
1188
+ return true;
1189
+ }
1190
+ if (arg == "--slot-save-path") {
1191
+ CHECK_ARG
1192
+ params.slot_save_path = argv[i];
1193
+ // if doesn't end with DIRECTORY_SEPARATOR, add it
1194
+ if (!params.slot_save_path.empty() && params.slot_save_path[params.slot_save_path.size() - 1] != DIRECTORY_SEPARATOR) {
1195
+ params.slot_save_path += DIRECTORY_SEPARATOR;
1196
+ }
1152
1197
  return true;
1153
1198
  }
1154
- if (arg == "-ptc" || arg == "--print-token-count") {
1155
- if (++i >= argc) {
1199
+ if (arg == "--chat-template") {
1200
+ CHECK_ARG
1201
+ if (!llama_chat_verify_template(argv[i])) {
1202
+ fprintf(stderr, "error: the supplied chat template is not supported: %s\n", argv[i]);
1203
+ fprintf(stderr, "note: llama.cpp does not use jinja parser, we only support commonly used templates\n");
1156
1204
  invalid_param = true;
1157
1205
  return true;
1158
1206
  }
1159
- params.n_print = std::stoi(argv[i]);
1207
+ params.chat_template = argv[i];
1160
1208
  return true;
1161
1209
  }
1162
- if (arg == "--check-tensors") {
1163
- params.check_tensors = true;
1210
+ if (arg == "--slot-prompt-similarity" || arg == "-sps") {
1211
+ CHECK_ARG
1212
+ params.slot_prompt_similarity = std::stof(argv[i]);
1164
1213
  return true;
1165
1214
  }
1166
- if (arg == "--ppl-output-type") {
1167
- if (++i >= argc) {
1168
- invalid_param = true;
1169
- return true;
1170
- }
1171
- params.ppl_output_type = std::stoi(argv[i]);
1215
+ if (arg == "-pps") {
1216
+ params.is_pp_shared = true;
1172
1217
  return true;
1173
1218
  }
1174
- if (arg == "--hellaswag") {
1175
- params.hellaswag = true;
1219
+ if (arg == "-npp") {
1220
+ CHECK_ARG
1221
+ auto p = string_split<int>(argv[i], split_delim);
1222
+ params.n_pp.insert(params.n_pp.end(), p.begin(), p.end());
1176
1223
  return true;
1177
1224
  }
1178
- if (arg == "--hellaswag-tasks") {
1179
- if (++i >= argc) {
1180
- invalid_param = true;
1181
- return true;
1182
- }
1183
- params.hellaswag_tasks = std::stoi(argv[i]);
1225
+ if (arg == "-ntg") {
1226
+ CHECK_ARG
1227
+ auto p = string_split<int>(argv[i], split_delim);
1228
+ params.n_tg.insert(params.n_tg.end(), p.begin(), p.end());
1184
1229
  return true;
1185
1230
  }
1186
- if (arg == "--winogrande") {
1187
- params.winogrande = true;
1231
+ if (arg == "-npl") {
1232
+ CHECK_ARG
1233
+ auto p = string_split<int>(argv[i], split_delim);
1234
+ params.n_pl.insert(params.n_pl.end(), p.begin(), p.end());
1188
1235
  return true;
1189
1236
  }
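-npp, -ntg and -npl accept comma-separated integer lists and append them to params.n_pp / n_tg / n_pl via string_split<int>(argv[i], split_delim); neither the helper nor split_delim is defined in this hunk. A rough stand-in for what such a split amounts to, assuming split_delim is ',':

    #include <sstream>
    #include <string>
    #include <vector>

    // Illustrative stand-in for string_split<int>: "128,256,512" -> {128, 256, 512}.
    static std::vector<int> split_ints(const std::string & s, char delim = ',') {
        std::vector<int> out;
        std::stringstream ss(s);
        std::string item;
        while (std::getline(ss, item, delim)) {
            out.push_back(std::stoi(item));
        }
        return out;
    }

Because each occurrence appends to the same vector, "-npp 128,256 -npp 512" accumulates {128, 256, 512}.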
1190
- if (arg == "--winogrande-tasks") {
1191
- if (++i >= argc) {
1237
+ if (arg == "--context-file") {
1238
+ CHECK_ARG
1239
+ std::ifstream file(argv[i], std::ios::binary);
1240
+ if (!file) {
1241
+ fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
1192
1242
  invalid_param = true;
1193
1243
  return true;
1194
1244
  }
1195
- params.winogrande_tasks = std::stoi(argv[i]);
1245
+ params.context_files.push_back(argv[i]);
1196
1246
  return true;
1197
1247
  }
1198
- if (arg == "--multiple-choice") {
1199
- params.multiple_choice = true;
1248
+ if (arg == "--chunk-size") {
1249
+ CHECK_ARG
1250
+ params.chunk_size = std::stoi(argv[i]);
1200
1251
  return true;
1201
1252
  }
1202
- if (arg == "--multiple-choice-tasks") {
1203
- if (++i >= argc) {
1204
- invalid_param = true;
1205
- return true;
1206
- }
1207
- params.multiple_choice_tasks = std::stoi(argv[i]);
1253
+ if (arg == "--chunk-separator") {
1254
+ CHECK_ARG
1255
+ params.chunk_separator = argv[i];
1208
1256
  return true;
1209
1257
  }
1210
- if (arg == "--kl-divergence") {
1211
- params.kl_divergence = true;
1258
+ if (arg == "--junk") {
1259
+ CHECK_ARG
1260
+ params.n_junk = std::stoi(argv[i]);
1212
1261
  return true;
1213
1262
  }
1214
- if (arg == "--ignore-eos") {
1215
- params.ignore_eos = true;
1263
+ if (arg == "--pos") {
1264
+ CHECK_ARG
1265
+ params.i_pos = std::stoi(argv[i]);
1216
1266
  return true;
1217
1267
  }
1218
- if (arg == "--penalize-nl") {
1219
- sparams.penalize_nl = true;
1268
+ if (arg == "-o" || arg == "--output" || arg == "--output-file") {
1269
+ CHECK_ARG
1270
+ params.out_file = argv[i];
1271
+ params.cvector_outfile = argv[i];
1272
+ params.lora_outfile = argv[i];
1220
1273
  return true;
1221
1274
  }
1222
- if (arg == "-l" || arg == "--logit-bias") {
1223
- if (++i >= argc) {
1224
- invalid_param = true;
1225
- return true;
1226
- }
1227
- std::stringstream ss(argv[i]);
1228
- llama_token key;
1229
- char sign;
1230
- std::string value_str;
1231
- try {
1232
- if (ss >> key && ss >> sign && std::getline(ss, value_str) && (sign == '+' || sign == '-')) {
1233
- sparams.logit_bias[key] = std::stof(value_str) * ((sign == '-') ? -1.0f : 1.0f);
1234
- }
1235
- else {
1236
- throw std::exception();
1237
- }
1238
- }
1239
- catch (const std::exception&) {
1240
- invalid_param = true;
1241
- return true;
1242
- }
1275
+ if (arg == "-ofreq" || arg == "--output-frequency") {
1276
+ CHECK_ARG
1277
+ params.n_out_freq = std::stoi(argv[i]);
1243
1278
  return true;
1244
1279
  }
1245
- if (arg == "-h" || arg == "--help") {
1246
- gpt_params_print_usage(argc, argv, gpt_params());
1247
- exit(0);
1280
+ if (arg == "--save-frequency") {
1281
+ CHECK_ARG
1282
+ params.n_save_freq = std::stoi(argv[i]);
1283
+ return true;
1248
1284
  }
1249
- if (arg == "--version") {
1250
- fprintf(stderr, "version: %d (%s)\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT);
1251
- fprintf(stderr, "built with %s for %s\n", LLAMA_COMPILER, LLAMA_BUILD_TARGET);
1252
- exit(0);
1285
+ if (arg == "--process-output") {
1286
+ params.process_output = true;
1287
+ return true;
1253
1288
  }
1254
- if (arg == "--random-prompt") {
1255
- params.random_prompt = true;
1289
+ if (arg == "--no-ppl") {
1290
+ params.compute_ppl = false;
1256
1291
  return true;
1257
1292
  }
1258
- if (arg == "--in-prefix-bos") {
1259
- params.input_prefix_bos = true;
1293
+ if (arg == "--chunk" || arg == "--from-chunk") {
1294
+ CHECK_ARG
1295
+ params.i_chunk = std::stoi(argv[i]);
1260
1296
  return true;
1261
1297
  }
1262
- if (arg == "--in-prefix") {
1263
- if (++i >= argc) {
1264
- invalid_param = true;
1265
- return true;
1266
- }
1267
- params.input_prefix = argv[i];
1298
+ // cvector params
1299
+ if (arg == "--positive-file") {
1300
+ CHECK_ARG
1301
+ params.cvector_positive_file = argv[i];
1268
1302
  return true;
1269
1303
  }
1270
- if (arg == "--in-suffix") {
1271
- if (++i >= argc) {
1272
- invalid_param = true;
1273
- return true;
1274
- }
1275
- params.input_suffix = argv[i];
1304
+ if (arg == "--negative-file") {
1305
+ CHECK_ARG
1306
+ params.cvector_negative_file = argv[i];
1276
1307
  return true;
1277
1308
  }
1278
- if (arg == "--grammar") {
1279
- if (++i >= argc) {
1280
- invalid_param = true;
1281
- return true;
1282
- }
1283
- sparams.grammar = argv[i];
1309
+ if (arg == "--pca-batch") {
1310
+ CHECK_ARG
1311
+ params.n_pca_batch = std::stoi(argv[i]);
1284
1312
  return true;
1285
1313
  }
1286
- if (arg == "--grammar-file") {
1287
- if (++i >= argc) {
1288
- invalid_param = true;
1289
- return true;
1290
- }
1291
- std::ifstream file(argv[i]);
1292
- if (!file) {
1293
- fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
1294
- invalid_param = true;
1295
- return true;
1296
- }
1297
- std::copy(
1298
- std::istreambuf_iterator<char>(file),
1299
- std::istreambuf_iterator<char>(),
1300
- std::back_inserter(sparams.grammar)
1301
- );
1314
+ if (arg == "--pca-iter") {
1315
+ CHECK_ARG
1316
+ params.n_pca_iterations = std::stoi(argv[i]);
1302
1317
  return true;
1303
1318
  }
1304
- if (arg == "-j" || arg == "--json-schema") {
1305
- if (++i >= argc) {
1306
- invalid_param = true;
1307
- return true;
1308
- }
1309
- sparams.grammar = json_schema_to_grammar(json::parse(argv[i]));
1319
+ if (arg == "--method") {
1320
+ CHECK_ARG
1321
+ std::string value(argv[i]);
1322
+ /**/ if (value == "pca") { params.cvector_dimre_method = DIMRE_METHOD_PCA; }
1323
+ else if (value == "mean") { params.cvector_dimre_method = DIMRE_METHOD_MEAN; }
1324
+ else { invalid_param = true; }
1310
1325
  return true;
1311
1326
  }
1312
- if (arg == "--override-kv") {
1313
- if (++i >= argc) {
1314
- invalid_param = true;
1315
- return true;
1316
- }
1317
- if (!string_parse_kv_override(argv[i], params.kv_overrides)) {
1318
- fprintf(stderr, "error: Invalid type for KV override: %s\n", argv[i]);
1319
- invalid_param = true;
1320
- return true;
1321
- }
1327
+ if (arg == "--no-warmup") {
1328
+ params.warmup = false;
1322
1329
  return true;
1323
1330
  }
1324
1331
  #ifndef LOG_DISABLE_LOGS
@@ -1332,10 +1339,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
1332
1339
  // We have a matching known parameter requiring an argument,
1333
1340
  // now we need to check if there is anything after this argv
1334
1341
  // and flag invalid_param or parse it.
1335
- if (++i >= argc) {
1336
- invalid_param = true;
1337
- return true;
1338
- }
1342
+ CHECK_ARG
1339
1343
  if (!log_param_pair_parse( /*check_but_dont_parse*/ false, argv[i - 1], argv[i])) {
1340
1344
  invalid_param = true;
1341
1345
  return true;
@@ -1348,6 +1352,16 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
1348
1352
  return false;
1349
1353
  }
1350
1354
 
1355
+ #ifdef __GNUC__
1356
+ #ifdef __MINGW32__
1357
+ #define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
1358
+ #else
1359
+ #define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
1360
+ #endif
1361
+ #else
1362
+ #define LLAMA_COMMON_ATTRIBUTE_FORMAT(...)
1363
+ #endif
1364
+
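LLAMA_COMMON_ATTRIBUTE_FORMAT marks a function's printf-style format parameter so that GCC and Clang can type-check the variadic arguments at compile time (gnu_printf on MinGW, whose default C runtime uses a different printf dialect); on other compilers it expands to nothing. It is applied below as LLAMA_COMMON_ATTRIBUTE_FORMAT(4, 5) on the option_info constructor because, for a non-static member function, the implicit this pointer counts as parameter 1. A minimal free-function sketch of the same idea, with an assumed macro name:

    #include <cstdarg>
    #include <cstdio>

    #ifdef __GNUC__
    #   define FMT_CHECK(fmt_idx, va_idx) __attribute__((format(printf, fmt_idx, va_idx)))
    #else
    #   define FMT_CHECK(fmt_idx, va_idx)
    #endif

    FMT_CHECK(1, 2)
    static void log_msg(const char * fmt, ...);

    static void log_msg(const char * fmt, ...) {
        va_list ap;
        va_start(ap, fmt);
        vfprintf(stderr, fmt, ap);   // forward to stderr
        va_end(ap);
    }

    int main() {
        log_msg("offloading %d layers\n", 32);      // ok: %d matches an int
        // log_msg("offloading %d layers\n", "32"); // -Wformat would flag this mismatch
        return 0;
    }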
1351
1365
  void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
1352
1366
  const llama_sampling_params & sparams = params.sparams;
1353
1367
 
@@ -1359,198 +1373,340 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
1359
1373
  }
1360
1374
  sampler_type_names.pop_back();
1361
1375
 
1362
- printf("\n");
1363
- printf("usage: %s [options]\n", argv[0]);
1364
- printf("\n");
1365
- printf("options:\n");
1366
- printf(" -h, --help show this help message and exit\n");
1367
- printf(" --version show version and build info\n");
1368
- printf(" -i, --interactive run in interactive mode\n");
1369
- printf(" --special special tokens output enabled\n");
1370
- printf(" --interactive-specials allow special tokens in user text, in interactive mode\n");
1371
- printf(" --interactive-first run in interactive mode and wait for input right away\n");
1372
- printf(" -cnv, --conversation run in conversation mode (does not print special tokens and suffix/prefix)\n");
1373
- printf(" -ins, --instruct run in instruction mode (use with Alpaca models)\n");
1374
- printf(" -cml, --chatml run in chatml mode (use with ChatML-compatible models)\n");
1375
- printf(" --multiline-input allows you to write or paste multiple lines without ending each in '\\'\n");
1376
- printf(" -r PROMPT, --reverse-prompt PROMPT\n");
1377
- printf(" halt generation at PROMPT, return control in interactive mode\n");
1378
- printf(" (can be specified more than once for multiple prompts).\n");
1379
- printf(" --color colorise output to distinguish prompt and user input from generations\n");
1380
- printf(" -s SEED, --seed SEED RNG seed (default: -1, use random seed for < 0)\n");
1381
- printf(" -t N, --threads N number of threads to use during generation (default: %d)\n", params.n_threads);
1382
- printf(" -tb N, --threads-batch N\n");
1383
- printf(" number of threads to use during batch and prompt processing (default: same as --threads)\n");
1384
- printf(" -td N, --threads-draft N");
1385
- printf(" number of threads to use during generation (default: same as --threads)\n");
1386
- printf(" -tbd N, --threads-batch-draft N\n");
1387
- printf(" number of threads to use during batch and prompt processing (default: same as --threads-draft)\n");
1388
- printf(" -p PROMPT, --prompt PROMPT\n");
1389
- printf(" prompt to start generation with (default: empty)\n");
1390
- printf(" -e, --escape process prompt escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\)\n");
1391
- printf(" --prompt-cache FNAME file to cache prompt state for faster startup (default: none)\n");
1392
- printf(" --prompt-cache-all if specified, saves user input and generations to cache as well.\n");
1393
- printf(" not supported with --interactive or other interactive options\n");
1394
- printf(" --prompt-cache-ro if specified, uses the prompt cache but does not update it.\n");
1395
- printf(" --random-prompt start with a randomized prompt.\n");
1396
- printf(" --in-prefix-bos prefix BOS to user inputs, preceding the `--in-prefix` string\n");
1397
- printf(" --in-prefix STRING string to prefix user inputs with (default: empty)\n");
1398
- printf(" --in-suffix STRING string to suffix after user inputs with (default: empty)\n");
1399
- printf(" -f FNAME, --file FNAME\n");
1400
- printf(" prompt file to start generation.\n");
1401
- printf(" -bf FNAME, --binary-file FNAME\n");
1402
- printf(" binary file containing multiple choice tasks.\n");
1403
- printf(" -n N, --n-predict N number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)\n", params.n_predict);
1404
- printf(" -c N, --ctx-size N size of the prompt context (default: %d, 0 = loaded from model)\n", params.n_ctx);
1405
- printf(" -b N, --batch-size N logical maximum batch size (default: %d)\n", params.n_batch);
1406
- printf(" -ub N, --ubatch-size N\n");
1407
- printf(" physical maximum batch size (default: %d)\n", params.n_ubatch);
1408
- printf(" --samplers samplers that will be used for generation in the order, separated by \';\'\n");
1409
- printf(" (default: %s)\n", sampler_type_names.c_str());
1410
- printf(" --sampling-seq simplified sequence for samplers that will be used (default: %s)\n", sampler_type_chars.c_str());
1411
- printf(" --top-k N top-k sampling (default: %d, 0 = disabled)\n", sparams.top_k);
1412
- printf(" --top-p N top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)sparams.top_p);
1413
- printf(" --min-p N min-p sampling (default: %.1f, 0.0 = disabled)\n", (double)sparams.min_p);
1414
- printf(" --tfs N tail free sampling, parameter z (default: %.1f, 1.0 = disabled)\n", (double)sparams.tfs_z);
1415
- printf(" --typical N locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)\n", (double)sparams.typical_p);
1416
- printf(" --repeat-last-n N last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)\n", sparams.penalty_last_n);
1417
- printf(" --repeat-penalty N penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)\n", (double)sparams.penalty_repeat);
1418
- printf(" --presence-penalty N repeat alpha presence penalty (default: %.1f, 0.0 = disabled)\n", (double)sparams.penalty_present);
1419
- printf(" --frequency-penalty N repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)\n", (double)sparams.penalty_freq);
1420
- printf(" --dynatemp-range N dynamic temperature range (default: %.1f, 0.0 = disabled)\n", (double)sparams.dynatemp_range);
1421
- printf(" --dynatemp-exp N dynamic temperature exponent (default: %.1f)\n", (double)sparams.dynatemp_exponent);
1422
- printf(" --mirostat N use Mirostat sampling.\n");
1423
- printf(" Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n");
1424
- printf(" (default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)\n", sparams.mirostat);
1425
- printf(" --mirostat-lr N Mirostat learning rate, parameter eta (default: %.1f)\n", (double)sparams.mirostat_eta);
1426
- printf(" --mirostat-ent N Mirostat target entropy, parameter tau (default: %.1f)\n", (double)sparams.mirostat_tau);
1427
- printf(" -l TOKEN_ID(+/-)BIAS, --logit-bias TOKEN_ID(+/-)BIAS\n");
1428
- printf(" modifies the likelihood of token appearing in the completion,\n");
1429
- printf(" i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',\n");
1430
- printf(" or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'\n");
1431
- printf(" --grammar GRAMMAR BNF-like grammar to constrain generations (see samples in grammars/ dir)\n");
1432
- printf(" --grammar-file FNAME file to read grammar from\n");
1433
- printf(" -j SCHEMA, --json-schema SCHEMA\n");
1434
- printf(" JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object.\n");
1435
- printf(" For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead\n");
1436
- printf(" --cfg-negative-prompt PROMPT\n");
1437
- printf(" negative prompt to use for guidance. (default: empty)\n");
1438
- printf(" --cfg-negative-prompt-file FNAME\n");
1439
- printf(" negative prompt file to use for guidance. (default: empty)\n");
1440
- printf(" --cfg-scale N strength of guidance (default: %f, 1.0 = disable)\n", sparams.cfg_scale);
1441
- printf(" --rope-scaling {none,linear,yarn}\n");
1442
- printf(" RoPE frequency scaling method, defaults to linear unless specified by the model\n");
1443
- printf(" --rope-scale N RoPE context scaling factor, expands context by a factor of N\n");
1444
- printf(" --rope-freq-base N RoPE base frequency, used by NTK-aware scaling (default: loaded from model)\n");
1445
- printf(" --rope-freq-scale N RoPE frequency scaling factor, expands context by a factor of 1/N\n");
1446
- printf(" --yarn-orig-ctx N YaRN: original context size of model (default: 0 = model training context size)\n");
1447
- printf(" --yarn-ext-factor N YaRN: extrapolation mix factor (default: 1.0, 0.0 = full interpolation)\n");
1448
- printf(" --yarn-attn-factor N YaRN: scale sqrt(t) or attention magnitude (default: 1.0)\n");
1449
- printf(" --yarn-beta-slow N YaRN: high correction dim or alpha (default: %.1f)\n", params.yarn_beta_slow);
1450
- printf(" --yarn-beta-fast N YaRN: low correction dim or beta (default: %.1f)\n", params.yarn_beta_fast);
1451
- printf(" --pooling {none,mean,cls}\n");
1452
- printf(" pooling type for embeddings, use model default if unspecified\n");
1453
- printf(" -dt N, --defrag-thold N\n");
1454
- printf(" KV cache defragmentation threshold (default: %.1f, < 0 - disabled)\n", params.defrag_thold);
1455
- printf(" --ignore-eos ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n");
1456
- printf(" --penalize-nl penalize newline tokens\n");
1457
- printf(" --temp N temperature (default: %.1f)\n", (double)sparams.temp);
1458
- printf(" --all-logits return logits for all tokens in the batch (default: disabled)\n");
1459
- printf(" --hellaswag compute HellaSwag score over random tasks from datafile supplied with -f\n");
1460
- printf(" --hellaswag-tasks N number of tasks to use when computing the HellaSwag score (default: %zu)\n", params.hellaswag_tasks);
1461
- printf(" --winogrande compute Winogrande score over random tasks from datafile supplied with -f\n");
1462
- printf(" --winogrande-tasks N number of tasks to use when computing the Winogrande score (default: %zu)\n", params.winogrande_tasks);
1463
- printf(" --multiple-choice compute multiple choice score over random tasks from datafile supplied with -f\n");
1464
- printf(" --multiple-choice-tasks N number of tasks to use when computing the multiple choice score (default: %zu)\n", params.winogrande_tasks);
1465
- printf(" --kl-divergence computes KL-divergence to logits provided via --kl-divergence-base\n");
1466
- printf(" --keep N number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep);
1467
- printf(" --draft N number of tokens to draft for speculative decoding (default: %d)\n", params.n_draft);
1468
- printf(" --chunks N max number of chunks to process (default: %d, -1 = all)\n", params.n_chunks);
1469
- printf(" -np N, --parallel N number of parallel sequences to decode (default: %d)\n", params.n_parallel);
1470
- printf(" -ns N, --sequences N number of sequences to decode (default: %d)\n", params.n_sequences);
1471
- printf(" -ps N, --p-split N speculative decoding split probability (default: %.1f)\n", (double)params.p_split);
1472
- printf(" -cb, --cont-batching enable continuous batching (a.k.a dynamic batching) (default: disabled)\n");
1473
- printf(" -fa, --flash-attn enable Flash Attention (default: %s)\n", params.flash_attn ? "enabled" : "disabled");
1474
- printf(" --mmproj MMPROJ_FILE path to a multimodal projector file for LLaVA. see examples/llava/README.md\n");
1475
- printf(" --image IMAGE_FILE path to an image file. use with multimodal models. Specify multiple times for batching\n");
1376
+ struct option_info {
1377
+ LLAMA_COMMON_ATTRIBUTE_FORMAT(4, 5)
1378
+ option_info(const std::string & tags, const char * args, const char * desc, ...) : tags(tags), args(args), desc(desc) {
1379
+ va_list args_list;
1380
+ va_start(args_list, desc);
1381
+ char buffer[1024];
1382
+ vsnprintf(buffer, sizeof(buffer), desc, args_list);
1383
+ va_end(args_list);
1384
+ this->desc = buffer;
1385
+ }
1386
+
1387
+ option_info(const std::string & grp) : grp(grp) {}
1388
+
1389
+ std::string tags;
1390
+ std::string args;
1391
+ std::string desc;
1392
+ std::string grp;
1393
+ };
1394
+
1395
+ std::vector<option_info> options;
1396
+
1397
+ // TODO: filter by tags
1398
+
1399
+ options.push_back({ "general" });
1400
+ options.push_back({ "*", "-h, --help, --usage", "print usage and exit" });
1401
+ options.push_back({ "*", " --version", "show version and build info" });
1402
+ options.push_back({ "*", "-v, --verbose", "print verbose information" });
1403
+ options.push_back({ "*", " --verbosity N", "set specific verbosity level (default: %d)", params.verbosity });
1404
+ options.push_back({ "*", " --verbose-prompt", "print a verbose prompt before generation (default: %s)", params.verbose_prompt ? "true" : "false" });
1405
+ options.push_back({ "*", " --no-display-prompt", "don't print prompt at generation (default: %s)", !params.display_prompt ? "true" : "false" });
1406
+ options.push_back({ "*", "-co, --color", "colorise output to distinguish prompt and user input from generations (default: %s)", params.use_color ? "true" : "false" });
1407
+ options.push_back({ "*", "-s, --seed SEED", "RNG seed (default: %d, use random seed for < 0)", params.seed });
1408
+ options.push_back({ "*", "-t, --threads N", "number of threads to use during generation (default: %d)", params.n_threads });
1409
+ options.push_back({ "*", "-tb, --threads-batch N", "number of threads to use during batch and prompt processing (default: same as --threads)" });
1410
+ options.push_back({ "speculative", "-td, --threads-draft N", "number of threads to use during generation (default: same as --threads)" });
1411
+ options.push_back({ "speculative", "-tbd, --threads-batch-draft N",
1412
+ "number of threads to use during batch and prompt processing (default: same as --threads-draft)" });
1413
+ options.push_back({ "speculative", " --draft N", "number of tokens to draft for speculative decoding (default: %d)", params.n_draft });
1414
+ options.push_back({ "speculative", "-ps, --p-split N", "speculative decoding split probability (default: %.1f)", (double)params.p_split });
1415
+ options.push_back({ "*", "-lcs, --lookup-cache-static FNAME",
1416
+ "path to static lookup cache to use for lookup decoding (not updated by generation)" });
1417
+ options.push_back({ "*", "-lcd, --lookup-cache-dynamic FNAME",
1418
+ "path to dynamic lookup cache to use for lookup decoding (updated by generation)" });
1419
+
1420
+ options.push_back({ "*", "-c, --ctx-size N", "size of the prompt context (default: %d, 0 = loaded from model)", params.n_ctx });
1421
+ options.push_back({ "*", "-n, --predict N", "number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)", params.n_predict });
1422
+ options.push_back({ "*", "-b, --batch-size N", "logical maximum batch size (default: %d)", params.n_batch });
1423
+ options.push_back({ "*", "-ub, --ubatch-size N", "physical maximum batch size (default: %d)", params.n_ubatch });
1424
+ options.push_back({ "*", " --keep N", "number of tokens to keep from the initial prompt (default: %d, -1 = all)", params.n_keep });
1425
+ options.push_back({ "*", " --chunks N", "max number of chunks to process (default: %d, -1 = all)", params.n_chunks });
1426
+ options.push_back({ "*", "-fa, --flash-attn", "enable Flash Attention (default: %s)", params.flash_attn ? "enabled" : "disabled" });
1427
+ options.push_back({ "*", "-p, --prompt PROMPT", "prompt to start generation with\n"
1428
+ "in conversation mode, this will be used as system prompt\n"
1429
+ "(default: '%s')", params.prompt.c_str() });
1430
+ options.push_back({ "*", "-f, --file FNAME", "a file containing the prompt (default: none)" });
1431
+ options.push_back({ "*", " --in-file FNAME", "an input file (repeat to specify multiple files)" });
1432
+ options.push_back({ "*", "-bf, --binary-file FNAME", "binary file containing the prompt (default: none)" });
1433
+ options.push_back({ "*", "-e, --escape", "process escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\) (default: %s)", params.escape ? "true" : "false" });
1434
+ options.push_back({ "*", " --no-escape", "do not process escape sequences" });
1435
+ options.push_back({ "main", "-ptc, --print-token-count N", "print token count every N tokens (default: %d)", params.n_print });
1436
+ options.push_back({ "main", " --prompt-cache FNAME", "file to cache prompt state for faster startup (default: none)" });
1437
+ options.push_back({ "main", " --prompt-cache-all", "if specified, saves user input and generations to cache as well\n"
1438
+ "not supported with --interactive or other interactive options" });
1439
+ options.push_back({ "main", " --prompt-cache-ro", "if specified, uses the prompt cache but does not update it" });
1440
+ options.push_back({ "main", "-r, --reverse-prompt PROMPT",
1441
+ "halt generation at PROMPT, return control in interactive mode\n"
1442
+ "can be specified more than once for multiple prompts" });
1443
+ options.push_back({ "main", "-sp, --special", "special tokens output enabled (default: %s)", params.special ? "true" : "false" });
1444
+ options.push_back({ "main", "-cnv, --conversation", "run in conversation mode, does not print special tokens and suffix/prefix\n"
1445
+ "if suffix/prefix are not specified, default chat template will be used\n"
1446
+ "(default: %s)", params.conversation ? "true" : "false" });
1447
+ options.push_back({ "main infill", "-i, --interactive", "run in interactive mode (default: %s)", params.interactive ? "true" : "false" });
1448
+ options.push_back({ "main infill", "-if, --interactive-first", "run in interactive mode and wait for input right away (default: %s)", params.interactive_first ? "true" : "false" });
1449
+ options.push_back({ "main infill", "-mli, --multiline-input", "allows you to write or paste multiple lines without ending each in '\\'" });
1450
+ options.push_back({ "main infill", " --in-prefix-bos", "prefix BOS to user inputs, preceding the `--in-prefix` string" });
1451
+ options.push_back({ "main infill", " --in-prefix STRING", "string to prefix user inputs with (default: empty)" });
1452
+ options.push_back({ "main infill", " --in-suffix STRING", "string to suffix after user inputs with (default: empty)" });
1453
+ options.push_back({ "main", " --no-warmup", "skip warming up the model with an empty run" });
1454
+ options.push_back({ "server infill",
1455
+ " --spm-infill", "use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: %s)", params.spm_infill ? "enabled" : "disabled" });
1456
+
1457
+ options.push_back({ "sampling" });
1458
+ options.push_back({ "*", " --samplers SAMPLERS", "samplers that will be used for generation in the order, separated by \';\'\n"
1459
+ "(default: %s)", sampler_type_names.c_str() });
1460
+ options.push_back({ "*", " --sampling-seq SEQUENCE",
1461
+ "simplified sequence for samplers that will be used (default: %s)", sampler_type_chars.c_str() });
1462
+ options.push_back({ "*", " --ignore-eos", "ignore end of stream token and continue generating (implies --logit-bias EOS-inf)" });
1463
+ options.push_back({ "*", " --penalize-nl", "penalize newline tokens (default: %s)", sparams.penalize_nl ? "true" : "false" });
1464
+ options.push_back({ "*", " --temp N", "temperature (default: %.1f)", (double)sparams.temp });
1465
+ options.push_back({ "*", " --top-k N", "top-k sampling (default: %d, 0 = disabled)", sparams.top_k });
1466
+ options.push_back({ "*", " --top-p N", "top-p sampling (default: %.1f, 1.0 = disabled)", (double)sparams.top_p });
1467
+ options.push_back({ "*", " --min-p N", "min-p sampling (default: %.1f, 0.0 = disabled)", (double)sparams.min_p });
1468
+ options.push_back({ "*", " --tfs N", "tail free sampling, parameter z (default: %.1f, 1.0 = disabled)", (double)sparams.tfs_z });
1469
+ options.push_back({ "*", " --typical N", "locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)", (double)sparams.typical_p });
1470
+ options.push_back({ "*", " --repeat-last-n N", "last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)", sparams.penalty_last_n });
1471
+ options.push_back({ "*", " --repeat-penalty N", "penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)", (double)sparams.penalty_repeat });
1472
+ options.push_back({ "*", " --presence-penalty N", "repeat alpha presence penalty (default: %.1f, 0.0 = disabled)", (double)sparams.penalty_present });
1473
+ options.push_back({ "*", " --frequency-penalty N", "repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)", (double)sparams.penalty_freq });
1474
+ options.push_back({ "*", " --dynatemp-range N", "dynamic temperature range (default: %.1f, 0.0 = disabled)", (double)sparams.dynatemp_range });
1475
+ options.push_back({ "*", " --dynatemp-exp N", "dynamic temperature exponent (default: %.1f)", (double)sparams.dynatemp_exponent });
1476
+ options.push_back({ "*", " --mirostat N", "use Mirostat sampling.\n"
1477
+ "Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n"
1478
+ "(default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)", sparams.mirostat });
1479
+ options.push_back({ "*", " --mirostat-lr N", "Mirostat learning rate, parameter eta (default: %.1f)", (double)sparams.mirostat_eta });
1480
+ options.push_back({ "*", " --mirostat-ent N", "Mirostat target entropy, parameter tau (default: %.1f)", (double)sparams.mirostat_tau });
1481
+ options.push_back({ "*", " -l TOKEN_ID(+/-)BIAS", "modifies the likelihood of token appearing in the completion,\n"
1482
+ "i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',\n"
1483
+ "or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'" });
1484
+ options.push_back({ "main", " --cfg-negative-prompt PROMPT",
1485
+ "negative prompt to use for guidance (default: '%s')", sparams.cfg_negative_prompt.c_str() });
1486
+ options.push_back({ "main", " --cfg-negative-prompt-file FNAME",
1487
+ "negative prompt file to use for guidance" });
1488
+ options.push_back({ "main", " --cfg-scale N", "strength of guidance (default: %.1f, 1.0 = disable)", (double)sparams.cfg_scale });
1489
+ options.push_back({ "main", " --chat-template JINJA_TEMPLATE",
1490
+ "set custom jinja chat template (default: template taken from model's metadata)\n"
1491
+ "if suffix/prefix are specified, template will be disabled\n"
1492
+ "only commonly used templates are accepted:\n"
1493
+ "https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template" });
1494
+ options.push_back({ "grammar" });
1495
+ options.push_back({ "*", " --grammar GRAMMAR", "BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '%s')", sparams.grammar.c_str() });
1496
+ options.push_back({ "*", " --grammar-file FNAME", "file to read grammar from" });
1497
+ options.push_back({ "*", "-j, --json-schema SCHEMA",
1498
+ "JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object\n"
1499
+ "For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead" });
1500
+
1501
+ options.push_back({ "embedding" });
1502
+ options.push_back({ "embedding", " --pooling {none,mean,cls,last}",
1503
+ "pooling type for embeddings, use model default if unspecified" });
1504
+ options.push_back({ "embedding", " --attention {causal,non-causal}",
1505
+ "attention type for embeddings, use model default if unspecified" });
1506
+
1507
+ options.push_back({ "context hacking" });
1508
+ options.push_back({ "*", " --rope-scaling {none,linear,yarn}",
1509
+ "RoPE frequency scaling method, defaults to linear unless specified by the model" });
1510
+ options.push_back({ "*", " --rope-scale N", "RoPE context scaling factor, expands context by a factor of N" });
1511
+ options.push_back({ "*", " --rope-freq-base N", "RoPE base frequency, used by NTK-aware scaling (default: loaded from model)" });
1512
+ options.push_back({ "*", " --rope-freq-scale N", "RoPE frequency scaling factor, expands context by a factor of 1/N" });
1513
+ options.push_back({ "*", " --yarn-orig-ctx N", "YaRN: original context size of model (default: %d = model training context size)", params.yarn_orig_ctx });
1514
+ options.push_back({ "*", " --yarn-ext-factor N", "YaRN: extrapolation mix factor (default: %.1f, 0.0 = full interpolation)", (double)params.yarn_ext_factor });
1515
+ options.push_back({ "*", " --yarn-attn-factor N", "YaRN: scale sqrt(t) or attention magnitude (default: %.1f)", (double)params.yarn_attn_factor });
1516
+ options.push_back({ "*", " --yarn-beta-slow N", "YaRN: high correction dim or alpha (default: %.1f)", (double)params.yarn_beta_slow });
1517
+ options.push_back({ "*", " --yarn-beta-fast N", "YaRN: low correction dim or beta (default: %.1f)", (double)params.yarn_beta_fast });
1518
+ options.push_back({ "*", "-gan, --grp-attn-n N", "group-attention factor (default: %d)", params.grp_attn_n });
1519
+ options.push_back({ "*", "-gaw, --grp-attn-w N", "group-attention width (default: %.1f)", (double)params.grp_attn_w });
1520
+ options.push_back({ "*", "-dkvc, --dump-kv-cache", "verbose print of the KV cache" });
1521
+ options.push_back({ "*", "-nkvo, --no-kv-offload", "disable KV offload" });
1522
+ options.push_back({ "*", "-ctk, --cache-type-k TYPE", "KV cache data type for K (default: %s)", params.cache_type_k.c_str() });
1523
+ options.push_back({ "*", "-ctv, --cache-type-v TYPE", "KV cache data type for V (default: %s)", params.cache_type_v.c_str() });
1524
+
1525
+ options.push_back({ "perplexity" });
1526
+ options.push_back({ "perplexity", " --all-logits", "return logits for all tokens in the batch (default: %s)", params.logits_all ? "true" : "false" });
1527
+ options.push_back({ "perplexity", " --hellaswag", "compute HellaSwag score over random tasks from datafile supplied with -f" });
1528
+ options.push_back({ "perplexity", " --hellaswag-tasks N", "number of tasks to use when computing the HellaSwag score (default: %zu)", params.hellaswag_tasks });
1529
+ options.push_back({ "perplexity", " --winogrande", "compute Winogrande score over random tasks from datafile supplied with -f" });
1530
+ options.push_back({ "perplexity", " --winogrande-tasks N", "number of tasks to use when computing the Winogrande score (default: %zu)", params.winogrande_tasks });
1531
+ options.push_back({ "perplexity", " --multiple-choice", "compute multiple choice score over random tasks from datafile supplied with -f" });
1532
+ options.push_back({ "perplexity", " --multiple-choice-tasks N",
1533
+ "number of tasks to use when computing the multiple choice score (default: %zu)", params.multiple_choice_tasks });
1534
+ options.push_back({ "perplexity", " --kl-divergence", "computes KL-divergence to logits provided via --kl-divergence-base" });
1535
+ options.push_back({ "perplexity", " --ppl-stride N", "stride for perplexity calculation (default: %d)", params.ppl_stride });
1536
+ options.push_back({ "perplexity", " --ppl-output-type {0,1}",
1537
+ "output type for perplexity calculation (default: %d)", params.ppl_output_type });
1538
+
1539
+ options.push_back({ "parallel" });
1540
+ options.push_back({ "*", "-dt, --defrag-thold N", "KV cache defragmentation threshold (default: %.1f, < 0 - disabled)", (double)params.defrag_thold });
1541
+ options.push_back({ "*", "-np, --parallel N", "number of parallel sequences to decode (default: %d)", params.n_parallel });
1542
+ options.push_back({ "*", "-ns, --sequences N", "number of sequences to decode (default: %d)", params.n_sequences });
1543
+ options.push_back({ "*", "-cb, --cont-batching", "enable continuous batching (a.k.a dynamic batching) (default: %s)", params.cont_batching ? "enabled" : "disabled" });
1544
+ options.push_back({ "*", "-nocb, --no-cont-batching", "disable continuous batching" });
1545
+
1546
+ options.push_back({ "multi-modality" });
1547
+ options.push_back({ "*", " --mmproj FILE", "path to a multimodal projector file for LLaVA. see examples/llava/README.md" });
1548
+ options.push_back({ "*", " --image FILE", "path to an image file. use with multimodal models. Specify multiple times for batching" });
1549
+
1550
+ options.push_back({ "backend" });
1551
+ options.push_back({ "*", " --rpc SERVERS", "comma separated list of RPC servers" });
1552
+
1476
1553
  if (llama_supports_mlock()) {
1477
- printf(" --mlock force system to keep model in RAM rather than swapping or compressing\n");
1554
+ options.push_back({ "*", " --mlock", "force system to keep model in RAM rather than swapping or compressing" });
1478
1555
  }
1479
1556
  if (llama_supports_mmap()) {
1480
- printf(" --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
1481
- }
1482
- printf(" --numa TYPE attempt optimizations that help on some NUMA systems\n");
1483
- printf(" - distribute: spread execution evenly over all nodes\n");
1484
- printf(" - isolate: only spawn threads on CPUs on the node that execution started on\n");
1485
- printf(" - numactl: use the CPU map provided by numactl\n");
1486
- printf(" if run without this previously, it is recommended to drop the system page cache before using this\n");
1487
- printf(" see https://github.com/ggerganov/llama.cpp/issues/1437\n");
1557
+ options.push_back({ "*", " --no-mmap", "do not memory-map model (slower load but may reduce pageouts if not using mlock)" });
1558
+ }
1559
+ options.push_back({ "*", " --numa TYPE", "attempt optimizations that help on some NUMA systems\n"
1560
+ " - distribute: spread execution evenly over all nodes\n"
1561
+ " - isolate: only spawn threads on CPUs on the node that execution started on\n"
1562
+ " - numactl: use the CPU map provided by numactl\n"
1563
+ "if run without this previously, it is recommended to drop the system page cache before using this\n"
1564
+ "see https://github.com/ggerganov/llama.cpp/issues/1437" });
1565
+
1488
1566
  if (llama_supports_gpu_offload()) {
1489
- printf(" -ngl N, --n-gpu-layers N\n");
1490
- printf(" number of layers to store in VRAM\n");
1491
- printf(" -ngld N, --n-gpu-layers-draft N\n");
1492
- printf(" number of layers to store in VRAM for the draft model\n");
1493
- printf(" -sm SPLIT_MODE, --split-mode SPLIT_MODE\n");
1494
- printf(" how to split the model across multiple GPUs, one of:\n");
1495
- printf(" - none: use one GPU only\n");
1496
- printf(" - layer (default): split layers and KV across GPUs\n");
1497
- printf(" - row: split rows across GPUs\n");
1498
- printf(" -ts SPLIT, --tensor-split SPLIT\n");
1499
- printf(" fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1\n");
1500
- printf(" -mg i, --main-gpu i the GPU to use for the model (with split-mode = none),\n");
1501
- printf(" or for intermediate results and KV (with split-mode = row) (default: %d)\n", params.main_gpu);
1502
- }
1503
- printf(" --rpc SERVERS comma separated list of RPC servers\n");
1504
- printf(" --verbose-prompt print a verbose prompt before generation (default: %s)\n", params.verbose_prompt ? "true" : "false");
1505
- printf(" --no-display-prompt don't print prompt at generation (default: %s)\n", !params.display_prompt ? "true" : "false");
1506
- printf(" -gan N, --grp-attn-n N\n");
1507
- printf(" group-attention factor (default: %d)\n", params.grp_attn_n);
1508
- printf(" -gaw N, --grp-attn-w N\n");
1509
- printf(" group-attention width (default: %.1f)\n", (double)params.grp_attn_w);
1510
- printf(" -dkvc, --dump-kv-cache\n");
1511
- printf(" verbose print of the KV cache\n");
1512
- printf(" -nkvo, --no-kv-offload\n");
1513
- printf(" disable KV offload\n");
1514
- printf(" -ctk TYPE, --cache-type-k TYPE\n");
1515
- printf(" KV cache data type for K (default: %s)\n", params.cache_type_k.c_str());
1516
- printf(" -ctv TYPE, --cache-type-v TYPE\n");
1517
- printf(" KV cache data type for V (default: %s)\n", params.cache_type_v.c_str());
1518
- printf(" --simple-io use basic IO for better compatibility in subprocesses and limited consoles\n");
1519
- printf(" --lora FNAME apply LoRA adapter (implies --no-mmap)\n");
1520
- printf(" --lora-scaled FNAME S apply LoRA adapter with user defined scaling S (implies --no-mmap)\n");
1521
- printf(" --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n");
1522
- printf(" --control-vector FNAME\n");
1523
- printf(" add a control vector\n");
1524
- printf(" --control-vector-scaled FNAME S\n");
1525
- printf(" add a control vector with user defined scaling S\n");
1526
- printf(" --control-vector-layer-range START END\n");
1527
- printf(" layer range to apply the control vector(s) to, start and end inclusive\n");
1528
- printf(" -m FNAME, --model FNAME\n");
1529
- printf(" model path (default: models/$filename with filename from --hf-file or --model-url if set, otherwise %s)\n", DEFAULT_MODEL_PATH);
1530
- printf(" -md FNAME, --model-draft FNAME\n");
1531
- printf(" draft model for speculative decoding (default: unused)\n");
1532
- printf(" -mu MODEL_URL, --model-url MODEL_URL\n");
1533
- printf(" model download url (default: unused)\n");
1534
- printf(" -hfr REPO, --hf-repo REPO\n");
1535
- printf(" Hugging Face model repository (default: unused)\n");
1536
- printf(" -hff FILE, --hf-file FILE\n");
1537
- printf(" Hugging Face model file (default: unused)\n");
1538
- printf(" -ld LOGDIR, --logdir LOGDIR\n");
1539
- printf(" path under which to save YAML logs (no logging if unset)\n");
1540
- printf(" -lcs FNAME, --lookup-cache-static FNAME\n");
1541
- printf(" path to static lookup cache to use for lookup decoding (not updated by generation)\n");
1542
- printf(" -lcd FNAME, --lookup-cache-dynamic FNAME\n");
1543
- printf(" path to dynamic lookup cache to use for lookup decoding (updated by generation)\n");
1544
- printf(" --override-kv KEY=TYPE:VALUE\n");
1545
- printf(" advanced option to override model metadata by key. may be specified multiple times.\n");
1546
- printf(" types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n");
1547
- printf(" -ptc N, --print-token-count N\n");
1548
- printf(" print token count every N tokens (default: %d)\n", params.n_print);
1549
- printf(" --check-tensors check model tensor data for invalid values\n");
1550
- printf("\n");
1567
+ options.push_back({ "*", "-ngl, --gpu-layers N",
1568
+ "number of layers to store in VRAM" });
1569
+ options.push_back({ "*", "-ngld, --gpu-layers-draft N",
1570
+ "number of layers to store in VRAM for the draft model" });
1571
+ options.push_back({ "*", "-sm, --split-mode SPLIT_MODE",
1572
+ "how to split the model across multiple GPUs, one of:\n"
1573
+ " - none: use one GPU only\n"
1574
+ " - layer (default): split layers and KV across GPUs\n"
1575
+ " - row: split rows across GPUs" });
1576
+ options.push_back({ "*", "-ts, --tensor-split SPLIT",
1577
+ "fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1" });
1578
+ options.push_back({ "*", "-mg, --main-gpu i", "the GPU to use for the model (with split-mode = none),\n"
1579
+ "or for intermediate results and KV (with split-mode = row) (default: %d)", params.main_gpu });
1580
+ }
1581
+
1582
+ options.push_back({ "model" });
1583
+ options.push_back({ "*", " --check-tensors", "check model tensor data for invalid values (default: %s)", params.check_tensors ? "true" : "false" });
1584
+ options.push_back({ "*", " --override-kv KEY=TYPE:VALUE",
1585
+ "advanced option to override model metadata by key. may be specified multiple times.\n"
1586
+ "types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false" });
1587
+ options.push_back({ "*", " --lora FNAME", "apply LoRA adapter (can be repeated to use multiple adapters)" });
1588
+ options.push_back({ "*", " --lora-scaled FNAME S", "apply LoRA adapter with user defined scaling S (can be repeated to use multiple adapters)" });
1589
+ options.push_back({ "*", " --control-vector FNAME", "add a control vector\n"
1590
+ "note: this argument can be repeated to add multiple control vectors" });
1591
+ options.push_back({ "*", " --control-vector-scaled FNAME SCALE",
1592
+ "add a control vector with user defined scaling SCALE\n"
1593
+ "note: this argument can be repeated to add multiple scaled control vectors" });
1594
+ options.push_back({ "*", " --control-vector-layer-range START END",
1595
+ "layer range to apply the control vector(s) to, start and end inclusive" });
1596
+ options.push_back({ "*", "-m, --model FNAME", "model path (default: models/$filename with filename from --hf-file\n"
1597
+ "or --model-url if set, otherwise %s)", DEFAULT_MODEL_PATH });
1598
+ options.push_back({ "*", "-md, --model-draft FNAME", "draft model for speculative decoding (default: unused)" });
1599
+ options.push_back({ "*", "-mu, --model-url MODEL_URL", "model download url (default: unused)" });
1600
+ options.push_back({ "*", "-hfr, --hf-repo REPO", "Hugging Face model repository (default: unused)" });
1601
+ options.push_back({ "*", "-hff, --hf-file FILE", "Hugging Face model file (default: unused)" });
1602
+ options.push_back({ "*", "-hft, --hf-token TOKEN", "Hugging Face access token (default: value from HF_TOKEN environment variable)" });
1603
+
1604
+ options.push_back({ "retrieval" });
1605
+ options.push_back({ "retrieval", " --context-file FNAME", "file to load context from (repeat to specify multiple files)" });
1606
+ options.push_back({ "retrieval", " --chunk-size N", "minimum length of embedded text chunks (default: %d)", params.chunk_size });
1607
+ options.push_back({ "retrieval", " --chunk-separator STRING",
1608
+ "separator between chunks (default: '%s')", params.chunk_separator.c_str() });
1609
+
1610
+ options.push_back({ "passkey" });
1611
+ options.push_back({ "passkey", " --junk N", "number of times to repeat the junk text (default: %d)", params.n_junk });
1612
+ options.push_back({ "passkey", " --pos N", "position of the passkey in the junk text (default: %d)", params.i_pos });
1613
+
1614
+ options.push_back({ "imatrix" });
1615
+ options.push_back({ "imatrix", "-o, --output FNAME", "output file (default: '%s')", params.out_file.c_str() });
1616
+ options.push_back({ "imatrix", " --output-frequency N", "output the imatrix every N iterations (default: %d)", params.n_out_freq });
1617
+ options.push_back({ "imatrix", " --save-frequency N", "save an imatrix copy every N iterations (default: %d)", params.n_save_freq });
1618
+ options.push_back({ "imatrix", " --process-output", "collect data for the output tensor (default: %s)", params.process_output ? "true" : "false" });
1619
+ options.push_back({ "imatrix", " --no-ppl", "do not compute perplexity (default: %s)", params.compute_ppl ? "true" : "false" });
1620
+ options.push_back({ "imatrix", " --chunk N", "start processing the input from chunk N (default: %d)", params.i_chunk });
1621
+
1622
+ options.push_back({ "bench" });
1623
+ options.push_back({ "bench", "-pps", "is the prompt shared across parallel sequences (default: %s)", params.is_pp_shared ? "true" : "false" });
1624
+ options.push_back({ "bench", "-npp n0,n1,...", "number of prompt tokens" });
1625
+ options.push_back({ "bench", "-ntg n0,n1,...", "number of text generation tokens" });
1626
+ options.push_back({ "bench", "-npl n0,n1,...", "number of parallel prompts" });
1627
+
1628
+ options.push_back({ "embedding" });
1629
+ options.push_back({ "embedding", " --embd-normalize", "normalisation for embendings (default: %d) (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)", params.embd_normalize });
1630
+ options.push_back({ "embedding", " --embd-output-format", "empty = default, \"array\" = [[],[]...], \"json\" = openai style, \"json+\" = same \"json\" + cosine similarity matrix" });
1631
+ options.push_back({ "embedding", " --embd-separator", "separator of embendings (default \\n) for example \"<#sep#>\"" });
1632
+
1633
+ options.push_back({ "server" });
1634
+ options.push_back({ "server", " --host HOST", "ip address to listen (default: %s)", params.hostname.c_str() });
1635
+ options.push_back({ "server", " --port PORT", "port to listen (default: %d)", params.port });
1636
+ options.push_back({ "server", " --path PATH", "path to serve static files from (default: %s)", params.public_path.c_str() });
1637
+ options.push_back({ "server", " --embedding(s)", "enable embedding endpoint (default: %s)", params.embedding ? "enabled" : "disabled" });
1638
+ options.push_back({ "server", " --api-key KEY", "API key to use for authentication (default: none)" });
1639
+ options.push_back({ "server", " --api-key-file FNAME", "path to file containing API keys (default: none)" });
1640
+ options.push_back({ "server", " --ssl-key-file FNAME", "path to file a PEM-encoded SSL private key" });
1641
+ options.push_back({ "server", " --ssl-cert-file FNAME", "path to file a PEM-encoded SSL certificate" });
1642
+ options.push_back({ "server", " --timeout N", "server read/write timeout in seconds (default: %d)", params.timeout_read });
1643
+ options.push_back({ "server", " --threads-http N", "number of threads used to process HTTP requests (default: %d)", params.n_threads_http });
1644
+ options.push_back({ "server", " --system-prompt-file FNAME",
1645
+ "set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications" });
1646
+ options.push_back({ "server", " --log-format {text,json}",
1647
+ "log output format: json or text (default: json)" });
1648
+ options.push_back({ "server", " --metrics", "enable prometheus compatible metrics endpoint (default: %s)", params.endpoint_metrics ? "enabled" : "disabled" });
1649
+ options.push_back({ "server", " --no-slots", "disables slots monitoring endpoint (default: %s)", params.endpoint_slots ? "enabled" : "disabled" });
1650
+ options.push_back({ "server", " --slot-save-path PATH", "path to save slot kv cache (default: disabled)" });
1651
+ options.push_back({ "server", " --chat-template JINJA_TEMPLATE",
1652
+ "set custom jinja chat template (default: template taken from model's metadata)\n"
1653
+ "only commonly used templates are accepted:\n"
1654
+ "https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template" });
1655
+ options.push_back({ "server", "-sps, --slot-prompt-similarity SIMILARITY",
1656
+ "how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity });
1657
+
1551
1658
  #ifndef LOG_DISABLE_LOGS
1552
- log_print_usage();
1659
+ options.push_back({ "logging" });
1660
+ options.push_back({ "*", " --simple-io", "use basic IO for better compatibility in subprocesses and limited consoles" });
1661
+ options.push_back({ "*", "-ld, --logdir LOGDIR", "path under which to save YAML logs (no logging if unset)" });
1662
+ options.push_back({ "logging", " --log-test", "Run simple logging test" });
1663
+ options.push_back({ "logging", " --log-disable", "Disable trace logs" });
1664
+ options.push_back({ "logging", " --log-enable", "Enable trace logs" });
1665
+ options.push_back({ "logging", " --log-file FNAME", "Specify a log filename (without extension)" });
1666
+ options.push_back({ "logging", " --log-new", "Create a separate new log file on start. "
1667
+ "Each log file will have unique name: \"<name>.<ID>.log\"" });
1668
+ options.push_back({ "logging", " --log-append", "Don't truncate the old log file." });
1553
1669
  #endif // LOG_DISABLE_LOGS
1670
+
1671
+ options.push_back({ "cvector" });
1672
+ options.push_back({ "cvector", "-o, --output FNAME", "output file (default: '%s')", params.cvector_outfile.c_str() });
1673
+ options.push_back({ "cvector", " --positive-file FNAME", "positive prompts file, one prompt per line (default: '%s')", params.cvector_positive_file.c_str() });
1674
+ options.push_back({ "cvector", " --negative-file FNAME", "negative prompts file, one prompt per line (default: '%s')", params.cvector_negative_file.c_str() });
1675
+ options.push_back({ "cvector", " --pca-batch N", "batch size used for PCA. Larger batch runs faster, but uses more memory (default: %d)", params.n_pca_batch });
1676
+ options.push_back({ "cvector", " --pca-iter N", "number of iterations used for PCA (default: %d)", params.n_pca_iterations });
1677
+ options.push_back({ "cvector", " --method {pca,mean}", "dimensionality reduction method to be used (default: pca)" });
1678
+
1679
+ options.push_back({ "export-lora" });
1680
+ options.push_back({ "export-lora", "-m, --model", "model path from which to load base model (default '%s')", params.model.c_str() });
1681
+ options.push_back({ "export-lora", " --lora FNAME", "path to LoRA adapter (can be repeated to use multiple adapters)" });
1682
+ options.push_back({ "export-lora", " --lora-scaled FNAME S", "path to LoRA adapter with user defined scaling S (can be repeated to use multiple adapters)" });
1683
+ options.push_back({ "*", "-t, --threads N", "number of threads to use during computation (default: %d)", params.n_threads });
1684
+ options.push_back({ "export-lora", "-o, --output FNAME", "output file (default: '%s')", params.lora_outfile.c_str() });
1685
+
1686
+ printf("usage: %s [options]\n", argv[0]);
1687
+
1688
+ for (const auto & o : options) {
1689
+ if (!o.grp.empty()) {
1690
+ printf("\n%s:\n\n", o.grp.c_str());
1691
+ continue;
1692
+ }
1693
+ printf(" %-32s", o.args.c_str());
1694
+ if (o.args.length() > 30) {
1695
+ printf("\n%34s", "");
1696
+ }
1697
+
1698
+ const auto desc = o.desc;
1699
+ size_t start = 0;
1700
+ size_t end = desc.find('\n');
1701
+ while (end != std::string::npos) {
1702
+ printf("%s\n%34s", desc.substr(start, end - start).c_str(), "");
1703
+ start = end + 1;
1704
+ end = desc.find('\n', start);
1705
+ }
1706
+
1707
+ printf("%s\n", desc.substr(start).c_str());
1708
+ }
1709
+ printf("\n");
1554
1710
  }
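The printing loop above left-aligns each argument string in a 32-character column and indents wrapped description lines by 34 spaces. A minimal standalone sketch of the same column scheme (illustrative only, not part of the diff):

    // sketch: reproduce the 32/34-column wrapping used by the usage printer above
    #include <cstdio>
    #include <string>

    static void print_wrapped_option(const std::string & args, const std::string & desc) {
        std::printf("  %-32s", args.c_str());
        if (args.length() > 30) {
            std::printf("\n%34s", "");      // long flag strings push the description onto the next line
        }
        size_t start = 0;
        size_t end   = desc.find('\n');
        while (end != std::string::npos) {  // embedded newlines get a 34-space continuation indent
            std::printf("%s\n%34s", desc.substr(start, end - start).c_str(), "");
            start = end + 1;
            end   = desc.find('\n', start);
        }
        std::printf("%s\n", desc.substr(start).c_str());
    }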
1555
1711
 
1556
1712
  std::string gpt_params_get_system_info(const gpt_params & params) {
@@ -1610,24 +1766,6 @@ std::string string_get_sortable_timestamp() {
1610
1766
  return std::string(timestamp_no_ns) + "." + std::string(timestamp_ns);
1611
1767
  }
1612
1768
 
1613
- std::string string_random_prompt(std::mt19937 & rng) {
1614
- const int r = rng() % 10;
1615
- switch (r) {
1616
- case 0: return "So";
1617
- case 1: return "Once upon a time";
1618
- case 2: return "When";
1619
- case 3: return "The";
1620
- case 4: return "After";
1621
- case 5: return "If";
1622
- case 6: return "import";
1623
- case 7: return "He";
1624
- case 8: return "She";
1625
- case 9: return "They";
1626
- }
1627
-
1628
- GGML_UNREACHABLE();
1629
- }
1630
-
1631
1769
  void string_process_escapes(std::string & input) {
1632
1770
  std::size_t input_len = input.length();
1633
1771
  std::size_t output_idx = 0;
@@ -1887,6 +2025,16 @@ std::string fs_get_cache_directory() {
1887
2025
  return ensure_trailing_slash(cache_directory);
1888
2026
  }
1889
2027
 
2028
+ std::string fs_get_cache_file(const std::string & filename) {
2029
+ GGML_ASSERT(filename.find(DIRECTORY_SEPARATOR) == std::string::npos);
2030
+ std::string cache_directory = fs_get_cache_directory();
2031
+ const bool success = fs_create_directory_with_parents(cache_directory);
2032
+ if (!success) {
2033
+ throw std::runtime_error("failed to create cache directory: " + cache_directory);
2034
+ }
2035
+ return cache_directory + filename;
2036
+ }
2037
+
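A short usage sketch for the new helper (the filename below is hypothetical). The GGML_ASSERT rejects names containing a directory separator, and the function throws if the cache directory cannot be created:

    // sketch: resolve a bare filename inside the llama.cpp cache directory
    try {
        std::string path = fs_get_cache_file("downloaded-model.gguf");   // hypothetical filename
        // path == fs_get_cache_directory() + "downloaded-model.gguf", directory created on demand
    } catch (const std::runtime_error & e) {
        fprintf(stderr, "cache unavailable: %s\n", e.what());
    }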
1890
2038
 
1891
2039
  //
1892
2040
  // Model utils
@@ -1898,9 +2046,9 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
1898
2046
  llama_model * model = nullptr;
1899
2047
 
1900
2048
  if (!params.hf_repo.empty() && !params.hf_file.empty()) {
1901
- model = llama_load_model_from_hf(params.hf_repo.c_str(), params.hf_file.c_str(), params.model.c_str(), mparams);
2049
+ model = llama_load_model_from_hf(params.hf_repo.c_str(), params.hf_file.c_str(), params.model.c_str(), params.hf_token.c_str(), mparams);
1902
2050
  } else if (!params.model_url.empty()) {
1903
- model = llama_load_model_from_url(params.model_url.c_str(), params.model.c_str(), mparams);
2051
+ model = llama_load_model_from_url(params.model_url.c_str(), params.model.c_str(), params.hf_token.c_str(), mparams);
1904
2052
  } else {
1905
2053
  model = llama_load_model_from_file(params.model.c_str(), mparams);
1906
2054
  }
@@ -1946,19 +2094,14 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
1946
2094
  for (unsigned int i = 0; i < params.lora_adapter.size(); ++i) {
1947
2095
  const std::string & lora_adapter = std::get<0>(params.lora_adapter[i]);
1948
2096
  float lora_scale = std::get<1>(params.lora_adapter[i]);
1949
- int err = llama_model_apply_lora_from_file(model,
1950
- lora_adapter.c_str(),
1951
- lora_scale,
1952
- ((i > 0) || params.lora_base.empty())
1953
- ? NULL
1954
- : params.lora_base.c_str(),
1955
- params.n_threads);
1956
- if (err != 0) {
2097
+ auto adapter = llama_lora_adapter_init(model, lora_adapter.c_str());
2098
+ if (adapter == nullptr) {
1957
2099
  fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
1958
2100
  llama_free(lctx);
1959
2101
  llama_free_model(model);
1960
2102
  return std::make_tuple(nullptr, nullptr);
1961
2103
  }
2104
+ llama_lora_adapter_set(lctx, adapter, lora_scale);
1962
2105
  }
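The removed llama_model_apply_lora_from_file() path (which also needed a lora base model and a thread count) is replaced by the two-step adapter API used above: initialize the adapter once per model, then attach it to a context with a scale. A condensed sketch of that flow outside of gpt_params (adapter path and scale are hypothetical; assumes 'model' and 'lctx' are already created):

    // sketch: attach a LoRA adapter with the new two-step API
    auto * adapter = llama_lora_adapter_init(model, "adapters/style.gguf");  // hypothetical path
    if (adapter == nullptr) {
        fprintf(stderr, "failed to load LoRA adapter\n");
    } else {
        llama_lora_adapter_set(lctx, adapter, 0.8f);  // scaling is applied per context, not baked into the weights
    }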
1963
2106
 
1964
2107
  if (params.ignore_eos) {
@@ -1968,7 +2111,24 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
1968
2111
  if (params.warmup) {
1969
2112
  LOG("warming up the model with an empty run\n");
1970
2113
 
1971
- std::vector<llama_token> tmp = { llama_token_bos(model), llama_token_eos(model), };
2114
+ std::vector<llama_token> tmp;
2115
+ llama_token bos = llama_token_bos(model);
2116
+ llama_token eos = llama_token_eos(model);
2117
+ // some models (e.g. T5) don't have a BOS token
2118
+ if (bos != -1) {
2119
+ tmp.push_back(bos);
2120
+ }
2121
+ tmp.push_back(eos);
2122
+
2123
+ if (llama_model_has_encoder(model)) {
2124
+ llama_encode(lctx, llama_batch_get_one(tmp.data(), tmp.size(), 0, 0));
2125
+ llama_token decoder_start_token_id = llama_model_decoder_start_token(model);
2126
+ if (decoder_start_token_id == -1) {
2127
+ decoder_start_token_id = bos;
2128
+ }
2129
+ tmp.clear();
2130
+ tmp.push_back(decoder_start_token_id);
2131
+ }
1972
2132
  llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, 0));
1973
2133
  llama_kv_cache_clear(lctx);
1974
2134
  llama_synchronize(lctx);
@@ -2051,6 +2211,7 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
2051
2211
  cparams.yarn_beta_slow = params.yarn_beta_slow;
2052
2212
  cparams.yarn_orig_ctx = params.yarn_orig_ctx;
2053
2213
  cparams.pooling_type = params.pooling_type;
2214
+ cparams.attention_type = params.attention_type;
2054
2215
  cparams.defrag_thold = params.defrag_thold;
2055
2216
  cparams.cb_eval = params.cb_eval;
2056
2217
  cparams.cb_eval_user_data = params.cb_eval_user_data;
@@ -2070,7 +2231,7 @@ static bool starts_with(const std::string & str, const std::string & prefix) {
2070
2231
  return str.rfind(prefix, 0) == 0;
2071
2232
  }
2072
2233
 
2073
- static bool llama_download_file(const std::string & url, const std::string & path) {
2234
+ static bool llama_download_file(const std::string & url, const std::string & path, const std::string & hf_token) {
2074
2235
 
2075
2236
  // Initialize libcurl
2076
2237
  std::unique_ptr<CURL, decltype(&curl_easy_cleanup)> curl(curl_easy_init(), &curl_easy_cleanup);
@@ -2085,6 +2246,15 @@ static bool llama_download_file(const std::string & url, const std::string & pat
2085
2246
  curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str());
2086
2247
  curl_easy_setopt(curl.get(), CURLOPT_FOLLOWLOCATION, 1L);
2087
2248
 
2249
+ // Check if hf-token or bearer-token was specified
2250
+ if (!hf_token.empty()) {
2251
+ std::string auth_header = "Authorization: Bearer ";
2252
+ auth_header += hf_token.c_str();
2253
+ struct curl_slist *http_headers = NULL;
2254
+ http_headers = curl_slist_append(http_headers, auth_header.c_str());
2255
+ curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers);
2256
+ }
2257
+
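When a token reaches llama_download_file() via params.hf_token, it is sent as a standard bearer token. A self-contained sketch of the same header setup with libcurl, including list cleanup (the token value is a placeholder, not from the diff):

    // sketch: attach "Authorization: Bearer <token>" to a libcurl easy handle
    CURL * h = curl_easy_init();
    struct curl_slist * headers = nullptr;
    std::string auth_header = std::string("Authorization: Bearer ") + "hf_placeholder_token";  // placeholder
    headers = curl_slist_append(headers, auth_header.c_str());
    curl_easy_setopt(h, CURLOPT_HTTPHEADER, headers);
    // ... perform the transfer ...
    curl_slist_free_all(headers);   // the easy handle does not own the header list
    curl_easy_cleanup(h);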
2088
2258
  #if defined(_WIN32)
2089
2259
  // CURLSSLOPT_NATIVE_CA tells libcurl to use standard certificate store of
2090
2260
  // operating system. Currently implemented under MS-Windows.
@@ -2201,7 +2371,14 @@ static bool llama_download_file(const std::string & url, const std::string & pat
2201
2371
  }
2202
2372
 
2203
2373
  // Set the output file
2204
- std::unique_ptr<FILE, decltype(&fclose)> outfile(fopen(path_temporary.c_str(), "wb"), fclose);
2374
+
2375
+ struct FILE_deleter {
2376
+ void operator()(FILE * f) const {
2377
+ fclose(f);
2378
+ }
2379
+ };
2380
+
2381
+ std::unique_ptr<FILE, FILE_deleter> outfile(fopen(path_temporary.c_str(), "wb"));
2205
2382
  if (!outfile) {
2206
2383
  fprintf(stderr, "%s: error opening local file for writing: %s\n", __func__, path.c_str());
2207
2384
  return false;
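The unique_ptr deleter changes from decltype(&fclose) to a small stateless functor. One practical difference, shown as a sketch rather than a claim about the upstream motivation (assumes a FILE_deleter like the one above is visible in scope; file names are hypothetical): an empty deleter type can be folded into the smart pointer via the empty-base optimization, while a function-pointer deleter has to be stored alongside the FILE*.

    // sketch: the two deleter styles side by side
    std::unique_ptr<FILE, FILE_deleter>      a(fopen("a.tmp", "wb"));           // deleter lives in the type
    std::unique_ptr<FILE, decltype(&fclose)> b(fopen("b.tmp", "wb"), fclose);   // deleter stored as a value
    // on common implementations sizeof(a) == sizeof(FILE *), while sizeof(b) is two pointers wide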
@@ -2273,6 +2450,7 @@ static bool llama_download_file(const std::string & url, const std::string & pat
2273
2450
  struct llama_model * llama_load_model_from_url(
2274
2451
  const char * model_url,
2275
2452
  const char * path_model,
2453
+ const char * hf_token,
2276
2454
  const struct llama_model_params & params) {
2277
2455
  // Basic validation of the model_url
2278
2456
  if (!model_url || strlen(model_url) == 0) {
@@ -2280,7 +2458,7 @@ struct llama_model * llama_load_model_from_url(
2280
2458
  return NULL;
2281
2459
  }
2282
2460
 
2283
- if (!llama_download_file(model_url, path_model)) {
2461
+ if (!llama_download_file(model_url, path_model, hf_token)) {
2284
2462
  return NULL;
2285
2463
  }
2286
2464
 
@@ -2328,14 +2506,14 @@ struct llama_model * llama_load_model_from_url(
2328
2506
  // Prepare download in parallel
2329
2507
  std::vector<std::future<bool>> futures_download;
2330
2508
  for (int idx = 1; idx < n_split; idx++) {
2331
- futures_download.push_back(std::async(std::launch::async, [&split_prefix, &split_url_prefix, &n_split](int download_idx) -> bool {
2509
+ futures_download.push_back(std::async(std::launch::async, [&split_prefix, &split_url_prefix, &n_split, hf_token](int download_idx) -> bool {
2332
2510
  char split_path[PATH_MAX] = {0};
2333
2511
  llama_split_path(split_path, sizeof(split_path), split_prefix, download_idx, n_split);
2334
2512
 
2335
2513
  char split_url[LLAMA_CURL_MAX_URL_LENGTH] = {0};
2336
2514
  llama_split_path(split_url, sizeof(split_url), split_url_prefix, download_idx, n_split);
2337
2515
 
2338
- return llama_download_file(split_url, split_path);
2516
+ return llama_download_file(split_url, split_path, hf_token);
2339
2517
  }, idx));
2340
2518
  }
2341
2519
 
@@ -2354,6 +2532,7 @@ struct llama_model * llama_load_model_from_hf(
2354
2532
  const char * repo,
2355
2533
  const char * model,
2356
2534
  const char * path_model,
2535
+ const char * hf_token,
2357
2536
  const struct llama_model_params & params) {
2358
2537
  // construct hugging face model url:
2359
2538
  //
@@ -2369,7 +2548,7 @@ struct llama_model * llama_load_model_from_hf(
2369
2548
  model_url += "/resolve/main/";
2370
2549
  model_url += model;
2371
2550
 
2372
- return llama_load_model_from_url(model_url.c_str(), path_model, params);
2551
+ return llama_load_model_from_url(model_url.c_str(), path_model, hf_token, params);
2373
2552
  }
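With the extra parameter, callers that bypass gpt_params can forward a token themselves (only when llama.cpp is built with libcurl; the stubs below return nullptr otherwise). A hedged usage sketch, where the repo, file, local path, and environment variable are hypothetical and an empty token keeps the request anonymous:

    // sketch: download a gated model from Hugging Face using a token taken from the environment
    const char * tok = std::getenv("HF_TOKEN");                 // hypothetical environment variable
    llama_model_params mparams = llama_model_default_params();
    llama_model * model = llama_load_model_from_hf(
        "some-org/some-model-GGUF",                             // hypothetical repo
        "model-Q4_K_M.gguf",                                    // hypothetical file inside the repo
        "/tmp/model-Q4_K_M.gguf",                               // local cache path
        tok ? tok : "",                                         // empty string => no Authorization header
        mparams);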
2374
2553
 
2375
2554
  #else
@@ -2377,6 +2556,7 @@ struct llama_model * llama_load_model_from_hf(
2377
2556
  struct llama_model * llama_load_model_from_url(
2378
2557
  const char * /*model_url*/,
2379
2558
  const char * /*path_model*/,
2559
+ const char * /*hf_token*/,
2380
2560
  const struct llama_model_params & /*params*/) {
2381
2561
  fprintf(stderr, "%s: llama.cpp built without libcurl, downloading from an url not supported.\n", __func__);
2382
2562
  return nullptr;
@@ -2386,6 +2566,7 @@ struct llama_model * llama_load_model_from_hf(
2386
2566
  const char * /*repo*/,
2387
2567
  const char * /*model*/,
2388
2568
  const char * /*path_model*/,
2569
+ const char * /*hf_token*/,
2389
2570
  const struct llama_model_params & /*params*/) {
2390
2571
  fprintf(stderr, "%s: llama.cpp built without libcurl, downloading from Hugging Face not supported.\n", __func__);
2391
2572
  return nullptr;
@@ -2450,57 +2631,126 @@ std::vector<llama_token> llama_tokenize(
2450
2631
  }
2451
2632
 
2452
2633
  std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) {
2453
- std::vector<char> result(8, 0);
2454
- const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), special);
2455
- if (n_tokens < 0) {
2456
- result.resize(-n_tokens);
2457
- int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), special);
2458
- GGML_ASSERT(check == -n_tokens);
2459
- } else {
2460
- result.resize(n_tokens);
2634
+ std::string piece;
2635
+ piece.resize(piece.capacity()); // using string internal cache, 15 bytes + '\0'
2636
+ const int n_chars = llama_token_to_piece(llama_get_model(ctx), token, &piece[0], piece.size(), 0, special);
2637
+ if (n_chars < 0) {
2638
+ piece.resize(-n_chars);
2639
+ int check = llama_token_to_piece(llama_get_model(ctx), token, &piece[0], piece.size(), 0, special);
2640
+ GGML_ASSERT(check == -n_chars);
2641
+ }
2642
+ else {
2643
+ piece.resize(n_chars);
2461
2644
  }
2462
2645
 
2463
- return std::string(result.data(), result.size());
2646
+ return piece;
2464
2647
  }
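The rewritten helper now reuses the string's small-buffer capacity for the first attempt and passes an explicit lstrip argument of 0 to the C API. Typical call-site usage is unchanged; a two-line sketch (new_token_id is assumed to come from sampling):

    // sketch: render a sampled token; 'special' controls whether control tokens are printed as text
    const std::string piece = llama_token_to_piece(ctx, new_token_id, /* special = */ false);
    fputs(piece.c_str(), stdout);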
2465
2648
 
2466
- std::string llama_detokenize_spm(llama_context * ctx, const std::vector<llama_token> & tokens) {
2467
- const llama_token bos_id = llama_token_bos(llama_get_model(ctx));
2468
-
2469
- std::string piece;
2470
- std::string result;
2649
+ std::string llama_detokenize(llama_context * ctx, const std::vector<llama_token> & tokens, bool special) {
2650
+ std::string text;
2651
+ text.resize(std::max(text.capacity(), tokens.size()));
2652
+ int32_t n_chars = llama_detokenize(llama_get_model(ctx), tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
2653
+ if (n_chars < 0) {
2654
+ text.resize(-n_chars);
2655
+ n_chars = llama_detokenize(llama_get_model(ctx), tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
2656
+ GGML_ASSERT(n_chars <= (int32_t)text.size()); // whitespace trimming is performed after per-token detokenization
2657
+ }
2471
2658
 
2472
- for (size_t i = 0; i < tokens.size(); ++i) {
2473
- piece = llama_token_to_piece(ctx, tokens[i]);
2659
+ text.resize(n_chars);
2474
2660
 
2475
- // remove the leading space of the first non-BOS token
2476
- if (((tokens[0] == bos_id && i == 1) || (tokens[0] != bos_id && i == 0)) && piece[0] == ' ') {
2477
- piece = piece.substr(1);
2478
- }
2661
+ // NOTE: the original tokenizer decodes bytes after collecting the pieces.
2662
+ return text;
2663
+ }
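This single helper replaces the removed llama_detokenize_spm()/llama_detokenize_bpe() pair below; the vocab-specific whitespace handling now lives behind the C API. A small round-trip sketch, assuming an initialized context:

    // sketch: tokenize and detokenize with the unified helpers
    std::vector<llama_token> toks = llama_tokenize(ctx, "Hello world", /* add_special = */ true);
    std::string text = llama_detokenize(ctx, toks, /* special = */ false);  // special tokens are skipped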
2479
2664
 
2480
- result += piece;
2481
- }
2665
+ bool llama_should_add_bos_token(const llama_model * model) {
2666
+ const int add_bos = llama_add_bos_token(model);
2482
2667
 
2483
- return result;
2668
+ return add_bos != -1 ? bool(add_bos) : (llama_vocab_type(model) == LLAMA_VOCAB_TYPE_SPM);
2484
2669
  }
2485
2670
 
2486
- std::string llama_detokenize_bpe(llama_context * ctx, const std::vector<llama_token> & tokens) {
2487
- std::string piece;
2488
- std::string result;
2671
+ //
2672
+ // Chat template utils
2673
+ //
2674
+
2675
+ bool llama_chat_verify_template(const std::string & tmpl) {
2676
+ llama_chat_message chat[] = {{"user", "test"}};
2677
+ int res = llama_chat_apply_template(nullptr, tmpl.c_str(), chat, 1, true, nullptr, 0);
2678
+ return res >= 0;
2679
+ }
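llama_chat_verify_template() simply checks whether llama_chat_apply_template() accepts the string, which lets front-ends reject a bad --chat-template value before loading anything. A short sketch (the template value is hypothetical user input):

    // sketch: validate a user-supplied chat template up front
    const std::string user_tmpl = "chatml";
    if (!llama_chat_verify_template(user_tmpl)) {
        fprintf(stderr, "error: unsupported chat template: %s\n", user_tmpl.c_str());
    }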
2489
2680
 
2490
- for (size_t i = 0; i < tokens.size(); ++i) {
2491
- piece = llama_token_to_piece(ctx, tokens[i]);
2681
+ std::string llama_chat_apply_template(const struct llama_model * model,
2682
+ const std::string & tmpl,
2683
+ const std::vector<llama_chat_msg> & msgs,
2684
+ bool add_ass) {
2685
+ int alloc_size = 0;
2686
+ bool fallback = false; // indicate if we must fall back to default chatml
2687
+ std::vector<llama_chat_message> chat;
2688
+ for (auto & msg : msgs) {
2689
+ chat.push_back({msg.role.c_str(), msg.content.c_str()});
2690
+ alloc_size += (msg.role.size() + msg.content.size()) * 1.25;
2691
+ }
2692
+
2693
+ const char * ptr_tmpl = tmpl.empty() ? nullptr : tmpl.c_str();
2694
+ std::vector<char> buf(alloc_size);
2695
+
2696
+ // run the first time to get the total output length
2697
+ int32_t res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), add_ass, buf.data(), buf.size());
2698
+
2699
+ // error: chat template is not supported
2700
+ if (res < 0) {
2701
+ if (ptr_tmpl != nullptr) {
2702
+ // if the custom "tmpl" is not supported, we throw an error
2703
+ // this is a bit redundant (for good), since we're not sure if user validated the custom template with llama_chat_verify_template()
2704
+ throw std::runtime_error("this custom template is not supported");
2705
+ } else {
2706
+ // If the built-in template is not supported, we default to chatml
2707
+ res = llama_chat_apply_template(nullptr, "chatml", chat.data(), chat.size(), add_ass, buf.data(), buf.size());
2708
+ fallback = true;
2709
+ }
2710
+ }
2492
2711
 
2493
- result += piece;
2712
+ // if it turns out that our buffer is too small, we resize it
2713
+ if ((size_t) res > buf.size()) {
2714
+ buf.resize(res);
2715
+ res = llama_chat_apply_template(
2716
+ fallback ? nullptr : model,
2717
+ fallback ? "chatml" : ptr_tmpl,
2718
+ chat.data(), chat.size(), add_ass, buf.data(), buf.size());
2494
2719
  }
2495
2720
 
2496
- // NOTE: the original tokenizer decodes bytes after collecting the pieces.
2497
- return result;
2721
+ std::string formatted_chat(buf.data(), res);
2722
+ return formatted_chat;
2498
2723
  }
2499
2724
 
2500
- bool llama_should_add_bos_token(const llama_model * model) {
2501
- const int add_bos = llama_add_bos_token(model);
2725
+ std::string llama_chat_format_single(const struct llama_model * model,
2726
+ const std::string & tmpl,
2727
+ const std::vector<llama_chat_msg> & past_msg,
2728
+ const llama_chat_msg & new_msg,
2729
+ bool add_ass) {
2730
+ std::ostringstream ss;
2731
+ auto fmt_past_msg = past_msg.empty() ? "" : llama_chat_apply_template(model, tmpl, past_msg, false);
2732
+ std::vector<llama_chat_msg> chat_new(past_msg);
2733
+ // if the past_msg ends with a newline, we must preserve it in the formatted version
2734
+ if (add_ass && !fmt_past_msg.empty() && fmt_past_msg.back() == '\n') {
2735
+ ss << "\n";
2736
+ }
2737
+ // format chat with new_msg
2738
+ chat_new.push_back(new_msg);
2739
+ auto fmt_new_msg = llama_chat_apply_template(model, tmpl, chat_new, add_ass);
2740
+ // get the diff part
2741
+ ss << fmt_new_msg.substr(fmt_past_msg.size(), fmt_new_msg.size() - fmt_past_msg.size());
2742
+ return ss.str();
2743
+ }
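llama_chat_format_single() formats the history with and without the new message and returns only the suffix, so an interactive loop can feed just the delta to the model instead of re-encoding the whole conversation. A brief sketch (messages are hypothetical; an empty template selects the model's built-in one):

    // sketch: incremental chat formatting for an interactive loop
    std::vector<llama_chat_msg> history = { {"user", "Hi"}, {"assistant", "Hello!"} };
    llama_chat_msg next = {"user", "What can you do?"};
    // returns only the newly appended, formatted portion (including the assistant prefix when add_ass is true)
    std::string delta = llama_chat_format_single(model, /* tmpl = */ "", history, next, /* add_ass = */ true);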
2502
2744
 
2503
- return add_bos != -1 ? bool(add_bos) : (llama_vocab_type(model) == LLAMA_VOCAB_TYPE_SPM);
2745
+ std::string llama_chat_format_example(const struct llama_model * model,
2746
+ const std::string & tmpl) {
2747
+ std::vector<llama_chat_msg> msgs = {
2748
+ {"system", "You are a helpful assistant"},
2749
+ {"user", "Hello"},
2750
+ {"assistant", "Hi there"},
2751
+ {"user", "How are you?"},
2752
+ };
2753
+ return llama_chat_apply_template(model, tmpl, msgs, true);
2504
2754
  }
2505
2755
 
2506
2756
  //
@@ -2582,14 +2832,34 @@ void llama_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_siz
2582
2832
  // Embedding utils
2583
2833
  //
2584
2834
 
2585
- void llama_embd_normalize(const float * inp, float * out, int n) {
2835
+ void llama_embd_normalize(const float * inp, float * out, int n, int embd_norm) {
2586
2836
  double sum = 0.0;
2587
- for (int i = 0; i < n; i++) {
2588
- sum += inp[i] * inp[i];
2837
+
2838
+ switch (embd_norm) {
2839
+ case -1: // no normalisation
2840
+ sum = 1.0;
2841
+ break;
2842
+ case 0: // max absolute
2843
+ for (int i = 0; i < n; i++) {
2844
+ if (sum < std::abs(inp[i])) sum = std::abs(inp[i]);
2845
+ }
2846
+ sum /= 32760.0; // make an int16 range
2847
+ break;
2848
+ case 2: // euclidean
2849
+ for (int i = 0; i < n; i++) {
2850
+ sum += inp[i] * inp[i];
2851
+ }
2852
+ sum = std::sqrt(sum);
2853
+ break;
2854
+ default: // p-norm (euclidean is p-norm p=2)
2855
+ for (int i = 0; i < n; i++) {
2856
+ sum += std::pow(std::abs(inp[i]), embd_norm);
2857
+ }
2858
+ sum = std::pow(sum, 1.0 / embd_norm);
2859
+ break;
2589
2860
  }
2590
- sum = sqrt(sum);
2591
2861
 
2592
- const float norm = sum > 0.0 ? 1.0f / sum : 0.0f;
2862
+ const float norm = sum > 0.0 ? 1.0 / sum : 0.0f;
2593
2863
 
2594
2864
  for (int i = 0; i < n; i++) {
2595
2865
  out[i] = inp[i] * norm;
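In the generalized form the output is inp[i] scaled by 1 / (sum_j |inp[j]|^p)^(1/p): embd_norm = 2 keeps the old Euclidean behaviour, 1 is the taxicab norm, 0 scales by max|inp[j]| / 32760 to approximate an int16 range, and -1 leaves the values untouched. A usage sketch against pooled sequence embeddings (assumes the context was created with embeddings enabled and pooling produced per-sequence output):

    // sketch: normalize the pooled embedding of sequence 0 with the default Euclidean norm
    const int     n_embd = llama_n_embd(model);
    const float * raw    = llama_get_embeddings_seq(ctx, 0);
    std::vector<float> out(n_embd);
    llama_embd_normalize(raw, out.data(), n_embd, /* embd_norm = */ 2);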
@@ -2607,6 +2877,14 @@ float llama_embd_similarity_cos(const float * embd1, const float * embd2, int n)
2607
2877
  sum2 += embd2[i] * embd2[i];
2608
2878
  }
2609
2879
 
2880
+ // Handle the case where one or both vectors are zero vectors
2881
+ if (sum1 == 0.0 || sum2 == 0.0) {
2882
+ if (sum1 == 0.0 && sum2 == 0.0) {
2883
+ return 1.0f; // two zero vectors are similar
2884
+ }
2885
+ return 0.0f;
2886
+ }
2887
+
2610
2888
  return sum / (sqrt(sum1) * sqrt(sum2));
2611
2889
  }
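A one-line follow-up to the normalization sketch above (out_a and out_b are hypothetical std::vector<float> embeddings of equal length):

    // sketch: cosine similarity between two same-length embeddings
    const float sim = llama_embd_similarity_cos(out_a.data(), out_b.data(), (int) out_a.size());
    // returns 1.0f when both vectors are all zeros, 0.0f when exactly one is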
2612
2890
 
@@ -2615,125 +2893,87 @@ float llama_embd_similarity_cos(const float * embd1, const float * embd2, int n)
2615
2893
  //
2616
2894
 
2617
2895
  static llama_control_vector_data llama_control_vector_load_one(const llama_control_vector_load_info & load_info) {
2618
- int32_t n_tensors;
2619
-
2620
- size_t n_bytes = 0;
2621
-
2622
- uint32_t max_direction_layer = 0;
2623
-
2624
2896
  llama_control_vector_data result = { -1, {} };
2625
2897
 
2626
- // calculate size of ctx needed for tensors, ensure tensors are f32, and find max layer
2627
- {
2628
- struct ggml_init_params meta_params = {
2629
- /* .mem_size = */ ggml_tensor_overhead() * 128 + ggml_graph_overhead(),
2630
- /* .mem_buffer = */ nullptr,
2631
- /* .no_alloc = */ true,
2632
- };
2633
- ggml_context * meta_ctx = ggml_init(meta_params);
2634
- struct gguf_init_params meta_gguf_params = {
2635
- /* .no_alloc = */ true,
2636
- /* .ctx = */ &meta_ctx,
2637
- };
2638
- struct gguf_context * meta_ctx_gguf = gguf_init_from_file(load_info.fname.c_str(), meta_gguf_params);
2639
- if (!meta_ctx_gguf) {
2640
- fprintf(stderr, "%s: failed to load control vector from %s\n", __func__, load_info.fname.c_str());
2641
- ggml_free(meta_ctx);
2642
- return result;
2643
- }
2644
-
2645
- n_tensors = gguf_get_n_tensors(meta_ctx_gguf);
2646
- for (int i = 0; i < n_tensors; i++) {
2647
- std::string name = gguf_get_tensor_name(meta_ctx_gguf, i);
2648
-
2649
- // split on '.'
2650
- size_t dotpos = name.find('.');
2651
- if (dotpos != std::string::npos && name.substr(0, dotpos) == "direction") {
2652
- try {
2653
- uint32_t layer = std::stoi(name.substr(dotpos + 1));
2654
- if (layer == 0) {
2655
- fprintf(stderr, "%s: direction tensor invalid in %s\n", __func__, load_info.fname.c_str());
2656
- ggml_free(meta_ctx);
2657
- gguf_free(meta_ctx_gguf);
2658
- return result;
2659
- }
2660
- if (layer > max_direction_layer) {
2661
- max_direction_layer = layer;
2662
- }
2663
- } catch (...) {
2664
- fprintf(stderr, "%s: direction tensor invalid in %s\n", __func__, load_info.fname.c_str());
2665
- ggml_free(meta_ctx);
2666
- gguf_free(meta_ctx_gguf);
2667
- return result;
2668
- }
2669
- }
2670
-
2671
- struct ggml_tensor * tensor_meta = ggml_get_tensor(meta_ctx, name.c_str());
2672
- if (tensor_meta->type != GGML_TYPE_F32 || ggml_n_dims(tensor_meta) != 1) {
2673
- fprintf(stderr, "%s: direction tensor invalid in %s\n", __func__, load_info.fname.c_str());
2674
- ggml_free(meta_ctx);
2675
- gguf_free(meta_ctx_gguf);
2676
- return result;
2677
- }
2678
- if (result.n_embd == -1) {
2679
- result.n_embd = ggml_nelements(tensor_meta);
2680
- } else if (ggml_nelements(tensor_meta) != result.n_embd) {
2681
- fprintf(stderr, "%s: direction tensor sizes mismatched in %s\n", __func__, load_info.fname.c_str());
2682
- ggml_free(meta_ctx);
2683
- gguf_free(meta_ctx_gguf);
2684
- return result;
2685
- }
2686
- n_bytes += ggml_nbytes(tensor_meta);
2687
- }
2688
- ggml_free(meta_ctx);
2689
- gguf_free(meta_ctx_gguf);
2898
+ ggml_context * ctx = nullptr;
2899
+ struct gguf_init_params meta_gguf_params = {
2900
+ /* .no_alloc = */ false,
2901
+ /* .ctx = */ &ctx,
2902
+ };
2903
+ struct gguf_context * ctx_gguf = gguf_init_from_file(load_info.fname.c_str(), meta_gguf_params);
2904
+ if (!ctx_gguf) {
2905
+ fprintf(stderr, "%s: failed to load control vector file from %s\n", __func__, load_info.fname.c_str());
2906
+ return result;
2690
2907
  }
2691
2908
 
2909
+ int32_t n_tensors = gguf_get_n_tensors(ctx_gguf);
2692
2910
  if (n_tensors == 0) {
2693
2911
  fprintf(stderr, "%s: no direction tensors found in %s\n", __func__, load_info.fname.c_str());
2694
- return result;
2695
2912
  }
2696
2913
 
2697
- // load and scale tensors into final control vector context
2698
- struct ggml_init_params ggml_params = {
2699
- /* .mem_size = */ ggml_tensor_overhead() * n_tensors + n_bytes,
2700
- /* .mem_buffer = */ nullptr,
2701
- /* .no_alloc = */ false,
2702
- };
2703
- struct ggml_context * ctx = ggml_init(ggml_params);
2914
+ for (int i = 0; i < n_tensors; i++) {
2915
+ std::string name = gguf_get_tensor_name(ctx_gguf, i);
2704
2916
 
2705
- struct gguf_init_params params = {
2706
- /*.no_alloc = */ false,
2707
- /*.ctx = */ &ctx,
2708
- };
2709
- struct gguf_context * ctx_gguf = gguf_init_from_file(load_info.fname.c_str(), params);
2710
- if (!ctx_gguf) {
2711
- fprintf(stderr, "%s: failed to load control vector from %s\n", __func__, load_info.fname.c_str());
2712
- ggml_free(ctx);
2713
- return result;
2714
- }
2917
+ int layer_idx = -1;
2918
+
2919
+ // split on '.'
2920
+ size_t dotpos = name.find('.');
2921
+ if (dotpos != std::string::npos && name.substr(0, dotpos) == "direction") {
2922
+ try {
2923
+ layer_idx = std::stoi(name.substr(dotpos + 1));
2924
+ } catch (...) {
2925
+ layer_idx = -1;
2926
+ }
2927
+ }
2928
+ if (layer_idx < 0) {
2929
+ fprintf(stderr, "%s: invalid/unparsable direction tensor layer index in %s\n", __func__, load_info.fname.c_str());
2930
+ result.n_embd = -1;
2931
+ break;
2932
+ } else if (layer_idx == 0) {
2933
+ fprintf(stderr, "%s: invalid (zero) direction tensor layer index in %s\n", __func__, load_info.fname.c_str());
2934
+ result.n_embd = -1;
2935
+ break;
2936
+ }
2715
2937
 
2716
- // do not store data for layer 0 (it's not used)
2717
- result.data.resize(result.n_embd * max_direction_layer);
2938
+ struct ggml_tensor * tensor = ggml_get_tensor(ctx, name.c_str());
2939
+ if (tensor->type != GGML_TYPE_F32) {
2940
+ fprintf(stderr, "%s: invalid (non-F32) direction tensor type in %s\n", __func__, load_info.fname.c_str());
2941
+ result.n_embd = -1;
2942
+ break;
2943
+ }
2944
+ if (ggml_n_dims(tensor) != 1) {
2945
+ fprintf(stderr, "%s: invalid (non-1D) direction tensor shape in %s\n", __func__, load_info.fname.c_str());
2946
+ result.n_embd = -1;
2947
+ break;
2948
+ }
2718
2949
 
2719
- for (uint32_t il = 1; il <= max_direction_layer; il++) {
2720
- const std::string name = "direction." + std::to_string(il);
2721
- const ggml_tensor * tensor = ggml_get_tensor(ctx, name.c_str());
2950
+ if (result.n_embd == -1) {
2951
+ result.n_embd = ggml_nelements(tensor);
2952
+ } else if (ggml_nelements(tensor) != result.n_embd) {
2953
+ fprintf(stderr, "%s: direction tensor in %s does not match previous dimensions\n", __func__, load_info.fname.c_str());
2954
+ result.n_embd = -1;
2955
+ break;
2956
+ }
2722
2957
 
2723
- float * dst = result.data.data() + result.n_embd * (il - 1);
2958
+ // extend if necessary - do not store data for layer 0 (it's not used)
2959
+ result.data.resize(std::max(result.data.size(), static_cast<size_t>(result.n_embd * layer_idx)), 0.0f);
2724
2960
 
2725
- if (tensor) {
2726
- const float * src = (const float *) tensor->data;
2727
- for (int j = 0; j < result.n_embd; j++) {
2728
- dst[j] = src[j] * load_info.strength;
2729
- }
2730
- } else {
2731
- for (int j = 0; j < result.n_embd; j++) {
2732
- dst[j] = 0.0f;
2733
- }
2961
+ const float * src = (const float *) tensor->data;
2962
+ float * dst = result.data.data() + result.n_embd * (layer_idx - 1); // layer 1 at [0]
2963
+ for (int j = 0; j < result.n_embd; j++) {
2964
+ dst[j] += src[j] * load_info.strength; // allows multiple directions for same layer in same file
2734
2965
  }
2966
+
2735
2967
  }
2736
2968
 
2969
+ if (result.n_embd == -1) {
2970
+ fprintf(stderr, "%s: skipping %s due to invalid direction tensors\n", __func__, load_info.fname.c_str());
2971
+ result.data.clear();
2972
+ }
2973
+
2974
+ gguf_free(ctx_gguf);
2975
+ ggml_free(ctx);
2976
+
2737
2977
  return result;
2738
2978
  }
2739
2979
 
@@ -2744,16 +2984,19 @@ llama_control_vector_data llama_control_vector_load(const std::vector<llama_cont
2744
2984
  auto cur = llama_control_vector_load_one(info);
2745
2985
 
2746
2986
  if (cur.n_embd == -1) {
2747
- return result;
2987
+ result.n_embd = -1;
2988
+ break;
2748
2989
  }
2749
- if (result.n_embd != -1 && (result.n_embd != cur.n_embd || result.data.size() != cur.data.size())) {
2750
- fprintf(stderr, "%s: control vector in %s does not match previous vector dimensions\n", __func__, info.fname.c_str());
2751
- return result;
2990
+ if (result.n_embd != -1 && result.n_embd != cur.n_embd) {
2991
+ fprintf(stderr, "%s: control vectors in %s does not match previous dimensions\n", __func__, info.fname.c_str());
2992
+ result.n_embd = -1;
2993
+ break;
2752
2994
  }
2753
2995
 
2754
2996
  if (result.n_embd == -1) {
2755
2997
  result = std::move(cur);
2756
2998
  } else {
2999
+ result.data.resize(std::max(result.data.size(), cur.data.size()), 0.0f); // extend if necessary
2757
3000
  for (size_t i = 0; i < cur.data.size(); i++) {
2758
3001
  result.data[i] += cur.data[i];
2759
3002
  }
@@ -2761,7 +3004,8 @@ llama_control_vector_data llama_control_vector_load(const std::vector<llama_cont
2761
3004
  }
2762
3005
 
2763
3006
  if (result.n_embd == -1) {
2764
- fprintf(stderr, "%s: no vectors passed\n", __func__);
3007
+ fprintf(stderr, "%s: no valid control vector files passed\n", __func__);
3008
+ result.data.clear();
2765
3009
  }
2766
3010
 
2767
3011
  return result;
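The combined loader now keeps going across files, padding result.data as deeper layers appear and summing overlapping directions; n_embd == -1 together with an empty data vector signals that nothing valid was loaded. A sketch of calling it with two files at different strengths (file names and strengths are hypothetical; llama_control_vector_load_info is assumed to be the {strength, fname} struct declared in common.h):

    // sketch: sum two control vectors, the second subtracted at 60% strength
    std::vector<llama_control_vector_load_info> infos = {
        { /* strength = */  1.0f, /* fname = */ "direction-happy.gguf"   },
        { /* strength = */ -0.6f, /* fname = */ "direction-verbose.gguf" },
    };
    llama_control_vector_data cvec = llama_control_vector_load(infos);
    if (cvec.n_embd == -1) {
        fprintf(stderr, "no valid control vector data loaded\n");
    }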
@@ -2844,7 +3088,6 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l
2844
3088
  fprintf(stream, "cpu_has_avx512_vnni: %s\n", ggml_cpu_has_avx512_vnni() ? "true" : "false");
2845
3089
  fprintf(stream, "cpu_has_cuda: %s\n", ggml_cpu_has_cuda() ? "true" : "false");
2846
3090
  fprintf(stream, "cpu_has_vulkan: %s\n", ggml_cpu_has_vulkan() ? "true" : "false");
2847
- fprintf(stream, "cpu_has_clblast: %s\n", ggml_cpu_has_clblast() ? "true" : "false");
2848
3091
  fprintf(stream, "cpu_has_kompute: %s\n", ggml_cpu_has_kompute() ? "true" : "false");
2849
3092
  fprintf(stream, "cpu_has_fma: %s\n", ggml_cpu_has_fma() ? "true" : "false");
2850
3093
  fprintf(stream, "cpu_has_gpublas: %s\n", ggml_cpu_has_gpublas() ? "true" : "false");
@@ -2903,9 +3146,7 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l
2903
3146
  yaml_dump_string_multiline(stream, "in_prefix", params.input_prefix.c_str());
2904
3147
  fprintf(stream, "in_prefix_bos: %s # default: false\n", params.input_prefix_bos ? "true" : "false");
2905
3148
  yaml_dump_string_multiline(stream, "in_suffix", params.input_prefix.c_str());
2906
- fprintf(stream, "instruct: %s # default: false\n", params.instruct ? "true" : "false");
2907
3149
  fprintf(stream, "interactive: %s # default: false\n", params.interactive ? "true" : "false");
2908
- fprintf(stream, "interactive_specials: %s # default: false\n", params.interactive_specials ? "true" : "false");
2909
3150
  fprintf(stream, "interactive_first: %s # default: false\n", params.interactive_first ? "true" : "false");
2910
3151
  fprintf(stream, "keep: %d # default: 0\n", params.n_keep);
2911
3152
  fprintf(stream, "logdir: %s # default: unset (no logging)\n", params.logdir.c_str());
@@ -2932,7 +3173,6 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l
2932
3173
  }
2933
3174
  fprintf(stream, " - %s: %f\n", std::get<0>(la).c_str(), std::get<1>(la));
2934
3175
  }
2935
- fprintf(stream, "lora_base: %s\n", params.lora_base.c_str());
2936
3176
  fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu);
2937
3177
  fprintf(stream, "min_keep: %d # default: 0 (disabled)\n", sparams.min_keep);
2938
3178
  fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", sparams.mirostat);
@@ -2955,7 +3195,6 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l
2955
3195
  fprintf(stream, "prompt_cache_all: %s # default: false\n", params.prompt_cache_all ? "true" : "false");
2956
3196
  fprintf(stream, "prompt_cache_ro: %s # default: false\n", params.prompt_cache_ro ? "true" : "false");
2957
3197
  yaml_dump_vector_int(stream, "prompt_tokens", prompt_tokens);
2958
- fprintf(stream, "random_prompt: %s # default: false\n", params.random_prompt ? "true" : "false");
2959
3198
  fprintf(stream, "repeat_penalty: %f # default: 1.1\n", sparams.penalty_repeat);
2960
3199
 
2961
3200
  fprintf(stream, "reverse_prompt:\n");