@fugood/llama.node 0.2.2 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (320)
  1. package/CMakeLists.txt +5 -2
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  8. package/bin/win32/arm64/llama-node.node +0 -0
  9. package/bin/win32/arm64/node.lib +0 -0
  10. package/bin/win32/x64/llama-node.node +0 -0
  11. package/bin/win32/x64/node.lib +0 -0
  12. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  13. package/bin/win32-vulkan/arm64/node.lib +0 -0
  14. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/x64/node.lib +0 -0
  16. package/lib/binding.ts +8 -1
  17. package/package.json +1 -1
  18. package/patches/llama.patch +12 -12
  19. package/src/DetokenizeWorker.cpp +1 -1
  20. package/src/LlamaContext.cpp +33 -1
  21. package/src/LlamaContext.h +1 -0
  22. package/src/LoadSessionWorker.cpp +1 -0
  23. package/src/llama.cpp/.github/workflows/bench.yml +310 -0
  24. package/src/llama.cpp/.github/workflows/build.yml +1315 -0
  25. package/src/llama.cpp/.github/workflows/close-issue.yml +23 -0
  26. package/src/llama.cpp/.github/workflows/docker.yml +116 -0
  27. package/src/llama.cpp/.github/workflows/editorconfig.yml +27 -0
  28. package/src/llama.cpp/.github/workflows/gguf-publish.yml +44 -0
  29. package/src/llama.cpp/.github/workflows/labeler.yml +17 -0
  30. package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +65 -0
  31. package/src/llama.cpp/.github/workflows/nix-ci.yml +72 -0
  32. package/src/llama.cpp/.github/workflows/nix-flake-update.yml +22 -0
  33. package/src/llama.cpp/.github/workflows/nix-publish-flake.yml +36 -0
  34. package/src/llama.cpp/.github/workflows/python-check-requirements.yml +35 -0
  35. package/src/llama.cpp/.github/workflows/python-lint.yml +23 -0
  36. package/src/llama.cpp/.github/workflows/python-type-check.yml +38 -0
  37. package/src/llama.cpp/.github/workflows/server.yml +183 -0
  38. package/src/llama.cpp/CMakeLists.txt +91 -1245
  39. package/src/llama.cpp/cmake/arm64-windows-llvm.cmake +1 -1
  40. package/src/llama.cpp/cmake/build-info.cmake +58 -0
  41. package/src/llama.cpp/cmake/git-vars.cmake +22 -0
  42. package/src/llama.cpp/common/CMakeLists.txt +4 -3
  43. package/src/llama.cpp/common/build-info.cpp.in +4 -0
  44. package/src/llama.cpp/common/common.cpp +1116 -877
  45. package/src/llama.cpp/common/common.h +191 -77
  46. package/src/llama.cpp/common/grammar-parser.cpp +118 -31
  47. package/src/llama.cpp/common/json-schema-to-grammar.cpp +346 -65
  48. package/src/llama.cpp/common/log.h +1 -1
  49. package/src/llama.cpp/common/ngram-cache.h +10 -3
  50. package/src/llama.cpp/common/sampling.cpp +19 -10
  51. package/src/llama.cpp/docs/build.md +353 -0
  52. package/src/llama.cpp/examples/CMakeLists.txt +22 -22
  53. package/src/llama.cpp/examples/baby-llama/CMakeLists.txt +1 -1
  54. package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +6 -6
  55. package/src/llama.cpp/examples/batched/CMakeLists.txt +1 -1
  56. package/src/llama.cpp/examples/batched/batched.cpp +52 -55
  57. package/src/llama.cpp/examples/batched-bench/CMakeLists.txt +1 -1
  58. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +20 -72
  59. package/src/llama.cpp/examples/benchmark/CMakeLists.txt +1 -1
  60. package/src/llama.cpp/examples/chat-13B.bat +57 -0
  61. package/src/llama.cpp/examples/convert-llama2c-to-ggml/CMakeLists.txt +1 -1
  62. package/src/llama.cpp/examples/{finetune → cvector-generator}/CMakeLists.txt +2 -2
  63. package/src/llama.cpp/examples/cvector-generator/completions.txt +582 -0
  64. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +503 -0
  65. package/src/llama.cpp/examples/cvector-generator/mean.hpp +48 -0
  66. package/src/llama.cpp/examples/cvector-generator/negative.txt +4 -0
  67. package/src/llama.cpp/examples/cvector-generator/pca.hpp +325 -0
  68. package/src/llama.cpp/examples/cvector-generator/positive.txt +4 -0
  69. package/src/llama.cpp/examples/deprecation-warning/deprecation-warning.cpp +35 -0
  70. package/src/llama.cpp/examples/embedding/CMakeLists.txt +1 -1
  71. package/src/llama.cpp/examples/embedding/embedding.cpp +94 -46
  72. package/src/llama.cpp/examples/eval-callback/CMakeLists.txt +2 -2
  73. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +4 -6
  74. package/src/llama.cpp/examples/export-lora/CMakeLists.txt +1 -1
  75. package/src/llama.cpp/examples/export-lora/export-lora.cpp +344 -386
  76. package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +2 -2
  77. package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +30 -25
  78. package/src/llama.cpp/examples/gguf/CMakeLists.txt +1 -1
  79. package/src/llama.cpp/examples/gguf/gguf.cpp +5 -0
  80. package/src/llama.cpp/examples/gguf-hash/CMakeLists.txt +15 -0
  81. package/src/llama.cpp/examples/gguf-hash/deps/rotate-bits/rotate-bits.h +46 -0
  82. package/src/llama.cpp/examples/gguf-hash/deps/sha1/sha1.c +295 -0
  83. package/src/llama.cpp/examples/gguf-hash/deps/sha1/sha1.h +52 -0
  84. package/src/llama.cpp/examples/gguf-hash/deps/sha256/sha256.c +221 -0
  85. package/src/llama.cpp/examples/gguf-hash/deps/sha256/sha256.h +24 -0
  86. package/src/llama.cpp/examples/gguf-hash/deps/xxhash/xxhash.c +42 -0
  87. package/src/llama.cpp/examples/gguf-hash/deps/xxhash/xxhash.h +7093 -0
  88. package/src/llama.cpp/examples/gguf-hash/gguf-hash.cpp +693 -0
  89. package/src/llama.cpp/examples/gguf-split/CMakeLists.txt +1 -1
  90. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +3 -3
  91. package/src/llama.cpp/examples/gritlm/CMakeLists.txt +1 -1
  92. package/src/llama.cpp/examples/gritlm/gritlm.cpp +6 -2
  93. package/src/llama.cpp/examples/imatrix/CMakeLists.txt +1 -1
  94. package/src/llama.cpp/examples/imatrix/imatrix.cpp +137 -176
  95. package/src/llama.cpp/examples/infill/CMakeLists.txt +1 -1
  96. package/src/llama.cpp/examples/infill/infill.cpp +38 -153
  97. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +175 -94
  98. package/src/llama.cpp/examples/llama.android/app/build.gradle.kts +65 -0
  99. package/src/llama.cpp/examples/llama.android/build.gradle.kts +6 -0
  100. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +68 -0
  101. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/CMakeLists.txt +11 -7
  102. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +2 -2
  103. package/src/llama.cpp/examples/llama.android/settings.gradle.kts +18 -0
  104. package/src/llama.cpp/examples/llava/CMakeLists.txt +6 -5
  105. package/src/llama.cpp/examples/llava/android/build_64.sh +8 -0
  106. package/src/llama.cpp/examples/llava/clip.cpp +23 -14
  107. package/src/llama.cpp/examples/llava/llava-cli.cpp +8 -6
  108. package/src/llama.cpp/examples/llava/requirements.txt +3 -2
  109. package/src/llama.cpp/examples/lookahead/CMakeLists.txt +1 -1
  110. package/src/llama.cpp/examples/lookahead/lookahead.cpp +2 -1
  111. package/src/llama.cpp/examples/lookup/CMakeLists.txt +4 -4
  112. package/src/llama.cpp/examples/lookup/lookup-create.cpp +2 -0
  113. package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
  114. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +2 -2
  115. package/src/llama.cpp/examples/lookup/lookup.cpp +1 -1
  116. package/src/llama.cpp/examples/main/CMakeLists.txt +1 -1
  117. package/src/llama.cpp/examples/main/main.cpp +98 -75
  118. package/src/llama.cpp/examples/main-cmake-pkg/CMakeLists.txt +4 -5
  119. package/src/llama.cpp/examples/parallel/CMakeLists.txt +1 -1
  120. package/src/llama.cpp/examples/parallel/parallel.cpp +2 -1
  121. package/src/llama.cpp/examples/passkey/CMakeLists.txt +1 -1
  122. package/src/llama.cpp/examples/passkey/passkey.cpp +23 -43
  123. package/src/llama.cpp/examples/perplexity/CMakeLists.txt +1 -1
  124. package/src/llama.cpp/examples/perplexity/perplexity.cpp +13 -10
  125. package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
  126. package/src/llama.cpp/examples/quantize/quantize.cpp +37 -34
  127. package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +1 -1
  128. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +1 -1
  129. package/src/llama.cpp/examples/retrieval/CMakeLists.txt +1 -1
  130. package/src/llama.cpp/examples/retrieval/retrieval.cpp +26 -77
  131. package/src/llama.cpp/examples/save-load-state/CMakeLists.txt +1 -1
  132. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +14 -7
  133. package/src/llama.cpp/examples/server/CMakeLists.txt +26 -2
  134. package/src/llama.cpp/examples/server/server.cpp +274 -671
  135. package/src/llama.cpp/examples/server/tests/requirements.txt +2 -2
  136. package/src/llama.cpp/examples/server/utils.hpp +28 -29
  137. package/src/llama.cpp/examples/simple/CMakeLists.txt +1 -1
  138. package/src/llama.cpp/examples/simple/simple.cpp +21 -29
  139. package/src/llama.cpp/examples/speculative/CMakeLists.txt +1 -1
  140. package/src/llama.cpp/examples/speculative/speculative.cpp +2 -1
  141. package/src/llama.cpp/examples/sycl/CMakeLists.txt +1 -1
  142. package/src/llama.cpp/examples/sycl/build.sh +23 -0
  143. package/src/llama.cpp/examples/sycl/run-llama2.sh +36 -0
  144. package/src/llama.cpp/examples/sycl/win-build-sycl.bat +33 -0
  145. package/src/llama.cpp/examples/sycl/win-run-llama2.bat +9 -0
  146. package/src/llama.cpp/examples/tokenize/CMakeLists.txt +1 -1
  147. package/src/llama.cpp/examples/tokenize/tokenize.cpp +16 -2
  148. package/src/llama.cpp/ggml/CMakeLists.txt +253 -0
  149. package/src/llama.cpp/{cmake → ggml/cmake}/FindSIMD.cmake +6 -6
  150. package/src/llama.cpp/{ggml-backend.h → ggml/include/ggml-backend.h} +22 -17
  151. package/src/llama.cpp/ggml/include/ggml-blas.h +23 -0
  152. package/src/llama.cpp/ggml/include/ggml-cann.h +125 -0
  153. package/src/llama.cpp/{ggml-cuda.h → ggml/include/ggml-cuda.h} +3 -0
  154. package/src/llama.cpp/{ggml-metal.h → ggml/include/ggml-metal.h} +1 -2
  155. package/src/llama.cpp/{ggml-sycl.h → ggml/include/ggml-sycl.h} +3 -10
  156. package/src/llama.cpp/{ggml.h → ggml/include/ggml.h} +80 -85
  157. package/src/llama.cpp/ggml/src/CMakeLists.txt +1329 -0
  158. package/src/llama.cpp/ggml/src/ggml-aarch64.c +2193 -0
  159. package/src/llama.cpp/ggml/src/ggml-aarch64.h +39 -0
  160. package/src/llama.cpp/{ggml-alloc.c → ggml/src/ggml-alloc.c} +100 -49
  161. package/src/llama.cpp/{ggml-backend-impl.h → ggml/src/ggml-backend-impl.h} +20 -8
  162. package/src/llama.cpp/{ggml-backend.c → ggml/src/ggml-backend.c} +307 -167
  163. package/src/llama.cpp/ggml/src/ggml-blas.cpp +367 -0
  164. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +198 -0
  165. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +230 -0
  166. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +2944 -0
  167. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +592 -0
  168. package/src/llama.cpp/ggml/src/ggml-cann/common.h +282 -0
  169. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +32 -0
  170. package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +17 -0
  171. package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +223 -0
  172. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +186 -0
  173. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +180 -0
  174. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +193 -0
  175. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +191 -0
  176. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +208 -0
  177. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +206 -0
  178. package/src/llama.cpp/ggml/src/ggml-cann.cpp +2023 -0
  179. package/src/llama.cpp/{ggml-common.h → ggml/src/ggml-common.h} +41 -7
  180. package/src/llama.cpp/{ggml-impl.h → ggml/src/ggml-impl.h} +113 -9
  181. package/src/llama.cpp/{ggml-kompute.cpp → ggml/src/ggml-kompute.cpp} +33 -18
  182. package/src/llama.cpp/{ggml-quants.c → ggml/src/ggml-quants.c} +1460 -940
  183. package/src/llama.cpp/{ggml-quants.h → ggml/src/ggml-quants.h} +19 -20
  184. package/src/llama.cpp/{ggml-rpc.cpp → ggml/src/ggml-rpc.cpp} +95 -72
  185. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +27 -0
  186. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +53 -0
  187. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +355 -0
  188. package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +195 -0
  189. package/src/llama.cpp/ggml/src/ggml-sycl/concat.hpp +21 -0
  190. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +547 -0
  191. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +27 -0
  192. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +698 -0
  193. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +1023 -0
  194. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.hpp +27 -0
  195. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +3011 -0
  196. package/src/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +3031 -0
  197. package/src/llama.cpp/ggml/src/ggml-sycl/mmq.hpp +33 -0
  198. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +1027 -0
  199. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.hpp +27 -0
  200. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +374 -0
  201. package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +35 -0
  202. package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +66 -0
  203. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +275 -0
  204. package/src/llama.cpp/ggml/src/ggml-sycl/rope.hpp +22 -0
  205. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +251 -0
  206. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.hpp +24 -0
  207. package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +1140 -0
  208. package/src/llama.cpp/ggml/src/ggml-sycl.cpp +5314 -0
  209. package/src/llama.cpp/{ggml-vulkan.cpp → ggml/src/ggml-vulkan.cpp} +1781 -1868
  210. package/src/llama.cpp/{ggml.c → ggml/src/ggml.c} +1245 -2087
  211. package/src/llama.cpp/{sgemm.cpp → ggml/src/llamafile/sgemm.cpp} +21 -24
  212. package/src/llama.cpp/{sgemm.h → ggml/src/llamafile/sgemm.h} +1 -1
  213. package/src/llama.cpp/ggml/src/vulkan-shaders/CMakeLists.txt +5 -0
  214. package/src/llama.cpp/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp +552 -0
  215. package/src/llama.cpp/{llama.h → include/llama.h} +175 -100
  216. package/src/llama.cpp/models/.editorconfig +1 -0
  217. package/src/llama.cpp/models/ggml-vocab-aquila.gguf +0 -0
  218. package/src/llama.cpp/models/ggml-vocab-baichuan.gguf +0 -0
  219. package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf +0 -0
  220. package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf.inp +112 -0
  221. package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf.out +46 -0
  222. package/src/llama.cpp/models/ggml-vocab-command-r.gguf +0 -0
  223. package/src/llama.cpp/models/ggml-vocab-command-r.gguf.inp +112 -0
  224. package/src/llama.cpp/models/ggml-vocab-command-r.gguf.out +46 -0
  225. package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf +0 -0
  226. package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.inp +112 -0
  227. package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.out +46 -0
  228. package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf +0 -0
  229. package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.inp +112 -0
  230. package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.out +46 -0
  231. package/src/llama.cpp/models/ggml-vocab-falcon.gguf +0 -0
  232. package/src/llama.cpp/models/ggml-vocab-falcon.gguf.inp +112 -0
  233. package/src/llama.cpp/models/ggml-vocab-falcon.gguf.out +46 -0
  234. package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf +0 -0
  235. package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf.inp +112 -0
  236. package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf.out +46 -0
  237. package/src/llama.cpp/models/ggml-vocab-gpt-neox.gguf +0 -0
  238. package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf +0 -0
  239. package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf.inp +112 -0
  240. package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf.out +46 -0
  241. package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf +0 -0
  242. package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf.inp +112 -0
  243. package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf.out +46 -0
  244. package/src/llama.cpp/models/ggml-vocab-mpt.gguf +0 -0
  245. package/src/llama.cpp/models/ggml-vocab-mpt.gguf.inp +112 -0
  246. package/src/llama.cpp/models/ggml-vocab-mpt.gguf.out +46 -0
  247. package/src/llama.cpp/models/ggml-vocab-phi-3.gguf +0 -0
  248. package/src/llama.cpp/models/ggml-vocab-phi-3.gguf.inp +112 -0
  249. package/src/llama.cpp/models/ggml-vocab-phi-3.gguf.out +46 -0
  250. package/src/llama.cpp/models/ggml-vocab-qwen2.gguf +0 -0
  251. package/src/llama.cpp/models/ggml-vocab-qwen2.gguf.inp +112 -0
  252. package/src/llama.cpp/models/ggml-vocab-qwen2.gguf.out +46 -0
  253. package/src/llama.cpp/models/ggml-vocab-refact.gguf +0 -0
  254. package/src/llama.cpp/models/ggml-vocab-refact.gguf.inp +112 -0
  255. package/src/llama.cpp/models/ggml-vocab-refact.gguf.out +46 -0
  256. package/src/llama.cpp/models/ggml-vocab-starcoder.gguf +0 -0
  257. package/src/llama.cpp/models/ggml-vocab-starcoder.gguf.inp +112 -0
  258. package/src/llama.cpp/models/ggml-vocab-starcoder.gguf.out +46 -0
  259. package/src/llama.cpp/pocs/vdot/CMakeLists.txt +2 -2
  260. package/src/llama.cpp/requirements/requirements-all.txt +12 -0
  261. package/src/llama.cpp/requirements/requirements-compare-llama-bench.txt +2 -0
  262. package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf.txt +3 -0
  263. package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf_update.txt +3 -0
  264. package/src/llama.cpp/requirements/{requirements-convert.txt → requirements-convert_legacy_llama.txt} +1 -1
  265. package/src/llama.cpp/requirements/requirements-convert_llama_ggml_to_gguf.txt +1 -0
  266. package/src/llama.cpp/requirements/requirements-convert_lora_to_gguf.txt +2 -0
  267. package/src/llama.cpp/requirements/requirements-pydantic.txt +3 -0
  268. package/src/llama.cpp/requirements/requirements-test-tokenizer-random.txt +1 -0
  269. package/src/llama.cpp/requirements.txt +5 -4
  270. package/src/llama.cpp/scripts/build-info.sh +30 -0
  271. package/src/llama.cpp/scripts/install-oneapi.bat +19 -0
  272. package/src/llama.cpp/src/CMakeLists.txt +33 -0
  273. package/src/llama.cpp/src/llama-grammar.cpp +539 -0
  274. package/src/llama.cpp/src/llama-grammar.h +39 -0
  275. package/src/llama.cpp/src/llama-impl.h +26 -0
  276. package/src/llama.cpp/src/llama-sampling.cpp +635 -0
  277. package/src/llama.cpp/src/llama-sampling.h +56 -0
  278. package/src/llama.cpp/src/llama-vocab.cpp +1721 -0
  279. package/src/llama.cpp/src/llama-vocab.h +130 -0
  280. package/src/llama.cpp/{llama.cpp → src/llama.cpp} +5979 -5260
  281. package/src/llama.cpp/{unicode-data.cpp → src/unicode-data.cpp} +851 -802
  282. package/src/llama.cpp/{unicode.cpp → src/unicode.cpp} +52 -30
  283. package/src/llama.cpp/{unicode.h → src/unicode.h} +5 -1
  284. package/src/llama.cpp/tests/CMakeLists.txt +19 -20
  285. package/src/llama.cpp/tests/test-backend-ops.cpp +245 -67
  286. package/src/llama.cpp/tests/test-chat-template.cpp +57 -3
  287. package/src/llama.cpp/tests/test-double-float.cpp +2 -2
  288. package/src/llama.cpp/tests/test-grad0.cpp +2 -2
  289. package/src/llama.cpp/tests/test-grammar-integration.cpp +978 -31
  290. package/src/llama.cpp/tests/test-grammar-parser.cpp +423 -158
  291. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +508 -135
  292. package/src/llama.cpp/tests/test-llama-grammar.cpp +15 -9
  293. package/src/llama.cpp/tests/test-quantize-fns.cpp +1 -1
  294. package/src/llama.cpp/tests/test-quantize-perf.cpp +1 -1
  295. package/src/llama.cpp/tests/test-rope.cpp +3 -4
  296. package/src/llama.cpp/tests/test-sampling.cpp +5 -5
  297. package/src/llama.cpp/tests/test-tokenizer-0.cpp +6 -6
  298. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +20 -15
  299. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +22 -11
  300. package/bin/darwin/arm64/default.metallib +0 -0
  301. package/bin/darwin/x64/default.metallib +0 -0
  302. package/src/llama.cpp/examples/beam-search/CMakeLists.txt +0 -5
  303. package/src/llama.cpp/examples/beam-search/beam-search.cpp +0 -188
  304. package/src/llama.cpp/examples/finetune/finetune.cpp +0 -1862
  305. package/src/llama.cpp/examples/llama.android/llama/CMakeLists.txt +0 -55
  306. package/src/llama.cpp/examples/train-text-from-scratch/CMakeLists.txt +0 -5
  307. package/src/llama.cpp/examples/train-text-from-scratch/train-text-from-scratch.cpp +0 -1253
  308. package/src/llama.cpp/ggml-opencl.cpp +0 -2305
  309. package/src/llama.cpp/ggml-opencl.h +0 -36
  310. package/src/llama.cpp/ggml-sycl.cpp +0 -17340
  311. package/src/llama.cpp/ggml-vulkan-shaders.hpp +0 -81211
  312. package/src/llama.cpp/requirements/requirements-convert-hf-to-gguf-update.txt +0 -2
  313. package/src/llama.cpp/requirements/requirements-convert-hf-to-gguf.txt +0 -2
  314. package/src/llama.cpp/requirements/requirements-convert-llama-ggml-to-gguf.txt +0 -1
  315. package/src/llama.cpp/scripts/gen-build-info-cpp.cmake +0 -24
  316. /package/src/llama.cpp/{ggml-alloc.h → ggml/include/ggml-alloc.h} +0 -0
  317. /package/src/llama.cpp/{ggml-kompute.h → ggml/include/ggml-kompute.h} +0 -0
  318. /package/src/llama.cpp/{ggml-rpc.h → ggml/include/ggml-rpc.h} +0 -0
  319. /package/src/llama.cpp/{ggml-vulkan.h → ggml/include/ggml-vulkan.h} +0 -0
  320. /package/src/llama.cpp/{unicode-data.h → src/unicode-data.h} +0 -0
@@ -17,9 +17,20 @@
  #include "json.hpp"

  // auto generated files (update with ./deps.sh)
+ #include "colorthemes.css.hpp"
+ #include "style.css.hpp"
+ #include "theme-beeninorder.css.hpp"
+ #include "theme-ketivah.css.hpp"
+ #include "theme-mangotango.css.hpp"
+ #include "theme-playground.css.hpp"
+ #include "theme-polarnight.css.hpp"
+ #include "theme-snowstorm.css.hpp"
  #include "index.html.hpp"
+ #include "index-new.html.hpp"
  #include "index.js.hpp"
  #include "completion.js.hpp"
+ #include "system-prompts.js.hpp"
+ #include "prompt-formats.js.hpp"
  #include "json-schema-to-grammar.mjs.hpp"

  #include <atomic>
@@ -112,29 +123,6 @@ struct slot_params {
  json input_suffix;
  };

- struct server_params {
- int32_t port = 8080;
- int32_t read_timeout = 600;
- int32_t write_timeout = 600;
- int32_t n_threads_http = -1;
-
- std::string hostname = "127.0.0.1";
- std::string public_path = "";
- std::string chat_template = "";
- std::string system_prompt = "";
-
- std::vector<std::string> api_keys;
-
- #ifdef CPPHTTPLIB_OPENSSL_SUPPORT
- std::string ssl_key_file = "";
- std::string ssl_cert_file = "";
- #endif
-
- bool slots_endpoint = true;
- bool metrics_endpoint = false;
- std::string slot_save_path;
- };
-
  struct server_slot {
  int id;
  int id_task = -1;
@@ -159,7 +147,7 @@ struct server_slot {
  int32_t n_prompt_tokens = 0;
  int32_t n_prompt_tokens_processed = 0;

- json prompt;
+ json prompt; // can be either a string, array of strings or array of token ids

  // when a task is submitted, we first tokenize the prompt and store it here
  std::vector<llama_token> prompt_tokens;
@@ -659,6 +647,9 @@ struct server_context {

  server_metrics metrics;

+ // Necessary similarity of prompt for slot selection
+ float slot_prompt_similarity = 0.0f;
+
  ~server_context() {
  if (ctx) {
  llama_free(ctx);
@@ -746,6 +737,8 @@ struct server_context {
  slot.ga_n = ga_n;
  slot.ga_w = ga_w;

+ slot.sparams = params.sparams;
+
  slot.reset();

  slots.push_back(slot);
@@ -807,29 +800,94 @@ struct server_context {
  return prompt_tokens;
  }

- server_slot * get_slot(int id) {
- int64_t t_last = ggml_time_us();
-
- server_slot * last_used = nullptr;
-
+ server_slot * get_slot_by_id(int id) {
  for (server_slot & slot : slots) {
- if (slot.id == id && slot.available()) {
+ if (slot.id == id) {
  return &slot;
  }
+ }
+
+ return nullptr;
+ }
+
+ server_slot * get_available_slot(const std::string & prompt) {
+ server_slot * ret = nullptr;
+
+ // find the slot that has at least n% prompt similarity
+ if (ret == nullptr && slot_prompt_similarity != 0.0f && !prompt.empty()) {
+ int max_lcp_len = 0;
+ float similarity = 0;
+
+ for (server_slot & slot : slots) {
+ // skip the slot if it is not available
+ if (!slot.available()) {
+ continue;
+ }
+
+ // skip the slot if it does not contains prompt
+ if (!slot.prompt.is_string()) {
+ continue;
+ }
+
+ // current slot's prompt
+ std::string slot_prompt = slot.prompt.get<std::string>();
+
+ // length of the current slot's prompt
+ int slot_prompt_len = slot_prompt.size();
+
+ // length of the Longest Common Prefix between the current slot's prompt and the input prompt
+ int lcp_len = common_part(slot_prompt, prompt);
+
+ // fraction of the common substring length compared to the current slot's prompt length
+ similarity = static_cast<float>(lcp_len) / slot_prompt_len;
+
+ // select the current slot if the criteria match
+ if (lcp_len > max_lcp_len && similarity > slot_prompt_similarity) {
+ max_lcp_len = lcp_len;
+ ret = &slot;
+ }
+ }
+
+ if (ret != nullptr) {
+ LOG_VERBOSE("selected slot by lcp similarity", {
+ {"id_slot", ret->id},
+ {"max_lcp_len", max_lcp_len},
+ {"similarity", similarity},
+ });
+ }
+ }
+
+ // find the slot that has been least recently used
+ if (ret == nullptr) {
+ int64_t t_last = ggml_time_us();
+ for (server_slot & slot : slots) {
+ // skip the slot if it is not available
+ if (!slot.available()) {
+ continue;
+ }
+
+ // select the current slot if the criteria match
+ if (slot.t_last_used < t_last) {
+ t_last = slot.t_last_used;
+ ret = &slot;
+ }
+ }

- // among all available slots, find the one that has been least recently used
- if (slot.available() && slot.t_last_used < t_last) {
- last_used = &slot;
- t_last = slot.t_last_used;
+ if (ret != nullptr) {
+ LOG_VERBOSE("selected slot by lru", {
+ {"id_slot", ret->id},
+ {"t_last", t_last},
+ });
  }
  }

- return last_used;
+ return ret;
  }

  bool launch_slot_with_task(server_slot & slot, const server_task & task) {
  slot_params default_params;
- llama_sampling_params default_sparams;
+ // Sampling parameter defaults are loaded from the global server context (but individual requests can still override them)
+ llama_sampling_params default_sparams = params.sparams;
  auto & data = task.data;

  if (data.count("__oaicompat") != 0) {
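Note on the hunk above: get_available_slot() now prefers the available slot whose cached prompt shares the longest common prefix (LCP) with the incoming prompt, and only reuses it when the overlap fraction exceeds slot_prompt_similarity; otherwise it falls back to the least recently used slot. A minimal, self-contained sketch of the same selection rule (SlotInfo, common_prefix_len and pick_slot are illustrative names, not the server's actual types):

    #include <climits>
    #include <string>
    #include <vector>

    struct SlotInfo {            // illustrative stand-in for server_slot
        int         id;
        std::string prompt;      // prompt last cached in this slot
        long long   t_last_used; // timestamp of last use
        bool        available;
    };

    // length of the longest common prefix of two strings
    static size_t common_prefix_len(const std::string & a, const std::string & b) {
        size_t n = 0;
        while (n < a.size() && n < b.size() && a[n] == b[n]) {
            n++;
        }
        return n;
    }

    // pick the available slot with the best prefix overlap above min_similarity,
    // otherwise the least recently used available slot
    SlotInfo * pick_slot(std::vector<SlotInfo> & slots, const std::string & prompt, float min_similarity) {
        SlotInfo * best     = nullptr;
        size_t     best_lcp = 0;
        for (auto & s : slots) {
            if (!s.available || s.prompt.empty()) continue;
            const size_t lcp        = common_prefix_len(s.prompt, prompt);
            const float  similarity = (float) lcp / (float) s.prompt.size();
            if (lcp > best_lcp && similarity > min_similarity) {
                best_lcp = lcp;
                best     = &s;
            }
        }
        if (best == nullptr) { // fall back to least recently used
            long long t_last = LLONG_MAX;
            for (auto & s : slots) {
                if (s.available && s.t_last_used < t_last) {
                    t_last = s.t_last_used;
                    best   = &s;
                }
            }
        }
        return best;
    }

With min_similarity set to, say, 0.5, a slot is only reused when at least half of its cached prompt is a prefix of the new request, which is what lets the server keep the matching part of that slot's KV cache.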
@@ -900,16 +958,19 @@ struct server_context {
  slot.params.input_suffix = json_value(data, "input_suffix", default_params.input_suffix);

  // get prompt
- {
+ if (!task.infill) {
  const auto & prompt = data.find("prompt");
  if (prompt == data.end()) {
- send_error(task, "Either \"prompt\" or \"messages\" must be provided", ERROR_TYPE_INVALID_REQUEST);
+ send_error(task, "\"prompt\" must be provided", ERROR_TYPE_INVALID_REQUEST);
  return false;
- } else {
- slot.prompt = *prompt;
  }
- if (slot.prompt.is_array() && slot.prompt.size() == 0) {
- send_error(task, "\"prompt\" cannot be an empty array", ERROR_TYPE_INVALID_REQUEST);
+
+ if ((prompt->is_string()) ||
+ (prompt->is_array() && prompt->size() == 1 && prompt->at(0).is_string()) ||
+ (prompt->is_array() && !prompt->empty() && prompt->at(0).is_number_integer())) {
+ slot.prompt = *prompt;
+ } else {
+ send_error(task, "\"prompt\" must be a string or an array of integers", ERROR_TYPE_INVALID_REQUEST);
  return false;
  }
  }
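The reworked validation above accepts exactly three shapes for "prompt": a plain string, a one-element array holding a string, or a non-empty array of integer token ids. A small helper mirroring the same check, written against nlohmann::json (the library behind the "json.hpp" header the server already includes; prompt_shape_is_valid is an illustrative name, not part of server.cpp):

    #include <nlohmann/json.hpp> // or the vendored "json.hpp" used by server.cpp

    using json = nlohmann::json;

    // mirrors the accepted "prompt" shapes in the hunk above
    static bool prompt_shape_is_valid(const json & prompt) {
        if (prompt.is_string()) {
            return true; // plain string prompt
        }
        if (prompt.is_array() && prompt.size() == 1 && prompt.at(0).is_string()) {
            return true; // single-element array wrapping a string
        }
        if (prompt.is_array() && !prompt.empty() && prompt.at(0).is_number_integer()) {
            return true; // pre-tokenized prompt: array of token ids
        }
        return false;
    }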
@@ -1121,7 +1182,7 @@ struct server_context {

  bool process_token(completion_token_output & result, server_slot & slot) {
  // remember which tokens were sampled - used for repetition penalties during sampling
- const std::string token_str = llama_token_to_piece(ctx, result.tok, false);
+ const std::string token_str = llama_token_to_piece(ctx, result.tok, params.special);
  slot.sampled = result.tok;

  // search stop word and delete it
@@ -1250,7 +1311,7 @@ struct server_context {
  }

  json get_formated_generation(const server_slot & slot) const {
- const auto eos_bias = slot.sparams.logit_bias.find(llama_token_eos(model));
+ const auto eos_bias = slot.sparams.logit_bias.find(llama_token_eos(model));
  const bool ignore_eos = eos_bias != slot.sparams.logit_bias.end() && eos_bias->second < 0.0f && std::isinf(eos_bias->second);

  std::vector<std::string> samplers_sequence;
@@ -1527,13 +1588,33 @@ struct server_context {
  switch (task.type) {
  case SERVER_TASK_TYPE_COMPLETION:
  {
- server_slot * slot = get_slot(json_value(task.data, "id_slot", -1));
+ const int id_slot = json_value(task.data, "id_slot", -1);
+
+ server_slot * slot;
+
+ if (id_slot != -1) {
+ slot = get_slot_by_id(id_slot);
+ } else {
+ std::string prompt;
+ if (task.data.contains("prompt") && task.data.at("prompt").is_string()) {
+ prompt = json_value(task.data, "prompt", std::string());
+ }
+
+ slot = get_available_slot(prompt);
+ }
+
  if (slot == nullptr) {
  // if no slot is available, we defer this task for processing later
  LOG_VERBOSE("no slot is available", {{"id_task", task.id}});
  queue_tasks.defer(task);
  break;
  }
+ if (!slot->available()) {
+ // if requested slot is unavailable, we defer this task for processing later
+ LOG_VERBOSE("requested slot is unavailable", {{"id_task", task.id}});
+ queue_tasks.defer(task);
+ break;
+ }

  if (task.data.contains("system_prompt")) {
  std::string sys_prompt = json_value(task.data, "system_prompt", std::string());
@@ -1650,11 +1731,17 @@ struct server_context {
  case SERVER_TASK_TYPE_SLOT_SAVE:
  {
  int id_slot = task.data.at("id_slot");
- server_slot * slot = get_slot(id_slot);
+ server_slot * slot = get_slot_by_id(id_slot);
  if (slot == nullptr) {
  send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST);
  break;
  }
+ if (!slot->available()) {
+ // if requested slot is unavailable, we defer this task for processing later
+ LOG_VERBOSE("requested slot is unavailable", {{"id_task", task.id}});
+ queue_tasks.defer(task);
+ break;
+ }

  const size_t token_count = slot->cache_tokens.size();
  const int64_t t_start = ggml_time_us();
@@ -1685,11 +1772,17 @@ struct server_context {
  case SERVER_TASK_TYPE_SLOT_RESTORE:
  {
  int id_slot = task.data.at("id_slot");
- server_slot * slot = get_slot(id_slot);
+ server_slot * slot = get_slot_by_id(id_slot);
  if (slot == nullptr) {
  send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST);
  break;
  }
+ if (!slot->available()) {
+ // if requested slot is unavailable, we defer this task for processing later
+ LOG_VERBOSE("requested slot is unavailable", {{"id_task", task.id}});
+ queue_tasks.defer(task);
+ break;
+ }

  const int64_t t_start = ggml_time_us();

@@ -1727,11 +1820,17 @@ struct server_context {
  case SERVER_TASK_TYPE_SLOT_ERASE:
  {
  int id_slot = task.data.at("id_slot");
- server_slot * slot = get_slot(id_slot);
+ server_slot * slot = get_slot_by_id(id_slot);
  if (slot == nullptr) {
  send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST);
  break;
  }
+ if (!slot->available()) {
+ // if requested slot is unavailable, we defer this task for processing later
+ LOG_VERBOSE("requested slot is unavailable", {{"id_task", task.id}});
+ queue_tasks.defer(task);
+ break;
+ }

  // Erase token cache
  const size_t n_erased = slot->cache_tokens.size();
@@ -1906,6 +2005,11 @@ struct server_context {
  int32_t n_batch = llama_n_batch(ctx);
  int32_t n_ubatch = llama_n_ubatch(ctx);

+ // track if this is an embedding or non-embedding batch
+ // if we've added sampled tokens above, we are in non-embedding mode
+ // -1: none, 0: non-embedding, 1: embedding
+ int32_t batch_type = batch.n_tokens > 0 ? 0 : -1;
+
  // next, batch any pending prompts without exceeding n_batch
  if (params.cont_batching || batch.n_tokens == 0) {
  for (auto & slot : slots) {
@@ -1924,6 +2028,7 @@ struct server_context {
  slot.t_start_generation = 0;

  if (slot.infill) {
+ const bool add_bos = llama_should_add_bos_token(model);
  bool suff_rm_leading_spc = true;
  if (params.input_suffix.find_first_of(' ') == 0 && params.input_suffix.size() > 1) {
  params.input_suffix.erase(0, 1);
@@ -1939,11 +2044,21 @@ struct server_context {
  }

  prefix_tokens.insert(prefix_tokens.begin(), llama_token_prefix(model));
- prefix_tokens.insert(prefix_tokens.begin(), llama_token_bos(model)); // always add BOS
- prefix_tokens.insert(prefix_tokens.end(), llama_token_suffix(model));
- prefix_tokens.insert(prefix_tokens.end(), suffix_tokens.begin(), suffix_tokens.end());
- prefix_tokens.push_back(llama_token_middle(model));
- prompt_tokens = prefix_tokens;
+ suffix_tokens.insert(suffix_tokens.begin(), llama_token_suffix(model));
+
+ auto embd_inp = params.spm_infill ? suffix_tokens : prefix_tokens;
+ auto embd_end = params.spm_infill ? prefix_tokens : suffix_tokens;
+ if (add_bos) {
+ embd_inp.insert(embd_inp.begin(), llama_token_bos(model));
+ }
+ embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end());
+
+ const llama_token middle_token = llama_token_middle(model);
+ if (middle_token >= 0) {
+ embd_inp.push_back(middle_token);
+ }
+
+ prompt_tokens = embd_inp;
  } else {
  prompt_tokens = tokenize(slot.prompt, system_prompt.empty()); // add BOS if there isn't system prompt
  }
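The rewritten infill path above builds the fill-in-the-middle input more carefully: prefix and suffix are swapped for suffix-prefix-middle (spm_infill) models, BOS is added only when the model expects one, and the middle token is appended only when the vocabulary defines it. A self-contained sketch of that assembly, assuming the FIM prefix/suffix marker tokens are already at the front of the two token lists (build_infill_input and the typedef are illustrative, not the server's code):

    #include <cstdint>
    #include <vector>

    typedef int32_t llama_token; // illustrative; the real type comes from llama.h

    // Assemble the infill prompt as in the hunk above.
    static std::vector<llama_token> build_infill_input(
            std::vector<llama_token> prefix_tokens,  // starts with the FIM prefix token
            std::vector<llama_token> suffix_tokens,  // starts with the FIM suffix token
            llama_token bos_token, bool add_bos,
            llama_token middle_token,                // may be negative if the vocab has none
            bool spm_infill) {
        // order: suffix first for SPM models, prefix first otherwise
        std::vector<llama_token> inp = spm_infill ? suffix_tokens : prefix_tokens;
        std::vector<llama_token> end = spm_infill ? prefix_tokens : suffix_tokens;
        if (add_bos) {
            inp.insert(inp.begin(), bos_token); // BOS only when the model wants one
        }
        inp.insert(inp.end(), end.begin(), end.end());
        if (middle_token >= 0) {
            inp.push_back(middle_token);        // FIM middle token only if it exists
        }
        return inp;
    }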
@@ -2065,6 +2180,14 @@ struct server_context {
  }
  }

+ // check that we are in the right batch_type, if not defer the slot
+ bool slot_type = slot.embedding ? 1 : 0;
+ if (batch_type == -1) {
+ batch_type = slot_type;
+ } else if (batch_type != slot_type) {
+ continue;
+ }
+
  // keep only the common part
  int p0 = (int) system_tokens.size() + slot.n_past;
  if (!llama_kv_cache_seq_rm(ctx, slot.id + 1, p0, -1)) {
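Together with the batch_type flag introduced a few hunks earlier, this check keeps embedding and completion work out of the same batch: the first slot added to an undecided batch fixes its type, and slots of the other type are skipped until a later batch. A tiny illustrative helper expressing that rule (try_claim_batch is a sketch, not a function in server.cpp):

    #include <cstdint>

    // batch_type: -1 = undecided, 0 = completion (non-embedding), 1 = embedding.
    // Returns true if the slot may join the current batch.
    static bool try_claim_batch(int32_t & batch_type, bool slot_is_embedding) {
        const int32_t slot_type = slot_is_embedding ? 1 : 0;
        if (batch_type == -1) {
            batch_type = slot_type;     // first slot decides the batch type
            return true;
        }
        return batch_type == slot_type; // mixed types are never batched together
    }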
@@ -2166,6 +2289,9 @@ struct server_context {
  {"n_tokens", batch.n_tokens},
  });

+ // make sure we're in the right embedding mode
+ llama_set_embeddings(ctx, batch_type == 1);
+
  // process the created batch of tokens
  for (int32_t i = 0; i < batch.n_tokens; i += n_batch) {
  const int32_t n_tokens = std::min(n_batch, batch.n_tokens - i);
@@ -2323,561 +2449,6 @@ struct server_context {
  }
  };

- static void server_print_usage(const char * argv0, const gpt_params & params, const server_params & sparams) {
- printf("usage: %s [options]\n", argv0);
- printf("\n");
- printf("options:\n");
- printf(" -h, --help show this help message and exit\n");
- printf(" -v, --verbose verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled");
- printf(" -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
- printf(" -tb N, --threads-batch N number of threads to use during batch and prompt processing (default: same as --threads)\n");
- printf(" --threads-http N number of threads in the http server pool to process requests (default: max(hardware concurrency - 1, --parallel N + 2))\n");
- printf(" -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx);
- printf(" --rope-scaling {none,linear,yarn}\n");
- printf(" RoPE frequency scaling method, defaults to linear unless specified by the model\n");
- printf(" --rope-freq-base N RoPE base frequency (default: loaded from model)\n");
- printf(" --rope-freq-scale N RoPE frequency scaling factor, expands context by a factor of 1/N\n");
- printf(" --yarn-ext-factor N YaRN: extrapolation mix factor (default: 1.0, 0.0 = full interpolation)\n");
- printf(" --yarn-attn-factor N YaRN: scale sqrt(t) or attention magnitude (default: 1.0)\n");
- printf(" --yarn-beta-slow N YaRN: high correction dim or alpha (default: %.1f)\n", params.yarn_beta_slow);
- printf(" --yarn-beta-fast N YaRN: low correction dim or beta (default: %.1f)\n", params.yarn_beta_fast);
- printf(" --pooling {none,mean,cls} pooling type for embeddings, use model default if unspecified\n");
- printf(" -dt N, --defrag-thold N\n");
- printf(" KV cache defragmentation threshold (default: %.1f, < 0 - disabled)\n", params.defrag_thold);
- printf(" -b N, --batch-size N logical maximum batch size (default: %d)\n", params.n_batch);
- printf(" -ub N, --ubatch-size N physical maximum batch size (default: %d)\n", params.n_ubatch);
- if (llama_supports_mlock()) {
- printf(" --mlock force system to keep model in RAM rather than swapping or compressing\n");
- }
- if (llama_supports_mmap()) {
- printf(" --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
- }
- printf(" --numa TYPE attempt optimizations that help on some NUMA systems\n");
- printf(" - distribute: spread execution evenly over all nodes\n");
- printf(" - isolate: only spawn threads on CPUs on the node that execution started on\n");
- printf(" - numactl: use the CPU map provided my numactl\n");
- if (llama_supports_gpu_offload()) {
- printf(" -ngl N, --n-gpu-layers N\n");
- printf(" number of layers to store in VRAM\n");
- printf(" -sm SPLIT_MODE, --split-mode SPLIT_MODE\n");
- printf(" how to split the model across multiple GPUs, one of:\n");
- printf(" - none: use one GPU only\n");
- printf(" - layer (default): split layers and KV across GPUs\n");
- printf(" - row: split rows across GPUs\n");
- printf(" -ts SPLIT --tensor-split SPLIT\n");
- printf(" fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1\n");
- printf(" -mg i, --main-gpu i the GPU to use for the model (with split-mode = none),\n");
- printf(" or for intermediate results and KV (with split-mode = row)\n");
- printf(" -nkvo, --no-kv-offload\n");
- printf(" disable KV offload\n");
- }
- printf(" -m FNAME, --model FNAME\n");
- printf(" model path (default: models/$filename with filename from --hf-file or --model-url if set, otherwise %s)\n", DEFAULT_MODEL_PATH);
- printf(" -mu MODEL_URL, --model-url MODEL_URL\n");
- printf(" model download url (default: unused)\n");
- printf(" -hfr REPO, --hf-repo REPO\n");
- printf(" Hugging Face model repository (default: unused)\n");
- printf(" -hff FILE, --hf-file FILE\n");
- printf(" Hugging Face model file (default: unused)\n");
- printf(" -a ALIAS, --alias ALIAS\n");
- printf(" set an alias for the model, will be added as `model` field in completion response\n");
- printf(" --lora FNAME apply LoRA adapter (implies --no-mmap)\n");
- printf(" --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n");
- printf(" --host ip address to listen (default (default: %s)\n", sparams.hostname.c_str());
- printf(" --port PORT port to listen (default (default: %d)\n", sparams.port);
- printf(" --rpc SERVERS comma separated list of RPC servers\n");
- printf(" --path PUBLIC_PATH path from which to serve static files (default: disabled)\n");
- printf(" --api-key API_KEY optional api key to enhance server security. If set, requests must include this key for access.\n");
- printf(" --api-key-file FNAME path to file containing api keys delimited by new lines. If set, requests must include one of the keys for access.\n");
- #ifdef CPPHTTPLIB_OPENSSL_SUPPORT
- printf(" --ssl-key-file FNAME path to file a PEM-encoded SSL private key\n");
- printf(" --ssl-cert-file FNAME path to file a PEM-encoded SSL certificate\n");
- #endif
- printf(" -to N, --timeout N server read/write timeout in seconds (default: %d)\n", sparams.read_timeout);
- printf(" --embeddings enable embedding vector output (default: %s)\n", params.embedding ? "enabled" : "disabled");
- printf(" -np N, --parallel N number of slots for process requests (default: %d)\n", params.n_parallel);
- printf(" -cb, --cont-batching enable continuous batching (a.k.a dynamic batching) (default: enabled)\n");
- printf(" -fa, --flash-attn enable Flash Attention (default: %s)\n", params.flash_attn ? "enabled" : "disabled");
- printf(" -spf FNAME, --system-prompt-file FNAME\n");
- printf(" set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications.\n");
- printf(" -ctk TYPE, --cache-type-k TYPE\n");
- printf(" KV cache data type for K (default: f16)\n");
- printf(" -ctv TYPE, --cache-type-v TYPE\n");
- printf(" KV cache data type for V (default: f16)\n");
- printf(" --log-format log output format: json or text (default: json)\n");
- printf(" --log-disable disables logging to a file.\n");
- printf(" --slots-endpoint-disable disables slots monitoring endpoint.\n");
- printf(" --metrics enable prometheus compatible metrics endpoint (default: %s).\n", sparams.metrics_endpoint ? "enabled" : "disabled");
- printf(" --slot-save-path PATH path to save slot kv cache (default: disabled)\n");
- printf("\n");
- printf(" -n, --n-predict maximum tokens to predict (default: %d)\n", params.n_predict);
- printf(" --override-kv KEY=TYPE:VALUE\n");
- printf(" advanced option to override model metadata by key. may be specified multiple times.\n");
- printf(" types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n");
- printf(" -gan N, --grp-attn-n N set the group attention factor to extend context size through self-extend(default: 1=disabled), used together with group attention width `--grp-attn-w`\n");
- printf(" -gaw N, --grp-attn-w N set the group attention width to extend context size through self-extend(default: 512), used together with group attention factor `--grp-attn-n`\n");
- printf(" --chat-template JINJA_TEMPLATE\n");
- printf(" set custom jinja chat template (default: template taken from model's metadata)\n");
- printf(" only commonly used templates are accepted:\n");
- printf(" https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template\n");
- printf("\n");
- }
-
- static void server_params_parse(int argc, char ** argv, server_params & sparams, gpt_params & params) {
- gpt_params default_params;
- server_params default_sparams;
-
- std::string arg;
- bool invalid_param = false;
-
- for (int i = 1; i < argc; i++) {
- arg = argv[i];
- if (arg == "--port") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- sparams.port = std::stoi(argv[i]);
- } else if (arg == "--rpc") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- params.rpc_servers = argv[i];
- } else if (arg == "--host") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- sparams.hostname = argv[i];
- } else if (arg == "--path") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- sparams.public_path = argv[i];
- } else if (arg == "--api-key") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- sparams.api_keys.push_back(argv[i]);
- } else if (arg == "--api-key-file") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- std::ifstream key_file(argv[i]);
- if (!key_file) {
- fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
- invalid_param = true;
- break;
- }
- std::string key;
- while (std::getline(key_file, key)) {
- if (key.size() > 0) {
- sparams.api_keys.push_back(key);
- }
- }
- key_file.close();
-
- }
- #ifdef CPPHTTPLIB_OPENSSL_SUPPORT
- else if (arg == "--ssl-key-file") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- sparams.ssl_key_file = argv[i];
- } else if (arg == "--ssl-cert-file") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- sparams.ssl_cert_file = argv[i];
- }
- #endif
- else if (arg == "--timeout" || arg == "-to") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- sparams.read_timeout = std::stoi(argv[i]);
- sparams.write_timeout = std::stoi(argv[i]);
- } else if (arg == "-m" || arg == "--model") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- params.model = argv[i];
- } else if (arg == "-mu" || arg == "--model-url") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- params.model_url = argv[i];
- } else if (arg == "-hfr" || arg == "--hf-repo") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- params.hf_repo = argv[i];
- } else if (arg == "-hff" || arg == "--hf-file") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- params.hf_file = argv[i];
- } else if (arg == "-a" || arg == "--alias") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- params.model_alias = argv[i];
- } else if (arg == "-h" || arg == "--help") {
- server_print_usage(argv[0], default_params, default_sparams);
- exit(0);
- } else if (arg == "-c" || arg == "--ctx-size" || arg == "--ctx_size") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- params.n_ctx = std::stoi(argv[i]);
- } else if (arg == "--rope-scaling") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- std::string value(argv[i]);
- /**/ if (value == "none") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_NONE; }
- else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_LINEAR; }
- else if (value == "yarn") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_YARN; }
- else { invalid_param = true; break; }
- } else if (arg == "--rope-freq-base") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- params.rope_freq_base = std::stof(argv[i]);
- } else if (arg == "--rope-freq-scale") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- params.rope_freq_scale = std::stof(argv[i]);
- } else if (arg == "--yarn-ext-factor") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- params.yarn_ext_factor = std::stof(argv[i]);
- }
- else if (arg == "--yarn-attn-factor") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- params.yarn_attn_factor = std::stof(argv[i]);
- } else if (arg == "--yarn-beta-fast") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- params.yarn_beta_fast = std::stof(argv[i]);
- } else if (arg == "--yarn-beta-slow") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- params.yarn_beta_slow = std::stof(argv[i]);
- } else if (arg == "--pooling") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- std::string value(argv[i]);
- /**/ if (value == "none") { params.pooling_type = LLAMA_POOLING_TYPE_NONE; }
- else if (value == "mean") { params.pooling_type = LLAMA_POOLING_TYPE_MEAN; }
- else if (value == "cls") { params.pooling_type = LLAMA_POOLING_TYPE_CLS; }
- else { invalid_param = true; break; }
- } else if (arg == "--defrag-thold" || arg == "-dt") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- params.defrag_thold = std::stof(argv[i]);
- } else if (arg == "--threads" || arg == "-t") {
- if (++i >= argc)
- {
- invalid_param = true;
- break;
- }
- params.n_threads = std::stoi(argv[i]);
- } else if (arg == "--grp-attn-n" || arg == "-gan") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
-
- params.grp_attn_n = std::stoi(argv[i]);
- } else if (arg == "--grp-attn-w" || arg == "-gaw") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
-
- params.grp_attn_w = std::stoi(argv[i]);
- } else if (arg == "--threads-batch" || arg == "-tb") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- params.n_threads_batch = std::stoi(argv[i]);
- } else if (arg == "--threads-http") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- sparams.n_threads_http = std::stoi(argv[i]);
- } else if (arg == "-b" || arg == "--batch-size") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- params.n_batch = std::stoi(argv[i]);
- } else if (arg == "-ub" || arg == "--ubatch-size") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- params.n_ubatch = std::stoi(argv[i]);
- } else if (arg == "--gpu-layers" || arg == "-ngl" || arg == "--n-gpu-layers") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- if (llama_supports_gpu_offload()) {
- params.n_gpu_layers = std::stoi(argv[i]);
- } else {
- LOG_WARNING(
- "Not compiled with GPU offload support, --n-gpu-layers option will be ignored. "
- "See main README.md for information on enabling GPU BLAS support",
- {{"n_gpu_layers", params.n_gpu_layers}});
- }
- } else if (arg == "-nkvo" || arg == "--no-kv-offload") {
- params.no_kv_offload = true;
- } else if (arg == "--split-mode" || arg == "-sm") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- std::string arg_next = argv[i];
- if (arg_next == "none") {
- params.split_mode = LLAMA_SPLIT_MODE_NONE;
- } else if (arg_next == "layer") {
- params.split_mode = LLAMA_SPLIT_MODE_LAYER;
- } else if (arg_next == "row") {
- params.split_mode = LLAMA_SPLIT_MODE_ROW;
- } else {
- invalid_param = true;
- break;
- }
- #ifndef GGML_USE_CUDA
- fprintf(stderr, "warning: llama.cpp was compiled without CUDA. Setting the split mode has no effect.\n");
- #endif // GGML_USE_CUDA
- } else if (arg == "--tensor-split" || arg == "-ts") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- #if defined(GGML_USE_CUDA) || defined(GGML_USE_SYCL)
- std::string arg_next = argv[i];
-
- // split string by , and /
- const std::regex regex{R"([,/]+)"};
- std::sregex_token_iterator it{arg_next.begin(), arg_next.end(), regex, -1};
- std::vector<std::string> split_arg{it, {}};
- GGML_ASSERT(split_arg.size() <= llama_max_devices());
-
- for (size_t i_device = 0; i_device < llama_max_devices(); ++i_device) {
- if (i_device < split_arg.size()) {
- params.tensor_split[i_device] = std::stof(split_arg[i_device]);
- } else {
- params.tensor_split[i_device] = 0.0f;
- }
- }
- #else
- LOG_WARNING("llama.cpp was compiled without CUDA. It is not possible to set a tensor split.\n", {});
- #endif // GGML_USE_CUDA
- } else if (arg == "--main-gpu" || arg == "-mg") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- #if defined(GGML_USE_CUDA) || defined(GGML_USE_SYCL)
- params.main_gpu = std::stoi(argv[i]);
- #else
- LOG_WARNING("llama.cpp was compiled without CUDA. It is not possible to set a main GPU.", {});
- #endif
- } else if (arg == "--lora") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- params.lora_adapter.emplace_back(argv[i], 1.0f);
- params.use_mmap = false;
- } else if (arg == "--lora-scaled") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- const char * lora_adapter = argv[i];
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- params.lora_adapter.emplace_back(lora_adapter, std::stof(argv[i]));
- params.use_mmap = false;
- } else if (arg == "--lora-base") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- params.lora_base = argv[i];
- } else if (arg == "-v" || arg == "--verbose") {
- #if SERVER_VERBOSE != 1
- LOG_WARNING("server.cpp is not built with verbose logging.", {});
- #else
- server_verbose = true;
- #endif
- } else if (arg == "--mlock") {
- params.use_mlock = true;
- } else if (arg == "--no-mmap") {
- params.use_mmap = false;
- } else if (arg == "--numa") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- } else {
- std::string value(argv[i]);
- /**/ if (value == "distribute" || value == "" ) { params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; }
- else if (value == "isolate") { params.numa = GGML_NUMA_STRATEGY_ISOLATE; }
- else if (value == "numactl") { params.numa = GGML_NUMA_STRATEGY_NUMACTL; }
- else { invalid_param = true; break; }
- }
- } else if (arg == "--embedding" || arg == "--embeddings") {
- params.embedding = true;
- } else if (arg == "-cb" || arg == "--cont-batching") {
- params.cont_batching = true;
- } else if (arg == "-fa" || arg == "--flash-attn") {
- params.flash_attn = true;
- } else if (arg == "-np" || arg == "--parallel") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- params.n_parallel = std::stoi(argv[i]);
- } else if (arg == "-n" || arg == "--n-predict") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- params.n_predict = std::stoi(argv[i]);
- } else if (arg == "-spf" || arg == "--system-prompt-file") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- std::ifstream file(argv[i]);
- if (!file) {
- fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
- invalid_param = true;
- break;
- }
- std::string system_prompt;
- std::copy(
- std::istreambuf_iterator<char>(file),
- std::istreambuf_iterator<char>(),
- std::back_inserter(system_prompt)
- );
- sparams.system_prompt = system_prompt;
- } else if (arg == "-ctk" || arg == "--cache-type-k") {
- params.cache_type_k = argv[++i];
- } else if (arg == "-ctv" || arg == "--cache-type-v") {
- params.cache_type_v = argv[++i];
- } else if (arg == "--log-format") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- if (std::strcmp(argv[i], "json") == 0) {
- server_log_json = true;
- } else if (std::strcmp(argv[i], "text") == 0) {
- server_log_json = false;
- } else {
- invalid_param = true;
- break;
- }
- } else if (arg == "--log-disable") {
- log_set_target(stdout);
- LOG_INFO("logging to file is disabled.", {});
- } else if (arg == "--slots-endpoint-disable") {
- sparams.slots_endpoint = false;
- } else if (arg == "--metrics") {
- sparams.metrics_endpoint = true;
- } else if (arg == "--slot-save-path") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- sparams.slot_save_path = argv[i];
- // if doesn't end with DIRECTORY_SEPARATOR, add it
- if (!sparams.slot_save_path.empty() && sparams.slot_save_path[sparams.slot_save_path.size() - 1] != DIRECTORY_SEPARATOR) {
- sparams.slot_save_path += DIRECTORY_SEPARATOR;
- }
- } else if (arg == "--chat-template") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- if (!verify_custom_template(argv[i])) {
- fprintf(stderr, "error: the supplied chat template is not supported: %s\n", argv[i]);
- fprintf(stderr, "note: llama.cpp does not use jinja parser, we only support commonly used templates\n");
- invalid_param = true;
- break;
- }
- sparams.chat_template = argv[i];
- } else if (arg == "--override-kv") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- if (!string_parse_kv_override(argv[i], params.kv_overrides)) {
- fprintf(stderr, "error: Invalid type for KV override: %s\n", argv[i]);
- invalid_param = true;
- break;
- }
- } else {
- fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
- server_print_usage(argv[0], default_params, default_sparams);
- exit(1);
- }
- }
-
- gpt_params_handle_model_default(params);
-
- if (!params.kv_overrides.empty()) {
- params.kv_overrides.emplace_back();
- params.kv_overrides.back().key[0] = 0;
- }
-
- if (invalid_param) {
- fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
- server_print_usage(argv[0], default_params, default_sparams);
- exit(1);
- }
- }
-
  static void log_server_request(const httplib::Request & req, const httplib::Response & res) {
  // skip GH copilot requests when using default port
  if (req.path == "/v1/health" || req.path == "/v1/completions") {
@@ -2918,16 +2489,22 @@ int main(int argc, char ** argv) {
2918
2489
  log_disable();
2919
2490
  #endif
2920
2491
  // own arguments required by this example
2921
- gpt_params params;
2922
- server_params sparams;
2492
+ gpt_params params;
2493
+
2494
+ if (!gpt_params_parse(argc, argv, params)) {
2495
+ gpt_params_print_usage(argc, argv, params);
2496
+ return 1;
2497
+ }
2498
+
2499
+ // TODO: not great to use extern vars
2500
+ server_log_json = params.log_json;
2501
+ server_verbose = params.verbosity > 0;
2923
2502
 
2924
2503
  // struct that contains llama context and inference
2925
2504
  server_context ctx_server;
2926
2505
 
2927
- server_params_parse(argc, argv, sparams, params);
2928
-
2929
- if (!sparams.system_prompt.empty()) {
2930
- ctx_server.system_prompt_set(sparams.system_prompt);
2506
+ if (!params.system_prompt.empty()) {
2507
+ ctx_server.system_prompt_set(params.system_prompt);
2931
2508
  }
2932
2509
 
2933
2510
  if (params.model_alias == "unknown") {
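Note on the hunks above: the server's bespoke server_params/server_params_parse pair is removed and every option now lives on the shared gpt_params struct parsed by gpt_params_parse from common. A minimal sketch of the new entry-point flow, using only the functions and fields visible in this diff (the standalone main below is illustrative, not the server's actual one):

    // sketch: unified CLI parsing via common's gpt_params (hypothetical driver)
    #include "common.h"

    int main(int argc, char ** argv) {
        gpt_params params;
        if (!gpt_params_parse(argc, argv, params)) {
            gpt_params_print_usage(argc, argv, params);  // shown on bad/unknown arguments
            return 1;
        }
        // fields that used to sit on server_params now live here, e.g.
        // params.hostname, params.port, params.chat_template, params.api_keys
        return 0;
    }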
@@ -2951,10 +2528,10 @@ int main(int argc, char ** argv) {

  std::unique_ptr<httplib::Server> svr;
  #ifdef CPPHTTPLIB_OPENSSL_SUPPORT
- if (sparams.ssl_key_file != "" && sparams.ssl_cert_file != "") {
- LOG_INFO("Running with SSL", {{"key", sparams.ssl_key_file}, {"cert", sparams.ssl_cert_file}});
+ if (params.ssl_file_key != "" && params.ssl_file_cert != "") {
+ LOG_INFO("Running with SSL", {{"key", params.ssl_file_key}, {"cert", params.ssl_file_cert}});
  svr.reset(
- new httplib::SSLServer(sparams.ssl_cert_file.c_str(), sparams.ssl_key_file.c_str())
+ new httplib::SSLServer(params.ssl_file_cert.c_str(), params.ssl_file_key.c_str())
  );
  } else {
  LOG_INFO("Running without SSL", {});
@@ -3008,26 +2585,29 @@ int main(int argc, char ** argv) {
  });

  // set timeouts and change hostname and port
- svr->set_read_timeout (sparams.read_timeout);
- svr->set_write_timeout(sparams.write_timeout);
+ svr->set_read_timeout (params.timeout_read);
+ svr->set_write_timeout(params.timeout_write);

- if (!svr->bind_to_port(sparams.hostname, sparams.port)) {
- fprintf(stderr, "\ncouldn't bind to server socket: hostname=%s port=%d\n\n", sparams.hostname.c_str(), sparams.port);
+ if (!svr->bind_to_port(params.hostname, params.port)) {
+ fprintf(stderr, "\ncouldn't bind to server socket: hostname=%s port=%d\n\n", params.hostname.c_str(), params.port);
  return 1;
  }

  std::unordered_map<std::string, std::string> log_data;

- log_data["hostname"] = sparams.hostname;
- log_data["port"] = std::to_string(sparams.port);
+ log_data["hostname"] = params.hostname;
+ log_data["port"] = std::to_string(params.port);

- if (sparams.api_keys.size() == 1) {
- auto key = sparams.api_keys[0];
+ if (params.api_keys.size() == 1) {
+ auto key = params.api_keys[0];
  log_data["api_key"] = "api_key: ****" + key.substr(std::max((int)(key.length() - 4), 0));
- } else if (sparams.api_keys.size() > 1) {
- log_data["api_key"] = "api_key: " + std::to_string(sparams.api_keys.size()) + " keys loaded";
+ } else if (params.api_keys.size() > 1) {
+ log_data["api_key"] = "api_key: " + std::to_string(params.api_keys.size()) + " keys loaded";
  }

+ // Necessary similarity of prompt for slot selection
+ ctx_server.slot_prompt_similarity = params.slot_prompt_similarity;
+
  // load the model
  if (!ctx_server.load_model(params)) {
  state.store(SERVER_STATE_ERROR);
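The api_key log entry above never records a full key: only the last four characters survive the masking expression. A tiny standalone illustration of that expression (standard library only; the key value is made up):

    // sketch: mask an API key the same way log_data["api_key"] is built above
    #include <algorithm>
    #include <cstdio>
    #include <string>

    int main() {
        const std::string key = "sk-example-1234";  // hypothetical key
        const std::string masked =
            "api_key: ****" + key.substr(std::max((int)(key.length() - 4), 0));
        std::printf("%s\n", masked.c_str());        // prints: api_key: ****1234
    }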
@@ -3042,26 +2622,18 @@ int main(int argc, char ** argv) {
  const auto model_meta = ctx_server.model_meta();

  // if a custom chat template is not supplied, we will use the one that comes with the model (if any)
- if (sparams.chat_template.empty()) {
+ if (params.chat_template.empty()) {
  if (!ctx_server.validate_model_chat_template()) {
- LOG_ERROR("The chat template that comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses", {});
- sparams.chat_template = "chatml";
+ LOG_WARNING("The chat template that comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses", {});
+ params.chat_template = "chatml";
  }
  }

  // print sample chat example to make it clear which template is used
  {
- json chat;
- chat.push_back({{"role", "system"}, {"content", "You are a helpful assistant"}});
- chat.push_back({{"role", "user"}, {"content", "Hello"}});
- chat.push_back({{"role", "assistant"}, {"content", "Hi there"}});
- chat.push_back({{"role", "user"}, {"content", "How are you?"}});
-
- const std::string chat_example = format_chat(ctx_server.model, sparams.chat_template, chat);
-
  LOG_INFO("chat template", {
- {"chat_example", chat_example},
- {"built_in", sparams.chat_template.empty()},
+ {"chat_example", llama_chat_format_example(ctx_server.model, params.chat_template)},
+ {"built_in", params.chat_template.empty()},
  });
  }
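The hand-rolled sample conversation is gone; llama_chat_format_example now renders the example through whichever template ends up in effect. The selection order implied by the hunk can be summarised as a small pure function (illustrative only; validate_model_chat_template is stood in for by a boolean):

    // sketch: chat-template precedence implied by the hunk above
    #include <string>

    std::string resolve_chat_template(const std::string & cli_template, bool model_template_supported) {
        if (!cli_template.empty()) {
            return cli_template;   // --chat-template always wins
        }
        if (!model_template_supported) {
            return "chatml";       // unsupported built-in template falls back to chatml (with a warning)
        }
        return "";                 // empty string: keep using the model's own template
    }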

@@ -3069,7 +2641,7 @@ int main(int argc, char ** argv) {
  // Middlewares
  //

- auto middleware_validate_api_key = [&sparams, &res_error](const httplib::Request & req, httplib::Response & res) {
+ auto middleware_validate_api_key = [&params, &res_error](const httplib::Request & req, httplib::Response & res) {
  // TODO: should we apply API key to all endpoints, including "/health" and "/models"?
  static const std::set<std::string> protected_endpoints = {
  "/props",
@@ -3087,7 +2659,7 @@ int main(int argc, char ** argv) {
  };

  // If API key is not set, skip validation
- if (sparams.api_keys.empty()) {
+ if (params.api_keys.empty()) {
  return true;
  }

@@ -3102,7 +2674,7 @@ int main(int argc, char ** argv) {
  std::string prefix = "Bearer ";
  if (auth_header.substr(0, prefix.size()) == prefix) {
  std::string received_api_key = auth_header.substr(prefix.size());
- if (std::find(sparams.api_keys.begin(), sparams.api_keys.end(), received_api_key) != sparams.api_keys.end()) {
+ if (std::find(params.api_keys.begin(), params.api_keys.end(), received_api_key) != params.api_keys.end()) {
  return true; // API key is valid
  }
  }
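With server_params gone, the middleware reads its key list from params.api_keys. A hedged client-side sketch of calling one of the protected endpoints with a Bearer key, reusing the same cpp-httplib the server embeds (host, port and key are illustrative):

    // sketch: call a protected endpoint with a Bearer API key (client side)
    #include "httplib.h"
    #include <cstdio>

    int main() {
        httplib::Client cli("http://localhost:8080");  // hypothetical host/port
        httplib::Headers headers = {{"Authorization", "Bearer my-secret-key"}};
        auto res = cli.Get("/props", headers);         // /props is in the protected set above
        if (res) {
            std::printf("%d\n%s\n", res->status, res->body.c_str());  // 401 + error JSON on a wrong key
        }
    }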
@@ -3157,7 +2729,7 @@ int main(int argc, char ** argv) {
  };

  res.status = 200; // HTTP OK
- if (sparams.slots_endpoint && req.has_param("include_slots")) {
+ if (params.endpoint_slots && req.has_param("include_slots")) {
  health["slots"] = result.data.at("slots");
  }

@@ -3183,7 +2755,7 @@ int main(int argc, char ** argv) {
  };

  const auto handle_slots = [&](const httplib::Request &, httplib::Response & res) {
- if (!sparams.slots_endpoint) {
+ if (!params.endpoint_slots) {
  res_error(res, format_error_response("This server does not support slots endpoint.", ERROR_TYPE_NOT_SUPPORTED));
  return;
  }
@@ -3207,7 +2779,7 @@ int main(int argc, char ** argv) {
  };

  const auto handle_metrics = [&](const httplib::Request &, httplib::Response & res) {
- if (!sparams.metrics_endpoint) {
+ if (!params.endpoint_metrics) {
  res_error(res, format_error_response("This server does not support metrics endpoint.", ERROR_TYPE_NOT_SUPPORTED));
  return;
  }
@@ -3307,14 +2879,14 @@ int main(int argc, char ** argv) {
  res.status = 200; // HTTP OK
  };

- const auto handle_slots_save = [&ctx_server, &res_error, &sparams](const httplib::Request & req, httplib::Response & res, int id_slot) {
+ const auto handle_slots_save = [&ctx_server, &res_error, &params](const httplib::Request & req, httplib::Response & res, int id_slot) {
  json request_data = json::parse(req.body);
  std::string filename = request_data.at("filename");
  if (!fs_validate_filename(filename)) {
  res_error(res, format_error_response("Invalid filename", ERROR_TYPE_INVALID_REQUEST));
  return;
  }
- std::string filepath = sparams.slot_save_path + filename;
+ std::string filepath = params.slot_save_path + filename;

  server_task task;
  task.type = SERVER_TASK_TYPE_SLOT_SAVE;
@@ -3337,14 +2909,14 @@ int main(int argc, char ** argv) {
  }
  };

- const auto handle_slots_restore = [&ctx_server, &res_error, &sparams](const httplib::Request & req, httplib::Response & res, int id_slot) {
+ const auto handle_slots_restore = [&ctx_server, &res_error, &params](const httplib::Request & req, httplib::Response & res, int id_slot) {
  json request_data = json::parse(req.body);
  std::string filename = request_data.at("filename");
  if (!fs_validate_filename(filename)) {
  res_error(res, format_error_response("Invalid filename", ERROR_TYPE_INVALID_REQUEST));
  return;
  }
- std::string filepath = sparams.slot_save_path + filename;
+ std::string filepath = params.slot_save_path + filename;

  server_task task;
  task.type = SERVER_TASK_TYPE_SLOT_RESTORE;
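Both slot handlers read a filename field from the JSON body, validate it with fs_validate_filename and join it onto params.slot_save_path. A hedged sketch of what a client request might look like; note that the action query parameter is an assumption about handle_slots_action, whose body this diff does not show:

    // sketch: ask the server to save a slot via the /slots/:id_slot route (client side)
    #include "httplib.h"
    #include "json.hpp"
    #include <cstdio>

    int main() {
        httplib::Client cli("http://localhost:8080");      // hypothetical host/port
        nlohmann::json body = {{"filename", "slot0.bin"}};  // must pass fs_validate_filename server-side
        // "?action=save" is assumed, not shown in this diff
        auto res = cli.Post("/slots/0?action=save", body.dump(), "application/json");
        if (res) std::printf("%d\n", res->status);
    }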
@@ -3414,17 +2986,31 @@ int main(int argc, char ** argv) {
  };

  const auto handle_props = [&ctx_server](const httplib::Request & req, httplib::Response & res) {
+ std::string template_key = "tokenizer.chat_template", curr_tmpl;
+ int32_t tlen = llama_model_meta_val_str(ctx_server.model, template_key.c_str(), nullptr, 0);
+ if (tlen > 0) {
+ std::vector<char> curr_tmpl_buf(tlen + 1, 0);
+ if (llama_model_meta_val_str(ctx_server.model, template_key.c_str(), curr_tmpl_buf.data(), curr_tmpl_buf.size()) == tlen) {
+ curr_tmpl = std::string(curr_tmpl_buf.data(), tlen);
+ }
+ }
  res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
  json data = {
  { "system_prompt", ctx_server.system_prompt.c_str() },
  { "default_generation_settings", ctx_server.default_generation_settings_for_props },
- { "total_slots", ctx_server.params.n_parallel }
+ { "total_slots", ctx_server.params.n_parallel },
+ { "chat_template", curr_tmpl.c_str() }
  };

  res.set_content(data.dump(), "application/json; charset=utf-8");
  };

  const auto handle_completions = [&ctx_server, &res_error](const httplib::Request & req, httplib::Response & res) {
+ if (ctx_server.params.embedding) {
+ res_error(res, format_error_response("This server does not support completions. Start it without `--embeddings`", ERROR_TYPE_NOT_SUPPORTED));
+ return;
+ }
+
  res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));

  json data = json::parse(req.body);
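/props now also exposes the model's embedded tokenizer.chat_template, fetched above with the usual two-call llama_model_meta_val_str pattern (first call with a null buffer to get the length, second call to fill it). A hedged sketch of a client reading the new field (host and port are illustrative):

    // sketch: fetch /props and print the new "chat_template" field (client side)
    #include "httplib.h"
    #include "json.hpp"
    #include <iostream>

    int main() {
        httplib::Client cli("http://localhost:8080");  // hypothetical host/port
        if (auto res = cli.Get("/props")) {
            const auto props = nlohmann::json::parse(res->body);
            std::cout << "total_slots:   " << props["total_slots"]   << "\n"
                      << "chat_template: " << props["chat_template"] << std::endl;
        }
    }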
@@ -3519,9 +3105,14 @@ int main(int argc, char ** argv) {
  res.set_content(models.dump(), "application/json; charset=utf-8");
  };

- const auto handle_chat_completions = [&ctx_server, &sparams, &res_error](const httplib::Request & req, httplib::Response & res) {
+ const auto handle_chat_completions = [&ctx_server, &params, &res_error](const httplib::Request & req, httplib::Response & res) {
+ if (ctx_server.params.embedding) {
+ res_error(res, format_error_response("This server does not support chat completions. Start it without `--embeddings`", ERROR_TYPE_NOT_SUPPORTED));
+ return;
+ }
+
  res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
- json data = oaicompat_completion_params_parse(ctx_server.model, json::parse(req.body), sparams.chat_template);
+ json data = oaicompat_completion_params_parse(ctx_server.model, json::parse(req.body), params.chat_template);

  const int id_task = ctx_server.queue_tasks.get_new_id();

@@ -3592,6 +3183,11 @@ int main(int argc, char ** argv) {
  };

  const auto handle_infill = [&ctx_server, &res_error](const httplib::Request & req, httplib::Response & res) {
+ if (ctx_server.params.embedding) {
+ res_error(res, format_error_response("This server does not support infill. Start it without `--embeddings`", ERROR_TYPE_NOT_SUPPORTED));
+ return;
+ }
+
  res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));

  json data = json::parse(req.body);
@@ -3678,13 +3274,8 @@ int main(int argc, char ** argv) {
  return res.set_content(data.dump(), "application/json; charset=utf-8");
  };

- const auto handle_embeddings = [&params, &ctx_server, &res_error](const httplib::Request & req, httplib::Response & res) {
+ const auto handle_embeddings = [&ctx_server, &res_error](const httplib::Request & req, httplib::Response & res) {
  res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
- if (!params.embedding) {
- res.status = 501;
- res.set_content("This server does not support embeddings. Start it with `--embeddings`", "text/plain; charset=utf-8");
- return;
- }

  const json body = json::parse(req.body);
  bool is_openai = false;
@@ -3746,17 +3337,29 @@ int main(int argc, char ** argv) {
  //

  // register static assets routes
- if (!sparams.public_path.empty()) {
+ if (!params.public_path.empty()) {
  // Set the base directory for serving static files
- svr->set_base_dir(sparams.public_path);
+ svr->set_base_dir(params.public_path);
  }

  // using embedded static files
- svr->Get("/", handle_static_file(index_html, index_html_len, "text/html; charset=utf-8"));
- svr->Get("/index.js", handle_static_file(index_js, index_js_len, "text/javascript; charset=utf-8"));
- svr->Get("/completion.js", handle_static_file(completion_js, completion_js_len, "text/javascript; charset=utf-8"));
- svr->Get("/json-schema-to-grammar.mjs", handle_static_file(
- json_schema_to_grammar_mjs, json_schema_to_grammar_mjs_len, "text/javascript; charset=utf-8"));
+ svr->Get("/", handle_static_file(index_html, index_html_len, "text/html; charset=utf-8"));
+ svr->Get("/index.js", handle_static_file(index_js, index_js_len, "text/javascript; charset=utf-8"));
+ svr->Get("/completion.js", handle_static_file(completion_js, completion_js_len, "text/javascript; charset=utf-8"));
+ svr->Get("/json-schema-to-grammar.mjs", handle_static_file(json_schema_to_grammar_mjs, json_schema_to_grammar_mjs_len, "text/javascript; charset=utf-8"));
+
+ // add new-ui files
+ svr->Get("/colorthemes.css", handle_static_file(colorthemes_css, colorthemes_css_len, "text/css; charset=utf-8"));
+ svr->Get("/style.css", handle_static_file(style_css, style_css_len, "text/css; charset=utf-8"));
+ svr->Get("/theme-beeninorder.css", handle_static_file(theme_beeninorder_css, theme_beeninorder_css_len, "text/css; charset=utf-8"));
+ svr->Get("/theme-ketivah.css", handle_static_file(theme_ketivah_css, theme_ketivah_css_len, "text/css; charset=utf-8"));
+ svr->Get("/theme-mangotango.css", handle_static_file(theme_mangotango_css, theme_mangotango_css_len, "text/css; charset=utf-8"));
+ svr->Get("/theme-playground.css", handle_static_file(theme_playground_css, theme_playground_css_len, "text/css; charset=utf-8"));
+ svr->Get("/theme-polarnight.css", handle_static_file(theme_polarnight_css, theme_polarnight_css_len, "text/css; charset=utf-8"));
+ svr->Get("/theme-snowstorm.css", handle_static_file(theme_snowstorm_css, theme_snowstorm_css_len, "text/css; charset=utf-8"));
+ svr->Get("/index-new.html", handle_static_file(index_new_html, index_new_html_len, "text/html; charset=utf-8"));
+ svr->Get("/system-prompts.js", handle_static_file(system_prompts_js, system_prompts_js_len, "text/javascript; charset=utf-8"));
+ svr->Get("/prompt-formats.js", handle_static_file(prompt_formats_js, prompt_formats_js_len, "text/javascript; charset=utf-8"));

  // register API routes
  svr->Get ("/health", handle_health);
@@ -3775,7 +3378,7 @@ int main(int argc, char ** argv) {
  svr->Post("/v1/embeddings", handle_embeddings);
  svr->Post("/tokenize", handle_tokenize);
  svr->Post("/detokenize", handle_detokenize);
- if (!sparams.slot_save_path.empty()) {
+ if (!params.slot_save_path.empty()) {
  // only enable slot endpoints if slot_save_path is set
  svr->Post("/slots/:id_slot", handle_slots_action);
  }
@@ -3783,12 +3386,12 @@ int main(int argc, char ** argv) {
  //
  // Start the server
  //
- if (sparams.n_threads_http < 1) {
+ if (params.n_threads_http < 1) {
  // +2 threads for monitoring endpoints
- sparams.n_threads_http = std::max(params.n_parallel + 2, (int32_t) std::thread::hardware_concurrency() - 1);
+ params.n_threads_http = std::max(params.n_parallel + 2, (int32_t) std::thread::hardware_concurrency() - 1);
  }
- log_data["n_threads_http"] = std::to_string(sparams.n_threads_http);
- svr->new_task_queue = [&sparams] { return new httplib::ThreadPool(sparams.n_threads_http); };
+ log_data["n_threads_http"] = std::to_string(params.n_threads_http);
+ svr->new_task_queue = [&params] { return new httplib::ThreadPool(params.n_threads_http); };

  LOG_INFO("HTTP server listening", log_data);