@fugood/llama.node 0.2.3 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (319)
  1. package/CMakeLists.txt +6 -3
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  8. package/bin/win32/arm64/llama-node.node +0 -0
  9. package/bin/win32/arm64/node.lib +0 -0
  10. package/bin/win32/x64/llama-node.node +0 -0
  11. package/bin/win32/x64/node.lib +0 -0
  12. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  13. package/bin/win32-vulkan/arm64/node.lib +0 -0
  14. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/x64/node.lib +0 -0
  16. package/lib/binding.ts +8 -1
  17. package/package.json +3 -3
  18. package/patches/llama.patch +12 -12
  19. package/src/DetokenizeWorker.cpp +1 -1
  20. package/src/LlamaContext.cpp +33 -1
  21. package/src/LlamaContext.h +1 -0
  22. package/src/llama.cpp/.github/workflows/bench.yml +310 -0
  23. package/src/llama.cpp/.github/workflows/build.yml +1315 -0
  24. package/src/llama.cpp/.github/workflows/close-issue.yml +23 -0
  25. package/src/llama.cpp/.github/workflows/docker.yml +116 -0
  26. package/src/llama.cpp/.github/workflows/editorconfig.yml +27 -0
  27. package/src/llama.cpp/.github/workflows/gguf-publish.yml +44 -0
  28. package/src/llama.cpp/.github/workflows/labeler.yml +17 -0
  29. package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +65 -0
  30. package/src/llama.cpp/.github/workflows/nix-ci.yml +72 -0
  31. package/src/llama.cpp/.github/workflows/nix-flake-update.yml +22 -0
  32. package/src/llama.cpp/.github/workflows/nix-publish-flake.yml +36 -0
  33. package/src/llama.cpp/.github/workflows/python-check-requirements.yml +35 -0
  34. package/src/llama.cpp/.github/workflows/python-lint.yml +23 -0
  35. package/src/llama.cpp/.github/workflows/python-type-check.yml +38 -0
  36. package/src/llama.cpp/.github/workflows/server.yml +183 -0
  37. package/src/llama.cpp/CMakeLists.txt +91 -1245
  38. package/src/llama.cpp/cmake/arm64-windows-llvm.cmake +1 -1
  39. package/src/llama.cpp/cmake/build-info.cmake +58 -0
  40. package/src/llama.cpp/cmake/git-vars.cmake +22 -0
  41. package/src/llama.cpp/common/CMakeLists.txt +4 -3
  42. package/src/llama.cpp/common/build-info.cpp.in +4 -0
  43. package/src/llama.cpp/common/common.cpp +1116 -877
  44. package/src/llama.cpp/common/common.h +191 -77
  45. package/src/llama.cpp/common/grammar-parser.cpp +118 -31
  46. package/src/llama.cpp/common/json-schema-to-grammar.cpp +346 -65
  47. package/src/llama.cpp/common/log.h +1 -1
  48. package/src/llama.cpp/common/ngram-cache.h +10 -3
  49. package/src/llama.cpp/common/sampling.cpp +19 -10
  50. package/src/llama.cpp/docs/build.md +353 -0
  51. package/src/llama.cpp/examples/CMakeLists.txt +22 -22
  52. package/src/llama.cpp/examples/baby-llama/CMakeLists.txt +1 -1
  53. package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +6 -6
  54. package/src/llama.cpp/examples/batched/CMakeLists.txt +1 -1
  55. package/src/llama.cpp/examples/batched/batched.cpp +52 -55
  56. package/src/llama.cpp/examples/batched-bench/CMakeLists.txt +1 -1
  57. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +20 -72
  58. package/src/llama.cpp/examples/benchmark/CMakeLists.txt +1 -1
  59. package/src/llama.cpp/examples/chat-13B.bat +57 -0
  60. package/src/llama.cpp/examples/convert-llama2c-to-ggml/CMakeLists.txt +1 -1
  61. package/src/llama.cpp/examples/{finetune → cvector-generator}/CMakeLists.txt +2 -2
  62. package/src/llama.cpp/examples/cvector-generator/completions.txt +582 -0
  63. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +503 -0
  64. package/src/llama.cpp/examples/cvector-generator/mean.hpp +48 -0
  65. package/src/llama.cpp/examples/cvector-generator/negative.txt +4 -0
  66. package/src/llama.cpp/examples/cvector-generator/pca.hpp +325 -0
  67. package/src/llama.cpp/examples/cvector-generator/positive.txt +4 -0
  68. package/src/llama.cpp/examples/deprecation-warning/deprecation-warning.cpp +35 -0
  69. package/src/llama.cpp/examples/embedding/CMakeLists.txt +1 -1
  70. package/src/llama.cpp/examples/embedding/embedding.cpp +94 -46
  71. package/src/llama.cpp/examples/eval-callback/CMakeLists.txt +2 -2
  72. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +4 -6
  73. package/src/llama.cpp/examples/export-lora/CMakeLists.txt +1 -1
  74. package/src/llama.cpp/examples/export-lora/export-lora.cpp +344 -386
  75. package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +2 -2
  76. package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +30 -25
  77. package/src/llama.cpp/examples/gguf/CMakeLists.txt +1 -1
  78. package/src/llama.cpp/examples/gguf/gguf.cpp +5 -0
  79. package/src/llama.cpp/examples/gguf-hash/CMakeLists.txt +15 -0
  80. package/src/llama.cpp/examples/gguf-hash/deps/rotate-bits/rotate-bits.h +46 -0
  81. package/src/llama.cpp/examples/gguf-hash/deps/sha1/sha1.c +295 -0
  82. package/src/llama.cpp/examples/gguf-hash/deps/sha1/sha1.h +52 -0
  83. package/src/llama.cpp/examples/gguf-hash/deps/sha256/sha256.c +221 -0
  84. package/src/llama.cpp/examples/gguf-hash/deps/sha256/sha256.h +24 -0
  85. package/src/llama.cpp/examples/gguf-hash/deps/xxhash/xxhash.c +42 -0
  86. package/src/llama.cpp/examples/gguf-hash/deps/xxhash/xxhash.h +7093 -0
  87. package/src/llama.cpp/examples/gguf-hash/gguf-hash.cpp +693 -0
  88. package/src/llama.cpp/examples/gguf-split/CMakeLists.txt +1 -1
  89. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +3 -3
  90. package/src/llama.cpp/examples/gritlm/CMakeLists.txt +1 -1
  91. package/src/llama.cpp/examples/gritlm/gritlm.cpp +6 -2
  92. package/src/llama.cpp/examples/imatrix/CMakeLists.txt +1 -1
  93. package/src/llama.cpp/examples/imatrix/imatrix.cpp +137 -176
  94. package/src/llama.cpp/examples/infill/CMakeLists.txt +1 -1
  95. package/src/llama.cpp/examples/infill/infill.cpp +38 -153
  96. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +175 -94
  97. package/src/llama.cpp/examples/llama.android/app/build.gradle.kts +65 -0
  98. package/src/llama.cpp/examples/llama.android/build.gradle.kts +6 -0
  99. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +68 -0
  100. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/CMakeLists.txt +11 -7
  101. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +2 -2
  102. package/src/llama.cpp/examples/llama.android/settings.gradle.kts +18 -0
  103. package/src/llama.cpp/examples/llava/CMakeLists.txt +6 -5
  104. package/src/llama.cpp/examples/llava/android/build_64.sh +8 -0
  105. package/src/llama.cpp/examples/llava/clip.cpp +23 -14
  106. package/src/llama.cpp/examples/llava/llava-cli.cpp +8 -6
  107. package/src/llama.cpp/examples/llava/requirements.txt +3 -2
  108. package/src/llama.cpp/examples/lookahead/CMakeLists.txt +1 -1
  109. package/src/llama.cpp/examples/lookahead/lookahead.cpp +2 -1
  110. package/src/llama.cpp/examples/lookup/CMakeLists.txt +4 -4
  111. package/src/llama.cpp/examples/lookup/lookup-create.cpp +2 -0
  112. package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
  113. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +2 -2
  114. package/src/llama.cpp/examples/lookup/lookup.cpp +1 -1
  115. package/src/llama.cpp/examples/main/CMakeLists.txt +1 -1
  116. package/src/llama.cpp/examples/main/main.cpp +98 -75
  117. package/src/llama.cpp/examples/main-cmake-pkg/CMakeLists.txt +4 -5
  118. package/src/llama.cpp/examples/parallel/CMakeLists.txt +1 -1
  119. package/src/llama.cpp/examples/parallel/parallel.cpp +2 -1
  120. package/src/llama.cpp/examples/passkey/CMakeLists.txt +1 -1
  121. package/src/llama.cpp/examples/passkey/passkey.cpp +23 -43
  122. package/src/llama.cpp/examples/perplexity/CMakeLists.txt +1 -1
  123. package/src/llama.cpp/examples/perplexity/perplexity.cpp +13 -10
  124. package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
  125. package/src/llama.cpp/examples/quantize/quantize.cpp +37 -34
  126. package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +1 -1
  127. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +1 -1
  128. package/src/llama.cpp/examples/retrieval/CMakeLists.txt +1 -1
  129. package/src/llama.cpp/examples/retrieval/retrieval.cpp +26 -77
  130. package/src/llama.cpp/examples/save-load-state/CMakeLists.txt +1 -1
  131. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +14 -7
  132. package/src/llama.cpp/examples/server/CMakeLists.txt +26 -2
  133. package/src/llama.cpp/examples/server/server.cpp +274 -671
  134. package/src/llama.cpp/examples/server/tests/requirements.txt +2 -2
  135. package/src/llama.cpp/examples/server/utils.hpp +28 -29
  136. package/src/llama.cpp/examples/simple/CMakeLists.txt +1 -1
  137. package/src/llama.cpp/examples/simple/simple.cpp +21 -29
  138. package/src/llama.cpp/examples/speculative/CMakeLists.txt +1 -1
  139. package/src/llama.cpp/examples/speculative/speculative.cpp +2 -1
  140. package/src/llama.cpp/examples/sycl/CMakeLists.txt +1 -1
  141. package/src/llama.cpp/examples/sycl/build.sh +23 -0
  142. package/src/llama.cpp/examples/sycl/run-llama2.sh +36 -0
  143. package/src/llama.cpp/examples/sycl/win-build-sycl.bat +33 -0
  144. package/src/llama.cpp/examples/sycl/win-run-llama2.bat +9 -0
  145. package/src/llama.cpp/examples/tokenize/CMakeLists.txt +1 -1
  146. package/src/llama.cpp/examples/tokenize/tokenize.cpp +16 -2
  147. package/src/llama.cpp/ggml/CMakeLists.txt +253 -0
  148. package/src/llama.cpp/{cmake → ggml/cmake}/FindSIMD.cmake +6 -6
  149. package/src/llama.cpp/{ggml-backend.h → ggml/include/ggml-backend.h} +22 -17
  150. package/src/llama.cpp/ggml/include/ggml-blas.h +23 -0
  151. package/src/llama.cpp/ggml/include/ggml-cann.h +125 -0
  152. package/src/llama.cpp/{ggml-cuda.h → ggml/include/ggml-cuda.h} +3 -0
  153. package/src/llama.cpp/{ggml-metal.h → ggml/include/ggml-metal.h} +1 -2
  154. package/src/llama.cpp/{ggml-sycl.h → ggml/include/ggml-sycl.h} +3 -10
  155. package/src/llama.cpp/{ggml.h → ggml/include/ggml.h} +80 -85
  156. package/src/llama.cpp/ggml/src/CMakeLists.txt +1329 -0
  157. package/src/llama.cpp/ggml/src/ggml-aarch64.c +2193 -0
  158. package/src/llama.cpp/ggml/src/ggml-aarch64.h +39 -0
  159. package/src/llama.cpp/{ggml-alloc.c → ggml/src/ggml-alloc.c} +100 -49
  160. package/src/llama.cpp/{ggml-backend-impl.h → ggml/src/ggml-backend-impl.h} +20 -8
  161. package/src/llama.cpp/{ggml-backend.c → ggml/src/ggml-backend.c} +307 -167
  162. package/src/llama.cpp/ggml/src/ggml-blas.cpp +367 -0
  163. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +198 -0
  164. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +230 -0
  165. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +2944 -0
  166. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +592 -0
  167. package/src/llama.cpp/ggml/src/ggml-cann/common.h +282 -0
  168. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +32 -0
  169. package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +17 -0
  170. package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +223 -0
  171. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +186 -0
  172. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +180 -0
  173. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +193 -0
  174. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +191 -0
  175. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +208 -0
  176. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +206 -0
  177. package/src/llama.cpp/ggml/src/ggml-cann.cpp +2023 -0
  178. package/src/llama.cpp/{ggml-common.h → ggml/src/ggml-common.h} +41 -7
  179. package/src/llama.cpp/{ggml-impl.h → ggml/src/ggml-impl.h} +113 -9
  180. package/src/llama.cpp/{ggml-kompute.cpp → ggml/src/ggml-kompute.cpp} +33 -18
  181. package/src/llama.cpp/{ggml-quants.c → ggml/src/ggml-quants.c} +1460 -940
  182. package/src/llama.cpp/{ggml-quants.h → ggml/src/ggml-quants.h} +19 -20
  183. package/src/llama.cpp/{ggml-rpc.cpp → ggml/src/ggml-rpc.cpp} +95 -72
  184. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +27 -0
  185. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +53 -0
  186. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +355 -0
  187. package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +195 -0
  188. package/src/llama.cpp/ggml/src/ggml-sycl/concat.hpp +21 -0
  189. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +547 -0
  190. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +27 -0
  191. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +698 -0
  192. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +1023 -0
  193. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.hpp +27 -0
  194. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +3011 -0
  195. package/src/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +3031 -0
  196. package/src/llama.cpp/ggml/src/ggml-sycl/mmq.hpp +33 -0
  197. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +1027 -0
  198. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.hpp +27 -0
  199. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +374 -0
  200. package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +35 -0
  201. package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +66 -0
  202. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +275 -0
  203. package/src/llama.cpp/ggml/src/ggml-sycl/rope.hpp +22 -0
  204. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +251 -0
  205. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.hpp +24 -0
  206. package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +1140 -0
  207. package/src/llama.cpp/ggml/src/ggml-sycl.cpp +5314 -0
  208. package/src/llama.cpp/{ggml-vulkan.cpp → ggml/src/ggml-vulkan.cpp} +1781 -1868
  209. package/src/llama.cpp/{ggml.c → ggml/src/ggml.c} +1245 -2087
  210. package/src/llama.cpp/{sgemm.cpp → ggml/src/llamafile/sgemm.cpp} +21 -24
  211. package/src/llama.cpp/{sgemm.h → ggml/src/llamafile/sgemm.h} +1 -1
  212. package/src/llama.cpp/ggml/src/vulkan-shaders/CMakeLists.txt +5 -0
  213. package/src/llama.cpp/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp +552 -0
  214. package/src/llama.cpp/{llama.h → include/llama.h} +175 -100
  215. package/src/llama.cpp/models/.editorconfig +1 -0
  216. package/src/llama.cpp/models/ggml-vocab-aquila.gguf +0 -0
  217. package/src/llama.cpp/models/ggml-vocab-baichuan.gguf +0 -0
  218. package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf +0 -0
  219. package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf.inp +112 -0
  220. package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf.out +46 -0
  221. package/src/llama.cpp/models/ggml-vocab-command-r.gguf +0 -0
  222. package/src/llama.cpp/models/ggml-vocab-command-r.gguf.inp +112 -0
  223. package/src/llama.cpp/models/ggml-vocab-command-r.gguf.out +46 -0
  224. package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf +0 -0
  225. package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.inp +112 -0
  226. package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.out +46 -0
  227. package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf +0 -0
  228. package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.inp +112 -0
  229. package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.out +46 -0
  230. package/src/llama.cpp/models/ggml-vocab-falcon.gguf +0 -0
  231. package/src/llama.cpp/models/ggml-vocab-falcon.gguf.inp +112 -0
  232. package/src/llama.cpp/models/ggml-vocab-falcon.gguf.out +46 -0
  233. package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf +0 -0
  234. package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf.inp +112 -0
  235. package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf.out +46 -0
  236. package/src/llama.cpp/models/ggml-vocab-gpt-neox.gguf +0 -0
  237. package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf +0 -0
  238. package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf.inp +112 -0
  239. package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf.out +46 -0
  240. package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf +0 -0
  241. package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf.inp +112 -0
  242. package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf.out +46 -0
  243. package/src/llama.cpp/models/ggml-vocab-mpt.gguf +0 -0
  244. package/src/llama.cpp/models/ggml-vocab-mpt.gguf.inp +112 -0
  245. package/src/llama.cpp/models/ggml-vocab-mpt.gguf.out +46 -0
  246. package/src/llama.cpp/models/ggml-vocab-phi-3.gguf +0 -0
  247. package/src/llama.cpp/models/ggml-vocab-phi-3.gguf.inp +112 -0
  248. package/src/llama.cpp/models/ggml-vocab-phi-3.gguf.out +46 -0
  249. package/src/llama.cpp/models/ggml-vocab-qwen2.gguf +0 -0
  250. package/src/llama.cpp/models/ggml-vocab-qwen2.gguf.inp +112 -0
  251. package/src/llama.cpp/models/ggml-vocab-qwen2.gguf.out +46 -0
  252. package/src/llama.cpp/models/ggml-vocab-refact.gguf +0 -0
  253. package/src/llama.cpp/models/ggml-vocab-refact.gguf.inp +112 -0
  254. package/src/llama.cpp/models/ggml-vocab-refact.gguf.out +46 -0
  255. package/src/llama.cpp/models/ggml-vocab-starcoder.gguf +0 -0
  256. package/src/llama.cpp/models/ggml-vocab-starcoder.gguf.inp +112 -0
  257. package/src/llama.cpp/models/ggml-vocab-starcoder.gguf.out +46 -0
  258. package/src/llama.cpp/pocs/vdot/CMakeLists.txt +2 -2
  259. package/src/llama.cpp/requirements/requirements-all.txt +12 -0
  260. package/src/llama.cpp/requirements/requirements-compare-llama-bench.txt +2 -0
  261. package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf.txt +3 -0
  262. package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf_update.txt +3 -0
  263. package/src/llama.cpp/requirements/{requirements-convert.txt → requirements-convert_legacy_llama.txt} +1 -1
  264. package/src/llama.cpp/requirements/requirements-convert_llama_ggml_to_gguf.txt +1 -0
  265. package/src/llama.cpp/requirements/requirements-convert_lora_to_gguf.txt +2 -0
  266. package/src/llama.cpp/requirements/requirements-pydantic.txt +3 -0
  267. package/src/llama.cpp/requirements/requirements-test-tokenizer-random.txt +1 -0
  268. package/src/llama.cpp/requirements.txt +5 -4
  269. package/src/llama.cpp/scripts/build-info.sh +30 -0
  270. package/src/llama.cpp/scripts/install-oneapi.bat +19 -0
  271. package/src/llama.cpp/src/CMakeLists.txt +33 -0
  272. package/src/llama.cpp/src/llama-grammar.cpp +539 -0
  273. package/src/llama.cpp/src/llama-grammar.h +39 -0
  274. package/src/llama.cpp/src/llama-impl.h +26 -0
  275. package/src/llama.cpp/src/llama-sampling.cpp +635 -0
  276. package/src/llama.cpp/src/llama-sampling.h +56 -0
  277. package/src/llama.cpp/src/llama-vocab.cpp +1721 -0
  278. package/src/llama.cpp/src/llama-vocab.h +130 -0
  279. package/src/llama.cpp/{llama.cpp → src/llama.cpp} +5979 -5260
  280. package/src/llama.cpp/{unicode-data.cpp → src/unicode-data.cpp} +851 -802
  281. package/src/llama.cpp/{unicode.cpp → src/unicode.cpp} +52 -30
  282. package/src/llama.cpp/{unicode.h → src/unicode.h} +5 -1
  283. package/src/llama.cpp/tests/CMakeLists.txt +19 -20
  284. package/src/llama.cpp/tests/test-backend-ops.cpp +245 -67
  285. package/src/llama.cpp/tests/test-chat-template.cpp +57 -3
  286. package/src/llama.cpp/tests/test-double-float.cpp +2 -2
  287. package/src/llama.cpp/tests/test-grad0.cpp +2 -2
  288. package/src/llama.cpp/tests/test-grammar-integration.cpp +978 -31
  289. package/src/llama.cpp/tests/test-grammar-parser.cpp +423 -158
  290. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +508 -135
  291. package/src/llama.cpp/tests/test-llama-grammar.cpp +15 -9
  292. package/src/llama.cpp/tests/test-quantize-fns.cpp +1 -1
  293. package/src/llama.cpp/tests/test-quantize-perf.cpp +1 -1
  294. package/src/llama.cpp/tests/test-rope.cpp +3 -4
  295. package/src/llama.cpp/tests/test-sampling.cpp +5 -5
  296. package/src/llama.cpp/tests/test-tokenizer-0.cpp +6 -6
  297. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +20 -15
  298. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +22 -11
  299. package/bin/darwin/arm64/default.metallib +0 -0
  300. package/bin/darwin/x64/default.metallib +0 -0
  301. package/src/llama.cpp/examples/beam-search/CMakeLists.txt +0 -5
  302. package/src/llama.cpp/examples/beam-search/beam-search.cpp +0 -188
  303. package/src/llama.cpp/examples/finetune/finetune.cpp +0 -1862
  304. package/src/llama.cpp/examples/llama.android/llama/CMakeLists.txt +0 -55
  305. package/src/llama.cpp/examples/train-text-from-scratch/CMakeLists.txt +0 -5
  306. package/src/llama.cpp/examples/train-text-from-scratch/train-text-from-scratch.cpp +0 -1253
  307. package/src/llama.cpp/ggml-opencl.cpp +0 -2305
  308. package/src/llama.cpp/ggml-opencl.h +0 -36
  309. package/src/llama.cpp/ggml-sycl.cpp +0 -17340
  310. package/src/llama.cpp/ggml-vulkan-shaders.hpp +0 -81211
  311. package/src/llama.cpp/requirements/requirements-convert-hf-to-gguf-update.txt +0 -2
  312. package/src/llama.cpp/requirements/requirements-convert-hf-to-gguf.txt +0 -2
  313. package/src/llama.cpp/requirements/requirements-convert-llama-ggml-to-gguf.txt +0 -1
  314. package/src/llama.cpp/scripts/gen-build-info-cpp.cmake +0 -24
  315. /package/src/llama.cpp/{ggml-alloc.h → ggml/include/ggml-alloc.h} +0 -0
  316. /package/src/llama.cpp/{ggml-kompute.h → ggml/include/ggml-kompute.h} +0 -0
  317. /package/src/llama.cpp/{ggml-rpc.h → ggml/include/ggml-rpc.h} +0 -0
  318. /package/src/llama.cpp/{ggml-vulkan.h → ggml/include/ggml-vulkan.h} +0 -0
  319. /package/src/llama.cpp/{unicode-data.h → src/unicode-data.h} +0 -0
package/src/llama.cpp/examples/server/tests/requirements.txt
@@ -1,6 +1,6 @@
  aiohttp~=3.9.3
  behave~=1.2.6
  huggingface_hub~=0.20.3
- numpy~=1.24.4
- openai~=0.25.0
+ numpy~=1.26.4
+ openai~=1.30.3
  prometheus-client~=0.20.0
package/src/llama.cpp/examples/server/utils.hpp
@@ -116,45 +116,37 @@ static inline void server_log(const char * level, const char * function, int lin
  // chat template utils
  //

- // Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
- inline bool verify_custom_template(const std::string & tmpl) {
- llama_chat_message chat[] = {{"user", "test"}};
- int res = llama_chat_apply_template(nullptr, tmpl.c_str(), chat, 1, true, nullptr, 0);
- return res >= 0;
- }
-
  // Format given chat. If tmpl is empty, we take the template from model metadata
  inline std::string format_chat(const struct llama_model * model, const std::string & tmpl, const std::vector<json> & messages) {
- size_t alloc_size = 0;
- // vector holding all allocated string to be passed to llama_chat_apply_template
- std::vector<std::string> str(messages.size() * 2);
- std::vector<llama_chat_message> chat(messages.size());
+ std::vector<llama_chat_msg> chat;

  for (size_t i = 0; i < messages.size(); ++i) {
  const auto & curr_msg = messages[i];
- str[i*2 + 0] = json_value(curr_msg, "role", std::string(""));
- str[i*2 + 1] = json_value(curr_msg, "content", std::string(""));
- alloc_size += str[i*2 + 1].length();
- chat[i].role = str[i*2 + 0].c_str();
- chat[i].content = str[i*2 + 1].c_str();
- }

- const char * ptr_tmpl = tmpl.empty() ? nullptr : tmpl.c_str();
- std::vector<char> buf(alloc_size * 2);
-
- // run the first time to get the total output length
- int32_t res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), true, buf.data(), buf.size());
+ std::string role = json_value(curr_msg, "role", std::string(""));
+
+ std::string content;
+ if (curr_msg.contains("content")) {
+ if (curr_msg["content"].is_string()) {
+ content = curr_msg["content"].get<std::string>();
+ } else if (curr_msg["content"].is_array()) {
+ for (const auto & part : curr_msg["content"]) {
+ if (part.contains("text")) {
+ content += "\n" + part["text"].get<std::string>();
+ }
+ }
+ } else {
+ throw std::runtime_error("Invalid 'content' type (ref: https://github.com/ggerganov/llama.cpp/issues/8367)");
+ }
+ } else {
+ throw std::runtime_error("Missing 'content' (ref: https://github.com/ggerganov/llama.cpp/issues/8367)");
+ }

- // if it turns out that our buffer is too small, we resize it
- if ((size_t) res > buf.size()) {
- buf.resize(res);
- res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), true, buf.data(), buf.size());
+ chat.push_back({role, content});
  }

- const std::string formatted_chat(buf.data(), res);
-
+ auto formatted_chat = llama_chat_apply_template(model, tmpl, chat, true);
  LOG_VERBOSE("formatted_chat", {{"text", formatted_chat.c_str()}});
-
  return formatted_chat;
  }

@@ -260,6 +252,13 @@ static size_t common_part(const std::vector<llama_token> & a, const std::vector<
  return i;
  }

+ static size_t common_part(const std::string & a, const std::string & b) {
+ size_t i;
+ for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++) {}
+
+ return i;
+ }
+
  static bool ends_with(const std::string & str, const std::string & suffix) {
  return str.size() >= suffix.size() && 0 == str.compare(str.size() - suffix.size(), suffix.size(), suffix);
  }
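
The refactored format_chat above drops the manual buffer sizing around the raw llama_chat_apply_template C call and instead builds a std::vector<llama_chat_msg> and hands it to the llama_chat_apply_template wrapper from the common library. A minimal sketch of that call pattern, assuming llama.cpp's common.h is on the include path and a llama_model has already been loaded (render_prompt and the example messages are illustrative, not part of the diff):

    // Sketch only: uses the common-library wrapper shown in the diff above.
    #include "common.h"

    #include <string>
    #include <vector>

    static std::string render_prompt(const llama_model * model) {
        // llama_chat_msg carries role/content as std::string, so no manual
        // byte-buffer sizing/resizing is needed (the removed code did both).
        std::vector<llama_chat_msg> chat;
        chat.push_back({"system", "You are a helpful assistant."});
        chat.push_back({"user",   "Hello!"});

        // An empty template string falls back to the template stored in the
        // model metadata (per the format_chat comment); the final argument
        // asks for the assistant prefix to be appended.
        return llama_chat_apply_template(model, "", chat, true);
    }
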
package/src/llama.cpp/examples/simple/CMakeLists.txt
@@ -1,4 +1,4 @@
- set(TARGET simple)
+ set(TARGET llama-simple)
  add_executable(${TARGET} simple.cpp)
  install(TARGETS ${TARGET} RUNTIME)
  target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
package/src/llama.cpp/examples/simple/simple.cpp
@@ -6,28 +6,27 @@
  #include <string>
  #include <vector>

- int main(int argc, char ** argv) {
- gpt_params params;
+ static void print_usage(int argc, char ** argv, const gpt_params & params) {
+ gpt_params_print_usage(argc, argv, params);

- if (argc == 1 || argv[1][0] == '-') {
- printf("usage: %s MODEL_PATH [PROMPT]\n" , argv[0]);
- return 1 ;
- }
+ LOG_TEE("\nexample usage:\n");
+ LOG_TEE("\n %s -m model.gguf -p \"Hello my name is\" -n 32\n", argv[0]);
+ LOG_TEE("\n");
+ }

- if (argc >= 2) {
- params.model = argv[1];
- }
+ int main(int argc, char ** argv) {
+ gpt_params params;

- if (argc >= 3) {
- params.prompt = argv[2];
- }
+ params.prompt = "Hello my name is";
+ params.n_predict = 32;

- if (params.prompt.empty()) {
- params.prompt = "Hello my name is";
+ if (!gpt_params_parse(argc, argv, params)) {
+ print_usage(argc, argv, params);
+ return 1;
  }

  // total length of the sequence including the prompt
- const int n_len = 32;
+ const int n_predict = params.n_predict;

  // init LLM

@@ -36,9 +35,7 @@ int main(int argc, char ** argv) {

  // initialize the model

- llama_model_params model_params = llama_model_default_params();
-
- // model_params.n_gpu_layers = 99; // offload all layers to the GPU
+ llama_model_params model_params = llama_model_params_from_gpt_params(params);

  llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);

@@ -49,12 +46,7 @@ int main(int argc, char ** argv) {

  // initialize the context

- llama_context_params ctx_params = llama_context_default_params();
-
- ctx_params.seed = 1234;
- ctx_params.n_ctx = 2048;
- ctx_params.n_threads = params.n_threads;
- ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
+ llama_context_params ctx_params = llama_context_params_from_gpt_params(params);

  llama_context * ctx = llama_new_context_with_model(model, ctx_params);

@@ -69,14 +61,14 @@ int main(int argc, char ** argv) {
  tokens_list = ::llama_tokenize(ctx, params.prompt, true);

  const int n_ctx = llama_n_ctx(ctx);
- const int n_kv_req = tokens_list.size() + (n_len - tokens_list.size());
+ const int n_kv_req = tokens_list.size() + (n_predict - tokens_list.size());

- LOG_TEE("\n%s: n_len = %d, n_ctx = %d, n_kv_req = %d\n", __func__, n_len, n_ctx, n_kv_req);
+ LOG_TEE("\n%s: n_predict = %d, n_ctx = %d, n_kv_req = %d\n", __func__, n_predict, n_ctx, n_kv_req);

  // make sure the KV cache is big enough to hold all the prompt and generated tokens
  if (n_kv_req > n_ctx) {
  LOG_TEE("%s: error: n_kv_req > n_ctx, the required KV cache size is not big enough\n", __func__);
- LOG_TEE("%s: either reduce n_len or increase n_ctx\n", __func__);
+ LOG_TEE("%s: either reduce n_predict or increase n_ctx\n", __func__);
  return 1;
  }

@@ -115,7 +107,7 @@ int main(int argc, char ** argv) {

  const auto t_main_start = ggml_time_us();

- while (n_cur <= n_len) {
+ while (n_cur <= n_predict) {
  // sample the next token
  {
  auto n_vocab = llama_n_vocab(model);

@@ -134,7 +126,7 @@ int main(int argc, char ** argv) {
  const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);

  // is it an end of generation?
- if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) {
+ if (llama_token_is_eog(model, new_token_id) || n_cur == n_predict) {
  LOG_TEE("\n");

  break;
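
With this change the simple example is driven entirely by gpt_params: arguments are parsed with gpt_params_parse, and the model/context parameters are derived via llama_model_params_from_gpt_params and llama_context_params_from_gpt_params instead of being hard-coded. A condensed sketch of that initialization path, assuming this tree's common.h and llama.h (error handling and the generation loop are trimmed for brevity):

    #include "common.h"
    #include "llama.h"

    int main(int argc, char ** argv) {
        gpt_params params;
        params.prompt    = "Hello my name is";
        params.n_predict = 32;

        if (!gpt_params_parse(argc, argv, params)) {
            gpt_params_print_usage(argc, argv, params);
            return 1;
        }

        // model and context parameters now come from the parsed gpt_params
        llama_model_params model_params = llama_model_params_from_gpt_params(params);
        llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);

        llama_context_params ctx_params = llama_context_params_from_gpt_params(params);
        llama_context * ctx = llama_new_context_with_model(model, ctx_params);

        // ... tokenization and the n_predict-bounded generation loop follow as in the diff ...

        llama_free(ctx);
        llama_free_model(model);
        return 0;
    }

Together with the target rename, the example is invoked as llama-simple -m model.gguf -p "Hello my name is" -n 32, matching the usage text printed by print_usage.
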
package/src/llama.cpp/examples/speculative/CMakeLists.txt
@@ -1,4 +1,4 @@
- set(TARGET speculative)
+ set(TARGET llama-speculative)
  add_executable(${TARGET} speculative.cpp)
  install(TARGETS ${TARGET} RUNTIME)
  target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
package/src/llama.cpp/examples/speculative/speculative.cpp
@@ -27,7 +27,8 @@ struct seq_draft {
  int main(int argc, char ** argv) {
  gpt_params params;

- if (gpt_params_parse(argc, argv, params) == false) {
+ if (!gpt_params_parse(argc, argv, params)) {
+ gpt_params_print_usage(argc, argv, params);
  return 1;
  }

package/src/llama.cpp/examples/sycl/CMakeLists.txt
@@ -2,7 +2,7 @@
  # Copyright (C) 2024 Intel Corporation
  # SPDX-License-Identifier: MIT

- set(TARGET ls-sycl-device)
+ set(TARGET llama-ls-sycl-device)
  add_executable(${TARGET} ls-sycl-device.cpp)
  install(TARGETS ${TARGET} RUNTIME)
  target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
package/src/llama.cpp/examples/sycl/build.sh
@@ -0,0 +1,23 @@
+
+ # MIT license
+ # Copyright (C) 2024 Intel Corporation
+ # SPDX-License-Identifier: MIT
+
+ mkdir -p build
+ cd build
+ source /opt/intel/oneapi/setvars.sh
+
+ #for FP16
+ #cmake .. -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON # faster for long-prompt inference
+
+ #for FP32
+ cmake .. -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
+
+ #build example/main
+ #cmake --build . --config Release --target main
+
+ #build example/llama-bench
+ #cmake --build . --config Release --target llama-bench
+
+ #build all binary
+ cmake --build . --config Release -j -v
package/src/llama.cpp/examples/sycl/run-llama2.sh
@@ -0,0 +1,36 @@
+ #!/bin/bash
+
+ # MIT license
+ # Copyright (C) 2024 Intel Corporation
+ # SPDX-License-Identifier: MIT
+
+ INPUT2="Building a website can be done in 10 simple steps:\nStep 1:"
+ source /opt/intel/oneapi/setvars.sh
+
+ if [ $# -gt 0 ]; then
+ GGML_SYCL_DEVICE=$1
+ GGML_SYCL_SINGLE_GPU=1
+ else
+ GGML_SYCL_DEVICE=0
+ GGML_SYCL_SINGLE_GPU=0
+ fi
+
+ #export GGML_SYCL_DEBUG=1
+
+
+ #ZES_ENABLE_SYSMAN=1, Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory. Recommended to use when --split-mode = layer.
+
+ if [ $GGML_SYCL_SINGLE_GPU -eq 1 ]; then
+ echo "use $GGML_SYCL_DEVICE as main GPU"
+ #use signle GPU only
+ ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0 -mg $GGML_SYCL_DEVICE -sm none
+ else
+ #use multiple GPUs with same max compute units
+ ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0
+ fi
+
+ #use main GPU only
+ #ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0 -mg $GGML_SYCL_DEVICE -sm none
+
+ #use multiple GPUs with same max compute units
+ #ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0
package/src/llama.cpp/examples/sycl/win-build-sycl.bat
@@ -0,0 +1,33 @@
+
+ :: MIT license
+ :: Copyright (C) 2024 Intel Corporation
+ :: SPDX-License-Identifier: MIT
+
+
+ IF not exist build (mkdir build)
+ cd build
+ if %errorlevel% neq 0 goto ERROR
+
+ @call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force
+ if %errorlevel% neq 0 goto ERROR
+
+ :: for FP16
+ :: faster for long-prompt inference
+ :: cmake -G "MinGW Makefiles" .. -DGGML_SYCL=ON -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release -DGGML_SYCL_F16=ON
+
+ :: for FP32
+ cmake -G "Ninja" .. -DGGML_SYCL=ON -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release
+ if %errorlevel% neq 0 goto ERROR
+ :: build example/main only
+ :: make main
+
+ :: build all binary
+ cmake --build . -j
+ if %errorlevel% neq 0 goto ERROR
+
+ cd ..
+ exit /B 0
+
+ :ERROR
+ echo comomand error: %errorlevel%
+ exit /B %errorlevel%
package/src/llama.cpp/examples/sycl/win-run-llama2.bat
@@ -0,0 +1,9 @@
+ :: MIT license
+ :: Copyright (C) 2024 Intel Corporation
+ :: SPDX-License-Identifier: MIT
+
+ set INPUT2="Building a website can be done in 10 simple steps:\nStep 1:"
+ @call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force
+
+
+ .\build\bin\main.exe -m models\llama-2-7b.Q4_0.gguf -p %INPUT2% -n 400 -e -ngl 33 -s 0
package/src/llama.cpp/examples/tokenize/CMakeLists.txt
@@ -1,4 +1,4 @@
- set(TARGET tokenize)
+ set(TARGET llama-tokenize)
  add_executable(${TARGET} tokenize.cpp)
  install(TARGETS ${TARGET} RUNTIME)
  target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
package/src/llama.cpp/examples/tokenize/tokenize.cpp
@@ -29,7 +29,9 @@ static void print_usage_information(const char * argv0, FILE * stream) {
  fprintf(stream, " -p PROMPT, --prompt PROMPT read prompt from the argument.\n");
  fprintf(stream, " --stdin read prompt from standard input.\n");
  fprintf(stream, " --no-bos do not ever add a BOS token to the prompt, even if normally the model uses a BOS token.\n");
+ fprintf(stream, " --no-parse-special do not parse control tokens.\n");
  fprintf(stream, " --log-disable disable logs. Makes stderr quiet when loading the model.\n");
+ fprintf(stream, " --show-count print the total number of tokens.\n");
  }

  static void llama_log_callback_null(ggml_log_level level, const char * text, void * user_data) {

@@ -161,7 +163,7 @@ static void write_utf8_cstr_to_stdout(const char * str, bool & invalid_utf8) {
  printf(">");
  return;
  }
- GGML_ASSERT(false && "MultiByteToWideChar() failed in an unexpected way.");
+ GGML_ABORT("MultiByteToWideChar() failed in an unexpected way.");
  }

  LPWSTR wstr = (LPWSTR) calloc(length_needed+1, sizeof(*wstr));

@@ -194,7 +196,9 @@ int main(int raw_argc, char ** raw_argv) {
  // variables where to put any arguments we see.
  bool printing_ids = false;
  bool no_bos = false;
+ bool no_parse_special = false;
  bool disable_logging = false;
+ bool show_token_count = false;
  const char * model_path = NULL;
  const char * prompt_path = NULL;
  const char * prompt_arg = NULL;

@@ -227,6 +231,9 @@ int main(int raw_argc, char ** raw_argv) {
  else if (arg == "--no-bos") {
  no_bos = true;
  }
+ else if (arg == "--no-parse-special") {
+ no_parse_special = true;
+ }
  else if (arg == "-p" || arg == "--prompt") {
  if (prompt_set) {
  fprintf(stderr, "Error: -p or --prompt specified multiple times.\n");

@@ -249,6 +256,9 @@ int main(int raw_argc, char ** raw_argv) {
  else if (arg == "--log-disable") {
  disable_logging = true;
  }
+ else if (arg == "--show-count") {
+ show_token_count = true;
+ }
  else {
  fprintf(stderr, "Error: unknown option '%s'\n", argv[iarg].c_str());
  return 1;

@@ -354,9 +364,10 @@ int main(int raw_argc, char ** raw_argv) {

  const bool model_wants_add_bos = llama_should_add_bos_token(model);
  const bool add_bos = model_wants_add_bos && !no_bos;
+ const bool parse_special = !no_parse_special;

  std::vector<llama_token> tokens;
- tokens = ::llama_tokenize(model, prompt, add_bos, true);
+ tokens = ::llama_tokenize(model, prompt, add_bos, parse_special);

  if (printing_ids) {
  printf("[");

@@ -384,6 +395,9 @@ int main(int raw_argc, char ** raw_argv) {
  printf("]\n");
  }

+ if (show_token_count) {
+ printf("Total number of tokens: %ld\n", tokens.size());
+ }
  // silence valgrind
  llama_free(ctx);
  llama_free_model(model);
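
The tokenize example gains two flags here: --no-parse-special, which stops control tokens from being parsed, and --show-count, which prints the total token count. Internally the first flag is simply forwarded as the parse_special argument of the common ::llama_tokenize helper. A minimal sketch of that call, assuming llama.cpp's common.h and an already-loaded llama_model (tokenize_and_count is an illustrative helper, not part of the diff):

    #include "common.h"

    #include <cstdio>
    #include <string>
    #include <vector>

    static void tokenize_and_count(const llama_model * model, const std::string & prompt, bool no_parse_special) {
        const bool add_bos       = llama_should_add_bos_token(model); // same default the tool uses without --no-bos
        const bool parse_special = !no_parse_special;                 // control tokens are parsed unless disabled

        std::vector<llama_token> tokens = ::llama_tokenize(model, prompt, add_bos, parse_special);

        // mirrors the new --show-count output
        printf("Total number of tokens: %zu\n", tokens.size());
    }
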
package/src/llama.cpp/ggml/CMakeLists.txt
@@ -0,0 +1,253 @@
+ cmake_minimum_required(VERSION 3.14) # for add_link_options and implicit target directories.
+ project("ggml" C CXX)
+ include(CheckIncludeFileCXX)
+
+ set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
+
+ if (NOT XCODE AND NOT MSVC AND NOT CMAKE_BUILD_TYPE)
+ set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE)
+ set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo")
+ endif()
+
+ if (CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
+ set(GGML_STANDALONE ON)
+
+ set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
+
+ # configure project version
+ # TODO
+ else()
+ set(GGML_STANDALONE OFF)
+ endif()
+
+ if (EMSCRIPTEN)
+ set(BUILD_SHARED_LIBS_DEFAULT OFF)
+
+ option(GGML_WASM_SINGLE_FILE "ggml: embed WASM inside the generated ggml.js" ON)
+ else()
+ if (MINGW)
+ set(BUILD_SHARED_LIBS_DEFAULT OFF)
+ else()
+ set(BUILD_SHARED_LIBS_DEFAULT ON)
+ endif()
+ endif()
+
+ option(BUILD_SHARED_LIBS "ggml: build shared libraries" ${BUILD_SHARED_LIBS_DEFAULT})
+
+ #
+ # option list
+ #
+
+ # TODO: mark all options as advanced when not GGML_STANDALONE
+
+ if (APPLE)
+ set(GGML_METAL_DEFAULT ON)
+ set(GGML_BLAS_DEFAULT ON)
+ set(GGML_BLAS_VENDOR_DEFAULT "Apple")
+ else()
+ set(GGML_METAL_DEFAULT OFF)
+ set(GGML_BLAS_DEFAULT OFF)
+ set(GGML_BLAS_VENDOR_DEFAULT "Generic")
+ endif()
+
+ if (CMAKE_CROSSCOMPILING)
+ set(GGML_NATIVE_DEFAULT OFF)
+ else()
+ set(GGML_NATIVE_DEFAULT ON)
+ endif()
+
+ # general
+ option(GGML_STATIC "ggml: static link libraries" OFF)
+ option(GGML_NATIVE "ggml: enable -march=native flag" ${GGML_NATIVE_DEFAULT})
+ option(GGML_LTO "ggml: enable link time optimization" OFF)
+ option(GGML_CCACHE "ggml: use ccache if available" ON)
+
+ # debug
+ option(GGML_ALL_WARNINGS "ggml: enable all compiler warnings" ON)
+ option(GGML_ALL_WARNINGS_3RD_PARTY "ggml: enable all compiler warnings in 3rd party libs" OFF)
+ option(GGML_GPROF "ggml: enable gprof" OFF)
+
+ # build
+ option(GGML_FATAL_WARNINGS "ggml: enable -Werror flag" OFF)
+
+ # sanitizers
+ option(GGML_SANITIZE_THREAD "ggml: enable thread sanitizer" OFF)
+ option(GGML_SANITIZE_ADDRESS "ggml: enable address sanitizer" OFF)
+ option(GGML_SANITIZE_UNDEFINED "ggml: enable undefined sanitizer" OFF)
+
+ # instruction set specific
+ if (GGML_NATIVE OR NOT GGML_NATIVE_DEFAULT)
+ set(INS_ENB OFF)
+ else()
+ set(INS_ENB ON)
+ endif()
+
+ option(GGML_CPU_HBM "ggml: use memkind for CPU HBM" OFF)
+
+ option(GGML_AVX "ggml: enable AVX" ${INS_ENB})
+ option(GGML_AVX2 "ggml: enable AVX2" ${INS_ENB})
+ option(GGML_AVX512 "ggml: enable AVX512" OFF)
+ option(GGML_AVX512_VBMI "ggml: enable AVX512-VBMI" OFF)
+ option(GGML_AVX512_VNNI "ggml: enable AVX512-VNNI" OFF)
+ option(GGML_AVX512_BF16 "ggml: enable AVX512-BF16" OFF)
+ option(GGML_FMA "ggml: enable FMA" ${INS_ENB})
+ if (NOT MSVC)
+ option(GGML_F16C "ggml: enable F16C" ${INS_ENB}) # in MSVC F16C is implied with AVX2/AVX512
+ endif()
+ option(GGML_LASX "ggml: enable lasx" ON)
+ option(GGML_LSX "ggml: enable lsx" ON)
+ option(GGML_SVE "ggml: enable SVE" OFF)
+
+ if (WIN32)
+ set(GGML_WIN_VER "0x602" CACHE STRING "ggml: Windows Version")
+ endif()
+
+ # ggml core
+ set(GGML_SCHED_MAX_COPIES "4" CACHE STRING "ggml: max input copies for pipeline parallelism")
+
+ # 3rd party libs / backends
+ option(GGML_ACCELERATE "ggml: enable Accelerate framework" ON)
+ option(GGML_BLAS "ggml: use BLAS" ${GGML_BLAS_DEFAULT})
+ set(GGML_BLAS_VENDOR ${GGML_BLAS_VENDOR_DEFAULT} CACHE STRING
+ "ggml: BLAS library vendor")
+ option(GGML_LLAMAFILE "ggml: use LLAMAFILE" OFF)
+
+ option(GGML_CUDA "ggml: use CUDA" OFF)
+ option(GGML_MUSA "ggml: use MUSA" OFF)
+ option(GGML_CUDA_FORCE_DMMV "ggml: use dmmv instead of mmvq CUDA kernels" OFF)
+ option(GGML_CUDA_FORCE_MMQ "ggml: use mmq kernels instead of cuBLAS" OFF)
+ option(GGML_CUDA_FORCE_CUBLAS "ggml: always use cuBLAS instead of mmq kernels" OFF)
+ set (GGML_CUDA_DMMV_X "32" CACHE STRING "ggml: x stride for dmmv CUDA kernels")
+ set (GGML_CUDA_MMV_Y "1" CACHE STRING "ggml: y block size for mmv CUDA kernels")
+ option(GGML_CUDA_F16 "ggml: use 16 bit floats for some calculations" OFF)
+ set (GGML_CUDA_KQUANTS_ITER "2" CACHE STRING
+ "ggml: iters./thread per block for Q2_K/Q6_K")
+ set (GGML_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING
+ "ggml: max. batch size for using peer access")
+ option(GGML_CUDA_NO_PEER_COPY "ggml: do not use peer to peer copies" OFF)
+ option(GGML_CUDA_NO_VMM "ggml: do not try to use CUDA VMM" OFF)
+ option(GGML_CUDA_FA_ALL_QUANTS "ggml: compile all quants for FlashAttention" OFF)
+ option(GGML_CUDA_USE_GRAPHS "ggml: use CUDA graphs (llama.cpp only)" OFF)
+
+ option(GGML_CURL "ggml: use libcurl to download model from an URL" OFF)
+ option(GGML_HIPBLAS "ggml: use hipBLAS" OFF)
+ option(GGML_HIP_UMA "ggml: use HIP unified memory architecture" OFF)
+ option(GGML_VULKAN "ggml: use Vulkan" OFF)
+ option(GGML_VULKAN_CHECK_RESULTS "ggml: run Vulkan op checks" OFF)
+ option(GGML_VULKAN_DEBUG "ggml: enable Vulkan debug output" OFF)
+ option(GGML_VULKAN_MEMORY_DEBUG "ggml: enable Vulkan memory debug output" OFF)
+ option(GGML_VULKAN_VALIDATE "ggml: enable Vulkan validation" OFF)
+ option(GGML_VULKAN_RUN_TESTS "ggml: run Vulkan tests" OFF)
+ option(GGML_KOMPUTE "ggml: use Kompute" OFF)
+ option(GGML_METAL "ggml: use Metal" ${GGML_METAL_DEFAULT})
+ option(GGML_METAL_NDEBUG "ggml: disable Metal debugging" OFF)
+ option(GGML_METAL_SHADER_DEBUG "ggml: compile Metal with -fno-fast-math" OFF)
+ option(GGML_METAL_EMBED_LIBRARY "ggml: embed Metal library" ${GGML_METAL})
+ set (GGML_METAL_MACOSX_VERSION_MIN "" CACHE STRING
+ "ggml: metal minimum macOS version")
+ set (GGML_METAL_STD "" CACHE STRING "ggml: metal standard version (-std flag)")
+ option(GGML_OPENMP "ggml: use OpenMP" ON)
+ option(GGML_RPC "ggml: use RPC" OFF)
+ option(GGML_SYCL "ggml: use SYCL" OFF)
+ option(GGML_SYCL_F16 "ggml: use 16 bit floats for sycl calculations" OFF)
+ set (GGML_SYCL_TARGET "INTEL" CACHE STRING
+ "ggml: sycl target device")
+
+ # extra artifacts
+ option(GGML_BUILD_TESTS "ggml: build tests" ${GGML_STANDALONE})
+ option(GGML_BUILD_EXAMPLES "ggml: build examples" ${GGML_STANDALONE})
+
+ #
+ # dependencies
+ #
+
+ set(CMAKE_C_STANDARD 11)
+ set(CMAKE_C_STANDARD_REQUIRED true)
+
+ if (GGML_SYCL)
+ set(CMAKE_CXX_STANDARD 17)
+ else()
+ set(CMAKE_CXX_STANDARD 11)
+ endif()
+ set(CMAKE_CXX_STANDARD_REQUIRED true)
+
+ set(THREADS_PREFER_PTHREAD_FLAG ON)
+
+ find_package(Threads REQUIRED)
+
+ #
+ # build the library
+ #
+
+ add_subdirectory(src)
+
+ #
+ # tests and examples
+ #
+
+ if (GGML_BUILD_TESTS)
+ enable_testing()
+ add_subdirectory(tests)
+ endif ()
+
+ if (GGML_BUILD_EXAMPLES)
+ add_subdirectory(examples)
+ endif ()
+
+ #
+ # install
+ #
+
+ include(GNUInstallDirs)
+ include(CMakePackageConfigHelpers)
+
+ # all public headers
+ set(GGML_PUBLIC_HEADERS
+ include/ggml.h
+ include/ggml-alloc.h
+ include/ggml-backend.h
+ include/ggml-blas.h
+ include/ggml-cuda.h
+ include/ggml.h
+ include/ggml-kompute.h
+ include/ggml-metal.h
+ include/ggml-rpc.h
+ include/ggml-sycl.h
+ include/ggml-vulkan.h)
+
+ set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}")
+ #if (GGML_METAL)
+ # set_target_properties(ggml PROPERTIES RESOURCE "${CMAKE_CURRENT_SOURCE_DIR}/src/ggml-metal.metal")
+ #endif()
+ install(TARGETS ggml PUBLIC_HEADER)
+
+ if (BUILD_SHARED_LIBS)
+ install(TARGETS ggml LIBRARY)
+ endif()
+
+ if (GGML_METAL)
+ install(
+ FILES src/ggml-metal.metal
+ PERMISSIONS
+ OWNER_READ
+ OWNER_WRITE
+ GROUP_READ
+ WORLD_READ
+ DESTINATION ${CMAKE_INSTALL_BINDIR})
+
+ if (NOT GGML_METAL_EMBED_LIBRARY)
+ install(
+ FILES ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib
+ DESTINATION ${CMAKE_INSTALL_BINDIR}
+ )
+ endif()
+ endif()
+
+ if (GGML_STANDALONE)
+ configure_file(${CMAKE_CURRENT_SOURCE_DIR}/ggml.pc.in
+ ${CMAKE_CURRENT_BINARY_DIR}/ggml.pc
+ @ONLY)
+
+ install(FILES ${CMAKE_CURRENT_BINARY_DIR}/ggml.pc
+ DESTINATION share/pkgconfig)
+ endif()