@fugood/llama.node 0.3.3 → 0.3.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (225)
  1. package/CMakeLists.txt +5 -0
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  8. package/bin/win32/arm64/llama-node.node +0 -0
  9. package/bin/win32/arm64/node.lib +0 -0
  10. package/bin/win32/x64/llama-node.node +0 -0
  11. package/bin/win32/x64/node.lib +0 -0
  12. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  13. package/bin/win32-vulkan/arm64/node.lib +0 -0
  14. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/x64/node.lib +0 -0
  16. package/lib/binding.ts +29 -1
  17. package/package.json +1 -1
  18. package/src/EmbeddingWorker.cpp +15 -5
  19. package/src/EmbeddingWorker.h +2 -1
  20. package/src/LlamaCompletionWorker.cpp +17 -1
  21. package/src/LlamaContext.cpp +86 -18
  22. package/src/LlamaContext.h +2 -0
  23. package/src/llama.cpp/.github/workflows/build.yml +197 -159
  24. package/src/llama.cpp/.github/workflows/docker.yml +5 -8
  25. package/src/llama.cpp/.github/workflows/python-lint.yml +8 -1
  26. package/src/llama.cpp/.github/workflows/server.yml +21 -14
  27. package/src/llama.cpp/CMakeLists.txt +11 -6
  28. package/src/llama.cpp/Sources/llama/llama.h +4 -0
  29. package/src/llama.cpp/cmake/common.cmake +33 -0
  30. package/src/llama.cpp/cmake/x64-windows-llvm.cmake +11 -0
  31. package/src/llama.cpp/common/CMakeLists.txt +6 -2
  32. package/src/llama.cpp/common/arg.cpp +426 -245
  33. package/src/llama.cpp/common/common.cpp +143 -80
  34. package/src/llama.cpp/common/common.h +81 -24
  35. package/src/llama.cpp/common/sampling.cpp +53 -19
  36. package/src/llama.cpp/common/sampling.h +22 -1
  37. package/src/llama.cpp/common/speculative.cpp +274 -0
  38. package/src/llama.cpp/common/speculative.h +28 -0
  39. package/src/llama.cpp/docs/build.md +101 -148
  40. package/src/llama.cpp/examples/CMakeLists.txt +32 -13
  41. package/src/llama.cpp/examples/batched/CMakeLists.txt +1 -1
  42. package/src/llama.cpp/examples/batched/batched.cpp +5 -4
  43. package/src/llama.cpp/examples/batched-bench/CMakeLists.txt +1 -1
  44. package/src/llama.cpp/examples/convert-llama2c-to-ggml/CMakeLists.txt +1 -1
  45. package/src/llama.cpp/examples/cvector-generator/CMakeLists.txt +1 -1
  46. package/src/llama.cpp/examples/deprecation-warning/deprecation-warning.cpp +1 -1
  47. package/src/llama.cpp/examples/embedding/CMakeLists.txt +1 -1
  48. package/src/llama.cpp/examples/eval-callback/CMakeLists.txt +3 -2
  49. package/src/llama.cpp/examples/export-lora/CMakeLists.txt +1 -1
  50. package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +1 -1
  51. package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +4 -7
  52. package/src/llama.cpp/examples/gen-docs/CMakeLists.txt +1 -1
  53. package/src/llama.cpp/examples/gguf/CMakeLists.txt +1 -1
  54. package/src/llama.cpp/examples/gguf-hash/CMakeLists.txt +8 -1
  55. package/src/llama.cpp/examples/gguf-split/CMakeLists.txt +1 -1
  56. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +2 -2
  57. package/src/llama.cpp/examples/gritlm/CMakeLists.txt +1 -1
  58. package/src/llama.cpp/examples/gritlm/gritlm.cpp +1 -1
  59. package/src/llama.cpp/examples/imatrix/CMakeLists.txt +1 -1
  60. package/src/llama.cpp/examples/imatrix/imatrix.cpp +11 -2
  61. package/src/llama.cpp/examples/infill/CMakeLists.txt +1 -1
  62. package/src/llama.cpp/examples/infill/infill.cpp +1 -1
  63. package/src/llama.cpp/examples/llama-bench/CMakeLists.txt +1 -1
  64. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +405 -316
  65. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
  66. package/src/llama.cpp/examples/llava/CMakeLists.txt +10 -3
  67. package/src/llama.cpp/examples/llava/clip.cpp +262 -66
  68. package/src/llama.cpp/examples/llava/clip.h +8 -2
  69. package/src/llama.cpp/examples/llava/llava-cli.cpp +1 -1
  70. package/src/llama.cpp/examples/llava/llava.cpp +46 -19
  71. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +1 -1
  72. package/src/llama.cpp/examples/llava/qwen2vl-cli.cpp +581 -0
  73. package/src/llama.cpp/examples/lookahead/CMakeLists.txt +1 -1
  74. package/src/llama.cpp/examples/lookahead/lookahead.cpp +1 -1
  75. package/src/llama.cpp/examples/lookup/CMakeLists.txt +4 -4
  76. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +2 -1
  77. package/src/llama.cpp/examples/lookup/lookup.cpp +2 -2
  78. package/src/llama.cpp/examples/main/CMakeLists.txt +1 -1
  79. package/src/llama.cpp/examples/main/main.cpp +9 -5
  80. package/src/llama.cpp/examples/main-cmake-pkg/CMakeLists.txt +1 -1
  81. package/src/llama.cpp/examples/parallel/CMakeLists.txt +1 -1
  82. package/src/llama.cpp/examples/parallel/parallel.cpp +1 -1
  83. package/src/llama.cpp/examples/passkey/CMakeLists.txt +1 -1
  84. package/src/llama.cpp/examples/perplexity/CMakeLists.txt +1 -1
  85. package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
  86. package/src/llama.cpp/examples/quantize/quantize.cpp +0 -3
  87. package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +1 -1
  88. package/src/llama.cpp/examples/retrieval/CMakeLists.txt +1 -1
  89. package/src/llama.cpp/examples/retrieval/retrieval.cpp +4 -4
  90. package/src/llama.cpp/examples/run/CMakeLists.txt +5 -0
  91. package/src/llama.cpp/examples/run/run.cpp +911 -0
  92. package/src/llama.cpp/examples/save-load-state/CMakeLists.txt +1 -1
  93. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +4 -4
  94. package/src/llama.cpp/examples/server/CMakeLists.txt +3 -7
  95. package/src/llama.cpp/examples/server/server.cpp +1758 -886
  96. package/src/llama.cpp/examples/server/tests/requirements.txt +2 -2
  97. package/src/llama.cpp/examples/server/utils.hpp +94 -304
  98. package/src/llama.cpp/examples/simple/CMakeLists.txt +1 -1
  99. package/src/llama.cpp/examples/simple/simple.cpp +4 -0
  100. package/src/llama.cpp/examples/simple-chat/CMakeLists.txt +1 -1
  101. package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +3 -0
  102. package/src/llama.cpp/examples/speculative/CMakeLists.txt +1 -1
  103. package/src/llama.cpp/examples/speculative/speculative.cpp +16 -15
  104. package/src/llama.cpp/examples/speculative-simple/CMakeLists.txt +5 -0
  105. package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +265 -0
  106. package/src/llama.cpp/examples/tokenize/CMakeLists.txt +1 -1
  107. package/src/llama.cpp/examples/tokenize/tokenize.cpp +1 -1
  108. package/src/llama.cpp/examples/tts/CMakeLists.txt +5 -0
  109. package/src/llama.cpp/examples/tts/tts.cpp +932 -0
  110. package/src/llama.cpp/ggml/CMakeLists.txt +46 -34
  111. package/src/llama.cpp/ggml/include/ggml-backend.h +16 -0
  112. package/src/llama.cpp/ggml/include/ggml-cpu.h +7 -49
  113. package/src/llama.cpp/ggml/include/ggml-opencl.h +26 -0
  114. package/src/llama.cpp/ggml/include/ggml.h +106 -24
  115. package/src/llama.cpp/ggml/src/CMakeLists.txt +73 -24
  116. package/src/llama.cpp/ggml/src/ggml-alloc.c +0 -1
  117. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +51 -11
  118. package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +379 -22
  119. package/src/llama.cpp/ggml/src/ggml-backend.cpp +4 -4
  120. package/src/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +3 -7
  121. package/src/llama.cpp/ggml/src/ggml-blas/ggml-blas.cpp +5 -2
  122. package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +33 -3
  123. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +456 -111
  124. package/src/llama.cpp/ggml/src/ggml-cann/common.h +6 -3
  125. package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +95 -35
  126. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -5
  127. package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +22 -9
  128. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +24 -13
  129. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +23 -13
  130. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +11 -0
  131. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +10 -0
  132. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +10 -0
  133. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +17 -0
  134. package/src/llama.cpp/ggml/src/ggml-common.h +42 -42
  135. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +288 -213
  136. package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +220 -0
  137. package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.h +8 -0
  138. package/src/llama.cpp/ggml/src/{ggml-amx → ggml-cpu/amx}/common.h +19 -22
  139. package/src/llama.cpp/ggml/src/{ggml-amx → ggml-cpu/amx}/mmq.cpp +93 -92
  140. package/src/llama.cpp/ggml/src/{ggml-amx → ggml-cpu/amx}/mmq.h +2 -9
  141. package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +323 -0
  142. package/src/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-aarch64.c → ggml-cpu-aarch64.cpp} +892 -190
  143. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +2 -24
  144. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-hbm.cpp +55 -0
  145. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-hbm.h +8 -0
  146. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +15 -0
  147. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +38 -25
  148. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-traits.cpp +36 -0
  149. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-traits.h +38 -0
  150. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +552 -399
  151. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +101 -136
  152. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +2 -2
  153. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +7 -10
  154. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +8 -0
  155. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +4 -6
  156. package/src/llama.cpp/ggml/src/ggml-impl.h +32 -11
  157. package/src/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +13 -9
  158. package/src/llama.cpp/ggml/src/ggml-kompute/ggml-kompute.cpp +131 -64
  159. package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +3 -6
  160. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +39 -0
  161. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +14 -7
  162. package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +147 -0
  163. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +4004 -0
  164. package/src/llama.cpp/ggml/src/ggml-opt.cpp +67 -80
  165. package/src/llama.cpp/ggml/src/ggml-quants.c +0 -9
  166. package/src/llama.cpp/ggml/src/ggml-rpc/CMakeLists.txt +3 -5
  167. package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +5 -2
  168. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +13 -10
  169. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +2 -11
  170. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +1 -0
  171. package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +2 -2
  172. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +1 -1
  173. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +5 -5
  174. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +32 -13
  175. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +80 -61
  176. package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +4 -4
  177. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +159 -114
  178. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +3 -2
  179. package/src/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +6 -6
  180. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +6 -20
  181. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +4 -3
  182. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +8 -8
  183. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +4 -3
  184. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +7 -7
  185. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +1 -0
  186. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +4 -1
  187. package/src/llama.cpp/ggml/src/ggml-threading.h +4 -2
  188. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +21 -7
  189. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +1718 -399
  190. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +3 -1
  191. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +105 -31
  192. package/src/llama.cpp/ggml/src/ggml.c +367 -207
  193. package/src/llama.cpp/include/llama-cpp.h +25 -0
  194. package/src/llama.cpp/include/llama.h +26 -19
  195. package/src/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.inp +112 -0
  196. package/src/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.out +46 -0
  197. package/src/llama.cpp/pocs/CMakeLists.txt +3 -1
  198. package/src/llama.cpp/pocs/vdot/CMakeLists.txt +2 -2
  199. package/src/llama.cpp/src/CMakeLists.txt +2 -7
  200. package/src/llama.cpp/src/llama-grammar.cpp +15 -15
  201. package/src/llama.cpp/src/llama-grammar.h +2 -5
  202. package/src/llama.cpp/src/llama-sampling.cpp +35 -90
  203. package/src/llama.cpp/src/llama-vocab.cpp +6 -1
  204. package/src/llama.cpp/src/llama.cpp +1748 -640
  205. package/src/llama.cpp/src/unicode.cpp +62 -51
  206. package/src/llama.cpp/src/unicode.h +9 -10
  207. package/src/llama.cpp/tests/CMakeLists.txt +48 -37
  208. package/src/llama.cpp/tests/test-arg-parser.cpp +2 -2
  209. package/src/llama.cpp/tests/test-backend-ops.cpp +140 -21
  210. package/src/llama.cpp/tests/test-chat-template.cpp +50 -4
  211. package/src/llama.cpp/tests/test-gguf.cpp +1303 -0
  212. package/src/llama.cpp/tests/test-grammar-integration.cpp +3 -6
  213. package/src/llama.cpp/tests/test-llama-grammar.cpp +2 -4
  214. package/src/llama.cpp/tests/test-quantize-fns.cpp +3 -3
  215. package/src/llama.cpp/tests/test-rope.cpp +61 -20
  216. package/src/llama.cpp/tests/test-sampling.cpp +2 -2
  217. package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +0 -72
  218. package/src/llama.cpp/.github/workflows/nix-ci.yml +0 -79
  219. package/src/llama.cpp/.github/workflows/nix-flake-update.yml +0 -22
  220. package/src/llama.cpp/.github/workflows/nix-publish-flake.yml +0 -36
  221. package/src/llama.cpp/ggml/include/ggml-amx.h +0 -25
  222. package/src/llama.cpp/ggml/src/ggml-aarch64.c +0 -129
  223. package/src/llama.cpp/ggml/src/ggml-aarch64.h +0 -19
  224. package/src/llama.cpp/ggml/src/ggml-amx/CMakeLists.txt +0 -107
  225. package/src/llama.cpp/ggml/src/ggml-amx/ggml-amx.cpp +0 -446

package/src/llama.cpp/docs/build.md

@@ -7,124 +7,75 @@ git clone https://github.com/ggerganov/llama.cpp
  cd llama.cpp
  ```

- In order to build llama.cpp you have four different options.
+ The following sections describe how to build with different backends and options.

- - Using `make`:
- - On Linux or MacOS:
+ ## CPU Build

- ```bash
- make
- ```
-
- - On Windows (x86/x64 only, arm64 requires cmake):
+ Build llama.cpp using `CMake`:

- 1. Download the latest fortran version of [w64devkit](https://github.com/skeeto/w64devkit/releases).
- 2. Extract `w64devkit` on your pc.
- 3. Run `w64devkit.exe`.
- 4. Use the `cd` command to reach the `llama.cpp` folder.
- 5. From here you can run:
- ```bash
- make
- ```
+ ```bash
+ cmake -B build
+ cmake --build build --config Release
+ ```

- - Notes:
- - For `Q4_0_4_4` quantization type build, add the `GGML_NO_LLAMAFILE=1` flag. For example, use `make GGML_NO_LLAMAFILE=1`.
- - For faster compilation, add the `-j` argument to run multiple jobs in parallel. For example, `make -j 8` will run 8 jobs in parallel.
- - For faster repeated compilation, install [ccache](https://ccache.dev/).
- - For debug builds, run `make LLAMA_DEBUG=1`
+ **Notes**:

- - Using `CMake`:
+ - For faster compilation, add the `-j` argument to run multiple jobs in parallel, or use a generator that does this automatically such as Ninja. For example, `cmake --build build --config Release -j 8` will run 8 jobs in parallel.
+ - For faster repeated compilation, install [ccache](https://ccache.dev/)
+ - For debug builds, there are two cases:

- ```bash
- cmake -B build
- cmake --build build --config Release
- ```
+ 1. Single-config generators (e.g. default = `Unix Makefiles`; note that they just ignore the `--config` flag):

- **Notes**:
+ ```bash
+ cmake -B build -DCMAKE_BUILD_TYPE=Debug
+ cmake --build build
+ ```

- - For `Q4_0_4_4` quantization type build, add the `-DGGML_LLAMAFILE=OFF` cmake option. For example, use `cmake -B build -DGGML_LLAMAFILE=OFF`.
- - For faster compilation, add the `-j` argument to run multiple jobs in parallel. For example, `cmake --build build --config Release -j 8` will run 8 jobs in parallel.
- - For faster repeated compilation, install [ccache](https://ccache.dev/).
- - For debug builds, there are two cases:
+ 2. Multi-config generators (`-G` param set to Visual Studio, XCode...):

- 1. Single-config generators (e.g. default = `Unix Makefiles`; note that they just ignore the `--config` flag):
+ ```bash
+ cmake -B build -G "Xcode"
+ cmake --build build --config Debug
+ ```

- ```bash
- cmake -B build -DCMAKE_BUILD_TYPE=Debug
- cmake --build build
- ```
+ For more details and a list of supported generators, see the [CMake documentation](https://cmake.org/cmake/help/latest/manual/cmake-generators.7.html).
+ - For static builds, add `-DBUILD_SHARED_LIBS=OFF`:
+ ```
+ cmake -B build -DBUILD_SHARED_LIBS=OFF
+ cmake --build build --config Release
+ ```

- 2. Multi-config generators (`-G` param set to Visual Studio, XCode...):
+ - Building for Windows (x86, x64 and arm64) with MSVC or clang as compilers:
+ - Install Visual Studio 2022, e.g. via the [Community Edition](https://visualstudio.microsoft.com/de/vs/community/). In the installer, select at least the following options (this also automatically installs the required additional tools like CMake,...):
+ - Tab Workload: Desktop-development with C++
+ - Tab Components (select quickly via search): C++-_CMake_ Tools for Windows, _Git_ for Windows, C++-_Clang_ Compiler for Windows, MS-Build Support for LLVM-Toolset (clang)
+ - Please remember to always use a Developer Command Prompt / PowerShell for VS2022 for git, build, test
+ - For Windows on ARM (arm64, WoA) build with:
+ ```bash
+ cmake --preset arm64-windows-llvm-release -D GGML_OPENMP=OFF
+ cmake --build build-arm64-windows-llvm-release
+ ```
+ Building for arm64 can also be done with the MSVC compiler with the build-arm64-windows-MSVC preset, or the standard CMake build instructions. However, note that the MSVC compiler does not support inline ARM assembly code, used e.g. for the accelerated Q4_0_N_M CPU kernels.

+ For building with ninja generator and clang compiler as default:
+ -set path:set LIB=C:\Program Files (x86)\Windows Kits\10\Lib\10.0.22621.0\um\x64;C:\Program Files\Microsoft Visual Studio\2022\Community\VC\Tools\MSVC\14.41.34120\lib\x64\uwp;C:\Program Files (x86)\Windows Kits\10\Lib\10.0.22621.0\ucrt\x64
  ```bash
- cmake -B build -G "Xcode"
- cmake --build build --config Debug
+ cmake --preset x64-windows-llvm-release
+ cmake --build build-x64-windows-llvm-release
  ```
- - Building for Windows (x86, x64 and arm64) with MSVC or clang as compilers:
- - Install Visual Studio 2022, e.g. via the [Community Edition](https://visualstudio.microsoft.com/de/vs/community/). In the installer, select at least the following options (this also automatically installs the required additional tools like CMake,...):
- - Tab Workload: Desktop-development with C++
- - Tab Components (select quickly via search): C++-_CMake_ Tools for Windows, _Git_ for Windows, C++-_Clang_ Compiler for Windows, MS-Build Support for LLVM-Toolset (clang)
- - Please remember to always use a Developer Command Prompt / PowerShell for VS2022 for git, build, test
- - For Windows on ARM (arm64, WoA) build with:
- ```bash
- cmake --preset arm64-windows-llvm-release -D GGML_OPENMP=OFF
- cmake --build build-arm64-windows-llvm-release
- ```
- Note: Building for arm64 could also be done just with MSVC (with the build-arm64-windows-MSVC preset, or the standard CMake build instructions). But MSVC does not support inline ARM assembly-code, used e.g. for the accelerated Q4_0_4_8 CPU kernels.
-
- - Using `gmake` (FreeBSD):
-
- 1. Install and activate [DRM in FreeBSD](https://wiki.freebsd.org/Graphics)
- 2. Add your user to **video** group
- 3. Install compilation dependencies.
-
- ```bash
- sudo pkg install gmake automake autoconf pkgconf llvm15 openblas
-
- gmake CC=/usr/local/bin/clang15 CXX=/usr/local/bin/clang++15 -j4
- ```
-
- ## Metal Build
-
- On MacOS, Metal is enabled by default. Using Metal makes the computation run on the GPU.
- To disable the Metal build at compile time use the `GGML_NO_METAL=1` flag or the `GGML_METAL=OFF` cmake option.
-
- When built with Metal support, you can explicitly disable GPU inference with the `--n-gpu-layers|-ngl 0` command-line
- argument.

  ## BLAS Build

- Building the program with BLAS support may lead to some performance improvements in prompt processing using batch sizes higher than 32 (the default is 512). Support with CPU-only BLAS implementations doesn't affect the normal generation performance. We may see generation performance improvements with GPU-involved BLAS implementations, e.g. cuBLAS, hipBLAS. There are currently several different BLAS implementations available for build and use:
+ Building the program with BLAS support may lead to some performance improvements in prompt processing using batch sizes higher than 32 (the default is 512). Using BLAS doesn't affect the generation performance. There are currently several different BLAS implementations available for build and use:

- ### Accelerate Framework:
+ ### Accelerate Framework

  This is only available on Mac PCs and it's enabled by default. You can just build using the normal instructions.

- ### OpenBLAS:
+ ### OpenBLAS

  This provides BLAS acceleration using only the CPU. Make sure to have OpenBLAS installed on your machine.

- - Using `make`:
- - On Linux:
- ```bash
- make GGML_OPENBLAS=1
- ```
-
- - On Windows:
-
- 1. Download the latest fortran version of [w64devkit](https://github.com/skeeto/w64devkit/releases).
- 2. Download the latest version of [OpenBLAS for Windows](https://github.com/xianyi/OpenBLAS/releases).
- 3. Extract `w64devkit` on your pc.
- 4. From the OpenBLAS zip that you just downloaded copy `libopenblas.a`, located inside the `lib` folder, inside `w64devkit\x86_64-w64-mingw32\lib`.
- 5. From the same OpenBLAS zip copy the content of the `include` folder inside `w64devkit\x86_64-w64-mingw32\include`.
- 6. Run `w64devkit.exe`.
- 7. Use the `cd` command to reach the `llama.cpp` folder.
- 8. From here you can run:
-
- ```bash
- make GGML_OPENBLAS=1
- ```
-
  - Using `CMake` on Linux:

  ```bash
@@ -136,14 +87,6 @@ This provides BLAS acceleration using only the CPU. Make sure to have OpenBLAS i

  Check [BLIS.md](./backend/BLIS.md) for more information.

- ### SYCL
-
- SYCL is a higher-level programming model to improve programming productivity on various hardware accelerators.
-
- llama.cpp based on SYCL is used to **support Intel GPU** (Data Center Max series, Flex series, Arc series, Built-in GPU and iGPU).
-
- For detailed info, please refer to [llama.cpp for SYCL](./backend/SYCL.md).
-
  ### Intel oneMKL

  Building through oneAPI compilers will make avx_vnni instruction set available for intel processors that do not support avx512 and avx512_vnni. Please note that this build config **does not support Intel GPU**. For Intel GPU support, please refer to [llama.cpp for SYCL](./backend/SYCL.md).
@@ -161,16 +104,29 @@ Building through oneAPI compilers will make avx_vnni instruction set available f

  Check [Optimizing and Running LLaMA2 on Intel® CPU](https://www.intel.com/content/www/us/en/content-details/791610/optimizing-and-running-llama2-on-intel-cpu.html) for more information.

- ### CUDA
+ ### Other BLAS libraries

- This provides GPU acceleration using the CUDA cores of your Nvidia GPU. Make sure to have the CUDA toolkit installed. You can download it from your Linux distro's package manager (e.g. `apt install nvidia-cuda-toolkit`) or from here: [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads).
+ Any other BLAS library can be used by setting the `GGML_BLAS_VENDOR` option. See the [CMake documentation](https://cmake.org/cmake/help/latest/module/FindBLAS.html#blas-lapack-vendors) for a list of supported vendors.

- For Jetson user, if you have Jetson Orin, you can try this: [Offical Support](https://www.jetson-ai-lab.com/tutorial_text-generation.html). If you are using an old model(nano/TX2), need some additional operations before compiling.
+ ## Metal Build
+
+ On MacOS, Metal is enabled by default. Using Metal makes the computation run on the GPU.
+ To disable the Metal build at compile time use the `-DGGML_METAL=OFF` cmake option.
+
+ When built with Metal support, you can explicitly disable GPU inference with the `--n-gpu-layers 0` command-line argument.
+
+ ## SYCL
+
+ SYCL is a higher-level programming model to improve programming productivity on various hardware accelerators.
+
+ llama.cpp based on SYCL is used to **support Intel GPU** (Data Center Max series, Flex series, Arc series, Built-in GPU and iGPU).
+
+ For detailed info, please refer to [llama.cpp for SYCL](./backend/SYCL.md).
+
+ ## CUDA
+
+ This provides GPU acceleration using an NVIDIA GPU. Make sure to have the CUDA toolkit installed. You can download it from your Linux distro's package manager (e.g. `apt install nvidia-cuda-toolkit`) or from the [NVIDIA developer site](https://developer.nvidia.com/cuda-downloads).

- - Using `make`:
- ```bash
- make GGML_CUDA=1
- ```
  - Using `CMake`:

  ```bash
@@ -192,14 +148,10 @@ The following compilation options are also available to tweak performance:
  | GGML_CUDA_PEER_MAX_BATCH_SIZE | Positive integer | 128 | Maximum batch size for which to enable peer access between multiple GPUs. Peer access requires either Linux or NVLink. When using NVLink enabling peer access for larger batch sizes is potentially beneficial. |
  | GGML_CUDA_FA_ALL_QUANTS | Boolean | false | Compile support for all KV cache quantization type (combinations) for the FlashAttention CUDA kernels. More fine-grained control over KV cache size but compilation takes much longer. |

- ### MUSA
+ ## MUSA

  This provides GPU acceleration using the MUSA cores of your Moore Threads MTT GPU. Make sure to have the MUSA SDK installed. You can download it from here: [MUSA SDK](https://developer.mthreads.com/sdk/download/musa).

- - Using `make`:
- ```bash
- make GGML_MUSA=1
- ```
  - Using `CMake`:

  ```bash
@@ -213,16 +165,12 @@ The environment variable `GGML_CUDA_ENABLE_UNIFIED_MEMORY=1` can be used to enab

  Most of the compilation options available for CUDA should also be available for MUSA, though they haven't been thoroughly tested yet.

- ### hipBLAS
+ ## HIP

- This provides BLAS acceleration on HIP-supported AMD GPUs.
+ This provides GPU acceleration on HIP-supported AMD GPUs.
  Make sure to have ROCm installed.
  You can download it from your Linux distro's package manager or from here: [ROCm Quick Start (Linux)](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/tutorial/quick-start.html#rocm-install-quick).

- - Using `make`:
- ```bash
- make GGML_HIPBLAS=1
- ```
  - Using `CMake` for Linux (assuming a gfx1030-compatible AMD GPU):
  ```bash
  HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -R)" \
@@ -247,11 +195,6 @@ You can download it from your Linux distro's package manager or from here: [ROCm
  && cmake --build build -- -j 16
  ```

- - Using `make` (example for target gfx1030, build with 16 CPU threads):
- ```bash
- make -j16 GGML_HIPBLAS=1 GGML_HIP_UMA=1 AMDGPU_TARGETS=gfx1030
- ```
-
  - Using `CMake` for Windows (using x64 Native Tools Command Prompt for VS, and assuming a gfx1100-compatible AMD GPU):
  ```bash
  set PATH=%HIP_PATH%\bin;%PATH%
@@ -265,11 +208,11 @@ You can download it from your Linux distro's package manager or from here: [ROCm
  The environment variable [`HIP_VISIBLE_DEVICES`](https://rocm.docs.amd.com/en/latest/understand/gpu_isolation.html#hip-visible-devices) can be used to specify which GPU(s) will be used.
  If your GPU is not officially supported you can use the environment variable [`HSA_OVERRIDE_GFX_VERSION`] set to a similar GPU, for example 10.3.0 on RDNA2 (e.g. gfx1030, gfx1031, or gfx1035) or 11.0.0 on RDNA3.

- ### Vulkan
+ ## Vulkan

  **Windows**

- #### w64devkit
+ ### w64devkit

  Download and extract [`w64devkit`](https://github.com/skeeto/w64devkit/releases).

@@ -289,9 +232,14 @@ Libs: -lvulkan-1
  EOF

  ```
- Switch into the `llama.cpp` directory and run `make GGML_VULKAN=1`.

- #### Git Bash MINGW64
+ Switch into the `llama.cpp` directory and build using CMake.
+ ```sh
+ cmake -B build -DGGML_VULKAN=ON
+ cmake --build build --config Release
+ ```
+
+ ### Git Bash MINGW64

  Download and install [`Git-SCM`](https://git-scm.com/downloads/win) with the default settings

@@ -310,20 +258,21 @@ cmake --build build --config Release

  Now you can load the model in conversation mode using `Vulkan`

- ```
- build/bin/release/llama-cli -m "[PATH TO MODEL]" -ngl 100 -c 16384 -t 10 -n -2 -cnv
+ ```sh
+ build/bin/Release/llama-cli -m "[PATH TO MODEL]" -ngl 100 -c 16384 -t 10 -n -2 -cnv
  ```

- #### MSYS2
+ ### MSYS2
  Install [MSYS2](https://www.msys2.org/) and then run the following commands in a UCRT terminal to install dependencies.
- ```sh
- pacman -S git \
- mingw-w64-ucrt-x86_64-gcc \
- mingw-w64-ucrt-x86_64-cmake \
- mingw-w64-ucrt-x86_64-vulkan-devel \
- mingw-w64-ucrt-x86_64-shaderc
- ```
- Switch into `llama.cpp` directory and build using CMake.
+ ```sh
+ pacman -S git \
+ mingw-w64-ucrt-x86_64-gcc \
+ mingw-w64-ucrt-x86_64-cmake \
+ mingw-w64-ucrt-x86_64-vulkan-devel \
+ mingw-w64-ucrt-x86_64-shaderc
+ ```
+
+ Switch into the `llama.cpp` directory and build using CMake.
  ```sh
  cmake -B build -DGGML_VULKAN=ON
  cmake --build build --config Release
@@ -372,7 +321,7 @@ cmake --build build --config Release
  # ggml_vulkan: Using Intel(R) Graphics (ADL GT2) | uma: 1 | fp16: 1 | warp size: 32
  ```

- ### CANN
+ ## CANN
  This provides NPU acceleration using the AI cores of your Ascend NPU. And [CANN](https://www.hiascend.com/en/software/cann) is a hierarchical APIs to help you to quickly build AI applications and service based on Ascend NPU.

  For more information about Ascend NPU in [Ascend Community](https://www.hiascend.com/en/).
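
The next hunk only shows the CANN test command; the configure step itself sits in the hunk's unchanged context line (`cmake --build build --config release`). For reference, a minimal sketch of a CANN-enabled build, assuming the `GGML_CANN` CMake option described in the linked CANN guide and an installed Ascend CANN toolkit:

```bash
# hypothetical CANN configure/build, per the llama.cpp CANN backend guide
cmake -B build -DGGML_CANN=on -DCMAKE_BUILD_TYPE=release
cmake --build build --config release
```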
@@ -387,22 +336,26 @@ cmake --build build --config release

  You can test with:

- `./build/bin/llama-cli -m PATH_TO_MODEL -p "Building a website can be done in 10 steps:" -ngl 32`
+ ```bash
+ ./build/bin/llama-cli -m PATH_TO_MODEL -p "Building a website can be done in 10 steps:" -ngl 32
+ ```

- If the fllowing info is output on screen, you are using `llama.cpp by CANN backend`:
+ If the following info is output on screen, you are using `llama.cpp` with the CANN backend:
  ```bash
- llm_load_tensors: CANN buffer size = 13313.00 MiB
+ llm_load_tensors: CANN model buffer size = 13313.00 MiB
  llama_new_context_with_model: CANN compute buffer size = 1260.81 MiB
  ```

  For detailed info, such as model/device supports, CANN install, please refer to [llama.cpp for CANN](./backend/CANN.md).

- ### Android
+ ## Android

  To read documentation for how to build on Android, [click here](./android.md)

- ### Arm CPU optimized mulmat kernels
+ ## Notes about GPU-accelerated backends
+
+ The GPU may still be used to accelerate some parts of the computation even when using the `-ngl 0` option. You can fully disable GPU acceleration by using `--device none`.

- Llama.cpp includes a set of optimized mulmat kernels for the Arm architecture, leveraging Arm® Neon™, int8mm and SVE instructions. These kernels are enabled at build time through the appropriate compiler cpu-type flags, such as `-DCMAKE_C_FLAGS=-march=armv8.2a+i8mm+sve`. Note that these optimized kernels require the model to be quantized into one of the formats: `Q4_0_4_4` (Arm Neon), `Q4_0_4_8` (int8mm) or `Q4_0_8_8` (SVE). The SVE mulmat kernel specifically requires a vector width of 256 bits. When running on devices with a different vector width, it is recommended to use the `Q4_0_4_8` (int8mm) or `Q4_0_4_4` (Arm Neon) formats for better performance. Refer to [examples/quantize/README.md](../examples/quantize/README.md) for more information on the quantization formats.
+ In most cases, it is possible to build and use multiple backends at the same time. For example, you can build llama.cpp with both CUDA and Vulkan support by using the `-DGGML_CUDA=ON -DGGML_VULKAN=ON` options with CMake. At runtime, you can specify which backend devices to use with the `--device` option. To see a list of available devices, use the `--list-devices` option.

- To support `Q4_0_4_4`, you must build with `GGML_NO_LLAMAFILE=1` (`make`) or `-DGGML_LLAMAFILE=OFF` (`cmake`).
+ Backends can be built as dynamic libraries that can be loaded dynamically at runtime. This allows you to use the same llama.cpp binary on different machines with different GPUs. To enable this feature, use the `GGML_BACKEND_DL` option when building.
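
The added "Notes about GPU-accelerated backends" section describes multi-backend builds, runtime device selection, and dynamically loadable backends. A minimal sketch combining those options, using only the flags named in the added text (the model path is a placeholder):

```bash
# build with CUDA and Vulkan support, with the backends compiled as loadable libraries
cmake -B build -DGGML_CUDA=ON -DGGML_VULKAN=ON -DGGML_BACKEND_DL=ON
cmake --build build --config Release

# list the devices the resulting binary can see, then run with GPU acceleration fully disabled
./build/bin/llama-cli --list-devices
./build/bin/llama-cli -m /path/to/model.gguf --device none -p "hello"
```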

package/src/llama.cpp/examples/CMakeLists.txt

@@ -6,20 +6,26 @@ find_package(Threads REQUIRED)

  # ...

+ # flags
+
+ llama_add_compile_flags()
+
  # examples

  include_directories(${CMAKE_CURRENT_SOURCE_DIR})

  if (EMSCRIPTEN)
  else()
- add_subdirectory(cvector-generator)
  add_subdirectory(batched-bench)
  add_subdirectory(batched)
- add_subdirectory(convert-llama2c-to-ggml)
  add_subdirectory(embedding)
  add_subdirectory(eval-callback)
- add_subdirectory(export-lora)
- add_subdirectory(gbnf-validator)
+
+ if (NOT WIN32)
+ # disabled on Windows because it uses internal functions not exported with LLAMA_API
+ add_subdirectory(gbnf-validator)
+ endif()
+
  add_subdirectory(gguf-hash)
  add_subdirectory(gguf-split)
  add_subdirectory(gguf)
@@ -27,28 +33,41 @@ else()
  add_subdirectory(imatrix)
  add_subdirectory(infill)
  add_subdirectory(llama-bench)
- add_subdirectory(llava)
  add_subdirectory(lookahead)
  add_subdirectory(lookup)
  add_subdirectory(main)
  add_subdirectory(parallel)
  add_subdirectory(passkey)
  add_subdirectory(perplexity)
- add_subdirectory(quantize-stats)
  add_subdirectory(quantize)
  add_subdirectory(retrieval)
- if (GGML_RPC)
- add_subdirectory(rpc)
- endif()
  if (LLAMA_BUILD_SERVER)
- add_subdirectory(server)
- endif()
- if (GGML_SYCL)
- add_subdirectory(sycl)
+ add_subdirectory(server)
  endif()
  add_subdirectory(save-load-state)
+ add_subdirectory(run)
  add_subdirectory(simple)
  add_subdirectory(simple-chat)
  add_subdirectory(speculative)
+ add_subdirectory(speculative-simple)
  add_subdirectory(tokenize)
+ add_subdirectory(tts)
+ add_subdirectory(gen-docs)
+ if (NOT GGML_BACKEND_DL)
+ # these examples use the backends directly and cannot be built with dynamic loading
+ add_subdirectory(convert-llama2c-to-ggml)
+ add_subdirectory(cvector-generator)
+ add_subdirectory(export-lora)
+ if (NOT WIN32)
+ # disabled on Windows because it uses internal functions not exported with LLAMA_API
+ add_subdirectory(quantize-stats)
+ endif()
+ add_subdirectory(llava)
+ if (GGML_RPC)
+ add_subdirectory(rpc)
+ endif()
+ if (GGML_SYCL)
+ add_subdirectory(sycl)
+ endif()
+ endif()
  endif()
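
The reorganized example list above gates several targets behind configuration options: `LLAMA_BUILD_SERVER` for the server, `NOT WIN32` for gbnf-validator and quantize-stats, and `NOT GGML_BACKEND_DL` for the examples that use backends directly (with `GGML_RPC` and `GGML_SYCL` nested inside that block). A sketch of how this plays out at configure time; the build directory names are arbitrary:

```bash
# default configure: the NOT GGML_BACKEND_DL branch is taken, so llava,
# export-lora, cvector-generator, etc. are included in the example targets
cmake -B build -DLLAMA_BUILD_SERVER=ON

# with dynamically loadable backends enabled, that whole branch is skipped
cmake -B build-dl -DGGML_BACKEND_DL=ON
```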

package/src/llama.cpp/examples/batched/CMakeLists.txt

@@ -2,4 +2,4 @@ set(TARGET llama-batched)
  add_executable(${TARGET} batched.cpp)
  install(TARGETS ${TARGET} RUNTIME)
  target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
- target_compile_features(${TARGET} PRIVATE cxx_std_11)
+ target_compile_features(${TARGET} PRIVATE cxx_std_17)

package/src/llama.cpp/examples/batched/batched.cpp

@@ -65,13 +65,14 @@ int main(int argc, char ** argv) {
  llama_context * ctx = llama_new_context_with_model(model, ctx_params);

  auto sparams = llama_sampler_chain_default_params();
+ sparams.no_perf = false;

  llama_sampler * smpl = llama_sampler_chain_init(sparams);

- llama_sampler_chain_add(smpl, llama_sampler_init_top_k(params.sparams.top_k));
- llama_sampler_chain_add(smpl, llama_sampler_init_top_p(params.sparams.top_p, params.sparams.min_keep));
- llama_sampler_chain_add(smpl, llama_sampler_init_temp (params.sparams.temp));
- llama_sampler_chain_add(smpl, llama_sampler_init_dist (params.sparams.seed));
+ llama_sampler_chain_add(smpl, llama_sampler_init_top_k(params.sampling.top_k));
+ llama_sampler_chain_add(smpl, llama_sampler_init_top_p(params.sampling.top_p, params.sampling.min_keep));
+ llama_sampler_chain_add(smpl, llama_sampler_init_temp (params.sampling.temp));
+ llama_sampler_chain_add(smpl, llama_sampler_init_dist (params.sampling.seed));

  if (ctx == NULL) {
  LOG_ERR("%s: error: failed to create the llama_context\n" , __func__);

package/src/llama.cpp/examples/batched-bench/CMakeLists.txt

@@ -2,4 +2,4 @@ set(TARGET llama-batched-bench)
  add_executable(${TARGET} batched-bench.cpp)
  install(TARGETS ${TARGET} RUNTIME)
  target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
- target_compile_features(${TARGET} PRIVATE cxx_std_11)
+ target_compile_features(${TARGET} PRIVATE cxx_std_17)

package/src/llama.cpp/examples/convert-llama2c-to-ggml/CMakeLists.txt

@@ -2,4 +2,4 @@ set(TARGET llama-convert-llama2c-to-ggml)
  add_executable(${TARGET} convert-llama2c-to-ggml.cpp)
  install(TARGETS ${TARGET} RUNTIME)
  target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
- target_compile_features(${TARGET} PRIVATE cxx_std_11)
+ target_compile_features(${TARGET} PRIVATE cxx_std_17)

package/src/llama.cpp/examples/cvector-generator/CMakeLists.txt

@@ -2,4 +2,4 @@ set(TARGET llama-cvector-generator)
  add_executable(${TARGET} cvector-generator.cpp pca.hpp)
  install(TARGETS ${TARGET} RUNTIME)
  target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
- target_compile_features(${TARGET} PRIVATE cxx_std_11)
+ target_compile_features(${TARGET} PRIVATE cxx_std_17)

package/src/llama.cpp/examples/deprecation-warning/deprecation-warning.cpp

@@ -12,7 +12,7 @@ int main(int argc, char** argv) {
  }

  // Get only the program name from the full path
- auto pos = filename.find_last_of('/');
+ auto pos = filename.find_last_of("/\\");
  if (pos != std::string::npos) {
  filename = filename.substr(pos+1);
  }

package/src/llama.cpp/examples/embedding/CMakeLists.txt

@@ -2,4 +2,4 @@ set(TARGET llama-embedding)
  add_executable(${TARGET} embedding.cpp)
  install(TARGETS ${TARGET} RUNTIME)
  target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
- target_compile_features(${TARGET} PRIVATE cxx_std_11)
+ target_compile_features(${TARGET} PRIVATE cxx_std_17)

package/src/llama.cpp/examples/eval-callback/CMakeLists.txt

@@ -2,8 +2,9 @@ set(TARGET llama-eval-callback)
  add_executable(${TARGET} eval-callback.cpp)
  install(TARGETS ${TARGET} RUNTIME)
  target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
- target_compile_features(${TARGET} PRIVATE cxx_std_11)
+ target_compile_features(${TARGET} PRIVATE cxx_std_17)

  set(TEST_TARGET test-eval-callback)
- add_test(NAME ${TEST_TARGET} COMMAND llama-eval-callback --hf-repo ggml-org/models --hf-file tinyllamas/stories260K.gguf --model stories260K.gguf --prompt hello --seed 42 -ngl 0)
+ add_test(NAME ${TEST_TARGET}
+ COMMAND llama-eval-callback --hf-repo ggml-org/models --hf-file tinyllamas/stories260K.gguf --model stories260K.gguf --prompt hello --seed 42 -ngl 0)
  set_property(TEST ${TEST_TARGET} PROPERTY LABELS eval-callback curl)

package/src/llama.cpp/examples/export-lora/CMakeLists.txt

@@ -2,4 +2,4 @@ set(TARGET llama-export-lora)
  add_executable(${TARGET} export-lora.cpp)
  install(TARGETS ${TARGET} RUNTIME)
  target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
- target_compile_features(${TARGET} PRIVATE cxx_std_11)
+ target_compile_features(${TARGET} PRIVATE cxx_std_17)

package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt

@@ -2,4 +2,4 @@ set(TARGET llama-gbnf-validator)
  add_executable(${TARGET} gbnf-validator.cpp)
  install(TARGETS ${TARGET} RUNTIME)
  target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
- target_compile_features(${TARGET} PRIVATE cxx_std_11)
+ target_compile_features(${TARGET} PRIVATE cxx_std_17)

package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp

@@ -11,19 +11,15 @@
  static bool llama_grammar_validate(struct llama_grammar * grammar, const std::string & input_str, size_t & error_pos, std::string & error_msg) {
  const auto cpts = unicode_cpts_from_utf8(input_str);

- const llama_grammar_rules & rules = llama_grammar_get_rules (grammar);
- llama_grammar_stacks & stacks_cur = llama_grammar_get_stacks(grammar);
+ auto & stacks_cur = llama_grammar_get_stacks(grammar);

  size_t pos = 0;
  for (const auto & cpt : cpts) {
- const llama_grammar_stacks stacks_prev = llama_grammar_get_stacks(grammar); // copy
-
- llama_grammar_accept(rules, stacks_prev, cpt, stacks_cur);
+ llama_grammar_accept(grammar, cpt);

  if (stacks_cur.empty()) {
  error_pos = pos;
  error_msg = "Unexpected character '" + unicode_cpt_to_utf8(cpt) + "'";
- stacks_cur = stacks_prev;
  return false;
  }
  ++pos;
@@ -82,7 +78,8 @@ int main(int argc, char** argv) {

  llama_grammar * grammar = llama_grammar_init_impl(nullptr, grammar_str.c_str(), "root");
  if (grammar == nullptr) {
- throw std::runtime_error("Failed to initialize llama_grammar");
+ fprintf(stdout, "Failed to initialize llama_grammar\n");
+ return 1;
  }
  // Read the input file
  std::string input_str;

package/src/llama.cpp/examples/gen-docs/CMakeLists.txt

@@ -2,4 +2,4 @@ set(TARGET llama-gen-docs)
  add_executable(${TARGET} gen-docs.cpp)
  install(TARGETS ${TARGET} RUNTIME)
  target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
- target_compile_features(${TARGET} PRIVATE cxx_std_11)
+ target_compile_features(${TARGET} PRIVATE cxx_std_17)

package/src/llama.cpp/examples/gguf/CMakeLists.txt

@@ -2,4 +2,4 @@ set(TARGET llama-gguf)
  add_executable(${TARGET} gguf.cpp)
  install(TARGETS ${TARGET} RUNTIME)
  target_link_libraries(${TARGET} PRIVATE ggml ${CMAKE_THREAD_LIBS_INIT})
- target_compile_features(${TARGET} PRIVATE cxx_std_11)
+ target_compile_features(${TARGET} PRIVATE cxx_std_17)

package/src/llama.cpp/examples/gguf-hash/CMakeLists.txt

@@ -4,12 +4,19 @@ install(TARGETS ${TARGET} RUNTIME)

  # clibs dependencies
  include_directories(deps/)
+
  add_library(xxhash OBJECT deps/xxhash/xxhash.c deps/xxhash/xxhash.h)
  target_link_libraries(${TARGET} PRIVATE xxhash)
+
  add_library(sha1 OBJECT deps/sha1/sha1.c deps/sha1/sha1.h)
  target_link_libraries(${TARGET} PRIVATE sha1)
+ if (NOT MSVC)
+ # disable warnings in 3rd party code
+ target_compile_options(sha1 PRIVATE -w)
+ endif()
+
  add_library(sha256 OBJECT deps/sha256/sha256.c deps/sha256/sha256.h)
  target_link_libraries(${TARGET} PRIVATE sha256)

  target_link_libraries(${TARGET} PRIVATE ggml ${CMAKE_THREAD_LIBS_INIT})
- target_compile_features(${TARGET} PRIVATE cxx_std_11)
+ target_compile_features(${TARGET} PRIVATE cxx_std_17)

package/src/llama.cpp/examples/gguf-split/CMakeLists.txt

@@ -2,4 +2,4 @@ set(TARGET llama-gguf-split)
  add_executable(${TARGET} gguf-split.cpp)
  install(TARGETS ${TARGET} RUNTIME)
  target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
- target_compile_features(${TARGET} PRIVATE cxx_std_11)
+ target_compile_features(${TARGET} PRIVATE cxx_std_17)

package/src/llama.cpp/examples/gguf-split/gguf-split.cpp

@@ -287,7 +287,7 @@ struct split_strategy {
  }

  void print_info() {
- printf("n_split: %ld\n", ctx_outs.size());
+ printf("n_split: %zu\n", ctx_outs.size());
  int i_split = 0;
  for (auto & ctx_out : ctx_outs) {
  // re-calculate the real gguf size for each split (= metadata size + total size of all tensors)
@@ -297,7 +297,7 @@ struct split_strategy {
  total_size += ggml_nbytes(t);
  }
  total_size = total_size / 1000 / 1000; // convert to megabytes
- printf("split %05d: n_tensors = %d, total_size = %ldM\n", i_split + 1, gguf_get_n_tensors(ctx_out), total_size);
+ printf("split %05d: n_tensors = %d, total_size = %zuM\n", i_split + 1, gguf_get_n_tensors(ctx_out), total_size);
  i_split++;
  }
  }