@fugood/llama.node 0.3.16 → 0.3.17

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (202)
  1. package/CMakeLists.txt +3 -0
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  7. package/bin/linux-cuda/x64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  9. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  10. package/bin/win32/arm64/llama-node.node +0 -0
  11. package/bin/win32/arm64/node.lib +0 -0
  12. package/bin/win32/x64/llama-node.node +0 -0
  13. package/bin/win32/x64/node.lib +0 -0
  14. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/arm64/node.lib +0 -0
  16. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  17. package/bin/win32-vulkan/x64/node.lib +0 -0
  18. package/lib/binding.ts +5 -0
  19. package/package.json +1 -1
  20. package/src/LlamaCompletionWorker.cpp +8 -0
  21. package/src/LlamaCompletionWorker.h +1 -0
  22. package/src/LlamaContext.cpp +3 -2
  23. package/src/llama.cpp/.github/workflows/build-linux-cross.yml +124 -0
  24. package/src/llama.cpp/.github/workflows/build.yml +70 -27
  25. package/src/llama.cpp/.github/workflows/docker.yml +6 -6
  26. package/src/llama.cpp/.github/workflows/server.yml +7 -11
  27. package/src/llama.cpp/CMakeLists.txt +23 -1
  28. package/src/llama.cpp/common/CMakeLists.txt +6 -3
  29. package/src/llama.cpp/common/arg.cpp +809 -105
  30. package/src/llama.cpp/common/arg.h +9 -0
  31. package/src/llama.cpp/common/chat.cpp +1 -1
  32. package/src/llama.cpp/common/common.cpp +31 -521
  33. package/src/llama.cpp/common/common.h +17 -36
  34. package/src/llama.cpp/common/json-schema-to-grammar.cpp +3 -0
  35. package/src/llama.cpp/common/llguidance.cpp +30 -47
  36. package/src/llama.cpp/common/minja/chat-template.hpp +15 -7
  37. package/src/llama.cpp/common/minja/minja.hpp +119 -93
  38. package/src/llama.cpp/common/sampling.cpp +3 -0
  39. package/src/llama.cpp/docs/build.md +122 -7
  40. package/src/llama.cpp/examples/CMakeLists.txt +0 -9
  41. package/src/llama.cpp/examples/batched/batched.cpp +1 -1
  42. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +1 -1
  43. package/src/llama.cpp/examples/embedding/embedding.cpp +7 -1
  44. package/src/llama.cpp/examples/export-lora/export-lora.cpp +1 -1
  45. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +15 -16
  46. package/src/llama.cpp/examples/gritlm/gritlm.cpp +1 -1
  47. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +210 -8
  48. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
  49. package/src/llama.cpp/examples/llava/CMakeLists.txt +39 -24
  50. package/src/llama.cpp/examples/llava/clip-impl.h +345 -0
  51. package/src/llama.cpp/examples/llava/clip.cpp +2152 -1803
  52. package/src/llama.cpp/examples/llava/clip.h +39 -22
  53. package/src/llama.cpp/examples/llava/deprecation-warning.cpp +22 -0
  54. package/src/llama.cpp/examples/llava/llava.cpp +64 -52
  55. package/src/llama.cpp/examples/llava/mtmd-cli.cpp +344 -0
  56. package/src/llama.cpp/examples/llava/mtmd.cpp +708 -0
  57. package/src/llama.cpp/examples/llava/mtmd.h +168 -0
  58. package/src/llama.cpp/examples/llava/{qwen2vl-cli.cpp → qwen2vl-test.cpp} +83 -31
  59. package/src/llama.cpp/examples/main/main.cpp +16 -5
  60. package/src/llama.cpp/examples/parallel/parallel.cpp +3 -1
  61. package/src/llama.cpp/examples/passkey/passkey.cpp +1 -1
  62. package/src/llama.cpp/examples/perplexity/perplexity.cpp +17 -3
  63. package/src/llama.cpp/examples/quantize/quantize.cpp +115 -2
  64. package/src/llama.cpp/examples/rpc/CMakeLists.txt +4 -2
  65. package/src/llama.cpp/examples/rpc/rpc-server.cpp +163 -8
  66. package/src/llama.cpp/examples/run/CMakeLists.txt +12 -1
  67. package/src/llama.cpp/examples/run/run.cpp +14 -28
  68. package/src/llama.cpp/examples/server/httplib.h +313 -247
  69. package/src/llama.cpp/examples/server/server.cpp +238 -139
  70. package/src/llama.cpp/examples/server/utils.hpp +51 -2
  71. package/src/llama.cpp/examples/speculative/speculative.cpp +1 -1
  72. package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +1 -1
  73. package/src/llama.cpp/examples/sycl/build.sh +2 -2
  74. package/src/llama.cpp/examples/sycl/win-build-sycl.bat +2 -2
  75. package/src/llama.cpp/examples/tts/tts.cpp +6 -9
  76. package/src/llama.cpp/ggml/CMakeLists.txt +8 -2
  77. package/src/llama.cpp/ggml/cmake/GitVars.cmake +22 -0
  78. package/src/llama.cpp/ggml/include/ggml-cpu.h +5 -0
  79. package/src/llama.cpp/ggml/include/ggml-rpc.h +6 -1
  80. package/src/llama.cpp/ggml/include/ggml.h +66 -99
  81. package/src/llama.cpp/ggml/src/CMakeLists.txt +10 -7
  82. package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +0 -2
  83. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +8 -4
  84. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +5 -5
  85. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +692 -1534
  86. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +613 -122
  87. package/src/llama.cpp/ggml/src/ggml-cann/common.h +135 -1
  88. package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +507 -137
  89. package/src/llama.cpp/ggml/src/ggml-common.h +12 -6
  90. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +48 -22
  91. package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.cpp +158 -0
  92. package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.h +16 -0
  93. package/src/llama.cpp/ggml/src/ggml-cpu/common.h +72 -0
  94. package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +1 -1
  95. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +896 -192
  96. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +2 -21
  97. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +754 -404
  98. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1003 -13519
  99. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +2 -0
  100. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +2 -7
  101. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +0 -1
  102. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +3 -4
  103. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +533 -88
  104. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +8809 -0
  105. package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +110 -0
  106. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +892 -0
  107. package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.cpp +186 -0
  108. package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.h +28 -0
  109. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +258 -0
  110. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +802 -0
  111. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +7 -0
  112. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +1 -0
  113. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +0 -4
  114. package/src/llama.cpp/ggml/src/ggml-impl.h +52 -18
  115. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +70 -3
  116. package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +67 -119
  117. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +1023 -260
  118. package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +293 -40
  119. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +96 -22
  120. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
  121. package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +350 -0
  122. package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.hpp +39 -0
  123. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +0 -35
  124. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +2 -292
  125. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +79 -90
  126. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +967 -438
  127. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +22 -23
  128. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +24 -20
  129. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +1 -4
  130. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +204 -280
  131. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +84 -74
  132. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +1 -3
  133. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +37 -49
  134. package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +7 -22
  135. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +4 -14
  136. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +204 -118
  137. package/src/llama.cpp/ggml/src/ggml-sycl/rope.hpp +1 -3
  138. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +23 -0
  139. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +646 -114
  140. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +12 -0
  141. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +17 -8
  142. package/src/llama.cpp/ggml/src/ggml.c +141 -245
  143. package/src/llama.cpp/ggml/src/gguf.cpp +1 -0
  144. package/src/llama.cpp/include/llama.h +30 -11
  145. package/src/llama.cpp/models/ggml-vocab-llama4.gguf.inp +112 -0
  146. package/src/llama.cpp/models/ggml-vocab-llama4.gguf.out +46 -0
  147. package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.inp +112 -0
  148. package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.out +46 -0
  149. package/src/llama.cpp/requirements/requirements-all.txt +2 -0
  150. package/src/llama.cpp/requirements/requirements-gguf_editor_gui.txt +3 -0
  151. package/src/llama.cpp/src/CMakeLists.txt +3 -2
  152. package/src/llama.cpp/src/llama-adapter.cpp +37 -1
  153. package/src/llama.cpp/src/llama-arch.cpp +160 -17
  154. package/src/llama.cpp/src/llama-arch.h +16 -0
  155. package/src/llama.cpp/src/llama-chat.cpp +82 -17
  156. package/src/llama.cpp/src/llama-chat.h +6 -2
  157. package/src/llama.cpp/src/llama-context.cpp +108 -92
  158. package/src/llama.cpp/src/llama-context.h +1 -2
  159. package/src/llama.cpp/src/llama-graph.cpp +189 -119
  160. package/src/llama.cpp/src/llama-graph.h +26 -6
  161. package/src/llama.cpp/src/llama-hparams.h +13 -0
  162. package/src/llama.cpp/src/llama-kv-cache.cpp +70 -123
  163. package/src/llama.cpp/src/llama-kv-cache.h +41 -115
  164. package/src/llama.cpp/src/llama-memory.h +1 -1
  165. package/src/llama.cpp/src/llama-mmap.cpp +1 -1
  166. package/src/llama.cpp/src/llama-model-loader.cpp +10 -5
  167. package/src/llama.cpp/src/llama-model-loader.h +5 -3
  168. package/src/llama.cpp/src/llama-model.cpp +1760 -534
  169. package/src/llama.cpp/src/llama-model.h +13 -1
  170. package/src/llama.cpp/src/llama-quant.cpp +29 -8
  171. package/src/llama.cpp/src/llama-sampling.cpp +7 -1
  172. package/src/llama.cpp/src/llama-vocab.cpp +44 -6
  173. package/src/llama.cpp/src/llama.cpp +1 -1
  174. package/src/llama.cpp/tests/CMakeLists.txt +43 -30
  175. package/src/llama.cpp/tests/test-arg-parser.cpp +51 -4
  176. package/src/llama.cpp/tests/test-backend-ops.cpp +82 -43
  177. package/src/llama.cpp/tests/test-chat-template.cpp +34 -13
  178. package/src/llama.cpp/tests/test-chat.cpp +12 -2
  179. package/src/llama.cpp/{examples/gbnf-validator/gbnf-validator.cpp → tests/test-gbnf-validator.cpp} +2 -2
  180. package/src/llama.cpp/tests/test-grammar-integration.cpp +3 -2
  181. package/src/llama.cpp/tests/test-grammar-llguidance.cpp +63 -2
  182. package/src/llama.cpp/tests/test-grammar-parser.cpp +3 -1
  183. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +17 -1
  184. package/src/llama.cpp/tests/test-llama-grammar.cpp +2 -1
  185. package/src/llama.cpp/{examples/quantize-stats/quantize-stats.cpp → tests/test-quantize-stats.cpp} +3 -1
  186. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +2 -1
  187. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +2 -1
  188. package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +0 -5
  189. package/src/llama.cpp/examples/llava/gemma3-cli.cpp +0 -341
  190. package/src/llama.cpp/examples/llava/llava-cli.cpp +0 -332
  191. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +0 -354
  192. package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +0 -6
  193. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +0 -30
  194. package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +0 -19
  195. package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +0 -234
  196. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +0 -197
  197. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +0 -190
  198. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +0 -204
  199. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +0 -191
  200. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +0 -218
  201. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +0 -216
  202. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +0 -295
@@ -132,12 +132,14 @@ You may find the official downloads here: [NVIDIA developer site](https://develo
 
 
  #### Compile and run inside a Fedora Toolbox Container
- We also have a [guide](./cuda-fedora.md) for setting up CUDA toolkit in a Fedora [toolbox container](https://containertoolbx.org/).
+ We also have a [guide](./backend/CUDA-FEDORA.md) for setting up CUDA toolkit in a Fedora [toolbox container](https://containertoolbx.org/).
 
  **Recommended for:**
-
- - ***Particularly*** *convenient* for users of [Atomic Desktops for Fedora](https://fedoraproject.org/atomic-desktops/); such as: [Silverblue](https://fedoraproject.org/atomic-desktops/silverblue/) and [Kinoite](https://fedoraproject.org/atomic-desktops/kinoite/).
- - Toolbox is installed by default: [Fedora Workstation](https://fedoraproject.org/workstation/) or [Fedora KDE Plasma Desktop](https://fedoraproject.org/spins/kde).
+ - ***Necessary*** for users of [Atomic Desktops for Fedora](https://fedoraproject.org/atomic-desktops/); such as: [Silverblue](https://fedoraproject.org/atomic-desktops/silverblue/) and [Kinoite](https://fedoraproject.org/atomic-desktops/kinoite/).
+ - (there are no supported CUDA packages for these systems)
+ - ***Necessary*** for users that have a host that is not a: [Supported Nvidia CUDA Release Platform](https://developer.nvidia.com/cuda-downloads).
+ - (for example, you may have [Fedora 42 Beta](https://fedoramagazine.org/announcing-fedora-linux-42-beta/) as your your host operating system)
+ - ***Convenient*** For those running [Fedora Workstation](https://fedoraproject.org/workstation/) or [Fedora KDE Plasma Desktop](https://fedoraproject.org/spins/kde), and want to keep their host system clean.
  - *Optionally* toolbox packages are available: [Arch Linux](https://archlinux.org/), [Red Hat Enterprise Linux >= 8.5](https://www.redhat.com/en/technologies/linux-platforms/enterprise-linux), or [Ubuntu](https://ubuntu.com/download)
 
 
@@ -189,7 +191,7 @@ The following compilation options are also available to tweak performance:
 
  | Option | Legal values | Default | Description |
  |-------------------------------|------------------------|---------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
- | GGML_CUDA_FORCE_MMQ | Boolean | false | Force the use of custom matrix multiplication kernels for quantized models instead of FP16 cuBLAS even if there is no int8 tensor core implementation available (affects V100, RDNA3). MMQ kernels are enabled by default on GPUs with int8 tensor core support. With MMQ force enabled, speed for large batch sizes will be worse but VRAM consumption will be lower. |
+ | GGML_CUDA_FORCE_MMQ | Boolean | false | Force the use of custom matrix multiplication kernels for quantized models instead of FP16 cuBLAS even if there is no int8 tensor core implementation available (affects V100, CDNA and RDNA3+). MMQ kernels are enabled by default on GPUs with int8 tensor core support. With MMQ force enabled, speed for large batch sizes will be worse but VRAM consumption will be lower. |
  | GGML_CUDA_FORCE_CUBLAS | Boolean | false | Force the use of FP16 cuBLAS instead of custom matrix multiplication kernels for quantized models |
  | GGML_CUDA_F16 | Boolean | false | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels and for the q4_1 and q5_1 matrix matrix multiplication kernels. Can improve performance on relatively recent GPUs. |
  | GGML_CUDA_PEER_MAX_BATCH_SIZE | Positive integer | 128 | Maximum batch size for which to enable peer access between multiple GPUs. Peer access requires either Linux or NVLink. When using NVLink enabling peer access for larger batch sizes is potentially beneficial. |
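For illustration, the options in the table above are ordinary CMake cache variables passed at configure time; a minimal sketch that force-enables the MMQ kernels (the `GGML_CUDA=ON` toggle is assumed from the surrounding CUDA build docs and is not part of this hunk):

```bash
# Illustrative sketch only: CUDA build with the MMQ kernels described above forced on
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_FORCE_MMQ=ON
cmake --build build --config Release
```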
@@ -216,6 +218,7 @@ By default, all supported compute capabilities are enabled. To customize this be
 
  ```bash
  cmake -B build -DGGML_MUSA=ON -DMUSA_ARCHITECTURES="21"
+ cmake --build build --config Release
  ```
 
  This configuration enables only compute capability `2.1` (MTT S80) during compilation, which can help reduce compilation time.
@@ -256,8 +259,6 @@ You can download it from your Linux distro's package manager or from here: [ROCm
  cmake -S . -B build -DGGML_HIP=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
  && cmake --build build --config Release -- -j 16
  ```
- On Linux it is also possible to use unified memory architecture (UMA) to share main memory between the CPU and integrated GPU by setting `-DGGML_HIP_UMA=ON`.
- However, this hurts performance for non-integrated GPUs (but enables working with integrated GPUs).
 
  To enhance flash attention performance on RDNA3+ or CDNA architectures, you can utilize the rocWMMA library by enabling the `-DGGML_HIP_ROCWMMA_FATTN=ON` option. This requires rocWMMA headers to be installed on the build system.
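For illustration, the rocWMMA option mentioned above is simply added to the HIP configure line already shown in this hunk; a minimal sketch (GPU target copied from the example above, rocWMMA headers assumed to be installed):

```bash
# Illustrative sketch only: HIP build with rocWMMA-based flash attention enabled
cmake -S . -B build -DGGML_HIP=ON -DGGML_HIP_ROCWMMA_FATTN=ON \
    -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
    && cmake --build build --config Release -- -j 16
```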
 
@@ -293,6 +294,10 @@ You can download it from your Linux distro's package manager or from here: [ROCm
  The environment variable [`HIP_VISIBLE_DEVICES`](https://rocm.docs.amd.com/en/latest/understand/gpu_isolation.html#hip-visible-devices) can be used to specify which GPU(s) will be used.
  If your GPU is not officially supported you can use the environment variable [`HSA_OVERRIDE_GFX_VERSION`] set to a similar GPU, for example 10.3.0 on RDNA2 (e.g. gfx1030, gfx1031, or gfx1035) or 11.0.0 on RDNA3.
 
+ ### Unified Memory
+
+ On Linux it is possible to use unified memory architecture (UMA) to share main memory between the CPU and integrated GPU by setting environment variable `GGML_CUDA_ENABLE_UNIFIED_MEMORY=1`. However, this hurts performance for non-integrated GPUs (but enables working with integrated GPUs).
+
  ## Vulkan
 
  **Windows**
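The variables in the hunk above are set at run time rather than at configure time; a minimal sketch, with an illustrative binary path and model path:

```bash
# Illustrative sketch only: select GPU 0 and spoof a supported gfx version on RDNA2
HIP_VISIBLE_DEVICES=0 HSA_OVERRIDE_GFX_VERSION=10.3.0 ./build/bin/llama-cli -m model.gguf -p "Hello"

# Illustrative sketch only: share main memory with an integrated GPU via UMA
GGML_CUDA_ENABLE_UNIFIED_MEMORY=1 ./build/bin/llama-cli -m model.gguf -p "Hello"
```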
@@ -433,6 +438,116 @@ llama_new_context_with_model: CANN compute buffer size = 1260.81 MiB
 
  For detailed info, such as model/device supports, CANN install, please refer to [llama.cpp for CANN](./backend/CANN.md).
 
+ ## Arm® KleidiAI™
+ KleidiAI is a library of optimized microkernels for AI workloads, specifically designed for Arm CPUs. These microkernels enhance performance and can be enabled for use by the CPU backend.
+
+ To enable KleidiAI, go to the llama.cpp directory and build using CMake
+ ```bash
+ cmake -B build -DGGML_CPU_KLEIDIAI=ON
+ cmake --build build --config Release
+ ```
+ You can verify that KleidiAI is being used by running
+ ```bash
+ ./build/bin/llama-cli -m PATH_TO_MODEL -p "What is a car?"
+ ```
+ If KleidiAI is enabled, the ouput will contain a line similar to:
+ ```
+ load_tensors: CPU_KLEIDIAI model buffer size = 3474.00 MiB
+ ```
+ KleidiAI's microkernels implement optimized tensor operations using Arm CPU features such as dotprod, int8mm and SME. llama.cpp selects the most efficient kernel based on runtime CPU feature detection. However, on platforms that support SME, you must manually enable SME microkernels by setting the environment variable `GGML_KLEIDIAI_SME=1`.
+
+ Depending on your build target, other higher priority backends may be enabled by default. To ensure the CPU backend is used, you must disable the higher priority backends either at compile time, e.g. -DGGML_METAL=OFF, or during run-time using the command line option `--device none`.
+
+ ## OpenCL
+
+ This provides GPU acceleration through OpenCL on recent Adreno GPU.
+ More information about OpenCL backend can be found in [OPENCL.md](./backend/OPENCL.md) for more information.
+
+ ### Android
+
+ Assume NDK is available in `$ANDROID_NDK`. First, install OpenCL headers and ICD loader library if not available,
+
+ ```sh
+ mkdir -p ~/dev/llm
+ cd ~/dev/llm
+
+ git clone https://github.com/KhronosGroup/OpenCL-Headers && \
+ cd OpenCL-Headers && \
+ cp -r CL $ANDROID_NDK/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/include
+
+ cd ~/dev/llm
+
+ git clone https://github.com/KhronosGroup/OpenCL-ICD-Loader && \
+ cd OpenCL-ICD-Loader && \
+ mkdir build_ndk && cd build_ndk && \
+ cmake .. -G Ninja -DCMAKE_BUILD_TYPE=Release \
+ -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \
+ -DOPENCL_ICD_LOADER_HEADERS_DIR=$ANDROID_NDK/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/include \
+ -DANDROID_ABI=arm64-v8a \
+ -DANDROID_PLATFORM=24 \
+ -DANDROID_STL=c++_shared && \
+ ninja && \
+ cp libOpenCL.so $ANDROID_NDK/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/lib/aarch64-linux-android
+ ```
+
+ Then build llama.cpp with OpenCL enabled,
+
+ ```sh
+ cd ~/dev/llm
+
+ git clone https://github.com/ggml-org/llama.cpp && \
+ cd llama.cpp && \
+ mkdir build-android && cd build-android
+
+ cmake .. -G Ninja \
+ -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \
+ -DANDROID_ABI=arm64-v8a \
+ -DANDROID_PLATFORM=android-28 \
+ -DBUILD_SHARED_LIBS=OFF \
+ -DGGML_OPENCL=ON
+
+ ninja
+ ```
+
+ ### Windows Arm64
+
+ First, install OpenCL headers and ICD loader library if not available,
+
+ ```powershell
+ mkdir -p ~/dev/llm
+
+ cd ~/dev/llm
+ git clone https://github.com/KhronosGroup/OpenCL-Headers && cd OpenCL-Headers
+ mkdir build && cd build
+ cmake .. -G Ninja `
+ -DBUILD_TESTING=OFF `
+ -DOPENCL_HEADERS_BUILD_TESTING=OFF `
+ -DOPENCL_HEADERS_BUILD_CXX_TESTS=OFF `
+ -DCMAKE_INSTALL_PREFIX="$HOME/dev/llm/opencl"
+ cmake --build . --target install
+
+ cd ~/dev/llm
+ git clone https://github.com/KhronosGroup/OpenCL-ICD-Loader && cd OpenCL-ICD-Loader
+ mkdir build && cd build
+ cmake .. -G Ninja `
+ -DCMAKE_BUILD_TYPE=Release `
+ -DCMAKE_PREFIX_PATH="$HOME/dev/llm/opencl" `
+ -DCMAKE_INSTALL_PREFIX="$HOME/dev/llm/opencl"
+ cmake --build . --target install
+ ```
+
+ Then build llama.cpp with OpenCL enabled,
+
+ ```powershell
+ cmake .. -G Ninja `
+ -DCMAKE_TOOLCHAIN_FILE="$HOME/dev/llm/llama.cpp/cmake/arm64-windows-llvm.cmake" `
+ -DCMAKE_BUILD_TYPE=Release `
+ -DCMAKE_PREFIX_PATH="$HOME/dev/llm/opencl" `
+ -DBUILD_SHARED_LIBS=OFF `
+ -DGGML_OPENCL=ON
+ ninja
+ ```
+
  ## Android
 
  To read documentation for how to build on Android, [click here](./android.md)
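Putting the KleidiAI notes from the hunk above together, a minimal sketch that combines the compile-time flag, the Metal opt-out, the SME environment variable, and the `--device none` run-time option (model path placeholder as in the hunk):

```bash
# Illustrative sketch only: CPU-only KleidiAI build and run with SME microkernels enabled
cmake -B build -DGGML_CPU_KLEIDIAI=ON -DGGML_METAL=OFF
cmake --build build --config Release
GGML_KLEIDIAI_SME=1 ./build/bin/llama-cli -m PATH_TO_MODEL --device none -p "What is a car?"
```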
@@ -21,11 +21,6 @@ else()
  add_subdirectory(embedding)
  add_subdirectory(eval-callback)
 
- if (NOT WIN32)
- # disabled on Windows because it uses internal functions not exported with LLAMA_API
- add_subdirectory(gbnf-validator)
- endif()
-
  add_subdirectory(gguf-hash)
  add_subdirectory(gguf-split)
  add_subdirectory(gguf)
@@ -58,10 +53,6 @@ else()
  add_subdirectory(convert-llama2c-to-ggml)
  add_subdirectory(cvector-generator)
  add_subdirectory(export-lora)
- if (NOT WIN32)
- # disabled on Windows because it uses internal functions not exported with LLAMA_API
- add_subdirectory(quantize-stats)
- endif()
  add_subdirectory(llava)
  if (GGML_RPC)
  add_subdirectory(rpc)
@@ -41,7 +41,7 @@ int main(int argc, char ** argv) {
 
  llama_model_params model_params = common_model_params_to_llama(params);
 
- llama_model * model = llama_model_load_from_file(params.model.c_str(), model_params);
+ llama_model * model = llama_model_load_from_file(params.model.path.c_str(), model_params);
 
  if (model == NULL) {
  LOG_ERR("%s: error: unable to load model\n" , __func__);
@@ -38,7 +38,7 @@ int main(int argc, char ** argv) {
 
  llama_model_params model_params = common_model_params_to_llama(params);
 
- llama_model * model = llama_model_load_from_file(params.model.c_str(), model_params);
+ llama_model * model = llama_model_load_from_file(params.model.path.c_str(), model_params);
 
  if (model == NULL) {
  fprintf(stderr , "%s: error: unable to load model\n" , __func__);
@@ -89,6 +89,13 @@ int main(int argc, char ** argv) {
  common_init();
 
  params.embedding = true;
+
+ // utilize the full context
+ if (params.n_batch < params.n_ctx) {
+ LOG_WRN("%s: setting batch size to %d\n", __func__, params.n_ctx);
+ params.n_batch = params.n_ctx;
+ }
+
  // For non-causal models, batch size must be equal to ubatch size
  params.n_ubatch = params.n_batch;
 
@@ -134,7 +141,6 @@ int main(int argc, char ** argv) {
 
  // max batch size
  const uint64_t n_batch = params.n_batch;
- GGML_ASSERT(params.n_batch >= params.n_ctx);
 
  // tokenize the prompts and trim
  std::vector<std::vector<int32_t>> inputs;
@@ -421,7 +421,7 @@ int main(int argc, char ** argv) {
 
  g_verbose = (params.verbosity > 1);
  try {
- lora_merge_ctx ctx(params.model, params.lora_adapters, params.out_file, params.cpuparams.n_threads);
+ lora_merge_ctx ctx(params.model.path, params.lora_adapters, params.out_file, params.cpuparams.n_threads);
  ctx.run_merge();
  } catch (const std::exception & err) {
  fprintf(stderr, "%s\n", err.what());
@@ -408,8 +408,6 @@ static void gguf_merge(const split_params & split_params) {
  exit(EXIT_FAILURE);
  }
 
- std::ofstream fout(split_params.output.c_str(), std::ios::binary);
- fout.exceptions(std::ofstream::failbit); // fail fast on write errors
 
  auto * ctx_out = gguf_init_empty();
 
@@ -453,7 +451,6 @@ static void gguf_merge(const split_params & split_params) {
  gguf_free(ctx_gguf);
  ggml_free(ctx_meta);
  gguf_free(ctx_out);
- fout.close();
  exit(EXIT_FAILURE);
  }
 
@@ -466,7 +463,6 @@ static void gguf_merge(const split_params & split_params) {
  gguf_free(ctx_gguf);
  ggml_free(ctx_meta);
  gguf_free(ctx_out);
- fout.close();
  exit(EXIT_FAILURE);
  }
 
@@ -479,7 +475,6 @@ static void gguf_merge(const split_params & split_params) {
  gguf_free(ctx_gguf);
  ggml_free(ctx_meta);
  gguf_free(ctx_out);
- fout.close();
  exit(EXIT_FAILURE);
  }
 
@@ -500,9 +495,11 @@ static void gguf_merge(const split_params & split_params) {
 
  fprintf(stderr, "\033[3Ddone\n");
  }
-
- // placeholder for the meta data
- {
+ std::ofstream fout;
+ if (!split_params.dry_run) {
+ fout.open(split_params.output.c_str(), std::ios::binary);
+ fout.exceptions(std::ofstream::failbit); // fail fast on write errors
+ // placeholder for the meta data
  auto meta_size = gguf_get_meta_size(ctx_out);
  ::zeros(fout, meta_size);
  }
@@ -518,7 +515,9 @@ static void gguf_merge(const split_params & split_params) {
  ggml_free(ctx_metas[i]);
  }
  gguf_free(ctx_out);
- fout.close();
+ if (!split_params.dry_run) {
+ fout.close();
+ }
  exit(EXIT_FAILURE);
  }
  fprintf(stderr, "%s: writing tensors %s ...", __func__, split_path);
@@ -540,10 +539,11 @@ static void gguf_merge(const split_params & split_params) {
  auto offset = gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, i_tensor);
  f_input.seekg(offset);
  f_input.read((char *)read_data.data(), n_bytes);
-
- // write tensor data + padding
- fout.write((const char *)read_data.data(), n_bytes);
- zeros(fout, GGML_PAD(n_bytes, GGUF_DEFAULT_ALIGNMENT) - n_bytes);
+ if (!split_params.dry_run) {
+ // write tensor data + padding
+ fout.write((const char *)read_data.data(), n_bytes);
+ zeros(fout, GGML_PAD(n_bytes, GGUF_DEFAULT_ALIGNMENT) - n_bytes);
+ }
  }
 
  gguf_free(ctx_gguf);
@@ -552,16 +552,15 @@ static void gguf_merge(const split_params & split_params) {
  fprintf(stderr, "\033[3Ddone\n");
  }
 
- {
+ if (!split_params.dry_run) {
  // go back to beginning of file and write the updated metadata
  fout.seekp(0);
  std::vector<uint8_t> data(gguf_get_meta_size(ctx_out));
  gguf_get_meta_data(ctx_out, data.data());
  fout.write((const char *)data.data(), data.size());
-
  fout.close();
- gguf_free(ctx_out);
  }
+ gguf_free(ctx_out);
 
  fprintf(stderr, "%s: %s merged from %d split with %d tensors.\n",
  __func__, split_params.output.c_str(), n_split, total_tensors);
@@ -168,7 +168,7 @@ int main(int argc, char * argv[]) {
 
  llama_backend_init();
 
- llama_model * model = llama_model_load_from_file(params.model.c_str(), mparams);
+ llama_model * model = llama_model_load_from_file(params.model.path.c_str(), mparams);
 
  // create generation context
  llama_context * ctx = llama_init_from_model(model, cparams);