@fugood/llama.node 0.3.16 → 0.3.17
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +3 -0
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-cuda/arm64/llama-node.node +0 -0
- package/bin/linux-cuda/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +5 -0
- package/package.json +1 -1
- package/src/LlamaCompletionWorker.cpp +8 -0
- package/src/LlamaCompletionWorker.h +1 -0
- package/src/LlamaContext.cpp +3 -2
- package/src/llama.cpp/.github/workflows/build-linux-cross.yml +124 -0
- package/src/llama.cpp/.github/workflows/build.yml +70 -27
- package/src/llama.cpp/.github/workflows/docker.yml +6 -6
- package/src/llama.cpp/.github/workflows/server.yml +7 -11
- package/src/llama.cpp/CMakeLists.txt +23 -1
- package/src/llama.cpp/common/CMakeLists.txt +6 -3
- package/src/llama.cpp/common/arg.cpp +809 -105
- package/src/llama.cpp/common/arg.h +9 -0
- package/src/llama.cpp/common/chat.cpp +1 -1
- package/src/llama.cpp/common/common.cpp +31 -521
- package/src/llama.cpp/common/common.h +17 -36
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +3 -0
- package/src/llama.cpp/common/llguidance.cpp +30 -47
- package/src/llama.cpp/common/minja/chat-template.hpp +15 -7
- package/src/llama.cpp/common/minja/minja.hpp +119 -93
- package/src/llama.cpp/common/sampling.cpp +3 -0
- package/src/llama.cpp/docs/build.md +122 -7
- package/src/llama.cpp/examples/CMakeLists.txt +0 -9
- package/src/llama.cpp/examples/batched/batched.cpp +1 -1
- package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +1 -1
- package/src/llama.cpp/examples/embedding/embedding.cpp +7 -1
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +1 -1
- package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +15 -16
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +1 -1
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +210 -8
- package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
- package/src/llama.cpp/examples/llava/CMakeLists.txt +39 -24
- package/src/llama.cpp/examples/llava/clip-impl.h +345 -0
- package/src/llama.cpp/examples/llava/clip.cpp +2152 -1803
- package/src/llama.cpp/examples/llava/clip.h +39 -22
- package/src/llama.cpp/examples/llava/deprecation-warning.cpp +22 -0
- package/src/llama.cpp/examples/llava/llava.cpp +64 -52
- package/src/llama.cpp/examples/llava/mtmd-cli.cpp +344 -0
- package/src/llama.cpp/examples/llava/mtmd.cpp +708 -0
- package/src/llama.cpp/examples/llava/mtmd.h +168 -0
- package/src/llama.cpp/examples/llava/{qwen2vl-cli.cpp → qwen2vl-test.cpp} +83 -31
- package/src/llama.cpp/examples/main/main.cpp +16 -5
- package/src/llama.cpp/examples/parallel/parallel.cpp +3 -1
- package/src/llama.cpp/examples/passkey/passkey.cpp +1 -1
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +17 -3
- package/src/llama.cpp/examples/quantize/quantize.cpp +115 -2
- package/src/llama.cpp/examples/rpc/CMakeLists.txt +4 -2
- package/src/llama.cpp/examples/rpc/rpc-server.cpp +163 -8
- package/src/llama.cpp/examples/run/CMakeLists.txt +12 -1
- package/src/llama.cpp/examples/run/run.cpp +14 -28
- package/src/llama.cpp/examples/server/httplib.h +313 -247
- package/src/llama.cpp/examples/server/server.cpp +238 -139
- package/src/llama.cpp/examples/server/utils.hpp +51 -2
- package/src/llama.cpp/examples/speculative/speculative.cpp +1 -1
- package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +1 -1
- package/src/llama.cpp/examples/sycl/build.sh +2 -2
- package/src/llama.cpp/examples/sycl/win-build-sycl.bat +2 -2
- package/src/llama.cpp/examples/tts/tts.cpp +6 -9
- package/src/llama.cpp/ggml/CMakeLists.txt +8 -2
- package/src/llama.cpp/ggml/cmake/GitVars.cmake +22 -0
- package/src/llama.cpp/ggml/include/ggml-cpu.h +5 -0
- package/src/llama.cpp/ggml/include/ggml-rpc.h +6 -1
- package/src/llama.cpp/ggml/include/ggml.h +66 -99
- package/src/llama.cpp/ggml/src/CMakeLists.txt +10 -7
- package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +0 -2
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +8 -4
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +5 -5
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +692 -1534
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +613 -122
- package/src/llama.cpp/ggml/src/ggml-cann/common.h +135 -1
- package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +507 -137
- package/src/llama.cpp/ggml/src/ggml-common.h +12 -6
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +48 -22
- package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.cpp +158 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.h +16 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/common.h +72 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +896 -192
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +2 -21
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +754 -404
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1003 -13519
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +2 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +2 -7
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +0 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +3 -4
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +533 -88
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +8809 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +110 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +892 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.cpp +186 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.h +28 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +258 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +802 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +7 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +1 -0
- package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +0 -4
- package/src/llama.cpp/ggml/src/ggml-impl.h +52 -18
- package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +70 -3
- package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +67 -119
- package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +1023 -260
- package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +293 -40
- package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +96 -22
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +350 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.hpp +39 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +0 -35
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +2 -292
- package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +79 -90
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +967 -438
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +22 -23
- package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +24 -20
- package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +1 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +204 -280
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +84 -74
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +1 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +37 -49
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +7 -22
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +4 -14
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +204 -118
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.hpp +1 -3
- package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +23 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +646 -114
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +12 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +17 -8
- package/src/llama.cpp/ggml/src/ggml.c +141 -245
- package/src/llama.cpp/ggml/src/gguf.cpp +1 -0
- package/src/llama.cpp/include/llama.h +30 -11
- package/src/llama.cpp/models/ggml-vocab-llama4.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-llama4.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.out +46 -0
- package/src/llama.cpp/requirements/requirements-all.txt +2 -0
- package/src/llama.cpp/requirements/requirements-gguf_editor_gui.txt +3 -0
- package/src/llama.cpp/src/CMakeLists.txt +3 -2
- package/src/llama.cpp/src/llama-adapter.cpp +37 -1
- package/src/llama.cpp/src/llama-arch.cpp +160 -17
- package/src/llama.cpp/src/llama-arch.h +16 -0
- package/src/llama.cpp/src/llama-chat.cpp +82 -17
- package/src/llama.cpp/src/llama-chat.h +6 -2
- package/src/llama.cpp/src/llama-context.cpp +108 -92
- package/src/llama.cpp/src/llama-context.h +1 -2
- package/src/llama.cpp/src/llama-graph.cpp +189 -119
- package/src/llama.cpp/src/llama-graph.h +26 -6
- package/src/llama.cpp/src/llama-hparams.h +13 -0
- package/src/llama.cpp/src/llama-kv-cache.cpp +70 -123
- package/src/llama.cpp/src/llama-kv-cache.h +41 -115
- package/src/llama.cpp/src/llama-memory.h +1 -1
- package/src/llama.cpp/src/llama-mmap.cpp +1 -1
- package/src/llama.cpp/src/llama-model-loader.cpp +10 -5
- package/src/llama.cpp/src/llama-model-loader.h +5 -3
- package/src/llama.cpp/src/llama-model.cpp +1760 -534
- package/src/llama.cpp/src/llama-model.h +13 -1
- package/src/llama.cpp/src/llama-quant.cpp +29 -8
- package/src/llama.cpp/src/llama-sampling.cpp +7 -1
- package/src/llama.cpp/src/llama-vocab.cpp +44 -6
- package/src/llama.cpp/src/llama.cpp +1 -1
- package/src/llama.cpp/tests/CMakeLists.txt +43 -30
- package/src/llama.cpp/tests/test-arg-parser.cpp +51 -4
- package/src/llama.cpp/tests/test-backend-ops.cpp +82 -43
- package/src/llama.cpp/tests/test-chat-template.cpp +34 -13
- package/src/llama.cpp/tests/test-chat.cpp +12 -2
- package/src/llama.cpp/{examples/gbnf-validator/gbnf-validator.cpp → tests/test-gbnf-validator.cpp} +2 -2
- package/src/llama.cpp/tests/test-grammar-integration.cpp +3 -2
- package/src/llama.cpp/tests/test-grammar-llguidance.cpp +63 -2
- package/src/llama.cpp/tests/test-grammar-parser.cpp +3 -1
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +17 -1
- package/src/llama.cpp/tests/test-llama-grammar.cpp +2 -1
- package/src/llama.cpp/{examples/quantize-stats/quantize-stats.cpp → tests/test-quantize-stats.cpp} +3 -1
- package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +2 -1
- package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +2 -1
- package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/llava/gemma3-cli.cpp +0 -341
- package/src/llama.cpp/examples/llava/llava-cli.cpp +0 -332
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +0 -354
- package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +0 -6
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +0 -30
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +0 -19
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +0 -234
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +0 -197
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +0 -190
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +0 -204
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +0 -191
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +0 -218
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +0 -216
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +0 -295
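To reproduce this comparison locally, one hedged approach using only standard npm tooling (the tarball names assume npm's usual scope-flattening convention for `@fugood/llama.node`):

```bash
# Sketch: download both published versions and diff their unpacked contents.
npm pack @fugood/llama.node@0.3.16 @fugood/llama.node@0.3.17
mkdir -p v0.3.16 v0.3.17
tar -xzf fugood-llama.node-0.3.16.tgz -C v0.3.16
tar -xzf fugood-llama.node-0.3.17.tgz -C v0.3.17
diff -ru v0.3.16/package v0.3.17/package
```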
package/src/llama.cpp/docs/build.md

@@ -132,12 +132,14 @@ You may find the official downloads here: [NVIDIA developer site](https://develo
 
 
 
 #### Compile and run inside a Fedora Toolbox Container
-We also have a [guide](./
+We also have a [guide](./backend/CUDA-FEDORA.md) for setting up CUDA toolkit in a Fedora [toolbox container](https://containertoolbx.org/).
 
 **Recommended for:**
-
-
-
+- ***Necessary*** for users of [Atomic Desktops for Fedora](https://fedoraproject.org/atomic-desktops/); such as: [Silverblue](https://fedoraproject.org/atomic-desktops/silverblue/) and [Kinoite](https://fedoraproject.org/atomic-desktops/kinoite/).
+  - (there are no supported CUDA packages for these systems)
+- ***Necessary*** for users that have a host that is not a: [Supported Nvidia CUDA Release Platform](https://developer.nvidia.com/cuda-downloads).
+  - (for example, you may have [Fedora 42 Beta](https://fedoramagazine.org/announcing-fedora-linux-42-beta/) as your host operating system)
+- ***Convenient*** For those running [Fedora Workstation](https://fedoraproject.org/workstation/) or [Fedora KDE Plasma Desktop](https://fedoraproject.org/spins/kde), and want to keep their host system clean.
 - *Optionally* toolbox packages are available: [Arch Linux](https://archlinux.org/), [Red Hat Enterprise Linux >= 8.5](https://www.redhat.com/en/technologies/linux-platforms/enterprise-linux), or [Ubuntu](https://ubuntu.com/download)
 
 
@@ -189,7 +191,7 @@ The following compilation options are also available to tweak performance:
 
 | Option                        | Legal values     | Default | Description |
 |-------------------------------|------------------|---------|-------------|
-| GGML_CUDA_FORCE_MMQ           | Boolean          | false   | Force the use of custom matrix multiplication kernels for quantized models instead of FP16 cuBLAS even if there is no int8 tensor core implementation available (affects V100, RDNA3). MMQ kernels are enabled by default on GPUs with int8 tensor core support. With MMQ force enabled, speed for large batch sizes will be worse but VRAM consumption will be lower. |
+| GGML_CUDA_FORCE_MMQ           | Boolean          | false   | Force the use of custom matrix multiplication kernels for quantized models instead of FP16 cuBLAS even if there is no int8 tensor core implementation available (affects V100, CDNA and RDNA3+). MMQ kernels are enabled by default on GPUs with int8 tensor core support. With MMQ force enabled, speed for large batch sizes will be worse but VRAM consumption will be lower. |
 | GGML_CUDA_FORCE_CUBLAS        | Boolean          | false   | Force the use of FP16 cuBLAS instead of custom matrix multiplication kernels for quantized models |
 | GGML_CUDA_F16                 | Boolean          | false   | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels and for the q4_1 and q5_1 matrix matrix multiplication kernels. Can improve performance on relatively recent GPUs. |
 | GGML_CUDA_PEER_MAX_BATCH_SIZE | Positive integer | 128     | Maximum batch size for which to enable peer access between multiple GPUs. Peer access requires either Linux or NVLink. When using NVLink enabling peer access for larger batch sizes is potentially beneficial. |
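As a hedged illustration of how the options in this table are passed at configure time (a sketch only; it assumes a working CUDA toolchain and the usual `GGML_CUDA=ON` enable flag):

```bash
# Sketch: force the MMQ kernels and enable the FP16 kernels described in the table above.
cmake -B build -DGGML_CUDA=ON \
    -DGGML_CUDA_FORCE_MMQ=ON \
    -DGGML_CUDA_F16=ON
cmake --build build --config Release
```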
@@ -216,6 +218,7 @@ By default, all supported compute capabilities are enabled. To customize this be
 
 ```bash
 cmake -B build -DGGML_MUSA=ON -DMUSA_ARCHITECTURES="21"
+cmake --build build --config Release
 ```
 
 This configuration enables only compute capability `2.1` (MTT S80) during compilation, which can help reduce compilation time.
@@ -256,8 +259,6 @@ You can download it from your Linux distro's package manager or from here: [ROCm
 cmake -S . -B build -DGGML_HIP=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
     && cmake --build build --config Release -- -j 16
 ```
-On Linux it is also possible to use unified memory architecture (UMA) to share main memory between the CPU and integrated GPU by setting `-DGGML_HIP_UMA=ON`.
-However, this hurts performance for non-integrated GPUs (but enables working with integrated GPUs).
 
 To enhance flash attention performance on RDNA3+ or CDNA architectures, you can utilize the rocWMMA library by enabling the `-DGGML_HIP_ROCWMMA_FATTN=ON` option. This requires rocWMMA headers to be installed on the build system.
 
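The rocWMMA option mentioned in the hunk above slots into the same HIP configure line; the following is a hedged sketch (gfx1100 is an assumed RDNA3 target, substitute your own `AMDGPU_TARGETS`):

```bash
# Sketch: HIP build with rocWMMA-backed flash attention enabled, per the note above.
cmake -S . -B build -DGGML_HIP=ON -DGGML_HIP_ROCWMMA_FATTN=ON \
    -DAMDGPU_TARGETS=gfx1100 -DCMAKE_BUILD_TYPE=Release \
    && cmake --build build --config Release -- -j 16
```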
@@ -293,6 +294,10 @@ You can download it from your Linux distro's package manager or from here: [ROCm
 The environment variable [`HIP_VISIBLE_DEVICES`](https://rocm.docs.amd.com/en/latest/understand/gpu_isolation.html#hip-visible-devices) can be used to specify which GPU(s) will be used.
 If your GPU is not officially supported you can use the environment variable [`HSA_OVERRIDE_GFX_VERSION`] set to a similar GPU, for example 10.3.0 on RDNA2 (e.g. gfx1030, gfx1031, or gfx1035) or 11.0.0 on RDNA3.
 
+### Unified Memory
+
+On Linux it is possible to use unified memory architecture (UMA) to share main memory between the CPU and integrated GPU by setting environment variable `GGML_CUDA_ENABLE_UNIFIED_MEMORY=1`. However, this hurts performance for non-integrated GPUs (but enables working with integrated GPUs).
+
 ## Vulkan
 
 **Windows**
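A hedged usage sketch for the environment variable introduced in the hunk above (binary path, model placeholder and prompt reused from the examples later in this document):

```bash
# Sketch: opt into unified memory for a single run on an integrated GPU.
GGML_CUDA_ENABLE_UNIFIED_MEMORY=1 ./build/bin/llama-cli -m PATH_TO_MODEL -p "What is a car?"
```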
@@ -433,6 +438,116 @@ llama_new_context_with_model: CANN compute buffer size = 1260.81 MiB
 
 For detailed info, such as model/device supports, CANN install, please refer to [llama.cpp for CANN](./backend/CANN.md).
 
+## Arm® KleidiAI™
+KleidiAI is a library of optimized microkernels for AI workloads, specifically designed for Arm CPUs. These microkernels enhance performance and can be enabled for use by the CPU backend.
+
+To enable KleidiAI, go to the llama.cpp directory and build using CMake
+```bash
+cmake -B build -DGGML_CPU_KLEIDIAI=ON
+cmake --build build --config Release
+```
+You can verify that KleidiAI is being used by running
+```bash
+./build/bin/llama-cli -m PATH_TO_MODEL -p "What is a car?"
+```
+If KleidiAI is enabled, the output will contain a line similar to:
+```
+load_tensors: CPU_KLEIDIAI model buffer size = 3474.00 MiB
+```
+KleidiAI's microkernels implement optimized tensor operations using Arm CPU features such as dotprod, int8mm and SME. llama.cpp selects the most efficient kernel based on runtime CPU feature detection. However, on platforms that support SME, you must manually enable SME microkernels by setting the environment variable `GGML_KLEIDIAI_SME=1`.
+
+Depending on your build target, other higher priority backends may be enabled by default. To ensure the CPU backend is used, you must disable the higher priority backends either at compile time, e.g. -DGGML_METAL=OFF, or during run-time using the command line option `--device none`.
+
+## OpenCL
+
+This provides GPU acceleration through OpenCL on recent Adreno GPUs.
+More information about the OpenCL backend can be found in [OPENCL.md](./backend/OPENCL.md).
+
+### Android
+
+Assume NDK is available in `$ANDROID_NDK`. First, install OpenCL headers and ICD loader library if not available,
+
+```sh
+mkdir -p ~/dev/llm
+cd ~/dev/llm
+
+git clone https://github.com/KhronosGroup/OpenCL-Headers && \
+cd OpenCL-Headers && \
+cp -r CL $ANDROID_NDK/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/include
+
+cd ~/dev/llm
+
+git clone https://github.com/KhronosGroup/OpenCL-ICD-Loader && \
+cd OpenCL-ICD-Loader && \
+mkdir build_ndk && cd build_ndk && \
+cmake .. -G Ninja -DCMAKE_BUILD_TYPE=Release \
+  -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \
+  -DOPENCL_ICD_LOADER_HEADERS_DIR=$ANDROID_NDK/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/include \
+  -DANDROID_ABI=arm64-v8a \
+  -DANDROID_PLATFORM=24 \
+  -DANDROID_STL=c++_shared && \
+ninja && \
+cp libOpenCL.so $ANDROID_NDK/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/lib/aarch64-linux-android
+```
+
+Then build llama.cpp with OpenCL enabled,
+
+```sh
+cd ~/dev/llm
+
+git clone https://github.com/ggml-org/llama.cpp && \
+cd llama.cpp && \
+mkdir build-android && cd build-android
+
+cmake .. -G Ninja \
+  -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \
+  -DANDROID_ABI=arm64-v8a \
+  -DANDROID_PLATFORM=android-28 \
+  -DBUILD_SHARED_LIBS=OFF \
+  -DGGML_OPENCL=ON
+
+ninja
+```
+
+### Windows Arm64
+
+First, install OpenCL headers and ICD loader library if not available,
+
+```powershell
+mkdir -p ~/dev/llm
+
+cd ~/dev/llm
+git clone https://github.com/KhronosGroup/OpenCL-Headers && cd OpenCL-Headers
+mkdir build && cd build
+cmake .. -G Ninja `
+  -DBUILD_TESTING=OFF `
+  -DOPENCL_HEADERS_BUILD_TESTING=OFF `
+  -DOPENCL_HEADERS_BUILD_CXX_TESTS=OFF `
+  -DCMAKE_INSTALL_PREFIX="$HOME/dev/llm/opencl"
+cmake --build . --target install
+
+cd ~/dev/llm
+git clone https://github.com/KhronosGroup/OpenCL-ICD-Loader && cd OpenCL-ICD-Loader
+mkdir build && cd build
+cmake .. -G Ninja `
+  -DCMAKE_BUILD_TYPE=Release `
+  -DCMAKE_PREFIX_PATH="$HOME/dev/llm/opencl" `
+  -DCMAKE_INSTALL_PREFIX="$HOME/dev/llm/opencl"
+cmake --build . --target install
+```
+
+Then build llama.cpp with OpenCL enabled,
+
+```powershell
+cmake .. -G Ninja `
+  -DCMAKE_TOOLCHAIN_FILE="$HOME/dev/llm/llama.cpp/cmake/arm64-windows-llvm.cmake" `
+  -DCMAKE_BUILD_TYPE=Release `
+  -DCMAKE_PREFIX_PATH="$HOME/dev/llm/opencl" `
+  -DBUILD_SHARED_LIBS=OFF `
+  -DGGML_OPENCL=ON
+ninja
+```
+
 ## Android
 
 To read documentation for how to build on Android, [click here](./android.md)
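For the KleidiAI section in the hunk above, a hedged run sketch that combines the documented environment variable with the `--device none` option (the model path is a placeholder):

```bash
# Sketch: force the CPU backend and enable the SME microkernels described above.
GGML_KLEIDIAI_SME=1 ./build/bin/llama-cli -m PATH_TO_MODEL -p "What is a car?" --device none
```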
package/src/llama.cpp/examples/CMakeLists.txt

@@ -21,11 +21,6 @@ else()
     add_subdirectory(embedding)
     add_subdirectory(eval-callback)
 
-    if (NOT WIN32)
-        # disabled on Windows because it uses internal functions not exported with LLAMA_API
-        add_subdirectory(gbnf-validator)
-    endif()
-
     add_subdirectory(gguf-hash)
     add_subdirectory(gguf-split)
     add_subdirectory(gguf)
@@ -58,10 +53,6 @@ else()
     add_subdirectory(convert-llama2c-to-ggml)
     add_subdirectory(cvector-generator)
     add_subdirectory(export-lora)
-    if (NOT WIN32)
-        # disabled on Windows because it uses internal functions not exported with LLAMA_API
-        add_subdirectory(quantize-stats)
-    endif()
     add_subdirectory(llava)
     if (GGML_RPC)
         add_subdirectory(rpc)
package/src/llama.cpp/examples/batched/batched.cpp

@@ -41,7 +41,7 @@ int main(int argc, char ** argv) {
 
     llama_model_params model_params = common_model_params_to_llama(params);
 
-    llama_model * model = llama_model_load_from_file(params.model.c_str(), model_params);
+    llama_model * model = llama_model_load_from_file(params.model.path.c_str(), model_params);
 
     if (model == NULL) {
         LOG_ERR("%s: error: unable to load model\n" , __func__);
package/src/llama.cpp/examples/batched-bench/batched-bench.cpp

@@ -38,7 +38,7 @@ int main(int argc, char ** argv) {
 
     llama_model_params model_params = common_model_params_to_llama(params);
 
-    llama_model * model = llama_model_load_from_file(params.model.c_str(), model_params);
+    llama_model * model = llama_model_load_from_file(params.model.path.c_str(), model_params);
 
     if (model == NULL) {
         fprintf(stderr , "%s: error: unable to load model\n" , __func__);
package/src/llama.cpp/examples/embedding/embedding.cpp

@@ -89,6 +89,13 @@ int main(int argc, char ** argv) {
     common_init();
 
     params.embedding = true;
+
+    // utilize the full context
+    if (params.n_batch < params.n_ctx) {
+        LOG_WRN("%s: setting batch size to %d\n", __func__, params.n_ctx);
+        params.n_batch = params.n_ctx;
+    }
+
     // For non-causal models, batch size must be equal to ubatch size
     params.n_ubatch = params.n_batch;
 
@@ -134,7 +141,6 @@ int main(int argc, char ** argv) {
 
     // max batch size
     const uint64_t n_batch = params.n_batch;
-    GGML_ASSERT(params.n_batch >= params.n_ctx);
 
     // tokenize the prompts and trim
     std::vector<std::vector<int32_t>> inputs;
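With the change above, llama-embedding raises `n_batch` to the context size on its own, so a plain invocation like the following hedged sketch (standard llama.cpp flags assumed) no longer needs an explicit `-b` at least as large as `-c`:

```bash
# Sketch: the batch size is bumped to the context size automatically by the code above.
./build/bin/llama-embedding -m PATH_TO_MODEL -c 4096 -p "Hello world"
```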
package/src/llama.cpp/examples/export-lora/export-lora.cpp

@@ -421,7 +421,7 @@ int main(int argc, char ** argv) {
 
     g_verbose = (params.verbosity > 1);
     try {
-        lora_merge_ctx ctx(params.model, params.lora_adapters, params.out_file, params.cpuparams.n_threads);
+        lora_merge_ctx ctx(params.model.path, params.lora_adapters, params.out_file, params.cpuparams.n_threads);
         ctx.run_merge();
     } catch (const std::exception & err) {
         fprintf(stderr, "%s\n", err.what());
package/src/llama.cpp/examples/gguf-split/gguf-split.cpp

@@ -408,8 +408,6 @@ static void gguf_merge(const split_params & split_params) {
         exit(EXIT_FAILURE);
     }
 
-    std::ofstream fout(split_params.output.c_str(), std::ios::binary);
-    fout.exceptions(std::ofstream::failbit); // fail fast on write errors
 
     auto * ctx_out = gguf_init_empty();
 
@@ -453,7 +451,6 @@ static void gguf_merge(const split_params & split_params) {
                 gguf_free(ctx_gguf);
                 ggml_free(ctx_meta);
                 gguf_free(ctx_out);
-                fout.close();
                 exit(EXIT_FAILURE);
             }
 
@@ -466,7 +463,6 @@ static void gguf_merge(const split_params & split_params) {
                 gguf_free(ctx_gguf);
                 ggml_free(ctx_meta);
                 gguf_free(ctx_out);
-                fout.close();
                 exit(EXIT_FAILURE);
             }
 
@@ -479,7 +475,6 @@ static void gguf_merge(const split_params & split_params) {
                 gguf_free(ctx_gguf);
                 ggml_free(ctx_meta);
                 gguf_free(ctx_out);
-                fout.close();
                 exit(EXIT_FAILURE);
             }
 
@@ -500,9 +495,11 @@ static void gguf_merge(const split_params & split_params) {
 
         fprintf(stderr, "\033[3Ddone\n");
     }
-
-
-
+    std::ofstream fout;
+    if (!split_params.dry_run) {
+        fout.open(split_params.output.c_str(), std::ios::binary);
+        fout.exceptions(std::ofstream::failbit); // fail fast on write errors
+        // placeholder for the meta data
         auto meta_size = gguf_get_meta_size(ctx_out);
         ::zeros(fout, meta_size);
     }
@@ -518,7 +515,9 @@ static void gguf_merge(const split_params & split_params) {
             ggml_free(ctx_metas[i]);
         }
         gguf_free(ctx_out);
-
+        if (!split_params.dry_run) {
+            fout.close();
+        }
         exit(EXIT_FAILURE);
     }
     fprintf(stderr, "%s: writing tensors %s ...", __func__, split_path);
@@ -540,10 +539,11 @@ static void gguf_merge(const split_params & split_params) {
             auto offset = gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, i_tensor);
             f_input.seekg(offset);
             f_input.read((char *)read_data.data(), n_bytes);
-
-
-
-
+            if (!split_params.dry_run) {
+                // write tensor data + padding
+                fout.write((const char *)read_data.data(), n_bytes);
+                zeros(fout, GGML_PAD(n_bytes, GGUF_DEFAULT_ALIGNMENT) - n_bytes);
+            }
         }
 
         gguf_free(ctx_gguf);
@@ -552,16 +552,15 @@ static void gguf_merge(const split_params & split_params) {
         fprintf(stderr, "\033[3Ddone\n");
     }
 
-    {
+    if (!split_params.dry_run) {
         // go back to beginning of file and write the updated metadata
         fout.seekp(0);
         std::vector<uint8_t> data(gguf_get_meta_size(ctx_out));
         gguf_get_meta_data(ctx_out, data.data());
         fout.write((const char *)data.data(), data.size());
-
         fout.close();
-        gguf_free(ctx_out);
     }
+    gguf_free(ctx_out);
 
     fprintf(stderr, "%s: %s merged from %d split with %d tensors.\n",
             __func__, split_params.output.c_str(), n_split, total_tensors);
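The hunks above make the merge path honor `split_params.dry_run`. A hedged command sketch (the `--merge` and `--dry-run` flag names are assumed from the tool's existing usage; file names are placeholders):

```bash
# Sketch: exercise the merge code path without writing the output file.
./build/bin/llama-gguf-split --merge --dry-run model-00001-of-00003.gguf model-merged.gguf
```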
package/src/llama.cpp/examples/gritlm/gritlm.cpp

@@ -168,7 +168,7 @@ int main(int argc, char * argv[]) {
 
     llama_backend_init();
 
-    llama_model * model = llama_model_load_from_file(params.model.c_str(), mparams);
+    llama_model * model = llama_model_load_from_file(params.model.path.c_str(), mparams);
 
     // create generation context
     llama_context * ctx = llama_init_from_model(model, cparams);