@fugood/llama.node 0.3.1 → 0.3.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +1 -8
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/package.json +4 -2
- package/src/DetokenizeWorker.cpp +1 -1
- package/src/EmbeddingWorker.cpp +2 -2
- package/src/LlamaCompletionWorker.cpp +10 -10
- package/src/LlamaCompletionWorker.h +2 -2
- package/src/LlamaContext.cpp +14 -17
- package/src/TokenizeWorker.cpp +1 -1
- package/src/common.hpp +5 -4
- package/src/llama.cpp/.github/workflows/build.yml +137 -29
- package/src/llama.cpp/.github/workflows/close-issue.yml +5 -0
- package/src/llama.cpp/.github/workflows/docker.yml +46 -34
- package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +7 -0
- package/src/llama.cpp/.github/workflows/nix-ci.yml +7 -0
- package/src/llama.cpp/.github/workflows/python-check-requirements.yml +2 -4
- package/src/llama.cpp/.github/workflows/python-type-check.yml +3 -1
- package/src/llama.cpp/.github/workflows/server.yml +7 -0
- package/src/llama.cpp/CMakeLists.txt +26 -11
- package/src/llama.cpp/cmake/arm64-apple-clang.cmake +16 -0
- package/src/llama.cpp/common/CMakeLists.txt +10 -10
- package/src/llama.cpp/common/arg.cpp +2041 -0
- package/src/llama.cpp/common/arg.h +77 -0
- package/src/llama.cpp/common/common.cpp +523 -1861
- package/src/llama.cpp/common/common.h +234 -106
- package/src/llama.cpp/common/console.cpp +3 -0
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +1 -1
- package/src/llama.cpp/common/log.cpp +401 -0
- package/src/llama.cpp/common/log.h +66 -698
- package/src/llama.cpp/common/ngram-cache.cpp +39 -36
- package/src/llama.cpp/common/ngram-cache.h +19 -19
- package/src/llama.cpp/common/sampling.cpp +356 -350
- package/src/llama.cpp/common/sampling.h +62 -139
- package/src/llama.cpp/common/stb_image.h +5990 -6398
- package/src/llama.cpp/docs/build.md +72 -17
- package/src/llama.cpp/examples/CMakeLists.txt +1 -2
- package/src/llama.cpp/examples/batched/batched.cpp +49 -65
- package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +42 -53
- package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +55 -52
- package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +22 -22
- package/src/llama.cpp/examples/cvector-generator/pca.hpp +3 -13
- package/src/llama.cpp/examples/embedding/embedding.cpp +147 -91
- package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +37 -37
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +39 -38
- package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +14 -39
- package/src/llama.cpp/examples/{baby-llama → gen-docs}/CMakeLists.txt +2 -2
- package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +83 -0
- package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +58 -39
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +46 -39
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +75 -69
- package/src/llama.cpp/examples/infill/infill.cpp +131 -192
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +276 -178
- package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +40 -36
- package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
- package/src/llama.cpp/examples/llava/clip.cpp +686 -150
- package/src/llama.cpp/examples/llava/clip.h +11 -2
- package/src/llama.cpp/examples/llava/llava-cli.cpp +60 -71
- package/src/llama.cpp/examples/llava/llava.cpp +146 -26
- package/src/llama.cpp/examples/llava/llava.h +2 -3
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +323 -0
- package/src/llama.cpp/examples/llava/requirements.txt +1 -0
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +55 -56
- package/src/llama.cpp/examples/lookup/lookup-create.cpp +15 -13
- package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
- package/src/llama.cpp/examples/lookup/lookup-stats.cpp +34 -33
- package/src/llama.cpp/examples/lookup/lookup.cpp +60 -63
- package/src/llama.cpp/examples/main/main.cpp +216 -313
- package/src/llama.cpp/examples/parallel/parallel.cpp +58 -59
- package/src/llama.cpp/examples/passkey/passkey.cpp +53 -61
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +277 -311
- package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/quantize/quantize.cpp +27 -9
- package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +12 -12
- package/src/llama.cpp/examples/retrieval/retrieval.cpp +57 -52
- package/src/llama.cpp/examples/rpc/rpc-server.cpp +27 -2
- package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +60 -46
- package/src/llama.cpp/examples/server/CMakeLists.txt +7 -18
- package/src/llama.cpp/examples/server/server.cpp +1347 -1531
- package/src/llama.cpp/examples/server/tests/requirements.txt +2 -1
- package/src/llama.cpp/examples/server/utils.hpp +396 -107
- package/src/llama.cpp/examples/simple/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/simple/simple.cpp +132 -106
- package/src/llama.cpp/examples/simple-chat/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +197 -0
- package/src/llama.cpp/examples/speculative/speculative.cpp +153 -124
- package/src/llama.cpp/examples/sycl/run-llama2.sh +10 -19
- package/src/llama.cpp/examples/sycl/win-run-llama2.bat +1 -1
- package/src/llama.cpp/examples/tokenize/tokenize.cpp +27 -29
- package/src/llama.cpp/ggml/CMakeLists.txt +29 -12
- package/src/llama.cpp/ggml/include/ggml-alloc.h +3 -3
- package/src/llama.cpp/ggml/include/ggml-amx.h +25 -0
- package/src/llama.cpp/ggml/include/ggml-backend.h +166 -68
- package/src/llama.cpp/ggml/include/ggml-blas.h +5 -3
- package/src/llama.cpp/ggml/include/ggml-cann.h +17 -19
- package/src/llama.cpp/ggml/include/ggml-cpp.h +38 -0
- package/src/llama.cpp/ggml/include/ggml-cpu.h +177 -0
- package/src/llama.cpp/ggml/include/ggml-cuda.h +17 -17
- package/src/llama.cpp/ggml/include/ggml-kompute.h +7 -3
- package/src/llama.cpp/ggml/include/ggml-metal.h +13 -12
- package/src/llama.cpp/ggml/include/ggml-opt.h +216 -0
- package/src/llama.cpp/ggml/include/ggml-rpc.h +9 -5
- package/src/llama.cpp/ggml/include/ggml-sycl.h +18 -11
- package/src/llama.cpp/ggml/include/ggml-vulkan.h +10 -8
- package/src/llama.cpp/ggml/include/ggml.h +272 -505
- package/src/llama.cpp/ggml/src/CMakeLists.txt +69 -1110
- package/src/llama.cpp/ggml/src/ggml-aarch64.c +52 -2116
- package/src/llama.cpp/ggml/src/ggml-aarch64.h +0 -20
- package/src/llama.cpp/ggml/src/ggml-alloc.c +29 -27
- package/src/llama.cpp/ggml/src/ggml-amx/CMakeLists.txt +107 -0
- package/src/llama.cpp/ggml/src/ggml-amx/common.h +94 -0
- package/src/llama.cpp/ggml/src/ggml-amx/ggml-amx.cpp +446 -0
- package/src/llama.cpp/ggml/src/ggml-amx/mmq.cpp +2510 -0
- package/src/llama.cpp/ggml/src/ggml-amx/mmq.h +17 -0
- package/src/llama.cpp/ggml/src/ggml-backend-impl.h +144 -81
- package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +195 -0
- package/src/llama.cpp/ggml/src/{ggml-backend.c → ggml-backend.cpp} +394 -635
- package/src/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +91 -0
- package/src/llama.cpp/ggml/src/{ggml-blas.cpp → ggml-blas/ggml-blas.cpp} +217 -70
- package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +46 -0
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +4 -27
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +32 -4
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +179 -41
- package/src/llama.cpp/ggml/src/ggml-cann/common.h +1 -0
- package/src/llama.cpp/ggml/src/{ggml-cann.cpp → ggml-cann/ggml-cann.cpp} +458 -353
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -1
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +2 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +278 -0
- package/src/llama.cpp/ggml/src/ggml-common.h +20 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +261 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.c +3560 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +30 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +371 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +10822 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.h +63 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +13970 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +663 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +1885 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +155 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +14 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +178 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +134 -0
- package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +106 -0
- package/src/llama.cpp/ggml/src/ggml-impl.h +380 -584
- package/src/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +162 -0
- package/src/llama.cpp/ggml/src/{ggml-kompute.cpp → ggml-kompute/ggml-kompute.cpp} +233 -87
- package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +108 -0
- package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +249 -0
- package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +100 -0
- package/src/llama.cpp/ggml/src/ggml-opt.cpp +867 -0
- package/src/llama.cpp/ggml/src/ggml-quants.c +369 -9994
- package/src/llama.cpp/ggml/src/ggml-quants.h +78 -110
- package/src/llama.cpp/ggml/src/ggml-rpc/CMakeLists.txt +11 -0
- package/src/llama.cpp/ggml/src/{ggml-rpc.cpp → ggml-rpc/ggml-rpc.cpp} +560 -335
- package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +81 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +6 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +51 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +310 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +1 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/conv.cpp +99 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/conv.hpp +21 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +57 -57
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +106 -106
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +4 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +18 -25
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +1011 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +76 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +101 -0
- package/src/llama.cpp/ggml/src/{ggml-sycl.cpp → ggml-sycl/ggml-sycl.cpp} +3350 -3980
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +125 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +23 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +70 -68
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +9 -6
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +56 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.hpp +11 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +8 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +71 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.hpp +21 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +4 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +138 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +10 -0
- package/src/llama.cpp/ggml/src/ggml-threading.cpp +12 -0
- package/src/llama.cpp/ggml/src/ggml-threading.h +12 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +78 -0
- package/src/llama.cpp/ggml/src/{ggml-vulkan.cpp → ggml-vulkan/ggml-vulkan.cpp} +2034 -1718
- package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/CMakeLists.txt +2 -0
- package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/vulkan-shaders-gen.cpp +152 -185
- package/src/llama.cpp/ggml/src/ggml.c +2075 -16579
- package/src/llama.cpp/include/llama.h +296 -285
- package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.out +46 -0
- package/src/llama.cpp/pocs/vdot/q8dot.cpp +4 -3
- package/src/llama.cpp/pocs/vdot/vdot.cpp +8 -7
- package/src/llama.cpp/requirements/requirements-convert_legacy_llama.txt +1 -1
- package/src/llama.cpp/src/CMakeLists.txt +2 -1
- package/src/llama.cpp/src/llama-grammar.cpp +721 -122
- package/src/llama.cpp/src/llama-grammar.h +120 -15
- package/src/llama.cpp/src/llama-impl.h +156 -1
- package/src/llama.cpp/src/llama-sampling.cpp +2058 -346
- package/src/llama.cpp/src/llama-sampling.h +39 -47
- package/src/llama.cpp/src/llama-vocab.cpp +390 -127
- package/src/llama.cpp/src/llama-vocab.h +60 -20
- package/src/llama.cpp/src/llama.cpp +6215 -3263
- package/src/llama.cpp/src/unicode-data.cpp +6 -4
- package/src/llama.cpp/src/unicode-data.h +4 -4
- package/src/llama.cpp/src/unicode.cpp +15 -7
- package/src/llama.cpp/tests/CMakeLists.txt +4 -2
- package/src/llama.cpp/tests/test-arg-parser.cpp +131 -0
- package/src/llama.cpp/tests/test-backend-ops.cpp +1725 -297
- package/src/llama.cpp/tests/test-barrier.cpp +94 -0
- package/src/llama.cpp/tests/test-chat-template.cpp +9 -5
- package/src/llama.cpp/tests/test-grammar-integration.cpp +23 -38
- package/src/llama.cpp/tests/test-grammar-parser.cpp +6 -4
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +23 -8
- package/src/llama.cpp/tests/test-llama-grammar.cpp +9 -8
- package/src/llama.cpp/tests/test-log.cpp +39 -0
- package/src/llama.cpp/tests/test-opt.cpp +853 -142
- package/src/llama.cpp/tests/test-quantize-fns.cpp +28 -19
- package/src/llama.cpp/tests/test-quantize-perf.cpp +16 -14
- package/src/llama.cpp/tests/test-rope.cpp +2 -1
- package/src/llama.cpp/tests/test-sampling.cpp +226 -142
- package/src/llama.cpp/tests/test-tokenizer-0.cpp +56 -36
- package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +5 -5
- package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +5 -5
- package/patches/llama.patch +0 -22
- package/src/llama.cpp/.github/workflows/bench.yml +0 -310
- package/src/llama.cpp/common/grammar-parser.cpp +0 -536
- package/src/llama.cpp/common/grammar-parser.h +0 -29
- package/src/llama.cpp/common/train.cpp +0 -1513
- package/src/llama.cpp/common/train.h +0 -233
- package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +0 -1640
- package/src/llama.cpp/examples/benchmark/CMakeLists.txt +0 -6
- package/src/llama.cpp/examples/benchmark/benchmark-matmult.cpp +0 -275
- package/src/llama.cpp/ggml/src/llamafile/sgemm.cpp +0 -1027
- package/src/llama.cpp/tests/test-grad0.cpp +0 -1566
- /package/src/llama.cpp/ggml/{cmake → src/ggml-cpu/cmake}/FindSIMD.cmake +0 -0
- /package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.h +0 -0
package/src/llama.cpp/docs/build.md

@@ -178,22 +178,24 @@ For Jetson user, if you have Jetson Orin, you can try this: [Offical Support](ht
 cmake --build build --config Release
 ```
 
-The environment variable [`CUDA_VISIBLE_DEVICES`](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars) can be used to specify which GPU(s) will be used.
+The environment variable [`CUDA_VISIBLE_DEVICES`](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars) can be used to specify which GPU(s) will be used.
+
+The environment variable `GGML_CUDA_ENABLE_UNIFIED_MEMORY=1` can be used to enable unified memory in Linux. This allows swapping to system RAM instead of crashing when the GPU VRAM is exhausted. In Windows this setting is available in the NVIDIA control panel as `System Memory Fallback`.
+
+The following compilation options are also available to tweak performance:
 
 | Option | Legal values | Default | Description |
 |-------------------------------|------------------------|---------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
-| GGML_CUDA_FORCE_DMMV | Boolean | false | Force the use of dequantization + matrix vector multiplication kernels instead of using kernels that do matrix vector multiplication on quantized data. By default the decision is made based on compute capability (MMVQ for 6.1/Pascal/GTX 1000 or higher). Does not affect k-quants. |
-| GGML_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the CUDA dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
-| GGML_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the CUDA mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. |
 | GGML_CUDA_FORCE_MMQ | Boolean | false | Force the use of custom matrix multiplication kernels for quantized models instead of FP16 cuBLAS even if there is no int8 tensor core implementation available (affects V100, RDNA3). MMQ kernels are enabled by default on GPUs with int8 tensor core support. With MMQ force enabled, speed for large batch sizes will be worse but VRAM consumption will be lower. |
 | GGML_CUDA_FORCE_CUBLAS | Boolean | false | Force the use of FP16 cuBLAS instead of custom matrix multiplication kernels for quantized models |
 | GGML_CUDA_F16 | Boolean | false | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels and for the q4_1 and q5_1 matrix matrix multiplication kernels. Can improve performance on relatively recent GPUs. |
-| GGML_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |
 | GGML_CUDA_PEER_MAX_BATCH_SIZE | Positive integer | 128 | Maximum batch size for which to enable peer access between multiple GPUs. Peer access requires either Linux or NVLink. When using NVLink enabling peer access for larger batch sizes is potentially beneficial. |
 | GGML_CUDA_FA_ALL_QUANTS | Boolean | false | Compile support for all KV cache quantization type (combinations) for the FlashAttention CUDA kernels. More fine-grained control over KV cache size but compilation takes much longer. |
 
 ### MUSA
 
+This provides GPU acceleration using the MUSA cores of your Moore Threads MTT GPU. Make sure to have the MUSA SDK installed. You can download it from here: [MUSA SDK](https://developer.mthreads.com/sdk/download/musa).
+
 - Using `make`:
 ```bash
 make GGML_MUSA=1

@@ -205,6 +207,12 @@ The environment variable [`CUDA_VISIBLE_DEVICES`](https://docs.nvidia.com/cuda/c
 cmake --build build --config Release
 ```
 
+The environment variable [`MUSA_VISIBLE_DEVICES`](https://docs.mthreads.com/musa-sdk/musa-sdk-doc-online/programming_guide/Z%E9%99%84%E5%BD%95/) can be used to specify which GPU(s) will be used.
+
+The environment variable `GGML_CUDA_ENABLE_UNIFIED_MEMORY=1` can be used to enable unified memory in Linux. This allows swapping to system RAM instead of crashing when the GPU VRAM is exhausted.
+
+Most of the compilation options available for CUDA should also be available for MUSA, though they haven't been thoroughly tested yet.
+
 ### hipBLAS
 
 This provides BLAS acceleration on HIP-supported AMD GPUs.

@@ -218,7 +226,7 @@ You can download it from your Linux distro's package manager or from here: [ROCm
 - Using `CMake` for Linux (assuming a gfx1030-compatible AMD GPU):
 ```bash
 HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -R)" \
-    cmake -S . -B build -
+    cmake -S . -B build -DGGML_HIP=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
     && cmake --build build --config Release -- -j 16
 ```
 On Linux it is also possible to use unified memory architecture (UMA) to share main memory between the CPU and integrated GPU by setting `-DGGML_HIP_UMA=ON`.

@@ -235,7 +243,7 @@ You can download it from your Linux distro's package manager or from here: [ROCm
 ```bash
 HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -p)" \
 HIP_DEVICE_LIB_PATH=<directory-you-just-found> \
-    cmake -S . -B build -
+    cmake -S . -B build -DGGML_HIP=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
     && cmake --build build -- -j 16
 ```
 

@@ -247,7 +255,7 @@ You can download it from your Linux distro's package manager or from here: [ROCm
 - Using `CMake` for Windows (using x64 Native Tools Command Prompt for VS, and assuming a gfx1100-compatible AMD GPU):
 ```bash
 set PATH=%HIP_PATH%\bin;%PATH%
-cmake -S . -B build -G Ninja -DAMDGPU_TARGETS=gfx1100 -
+cmake -S . -B build -G Ninja -DAMDGPU_TARGETS=gfx1100 -DGGML_HIP=ON -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_BUILD_TYPE=Release
 cmake --build build
 ```
 Make sure that `AMDGPU_TARGETS` is set to the GPU arch you want to compile for. The above example uses `gfx1100` that corresponds to Radeon RX 7900XTX/XT/GRE. You can find a list of targets [here](https://llvm.org/docs/AMDGPUUsage.html#processors)

@@ -256,13 +264,6 @@ You can download it from your Linux distro's package manager or from here: [ROCm
 
 The environment variable [`HIP_VISIBLE_DEVICES`](https://rocm.docs.amd.com/en/latest/understand/gpu_isolation.html#hip-visible-devices) can be used to specify which GPU(s) will be used.
 If your GPU is not officially supported you can use the environment variable [`HSA_OVERRIDE_GFX_VERSION`] set to a similar GPU, for example 10.3.0 on RDNA2 (e.g. gfx1030, gfx1031, or gfx1035) or 11.0.0 on RDNA3.
-The following compilation options are also available to tweak performance (yes, they refer to CUDA, not HIP, because it uses the same code as the cuBLAS version above):
-
-| Option | Legal values | Default | Description |
-|------------------------|------------------------|---------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
-| GGML_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the HIP dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
-| GGML_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the HIP mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. Does not affect k-quants. |
-| GGML_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per HIP thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |
 
 ### Vulkan
 

@@ -270,9 +271,9 @@ The following compilation options are also available to tweak performance (yes,
 
 #### w64devkit
 
-Download and extract [w64devkit](https://github.com/skeeto/w64devkit/releases).
+Download and extract [`w64devkit`](https://github.com/skeeto/w64devkit/releases).
 
-Download and install the [Vulkan SDK](https://vulkan.lunarg.com/sdk/home#windows)
+Download and install the [`Vulkan SDK`](https://vulkan.lunarg.com/sdk/home#windows) with the default settings.
 
 Launch `w64devkit.exe` and run the following commands to copy Vulkan dependencies:
 ```sh

@@ -290,6 +291,29 @@ EOF
 ```
 Switch into the `llama.cpp` directory and run `make GGML_VULKAN=1`.
 
+#### Git Bash MINGW64
+
+Download and install [`Git-SCM`](https://git-scm.com/downloads/win) with the default settings
+
+Download and install [`Visual Studio Community Edition`](https://visualstudio.microsoft.com/) and make sure you select `C++`
+
+Download and install [`CMake`](https://cmake.org/download/) with the default settings
+
+Download and install the [`Vulkan SDK`](https://vulkan.lunarg.com/sdk/home#windows) with the default settings.
+
+Go into your `llama.cpp` directory and right click, select `Open Git Bash Here` and then run the following commands
+
+```
+cmake -B build -DGGML_VULKAN=ON
+cmake --build build --config Release
+```
+
+Now you can load the model in conversation mode using `Vulkan`
+
+```
+build/bin/release/llama-cli -m "[PATH TO MODEL]" -ngl 100 -c 16384 -t 10 -n -2 -cnv
+```
+
 #### MSYS2
 Install [MSYS2](https://www.msys2.org/) and then run the following commands in a UCRT terminal to install dependencies.
 ```sh

@@ -348,6 +372,37 @@ cmake --build build --config Release
 # ggml_vulkan: Using Intel(R) Graphics (ADL GT2) | uma: 1 | fp16: 1 | warp size: 32
 ```
 
+### CANN
+This provides NPU acceleration using the AI cores of your Ascend NPU. And [CANN](https://www.hiascend.com/en/software/cann) is a hierarchical APIs to help you to quickly build AI applications and service based on Ascend NPU.
+
+For more information about Ascend NPU in [Ascend Community](https://www.hiascend.com/en/).
+
+Make sure to have the CANN toolkit installed. You can download it from here: [CANN Toolkit](https://www.hiascend.com/developer/download/community/result?module=cann)
+
+Go to `llama.cpp` directory and build using CMake.
+```bash
+cmake -B build -DGGML_CANN=on -DCMAKE_BUILD_TYPE=release
+cmake --build build --config release
+```
+
+You can test with:
+
+`./build/bin/llama-cli -m PATH_TO_MODEL -p "Building a website can be done in 10 steps:" -ngl 32`
+
+If the fllowing info is output on screen, you are using `llama.cpp by CANN backend`:
+```bash
+llm_load_tensors: CANN buffer size = 13313.00 MiB
+llama_new_context_with_model: CANN compute buffer size = 1260.81 MiB
+```
+
+For detailed info, such as model/device supports, CANN install, please refer to [llama.cpp for CANN](./backend/CANN.md).
+
 ### Android
 
 To read documentation for how to build on Android, [click here](./android.md)
+
+### Arm CPU optimized mulmat kernels
+
+Llama.cpp includes a set of optimized mulmat kernels for the Arm architecture, leveraging Arm® Neon™, int8mm and SVE instructions. These kernels are enabled at build time through the appropriate compiler cpu-type flags, such as `-DCMAKE_C_FLAGS=-march=armv8.2a+i8mm+sve`. Note that these optimized kernels require the model to be quantized into one of the formats: `Q4_0_4_4` (Arm Neon), `Q4_0_4_8` (int8mm) or `Q4_0_8_8` (SVE). The SVE mulmat kernel specifically requires a vector width of 256 bits. When running on devices with a different vector width, it is recommended to use the `Q4_0_4_8` (int8mm) or `Q4_0_4_4` (Arm Neon) formats for better performance. Refer to [examples/quantize/README.md](../examples/quantize/README.md) for more information on the quantization formats.
+
+To support `Q4_0_4_4`, you must build with `GGML_NO_LLAMAFILE=1` (`make`) or `-DGGML_LLAMAFILE=OFF` (`cmake`).
package/src/llama.cpp/examples/CMakeLists.txt

@@ -13,10 +13,8 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR})
 if (EMSCRIPTEN)
 else()
     add_subdirectory(cvector-generator)
-    add_subdirectory(baby-llama)
     add_subdirectory(batched-bench)
     add_subdirectory(batched)
-    add_subdirectory(benchmark)
     add_subdirectory(convert-llama2c-to-ggml)
     add_subdirectory(embedding)
     add_subdirectory(eval-callback)

@@ -50,6 +48,7 @@ else()
     endif()
     add_subdirectory(save-load-state)
     add_subdirectory(simple)
+    add_subdirectory(simple-chat)
     add_subdirectory(speculative)
     add_subdirectory(tokenize)
 endif()
package/src/llama.cpp/examples/batched/batched.cpp

@@ -1,31 +1,30 @@
+#include "arg.h"
 #include "common.h"
+#include "log.h"
 #include "llama.h"
 
 #include <algorithm>
-#include <cmath>
 #include <cstdio>
 #include <string>
 #include <vector>
 
-static void print_usage(int
-
-
-
-    LOG_TEE("\n %s -m model.gguf -p \"Hello my name is\" -n 32 -np 4\n", argv[0]);
-    LOG_TEE("\n");
+static void print_usage(int, char ** argv) {
+    LOG("\nexample usage:\n");
+    LOG("\n %s -m model.gguf -p \"Hello my name is\" -n 32 -np 4\n", argv[0]);
+    LOG("\n");
 }
 
 int main(int argc, char ** argv) {
-
+    common_params params;
 
     params.prompt = "Hello my name is";
     params.n_predict = 32;
 
-    if (!
-        print_usage(argc, argv, params);
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON, print_usage)) {
         return 1;
     }
 
+    common_init();
 
     // number of parallel batches
     int n_parallel = params.n_parallel;
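The hunk above shows the new entry-point pattern used across the updated examples: a `common_params` struct parsed with `common_params_parse()` plus a `common_init()` call, replacing the old `gpt_params` path. A minimal sketch of that skeleton, using only names that appear in this diff (signatures should be checked against `common/arg.h` and `common/common.h` in this release):

```cpp
// Minimal sketch of the new example skeleton seen in this hunk.
// Assumes the llama.cpp common library; names are taken from the diff above.
#include "arg.h"
#include "common.h"
#include "log.h"
#include "llama.h"

static void print_usage(int, char ** argv) {
    LOG("\nexample usage:\n");
    LOG("\n    %s -m model.gguf -p \"Hello my name is\" -n 32 -np 4\n", argv[0]);
    LOG("\n");
}

int main(int argc, char ** argv) {
    common_params params;

    params.prompt    = "Hello my name is";
    params.n_predict = 32;

    // parse command-line flags for a generic example; print_usage runs on failure
    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON, print_usage)) {
        return 1;
    }

    common_init(); // common logging/runtime initialization added in this release

    // ... model and context setup continue as in the full example below
    return 0;
}
```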
@@ -40,57 +39,64 @@ int main(int argc, char ** argv) {
 
     // initialize the model
 
-    llama_model_params model_params =
+    llama_model_params model_params = common_model_params_to_llama(params);
 
     llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
 
     if (model == NULL) {
-
+        LOG_ERR("%s: error: unable to load model\n" , __func__);
         return 1;
     }
 
     // tokenize the prompt
 
     std::vector<llama_token> tokens_list;
-    tokens_list =
+    tokens_list = common_tokenize(model, params.prompt, true);
 
     const int n_kv_req = tokens_list.size() + (n_predict - tokens_list.size())*n_parallel;
 
     // initialize the context
 
-    llama_context_params ctx_params =
+    llama_context_params ctx_params = common_context_params_to_llama(params);
 
     ctx_params.n_ctx = n_kv_req;
     ctx_params.n_batch = std::max(n_predict, n_parallel);
 
     llama_context * ctx = llama_new_context_with_model(model, ctx_params);
 
+    auto sparams = llama_sampler_chain_default_params();
+
+    llama_sampler * smpl = llama_sampler_chain_init(sparams);
+
+    llama_sampler_chain_add(smpl, llama_sampler_init_top_k(params.sparams.top_k));
+    llama_sampler_chain_add(smpl, llama_sampler_init_top_p(params.sparams.top_p, params.sparams.min_keep));
+    llama_sampler_chain_add(smpl, llama_sampler_init_temp (params.sparams.temp));
+    llama_sampler_chain_add(smpl, llama_sampler_init_dist (params.sparams.seed));
+
     if (ctx == NULL) {
-
+        LOG_ERR("%s: error: failed to create the llama_context\n" , __func__);
         return 1;
     }
 
     const int n_ctx = llama_n_ctx(ctx);
 
-
+    LOG_INF("\n%s: n_predict = %d, n_ctx = %d, n_batch = %u, n_parallel = %d, n_kv_req = %d\n", __func__, n_predict, n_ctx, ctx_params.n_batch, n_parallel, n_kv_req);
 
     // make sure the KV cache is big enough to hold all the prompt and generated tokens
     if (n_kv_req > n_ctx) {
-
-
+        LOG_ERR("%s: error: n_kv_req (%d) > n_ctx, the required KV cache size is not big enough\n", __func__, n_kv_req);
+        LOG_ERR("%s: either reduce n_parallel or increase n_ctx\n", __func__);
         return 1;
     }
 
     // print the prompt token-by-token
 
-
+    LOG("\n");
 
     for (auto id : tokens_list) {
-
+        LOG("%s", common_token_to_piece(ctx, id).c_str());
     }
 
-    fflush(stderr);
-
     // create a llama_batch
     // we use this object to submit token data for decoding
     llama_batch batch = llama_batch_init(std::max(tokens_list.size(), (size_t) n_parallel), 0, n_parallel);

@@ -102,13 +108,13 @@ int main(int argc, char ** argv) {
 
     // evaluate the initial prompt
     for (size_t i = 0; i < tokens_list.size(); ++i) {
-
+        common_batch_add(batch, tokens_list[i], i, seq_ids, false);
     }
     GGML_ASSERT(batch.n_tokens == (int) tokens_list.size());
 
     if (llama_model_has_encoder(model)) {
         if (llama_encode(ctx, batch)) {
-
+            LOG_ERR("%s : failed to eval\n", __func__);
             return 1;
         }
 

@@ -117,15 +123,15 @@ int main(int argc, char ** argv) {
             decoder_start_token_id = llama_token_bos(model);
         }
 
-
-
+        common_batch_clear(batch);
+        common_batch_add(batch, decoder_start_token_id, 0, seq_ids, false);
     }
 
     // llama_decode will output logits only for the last token of the prompt
     batch.logits[batch.n_tokens - 1] = true;
 
     if (llama_decode(ctx, batch) != 0) {
-
+        LOG_ERR("%s: llama_decode() failed\n", __func__);
         return 1;
     }
 

@@ -136,7 +142,7 @@ int main(int argc, char ** argv) {
     //}
 
     if (n_parallel > 1) {
-
+        LOG("\n\n%s: generating %d sequences ...\n", __func__, n_parallel);
     }
 
     // main loop

@@ -155,7 +161,7 @@ int main(int argc, char ** argv) {
 
     while (n_cur <= n_predict) {
         // prepare the next batch
-
+        common_batch_clear(batch);
 
         // sample the next token for each parallel sequence / stream
         for (int32_t i = 0; i < n_parallel; ++i) {

@@ -164,36 +170,14 @@ int main(int argc, char ** argv) {
                 continue;
             }
 
-
-            auto * logits = llama_get_logits_ith(ctx, i_batch[i]);
-
-            std::vector<llama_token_data> candidates;
-            candidates.reserve(n_vocab);
-
-            for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
-                candidates.emplace_back(llama_token_data{ token_id, logits[token_id], 0.0f });
-            }
-
-            llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
-
-            const int top_k = 40;
-            const float top_p = 0.9f;
-            const float temp = 0.4f;
-
-            llama_sample_top_k(ctx, &candidates_p, top_k, 1);
-            llama_sample_top_p(ctx, &candidates_p, top_p, 1);
-            llama_sample_temp (ctx, &candidates_p, temp);
-
-            const llama_token new_token_id = llama_sample_token(ctx, &candidates_p);
-
-            //const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);
+            const llama_token new_token_id = llama_sampler_sample(smpl, ctx, i_batch[i]);
 
             // is it an end of generation? -> mark the stream as finished
             if (llama_token_is_eog(model, new_token_id) || n_cur == n_predict) {
                 i_batch[i] = -1;
-
+                LOG("\n");
                 if (n_parallel > 1) {
-
+                    LOG_INF("%s: stream %d finished at n_cur = %d", __func__, i, n_cur);
                 }
 
                 continue;

@@ -201,16 +185,15 @@ int main(int argc, char ** argv) {
 
             // if there is only one stream, we print immediately to stdout
             if (n_parallel == 1) {
-
-                fflush(stdout);
+                LOG("%s", common_token_to_piece(ctx, new_token_id).c_str());
             }
 
-            streams[i] +=
+            streams[i] += common_token_to_piece(ctx, new_token_id);
 
             i_batch[i] = batch.n_tokens;
 
             // push this new token for next evaluation
-
+            common_batch_add(batch, new_token_id, n_cur, { i }, true);
 
             n_decode += 1;
         }

@@ -224,32 +207,33 @@ int main(int argc, char ** argv) {
 
         // evaluate the current batch with the transformer model
         if (llama_decode(ctx, batch)) {
-
+            LOG_ERR("%s : failed to eval, return code %d\n", __func__, 1);
             return 1;
         }
     }
 
-    LOG_TEE("\n");
-
     if (n_parallel > 1) {
-
+        LOG("\n");
 
         for (int32_t i = 0; i < n_parallel; ++i) {
-
+            LOG("sequence %d:\n\n%s%s\n\n", i, params.prompt.c_str(), streams[i].c_str());
         }
     }
 
     const auto t_main_end = ggml_time_us();
 
-
+    LOG_INF("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n",
             __func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f));
 
-
+    LOG("\n");
+    llama_perf_sampler_print(smpl);
+    llama_perf_context_print(ctx);
 
     fprintf(stderr, "\n");
 
     llama_batch_free(batch);
 
+    llama_sampler_free(smpl);
     llama_free(ctx);
     llama_free_model(model);
 
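Taken together, the batched.cpp hunks above replace the removed per-token `llama_sample_top_k`/`llama_sample_top_p`/`llama_sample_temp`/`llama_sample_token` calls with a `llama_sampler` chain that is built once and then queried with `llama_sampler_sample()`. A minimal sketch of that pattern, using only the calls visible in the diff (the `make_sampler` helper is illustrative, not part of the package):

```cpp
// Sketch of the sampler-chain setup introduced in this diff; exact signatures
// should be checked against include/llama.h at this revision.
#include "llama.h"

static llama_sampler * make_sampler(int32_t top_k, float top_p, size_t min_keep, float temp, uint32_t seed) {
    auto sparams = llama_sampler_chain_default_params();

    llama_sampler * smpl = llama_sampler_chain_init(sparams);

    // samplers are applied in the order they are added to the chain
    llama_sampler_chain_add(smpl, llama_sampler_init_top_k(top_k));
    llama_sampler_chain_add(smpl, llama_sampler_init_top_p(top_p, min_keep));
    llama_sampler_chain_add(smpl, llama_sampler_init_temp (temp));
    llama_sampler_chain_add(smpl, llama_sampler_init_dist (seed));

    return smpl;
}

// In the generation loop the old candidate-array code becomes a single call:
//     const llama_token new_token_id = llama_sampler_sample(smpl, ctx, i_batch[i]);
// and the sampler is released at the end with:
//     llama_perf_sampler_print(smpl);
//     llama_sampler_free(smpl);
```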
package/src/llama.cpp/examples/batched-bench/batched-bench.cpp

@@ -1,49 +1,28 @@
+#include "arg.h"
 #include "common.h"
+#include "log.h"
 #include "llama.h"
 
 #include <algorithm>
-#include <cmath>
 #include <cstdio>
 #include <string>
 #include <vector>
 
-
-
-
-
-    char * q = p;
-
-    while (*p) {
-        if (*p == ',') {
-            *p = '\0';
-            ret.push_back(std::atoi(q));
-            q = p + 1;
-        }
-
-        ++p;
-    }
-
-    ret.push_back(std::atoi(q));
-
-    return ret;
-}
-
-static void print_usage(int argc, char ** argv, const gpt_params & params) {
-    gpt_params_print_usage(argc, argv, params);
-
-    LOG_TEE("\nexample usage:\n");
-    LOG_TEE("\n %s -m model.gguf -c 2048 -b 2048 -ub 512 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 [-pps]\n", argv[0]);
-    LOG_TEE("\n");
+static void print_usage(int, char ** argv) {
+    LOG("\nexample usage:\n");
+    LOG("\n %s -m model.gguf -c 2048 -b 2048 -ub 512 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 [-pps]\n", argv[0]);
+    LOG("\n");
 }
 
 int main(int argc, char ** argv) {
-
+    common_params params;
 
-    if (!
-        print_usage(argc, argv, params);
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_BENCH, print_usage)) {
         return 1;
     }
 
+    common_init();
+
     int is_pp_shared = params.is_pp_shared;
 
     std::vector<int> n_pp = params.n_pp;

@@ -57,7 +36,7 @@ int main(int argc, char ** argv) {
 
     // initialize the model
 
-    llama_model_params model_params =
+    llama_model_params model_params = common_model_params_to_llama(params);
 
     llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
 

@@ -66,10 +45,10 @@ int main(int argc, char ** argv) {
         return 1;
     }
 
-    llama_context_params ctx_params =
+    llama_context_params ctx_params = common_context_params_to_llama(params);
 
     // ensure enough sequences are available
-    ctx_params.n_seq_max = *std::max_element(n_pl.begin(), n_pl.end());
+    ctx_params.n_seq_max = n_pl.empty() ? 1 : *std::max_element(n_pl.begin(), n_pl.end());
 
     llama_context * ctx = llama_new_context_with_model(model, ctx_params);
 

@@ -95,12 +74,11 @@ int main(int argc, char ** argv) {
             batch.n_seq_id + i,
             batch.seq_id + i,
             batch.logits + i,
-            0, 0, 0, // unused
         };
 
         const int ret = llama_decode(ctx, batch_view);
         if (ret != 0) {
-
+            LOG_ERR("failed to decode the batch, n_batch = %d, ret = %d\n", n_batch, ret);
             return false;
         }
 

@@ -113,21 +91,22 @@ int main(int argc, char ** argv) {
     // warm up
     {
        for (int i = 0; i < 16; ++i) {
-
+            common_batch_add(batch, 0, i, { 0 }, false);
        }
 
        if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
-
+            LOG_ERR("%s: llama_decode() failed\n", __func__);
            return 1;
        }
     }
 
-
-
-
-
-
-
+    if (!params.batched_bench_output_jsonl) {
+        LOG("\n");
+        LOG("%s: n_kv_max = %d, n_batch = %d, n_ubatch = %d, flash_attn = %d, is_pp_shared = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, params.n_batch, params.n_ubatch, params.flash_attn, params.is_pp_shared, params.n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch);
+        LOG("\n");
+        LOG("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP", "TG", "B", "N_KV", "T_PP s", "S_PP t/s", "T_TG s", "S_TG t/s", "T s", "S t/s");
+        LOG("|%6s-|-%6s-|-%4s-|-%6s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|\n", "------", "------", "----", "------", "--------", "--------", "--------", "--------", "--------", "--------");
+    }
 
     for ( int i_pp = 0; i_pp < (int) n_pp.size(); ++i_pp) {
         for ( int i_tg = 0; i_tg < (int) n_tg.size(); ++i_tg) {

@@ -142,11 +121,11 @@ int main(int argc, char ** argv) {
                 continue;
             }
 
-
+            common_batch_clear(batch);
 
             for (int i = 0; i < pp; ++i) {
                 for (int j = 0; j < (is_pp_shared ? 1 : pl); ++j) {
-
+                    common_batch_add(batch, 0, i, { j }, false);
                 }
             }
             batch.logits[batch.n_tokens - 1] = true;

@@ -156,7 +135,7 @@ int main(int argc, char ** argv) {
             llama_kv_cache_clear(ctx);
 
             if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
-
+                LOG_ERR("%s: llama_decode() failed\n", __func__);
                 return 1;
             }
 

@@ -171,14 +150,14 @@ int main(int argc, char ** argv) {
             const auto t_tg_start = ggml_time_us();
 
             for (int i = 0; i < tg; ++i) {
-
+                common_batch_clear(batch);
 
                 for (int j = 0; j < pl; ++j) {
-
+                    common_batch_add(batch, 0, pp + i, { j }, true);
                 }
 
                 if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
-
+                    LOG_ERR("%s: llama_decode() failed\n", __func__);
                     return 1;
                 }
             }

@@ -195,12 +174,22 @@ int main(int argc, char ** argv) {
             const float speed_tg = pl*tg / t_tg;
             const float speed = n_kv / t;
 
-
+            if(params.batched_bench_output_jsonl) {
+                LOG(
+                    "{\"n_kv_max\": %d, \"n_batch\": %d, \"n_ubatch\": %d, \"flash_attn\": %d, \"is_pp_shared\": %d, \"n_gpu_layers\": %d, \"n_threads\": %u, \"n_threads_batch\": %u, "
+                    "\"pp\": %d, \"tg\": %d, \"pl\": %d, \"n_kv\": %d, \"t_pp\": %f, \"speed_pp\": %f, \"t_tg\": %f, \"speed_tg\": %f, \"t\": %f, \"speed\": %f}\n",
+                    n_kv_max, params.n_batch, params.n_ubatch, params.flash_attn, params.is_pp_shared, params.n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch,
+                    pp, tg, pl, n_kv, t_pp, speed_pp, t_tg, speed_tg, t, speed
+                );
+            } else {
+                LOG("|%6d | %6d | %4d | %6d | %8.3f | %8.2f | %8.3f | %8.2f | %8.3f | %8.2f |\n", pp, tg, pl, n_kv, t_pp, speed_pp, t_tg, speed_tg, t, speed);
+            }
         }
     }
     }
 
-
+    LOG("\n");
+    llama_perf_context_print(ctx);
 
     llama_batch_free(batch);
 

@@ -209,7 +198,7 @@ int main(int argc, char ** argv) {
 
     llama_backend_free();
 
-
+    LOG("\n\n");
 
     return 0;
 }