@fugood/llama.node 0.3.2 → 0.3.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +7 -0
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +18 -1
- package/package.json +1 -1
- package/src/DetokenizeWorker.cpp +1 -1
- package/src/EmbeddingWorker.cpp +17 -7
- package/src/EmbeddingWorker.h +2 -1
- package/src/LlamaCompletionWorker.cpp +8 -8
- package/src/LlamaCompletionWorker.h +2 -2
- package/src/LlamaContext.cpp +89 -27
- package/src/LlamaContext.h +2 -0
- package/src/TokenizeWorker.cpp +1 -1
- package/src/common.hpp +4 -4
- package/src/llama.cpp/.github/workflows/build.yml +240 -168
- package/src/llama.cpp/.github/workflows/docker.yml +8 -8
- package/src/llama.cpp/.github/workflows/python-lint.yml +8 -1
- package/src/llama.cpp/.github/workflows/server.yml +21 -14
- package/src/llama.cpp/CMakeLists.txt +14 -6
- package/src/llama.cpp/Sources/llama/llama.h +4 -0
- package/src/llama.cpp/cmake/arm64-apple-clang.cmake +16 -0
- package/src/llama.cpp/cmake/common.cmake +33 -0
- package/src/llama.cpp/cmake/x64-windows-llvm.cmake +11 -0
- package/src/llama.cpp/common/CMakeLists.txt +6 -4
- package/src/llama.cpp/common/arg.cpp +986 -770
- package/src/llama.cpp/common/arg.h +22 -22
- package/src/llama.cpp/common/common.cpp +212 -351
- package/src/llama.cpp/common/common.h +204 -117
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +1 -1
- package/src/llama.cpp/common/log.cpp +50 -50
- package/src/llama.cpp/common/log.h +18 -18
- package/src/llama.cpp/common/ngram-cache.cpp +36 -36
- package/src/llama.cpp/common/ngram-cache.h +19 -19
- package/src/llama.cpp/common/sampling.cpp +163 -121
- package/src/llama.cpp/common/sampling.h +41 -20
- package/src/llama.cpp/common/speculative.cpp +274 -0
- package/src/llama.cpp/common/speculative.h +28 -0
- package/src/llama.cpp/docs/build.md +134 -161
- package/src/llama.cpp/examples/CMakeLists.txt +33 -14
- package/src/llama.cpp/examples/batched/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/batched/batched.cpp +19 -18
- package/src/llama.cpp/examples/batched-bench/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +10 -11
- package/src/llama.cpp/examples/convert-llama2c-to-ggml/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +1 -1
- package/src/llama.cpp/examples/cvector-generator/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +9 -9
- package/src/llama.cpp/examples/deprecation-warning/deprecation-warning.cpp +1 -1
- package/src/llama.cpp/examples/embedding/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/embedding/embedding.cpp +12 -12
- package/src/llama.cpp/examples/eval-callback/CMakeLists.txt +3 -2
- package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +8 -8
- package/src/llama.cpp/examples/export-lora/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +5 -5
- package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +4 -7
- package/src/llama.cpp/examples/gen-docs/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +7 -7
- package/src/llama.cpp/examples/gguf/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gguf-hash/CMakeLists.txt +8 -1
- package/src/llama.cpp/examples/gguf-split/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +2 -2
- package/src/llama.cpp/examples/gritlm/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +18 -18
- package/src/llama.cpp/examples/imatrix/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +31 -13
- package/src/llama.cpp/examples/infill/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/infill/infill.cpp +41 -87
- package/src/llama.cpp/examples/llama-bench/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +439 -459
- package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +2 -0
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +11 -14
- package/src/llama.cpp/examples/llava/CMakeLists.txt +10 -3
- package/src/llama.cpp/examples/llava/clip.cpp +263 -66
- package/src/llama.cpp/examples/llava/clip.h +8 -2
- package/src/llama.cpp/examples/llava/llava-cli.cpp +23 -23
- package/src/llama.cpp/examples/llava/llava.cpp +83 -22
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +21 -21
- package/src/llama.cpp/examples/llava/qwen2vl-cli.cpp +581 -0
- package/src/llama.cpp/examples/lookahead/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +26 -26
- package/src/llama.cpp/examples/lookup/CMakeLists.txt +4 -4
- package/src/llama.cpp/examples/lookup/lookup-create.cpp +7 -7
- package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
- package/src/llama.cpp/examples/lookup/lookup-stats.cpp +16 -15
- package/src/llama.cpp/examples/lookup/lookup.cpp +30 -30
- package/src/llama.cpp/examples/main/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/main/main.cpp +73 -114
- package/src/llama.cpp/examples/main-cmake-pkg/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/parallel/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/parallel/parallel.cpp +18 -19
- package/src/llama.cpp/examples/passkey/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/passkey/passkey.cpp +14 -14
- package/src/llama.cpp/examples/perplexity/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +99 -120
- package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/quantize/quantize.cpp +0 -3
- package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +10 -9
- package/src/llama.cpp/examples/retrieval/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/retrieval/retrieval.cpp +16 -16
- package/src/llama.cpp/examples/rpc/rpc-server.cpp +3 -1
- package/src/llama.cpp/examples/run/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/run/run.cpp +911 -0
- package/src/llama.cpp/examples/save-load-state/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +38 -21
- package/src/llama.cpp/examples/server/CMakeLists.txt +3 -16
- package/src/llama.cpp/examples/server/server.cpp +2073 -1339
- package/src/llama.cpp/examples/server/tests/requirements.txt +2 -2
- package/src/llama.cpp/examples/server/utils.hpp +354 -277
- package/src/llama.cpp/examples/simple/CMakeLists.txt +2 -2
- package/src/llama.cpp/examples/simple/simple.cpp +130 -94
- package/src/llama.cpp/examples/simple-chat/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +200 -0
- package/src/llama.cpp/examples/speculative/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/speculative/speculative.cpp +68 -64
- package/src/llama.cpp/examples/speculative-simple/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +265 -0
- package/src/llama.cpp/examples/tokenize/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/tokenize/tokenize.cpp +3 -3
- package/src/llama.cpp/examples/tts/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/tts/tts.cpp +932 -0
- package/src/llama.cpp/ggml/CMakeLists.txt +54 -36
- package/src/llama.cpp/ggml/include/ggml-backend.h +63 -34
- package/src/llama.cpp/ggml/include/ggml-blas.h +5 -3
- package/src/llama.cpp/ggml/include/ggml-cann.h +9 -7
- package/src/llama.cpp/ggml/include/ggml-cpp.h +38 -0
- package/src/llama.cpp/ggml/include/ggml-cpu.h +135 -0
- package/src/llama.cpp/ggml/include/ggml-cuda.h +12 -12
- package/src/llama.cpp/ggml/include/ggml-kompute.h +7 -3
- package/src/llama.cpp/ggml/include/ggml-metal.h +11 -7
- package/src/llama.cpp/ggml/include/ggml-opencl.h +26 -0
- package/src/llama.cpp/ggml/include/ggml-opt.h +216 -0
- package/src/llama.cpp/ggml/include/ggml-rpc.h +9 -5
- package/src/llama.cpp/ggml/include/ggml-sycl.h +18 -11
- package/src/llama.cpp/ggml/include/ggml-vulkan.h +10 -8
- package/src/llama.cpp/ggml/include/ggml.h +159 -417
- package/src/llama.cpp/ggml/src/CMakeLists.txt +121 -1155
- package/src/llama.cpp/ggml/src/ggml-alloc.c +23 -28
- package/src/llama.cpp/ggml/src/ggml-backend-impl.h +57 -36
- package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +552 -0
- package/src/llama.cpp/ggml/src/ggml-backend.cpp +306 -867
- package/src/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +87 -0
- package/src/llama.cpp/ggml/src/{ggml-blas.cpp → ggml-blas/ggml-blas.cpp} +216 -65
- package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +76 -0
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +456 -111
- package/src/llama.cpp/ggml/src/ggml-cann/common.h +6 -3
- package/src/llama.cpp/ggml/src/{ggml-cann.cpp → ggml-cann/ggml-cann.cpp} +343 -177
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -5
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +22 -9
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +24 -13
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +23 -13
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +11 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +10 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +10 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +17 -0
- package/src/llama.cpp/ggml/src/ggml-common.h +42 -42
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +336 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +220 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.h +8 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/common.h +91 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp +2511 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/mmq.h +10 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +323 -0
- package/src/llama.cpp/ggml/src/{ggml-aarch64.c → ggml-cpu/ggml-cpu-aarch64.cpp} +1299 -246
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +8 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-hbm.cpp +55 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-hbm.h +8 -0
- package/src/llama.cpp/ggml/src/{ggml-cpu-impl.h → ggml-cpu/ggml-cpu-impl.h} +14 -242
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +10835 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.h +63 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-traits.cpp +36 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-traits.h +38 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +14123 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +628 -0
- package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.cpp +666 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +152 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +8 -0
- package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +104 -0
- package/src/llama.cpp/ggml/src/ggml-impl.h +393 -22
- package/src/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +166 -0
- package/src/llama.cpp/ggml/src/{ggml-kompute.cpp → ggml-kompute/ggml-kompute.cpp} +360 -127
- package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +105 -0
- package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +288 -0
- package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +107 -0
- package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +147 -0
- package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +4004 -0
- package/src/llama.cpp/ggml/src/ggml-opt.cpp +854 -0
- package/src/llama.cpp/ggml/src/ggml-quants.c +188 -10702
- package/src/llama.cpp/ggml/src/ggml-quants.h +78 -125
- package/src/llama.cpp/ggml/src/ggml-rpc/CMakeLists.txt +9 -0
- package/src/llama.cpp/ggml/src/{ggml-rpc.cpp → ggml-rpc/ggml-rpc.cpp} +478 -300
- package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +84 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +3 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +36 -5
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +259 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +3 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +5 -5
- package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +34 -35
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +1030 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +76 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +4 -4
- package/src/llama.cpp/ggml/src/{ggml-sycl.cpp → ggml-sycl/ggml-sycl.cpp} +3638 -4151
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +3 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +6 -6
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +75 -87
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +7 -6
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +56 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.hpp +11 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +6 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +4 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +7 -7
- package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +1 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +4 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +141 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +10 -0
- package/src/llama.cpp/ggml/src/ggml-threading.cpp +12 -0
- package/src/llama.cpp/ggml/src/ggml-threading.h +14 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +92 -0
- package/src/llama.cpp/ggml/src/{ggml-vulkan.cpp → ggml-vulkan/ggml-vulkan.cpp} +2138 -887
- package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/CMakeLists.txt +3 -1
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +593 -0
- package/src/llama.cpp/ggml/src/ggml.c +4427 -20125
- package/src/llama.cpp/include/llama-cpp.h +25 -0
- package/src/llama.cpp/include/llama.h +93 -52
- package/src/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.out +46 -0
- package/src/llama.cpp/pocs/CMakeLists.txt +3 -1
- package/src/llama.cpp/pocs/vdot/CMakeLists.txt +2 -2
- package/src/llama.cpp/pocs/vdot/q8dot.cpp +4 -3
- package/src/llama.cpp/pocs/vdot/vdot.cpp +8 -7
- package/src/llama.cpp/src/CMakeLists.txt +4 -8
- package/src/llama.cpp/src/llama-grammar.cpp +15 -15
- package/src/llama.cpp/src/llama-grammar.h +2 -5
- package/src/llama.cpp/src/llama-sampling.cpp +779 -194
- package/src/llama.cpp/src/llama-sampling.h +21 -2
- package/src/llama.cpp/src/llama-vocab.cpp +55 -10
- package/src/llama.cpp/src/llama-vocab.h +35 -11
- package/src/llama.cpp/src/llama.cpp +4317 -2979
- package/src/llama.cpp/src/unicode-data.cpp +2 -2
- package/src/llama.cpp/src/unicode.cpp +62 -51
- package/src/llama.cpp/src/unicode.h +9 -10
- package/src/llama.cpp/tests/CMakeLists.txt +48 -38
- package/src/llama.cpp/tests/test-arg-parser.cpp +15 -15
- package/src/llama.cpp/tests/test-backend-ops.cpp +324 -80
- package/src/llama.cpp/tests/test-barrier.cpp +1 -0
- package/src/llama.cpp/tests/test-chat-template.cpp +59 -9
- package/src/llama.cpp/tests/test-gguf.cpp +1303 -0
- package/src/llama.cpp/tests/test-grammar-integration.cpp +3 -6
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +17 -4
- package/src/llama.cpp/tests/test-llama-grammar.cpp +2 -4
- package/src/llama.cpp/tests/test-log.cpp +2 -2
- package/src/llama.cpp/tests/test-opt.cpp +853 -142
- package/src/llama.cpp/tests/test-quantize-fns.cpp +24 -21
- package/src/llama.cpp/tests/test-quantize-perf.cpp +16 -14
- package/src/llama.cpp/tests/test-rope.cpp +62 -20
- package/src/llama.cpp/tests/test-sampling.cpp +163 -138
- package/src/llama.cpp/tests/test-tokenizer-0.cpp +7 -7
- package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +5 -5
- package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +5 -5
- package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +0 -72
- package/src/llama.cpp/.github/workflows/nix-ci.yml +0 -79
- package/src/llama.cpp/.github/workflows/nix-flake-update.yml +0 -22
- package/src/llama.cpp/.github/workflows/nix-publish-flake.yml +0 -36
- package/src/llama.cpp/common/train.cpp +0 -1515
- package/src/llama.cpp/common/train.h +0 -233
- package/src/llama.cpp/examples/baby-llama/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +0 -1639
- package/src/llama.cpp/ggml/src/ggml-aarch64.h +0 -39
- package/src/llama.cpp/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp +0 -600
- package/src/llama.cpp/tests/test-grad0.cpp +0 -1683
- /package/src/llama.cpp/ggml/{cmake → src/ggml-cpu/cmake}/FindSIMD.cmake +0 -0
- /package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.h +0 -0
|
@@ -32,7 +32,15 @@ else()
|
|
|
32
32
|
endif()
|
|
33
33
|
endif()
|
|
34
34
|
|
|
35
|
+
# remove the lib prefix on win32 mingw
|
|
36
|
+
if (WIN32)
|
|
37
|
+
set(CMAKE_STATIC_LIBRARY_PREFIX "")
|
|
38
|
+
set(CMAKE_SHARED_LIBRARY_PREFIX "")
|
|
39
|
+
set(CMAKE_SHARED_MODULE_PREFIX "")
|
|
40
|
+
endif()
|
|
41
|
+
|
|
35
42
|
option(BUILD_SHARED_LIBS "ggml: build shared libraries" ${BUILD_SHARED_LIBS_DEFAULT})
|
|
43
|
+
option(GGML_BACKEND_DL "ggml: build backends as dynamic libraries (requires BUILD_SHARED_LIBS)" OFF)
|
|
36
44
|
|
|
37
45
|
#
|
|
38
46
|
# option list
|
|
@@ -66,10 +74,10 @@ if (NOT GGML_CUDA_GRAPHS_DEFAULT)
|
|
|
66
74
|
endif()
|
|
67
75
|
|
|
68
76
|
# general
|
|
69
|
-
option(GGML_STATIC "ggml: static link libraries"
|
|
70
|
-
option(GGML_NATIVE "ggml:
|
|
71
|
-
option(GGML_LTO "ggml: enable link time optimization"
|
|
72
|
-
option(GGML_CCACHE "ggml: use ccache if available"
|
|
77
|
+
option(GGML_STATIC "ggml: static link libraries" OFF)
|
|
78
|
+
option(GGML_NATIVE "ggml: optimize the build for the current system" ${GGML_NATIVE_DEFAULT})
|
|
79
|
+
option(GGML_LTO "ggml: enable link time optimization" OFF)
|
|
80
|
+
option(GGML_CCACHE "ggml: use ccache if available" ON)
|
|
73
81
|
|
|
74
82
|
# debug
|
|
75
83
|
option(GGML_ALL_WARNINGS "ggml: enable all compiler warnings" ON)
|
|
@@ -91,28 +99,39 @@ else()
|
|
|
91
99
|
set(INS_ENB ON)
|
|
92
100
|
endif()
|
|
93
101
|
|
|
94
|
-
option(GGML_CPU_HBM
|
|
95
|
-
|
|
96
|
-
option(GGML_AVX
|
|
97
|
-
option(
|
|
98
|
-
option(
|
|
99
|
-
option(
|
|
100
|
-
option(
|
|
101
|
-
option(
|
|
102
|
-
option(
|
|
102
|
+
option(GGML_CPU_HBM "ggml: use memkind for CPU HBM" OFF)
|
|
103
|
+
option(GGML_CPU_AARCH64 "ggml: use runtime weight conversion of Q4_0 to Q4_X_X" ON)
|
|
104
|
+
option(GGML_AVX "ggml: enable AVX" ${INS_ENB})
|
|
105
|
+
option(GGML_AVX_VNNI "ggml: enable AVX-VNNI" OFF)
|
|
106
|
+
option(GGML_AVX2 "ggml: enable AVX2" ${INS_ENB})
|
|
107
|
+
option(GGML_AVX512 "ggml: enable AVX512F" OFF)
|
|
108
|
+
option(GGML_AVX512_VBMI "ggml: enable AVX512-VBMI" OFF)
|
|
109
|
+
option(GGML_AVX512_VNNI "ggml: enable AVX512-VNNI" OFF)
|
|
110
|
+
option(GGML_AVX512_BF16 "ggml: enable AVX512-BF16" OFF)
|
|
103
111
|
if (NOT MSVC)
|
|
104
|
-
|
|
112
|
+
# in MSVC F16C and FMA is implied with AVX2/AVX512
|
|
113
|
+
option(GGML_FMA "ggml: enable FMA" ${INS_ENB})
|
|
114
|
+
option(GGML_F16C "ggml: enable F16C" ${INS_ENB})
|
|
115
|
+
# MSVC does not seem to support AMX
|
|
116
|
+
option(GGML_AMX_TILE "ggml: enable AMX-TILE" OFF)
|
|
117
|
+
option(GGML_AMX_INT8 "ggml: enable AMX-INT8" OFF)
|
|
118
|
+
option(GGML_AMX_BF16 "ggml: enable AMX-BF16" OFF)
|
|
105
119
|
endif()
|
|
106
|
-
option(GGML_LASX
|
|
107
|
-
option(GGML_LSX
|
|
108
|
-
option(
|
|
120
|
+
option(GGML_LASX "ggml: enable lasx" ON)
|
|
121
|
+
option(GGML_LSX "ggml: enable lsx" ON)
|
|
122
|
+
option(GGML_RVV "ggml: enable rvv" ON)
|
|
123
|
+
|
|
124
|
+
option(GGML_CPU_ALL_VARIANTS "ggml: build all variants of the CPU backend (requires GGML_BACKEND_DL)" OFF)
|
|
125
|
+
set(GGML_CPU_ARM_ARCH "" CACHE STRING "ggml: CPU architecture for ARM")
|
|
126
|
+
|
|
109
127
|
|
|
110
128
|
if (WIN32)
|
|
111
|
-
set(GGML_WIN_VER "0x602" CACHE STRING
|
|
129
|
+
set(GGML_WIN_VER "0x602" CACHE STRING "ggml: Windows version")
|
|
112
130
|
endif()
|
|
113
131
|
|
|
114
132
|
# ggml core
|
|
115
133
|
set(GGML_SCHED_MAX_COPIES "4" CACHE STRING "ggml: max input copies for pipeline parallelism")
|
|
134
|
+
option(GGML_CPU "ggml: enable CPU backend" ON)
|
|
116
135
|
|
|
117
136
|
# 3rd party libs / backends
|
|
118
137
|
option(GGML_ACCELERATE "ggml: enable Accelerate framework" ON)
|
|
@@ -123,14 +142,9 @@ option(GGML_LLAMAFILE "ggml: use LLAMAFILE"
|
|
|
123
142
|
|
|
124
143
|
option(GGML_CUDA "ggml: use CUDA" OFF)
|
|
125
144
|
option(GGML_MUSA "ggml: use MUSA" OFF)
|
|
126
|
-
option(GGML_CUDA_FORCE_DMMV "ggml: use dmmv instead of mmvq CUDA kernels" OFF)
|
|
127
145
|
option(GGML_CUDA_FORCE_MMQ "ggml: use mmq kernels instead of cuBLAS" OFF)
|
|
128
146
|
option(GGML_CUDA_FORCE_CUBLAS "ggml: always use cuBLAS instead of mmq kernels" OFF)
|
|
129
|
-
set (GGML_CUDA_DMMV_X "32" CACHE STRING "ggml: x stride for dmmv CUDA kernels")
|
|
130
|
-
set (GGML_CUDA_MMV_Y "1" CACHE STRING "ggml: y block size for mmv CUDA kernels")
|
|
131
147
|
option(GGML_CUDA_F16 "ggml: use 16 bit floats for some calculations" OFF)
|
|
132
|
-
set (GGML_CUDA_KQUANTS_ITER "2" CACHE STRING
|
|
133
|
-
"ggml: iters./thread per block for Q2_K/Q6_K")
|
|
134
148
|
set (GGML_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING
|
|
135
149
|
"ggml: max. batch size for using peer access")
|
|
136
150
|
option(GGML_CUDA_NO_PEER_COPY "ggml: do not use peer to peer copies" OFF)
|
|
@@ -138,7 +152,7 @@ option(GGML_CUDA_NO_VMM "ggml: do not try to use CUDA VMM"
|
|
|
138
152
|
option(GGML_CUDA_FA_ALL_QUANTS "ggml: compile all quants for FlashAttention" OFF)
|
|
139
153
|
option(GGML_CUDA_GRAPHS "ggml: use CUDA graphs (llama.cpp only)" ${GGML_CUDA_GRAPHS_DEFAULT})
|
|
140
154
|
|
|
141
|
-
option(
|
|
155
|
+
option(GGML_HIP "ggml: use HIP" OFF)
|
|
142
156
|
option(GGML_HIP_UMA "ggml: use HIP unified memory architecture" OFF)
|
|
143
157
|
option(GGML_VULKAN "ggml: use Vulkan" OFF)
|
|
144
158
|
option(GGML_VULKAN_CHECK_RESULTS "ggml: run Vulkan op checks" OFF)
|
|
@@ -150,6 +164,7 @@ option(GGML_VULKAN_VALIDATE "ggml: enable Vulkan validation"
|
|
|
150
164
|
option(GGML_VULKAN_RUN_TESTS "ggml: run Vulkan tests" OFF)
|
|
151
165
|
option(GGML_KOMPUTE "ggml: use Kompute" OFF)
|
|
152
166
|
option(GGML_METAL "ggml: use Metal" ${GGML_METAL_DEFAULT})
|
|
167
|
+
option(GGML_METAL_USE_BF16 "ggml: use bfloat if available" OFF)
|
|
153
168
|
option(GGML_METAL_NDEBUG "ggml: disable Metal debugging" OFF)
|
|
154
169
|
option(GGML_METAL_SHADER_DEBUG "ggml: compile Metal with -fno-fast-math" OFF)
|
|
155
170
|
option(GGML_METAL_EMBED_LIBRARY "ggml: embed Metal library" ${GGML_METAL})
|
|
@@ -162,6 +177,13 @@ option(GGML_SYCL "ggml: use SYCL"
|
|
|
162
177
|
option(GGML_SYCL_F16 "ggml: use 16 bit floats for sycl calculations" OFF)
|
|
163
178
|
set (GGML_SYCL_TARGET "INTEL" CACHE STRING
|
|
164
179
|
"ggml: sycl target device")
|
|
180
|
+
set (GGML_SYCL_DEVICE_ARCH "" CACHE STRING
|
|
181
|
+
"ggml: sycl device architecture")
|
|
182
|
+
|
|
183
|
+
option(GGML_OPENCL "ggml: use OpenCL" OFF)
|
|
184
|
+
option(GGML_OPENCL_PROFILING "ggml: use OpenCL profiling (increases overhead)" OFF)
|
|
185
|
+
option(GGML_OPENCL_EMBED_KERNELS "ggml: embed kernels" ON)
|
|
186
|
+
option(GGML_OPENCL_USE_ADRENO_KERNELS "ggml: use optimized kernels for Adreno" ON)
|
|
165
187
|
|
|
166
188
|
# extra artifacts
|
|
167
189
|
option(GGML_BUILD_TESTS "ggml: build tests" ${GGML_STANDALONE})
|
|
@@ -174,11 +196,7 @@ option(GGML_BUILD_EXAMPLES "ggml: build examples" ${GGML_STANDALONE})
|
|
|
174
196
|
set(CMAKE_C_STANDARD 11)
|
|
175
197
|
set(CMAKE_C_STANDARD_REQUIRED true)
|
|
176
198
|
|
|
177
|
-
|
|
178
|
-
set(CMAKE_CXX_STANDARD 17)
|
|
179
|
-
else()
|
|
180
|
-
set(CMAKE_CXX_STANDARD 11)
|
|
181
|
-
endif()
|
|
199
|
+
set(CMAKE_CXX_STANDARD 17)
|
|
182
200
|
set(CMAKE_CXX_STANDARD_REQUIRED true)
|
|
183
201
|
|
|
184
202
|
set(THREADS_PREFER_PTHREAD_FLAG ON)
|
|
@@ -214,13 +232,14 @@ include(CMakePackageConfigHelpers)
|
|
|
214
232
|
# all public headers
|
|
215
233
|
set(GGML_PUBLIC_HEADERS
|
|
216
234
|
include/ggml.h
|
|
235
|
+
include/ggml-cpu.h
|
|
217
236
|
include/ggml-alloc.h
|
|
218
237
|
include/ggml-backend.h
|
|
219
238
|
include/ggml-blas.h
|
|
220
239
|
include/ggml-cann.h
|
|
221
240
|
include/ggml-cuda.h
|
|
222
|
-
include/ggml.h
|
|
223
241
|
include/ggml-kompute.h
|
|
242
|
+
include/ggml-opt.h
|
|
224
243
|
include/ggml-metal.h
|
|
225
244
|
include/ggml-rpc.h
|
|
226
245
|
include/ggml-sycl.h
|
|
@@ -230,15 +249,14 @@ set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}")
|
|
|
230
249
|
#if (GGML_METAL)
|
|
231
250
|
# set_target_properties(ggml PROPERTIES RESOURCE "${CMAKE_CURRENT_SOURCE_DIR}/src/ggml-metal.metal")
|
|
232
251
|
#endif()
|
|
233
|
-
install(TARGETS ggml PUBLIC_HEADER)
|
|
234
|
-
|
|
235
|
-
if (BUILD_SHARED_LIBS)
|
|
236
|
-
install(TARGETS ggml LIBRARY)
|
|
237
|
-
endif()
|
|
252
|
+
install(TARGETS ggml LIBRARY PUBLIC_HEADER)
|
|
253
|
+
install(TARGETS ggml-base LIBRARY)
|
|
238
254
|
|
|
255
|
+
# FIXME: this should be done in the backend cmake files
|
|
239
256
|
if (GGML_METAL)
|
|
257
|
+
# FIXME: does this need to be installed with GGML_METAL_EMBED_LIBRARY?
|
|
240
258
|
install(
|
|
241
|
-
FILES src/ggml-metal.metal
|
|
259
|
+
FILES src/ggml-metal/ggml-metal.metal
|
|
242
260
|
PERMISSIONS
|
|
243
261
|
OWNER_READ
|
|
244
262
|
OWNER_WRITE
|
|
@@ -3,6 +3,20 @@
|
|
|
3
3
|
#include "ggml.h"
|
|
4
4
|
#include "ggml-alloc.h"
|
|
5
5
|
|
|
6
|
+
#ifdef GGML_BACKEND_SHARED
|
|
7
|
+
# if defined(_WIN32) && !defined(__MINGW32__)
|
|
8
|
+
# ifdef GGML_BACKEND_BUILD
|
|
9
|
+
# define GGML_BACKEND_API __declspec(dllexport) extern
|
|
10
|
+
# else
|
|
11
|
+
# define GGML_BACKEND_API __declspec(dllimport) extern
|
|
12
|
+
# endif
|
|
13
|
+
# else
|
|
14
|
+
# define GGML_BACKEND_API __attribute__ ((visibility ("default"))) extern
|
|
15
|
+
# endif
|
|
16
|
+
#else
|
|
17
|
+
# define GGML_BACKEND_API extern
|
|
18
|
+
#endif
|
|
19
|
+
|
|
6
20
|
#ifdef __cplusplus
|
|
7
21
|
extern "C" {
|
|
8
22
|
#endif
|
|
@@ -72,7 +86,7 @@ extern "C" {
|
|
|
72
86
|
GGML_API void ggml_backend_tensor_set_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
|
|
73
87
|
GGML_API void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
|
|
74
88
|
|
|
75
|
-
// "offset" refers to the offset
|
|
89
|
+
// "offset" refers to the offset in tensor->data for setting/getting data
|
|
76
90
|
GGML_API void ggml_backend_tensor_set( struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
|
|
77
91
|
GGML_API void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
|
|
78
92
|
GGML_API void ggml_backend_tensor_memset( struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size);
|
|
@@ -114,11 +128,12 @@ extern "C" {
|
|
|
114
128
|
//
|
|
115
129
|
|
|
116
130
|
enum ggml_backend_dev_type {
|
|
131
|
+
// CPU device using system memory
|
|
117
132
|
GGML_BACKEND_DEVICE_TYPE_CPU,
|
|
133
|
+
// GPU device using dedicated memory
|
|
118
134
|
GGML_BACKEND_DEVICE_TYPE_GPU,
|
|
119
|
-
// devices
|
|
120
|
-
|
|
121
|
-
GGML_BACKEND_DEVICE_TYPE_GPU_FULL
|
|
135
|
+
// accelerator devices intended to be used together with the CPU backend (e.g. BLAS or AMX)
|
|
136
|
+
GGML_BACKEND_DEVICE_TYPE_ACCEL
|
|
122
137
|
};
|
|
123
138
|
|
|
124
139
|
// functionality supported by the device
|
|
@@ -127,6 +142,8 @@ extern "C" {
|
|
|
127
142
|
bool async;
|
|
128
143
|
// pinned host buffer
|
|
129
144
|
bool host_buffer;
|
|
145
|
+
// creating buffers from host ptr
|
|
146
|
+
bool buffer_from_host_ptr;
|
|
130
147
|
// event synchronization
|
|
131
148
|
bool events;
|
|
132
149
|
};
|
|
@@ -165,9 +182,22 @@ extern "C" {
|
|
|
165
182
|
GGML_API ggml_backend_dev_t ggml_backend_reg_dev_get(ggml_backend_reg_t reg, size_t index);
|
|
166
183
|
GGML_API void * ggml_backend_reg_get_proc_address(ggml_backend_reg_t reg, const char * name);
|
|
167
184
|
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
185
|
+
// Common functions that may be obtained using ggml_backend_reg_get_proc_address
|
|
186
|
+
|
|
187
|
+
// Split buffer type for tensor parallelism
|
|
188
|
+
typedef ggml_backend_buffer_type_t (*ggml_backend_split_buffer_type_t)(int main_device, const float * tensor_split);
|
|
189
|
+
// Set the number of threads for the backend
|
|
190
|
+
typedef void (*ggml_backend_set_n_threads_t)(ggml_backend_t backend, int n_threads);
|
|
191
|
+
// Get additional buffer types provided by the device (returns a NULL-terminated array)
|
|
192
|
+
typedef ggml_backend_buffer_type_t * (*ggml_backend_dev_get_extra_bufts_t)(ggml_backend_dev_t device);
|
|
193
|
+
// Set the abort callback for the backend
|
|
194
|
+
typedef void (*ggml_backend_set_abort_callback_t)(ggml_backend_t backend, ggml_abort_callback abort_callback, void * abort_callback_data);
|
|
195
|
+
// Get a list of feature flags supported by the backend (returns a NULL-terminated array)
|
|
196
|
+
struct ggml_backend_feature {
|
|
197
|
+
const char * name;
|
|
198
|
+
const char * value;
|
|
199
|
+
};
|
|
200
|
+
typedef struct ggml_backend_feature * (*ggml_backend_get_features_t)(ggml_backend_reg_t reg);
|
|
171
201
|
|
|
172
202
|
//
|
|
173
203
|
// Backend registry
|
|
@@ -189,9 +219,17 @@ extern "C" {
|
|
|
189
219
|
GGML_API ggml_backend_t ggml_backend_init_by_name(const char * name, const char * params);
|
|
190
220
|
// = ggml_backend_dev_init(ggml_backend_dev_by_type(type), params)
|
|
191
221
|
GGML_API ggml_backend_t ggml_backend_init_by_type(enum ggml_backend_dev_type type, const char * params);
|
|
192
|
-
// = ggml_backend_dev_init(ggml_backend_dev_by_type(
|
|
222
|
+
// = ggml_backend_dev_init(ggml_backend_dev_by_type(GPU) OR ggml_backend_dev_by_type(CPU), NULL)
|
|
193
223
|
GGML_API ggml_backend_t ggml_backend_init_best(void);
|
|
194
224
|
|
|
225
|
+
// Load a backend from a dynamic library and register it
|
|
226
|
+
GGML_API ggml_backend_reg_t ggml_backend_load(const char * path);
|
|
227
|
+
// Unload a backend if loaded dynamically and unregister it
|
|
228
|
+
GGML_API void ggml_backend_unload(ggml_backend_reg_t reg);
|
|
229
|
+
// Load all known backends from dynamic libraries
|
|
230
|
+
GGML_API void ggml_backend_load_all(void);
|
|
231
|
+
GGML_API void ggml_backend_load_all_from_path(const char * dir_path);
|
|
232
|
+
|
|
195
233
|
//
|
|
196
234
|
// Backend scheduler
|
|
197
235
|
//
|
|
@@ -220,14 +258,20 @@ extern "C" {
|
|
|
220
258
|
ggml_backend_sched_reserve(sched, reserve_graph);
|
|
221
259
|
|
|
222
260
|
// compute
|
|
223
|
-
graph = build_graph(sched);
|
|
224
|
-
|
|
261
|
+
graph = build_graph(sched); // the graph and its tensors are single-use in terms of allocation, multi-use in terms of computation
|
|
262
|
+
for (int i = 0; i < 10; ++i) {
|
|
263
|
+
ggml_backend_sched_graph_compute(sched, graph); // on the first iteration the graph is allocated automatically
|
|
264
|
+
}
|
|
225
265
|
|
|
226
266
|
// if there are graph inputs:
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
267
|
+
graph = build_graph(sched); // get a new graph that is not allocated (the metadata for the old graph is freed once ggml_free is called)
|
|
268
|
+
ggml_backend_sched_reset(sched); // clear the allocation of the previous graph
|
|
269
|
+
ggml_backend_sched_alloc_graph(sched, graph); // explicitly allocate the new graph but do not execute it
|
|
270
|
+
ggml_backend_tensor_set(input_tensor, ...); // copy data to the newly allocated graph tensors
|
|
271
|
+
ggml_backend_sched_graph_compute(sched, graph); // execute the graph
|
|
272
|
+
|
|
273
|
+
// as an alternative to the above it is also possible to assign the inputs to a dedicated context and
|
|
274
|
+
// allocate them statically via ggml_backend_alloc_ctx_tensors
|
|
231
275
|
}
|
|
232
276
|
*/
|
|
233
277
|
|
|
@@ -242,7 +286,7 @@ extern "C" {
|
|
|
242
286
|
//
|
|
243
287
|
typedef bool (*ggml_backend_sched_eval_callback)(struct ggml_tensor * t, bool ask, void * user_data);
|
|
244
288
|
|
|
245
|
-
// Initialize a backend scheduler
|
|
289
|
+
// Initialize a backend scheduler, backends with low index are given priority over backends with high index
|
|
246
290
|
GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size, bool parallel);
|
|
247
291
|
GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched);
|
|
248
292
|
|
|
@@ -267,7 +311,9 @@ extern "C" {
|
|
|
267
311
|
GGML_API enum ggml_status ggml_backend_sched_graph_compute_async(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
|
|
268
312
|
GGML_API void ggml_backend_sched_synchronize(ggml_backend_sched_t sched);
|
|
269
313
|
|
|
270
|
-
// Reset all assignments and allocators - must be called before changing the node backends
|
|
314
|
+
// Reset all assignments and allocators - must be called before changing the node backends or allocating a new graph.
|
|
315
|
+
// This in effect deallocates all tensors that were previously allocated and leaves them with dangling pointers.
|
|
316
|
+
// The correct way to use this API is to discard the deallocated tensors and create new ones.
|
|
271
317
|
GGML_API void ggml_backend_sched_reset(ggml_backend_sched_t sched);
|
|
272
318
|
|
|
273
319
|
// Set a callback to be called for each resulting node during graph compute
|
|
@@ -297,27 +343,10 @@ extern "C" {
|
|
|
297
343
|
GGML_API void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr);
|
|
298
344
|
GGML_API void ggml_backend_view_init(struct ggml_tensor * tensor);
|
|
299
345
|
|
|
300
|
-
//
|
|
301
|
-
// CPU backend
|
|
302
|
-
//
|
|
303
|
-
|
|
304
|
-
GGML_API ggml_backend_t ggml_backend_cpu_init(void);
|
|
305
|
-
|
|
306
|
-
GGML_API bool ggml_backend_is_cpu (ggml_backend_t backend);
|
|
307
|
-
GGML_API void ggml_backend_cpu_set_n_threads (ggml_backend_t backend_cpu, int n_threads);
|
|
308
|
-
GGML_API void ggml_backend_cpu_set_threadpool (ggml_backend_t backend_cpu, ggml_threadpool_t threadpool);
|
|
309
|
-
GGML_API void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data);
|
|
310
|
-
|
|
311
|
-
// Create a backend buffer from an existing pointer
|
|
346
|
+
// CPU buffer types are always available
|
|
312
347
|
GGML_API ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size);
|
|
313
348
|
GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void);
|
|
314
349
|
|
|
315
|
-
GGML_API ggml_backend_reg_t ggml_backend_cpu_reg(void);
|
|
316
|
-
|
|
317
|
-
#ifdef GGML_USE_CPU_HBM
|
|
318
|
-
GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void);
|
|
319
|
-
#endif
|
|
320
|
-
|
|
321
350
|
#ifdef __cplusplus
|
|
322
351
|
}
|
|
323
352
|
#endif
|
|
@@ -9,13 +9,15 @@ extern "C" {
|
|
|
9
9
|
#endif
|
|
10
10
|
|
|
11
11
|
// backend API
|
|
12
|
-
|
|
12
|
+
GGML_BACKEND_API ggml_backend_t ggml_backend_blas_init(void);
|
|
13
13
|
|
|
14
|
-
|
|
14
|
+
GGML_BACKEND_API bool ggml_backend_is_blas(ggml_backend_t backend);
|
|
15
15
|
|
|
16
16
|
// number of threads used for conversion to float
|
|
17
17
|
// for openblas and blis, this will also set the number of threads used for blas operations
|
|
18
|
-
|
|
18
|
+
GGML_BACKEND_API void ggml_backend_blas_set_n_threads(ggml_backend_t backend_blas, int n_threads);
|
|
19
|
+
|
|
20
|
+
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_blas_reg(void);
|
|
19
21
|
|
|
20
22
|
|
|
21
23
|
#ifdef __cplusplus
|
|
@@ -34,6 +34,8 @@ extern "C" {
|
|
|
34
34
|
*/
|
|
35
35
|
#define GGML_CANN_MAX_DEVICES 16
|
|
36
36
|
|
|
37
|
+
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_cann_reg(void);
|
|
38
|
+
|
|
37
39
|
/**
|
|
38
40
|
* @brief Initializes the CANN backend for a specified device.
|
|
39
41
|
*
|
|
@@ -44,7 +46,7 @@ extern "C" {
|
|
|
44
46
|
* @param device The index of the device to initialize.
|
|
45
47
|
* @return A pointer to the initialized backend instance, or nullptr on failure.
|
|
46
48
|
*/
|
|
47
|
-
|
|
49
|
+
GGML_BACKEND_API ggml_backend_t ggml_backend_cann_init(int32_t device);
|
|
48
50
|
|
|
49
51
|
/**
|
|
50
52
|
* @brief Checks if a given backend is a CANN backend.
|
|
@@ -55,7 +57,7 @@ GGML_API ggml_backend_t ggml_backend_cann_init(int32_t device);
|
|
|
55
57
|
* @param backend The backend instance to check.
|
|
56
58
|
* @return True if the backend is a CANN backend, false otherwise.
|
|
57
59
|
*/
|
|
58
|
-
|
|
60
|
+
GGML_BACKEND_API bool ggml_backend_is_cann(ggml_backend_t backend);
|
|
59
61
|
|
|
60
62
|
/**
|
|
61
63
|
* @brief Retrieves the CANN buffer type for a specified device.
|
|
@@ -67,7 +69,7 @@ GGML_API bool ggml_backend_is_cann(ggml_backend_t backend);
|
|
|
67
69
|
* @return A pointer to the buffer type interface for the specified device, or
|
|
68
70
|
* nullptr if the device index is out of range.
|
|
69
71
|
*/
|
|
70
|
-
|
|
72
|
+
GGML_BACKEND_API ggml_backend_buffer_type_t
|
|
71
73
|
ggml_backend_cann_buffer_type(int32_t device);
|
|
72
74
|
|
|
73
75
|
/**
|
|
@@ -78,14 +80,14 @@ ggml_backend_cann_buffer_type(int32_t device);
|
|
|
78
80
|
*
|
|
79
81
|
* @return The number of CANN devices available.
|
|
80
82
|
*/
|
|
81
|
-
|
|
83
|
+
GGML_BACKEND_API int32_t ggml_backend_cann_get_device_count(void);
|
|
82
84
|
|
|
83
85
|
/**
|
|
84
86
|
* @brief pinned host buffer for use with the CPU backend for faster copies between CPU and NPU.
|
|
85
87
|
*
|
|
86
88
|
* @return A pointer to the host buffer type interface.
|
|
87
89
|
*/
|
|
88
|
-
|
|
90
|
+
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_cann_host_buffer_type(void);
|
|
89
91
|
|
|
90
92
|
/**
|
|
91
93
|
* @brief Retrieves the description of a specific CANN device.
|
|
@@ -97,7 +99,7 @@ GGML_API ggml_backend_buffer_type_t ggml_backend_cann_host_buffer_type(void);
|
|
|
97
99
|
* @param description Pointer to a buffer where the description will be written.
|
|
98
100
|
* @param description_size Size of the description buffer.
|
|
99
101
|
*/
|
|
100
|
-
|
|
102
|
+
GGML_BACKEND_API void ggml_backend_cann_get_device_description(
|
|
101
103
|
int32_t device, char* description, size_t description_size);
|
|
102
104
|
|
|
103
105
|
/**
|
|
@@ -112,7 +114,7 @@ GGML_API void ggml_backend_cann_get_device_description(
|
|
|
112
114
|
* @param total Pointer to a variable where the total memory size will be
|
|
113
115
|
* stored.
|
|
114
116
|
*/
|
|
115
|
-
|
|
117
|
+
GGML_BACKEND_API void ggml_backend_cann_get_device_memory(int32_t device,
|
|
116
118
|
size_t* free,
|
|
117
119
|
size_t* total);
|
|
118
120
|
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
#pragma once
|
|
2
|
+
|
|
3
|
+
#ifndef __cplusplus
|
|
4
|
+
#error "This header is for C++ only"
|
|
5
|
+
#endif
|
|
6
|
+
|
|
7
|
+
#include "ggml.h"
|
|
8
|
+
#include "ggml-alloc.h"
|
|
9
|
+
#include "ggml-backend.h"
|
|
10
|
+
#include <memory>
|
|
11
|
+
|
|
12
|
+
// Smart pointers for ggml types
|
|
13
|
+
|
|
14
|
+
// ggml
|
|
15
|
+
|
|
16
|
+
struct ggml_context_deleter { void operator()(ggml_context * ctx) { ggml_free(ctx); } };
|
|
17
|
+
struct gguf_context_deleter { void operator()(gguf_context * ctx) { gguf_free(ctx); } };
|
|
18
|
+
|
|
19
|
+
typedef std::unique_ptr<ggml_context, ggml_context_deleter> ggml_context_ptr;
|
|
20
|
+
typedef std::unique_ptr<gguf_context, gguf_context_deleter> gguf_context_ptr;
|
|
21
|
+
|
|
22
|
+
// ggml-alloc
|
|
23
|
+
|
|
24
|
+
struct ggml_gallocr_deleter { void operator()(ggml_gallocr_t galloc) { ggml_gallocr_free(galloc); } };
|
|
25
|
+
|
|
26
|
+
typedef std::unique_ptr<ggml_gallocr_t, ggml_gallocr_deleter> ggml_gallocr_ptr;
|
|
27
|
+
|
|
28
|
+
// ggml-backend
|
|
29
|
+
|
|
30
|
+
struct ggml_backend_deleter { void operator()(ggml_backend_t backend) { ggml_backend_free(backend); } };
|
|
31
|
+
struct ggml_backend_buffer_deleter { void operator()(ggml_backend_buffer_t buffer) { ggml_backend_buffer_free(buffer); } };
|
|
32
|
+
struct ggml_backend_event_deleter { void operator()(ggml_backend_event_t event) { ggml_backend_event_free(event); } };
|
|
33
|
+
struct ggml_backend_sched_deleter { void operator()(ggml_backend_sched_t sched) { ggml_backend_sched_free(sched); } };
|
|
34
|
+
|
|
35
|
+
typedef std::unique_ptr<ggml_backend, ggml_backend_deleter> ggml_backend_ptr;
|
|
36
|
+
typedef std::unique_ptr<ggml_backend_buffer, ggml_backend_buffer_deleter> ggml_backend_buffer_ptr;
|
|
37
|
+
typedef std::unique_ptr<ggml_backend_event, ggml_backend_event_deleter> ggml_backend_event_ptr;
|
|
38
|
+
typedef std::unique_ptr<ggml_backend_sched, ggml_backend_sched_deleter> ggml_backend_sched_ptr;
|
|
@@ -0,0 +1,135 @@
|
|
|
1
|
+
#pragma once
|
|
2
|
+
|
|
3
|
+
#include "ggml.h"
|
|
4
|
+
#include "ggml-backend.h"
|
|
5
|
+
|
|
6
|
+
#ifdef __cplusplus
|
|
7
|
+
extern "C" {
|
|
8
|
+
#endif
|
|
9
|
+
|
|
10
|
+
// the compute plan that needs to be prepared for ggml_graph_compute()
|
|
11
|
+
// since https://github.com/ggerganov/ggml/issues/287
|
|
12
|
+
struct ggml_cplan {
|
|
13
|
+
size_t work_size; // size of work buffer, calculated by `ggml_graph_plan()`
|
|
14
|
+
uint8_t * work_data; // work buffer, to be allocated by caller before calling to `ggml_graph_compute()`
|
|
15
|
+
|
|
16
|
+
int n_threads;
|
|
17
|
+
struct ggml_threadpool * threadpool;
|
|
18
|
+
|
|
19
|
+
// abort ggml_graph_compute when true
|
|
20
|
+
ggml_abort_callback abort_callback;
|
|
21
|
+
void * abort_callback_data;
|
|
22
|
+
};
|
|
23
|
+
|
|
24
|
+
// numa strategies
|
|
25
|
+
enum ggml_numa_strategy {
|
|
26
|
+
GGML_NUMA_STRATEGY_DISABLED = 0,
|
|
27
|
+
GGML_NUMA_STRATEGY_DISTRIBUTE = 1,
|
|
28
|
+
GGML_NUMA_STRATEGY_ISOLATE = 2,
|
|
29
|
+
GGML_NUMA_STRATEGY_NUMACTL = 3,
|
|
30
|
+
GGML_NUMA_STRATEGY_MIRROR = 4,
|
|
31
|
+
GGML_NUMA_STRATEGY_COUNT
|
|
32
|
+
};
|
|
33
|
+
|
|
34
|
+
GGML_BACKEND_API void ggml_numa_init(enum ggml_numa_strategy numa); // call once for better performance on NUMA systems
|
|
35
|
+
GGML_BACKEND_API bool ggml_is_numa(void); // true if init detected that system has >1 NUMA node
|
|
36
|
+
|
|
37
|
+
GGML_BACKEND_API struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value);
|
|
38
|
+
GGML_BACKEND_API struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value);
|
|
39
|
+
|
|
40
|
+
GGML_BACKEND_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value);
|
|
41
|
+
GGML_BACKEND_API struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value);
|
|
42
|
+
|
|
43
|
+
GGML_BACKEND_API int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i);
|
|
44
|
+
GGML_BACKEND_API void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value);
|
|
45
|
+
|
|
46
|
+
GGML_BACKEND_API int32_t ggml_get_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3);
|
|
47
|
+
GGML_BACKEND_API void ggml_set_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, int32_t value);
|
|
48
|
+
|
|
49
|
+
GGML_BACKEND_API float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i);
|
|
50
|
+
GGML_BACKEND_API void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value);
|
|
51
|
+
|
|
52
|
+
GGML_BACKEND_API float ggml_get_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3);
|
|
53
|
+
GGML_BACKEND_API void ggml_set_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, float value);
|
|
54
|
+
|
|
55
|
+
GGML_BACKEND_API struct ggml_threadpool * ggml_threadpool_new (struct ggml_threadpool_params * params);
|
|
56
|
+
GGML_BACKEND_API void ggml_threadpool_free (struct ggml_threadpool * threadpool);
|
|
57
|
+
GGML_BACKEND_API int ggml_threadpool_get_n_threads (struct ggml_threadpool * threadpool);
|
|
58
|
+
GGML_BACKEND_API void ggml_threadpool_pause (struct ggml_threadpool * threadpool);
|
|
59
|
+
GGML_BACKEND_API void ggml_threadpool_resume (struct ggml_threadpool * threadpool);
|
|
60
|
+
|
|
61
|
+
// ggml_graph_plan() has to be called before ggml_graph_compute()
|
|
62
|
+
// when plan.work_size > 0, caller must allocate memory for plan.work_data
|
|
63
|
+
GGML_BACKEND_API struct ggml_cplan ggml_graph_plan(
|
|
64
|
+
const struct ggml_cgraph * cgraph,
|
|
65
|
+
int n_threads, /* = GGML_DEFAULT_N_THREADS */
|
|
66
|
+
struct ggml_threadpool * threadpool /* = NULL */ );
|
|
67
|
+
GGML_BACKEND_API enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
|
|
68
|
+
|
|
69
|
+
// same as ggml_graph_compute() but the work data is allocated as a part of the context
|
|
70
|
+
// note: the drawback of this API is that you must have ensured that the context has enough memory for the work data
|
|
71
|
+
GGML_BACKEND_API enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads);
|
|
72
|
+
|
|
73
|
+
//
|
|
74
|
+
// system info
|
|
75
|
+
//
|
|
76
|
+
|
|
77
|
+
// x86
|
|
78
|
+
GGML_BACKEND_API int ggml_cpu_has_sse3 (void);
|
|
79
|
+
GGML_BACKEND_API int ggml_cpu_has_ssse3 (void);
|
|
80
|
+
GGML_BACKEND_API int ggml_cpu_has_avx (void);
|
|
81
|
+
GGML_BACKEND_API int ggml_cpu_has_avx_vnni (void);
|
|
82
|
+
GGML_BACKEND_API int ggml_cpu_has_avx2 (void);
|
|
83
|
+
GGML_BACKEND_API int ggml_cpu_has_f16c (void);
|
|
84
|
+
GGML_BACKEND_API int ggml_cpu_has_fma (void);
|
|
85
|
+
GGML_BACKEND_API int ggml_cpu_has_avx512 (void);
|
|
86
|
+
GGML_BACKEND_API int ggml_cpu_has_avx512_vbmi(void);
|
|
87
|
+
GGML_BACKEND_API int ggml_cpu_has_avx512_vnni(void);
|
|
88
|
+
GGML_BACKEND_API int ggml_cpu_has_avx512_bf16(void);
|
|
89
|
+
GGML_BACKEND_API int ggml_cpu_has_amx_int8 (void);
|
|
90
|
+
// ARM
|
|
91
|
+
GGML_BACKEND_API int ggml_cpu_has_neon (void);
|
|
92
|
+
GGML_BACKEND_API int ggml_cpu_has_arm_fma (void);
|
|
93
|
+
GGML_BACKEND_API int ggml_cpu_has_fp16_va (void);
|
|
94
|
+
GGML_BACKEND_API int ggml_cpu_has_dotprod (void);
|
|
95
|
+
GGML_BACKEND_API int ggml_cpu_has_matmul_int8(void);
|
|
96
|
+
GGML_BACKEND_API int ggml_cpu_has_sve (void);
|
|
97
|
+
GGML_BACKEND_API int ggml_cpu_get_sve_cnt (void); // sve vector length in bytes
|
|
98
|
+
// other
|
|
99
|
+
GGML_BACKEND_API int ggml_cpu_has_riscv_v (void);
|
|
100
|
+
GGML_BACKEND_API int ggml_cpu_has_vsx (void);
|
|
101
|
+
GGML_BACKEND_API int ggml_cpu_has_wasm_simd (void);
|
|
102
|
+
GGML_BACKEND_API int ggml_cpu_has_llamafile (void);
|
|
103
|
+
|
|
104
|
+
// Internal types and functions exposed for tests and benchmarks
|
|
105
|
+
|
|
106
|
+
typedef void (*ggml_vec_dot_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx,
|
|
107
|
+
const void * GGML_RESTRICT y, size_t by, int nrc);
|
|
108
|
+
|
|
109
|
+
struct ggml_type_traits_cpu {
|
|
110
|
+
ggml_from_float_t from_float;
|
|
111
|
+
ggml_vec_dot_t vec_dot;
|
|
112
|
+
enum ggml_type vec_dot_type;
|
|
113
|
+
int64_t nrows; // number of rows to process simultaneously
|
|
114
|
+
};
|
|
115
|
+
|
|
116
|
+
GGML_BACKEND_API const struct ggml_type_traits_cpu * ggml_get_type_traits_cpu(enum ggml_type type);
|
|
117
|
+
|
|
118
|
+
GGML_BACKEND_API void ggml_cpu_init(void);
|
|
119
|
+
|
|
120
|
+
//
|
|
121
|
+
// CPU backend
|
|
122
|
+
//
|
|
123
|
+
|
|
124
|
+
GGML_BACKEND_API ggml_backend_t ggml_backend_cpu_init(void);
|
|
125
|
+
|
|
126
|
+
GGML_BACKEND_API bool ggml_backend_is_cpu (ggml_backend_t backend);
|
|
127
|
+
GGML_BACKEND_API void ggml_backend_cpu_set_n_threads (ggml_backend_t backend_cpu, int n_threads);
|
|
128
|
+
GGML_BACKEND_API void ggml_backend_cpu_set_threadpool (ggml_backend_t backend_cpu, ggml_threadpool_t threadpool);
|
|
129
|
+
GGML_BACKEND_API void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data);
|
|
130
|
+
|
|
131
|
+
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_cpu_reg(void);
|
|
132
|
+
|
|
133
|
+
#ifdef __cplusplus
|
|
134
|
+
}
|
|
135
|
+
#endif
|