@fugood/llama.node 0.3.2 → 0.3.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +7 -0
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +18 -1
- package/package.json +1 -1
- package/src/DetokenizeWorker.cpp +1 -1
- package/src/EmbeddingWorker.cpp +17 -7
- package/src/EmbeddingWorker.h +2 -1
- package/src/LlamaCompletionWorker.cpp +8 -8
- package/src/LlamaCompletionWorker.h +2 -2
- package/src/LlamaContext.cpp +89 -27
- package/src/LlamaContext.h +2 -0
- package/src/TokenizeWorker.cpp +1 -1
- package/src/common.hpp +4 -4
- package/src/llama.cpp/.github/workflows/build.yml +240 -168
- package/src/llama.cpp/.github/workflows/docker.yml +8 -8
- package/src/llama.cpp/.github/workflows/python-lint.yml +8 -1
- package/src/llama.cpp/.github/workflows/server.yml +21 -14
- package/src/llama.cpp/CMakeLists.txt +14 -6
- package/src/llama.cpp/Sources/llama/llama.h +4 -0
- package/src/llama.cpp/cmake/arm64-apple-clang.cmake +16 -0
- package/src/llama.cpp/cmake/common.cmake +33 -0
- package/src/llama.cpp/cmake/x64-windows-llvm.cmake +11 -0
- package/src/llama.cpp/common/CMakeLists.txt +6 -4
- package/src/llama.cpp/common/arg.cpp +986 -770
- package/src/llama.cpp/common/arg.h +22 -22
- package/src/llama.cpp/common/common.cpp +212 -351
- package/src/llama.cpp/common/common.h +204 -117
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +1 -1
- package/src/llama.cpp/common/log.cpp +50 -50
- package/src/llama.cpp/common/log.h +18 -18
- package/src/llama.cpp/common/ngram-cache.cpp +36 -36
- package/src/llama.cpp/common/ngram-cache.h +19 -19
- package/src/llama.cpp/common/sampling.cpp +163 -121
- package/src/llama.cpp/common/sampling.h +41 -20
- package/src/llama.cpp/common/speculative.cpp +274 -0
- package/src/llama.cpp/common/speculative.h +28 -0
- package/src/llama.cpp/docs/build.md +134 -161
- package/src/llama.cpp/examples/CMakeLists.txt +33 -14
- package/src/llama.cpp/examples/batched/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/batched/batched.cpp +19 -18
- package/src/llama.cpp/examples/batched-bench/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +10 -11
- package/src/llama.cpp/examples/convert-llama2c-to-ggml/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +1 -1
- package/src/llama.cpp/examples/cvector-generator/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +9 -9
- package/src/llama.cpp/examples/deprecation-warning/deprecation-warning.cpp +1 -1
- package/src/llama.cpp/examples/embedding/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/embedding/embedding.cpp +12 -12
- package/src/llama.cpp/examples/eval-callback/CMakeLists.txt +3 -2
- package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +8 -8
- package/src/llama.cpp/examples/export-lora/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +5 -5
- package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +4 -7
- package/src/llama.cpp/examples/gen-docs/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +7 -7
- package/src/llama.cpp/examples/gguf/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gguf-hash/CMakeLists.txt +8 -1
- package/src/llama.cpp/examples/gguf-split/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +2 -2
- package/src/llama.cpp/examples/gritlm/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +18 -18
- package/src/llama.cpp/examples/imatrix/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +31 -13
- package/src/llama.cpp/examples/infill/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/infill/infill.cpp +41 -87
- package/src/llama.cpp/examples/llama-bench/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +439 -459
- package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +2 -0
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +11 -14
- package/src/llama.cpp/examples/llava/CMakeLists.txt +10 -3
- package/src/llama.cpp/examples/llava/clip.cpp +263 -66
- package/src/llama.cpp/examples/llava/clip.h +8 -2
- package/src/llama.cpp/examples/llava/llava-cli.cpp +23 -23
- package/src/llama.cpp/examples/llava/llava.cpp +83 -22
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +21 -21
- package/src/llama.cpp/examples/llava/qwen2vl-cli.cpp +581 -0
- package/src/llama.cpp/examples/lookahead/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +26 -26
- package/src/llama.cpp/examples/lookup/CMakeLists.txt +4 -4
- package/src/llama.cpp/examples/lookup/lookup-create.cpp +7 -7
- package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
- package/src/llama.cpp/examples/lookup/lookup-stats.cpp +16 -15
- package/src/llama.cpp/examples/lookup/lookup.cpp +30 -30
- package/src/llama.cpp/examples/main/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/main/main.cpp +73 -114
- package/src/llama.cpp/examples/main-cmake-pkg/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/parallel/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/parallel/parallel.cpp +18 -19
- package/src/llama.cpp/examples/passkey/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/passkey/passkey.cpp +14 -14
- package/src/llama.cpp/examples/perplexity/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +99 -120
- package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/quantize/quantize.cpp +0 -3
- package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +10 -9
- package/src/llama.cpp/examples/retrieval/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/retrieval/retrieval.cpp +16 -16
- package/src/llama.cpp/examples/rpc/rpc-server.cpp +3 -1
- package/src/llama.cpp/examples/run/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/run/run.cpp +911 -0
- package/src/llama.cpp/examples/save-load-state/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +38 -21
- package/src/llama.cpp/examples/server/CMakeLists.txt +3 -16
- package/src/llama.cpp/examples/server/server.cpp +2073 -1339
- package/src/llama.cpp/examples/server/tests/requirements.txt +2 -2
- package/src/llama.cpp/examples/server/utils.hpp +354 -277
- package/src/llama.cpp/examples/simple/CMakeLists.txt +2 -2
- package/src/llama.cpp/examples/simple/simple.cpp +130 -94
- package/src/llama.cpp/examples/simple-chat/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +200 -0
- package/src/llama.cpp/examples/speculative/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/speculative/speculative.cpp +68 -64
- package/src/llama.cpp/examples/speculative-simple/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +265 -0
- package/src/llama.cpp/examples/tokenize/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/tokenize/tokenize.cpp +3 -3
- package/src/llama.cpp/examples/tts/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/tts/tts.cpp +932 -0
- package/src/llama.cpp/ggml/CMakeLists.txt +54 -36
- package/src/llama.cpp/ggml/include/ggml-backend.h +63 -34
- package/src/llama.cpp/ggml/include/ggml-blas.h +5 -3
- package/src/llama.cpp/ggml/include/ggml-cann.h +9 -7
- package/src/llama.cpp/ggml/include/ggml-cpp.h +38 -0
- package/src/llama.cpp/ggml/include/ggml-cpu.h +135 -0
- package/src/llama.cpp/ggml/include/ggml-cuda.h +12 -12
- package/src/llama.cpp/ggml/include/ggml-kompute.h +7 -3
- package/src/llama.cpp/ggml/include/ggml-metal.h +11 -7
- package/src/llama.cpp/ggml/include/ggml-opencl.h +26 -0
- package/src/llama.cpp/ggml/include/ggml-opt.h +216 -0
- package/src/llama.cpp/ggml/include/ggml-rpc.h +9 -5
- package/src/llama.cpp/ggml/include/ggml-sycl.h +18 -11
- package/src/llama.cpp/ggml/include/ggml-vulkan.h +10 -8
- package/src/llama.cpp/ggml/include/ggml.h +159 -417
- package/src/llama.cpp/ggml/src/CMakeLists.txt +121 -1155
- package/src/llama.cpp/ggml/src/ggml-alloc.c +23 -28
- package/src/llama.cpp/ggml/src/ggml-backend-impl.h +57 -36
- package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +552 -0
- package/src/llama.cpp/ggml/src/ggml-backend.cpp +306 -867
- package/src/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +87 -0
- package/src/llama.cpp/ggml/src/{ggml-blas.cpp → ggml-blas/ggml-blas.cpp} +216 -65
- package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +76 -0
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +456 -111
- package/src/llama.cpp/ggml/src/ggml-cann/common.h +6 -3
- package/src/llama.cpp/ggml/src/{ggml-cann.cpp → ggml-cann/ggml-cann.cpp} +343 -177
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -5
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +22 -9
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +24 -13
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +23 -13
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +11 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +10 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +10 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +17 -0
- package/src/llama.cpp/ggml/src/ggml-common.h +42 -42
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +336 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +220 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.h +8 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/common.h +91 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp +2511 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/mmq.h +10 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +323 -0
- package/src/llama.cpp/ggml/src/{ggml-aarch64.c → ggml-cpu/ggml-cpu-aarch64.cpp} +1299 -246
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +8 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-hbm.cpp +55 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-hbm.h +8 -0
- package/src/llama.cpp/ggml/src/{ggml-cpu-impl.h → ggml-cpu/ggml-cpu-impl.h} +14 -242
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +10835 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.h +63 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-traits.cpp +36 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-traits.h +38 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +14123 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +628 -0
- package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.cpp +666 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +152 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +8 -0
- package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +104 -0
- package/src/llama.cpp/ggml/src/ggml-impl.h +393 -22
- package/src/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +166 -0
- package/src/llama.cpp/ggml/src/{ggml-kompute.cpp → ggml-kompute/ggml-kompute.cpp} +360 -127
- package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +105 -0
- package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +288 -0
- package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +107 -0
- package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +147 -0
- package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +4004 -0
- package/src/llama.cpp/ggml/src/ggml-opt.cpp +854 -0
- package/src/llama.cpp/ggml/src/ggml-quants.c +188 -10702
- package/src/llama.cpp/ggml/src/ggml-quants.h +78 -125
- package/src/llama.cpp/ggml/src/ggml-rpc/CMakeLists.txt +9 -0
- package/src/llama.cpp/ggml/src/{ggml-rpc.cpp → ggml-rpc/ggml-rpc.cpp} +478 -300
- package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +84 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +3 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +36 -5
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +259 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +3 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +5 -5
- package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +34 -35
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +1030 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +76 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +4 -4
- package/src/llama.cpp/ggml/src/{ggml-sycl.cpp → ggml-sycl/ggml-sycl.cpp} +3638 -4151
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +3 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +6 -6
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +75 -87
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +7 -6
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +56 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.hpp +11 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +6 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +4 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +7 -7
- package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +1 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +4 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +141 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +10 -0
- package/src/llama.cpp/ggml/src/ggml-threading.cpp +12 -0
- package/src/llama.cpp/ggml/src/ggml-threading.h +14 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +92 -0
- package/src/llama.cpp/ggml/src/{ggml-vulkan.cpp → ggml-vulkan/ggml-vulkan.cpp} +2138 -887
- package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/CMakeLists.txt +3 -1
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +593 -0
- package/src/llama.cpp/ggml/src/ggml.c +4427 -20125
- package/src/llama.cpp/include/llama-cpp.h +25 -0
- package/src/llama.cpp/include/llama.h +93 -52
- package/src/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.out +46 -0
- package/src/llama.cpp/pocs/CMakeLists.txt +3 -1
- package/src/llama.cpp/pocs/vdot/CMakeLists.txt +2 -2
- package/src/llama.cpp/pocs/vdot/q8dot.cpp +4 -3
- package/src/llama.cpp/pocs/vdot/vdot.cpp +8 -7
- package/src/llama.cpp/src/CMakeLists.txt +4 -8
- package/src/llama.cpp/src/llama-grammar.cpp +15 -15
- package/src/llama.cpp/src/llama-grammar.h +2 -5
- package/src/llama.cpp/src/llama-sampling.cpp +779 -194
- package/src/llama.cpp/src/llama-sampling.h +21 -2
- package/src/llama.cpp/src/llama-vocab.cpp +55 -10
- package/src/llama.cpp/src/llama-vocab.h +35 -11
- package/src/llama.cpp/src/llama.cpp +4317 -2979
- package/src/llama.cpp/src/unicode-data.cpp +2 -2
- package/src/llama.cpp/src/unicode.cpp +62 -51
- package/src/llama.cpp/src/unicode.h +9 -10
- package/src/llama.cpp/tests/CMakeLists.txt +48 -38
- package/src/llama.cpp/tests/test-arg-parser.cpp +15 -15
- package/src/llama.cpp/tests/test-backend-ops.cpp +324 -80
- package/src/llama.cpp/tests/test-barrier.cpp +1 -0
- package/src/llama.cpp/tests/test-chat-template.cpp +59 -9
- package/src/llama.cpp/tests/test-gguf.cpp +1303 -0
- package/src/llama.cpp/tests/test-grammar-integration.cpp +3 -6
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +17 -4
- package/src/llama.cpp/tests/test-llama-grammar.cpp +2 -4
- package/src/llama.cpp/tests/test-log.cpp +2 -2
- package/src/llama.cpp/tests/test-opt.cpp +853 -142
- package/src/llama.cpp/tests/test-quantize-fns.cpp +24 -21
- package/src/llama.cpp/tests/test-quantize-perf.cpp +16 -14
- package/src/llama.cpp/tests/test-rope.cpp +62 -20
- package/src/llama.cpp/tests/test-sampling.cpp +163 -138
- package/src/llama.cpp/tests/test-tokenizer-0.cpp +7 -7
- package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +5 -5
- package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +5 -5
- package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +0 -72
- package/src/llama.cpp/.github/workflows/nix-ci.yml +0 -79
- package/src/llama.cpp/.github/workflows/nix-flake-update.yml +0 -22
- package/src/llama.cpp/.github/workflows/nix-publish-flake.yml +0 -36
- package/src/llama.cpp/common/train.cpp +0 -1515
- package/src/llama.cpp/common/train.h +0 -233
- package/src/llama.cpp/examples/baby-llama/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +0 -1639
- package/src/llama.cpp/ggml/src/ggml-aarch64.h +0 -39
- package/src/llama.cpp/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp +0 -600
- package/src/llama.cpp/tests/test-grad0.cpp +0 -1683
- /package/src/llama.cpp/ggml/{cmake → src/ggml-cpu/cmake}/FindSIMD.cmake +0 -0
- /package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.h +0 -0
Diff of package/src/llama.cpp/ggml/src/ggml-backend.cpp:

```diff
@@ -8,6 +8,7 @@
 #include <windows.h>
 #endif
 
+#include "ggml-backend.h"
 #include "ggml-backend-impl.h"
 #include "ggml-alloc.h"
 #include "ggml-impl.h"
```
```diff
@@ -34,6 +35,11 @@ const char * ggml_backend_buft_name(ggml_backend_buffer_type_t buft) {
 }
 
 ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+    if (size == 0) {
+        // return a dummy buffer for zero-sized allocations
+        return ggml_backend_buffer_init(buft, {}, NULL, 0);
+    }
+
     return buft->iface.alloc_buffer(buft, size);
 }
 
```
```diff
@@ -89,7 +95,7 @@ ggml_backend_buffer_t ggml_backend_buffer_init(
 }
 
 const char * ggml_backend_buffer_name(ggml_backend_buffer_t buffer) {
-    return buffer->iface.get_name(buffer);
+    return ggml_backend_buft_name(ggml_backend_buffer_get_type(buffer));
 }
 
 void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) {
```
```diff
@@ -108,6 +114,11 @@ size_t ggml_backend_buffer_get_size(ggml_backend_buffer_t buffer) {
 }
 
 void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) {
+    // get_base is optional if the buffer is zero-sized
+    if (buffer->size == 0) {
+        return NULL;
+    }
+
     void * base = buffer->iface.get_base(buffer);
 
     GGML_ASSERT(base != NULL && "backend buffer base cannot be NULL");
```
```diff
@@ -122,6 +133,15 @@ void ggml_backend_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_t
     }
 }
 
+void ggml_backend_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
+    // clear is optional if the buffer is zero-sized
+    if (buffer->size == 0) {
+        return;
+    }
+
+    buffer->iface.clear(buffer, value);
+}
+
 size_t ggml_backend_buffer_get_alignment(ggml_backend_buffer_t buffer) {
     return ggml_backend_buft_get_alignment(ggml_backend_buffer_get_type(buffer));
 }
```
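Zero-sized buffers are now handled uniformly: allocation returns a dummy buffer, `get_base` yields NULL instead of asserting, and `clear` is a no-op. A minimal caller-side sketch of these semantics; it assumes the CPU buffer type is still reachable as `ggml_backend_cpu_buffer_type()` via the new `ggml-cpu.h` header introduced by the backend split (an assumption, not shown in this diff):

```cpp
#include "ggml-backend.h"
#include "ggml-cpu.h" // assumption: CPU backend declarations moved here in 0.3.4

int main(void) {
    ggml_backend_buffer_type_t buft = ggml_backend_cpu_buffer_type();

    // size == 0 no longer reaches buft->iface.alloc_buffer; a dummy buffer comes back
    ggml_backend_buffer_t buf = ggml_backend_buft_alloc_buffer(buft, 0);

    void * base = ggml_backend_buffer_get_base(buf); // NULL for zero-sized buffers, no assert
    ggml_backend_buffer_clear(buf, 0);               // no-op, iface.clear is never called

    ggml_backend_buffer_free(buf);
    return base == NULL ? 0 : 1;
}
```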
```diff
@@ -134,10 +154,6 @@ size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct g
     return ggml_backend_buft_get_alloc_size(ggml_backend_buffer_get_type(buffer), tensor);
 }
 
-void ggml_backend_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
-    buffer->iface.clear(buffer, value);
-}
-
 bool ggml_backend_buffer_is_host(ggml_backend_buffer_t buffer) {
     return ggml_backend_buft_is_host(ggml_backend_buffer_get_type(buffer));
 }
```
```diff
@@ -198,7 +214,7 @@ void ggml_backend_free(ggml_backend_t backend) {
 }
 
 ggml_backend_buffer_type_t ggml_backend_get_default_buffer_type(ggml_backend_t backend) {
-    return backend->iface.get_default_buffer_type(backend);
+    return ggml_backend_dev_buffer_type(backend->device);
 }
 
 ggml_backend_buffer_t ggml_backend_alloc_buffer(ggml_backend_t backend, size_t size) {
```
```diff
@@ -236,45 +252,46 @@ void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_ten
 }
 
 void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+    GGML_ASSERT(tensor);
     ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
 
+    if (size == 0) {
+        return;
+    }
+
     GGML_ASSERT(buf != NULL && "tensor buffer not set");
     GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
     GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
 
-    if (!size) {
-        return;
-    }
-
     buf->iface.set_tensor(buf, tensor, data, offset, size);
 }
 
 void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+    GGML_ASSERT(tensor);
     ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
 
+    if (size == 0) {
+        return;
+    }
+
     GGML_ASSERT(buf != NULL && "tensor buffer not set");
     GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
     GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
 
-    if (!size) {
-        return;
-    }
-
     buf->iface.get_tensor(buf, tensor, data, offset, size);
 }
 
-GGML_API void ggml_backend_tensor_memset(struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
+void ggml_backend_tensor_memset(struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
     ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
 
-    GGML_ASSERT(buf != NULL && "tensor buffer not set");
-    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
-    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
-
-    if (!size) {
+    if (size == 0) {
         return;
     }
 
-    GGML_ASSERT(buf->iface.memset_tensor != NULL && "memset not supported by backend buffer");
+    GGML_ASSERT(buf != NULL && "tensor buffer not set");
+    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
+    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
+    GGML_ASSERT(buf->iface.memset_tensor != NULL && "memset not implemented by backend buffer");
 
     buf->iface.memset_tensor(buf, tensor, value, offset, size);
 }
```
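The tensor I/O guards were reordered so that a zero-sized transfer returns before any assertion fires, and `ggml_backend_tensor_memset` now fails loudly when a buffer lacks `memset_tensor` support. A small sketch of what this permits, with `t` a hypothetical tensor:

```cpp
#include "ggml.h"
#include "ggml-backend.h"

// Sketch: zero-sized transfers are valid even before `t` is placed in a
// backend buffer, because the size check now runs before the asserts.
static void probe_tensor(struct ggml_tensor * t) {
    float x = 0.0f;
    ggml_backend_tensor_set(t, &x, 0, 0); // returns immediately, no "tensor buffer not set"
    ggml_backend_tensor_get(t, &x, 0, 0); // likewise a no-op

    // A non-zero memset on an allocated tensor now asserts with
    // "memset not implemented by backend buffer" instead of dereferencing
    // a NULL iface.memset_tensor.
    if (t->buffer != NULL) {
        ggml_backend_tensor_memset(t, 0, 0, ggml_nbytes(t));
    }
}
```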
```diff
@@ -316,33 +333,15 @@ enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct
 }
 
 bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
-
-    if (backend->device) {
-        return ggml_backend_dev_supports_op(backend->device, op);
-    }
-
-    return backend->iface.supports_op(backend, op);
+    return ggml_backend_dev_supports_op(backend->device, op);
 }
 
 bool ggml_backend_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
-
-    if (backend->device) {
-        return ggml_backend_dev_supports_buft(backend->device, buft);
-    }
-
-    return backend->iface.supports_buft(backend, buft);
+    return ggml_backend_dev_supports_buft(backend->device, buft);
 }
 
 bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op) {
-
-    if (backend->device) {
-        return ggml_backend_dev_offload_op(backend->device, op);
-    }
-
-    if (backend->iface.offload_op != NULL) {
-        return backend->iface.offload_op(backend, op);
-    }
-    return false;
+    return ggml_backend_dev_offload_op(backend->device, op);
 }
 
 ggml_backend_dev_t ggml_backend_get_device(ggml_backend_t backend) {
```
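Every backend is now expected to carry a device, so the transitional `if (backend->device)` fallbacks are gone and the backend-level queries are plain wrappers over the device interface. A sketch of the resulting equivalence:

```cpp
#include "ggml-backend.h"

// Sketch: after this change the two queries below answer identically,
// since ggml_backend_supports_op simply forwards to the device.
static bool can_run(ggml_backend_t backend, const struct ggml_tensor * op) {
    ggml_backend_dev_t dev = ggml_backend_get_device(backend);
    return ggml_backend_supports_op(backend, op)   // wrapper ...
        && ggml_backend_dev_supports_op(dev, op);  // ... over this
}
```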
```diff
@@ -379,7 +378,7 @@ void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst
         ggml_backend_tensor_get(src, dst->data, 0, ggml_nbytes(src));
     } else if (!ggml_backend_buffer_copy_tensor(src, dst)) {
 #ifndef NDEBUG
-        fprintf(stderr, "%s: warning: slow copy from %s to %s\n", __func__, ggml_backend_buffer_name(src->buffer), ggml_backend_buffer_name(dst->buffer));
+        GGML_LOG_DEBUG("%s: warning: slow copy from %s to %s\n", __func__, ggml_backend_buffer_name(src->buffer), ggml_backend_buffer_name(dst->buffer));
 #endif
         size_t nbytes = ggml_nbytes(src);
         void * data = malloc(nbytes);
```
@@ -404,837 +403,128 @@ void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t b
|
|
|
404
403
|
|
|
405
404
|
// an async copy would normally happen after all the queued operations on both backends are completed
|
|
406
405
|
// to simulate the same behavior, we need to synchronize both backends first, and do a blocking copy
|
|
407
|
-
ggml_backend_synchronize(backend_src);
|
|
408
|
-
ggml_backend_synchronize(backend_dst);
|
|
409
|
-
ggml_backend_tensor_copy(src, dst);
|
|
410
|
-
}
|
|
411
|
-
|
|
412
|
-
// events
|
|
413
|
-
|
|
414
|
-
ggml_backend_event_t ggml_backend_event_new(ggml_backend_dev_t device) {
|
|
415
|
-
// null device is allowed for the transition period to the device interface
|
|
416
|
-
if (device == NULL || device->iface.event_new == NULL) {
|
|
417
|
-
return NULL;
|
|
418
|
-
}
|
|
419
|
-
return device->iface.event_new(device);
|
|
420
|
-
}
|
|
421
|
-
|
|
422
|
-
void ggml_backend_event_free(ggml_backend_event_t event) {
|
|
423
|
-
if (event == NULL) {
|
|
424
|
-
return;
|
|
425
|
-
}
|
|
426
|
-
event->device->iface.event_free(event->device, event);
|
|
427
|
-
}
|
|
428
|
-
|
|
429
|
-
void ggml_backend_event_record(ggml_backend_event_t event, ggml_backend_t backend) {
|
|
430
|
-
GGML_ASSERT(backend->iface.event_record != NULL);
|
|
431
|
-
|
|
432
|
-
backend->iface.event_record(backend, event);
|
|
433
|
-
}
|
|
434
|
-
|
|
435
|
-
void ggml_backend_event_synchronize(ggml_backend_event_t event) {
|
|
436
|
-
GGML_ASSERT(event->device->iface.event_synchronize);
|
|
437
|
-
|
|
438
|
-
event->device->iface.event_synchronize(event->device, event);
|
|
439
|
-
}
|
|
440
|
-
|
|
441
|
-
void ggml_backend_event_wait(ggml_backend_t backend, ggml_backend_event_t event) {
|
|
442
|
-
GGML_ASSERT(backend->iface.event_wait != NULL);
|
|
443
|
-
|
|
444
|
-
backend->iface.event_wait(backend, event);
|
|
445
|
-
}
|
|
446
|
-
|
|
447
|
-
// Backend device
|
|
448
|
-
|
|
449
|
-
const char * ggml_backend_dev_name(ggml_backend_dev_t device) {
|
|
450
|
-
return device->iface.get_name(device);
|
|
451
|
-
}
|
|
452
|
-
|
|
453
|
-
const char * ggml_backend_dev_description(ggml_backend_dev_t device) {
|
|
454
|
-
return device->iface.get_description(device);
|
|
455
|
-
}
|
|
456
|
-
|
|
457
|
-
void ggml_backend_dev_memory(ggml_backend_dev_t device, size_t * free, size_t * total) {
|
|
458
|
-
device->iface.get_memory(device, free, total);
|
|
459
|
-
}
|
|
460
|
-
|
|
461
|
-
enum ggml_backend_dev_type ggml_backend_dev_type(ggml_backend_dev_t device) {
|
|
462
|
-
return device->iface.get_type(device);
|
|
463
|
-
}
|
|
464
|
-
|
|
465
|
-
void ggml_backend_dev_get_props(ggml_backend_dev_t device, struct ggml_backend_dev_props * props) {
|
|
466
|
-
device->iface.get_props(device, props);
|
|
467
|
-
}
|
|
468
|
-
|
|
469
|
-
ggml_backend_reg_t ggml_backend_dev_backend_reg(ggml_backend_dev_t device) {
|
|
470
|
-
return device->reg;
|
|
471
|
-
}
|
|
472
|
-
|
|
473
|
-
ggml_backend_t ggml_backend_dev_init(ggml_backend_dev_t device, const char * params) {
|
|
474
|
-
return device->iface.init_backend(device, params);
|
|
475
|
-
}
|
|
476
|
-
|
|
477
|
-
ggml_backend_buffer_type_t ggml_backend_dev_buffer_type(ggml_backend_dev_t device) {
|
|
478
|
-
return device->iface.get_buffer_type(device);
|
|
479
|
-
}
|
|
480
|
-
|
|
481
|
-
ggml_backend_buffer_type_t ggml_backend_dev_host_buffer_type(ggml_backend_dev_t device) {
|
|
482
|
-
return device->iface.get_host_buffer_type(device);
|
|
483
|
-
}
|
|
484
|
-
|
|
485
|
-
ggml_backend_buffer_t ggml_backend_dev_buffer_from_host_ptr(ggml_backend_dev_t device, void * ptr, size_t size, size_t max_tensor_size) {
|
|
486
|
-
return device->iface.buffer_from_host_ptr(device, ptr, size, max_tensor_size);
|
|
487
|
-
}
|
|
488
|
-
|
|
489
|
-
bool ggml_backend_dev_supports_op(ggml_backend_dev_t device, const struct ggml_tensor * op) {
|
|
490
|
-
return device->iface.supports_op(device, op);
|
|
491
|
-
}
|
|
492
|
-
|
|
493
|
-
bool ggml_backend_dev_supports_buft(ggml_backend_dev_t device, ggml_backend_buffer_type_t buft) {
|
|
494
|
-
return device->iface.supports_buft(device, buft);
|
|
495
|
-
}
|
|
496
|
-
|
|
497
|
-
bool ggml_backend_dev_offload_op(ggml_backend_dev_t device, const struct ggml_tensor * op) {
|
|
498
|
-
return device->iface.offload_op(device, op);
|
|
499
|
-
}
|
|
500
|
-
|
|
501
|
-
// Backend (reg)
|
|
502
|
-
|
|
503
|
-
const char * ggml_backend_reg_name(ggml_backend_reg_t reg) {
|
|
504
|
-
return reg->iface.get_name(reg);
|
|
505
|
-
}
|
|
506
|
-
|
|
507
|
-
size_t ggml_backend_reg_dev_count(ggml_backend_reg_t reg) {
|
|
508
|
-
return reg->iface.get_device_count(reg);
|
|
509
|
-
}
|
|
510
|
-
|
|
511
|
-
ggml_backend_dev_t ggml_backend_reg_dev_get(ggml_backend_reg_t reg, size_t index) {
|
|
512
|
-
return reg->iface.get_device(reg, index);
|
|
513
|
-
}
|
|
514
|
-
|
|
515
|
-
void * ggml_backend_reg_get_proc_address(ggml_backend_reg_t reg, const char * name) {
|
|
516
|
-
if (!reg->iface.get_proc_address) {
|
|
517
|
-
return NULL;
|
|
518
|
-
}
|
|
519
|
-
return reg->iface.get_proc_address(reg, name);
|
|
520
|
-
}
|
|
521
|
-
|
|
522
|
-
// Backend registry
|
|
523
|
-
|
|
524
|
-
#ifdef GGML_USE_CUDA
|
|
525
|
-
#include "ggml-cuda.h"
|
|
526
|
-
#endif
|
|
527
|
-
|
|
528
|
-
struct ggml_backend_registry {
|
|
529
|
-
std::vector<ggml_backend_reg_t> backends;
|
|
530
|
-
std::vector<ggml_backend_dev_t> devices;
|
|
531
|
-
|
|
532
|
-
ggml_backend_registry() {
|
|
533
|
-
#ifdef GGML_USE_CUDA
|
|
534
|
-
register_backend(ggml_backend_cuda_reg());
|
|
535
|
-
#endif
|
|
536
|
-
|
|
537
|
-
register_backend(ggml_backend_cpu_reg());
|
|
538
|
-
|
|
539
|
-
// TODO: sycl, metal, vulkan, kompute, cann
|
|
540
|
-
}
|
|
541
|
-
|
|
542
|
-
void register_backend(ggml_backend_reg_t reg) {
|
|
543
|
-
#ifndef NDEBUG
|
|
544
|
-
fprintf(stderr, "%s: registered backend %s (%zu devices)\n",
|
|
545
|
-
__func__, ggml_backend_reg_name(reg), ggml_backend_reg_dev_count(reg));
|
|
546
|
-
#endif
|
|
547
|
-
backends.push_back(reg);
|
|
548
|
-
for (size_t i = 0; i < ggml_backend_reg_dev_count(reg); i++) {
|
|
549
|
-
register_device(ggml_backend_reg_dev_get(reg, i));
|
|
550
|
-
}
|
|
551
|
-
}
|
|
552
|
-
|
|
553
|
-
void register_device(ggml_backend_dev_t device) {
|
|
554
|
-
#ifndef NDEBUG
|
|
555
|
-
fprintf(stderr, "%s: registered device %s (%s)\n", __func__, ggml_backend_dev_name(device), ggml_backend_dev_description(device));
|
|
556
|
-
#endif
|
|
557
|
-
devices.push_back(device);
|
|
558
|
-
}
|
|
559
|
-
};
|
|
560
|
-
|
|
561
|
-
static ggml_backend_registry & get_reg() {
|
|
562
|
-
static ggml_backend_registry reg;
|
|
563
|
-
return reg;
|
|
564
|
-
}
|
|
565
|
-
|
|
566
|
-
// Internal API
|
|
567
|
-
void ggml_backend_register(ggml_backend_reg_t reg) {
|
|
568
|
-
get_reg().register_backend(reg);
|
|
569
|
-
}
|
|
570
|
-
|
|
571
|
-
void ggml_backend_device_register(ggml_backend_dev_t device) {
|
|
572
|
-
get_reg().register_device(device);
|
|
573
|
-
}
|
|
574
|
-
|
|
575
|
-
// Backend (reg) enumeration
|
|
576
|
-
size_t ggml_backend_reg_count() {
|
|
577
|
-
return get_reg().backends.size();
|
|
578
|
-
}
|
|
579
|
-
|
|
580
|
-
ggml_backend_reg_t ggml_backend_reg_get(size_t index) {
|
|
581
|
-
GGML_ASSERT(index < ggml_backend_reg_count());
|
|
582
|
-
return get_reg().backends[index];
|
|
583
|
-
}
|
|
584
|
-
|
|
585
|
-
ggml_backend_reg_t ggml_backend_reg_by_name(const char * name) {
|
|
586
|
-
for (size_t i = 0; i < ggml_backend_reg_count(); i++) {
|
|
587
|
-
ggml_backend_reg_t reg = ggml_backend_reg_get(i);
|
|
588
|
-
if (strcmp(ggml_backend_reg_name(reg), name) == 0) {
|
|
589
|
-
return reg;
|
|
590
|
-
}
|
|
591
|
-
}
|
|
592
|
-
return NULL;
|
|
593
|
-
}
|
|
594
|
-
|
|
595
|
-
// Device enumeration
|
|
596
|
-
size_t ggml_backend_dev_count() {
|
|
597
|
-
return get_reg().devices.size();
|
|
598
|
-
}
|
|
599
|
-
|
|
600
|
-
ggml_backend_dev_t ggml_backend_dev_get(size_t index) {
|
|
601
|
-
GGML_ASSERT(index < ggml_backend_dev_count());
|
|
602
|
-
return get_reg().devices[index];
|
|
603
|
-
}
|
|
604
|
-
|
|
605
|
-
ggml_backend_dev_t ggml_backend_dev_by_name(const char * name) {
|
|
606
|
-
for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
|
|
607
|
-
ggml_backend_dev_t dev = ggml_backend_dev_get(i);
|
|
608
|
-
if (strcmp(ggml_backend_dev_name(dev), name) == 0) {
|
|
609
|
-
return dev;
|
|
610
|
-
}
|
|
611
|
-
}
|
|
612
|
-
return NULL;
|
|
613
|
-
}
|
|
614
|
-
|
|
615
|
-
ggml_backend_dev_t ggml_backend_dev_by_type(enum ggml_backend_dev_type type) {
|
|
616
|
-
for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
|
|
617
|
-
ggml_backend_dev_t dev = ggml_backend_dev_get(i);
|
|
618
|
-
if (ggml_backend_dev_type(dev) == type) {
|
|
619
|
-
return dev;
|
|
620
|
-
}
|
|
621
|
-
}
|
|
622
|
-
return NULL;
|
|
623
|
-
}
|
|
624
|
-
|
|
625
|
-
// Convenience functions
|
|
626
|
-
ggml_backend_t ggml_backend_init_by_name(const char * name, const char * params) {
|
|
627
|
-
ggml_backend_dev_t dev = ggml_backend_dev_by_name(name);
|
|
628
|
-
if (!dev) {
|
|
629
|
-
return NULL;
|
|
630
|
-
}
|
|
631
|
-
return ggml_backend_dev_init(dev, params);
|
|
632
|
-
}
|
|
633
|
-
|
|
634
|
-
ggml_backend_t ggml_backend_init_by_type(enum ggml_backend_dev_type type, const char * params) {
|
|
635
|
-
ggml_backend_dev_t dev = ggml_backend_dev_by_type(type);
|
|
636
|
-
if (!dev) {
|
|
637
|
-
return NULL;
|
|
638
|
-
}
|
|
639
|
-
return ggml_backend_dev_init(dev, params);
|
|
640
|
-
}
|
|
641
|
-
|
|
642
|
-
ggml_backend_t ggml_backend_init_best(void) {
|
|
643
|
-
ggml_backend_dev_t dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU_FULL);
|
|
644
|
-
if (!dev) {
|
|
645
|
-
dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU_FULL);
|
|
646
|
-
}
|
|
647
|
-
if (!dev) {
|
|
648
|
-
return NULL;
|
|
649
|
-
}
|
|
650
|
-
return ggml_backend_dev_init(dev, NULL);
|
|
651
|
-
}
|
|
652
|
-
|
|
653
|
-
// backend CPU
|
|
654
|
-
|
|
655
|
-
static const size_t TENSOR_ALIGNMENT = 32; // required for mmap as gguf only guarantees 32-byte alignment
|
|
656
|
-
|
|
657
|
-
static const char * ggml_backend_cpu_buffer_get_name(ggml_backend_buffer_t buffer) {
|
|
658
|
-
return "CPU";
|
|
659
|
-
|
|
660
|
-
GGML_UNUSED(buffer);
|
|
661
|
-
}
|
|
662
|
-
|
|
663
|
-
static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
|
|
664
|
-
uintptr_t data = (uintptr_t)buffer->context;
|
|
665
|
-
|
|
666
|
-
// align the buffer
|
|
667
|
-
if (data % TENSOR_ALIGNMENT != 0) {
|
|
668
|
-
data = GGML_PAD(data, TENSOR_ALIGNMENT);
|
|
669
|
-
}
|
|
670
|
-
|
|
671
|
-
return (void *)data;
|
|
672
|
-
}
|
|
673
|
-
|
|
674
|
-
static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
|
675
|
-
free(buffer->context);
|
|
676
|
-
}
|
|
677
|
-
|
|
678
|
-
static void ggml_backend_cpu_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
|
|
679
|
-
memset((char *)tensor->data + offset, value, size);
|
|
680
|
-
|
|
681
|
-
GGML_UNUSED(buffer);
|
|
682
|
-
}
|
|
683
|
-
|
|
684
|
-
static void ggml_backend_cpu_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
|
685
|
-
memcpy((char *)tensor->data + offset, data, size);
|
|
686
|
-
|
|
687
|
-
GGML_UNUSED(buffer);
|
|
688
|
-
}
|
|
689
|
-
|
|
690
|
-
static void ggml_backend_cpu_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
|
691
|
-
memcpy(data, (const char *)tensor->data + offset, size);
|
|
692
|
-
|
|
693
|
-
GGML_UNUSED(buffer);
|
|
694
|
-
}
|
|
695
|
-
|
|
696
|
-
static bool ggml_backend_cpu_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) {
|
|
697
|
-
if (ggml_backend_buffer_is_host(src->buffer)) {
|
|
698
|
-
memcpy(dst->data, src->data, ggml_nbytes(src));
|
|
699
|
-
return true;
|
|
700
|
-
}
|
|
701
|
-
return false;
|
|
702
|
-
|
|
703
|
-
GGML_UNUSED(buffer);
|
|
704
|
-
}
|
|
705
|
-
|
|
706
|
-
static void ggml_backend_cpu_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
|
|
707
|
-
memset(buffer->context, value, buffer->size);
|
|
708
|
-
}
|
|
709
|
-
|
|
710
|
-
static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_i = {
|
|
711
|
-
/* .get_name = */ ggml_backend_cpu_buffer_get_name,
|
|
712
|
-
/* .free_buffer = */ ggml_backend_cpu_buffer_free_buffer,
|
|
713
|
-
/* .get_base = */ ggml_backend_cpu_buffer_get_base,
|
|
714
|
-
/* .init_tensor = */ NULL, // no initialization required
|
|
715
|
-
/* .memset_tensor = */ ggml_backend_cpu_buffer_memset_tensor,
|
|
716
|
-
/* .set_tensor = */ ggml_backend_cpu_buffer_set_tensor,
|
|
717
|
-
/* .get_tensor = */ ggml_backend_cpu_buffer_get_tensor,
|
|
718
|
-
/* .cpy_tensor = */ ggml_backend_cpu_buffer_cpy_tensor,
|
|
719
|
-
/* .clear = */ ggml_backend_cpu_buffer_clear,
|
|
720
|
-
/* .reset = */ NULL,
|
|
721
|
-
};
|
|
722
|
-
|
|
723
|
-
static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_from_ptr_i = {
|
|
724
|
-
/* .get_name = */ ggml_backend_cpu_buffer_get_name,
|
|
725
|
-
/* .free_buffer = */ NULL, // ptr is not owned by the buffer, so it does not need to be freed
|
|
726
|
-
/* .get_base = */ ggml_backend_cpu_buffer_get_base,
|
|
727
|
-
/* .init_tensor = */ NULL, // no initialization required
|
|
728
|
-
/* .memset_tensor = */ ggml_backend_cpu_buffer_memset_tensor,
|
|
729
|
-
/* .set_tensor = */ ggml_backend_cpu_buffer_set_tensor,
|
|
730
|
-
/* .get_tensor = */ ggml_backend_cpu_buffer_get_tensor,
|
|
731
|
-
/* .cpy_tensor = */ ggml_backend_cpu_buffer_cpy_tensor,
|
|
732
|
-
/* .clear = */ ggml_backend_cpu_buffer_clear,
|
|
733
|
-
/* .reset = */ NULL,
|
|
734
|
-
};
|
|
735
|
-
|
|
736
|
-
static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
|
|
737
|
-
return "CPU";
|
|
738
|
-
|
|
739
|
-
GGML_UNUSED(buft);
|
|
740
|
-
}
|
|
741
|
-
|
|
742
|
-
static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
|
743
|
-
size += TENSOR_ALIGNMENT; // malloc may return an address that is not aligned
|
|
744
|
-
void * data = malloc(size); // TODO: use GGML_ALIGNED_MALLOC (move to ggml-impl.h)
|
|
745
|
-
if (data == NULL) {
|
|
746
|
-
fprintf(stderr, "%s: failed to allocate buffer of size %zu\n", __func__, size);
|
|
747
|
-
return NULL;
|
|
748
|
-
}
|
|
749
|
-
|
|
750
|
-
return ggml_backend_buffer_init(buft, ggml_backend_cpu_buffer_i, data, size);
|
|
751
|
-
}
|
|
752
|
-
|
|
753
|
-
static size_t ggml_backend_cpu_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
|
|
754
|
-
return TENSOR_ALIGNMENT;
|
|
755
|
-
|
|
756
|
-
GGML_UNUSED(buft);
|
|
757
|
-
}
|
|
758
|
-
|
|
759
|
-
static bool ggml_backend_cpu_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
|
|
760
|
-
return true;
|
|
761
|
-
|
|
762
|
-
GGML_UNUSED(buft);
|
|
763
|
-
}
|
|
764
|
-
|
|
765
|
-
ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
|
|
766
|
-
static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type = {
|
|
767
|
-
/* .iface = */ {
|
|
768
|
-
/* .get_name = */ ggml_backend_cpu_buffer_type_get_name,
|
|
769
|
-
/* .alloc_buffer = */ ggml_backend_cpu_buffer_type_alloc_buffer,
|
|
770
|
-
/* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
|
|
771
|
-
/* .get_max_size = */ NULL, // defaults to SIZE_MAX
|
|
772
|
-
/* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
|
|
773
|
-
/* .is_host = */ ggml_backend_cpu_buffer_type_is_host,
|
|
774
|
-
},
|
|
775
|
-
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
|
|
776
|
-
/* .context = */ NULL,
|
|
777
|
-
};
|
|
778
|
-
|
|
779
|
-
return &ggml_backend_cpu_buffer_type;
|
|
780
|
-
}
|
|
781
|
-
|
|
782
|
-
#ifdef GGML_USE_CPU_HBM
|
|
783
|
-
|
|
784
|
-
// buffer type HBM
|
|
785
|
-
|
|
786
|
-
#include <hbwmalloc.h>
|
|
787
|
-
|
|
788
|
-
static const char * ggml_backend_cpu_hbm_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
|
|
789
|
-
return "CPU_HBM";
|
|
790
|
-
|
|
791
|
-
GGML_UNUSED(buft);
|
|
792
|
-
}
|
|
793
|
-
|
|
794
|
-
static const char * ggml_backend_cpu_hbm_buffer_get_name(ggml_backend_buffer_t buf) {
|
|
795
|
-
return "CPU_HBM";
|
|
796
|
-
|
|
797
|
-
GGML_UNUSED(buf);
|
|
798
|
-
}
|
|
799
|
-
|
|
800
|
-
static void ggml_backend_cpu_hbm_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
|
801
|
-
hbw_free(buffer->context);
|
|
802
|
-
}
|
|
803
|
-
|
|
804
|
-
static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
|
805
|
-
//void * ptr = hbw_malloc(size);
|
|
806
|
-
void * ptr;
|
|
807
|
-
int result = hbw_posix_memalign(&ptr, ggml_backend_cpu_buffer_type_get_alignment(buft), size);
|
|
808
|
-
if (result != 0) {
|
|
809
|
-
fprintf(stderr, "failed to allocate HBM buffer of size %zu\n", size);
|
|
810
|
-
return NULL;
|
|
811
|
-
}
|
|
812
|
-
|
|
813
|
-
ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
|
|
814
|
-
buffer->buft = buft;
|
|
815
|
-
buffer->iface.get_name = ggml_backend_cpu_hbm_buffer_get_name;
|
|
816
|
-
buffer->iface.free_buffer = ggml_backend_cpu_hbm_buffer_free_buffer;
|
|
817
|
-
|
|
818
|
-
return buffer;
|
|
819
|
-
}
|
|
820
|
-
|
|
821
|
-
ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void) {
|
|
822
|
-
static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type_hbm = {
|
|
823
|
-
/* .iface = */ {
|
|
824
|
-
/* .get_name = */ ggml_backend_cpu_hbm_buffer_type_get_name,
|
|
825
|
-
/* .alloc_buffer = */ ggml_backend_cpu_hbm_buffer_type_alloc_buffer,
|
|
826
|
-
/* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
|
|
827
|
-
/* .get_max_size = */ NULL, // defaults to SIZE_MAX
|
|
828
|
-
/* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
|
|
829
|
-
/* .is_host = */ ggml_backend_cpu_buffer_type_is_host,
|
|
830
|
-
},
|
|
831
|
-
/* .context = */ NULL,
|
|
832
|
-
};
|
|
833
|
-
|
|
834
|
-
return &ggml_backend_cpu_buffer_type_hbm;
|
|
835
|
-
}
|
|
836
|
-
#endif
|
|
837
|
-
|
|
838
|
-
struct ggml_backend_cpu_context {
|
|
839
|
-
int n_threads;
|
|
840
|
-
ggml_threadpool_t threadpool;
|
|
841
|
-
|
|
842
|
-
uint8_t * work_data;
|
|
843
|
-
size_t work_size;
|
|
844
|
-
|
|
845
|
-
ggml_abort_callback abort_callback;
|
|
846
|
-
void * abort_callback_data;
|
|
847
|
-
};
|
|
848
|
-
|
|
849
|
-
static const char * ggml_backend_cpu_get_name(ggml_backend_t backend) {
|
|
850
|
-
return "CPU";
|
|
851
|
-
|
|
852
|
-
GGML_UNUSED(backend);
|
|
853
|
-
}
|
|
854
|
-
|
|
855
|
-
static void ggml_backend_cpu_free(ggml_backend_t backend) {
|
|
856
|
-
struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
|
|
857
|
-
delete[] cpu_ctx->work_data;
|
|
858
|
-
delete cpu_ctx;
|
|
859
|
-
delete backend;
|
|
860
|
-
}
|
|
861
|
-
|
|
862
|
-
static ggml_backend_buffer_type_t ggml_backend_cpu_get_default_buffer_type(ggml_backend_t backend) {
|
|
863
|
-
return ggml_backend_cpu_buffer_type();
|
|
864
|
-
|
|
865
|
-
GGML_UNUSED(backend);
|
|
866
|
-
}
|
|
867
|
-
|
|
868
|
-
struct ggml_backend_plan_cpu {
|
|
869
|
-
struct ggml_cplan cplan;
|
|
870
|
-
struct ggml_cgraph cgraph;
|
|
871
|
-
};
|
|
872
|
-
|
|
873
|
-
static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend_t backend, const struct ggml_cgraph * cgraph) {
|
|
874
|
-
struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
|
|
875
|
-
|
|
876
|
-
struct ggml_backend_plan_cpu * cpu_plan = new ggml_backend_plan_cpu;
|
|
877
|
-
|
|
878
|
-
cpu_plan->cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);
|
|
879
|
-
cpu_plan->cgraph = *cgraph; // FIXME: deep copy
|
|
880
|
-
|
|
881
|
-
if (cpu_plan->cplan.work_size > 0) {
|
|
882
|
-
cpu_plan->cplan.work_data = new uint8_t[cpu_plan->cplan.work_size];
|
|
883
|
-
if (cpu_plan->cplan.work_data == NULL) {
|
|
884
|
-
delete cpu_plan;
|
|
885
|
-
return NULL;
|
|
886
|
-
}
|
|
887
|
-
}
|
|
888
|
-
|
|
889
|
-
cpu_plan->cplan.abort_callback = cpu_ctx->abort_callback;
|
|
890
|
-
cpu_plan->cplan.abort_callback_data = cpu_ctx->abort_callback_data;
|
|
891
|
-
|
|
892
|
-
return cpu_plan;
|
|
893
|
-
}
|
|
894
|
-
|
|
895
|
-
static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
|
|
896
|
-
struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan;
|
|
897
|
-
|
|
898
|
-
delete[] cpu_plan->cplan.work_data;
|
|
899
|
-
delete cpu_plan;
|
|
900
|
-
|
|
901
|
-
GGML_UNUSED(backend);
|
|
902
|
-
}
|
|
903
|
-
|
|
904
|
-
static enum ggml_status ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
|
|
905
|
-
struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan;
|
|
906
|
-
|
|
907
|
-
return ggml_graph_compute(&cpu_plan->cgraph, &cpu_plan->cplan);
|
|
908
|
-
|
|
909
|
-
GGML_UNUSED(backend);
|
|
910
|
-
}
|
|
911
|
-
|
|
912
|
-
static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
|
|
913
|
-
struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
|
|
914
|
-
|
|
915
|
-
struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);
|
|
916
|
-
|
|
917
|
-
if (cpu_ctx->work_size < cplan.work_size) {
|
|
918
|
-
delete[] cpu_ctx->work_data;
|
|
919
|
-
cpu_ctx->work_data = new uint8_t[cplan.work_size];
|
|
920
|
-
if (cpu_ctx->work_data == NULL) {
|
|
921
|
-
cpu_ctx->work_size = 0;
|
|
922
|
-
return GGML_STATUS_ALLOC_FAILED;
|
|
923
|
-
}
|
|
924
|
-
cpu_ctx->work_size = cplan.work_size;
|
|
925
|
-
}
|
|
926
|
-
cplan.work_data = (uint8_t *)cpu_ctx->work_data;
|
|
927
|
-
|
|
928
|
-
cplan.abort_callback = cpu_ctx->abort_callback;
|
|
929
|
-
cplan.abort_callback_data = cpu_ctx->abort_callback_data;
|
|
930
|
-
|
|
931
|
-
return ggml_graph_compute(cgraph, &cplan);
|
|
932
|
-
}
|
|
933
|
-
|
|
934
|
-
static const struct ggml_backend_i ggml_backend_cpu_i = {
|
|
935
|
-
/* .get_name = */ ggml_backend_cpu_get_name,
|
|
936
|
-
/* .free = */ ggml_backend_cpu_free,
|
|
937
|
-
/* .get_default_buffer_type = */ ggml_backend_cpu_get_default_buffer_type,
|
|
938
|
-
/* .set_tensor_async = */ NULL,
|
|
939
|
-
/* .get_tensor_async = */ NULL,
|
|
940
|
-
/* .cpy_tensor_async = */ NULL,
|
|
941
|
-
/* .synchronize = */ NULL,
|
|
942
|
-
/* .graph_plan_create = */ ggml_backend_cpu_graph_plan_create,
|
|
943
|
-
/* .graph_plan_free = */ ggml_backend_cpu_graph_plan_free,
|
|
944
|
-
/* .graph_plan_update = */ NULL,
|
|
945
|
-
/* .graph_plan_compute = */ ggml_backend_cpu_graph_plan_compute,
|
|
946
|
-
/* .graph_compute = */ ggml_backend_cpu_graph_compute,
|
|
947
|
-
/* .supports_op = */ NULL,
|
|
948
|
-
/* .supports_buft = */ NULL,
|
|
949
|
-
/* .offload_op = */ NULL,
|
|
950
|
-
/* .event_record = */ NULL,
|
|
951
|
-
/* .event_wait = */ NULL,
|
|
952
|
-
};
|
|
953
|
-
|
|
954
|
-
static ggml_guid_t ggml_backend_cpu_guid(void) {
|
|
955
|
-
static ggml_guid guid = { 0xaa, 0x67, 0xc7, 0x43, 0x96, 0xe6, 0xa3, 0x8a, 0xe3, 0xaf, 0xea, 0x92, 0x36, 0xbc, 0xfc, 0x89 };
|
|
956
|
-
return &guid;
|
|
957
|
-
}
|
|
958
|
-
|
|
959
|
-
ggml_backend_t ggml_backend_cpu_init(void) {
|
|
960
|
-
struct ggml_backend_cpu_context * ctx = new ggml_backend_cpu_context;
|
|
961
|
-
if (ctx == NULL) {
|
|
962
|
-
return NULL;
|
|
963
|
-
}
|
|
964
|
-
|
|
965
|
-
ctx->n_threads = GGML_DEFAULT_N_THREADS;
|
|
966
|
-
ctx->threadpool = NULL;
|
|
967
|
-
ctx->work_data = NULL;
|
|
968
|
-
ctx->work_size = 0;
|
|
969
|
-
ctx->abort_callback = NULL;
|
|
970
|
-
ctx->abort_callback_data = NULL;
|
|
971
|
-
|
|
972
|
-
ggml_backend_t cpu_backend = new ggml_backend {
|
|
973
|
-
/* .guid = */ ggml_backend_cpu_guid(),
|
|
974
|
-
/* .interface = */ ggml_backend_cpu_i,
|
|
975
|
-
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
|
|
976
|
-
/* .context = */ ctx,
|
|
977
|
-
};
|
|
978
|
-
|
|
979
|
-
if (cpu_backend == NULL) {
|
|
980
|
-
delete ctx;
|
|
981
|
-
return NULL;
|
|
982
|
-
}
|
|
983
|
-
|
|
984
|
-
return cpu_backend;
|
|
985
|
-
}
|
|
986
|
-
|
|
987
|
-
bool ggml_backend_is_cpu(ggml_backend_t backend) {
|
|
988
|
-
return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_cpu_guid());
|
|
989
|
-
}
|
|
990
|
-
|
|
991
|
-
void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) {
|
|
992
|
-
GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));
|
|
993
|
-
|
|
994
|
-
struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context;
|
|
995
|
-
ctx->n_threads = n_threads;
|
|
996
|
-
}
|
|
997
|
-
|
|
998
|
-
void ggml_backend_cpu_set_threadpool(ggml_backend_t backend_cpu, ggml_threadpool_t threadpool) {
|
|
999
|
-
GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));
|
|
1000
|
-
|
|
1001
|
-
struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context;
|
|
1002
|
-
|
|
1003
|
-
if (ctx->threadpool && ctx->threadpool != threadpool) {
|
|
1004
|
-
// already had a different threadpool, pause/suspend it before switching
|
|
1005
|
-
ggml_threadpool_pause(ctx->threadpool);
|
|
1006
|
-
}
|
|
1007
|
-
ctx->threadpool = threadpool;
|
|
1008
|
-
}
|
|
1009
|
-
|
|
1010
|
-
void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data) {
|
|
1011
|
-
GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));
|
|
1012
|
-
|
|
1013
|
-
struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context;
|
|
1014
|
-
ctx->abort_callback = abort_callback;
|
|
1015
|
-
ctx->abort_callback_data = abort_callback_data;
|
|
1016
|
-
}
|
|
1017
|
-
|
|
1018
|
-
ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) {
|
|
1019
|
-
GGML_ASSERT((uintptr_t)ptr % TENSOR_ALIGNMENT == 0 && "buffer pointer must be aligned");
|
|
1020
|
-
return ggml_backend_buffer_init(ggml_backend_cpu_buffer_type(), ggml_backend_cpu_buffer_from_ptr_i, ptr, size);
|
|
406
|
+
ggml_backend_synchronize(backend_src);
|
|
407
|
+
ggml_backend_synchronize(backend_dst);
|
|
408
|
+
ggml_backend_tensor_copy(src, dst);
|
|
1021
409
|
}
|
|
1022
410
|
|
|
1023
|
-
|
|
411
|
+
// events
|
|
1024
412
|
|
|
1025
|
-
|
|
1026
|
-
|
|
413
|
+
ggml_backend_event_t ggml_backend_event_new(ggml_backend_dev_t device) {
|
|
414
|
+
// null device is allowed for the transition period to the device interface
|
|
415
|
+
if (device == NULL || device->iface.event_new == NULL) {
|
|
416
|
+
return NULL;
|
|
417
|
+
}
|
|
418
|
+
return device->iface.event_new(device);
|
|
419
|
+
}
|
|
1027
420
|
|
|
1028
|
-
|
|
1029
|
-
|
|
1030
|
-
|
|
1031
|
-
if (!sysctlbyname("machdep.cpu.brand_string", NULL, &len, NULL, 0)) {
|
|
1032
|
-
description.resize(len);
|
|
1033
|
-
sysctlbyname("machdep.cpu.brand_string", &description[0], &len, NULL, 0); // NOLINT
|
|
1034
|
-
}
|
|
1035
|
-
#elif defined(__linux__)
|
|
1036
|
-
FILE * f = fopen("/proc/cpuinfo", "r");
|
|
1037
|
-
if (f) {
|
|
1038
|
-
char buf[1024];
|
|
1039
|
-
while (fgets(buf, sizeof(buf), f)) {
|
|
1040
|
-
if (strncmp(buf, "model name", 10) == 0) {
|
|
1041
|
-
char * p = strchr(buf, ':');
|
|
1042
|
-
if (p) {
|
|
1043
|
-
p++;
|
|
1044
|
-
while (std::isspace(*p)) {
|
|
1045
|
-
p++;
|
|
1046
|
-
}
|
|
1047
|
-
while (std::isspace(p[strlen(p) - 1])) {
|
|
1048
|
-
p[strlen(p) - 1] = '\0';
|
|
1049
|
-
}
|
|
1050
|
-
description = p;
|
|
1051
|
-
break;
|
|
1052
|
-
}
|
|
1053
|
-
}
|
|
1054
|
-
}
|
|
1055
|
-
fclose(f);
|
|
1056
|
-
}
|
|
1057
|
-
#elif defined(_WIN32)
|
|
1058
|
-
HKEY hKey;
|
|
1059
|
-
if (RegOpenKeyEx(HKEY_LOCAL_MACHINE,
|
|
1060
|
-
TEXT("HARDWARE\\DESCRIPTION\\System\\CentralProcessor\\0"),
|
|
1061
|
-
0,
|
|
1062
|
-
KEY_READ,
|
|
1063
|
-
&hKey) == ERROR_SUCCESS) {
|
|
1064
|
-
DWORD cpu_brand_size = 0;
|
|
1065
|
-
if (RegQueryValueExA(hKey,
|
|
1066
|
-
TEXT("ProcessorNameString"),
|
|
1067
|
-
NULL,
|
|
1068
|
-
NULL,
|
|
1069
|
-
NULL,
|
|
1070
|
-
&cpu_brand_size) == ERROR_SUCCESS) {
|
|
1071
|
-
description.resize(cpu_brand_size);
|
|
1072
|
-
if (RegQueryValueExA(hKey,
|
|
1073
|
-
TEXT("ProcessorNameString"),
|
|
1074
|
-
NULL,
|
|
1075
|
-
NULL,
|
|
1076
|
-
(LPBYTE)&description[0], // NOLINT
|
|
1077
|
-
&cpu_brand_size) == ERROR_SUCCESS) {
|
|
1078
|
-
if (description.find('\0') != std::string::npos) {
|
|
1079
|
-
description.resize(description.find('\0'));
|
|
1080
|
-
}
|
|
1081
|
-
}
|
|
1082
|
-
}
|
|
1083
|
-
RegCloseKey(hKey);
|
|
1084
|
-
}
|
|
1085
|
-
#endif
|
|
421
|
+
void ggml_backend_event_free(ggml_backend_event_t event) {
|
|
422
|
+
if (event == NULL) {
|
|
423
|
+
return;
|
|
1086
424
|
}
|
|
1087
|
-
|
|
425
|
+
event->device->iface.event_free(event->device, event);
|
|
426
|
+
}
|
|
1088
427
|
|
|
1089
|
-
|
|
1090
|
-
|
|
428
|
+
void ggml_backend_event_record(ggml_backend_event_t event, ggml_backend_t backend) {
|
|
429
|
+
GGML_ASSERT(backend->iface.event_record != NULL);
|
|
1091
430
|
|
|
1092
|
-
|
|
431
|
+
backend->iface.event_record(backend, event);
|
|
1093
432
|
}
|
|
1094
433
|
|
|
1095
|
-
|
|
1096
|
-
|
|
434
|
+
void ggml_backend_event_synchronize(ggml_backend_event_t event) {
|
|
435
|
+
GGML_ASSERT(event->device->iface.event_synchronize);
|
|
1097
436
|
|
|
1098
|
-
|
|
437
|
+
event->device->iface.event_synchronize(event->device, event);
|
|
1099
438
|
}
|
|
1100
439
|
|
|
1101
|
-
|
|
1102
|
-
|
|
1103
|
-
*free = 0;
|
|
1104
|
-
*total = 0;
|
|
440
|
+
void ggml_backend_event_wait(ggml_backend_t backend, ggml_backend_event_t event) {
|
|
441
|
+
GGML_ASSERT(backend->iface.event_wait != NULL);
|
|
1105
442
|
|
|
1106
|
-
|
|
443
|
+
backend->iface.event_wait(backend, event);
|
|
1107
444
|
}
|
|
1108
445
|
|
|
-
-    return GGML_BACKEND_DEVICE_TYPE_CPU_FULL;
+// Backend device

-
+const char * ggml_backend_dev_name(ggml_backend_dev_t device) {
+    return device->iface.get_name(device);
 }

-
-
-    props->description = ggml_backend_cpu_device_get_description(dev);
-    props->type = ggml_backend_cpu_device_get_type(dev);
-    ggml_backend_cpu_device_get_memory(dev, &props->memory_free, &props->memory_total);
-    props->caps = {
-        /* async */ false,
-        /* host_buffer */ false,
-        /* events */ false,
-    };
+const char * ggml_backend_dev_description(ggml_backend_dev_t device) {
+    return device->iface.get_description(device);
 }

-
-
+void ggml_backend_dev_memory(ggml_backend_dev_t device, size_t * free, size_t * total) {
+    device->iface.get_memory(device, free, total);
+}

-
-
+enum ggml_backend_dev_type ggml_backend_dev_type(ggml_backend_dev_t device) {
+    return device->iface.get_type(device);
 }

-
-
+void ggml_backend_dev_get_props(ggml_backend_dev_t device, struct ggml_backend_dev_props * props) {
+    memset(props, 0, sizeof(*props));
+    device->iface.get_props(device, props);
+}

-
+ggml_backend_reg_t ggml_backend_dev_backend_reg(ggml_backend_dev_t device) {
+    return device->reg;
 }

-
-    return
+ggml_backend_t ggml_backend_dev_init(ggml_backend_dev_t device, const char * params) {
+    return device->iface.init_backend(device, params);
+}

-
-
+ggml_backend_buffer_type_t ggml_backend_dev_buffer_type(ggml_backend_dev_t device) {
+    return device->iface.get_buffer_type(device);
 }

-
-
-
-            return
-                op->type != GGML_TYPE_IQ2_XXS &&
-                op->type != GGML_TYPE_IQ2_XS &&
-                op->type != GGML_TYPE_IQ1_S &&
-                op->type != GGML_TYPE_IQ1_M; // missing type_traits.from_float
-        case GGML_OP_MUL_MAT:
-            return op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == ggml_internal_get_type_traits(op->src[0]->type).vec_dot_type;
-        case GGML_OP_ROPE_BACK:
-            return op->src[2] == NULL && (op->op_params[2] & 4) == 0;
-        case GGML_OP_IM2COL_BACK:
-            return op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32;
-        case GGML_OP_OUT_PROD:
-            return (op->src[0]->type == GGML_TYPE_F32 || ggml_is_quantized(op->src[0]->type)) && op->src[1]->type == GGML_TYPE_F32;
-        default:
-            return true;
+ggml_backend_buffer_type_t ggml_backend_dev_host_buffer_type(ggml_backend_dev_t device) {
+    if (device->iface.get_host_buffer_type == NULL) {
+        return NULL;
     }

-
+    return device->iface.get_host_buffer_type(device);
 }

-
-    return
-
-    GGML_UNUSED(dev);
+ggml_backend_buffer_t ggml_backend_dev_buffer_from_host_ptr(ggml_backend_dev_t device, void * ptr, size_t size, size_t max_tensor_size) {
+    return device->iface.buffer_from_host_ptr(device, ptr, size, max_tensor_size);
 }

-
-
-    /* .get_description = */ ggml_backend_cpu_device_get_description,
-    /* .get_memory = */ ggml_backend_cpu_device_get_memory,
-    /* .get_type = */ ggml_backend_cpu_device_get_type,
-    /* .get_props = */ ggml_backend_cpu_device_get_props,
-    /* .init_backend = */ ggml_backend_cpu_device_init,
-    /* .get_buffer_type = */ ggml_backend_cpu_device_get_buffer_type,
-    /* .get_host_buffer_type = */ NULL,
-    /* .buffer_from_host_ptr = */ ggml_backend_cpu_device_buffer_from_ptr,
-    /* .supports_op = */ ggml_backend_cpu_device_supports_op,
-    /* .supports_buft = */ ggml_backend_cpu_device_supports_buft,
-    /* .offload_op = */ NULL,
-    /* .event_new = */ NULL,
-    /* .event_free = */ NULL,
-    /* .event_synchronize = */ NULL,
-};
-
-////////////////////////
-
-static const char * ggml_backend_cpu_reg_get_name(ggml_backend_reg_t reg) {
-    return "CPU";
-
-    GGML_UNUSED(reg);
+bool ggml_backend_dev_supports_op(ggml_backend_dev_t device, const struct ggml_tensor * op) {
+    return device->iface.supports_op(device, op);
 }

-
-    return
-
-    GGML_UNUSED(reg);
+bool ggml_backend_dev_supports_buft(ggml_backend_dev_t device, ggml_backend_buffer_type_t buft) {
+    return device->iface.supports_buft(device, buft);
 }

-
-
+bool ggml_backend_dev_offload_op(ggml_backend_dev_t device, const struct ggml_tensor * op) {
+    if (device->iface.offload_op != NULL) {
+        return device->iface.offload_op(device, op);
+    }

-
-
-    /* .iface = */ ggml_backend_cpu_device_i,
-    /* .reg = */ reg,
-    /* .context = */ &ctx,
-};
+    return false;
+}

-
|
+
// Backend (reg)
|
|
1219
510
|
|
|
1220
|
-
|
|
1221
|
-
|
|
511
|
+
const char * ggml_backend_reg_name(ggml_backend_reg_t reg) {
|
|
512
|
+
return reg->iface.get_name(reg);
|
|
1222
513
|
}
|
|
1223
514
|
|
|
1224
|
-
|
|
1225
|
-
|
|
1226
|
-
|
|
1227
|
-
/* .get_device = */ ggml_backend_cpu_reg_get_device,
|
|
1228
|
-
/* .get_proc_address = */ NULL,
|
|
1229
|
-
};
|
|
515
|
+
size_t ggml_backend_reg_dev_count(ggml_backend_reg_t reg) {
|
|
516
|
+
return reg->iface.get_device_count(reg);
|
|
517
|
+
}
|
|
1230
518
|
|
|
1231
|
-
ggml_backend_reg_t
|
|
1232
|
-
|
|
1233
|
-
|
|
1234
|
-
/* .context = */ NULL,
|
|
1235
|
-
};
|
|
519
|
+
ggml_backend_dev_t ggml_backend_reg_dev_get(ggml_backend_reg_t reg, size_t index) {
|
|
520
|
+
return reg->iface.get_device(reg, index);
|
|
521
|
+
}
|
|
1236
522
|
|
|
1237
|
-
|
|
523
|
+
void * ggml_backend_reg_get_proc_address(ggml_backend_reg_t reg, const char * name) {
|
|
524
|
+
if (!reg->iface.get_proc_address) {
|
|
525
|
+
return NULL;
|
|
526
|
+
}
|
|
527
|
+
return reg->iface.get_proc_address(reg, name);
|
|
1238
528
|
}
|
|
1239
529
|
|
|
1240
530
|
// multi-buffer buffer
|
|
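The registry accessors are the counterpart to the device accessors: a registry names itself, reports how many devices it exposes, and hands them out by index, with optional extension entry points resolved by name. A minimal enumeration sketch; `reg` and the proc name are placeholders, not values from this diff:

```cpp
// Sketch only: walk a registry's devices and probe an optional entry point.
for (size_t i = 0; i < ggml_backend_reg_dev_count(reg); i++) {
    ggml_backend_dev_t dev = ggml_backend_reg_dev_get(reg, i);
    printf("device %zu: %s\n", i, ggml_backend_dev_name(dev));
}
// optional, may legitimately return NULL ("some_extension" is a made-up name):
void * fn = ggml_backend_reg_get_proc_address(reg, "some_extension");
```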
@@ -1244,12 +534,6 @@ struct ggml_backend_multi_buffer_context {
     size_t n_buffers;
 };

-static const char * ggml_backend_multi_buffer_get_name(ggml_backend_buffer_t buffer) {
-    ggml_backend_multi_buffer_context * ctx = (ggml_backend_multi_buffer_context *) buffer->context;
-
-    return ctx->buffers[0]->iface.get_name(ctx->buffers[0]);
-}
-
 static void ggml_backend_multi_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     ggml_backend_multi_buffer_context * ctx = (ggml_backend_multi_buffer_context *) buffer->context;
     for (size_t i = 0; i < ctx->n_buffers; i++) {
@@ -1268,7 +552,6 @@ static void ggml_backend_multi_buffer_clear(ggml_backend_buffer_t buffer, uint8_
 }

 static const struct ggml_backend_buffer_i ggml_backend_multi_buffer_i = {
-    /* .get_name = */ ggml_backend_multi_buffer_get_name,
     /* .free_buffer = */ ggml_backend_multi_buffer_free_buffer,
     /* .get_base = */ NULL,
     /* .init_tensor = */ NULL,
@@ -1297,7 +580,7 @@ ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer
 }

 bool ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer) {
-    return buffer->iface.get_name == ggml_backend_multi_buffer_get_name;
+    return buffer->iface.free_buffer == ggml_backend_multi_buffer_free_buffer;
 }

 void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage) {
@@ -1389,7 +672,7 @@ struct ggml_backend_sched {
     char * context_buffer;
     size_t context_buffer_size;

-
+    int debug;
 };

 #define hash_id(tensor) ggml_hash_find_or_insert(&sched->hash_set, tensor)
@@ -1408,7 +691,7 @@ static int ggml_backend_sched_backend_id(ggml_backend_sched_t sched, ggml_backen
 }

 static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, const struct ggml_tensor * tensor, const struct ggml_tensor * op) {
-    ggml_backend_buffer_t buffer = tensor->buffer;
+    ggml_backend_buffer_t buffer = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
     if (buffer == NULL) {
         return -1;
     }
@@ -1422,7 +705,7 @@ static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, co
 }

 #ifndef NDEBUG
-
+    GGML_LOG_DEBUG("%s: warning: no backend supports op %s with a weight with buffer type %s used in tensor %s, the weight will need to be copied\n",
         __func__, ggml_op_desc(tensor), ggml_backend_buffer_name(buffer), tensor->name);
 #endif

@@ -1441,8 +724,6 @@ static char causes[GGML_DEFAULT_GRAPH_SIZE*16 + GGML_SCHED_MAX_SPLITS_DEBUG*GGML

 // returns the backend that should be used for the node based on the current locations
 static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, struct ggml_tensor * tensor) {
-    // TODO: use supports_op to check if the backend supports the op
-
     // assign pre-allocated nodes to their backend
     int cur_backend_id = ggml_backend_sched_backend_from_buffer(sched, tensor, tensor);
     if (cur_backend_id != -1) {
@@ -1461,7 +742,8 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st

     if (tensor->buffer || (tensor->view_src && tensor->view_src->buffer)) {
         // since the tensor is pre-allocated, it cannot be moved to another backend
-
+        ggml_backend_buffer_t buffer = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
+        GGML_ABORT("pre-allocated tensor (%s) in a buffer (%s) that cannot run the operation (%s)", tensor->name, ggml_backend_buffer_name(buffer), ggml_op_name(tensor->op));
     }

     // graph input
@@ -1477,7 +759,9 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
         if (src == NULL) {
             continue;
         }
-
+        // skip ROPE since the rope freqs tensor is too small to choose a backend based on it
+        // not an ideal solution
+        if (tensor->op != GGML_OP_ROPE && src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
             int src_backend_id = ggml_backend_sched_backend_from_buffer(sched, src, tensor);
             // check if a backend with higher prio wants to offload the op
             if (src_backend_id == sched->n_backends - 1) {
@@ -1511,32 +795,34 @@ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, str
     for (int i = 0; i < graph->n_nodes; i++) {
         if (cur_split < sched->n_splits && i == sched->splits[cur_split].i_start) {
             ggml_backend_t split_backend = sched->backends[sched->splits[cur_split].backend_id];
-
+            GGML_LOG_DEBUG("\n## SPLIT #%d: %s # %d inputs: ", cur_split, ggml_backend_name(split_backend),
                 sched->splits[cur_split].n_inputs);
             for (int j = 0; j < sched->splits[cur_split].n_inputs; j++) {
-
+                GGML_LOG_DEBUG("[%s (%5.5s)] ", sched->splits[cur_split].inputs[j]->name,
                     fmt_size(ggml_nbytes(sched->splits[cur_split].inputs[j])));
             }
-
+            GGML_LOG_DEBUG("\n");
             cur_split++;
         }
         struct ggml_tensor * node = graph->nodes[i];
         if (ggml_is_view_op(node->op)) {
             continue;
         }
-
-
-
-
-
-
-
+        if (sched->debug > 1) {
+            ggml_backend_t tensor_backend = ggml_backend_sched_get_tensor_backend(sched, node);
+            GGML_LOG_DEBUG("node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s]:", i, ggml_op_name(node->op), node->name,
+                fmt_size(ggml_nbytes(node)), tensor_backend ? ggml_backend_name(tensor_backend) : "NULL", GET_CAUSE(node));
+            for (int j = 0; j < GGML_MAX_SRC; j++) {
+                struct ggml_tensor * src = node->src[j];
+                if (src == NULL) {
+                    continue;
+                }
+                ggml_backend_t src_backend = ggml_backend_sched_get_tensor_backend(sched, src);
+                GGML_LOG_DEBUG(" %20.20s (%5.5s) [%5.5s %8.8s]", src->name,
+                    fmt_size(ggml_nbytes(src)), src_backend ? ggml_backend_name(src_backend) : "NULL", GET_CAUSE(src));
             }
-
-        fprintf(stderr, " %20.20s (%5.5s) [%5.5s %8.8s]", src->name,
-            fmt_size(ggml_nbytes(src)), src_backend ? ggml_backend_name(src_backend) : "NULL", GET_CAUSE(src));
+            GGML_LOG_DEBUG("\n");
         }
-        fprintf(stderr, "\n");
     }
 }

@@ -1828,11 +1114,11 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
             if (src == NULL) {
                 continue;
             }
-            // check if a weight is on a different backend
+            // check if a weight is on a different and incompatible backend
             // by starting a new split, the memory of the previously offloaded weights can be reused
             if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
                 int src_backend_id = tensor_backend_id(src);
-                if (src_backend_id != cur_backend_id) {
+                if (src_backend_id != cur_backend_id && !ggml_backend_sched_buffer_supported(sched, src, cur_backend_id)) {
                     need_new_split = true;
                     break;
                 }
@@ -1844,7 +1130,6 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                 int src_backend_id = sched->hv_tensor_backend_ids[id];
                 bool supported = ggml_backend_sched_buffer_supported(sched, src, cur_backend_id);
                 if (src_backend_id != cur_backend_id && tensor_id_copy(id, cur_backend_id, 0) == NULL && !supported) {
-                    //printf("starting new split because of too many inputs: node %s, input %s\n", node->name, src->name);
                     need_new_split = true;
                     break;
                 }
@@ -2050,11 +1335,11 @@ static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
         // the re-allocation may cause the split inputs to be moved to a different address
         ggml_backend_sched_synchronize(sched);
 #ifndef NDEBUG
-
+        GGML_LOG_DEBUG("%s: failed to allocate graph, reserving (backend_ids_changed = %d)\n", __func__, backend_ids_changed);
 #endif
         ggml_gallocr_reserve_n(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids);
         if (!ggml_gallocr_alloc_graph(sched->galloc, &sched->graph)) {
-
+            GGML_LOG_ERROR("%s: failed to allocate graph\n", __func__);
             return false;
         }
     }
@@ -2165,11 +1450,12 @@ ggml_backend_sched_t ggml_backend_sched_new(
         bool parallel) {
     GGML_ASSERT(n_backends > 0);
     GGML_ASSERT(n_backends <= GGML_SCHED_MAX_BACKENDS);
-    GGML_ASSERT(
+    GGML_ASSERT(ggml_backend_dev_type(ggml_backend_get_device(backends[n_backends - 1])) == GGML_BACKEND_DEVICE_TYPE_CPU);

     struct ggml_backend_sched * sched = (ggml_backend_sched *) calloc(1, sizeof(struct ggml_backend_sched));

-
+    const char * GGML_SCHED_DEBUG = getenv("GGML_SCHED_DEBUG");
+    sched->debug = GGML_SCHED_DEBUG ? atoi(GGML_SCHED_DEBUG) : 0;
     sched->n_backends = n_backends;
     sched->n_copies = parallel ? GGML_SCHED_MAX_COPIES : 1;

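The scheduler debug flag thereby becomes an integer level parsed from the `GGML_SCHED_DEBUG` environment variable: level 1 prints split assignments, and level 2 additionally prints per-node placement (the `sched->debug > 1` branch shown earlier). A sketch of enabling it programmatically; the arguments to `ggml_backend_sched_new` are illustrative:

```cpp
// Sketch only: equivalent to launching the process with GGML_SCHED_DEBUG=2.
// setenv is POSIX; on Windows use _putenv_s instead.
setenv("GGML_SCHED_DEBUG", "2", /*overwrite=*/1);
ggml_backend_sched_t sched = ggml_backend_sched_new(
    backends, /*bufts=*/nullptr, n_backends, /*graph_size=*/2048, /*parallel=*/false);
```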
@@ -2197,6 +1483,7 @@ ggml_backend_sched_t ggml_backend_sched_new(
         sched->backends[b] = backends[b];
         sched->bufts[b] = bufts ? bufts[b] : ggml_backend_get_default_buffer_type(backends[b]);
         GGML_ASSERT(ggml_backend_supports_buft(backends[b], sched->bufts[b]));
+
         if (sched->n_copies > 1) {
             for (int c = 0; c < sched->n_copies; c++) {
                 sched->events[b][c] = ggml_backend_event_new(backends[b]->device);
@@ -2252,12 +1539,13 @@ bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph *

     ggml_backend_sched_split_graph(sched, measure_graph);

+    ggml_backend_sched_synchronize(sched);
+
     if (!ggml_gallocr_reserve_n(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids)) {
         return false;
     }

     ggml_backend_sched_reset(sched);
-    ggml_backend_sched_synchronize(sched);

     return true;
 }
@@ -2448,7 +1736,7 @@ struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, s
     struct ggml_context * ctx_unallocated = ggml_init(params);

     if (ctx_allocated == NULL || ctx_unallocated == NULL) {
-
+        GGML_LOG_ERROR("%s: failed to allocate context for graph copy\n", __func__);
         ggml_hash_set_free(&hash_set);
         free(node_copies);
         free(node_init);
@@ -2471,7 +1759,7 @@ struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, s
     // allocate nodes
     ggml_backend_buffer_t buffer = ggml_backend_alloc_ctx_tensors(ctx_allocated, backend);
     if (buffer == NULL) {
-
+        GGML_LOG_ERROR("%s: failed to allocate buffer for graph copy\n", __func__);
         ggml_hash_set_free(&hash_set);
         free(node_copies);
         free(node_init);
@@ -2558,3 +1846,154 @@ bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t

     return true;
 }
+
+// CPU backend - buffer
+
+static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
+    uintptr_t data = (uintptr_t)buffer->context;
+
+    // align the buffer
+    if (data % TENSOR_ALIGNMENT != 0) {
+        data = GGML_PAD(data, TENSOR_ALIGNMENT);
+    }
+
+    return (void *)data;
+}
+
+static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+    ggml_aligned_free(buffer->context, buffer->size);
+}
+
+static void ggml_backend_cpu_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
+    memset((char *)tensor->data + offset, value, size);
+
+    GGML_UNUSED(buffer);
+}
+
+static void ggml_backend_cpu_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+    memcpy((char *)tensor->data + offset, data, size);
+
+    GGML_UNUSED(buffer);
+}
+
+static void ggml_backend_cpu_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+    memcpy(data, (const char *)tensor->data + offset, size);
+
+    GGML_UNUSED(buffer);
+}
+
+static bool ggml_backend_cpu_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) {
+    if (ggml_backend_buffer_is_host(src->buffer)) {
+        memcpy(dst->data, src->data, ggml_nbytes(src));
+        return true;
+    }
+    return false;
+
+    GGML_UNUSED(buffer);
+}
+
+static void ggml_backend_cpu_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
+    memset(buffer->context, value, buffer->size);
+}
+
+static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_i = {
+    /* .free_buffer = */ ggml_backend_cpu_buffer_free_buffer,
+    /* .get_base = */ ggml_backend_cpu_buffer_get_base,
+    /* .init_tensor = */ NULL, // no initialization required
+    /* .memset_tensor = */ ggml_backend_cpu_buffer_memset_tensor,
+    /* .set_tensor = */ ggml_backend_cpu_buffer_set_tensor,
+    /* .get_tensor = */ ggml_backend_cpu_buffer_get_tensor,
+    /* .cpy_tensor = */ ggml_backend_cpu_buffer_cpy_tensor,
+    /* .clear = */ ggml_backend_cpu_buffer_clear,
+    /* .reset = */ NULL,
+};
+
+static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_from_ptr_i = {
+    /* .free_buffer = */ NULL, // ptr is not owned by the buffer, so it does not need to be freed
+    /* .get_base = */ ggml_backend_cpu_buffer_get_base,
+    /* .init_tensor = */ NULL, // no initialization required
+    /* .memset_tensor = */ ggml_backend_cpu_buffer_memset_tensor,
+    /* .set_tensor = */ ggml_backend_cpu_buffer_set_tensor,
+    /* .get_tensor = */ ggml_backend_cpu_buffer_get_tensor,
+    /* .cpy_tensor = */ ggml_backend_cpu_buffer_cpy_tensor,
+    /* .clear = */ ggml_backend_cpu_buffer_clear,
+    /* .reset = */ NULL,
+};
+
+// CPU backend buffer type
+
+// this buffer type is defined here to make it available to all backends
+
+static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
+    return "CPU";
+
+    GGML_UNUSED(buft);
+}
+
+static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+    void * data = ggml_aligned_malloc(size);
+
+    if (data == NULL) {
+        GGML_LOG_ERROR("%s: failed to allocate buffer of size %zu\n", __func__, size);
+        return NULL;
+    }
+
+    return ggml_backend_buffer_init(buft, ggml_backend_cpu_buffer_i, data, size);
+}
+
+static size_t ggml_backend_cpu_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
+    return TENSOR_ALIGNMENT;
+
+    GGML_UNUSED(buft);
+}
+
+static bool ggml_backend_cpu_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
+    return true;
+
+    GGML_UNUSED(buft);
+}
+
+ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
+    static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type = {
+        /* .iface = */ {
+            /* .get_name = */ ggml_backend_cpu_buffer_type_get_name,
+            /* .alloc_buffer = */ ggml_backend_cpu_buffer_type_alloc_buffer,
+            /* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
+            /* .get_max_size = */ NULL, // defaults to SIZE_MAX
+            /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
+            /* .is_host = */ ggml_backend_cpu_buffer_type_is_host,
+        },
+        /* .device = */ NULL, // FIXME ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
+        /* .context = */ NULL,
+    };
+
+    return &ggml_backend_cpu_buffer_type;
+}
+
+static const char * ggml_backend_cpu_buffer_from_ptr_type_get_name(ggml_backend_buffer_type_t buft) {
+    return "CPU_Mapped";
+
+    GGML_UNUSED(buft);
+}
+
+static ggml_backend_buffer_type_t ggml_backend_cpu_buffer_from_ptr_type(void) {
+    static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type = {
+        /* .iface = */ {
+            /* .get_name = */ ggml_backend_cpu_buffer_from_ptr_type_get_name,
+            /* .alloc_buffer = */ ggml_backend_cpu_buffer_type_alloc_buffer,
+            /* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
+            /* .get_max_size = */ NULL, // defaults to SIZE_MAX
+            /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
+            /* .is_host = */ ggml_backend_cpu_buffer_type_is_host,
+        },
+        /* .device = */ NULL, // FIXME ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
+        /* .context = */ NULL,
+    };
+
+    return &ggml_backend_cpu_buffer_type;
+}
+
+ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) {
+    GGML_ASSERT((uintptr_t)ptr % TENSOR_ALIGNMENT == 0 && "buffer pointer must be aligned");
+    return ggml_backend_buffer_init(ggml_backend_cpu_buffer_from_ptr_type(), ggml_backend_cpu_buffer_from_ptr_i, ptr, size);
+}
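The appended block relocates the CPU buffer and buffer-type implementations into ggml-backend.cpp so they are available to all backends, and adds the non-owning `CPU_Mapped` type for wrapping caller-provided memory. A short sketch of using it; the pool size and `alignas(64)` are illustrative (any alignment satisfying TENSOR_ALIGNMENT works):

```cpp
// Sketch only: wrap an existing, suitably aligned allocation in a ggml buffer.
// The from-ptr buffer does not take ownership (.free_buffer is NULL above).
alignas(64) static uint8_t pool[1 << 20];
ggml_backend_buffer_t buf = ggml_backend_cpu_buffer_from_ptr(pool, sizeof(pool));
printf("%s: %zu bytes\n", ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf));
ggml_backend_buffer_free(buf);  // frees the wrapper, not the pool itself
```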