@fugood/llama.node 0.2.3 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +6 -3
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +8 -1
- package/package.json +3 -3
- package/patches/llama.patch +12 -12
- package/src/DetokenizeWorker.cpp +1 -1
- package/src/LlamaContext.cpp +33 -1
- package/src/LlamaContext.h +1 -0
- package/src/llama.cpp/.github/workflows/bench.yml +310 -0
- package/src/llama.cpp/.github/workflows/build.yml +1315 -0
- package/src/llama.cpp/.github/workflows/close-issue.yml +23 -0
- package/src/llama.cpp/.github/workflows/docker.yml +116 -0
- package/src/llama.cpp/.github/workflows/editorconfig.yml +27 -0
- package/src/llama.cpp/.github/workflows/gguf-publish.yml +44 -0
- package/src/llama.cpp/.github/workflows/labeler.yml +17 -0
- package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +65 -0
- package/src/llama.cpp/.github/workflows/nix-ci.yml +72 -0
- package/src/llama.cpp/.github/workflows/nix-flake-update.yml +22 -0
- package/src/llama.cpp/.github/workflows/nix-publish-flake.yml +36 -0
- package/src/llama.cpp/.github/workflows/python-check-requirements.yml +35 -0
- package/src/llama.cpp/.github/workflows/python-lint.yml +23 -0
- package/src/llama.cpp/.github/workflows/python-type-check.yml +38 -0
- package/src/llama.cpp/.github/workflows/server.yml +183 -0
- package/src/llama.cpp/CMakeLists.txt +91 -1245
- package/src/llama.cpp/cmake/arm64-windows-llvm.cmake +1 -1
- package/src/llama.cpp/cmake/build-info.cmake +58 -0
- package/src/llama.cpp/cmake/git-vars.cmake +22 -0
- package/src/llama.cpp/common/CMakeLists.txt +4 -3
- package/src/llama.cpp/common/build-info.cpp.in +4 -0
- package/src/llama.cpp/common/common.cpp +1116 -877
- package/src/llama.cpp/common/common.h +191 -77
- package/src/llama.cpp/common/grammar-parser.cpp +118 -31
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +346 -65
- package/src/llama.cpp/common/log.h +1 -1
- package/src/llama.cpp/common/ngram-cache.h +10 -3
- package/src/llama.cpp/common/sampling.cpp +19 -10
- package/src/llama.cpp/docs/build.md +353 -0
- package/src/llama.cpp/examples/CMakeLists.txt +22 -22
- package/src/llama.cpp/examples/baby-llama/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +6 -6
- package/src/llama.cpp/examples/batched/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/batched/batched.cpp +52 -55
- package/src/llama.cpp/examples/batched-bench/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +20 -72
- package/src/llama.cpp/examples/benchmark/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/chat-13B.bat +57 -0
- package/src/llama.cpp/examples/convert-llama2c-to-ggml/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/{finetune → cvector-generator}/CMakeLists.txt +2 -2
- package/src/llama.cpp/examples/cvector-generator/completions.txt +582 -0
- package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +503 -0
- package/src/llama.cpp/examples/cvector-generator/mean.hpp +48 -0
- package/src/llama.cpp/examples/cvector-generator/negative.txt +4 -0
- package/src/llama.cpp/examples/cvector-generator/pca.hpp +325 -0
- package/src/llama.cpp/examples/cvector-generator/positive.txt +4 -0
- package/src/llama.cpp/examples/deprecation-warning/deprecation-warning.cpp +35 -0
- package/src/llama.cpp/examples/embedding/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/embedding/embedding.cpp +94 -46
- package/src/llama.cpp/examples/eval-callback/CMakeLists.txt +2 -2
- package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +4 -6
- package/src/llama.cpp/examples/export-lora/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +344 -386
- package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +2 -2
- package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +30 -25
- package/src/llama.cpp/examples/gguf/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gguf/gguf.cpp +5 -0
- package/src/llama.cpp/examples/gguf-hash/CMakeLists.txt +15 -0
- package/src/llama.cpp/examples/gguf-hash/deps/rotate-bits/rotate-bits.h +46 -0
- package/src/llama.cpp/examples/gguf-hash/deps/sha1/sha1.c +295 -0
- package/src/llama.cpp/examples/gguf-hash/deps/sha1/sha1.h +52 -0
- package/src/llama.cpp/examples/gguf-hash/deps/sha256/sha256.c +221 -0
- package/src/llama.cpp/examples/gguf-hash/deps/sha256/sha256.h +24 -0
- package/src/llama.cpp/examples/gguf-hash/deps/xxhash/xxhash.c +42 -0
- package/src/llama.cpp/examples/gguf-hash/deps/xxhash/xxhash.h +7093 -0
- package/src/llama.cpp/examples/gguf-hash/gguf-hash.cpp +693 -0
- package/src/llama.cpp/examples/gguf-split/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +3 -3
- package/src/llama.cpp/examples/gritlm/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +6 -2
- package/src/llama.cpp/examples/imatrix/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +137 -176
- package/src/llama.cpp/examples/infill/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/infill/infill.cpp +38 -153
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +175 -94
- package/src/llama.cpp/examples/llama.android/app/build.gradle.kts +65 -0
- package/src/llama.cpp/examples/llama.android/build.gradle.kts +6 -0
- package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +68 -0
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/CMakeLists.txt +11 -7
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +2 -2
- package/src/llama.cpp/examples/llama.android/settings.gradle.kts +18 -0
- package/src/llama.cpp/examples/llava/CMakeLists.txt +6 -5
- package/src/llama.cpp/examples/llava/android/build_64.sh +8 -0
- package/src/llama.cpp/examples/llava/clip.cpp +23 -14
- package/src/llama.cpp/examples/llava/llava-cli.cpp +8 -6
- package/src/llama.cpp/examples/llava/requirements.txt +3 -2
- package/src/llama.cpp/examples/lookahead/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +2 -1
- package/src/llama.cpp/examples/lookup/CMakeLists.txt +4 -4
- package/src/llama.cpp/examples/lookup/lookup-create.cpp +2 -0
- package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
- package/src/llama.cpp/examples/lookup/lookup-stats.cpp +2 -2
- package/src/llama.cpp/examples/lookup/lookup.cpp +1 -1
- package/src/llama.cpp/examples/main/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/main/main.cpp +98 -75
- package/src/llama.cpp/examples/main-cmake-pkg/CMakeLists.txt +4 -5
- package/src/llama.cpp/examples/parallel/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/parallel/parallel.cpp +2 -1
- package/src/llama.cpp/examples/passkey/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/passkey/passkey.cpp +23 -43
- package/src/llama.cpp/examples/perplexity/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +13 -10
- package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/quantize/quantize.cpp +37 -34
- package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +1 -1
- package/src/llama.cpp/examples/retrieval/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/retrieval/retrieval.cpp +26 -77
- package/src/llama.cpp/examples/save-load-state/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +14 -7
- package/src/llama.cpp/examples/server/CMakeLists.txt +26 -2
- package/src/llama.cpp/examples/server/server.cpp +274 -671
- package/src/llama.cpp/examples/server/tests/requirements.txt +2 -2
- package/src/llama.cpp/examples/server/utils.hpp +28 -29
- package/src/llama.cpp/examples/simple/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/simple/simple.cpp +21 -29
- package/src/llama.cpp/examples/speculative/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/speculative/speculative.cpp +2 -1
- package/src/llama.cpp/examples/sycl/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/sycl/build.sh +23 -0
- package/src/llama.cpp/examples/sycl/run-llama2.sh +36 -0
- package/src/llama.cpp/examples/sycl/win-build-sycl.bat +33 -0
- package/src/llama.cpp/examples/sycl/win-run-llama2.bat +9 -0
- package/src/llama.cpp/examples/tokenize/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/tokenize/tokenize.cpp +16 -2
- package/src/llama.cpp/ggml/CMakeLists.txt +253 -0
- package/src/llama.cpp/{cmake → ggml/cmake}/FindSIMD.cmake +6 -6
- package/src/llama.cpp/{ggml-backend.h → ggml/include/ggml-backend.h} +22 -17
- package/src/llama.cpp/ggml/include/ggml-blas.h +23 -0
- package/src/llama.cpp/ggml/include/ggml-cann.h +125 -0
- package/src/llama.cpp/{ggml-cuda.h → ggml/include/ggml-cuda.h} +3 -0
- package/src/llama.cpp/{ggml-metal.h → ggml/include/ggml-metal.h} +1 -2
- package/src/llama.cpp/{ggml-sycl.h → ggml/include/ggml-sycl.h} +3 -10
- package/src/llama.cpp/{ggml.h → ggml/include/ggml.h} +80 -85
- package/src/llama.cpp/ggml/src/CMakeLists.txt +1329 -0
- package/src/llama.cpp/ggml/src/ggml-aarch64.c +2193 -0
- package/src/llama.cpp/ggml/src/ggml-aarch64.h +39 -0
- package/src/llama.cpp/{ggml-alloc.c → ggml/src/ggml-alloc.c} +100 -49
- package/src/llama.cpp/{ggml-backend-impl.h → ggml/src/ggml-backend-impl.h} +20 -8
- package/src/llama.cpp/{ggml-backend.c → ggml/src/ggml-backend.c} +307 -167
- package/src/llama.cpp/ggml/src/ggml-blas.cpp +367 -0
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +198 -0
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +230 -0
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +2944 -0
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +592 -0
- package/src/llama.cpp/ggml/src/ggml-cann/common.h +282 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +32 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +17 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +223 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +186 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +180 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +193 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +191 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +208 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +206 -0
- package/src/llama.cpp/ggml/src/ggml-cann.cpp +2023 -0
- package/src/llama.cpp/{ggml-common.h → ggml/src/ggml-common.h} +41 -7
- package/src/llama.cpp/{ggml-impl.h → ggml/src/ggml-impl.h} +113 -9
- package/src/llama.cpp/{ggml-kompute.cpp → ggml/src/ggml-kompute.cpp} +33 -18
- package/src/llama.cpp/{ggml-quants.c → ggml/src/ggml-quants.c} +1460 -940
- package/src/llama.cpp/{ggml-quants.h → ggml/src/ggml-quants.h} +19 -20
- package/src/llama.cpp/{ggml-rpc.cpp → ggml/src/ggml-rpc.cpp} +95 -72
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +27 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +53 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +355 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +195 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/concat.hpp +21 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +547 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +27 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +698 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +1023 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.hpp +27 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +3011 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +3031 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/mmq.hpp +33 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +1027 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.hpp +27 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +374 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +35 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +66 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +275 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.hpp +22 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +251 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/softmax.hpp +24 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +1140 -0
- package/src/llama.cpp/ggml/src/ggml-sycl.cpp +5314 -0
- package/src/llama.cpp/{ggml-vulkan.cpp → ggml/src/ggml-vulkan.cpp} +1781 -1868
- package/src/llama.cpp/{ggml.c → ggml/src/ggml.c} +1245 -2087
- package/src/llama.cpp/{sgemm.cpp → ggml/src/llamafile/sgemm.cpp} +21 -24
- package/src/llama.cpp/{sgemm.h → ggml/src/llamafile/sgemm.h} +1 -1
- package/src/llama.cpp/ggml/src/vulkan-shaders/CMakeLists.txt +5 -0
- package/src/llama.cpp/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp +552 -0
- package/src/llama.cpp/{llama.h → include/llama.h} +175 -100
- package/src/llama.cpp/models/.editorconfig +1 -0
- package/src/llama.cpp/models/ggml-vocab-aquila.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-baichuan.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-command-r.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-command-r.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-command-r.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-falcon.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-falcon.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-falcon.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-gpt-neox.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-mpt.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-mpt.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-mpt.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-phi-3.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-phi-3.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-phi-3.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-qwen2.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-qwen2.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-qwen2.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-refact.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-refact.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-refact.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-starcoder.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-starcoder.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-starcoder.gguf.out +46 -0
- package/src/llama.cpp/pocs/vdot/CMakeLists.txt +2 -2
- package/src/llama.cpp/requirements/requirements-all.txt +12 -0
- package/src/llama.cpp/requirements/requirements-compare-llama-bench.txt +2 -0
- package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf.txt +3 -0
- package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf_update.txt +3 -0
- package/src/llama.cpp/requirements/{requirements-convert.txt → requirements-convert_legacy_llama.txt} +1 -1
- package/src/llama.cpp/requirements/requirements-convert_llama_ggml_to_gguf.txt +1 -0
- package/src/llama.cpp/requirements/requirements-convert_lora_to_gguf.txt +2 -0
- package/src/llama.cpp/requirements/requirements-pydantic.txt +3 -0
- package/src/llama.cpp/requirements/requirements-test-tokenizer-random.txt +1 -0
- package/src/llama.cpp/requirements.txt +5 -4
- package/src/llama.cpp/scripts/build-info.sh +30 -0
- package/src/llama.cpp/scripts/install-oneapi.bat +19 -0
- package/src/llama.cpp/src/CMakeLists.txt +33 -0
- package/src/llama.cpp/src/llama-grammar.cpp +539 -0
- package/src/llama.cpp/src/llama-grammar.h +39 -0
- package/src/llama.cpp/src/llama-impl.h +26 -0
- package/src/llama.cpp/src/llama-sampling.cpp +635 -0
- package/src/llama.cpp/src/llama-sampling.h +56 -0
- package/src/llama.cpp/src/llama-vocab.cpp +1721 -0
- package/src/llama.cpp/src/llama-vocab.h +130 -0
- package/src/llama.cpp/{llama.cpp → src/llama.cpp} +5979 -5260
- package/src/llama.cpp/{unicode-data.cpp → src/unicode-data.cpp} +851 -802
- package/src/llama.cpp/{unicode.cpp → src/unicode.cpp} +52 -30
- package/src/llama.cpp/{unicode.h → src/unicode.h} +5 -1
- package/src/llama.cpp/tests/CMakeLists.txt +19 -20
- package/src/llama.cpp/tests/test-backend-ops.cpp +245 -67
- package/src/llama.cpp/tests/test-chat-template.cpp +57 -3
- package/src/llama.cpp/tests/test-double-float.cpp +2 -2
- package/src/llama.cpp/tests/test-grad0.cpp +2 -2
- package/src/llama.cpp/tests/test-grammar-integration.cpp +978 -31
- package/src/llama.cpp/tests/test-grammar-parser.cpp +423 -158
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +508 -135
- package/src/llama.cpp/tests/test-llama-grammar.cpp +15 -9
- package/src/llama.cpp/tests/test-quantize-fns.cpp +1 -1
- package/src/llama.cpp/tests/test-quantize-perf.cpp +1 -1
- package/src/llama.cpp/tests/test-rope.cpp +3 -4
- package/src/llama.cpp/tests/test-sampling.cpp +5 -5
- package/src/llama.cpp/tests/test-tokenizer-0.cpp +6 -6
- package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +20 -15
- package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +22 -11
- package/bin/darwin/arm64/default.metallib +0 -0
- package/bin/darwin/x64/default.metallib +0 -0
- package/src/llama.cpp/examples/beam-search/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/beam-search/beam-search.cpp +0 -188
- package/src/llama.cpp/examples/finetune/finetune.cpp +0 -1862
- package/src/llama.cpp/examples/llama.android/llama/CMakeLists.txt +0 -55
- package/src/llama.cpp/examples/train-text-from-scratch/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/train-text-from-scratch/train-text-from-scratch.cpp +0 -1253
- package/src/llama.cpp/ggml-opencl.cpp +0 -2305
- package/src/llama.cpp/ggml-opencl.h +0 -36
- package/src/llama.cpp/ggml-sycl.cpp +0 -17340
- package/src/llama.cpp/ggml-vulkan-shaders.hpp +0 -81211
- package/src/llama.cpp/requirements/requirements-convert-hf-to-gguf-update.txt +0 -2
- package/src/llama.cpp/requirements/requirements-convert-hf-to-gguf.txt +0 -2
- package/src/llama.cpp/requirements/requirements-convert-llama-ggml-to-gguf.txt +0 -1
- package/src/llama.cpp/scripts/gen-build-info-cpp.cmake +0 -24
- /package/src/llama.cpp/{ggml-alloc.h → ggml/include/ggml-alloc.h} +0 -0
- /package/src/llama.cpp/{ggml-kompute.h → ggml/include/ggml-kompute.h} +0 -0
- /package/src/llama.cpp/{ggml-rpc.h → ggml/include/ggml-rpc.h} +0 -0
- /package/src/llama.cpp/{ggml-vulkan.h → ggml/include/ggml-vulkan.h} +0 -0
- /package/src/llama.cpp/{unicode-data.h → src/unicode-data.h} +0 -0
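Most of the churn in this release comes from the vendored llama.cpp tree being reorganized: the ggml sources moved under `ggml/include` and `ggml/src`, and `llama.cpp` itself moved under `src/`. The rendered diff below covers `ggml/src/ggml-backend.c`, where the headline API change is that buffer compatibility is now reported by the backend (a `supports_buft` callback queried via `ggml_backend_supports_buft()`) instead of by the buffer type (the removed `supports_backend` callback). Below is a minimal sketch of the new callback for a hypothetical host-memory backend; `my_backend_supports_buft` is a placeholder name, not part of the package:

```c
#include <stdbool.h>

#include "ggml.h"         // for GGML_UNUSED
#include "ggml-backend.h" // for ggml_backend_buft_is_host

// Sketch of the new per-backend buffer-type check. It mirrors the
// ggml_backend_cpu_supports_buft implementation added in the diff below:
// a backend that computes in host memory accepts any host buffer type.
static bool my_backend_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
    return ggml_backend_buft_is_host(buft);

    GGML_UNUSED(backend); // after the return, ggml style; silences the unused-parameter warning
}
```

The scheduler queries this callback through `ggml_backend_supports_buft()` to decide whether a tensor allocated in a given buffer type can be used by a backend in place or has to be copied first.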
|
@@ -44,10 +44,6 @@ GGML_CALL size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buf
|
|
|
44
44
|
return ggml_nbytes(tensor);
|
|
45
45
|
}
|
|
46
46
|
|
|
47
|
-
bool ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
|
|
48
|
-
return buft->iface.supports_backend(buft, backend);
|
|
49
|
-
}
|
|
50
|
-
|
|
51
47
|
bool ggml_backend_buft_is_host(ggml_backend_buffer_type_t buft) {
|
|
52
48
|
if (buft->iface.is_host) {
|
|
53
49
|
return buft->iface.is_host(buft);
|
|
@@ -138,6 +134,10 @@ void ggml_backend_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backe
|
|
|
138
134
|
}
|
|
139
135
|
}
|
|
140
136
|
|
|
137
|
+
enum ggml_backend_buffer_usage ggml_backend_buffer_get_usage(ggml_backend_buffer_t buffer) {
|
|
138
|
+
return buffer->usage;
|
|
139
|
+
}
|
|
140
|
+
|
|
141
141
|
ggml_backend_buffer_type_t ggml_backend_buffer_get_type(ggml_backend_buffer_t buffer) {
|
|
142
142
|
return buffer->buft;
|
|
143
143
|
}
|
|
@@ -151,7 +151,7 @@ void ggml_backend_buffer_reset(ggml_backend_buffer_t buffer) {
|
|
|
151
151
|
bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst) {
|
|
152
152
|
ggml_backend_buffer_t dst_buf = dst->view_src ? dst->view_src->buffer : dst->buffer;
|
|
153
153
|
if (dst_buf->iface.cpy_tensor) {
|
|
154
|
-
return
|
|
154
|
+
return dst_buf->iface.cpy_tensor(dst_buf, src, dst);
|
|
155
155
|
}
|
|
156
156
|
return false;
|
|
157
157
|
}
|
|
@@ -286,6 +286,10 @@ bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor *
|
|
|
286
286
|
return backend->iface.supports_op(backend, op);
|
|
287
287
|
}
|
|
288
288
|
|
|
289
|
+
bool ggml_backend_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
|
|
290
|
+
return backend->iface.supports_buft(backend, buft);
|
|
291
|
+
}
|
|
292
|
+
|
|
289
293
|
bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op) {
|
|
290
294
|
if (backend->iface.offload_op != NULL) {
|
|
291
295
|
return backend->iface.offload_op(backend, op);
|
|
@@ -394,7 +398,7 @@ void ggml_backend_event_wait(ggml_backend_t backend, ggml_backend_event_t event)
|
|
|
394
398
|
|
|
395
399
|
// backend registry
|
|
396
400
|
|
|
397
|
-
#define GGML_REG_MAX_BACKENDS
|
|
401
|
+
#define GGML_REG_MAX_BACKENDS 64
|
|
398
402
|
|
|
399
403
|
struct ggml_backend_reg {
|
|
400
404
|
char name[128];
|
|
@@ -445,6 +449,11 @@ GGML_CALL static void ggml_backend_registry_init(void) {
|
|
|
445
449
|
extern GGML_CALL void ggml_backend_kompute_reg_devices(void);
|
|
446
450
|
ggml_backend_kompute_reg_devices();
|
|
447
451
|
#endif
|
|
452
|
+
|
|
453
|
+
#ifdef GGML_USE_CANN
|
|
454
|
+
extern GGML_CALL int ggml_backend_cann_reg_devices(void);
|
|
455
|
+
ggml_backend_cann_reg_devices();
|
|
456
|
+
#endif
|
|
448
457
|
}
|
|
449
458
|
|
|
450
459
|
GGML_CALL void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data) {
|
|
@@ -639,12 +648,6 @@ GGML_CALL static size_t ggml_backend_cpu_buffer_type_get_alignment(ggml_backend_
|
|
|
639
648
|
GGML_UNUSED(buft);
|
|
640
649
|
}
|
|
641
650
|
|
|
642
|
-
GGML_CALL static bool ggml_backend_cpu_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
|
|
643
|
-
return ggml_backend_is_cpu(backend);
|
|
644
|
-
|
|
645
|
-
GGML_UNUSED(buft);
|
|
646
|
-
}
|
|
647
|
-
|
|
648
651
|
GGML_CALL static bool ggml_backend_cpu_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
|
|
649
652
|
return true;
|
|
650
653
|
|
|
@@ -659,7 +662,6 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
|
|
|
659
662
|
/* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
|
|
660
663
|
/* .get_max_size = */ NULL, // defaults to SIZE_MAX
|
|
661
664
|
/* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
|
|
662
|
-
/* .supports_backend = */ ggml_backend_cpu_buffer_type_supports_backend,
|
|
663
665
|
/* .is_host = */ ggml_backend_cpu_buffer_type_is_host,
|
|
664
666
|
},
|
|
665
667
|
/* .context = */ NULL,
|
|
@@ -715,7 +717,6 @@ ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void) {
|
|
|
715
717
|
/* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
|
|
716
718
|
/* .get_max_size = */ NULL, // defaults to SIZE_MAX
|
|
717
719
|
/* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
|
|
718
|
-
/* .supports_backend = */ ggml_backend_cpu_buffer_type_supports_backend,
|
|
719
720
|
/* .is_host = */ ggml_backend_cpu_buffer_type_is_host,
|
|
720
721
|
},
|
|
721
722
|
/* .context = */ NULL,
|
|
@@ -836,6 +837,12 @@ GGML_CALL static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const
|
|
|
836
837
|
GGML_UNUSED(backend);
|
|
837
838
|
}
|
|
838
839
|
|
|
840
|
+
GGML_CALL static bool ggml_backend_cpu_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
|
|
841
|
+
return ggml_backend_buft_is_host(buft);
|
|
842
|
+
|
|
843
|
+
GGML_UNUSED(backend);
|
|
844
|
+
}
|
|
845
|
+
|
|
839
846
|
static struct ggml_backend_i cpu_backend_i = {
|
|
840
847
|
/* .get_name = */ ggml_backend_cpu_name,
|
|
841
848
|
/* .free = */ ggml_backend_cpu_free,
|
|
@@ -846,9 +853,11 @@ static struct ggml_backend_i cpu_backend_i = {
|
|
|
846
853
|
/* .synchronize = */ NULL,
|
|
847
854
|
/* .graph_plan_create = */ ggml_backend_cpu_graph_plan_create,
|
|
848
855
|
/* .graph_plan_free = */ ggml_backend_cpu_graph_plan_free,
|
|
856
|
+
/* .graph_plan_update = */ NULL,
|
|
849
857
|
/* .graph_plan_compute = */ ggml_backend_cpu_graph_plan_compute,
|
|
850
858
|
/* .graph_compute = */ ggml_backend_cpu_graph_compute,
|
|
851
859
|
/* .supports_op = */ ggml_backend_cpu_supports_op,
|
|
860
|
+
/* .supports_buft = */ ggml_backend_cpu_supports_buft,
|
|
852
861
|
/* .offload_op = */ NULL,
|
|
853
862
|
/* .event_new = */ NULL,
|
|
854
863
|
/* .event_free = */ NULL,
|
|
@@ -1046,17 +1055,19 @@ struct ggml_backend_sched {
|
|
|
1046
1055
|
ggml_backend_buffer_type_t bufts[GGML_SCHED_MAX_BACKENDS];
|
|
1047
1056
|
ggml_gallocr_t galloc;
|
|
1048
1057
|
|
|
1049
|
-
// hash
|
|
1050
|
-
struct ggml_hash_set
|
|
1051
|
-
//
|
|
1052
|
-
|
|
1053
|
-
struct ggml_tensor * (* tensor_copies)[GGML_SCHED_MAX_BACKENDS][GGML_SCHED_MAX_COPIES];
|
|
1058
|
+
// hash map of the nodes in the graph
|
|
1059
|
+
struct ggml_hash_set hash_set;
|
|
1060
|
+
int * hv_tensor_backend_ids; // [hash_set.size]
|
|
1061
|
+
struct ggml_tensor ** hv_tensor_copies; // [hash_set.size][n_backends][n_copies]
|
|
1054
1062
|
|
|
1055
1063
|
int * node_backend_ids; // [graph_size]
|
|
1056
1064
|
int * leaf_backend_ids; // [graph_size]
|
|
1057
1065
|
|
|
1066
|
+
int * prev_node_backend_ids; // [graph_size]
|
|
1067
|
+
int * prev_leaf_backend_ids; // [graph_size]
|
|
1068
|
+
|
|
1058
1069
|
// copy of the graph with modified inputs
|
|
1059
|
-
struct ggml_cgraph
|
|
1070
|
+
struct ggml_cgraph graph;
|
|
1060
1071
|
|
|
1061
1072
|
// graph splits
|
|
1062
1073
|
struct ggml_backend_sched_split * splits;
|
|
@@ -1075,17 +1086,16 @@ struct ggml_backend_sched {
|
|
|
1075
1086
|
ggml_backend_sched_eval_callback callback_eval;
|
|
1076
1087
|
void * callback_eval_user_data;
|
|
1077
1088
|
|
|
1078
|
-
|
|
1079
|
-
|
|
1080
|
-
|
|
1081
|
-
|
|
1082
|
-
__attribute__((aligned(GGML_MEM_ALIGN)))
|
|
1083
|
-
#endif
|
|
1084
|
-
char context_buffer[GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS*2*sizeof(struct ggml_tensor) + sizeof(struct ggml_cgraph)];
|
|
1089
|
+
char * context_buffer;
|
|
1090
|
+
size_t context_buffer_size;
|
|
1091
|
+
|
|
1092
|
+
bool debug;
|
|
1085
1093
|
};
|
|
1086
1094
|
|
|
1087
|
-
#define hash_id(tensor) ggml_hash_find_or_insert(sched->hash_set, tensor)
|
|
1088
|
-
#define tensor_backend_id(tensor) sched->
|
|
1095
|
+
#define hash_id(tensor) ggml_hash_find_or_insert(&sched->hash_set, tensor)
|
|
1096
|
+
#define tensor_backend_id(tensor) sched->hv_tensor_backend_ids[hash_id(tensor)]
|
|
1097
|
+
#define tensor_id_copy(id, backend_id, copy_id) sched->hv_tensor_copies[(id) * sched->n_backends * sched->n_copies + (backend_id) * sched->n_copies + (copy_id)]
|
|
1098
|
+
#define tensor_copy(tensor, backend_id, copy_id) tensor_id_copy(hash_id(tensor), backend_id, copy_id)
|
|
1089
1099
|
|
|
1090
1100
|
// returns the priority of the backend, lower id is higher priority
|
|
1091
1101
|
static int ggml_backend_sched_backend_id(ggml_backend_sched_t sched, ggml_backend_t backend) {
|
|
@@ -1097,22 +1107,24 @@ static int ggml_backend_sched_backend_id(ggml_backend_sched_t sched, ggml_backen
|
|
|
1097
1107
|
return -1;
|
|
1098
1108
|
}
|
|
1099
1109
|
|
|
1100
|
-
static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, const struct ggml_tensor * tensor) {
|
|
1110
|
+
static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, const struct ggml_tensor * tensor, const struct ggml_tensor * op) {
|
|
1101
1111
|
ggml_backend_buffer_t buffer = tensor->buffer;
|
|
1102
1112
|
if (buffer == NULL) {
|
|
1103
1113
|
return -1;
|
|
1104
1114
|
}
|
|
1105
1115
|
|
|
1106
|
-
// find highest prio backend that supports the buffer type
|
|
1116
|
+
// find highest prio backend that supports the buffer type and the op
|
|
1107
1117
|
for (int i = 0; i < sched->n_backends; i++) {
|
|
1108
|
-
if (
|
|
1118
|
+
if (ggml_backend_supports_buft(sched->backends[i], buffer->buft) &&
|
|
1119
|
+
ggml_backend_supports_op(sched->backends[i], op)) {
|
|
1109
1120
|
return i;
|
|
1110
1121
|
}
|
|
1111
1122
|
}
|
|
1112
1123
|
|
|
1113
|
-
|
|
1114
|
-
|
|
1115
|
-
|
|
1124
|
+
#ifndef NDEBUG
|
|
1125
|
+
fprintf(stderr, "%s: warning: no backend supports op %s with a weight with buffer type %s used in tensor %s, the weight will need to be copied\n",
|
|
1126
|
+
__func__, ggml_op_desc(tensor), ggml_backend_buffer_name(buffer), tensor->name);
|
|
1127
|
+
#endif
|
|
1116
1128
|
|
|
1117
1129
|
return -1;
|
|
1118
1130
|
}
|
|
@@ -1131,7 +1143,7 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
|
|
|
1131
1143
|
// TODO: use supports_op to check if the backend supports the op
|
|
1132
1144
|
|
|
1133
1145
|
// assign pre-allocated nodes to their backend
|
|
1134
|
-
int cur_backend_id = ggml_backend_sched_backend_from_buffer(sched, tensor);
|
|
1146
|
+
int cur_backend_id = ggml_backend_sched_backend_from_buffer(sched, tensor, tensor);
|
|
1135
1147
|
if (cur_backend_id != -1) {
|
|
1136
1148
|
SET_CAUSE(tensor, "1.dst");
|
|
1137
1149
|
return cur_backend_id;
|
|
@@ -1139,7 +1151,7 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
|
|
|
1139
1151
|
|
|
1140
1152
|
// view_src
|
|
1141
1153
|
if (tensor->view_src != NULL) {
|
|
1142
|
-
cur_backend_id = ggml_backend_sched_backend_from_buffer(sched, tensor->view_src);
|
|
1154
|
+
cur_backend_id = ggml_backend_sched_backend_from_buffer(sched, tensor->view_src, tensor);
|
|
1143
1155
|
if (cur_backend_id != -1) {
|
|
1144
1156
|
SET_CAUSE(tensor, "1.vsrc");
|
|
1145
1157
|
return cur_backend_id;
|
|
@@ -1153,7 +1165,6 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
|
|
|
1153
1165
|
return cur_backend_id;
|
|
1154
1166
|
}
|
|
1155
1167
|
|
|
1156
|
-
// assign nodes that use weights to the backend of the weights
|
|
1157
1168
|
// operations with weights are preferably run on the same backend as the weights
|
|
1158
1169
|
for (int i = 0; i < GGML_MAX_SRC; i++) {
|
|
1159
1170
|
const struct ggml_tensor * src = tensor->src[i];
|
|
@@ -1161,11 +1172,11 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
|
|
|
1161
1172
|
continue;
|
|
1162
1173
|
}
|
|
1163
1174
|
if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
|
|
1164
|
-
int src_backend_id = ggml_backend_sched_backend_from_buffer(sched, src);
|
|
1175
|
+
int src_backend_id = ggml_backend_sched_backend_from_buffer(sched, src, tensor);
|
|
1165
1176
|
// check if a backend with higher prio wants to offload the op
|
|
1166
1177
|
if (src_backend_id == sched->n_backends - 1) {
|
|
1167
1178
|
for (int b = 0; b < src_backend_id; b++) {
|
|
1168
|
-
if (ggml_backend_offload_op(sched->backends[b], tensor)) {
|
|
1179
|
+
if (ggml_backend_supports_op(sched->backends[b], tensor) && ggml_backend_offload_op(sched->backends[b], tensor)) {
|
|
1169
1180
|
SET_CAUSE(tensor, "1.off");
|
|
1170
1181
|
return b;
|
|
1171
1182
|
}
|
|
@@ -1223,10 +1234,33 @@ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, str
|
|
|
1223
1234
|
}
|
|
1224
1235
|
}
|
|
1225
1236
|
|
|
1226
|
-
|
|
1227
|
-
|
|
1228
|
-
|
|
1229
|
-
|
|
1237
|
+
static bool ggml_backend_sched_buffer_supported(ggml_backend_sched_t sched, struct ggml_tensor * t, int backend_id) {
|
|
1238
|
+
ggml_backend_buffer_t buf = t->view_src ? t->view_src->buffer : t->buffer;
|
|
1239
|
+
ggml_backend_buffer_type_t buft = NULL;
|
|
1240
|
+
|
|
1241
|
+
if (buf) {
|
|
1242
|
+
// the tensor is already allocated
|
|
1243
|
+
buft = buf->buft;
|
|
1244
|
+
} else {
|
|
1245
|
+
// see if the tensor already has a backend assigned, and use the buffer type of that backend
|
|
1246
|
+
int tensor_backend_id = tensor_backend_id(t);
|
|
1247
|
+
if (tensor_backend_id == -1 && t->view_src) {
|
|
1248
|
+
tensor_backend_id = tensor_backend_id(t->view_src);
|
|
1249
|
+
}
|
|
1250
|
+
if (tensor_backend_id != -1) {
|
|
1251
|
+
buft = sched->bufts[tensor_backend_id];
|
|
1252
|
+
}
|
|
1253
|
+
}
|
|
1254
|
+
|
|
1255
|
+
return buft != NULL && ggml_backend_supports_buft(sched->backends[backend_id], buft);
|
|
1256
|
+
}
|
|
1257
|
+
|
|
1258
|
+
static void ggml_backend_sched_set_if_supported(ggml_backend_sched_t sched, struct ggml_tensor * node, int cur_backend_id, int * node_backend_id) {
|
|
1259
|
+
if (ggml_backend_supports_op(sched->backends[cur_backend_id], node)) {
|
|
1260
|
+
*node_backend_id = cur_backend_id;
|
|
1261
|
+
SET_CAUSE(node, "2.sup");
|
|
1262
|
+
}
|
|
1263
|
+
}
|
|
1230
1264
|
|
|
1231
1265
|
// assigns backends to ops and splits the graph into subgraphs that can be computed on the same backend
|
|
1232
1266
|
static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
|
|
@@ -1236,7 +1270,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
|
|
|
1236
1270
|
sched->is_reset = false;
|
|
1237
1271
|
|
|
1238
1272
|
struct ggml_init_params params = {
|
|
1239
|
-
/* .mem_size = */
|
|
1273
|
+
/* .mem_size = */ sched->context_buffer_size,
|
|
1240
1274
|
/* .mem_buffer = */ sched->context_buffer,
|
|
1241
1275
|
/* .no_alloc = */ true
|
|
1242
1276
|
};
|
|
@@ -1245,52 +1279,52 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
|
|
|
1245
1279
|
|
|
1246
1280
|
sched->ctx = ggml_init(params);
|
|
1247
1281
|
if (sched->ctx == NULL) {
|
|
1248
|
-
|
|
1249
|
-
GGML_ASSERT(false);
|
|
1282
|
+
GGML_ABORT("%s: failed to initialize context\n", __func__);
|
|
1250
1283
|
}
|
|
1251
1284
|
|
|
1252
1285
|
// pass 1: assign backends to ops with pre-allocated inputs
|
|
1253
1286
|
for (int i = 0; i < graph->n_leafs; i++) {
|
|
1254
1287
|
struct ggml_tensor * leaf = graph->leafs[i];
|
|
1255
1288
|
int * leaf_backend_id = &tensor_backend_id(leaf);
|
|
1256
|
-
|
|
1257
|
-
|
|
1258
|
-
|
|
1289
|
+
// do not overwrite user assignments
|
|
1290
|
+
if (*leaf_backend_id == -1) {
|
|
1291
|
+
*leaf_backend_id = ggml_backend_sched_backend_id_from_cur(sched, leaf);
|
|
1259
1292
|
}
|
|
1260
|
-
*leaf_backend_id = ggml_backend_sched_backend_id_from_cur(sched, leaf);
|
|
1261
1293
|
}
|
|
1262
1294
|
|
|
1263
1295
|
for (int i = 0; i < graph->n_nodes; i++) {
|
|
1264
1296
|
struct ggml_tensor * node = graph->nodes[i];
|
|
1265
1297
|
int * node_backend_id = &tensor_backend_id(node);
|
|
1266
|
-
|
|
1267
|
-
|
|
1268
|
-
|
|
1269
|
-
|
|
1270
|
-
|
|
1271
|
-
|
|
1272
|
-
|
|
1273
|
-
struct ggml_tensor * src = node->src[j];
|
|
1274
|
-
if (src == NULL) {
|
|
1298
|
+
// do not overwrite user assignments
|
|
1299
|
+
if (*node_backend_id == -1) {
|
|
1300
|
+
*node_backend_id = ggml_backend_sched_backend_id_from_cur(sched, node);
|
|
1301
|
+
|
|
1302
|
+
#if 0
|
|
1303
|
+
// src
|
|
1304
|
+
if (node->op == GGML_OP_NONE) {
|
|
1275
1305
|
continue;
|
|
1276
1306
|
}
|
|
1277
|
-
|
|
1278
|
-
|
|
1279
|
-
*
|
|
1307
|
+
|
|
1308
|
+
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
|
1309
|
+
struct ggml_tensor * src = node->src[j];
|
|
1310
|
+
if (src == NULL) {
|
|
1311
|
+
continue;
|
|
1312
|
+
}
|
|
1313
|
+
int * src_backend_id = &tensor_backend_id(src);
|
|
1314
|
+
if (*src_backend_id == -1) {
|
|
1315
|
+
*src_backend_id = ggml_backend_sched_backend_id_from_cur(sched, src);
|
|
1316
|
+
}
|
|
1280
1317
|
}
|
|
1318
|
+
#endif
|
|
1281
1319
|
}
|
|
1282
1320
|
}
|
|
1283
|
-
#ifdef DEBUG_PASS1
|
|
1284
|
-
fprintf(stderr, "PASS 1 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph);
|
|
1285
|
-
#endif
|
|
1286
1321
|
|
|
1287
1322
|
// pass 2: expand current backend assignments
|
|
1288
1323
|
// assign the same backend to adjacent nodes
|
|
1289
1324
|
// expand gpu backends (i.e. non last prio) up and down, ignoring cpu (the lowest priority backend)
|
|
1290
1325
|
// thus, cpu will never be used unless weights are on cpu, or there are no gpu ops between cpu ops
|
|
1291
|
-
|
|
1292
|
-
|
|
1293
|
-
// pass 2.2 expand gpu down
|
|
1326
|
+
// ops unsupported by the backend being expanded will be left unassigned so that they can be assigned later when the locations of its inputs are known
|
|
1327
|
+
// expand gpu down
|
|
1294
1328
|
{
|
|
1295
1329
|
int cur_backend_id = -1;
|
|
1296
1330
|
for (int i = 0; i < graph->n_nodes; i++) {
|
|
@@ -1306,13 +1340,12 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
|
|
|
1306
1340
|
} else {
|
|
1307
1341
|
cur_backend_id = *node_backend_id;
|
|
1308
1342
|
}
|
|
1309
|
-
} else {
|
|
1310
|
-
|
|
1311
|
-
SET_CAUSE(node, "2.2");
|
|
1343
|
+
} else if (cur_backend_id != -1) {
|
|
1344
|
+
ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
|
|
1312
1345
|
}
|
|
1313
1346
|
}
|
|
1314
1347
|
}
|
|
1315
|
-
//
|
|
1348
|
+
// expand gpu up
|
|
1316
1349
|
{
|
|
1317
1350
|
int cur_backend_id = -1;
|
|
1318
1351
|
for (int i = graph->n_nodes - 1; i >= 0; i--) {
|
|
@@ -1328,13 +1361,12 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
|
|
|
1328
1361
|
} else {
|
|
1329
1362
|
cur_backend_id = *node_backend_id;
|
|
1330
1363
|
}
|
|
1331
|
-
} else {
|
|
1332
|
-
|
|
1333
|
-
SET_CAUSE(node, "2.1");
|
|
1364
|
+
} else if (cur_backend_id != -1) {
|
|
1365
|
+
ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
|
|
1334
1366
|
}
|
|
1335
1367
|
}
|
|
1336
1368
|
}
|
|
1337
|
-
//
|
|
1369
|
+
// expand rest down
|
|
1338
1370
|
{
|
|
1339
1371
|
int cur_backend_id = -1;
|
|
1340
1372
|
for (int i = 0; i < graph->n_nodes; i++) {
|
|
@@ -1345,13 +1377,12 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
|
|
|
1345
1377
|
int * node_backend_id = &tensor_backend_id(node);
|
|
1346
1378
|
if (*node_backend_id != -1) {
|
|
1347
1379
|
cur_backend_id = *node_backend_id;
|
|
1348
|
-
} else {
|
|
1349
|
-
|
|
1350
|
-
SET_CAUSE(node, "2.4");
|
|
1380
|
+
} else if (cur_backend_id != -1) {
|
|
1381
|
+
ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
|
|
1351
1382
|
}
|
|
1352
1383
|
}
|
|
1353
1384
|
}
|
|
1354
|
-
//
|
|
1385
|
+
// expand rest up
|
|
1355
1386
|
{
|
|
1356
1387
|
int cur_backend_id = -1;
|
|
1357
1388
|
for (int i = graph->n_nodes - 1; i >= 0; i--) {
|
|
@@ -1362,24 +1393,80 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
|
|
|
1362
1393
|
int * node_backend_id = &tensor_backend_id(node);
|
|
1363
1394
|
if (*node_backend_id != -1) {
|
|
1364
1395
|
cur_backend_id = *node_backend_id;
|
|
1365
|
-
} else {
|
|
1366
|
-
|
|
1367
|
-
SET_CAUSE(node, "2.3");
|
|
1396
|
+
} else if (cur_backend_id != -1) {
|
|
1397
|
+
ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
|
|
1368
1398
|
}
|
|
1369
1399
|
}
|
|
1370
1400
|
}
|
|
1371
1401
|
|
|
1372
|
-
|
|
1373
|
-
|
|
1374
|
-
|
|
1402
|
+
// pass 3: upgrade nodes to higher prio backends with compatible buffer types
|
|
1403
|
+
// if the tensor is already in the same buffer type (*) as another higher priority backend, we should move it there
|
|
1404
|
+
// however, we also need to verify that the sources are in compatible buffer types
|
|
1405
|
+
// (*) the actual requirement is more relaxed, the buffer type of the backend should be supported by all the users of this tensor further down the graph
|
|
1406
|
+
// however, this is slow to verify, so we have a more strict requirement that the buffer type is the same
|
|
1407
|
+
// this is not uncommon since multiple backends can use host memory, with the same buffer type (eg. BLAS and CPU)
|
|
1408
|
+
// additionally, set remaining unassigned nodes to the backend with the most supported inputs
|
|
1409
|
+
// only nodes that could not be assigned during expansion due to the backend not supporting the op should be unassigned at this point
|
|
1410
|
+
for (int i = 0; i < graph->n_nodes; i++) {
|
|
1411
|
+
struct ggml_tensor * node = graph->nodes[i];
|
|
1412
|
+
if (ggml_is_view_op(node->op)) {
|
|
1413
|
+
continue;
|
|
1414
|
+
}
|
|
1415
|
+
int * node_backend_id = &tensor_backend_id(node);
|
|
1416
|
+
if (*node_backend_id == -1) {
|
|
1417
|
+
// unassigned node: find the backend with the most supported inputs
|
|
1418
|
+
int n_supported_best = -1;
|
|
1419
|
+
for (int b = 0; b < sched->n_backends; b++) {
|
|
1420
|
+
if (ggml_backend_supports_op(sched->backends[b], node)) {
|
|
1421
|
+
int n_supported = 0;
|
|
1422
|
+
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
|
1423
|
+
struct ggml_tensor * src = node->src[j];
|
|
1424
|
+
if (src == NULL) {
|
|
1425
|
+
continue;
|
|
1426
|
+
}
|
|
1427
|
+
if ((tensor_backend_id(src) != -1 || tensor_backend_id(src->view_src) != -1) && ggml_backend_sched_buffer_supported(sched, src, b)) {
|
|
1428
|
+
n_supported++;
|
|
1429
|
+
}
|
|
1430
|
+
}
|
|
1431
|
+
if (n_supported > n_supported_best) {
|
|
1432
|
+
n_supported_best = n_supported;
|
|
1433
|
+
*node_backend_id = b;
|
|
1434
|
+
SET_CAUSE(node, "3.best");
|
|
1435
|
+
}
|
|
1436
|
+
}
|
|
1437
|
+
}
|
|
1438
|
+
} else {
|
|
1439
|
+
// assigned node: upgrade to higher prio backend if possible
|
|
1440
|
+
for (int b = 0; b < *node_backend_id; b++) {
|
|
1441
|
+
if (sched->bufts[b] == sched->bufts[*node_backend_id] && ggml_backend_supports_op(sched->backends[b], node)) {
|
|
1442
|
+
bool supported = true;
|
|
1443
|
+
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
|
1444
|
+
struct ggml_tensor * src = node->src[j];
|
|
1445
|
+
if (src == NULL) {
|
|
1446
|
+
continue;
|
|
1447
|
+
}
|
|
1448
|
+
if (!ggml_backend_sched_buffer_supported(sched, src, b)) {
|
|
1449
|
+
supported = false;
|
|
1450
|
+
break;
|
|
1451
|
+
}
|
|
1452
|
+
}
|
|
1453
|
+
if (supported) {
|
|
1454
|
+
*node_backend_id = b;
|
|
1455
|
+
SET_CAUSE(node, "3.upg");
|
|
1456
|
+
break;
|
|
1457
|
+
}
|
|
1458
|
+
}
|
|
1459
|
+
}
|
|
1460
|
+
}
|
|
1461
|
+
}
|
|
1375
1462
|
|
|
1376
|
-
// pass
|
|
1463
|
+
// pass 4: assign backends to remaining src from dst and view_src
|
|
1377
1464
|
for (int i = 0; i < graph->n_nodes; i++) {
|
|
1378
1465
|
struct ggml_tensor * node = graph->nodes[i];
|
|
1379
1466
|
int * cur_backend_id = &tensor_backend_id(node);
|
|
1380
1467
|
if (node->view_src != NULL && *cur_backend_id == -1) {
|
|
1381
1468
|
*cur_backend_id = tensor_backend_id(node->view_src);
|
|
1382
|
-
SET_CAUSE(node, "
|
|
1469
|
+
SET_CAUSE(node, "4.vsrc");
|
|
1383
1470
|
}
|
|
1384
1471
|
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
|
1385
1472
|
struct ggml_tensor * src = node->src[j];
|
|
@@ -1391,24 +1478,22 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
|
|
|
1391
1478
|
if (src->view_src != NULL) {
|
|
1392
1479
|
// views are always on the same backend as the source
|
|
1393
1480
|
*src_backend_id = tensor_backend_id(src->view_src);
|
|
1394
|
-
SET_CAUSE(src, "
|
|
1481
|
+
SET_CAUSE(src, "4.vsrc");
|
|
1395
1482
|
} else {
|
|
1396
1483
|
*src_backend_id = *cur_backend_id;
|
|
1397
|
-
SET_CAUSE(src, "
|
|
1484
|
+
SET_CAUSE(src, "4.cur");
|
|
1398
1485
|
}
|
|
1399
1486
|
}
|
|
1400
1487
|
}
|
|
1401
1488
|
}
|
|
1402
|
-
#ifdef DEBUG_PASS3
|
|
1403
|
-
fprintf(stderr, "PASS 3 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph);
|
|
1404
|
-
#endif
|
|
1405
1489
|
|
|
1406
|
-
// pass
|
|
1490
|
+
// pass 5: split graph, find tensors that need to be copied
|
|
1407
1491
|
{
|
|
1408
1492
|
int i_split = 0;
|
|
1409
1493
|
struct ggml_backend_sched_split * split = &sched->splits[0];
|
|
1410
1494
|
// find the backend of the first split, skipping view ops
|
|
1411
|
-
|
|
1495
|
+
int i = 0;
|
|
1496
|
+
for (; i < graph->n_nodes; i++) {
|
|
1412
1497
|
struct ggml_tensor * node = graph->nodes[i];
|
|
1413
1498
|
if (!ggml_is_view_op(node->op)) {
|
|
1414
1499
|
split->backend_id = tensor_backend_id(node);
|
|
@@ -1417,9 +1502,8 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
|
|
|
1417
1502
|
}
|
|
1418
1503
|
split->i_start = 0;
|
|
1419
1504
|
split->n_inputs = 0;
|
|
1420
|
-
memset(split->inputs, 0, sizeof(split->inputs)); //HACK
|
|
1421
1505
|
int cur_backend_id = split->backend_id;
|
|
1422
|
-
for (
|
|
1506
|
+
for (; i < graph->n_nodes; i++) {
|
|
1423
1507
|
struct ggml_tensor * node = graph->nodes[i];
|
|
1424
1508
|
|
|
1425
1509
|
if (ggml_is_view_op(node->op)) {
|
|
@@ -1428,7 +1512,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
|
|
|
1428
1512
|
|
|
1429
1513
|
const int node_backend_id = tensor_backend_id(node);
|
|
1430
1514
|
|
|
1431
|
-
|
|
1515
|
+
assert(node_backend_id != -1); // all nodes should be assigned by now
|
|
1432
1516
|
|
|
1433
1517
|
// check if we should start a new split based on the sources of the current node
|
|
1434
1518
|
bool need_new_split = false;
|
|
@@ -1442,16 +1526,18 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
|
|
|
1442
1526
|
// by starting a new split, the memory of the previously offloaded weights can be reused
|
|
1443
1527
|
if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
|
|
1444
1528
|
int src_backend_id = tensor_backend_id(src);
|
|
1445
|
-
if (src_backend_id !=
|
|
1529
|
+
if (src_backend_id != cur_backend_id) {
|
|
1446
1530
|
need_new_split = true;
|
|
1447
1531
|
break;
|
|
1448
1532
|
}
|
|
1449
1533
|
}
|
|
1450
1534
|
// check if the split has too many inputs
|
|
1535
|
+
// FIXME: count the number of inputs instead of only checking when full
|
|
1451
1536
|
if (split->n_inputs == GGML_SCHED_MAX_SPLIT_INPUTS) {
|
|
1452
1537
|
const size_t id = hash_id(src);
|
|
1453
|
-
int src_backend_id = sched->
|
|
1454
|
-
|
|
1538
|
+
int src_backend_id = sched->hv_tensor_backend_ids[id];
|
|
1539
|
+
bool supported = ggml_backend_sched_buffer_supported(sched, src, cur_backend_id);
|
|
1540
|
+
if (src_backend_id != cur_backend_id && tensor_id_copy(id, cur_backend_id, 0) == NULL && !supported) {
|
|
1455
1541
|
//printf("starting new split because of too many inputs: node %s, input %s\n", node->name, src->name);
|
|
1456
1542
|
need_new_split = true;
|
|
1457
1543
|
break;
|
|
@@ -1483,12 +1569,12 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
|
|
|
1483
1569
|
continue;
|
|
1484
1570
|
}
|
|
1485
1571
|
|
|
1486
|
-
|
|
1572
|
+
size_t src_id = hash_id(src);
|
|
1573
|
+
const int src_backend_id = sched->hv_tensor_backend_ids[src_id];
|
|
1487
1574
|
assert(src_backend_id != -1); // all inputs should be assigned by now
|
|
1488
1575
|
|
|
1489
|
-
if (src->flags & GGML_TENSOR_FLAG_INPUT && sched->n_copies > 1)
|
|
1490
|
-
|
|
1491
|
-
if (sched->tensor_copies[id][src_backend_id][0] == NULL) {
|
|
1576
|
+
if (src->flags & GGML_TENSOR_FLAG_INPUT && sched->n_copies > 1) {
|
|
1577
|
+
if (tensor_id_copy(src_id, src_backend_id, 0) == NULL) {
|
|
1492
1578
|
ggml_backend_t backend = sched->backends[src_backend_id];
|
|
1493
1579
|
for (int c = 0; c < sched->n_copies; c++) {
|
|
1494
1580
|
struct ggml_tensor * tensor_copy;
|
|
@@ -1502,7 +1588,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
|
|
|
1502
1588
|
ggml_set_input(tensor_copy);
|
|
1503
1589
|
ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor
|
|
1504
1590
|
}
|
|
1505
|
-
|
|
1591
|
+
tensor_id_copy(src_id, src_backend_id, c) = tensor_copy;
|
|
1506
1592
|
SET_CAUSE(tensor_copy, "4.cpy");
|
|
1507
1593
|
}
|
|
1508
1594
|
int n_graph_inputs = sched->n_graph_inputs++;
|
|
@@ -1511,10 +1597,9 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
|
|
|
1511
1597
|
}
|
|
1512
1598
|
}
|
|
1513
1599
|
|
|
1514
|
-
if (src_backend_id !=
|
|
1600
|
+
if (src_backend_id != cur_backend_id && !ggml_backend_sched_buffer_supported(sched, src, cur_backend_id)) {
|
|
1515
1601
|
// create a copy of the input in the split's backend
|
|
1516
|
-
|
|
1517
|
-
if (sched->tensor_copies[id][cur_backend_id][0] == NULL) {
|
|
1602
|
+
if (tensor_id_copy(src_id, cur_backend_id, 0) == NULL) {
|
|
1518
1603
|
ggml_backend_t backend = sched->backends[cur_backend_id];
|
|
1519
1604
|
for (int c = 0; c < sched->n_copies; c++) {
|
|
1520
1605
|
struct ggml_tensor * tensor_copy = ggml_dup_tensor_layout(sched->ctx, src);
|
|
@@ -1523,27 +1608,49 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
|
|
|
1523
1608
|
ggml_set_input(tensor_copy);
|
|
1524
1609
|
ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor
|
|
1525
1610
|
}
|
|
1526
|
-
|
|
1611
|
+
tensor_id_copy(src_id, cur_backend_id, c) = tensor_copy;
|
|
1527
1612
|
SET_CAUSE(tensor_copy, "4.cpy");
|
|
1528
1613
|
}
|
|
1529
1614
|
int n_inputs = split->n_inputs++;
|
|
1530
1615
|
GGML_ASSERT(n_inputs < GGML_SCHED_MAX_SPLIT_INPUTS);
|
|
1531
1616
|
split->inputs[n_inputs] = src;
|
|
1532
1617
|
}
|
|
1533
|
-
node->src[j] =
|
|
1618
|
+
node->src[j] = tensor_id_copy(src_id, cur_backend_id, sched->cur_copy);
|
|
1534
1619
|
}
|
|
1535
1620
|
}
|
|
1536
1621
|
}
|
|
1537
1622
|
split->i_end = graph->n_nodes;
|
|
1538
1623
|
sched->n_splits = i_split + 1;
|
|
1539
1624
|
}
|
|
1540
|
-
#ifdef DEBUG_PASS4
|
|
1541
|
-
fprintf(stderr, "PASS 4 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph);
|
|
1542
|
-
#endif
|
|
1543
1625
|
|
|
1544
|
-
|
|
1545
|
-
|
|
1546
|
-
|
|
1626
|
+
if (sched->debug) {
|
|
1627
|
+
ggml_backend_sched_print_assignments(sched, graph);
|
|
1628
|
+
}
|
|
1629
|
+
|
|
1630
|
+
// swap node_backend_ids and leaf _backend_ids with prevs
|
|
1631
|
+
{
|
|
1632
|
+
int * tmp = sched->node_backend_ids;
|
|
1633
|
+
sched->node_backend_ids = sched->prev_node_backend_ids;
|
|
1634
|
+
sched->prev_node_backend_ids = tmp;
|
|
1635
|
+
|
|
1636
|
+
tmp = sched->leaf_backend_ids;
|
|
1637
|
+
sched->leaf_backend_ids = sched->prev_leaf_backend_ids;
|
|
1638
|
+
sched->prev_leaf_backend_ids = tmp;
|
|
1639
|
+
}
|
|
1640
|
+
|
|
1641
|
+
int graph_size = graph->n_nodes + sched->n_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2;
|
|
1642
|
+
if (sched->graph.size < graph_size) {
|
|
1643
|
+
sched->graph.size = graph_size;
|
|
1644
|
+
sched->graph.nodes = realloc(sched->graph.nodes, graph_size * sizeof(struct ggml_tensor *));
|
|
1645
|
+
sched->graph.leafs = realloc(sched->graph.leafs, graph_size * sizeof(struct ggml_tensor *));
|
|
1646
|
+
GGML_ASSERT(sched->graph.nodes != NULL);
|
|
1647
|
+
GGML_ASSERT(sched->graph.leafs != NULL);
|
|
1648
|
+
}
|
|
1649
|
+
sched->graph.n_nodes = 0;
|
|
1650
|
+
sched->graph.n_leafs = 0;
|
|
1651
|
+
|
|
1652
|
+
struct ggml_cgraph * graph_copy = &sched->graph;
|
|
1653
|
+
|
|
1547
1654
|
for (int i = 0; i < sched->n_splits; i++) {
|
|
1548
1655
|
struct ggml_backend_sched_split * split = &sched->splits[i];
|
|
1549
1656
|
split->graph = ggml_graph_view(graph, split->i_start, split->i_end);
|
|
@@ -1554,12 +1661,12 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
|
|
|
1554
1661
|
|
|
1555
1662
|
struct ggml_tensor * input = split->inputs[j];
|
|
1556
1663
|
const size_t input_id = hash_id(input);
|
|
1557
|
-
struct ggml_tensor * input_cpy =
|
|
1664
|
+
struct ggml_tensor * input_cpy = tensor_id_copy(input_id, split->backend_id, sched->cur_copy);
|
|
1558
1665
|
|
|
1559
1666
|
// add a dependency to the input source so that it is not freed before the copy is done
|
|
1560
1667
|
struct ggml_tensor * input_dep = ggml_view_tensor(sched->ctx, input);
|
|
1561
1668
|
input_dep->src[0] = input;
|
|
1562
|
-
sched->node_backend_ids[graph_copy->n_nodes] = sched->
|
|
1669
|
+
sched->node_backend_ids[graph_copy->n_nodes] = sched->hv_tensor_backend_ids[input_id];
|
|
1563
1670
|
graph_copy->nodes[graph_copy->n_nodes++] = input_dep;
|
|
1564
1671
|
|
|
1565
1672
|
// add a dependency to the input copy so that it is allocated at the start of the split
|
|
@@ -1581,7 +1688,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
|
|
|
1581
1688
|
size_t id = hash_id(input);
|
|
1582
1689
|
int backend_id = tensor_backend_id(input);
|
|
1583
1690
|
for (int c = 0; c < sched->n_copies; c++) {
|
|
1584
|
-
struct ggml_tensor * input_cpy =
|
|
1691
|
+
struct ggml_tensor * input_cpy = tensor_id_copy(id, backend_id, c);
|
|
1585
1692
|
sched->leaf_backend_ids[graph_copy->n_leafs] = backend_id;
|
|
1586
1693
|
graph_copy->leafs[graph_copy->n_leafs++] = input_cpy;
|
|
1587
1694
|
}
|
|
@@ -1594,7 +1701,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
|
|
|
1594
1701
|
struct ggml_tensor * input = split->inputs[j];
|
|
1595
1702
|
size_t id = hash_id(input);
|
|
1596
1703
|
for (int c = 0; c < sched->n_copies; c++) {
|
|
1597
|
-
struct ggml_tensor * input_cpy =
|
|
1704
|
+
struct ggml_tensor * input_cpy = tensor_id_copy(id, backend_id, c);
|
|
1598
1705
|
sched->leaf_backend_ids[graph_copy->n_leafs] = backend_id;
|
|
1599
1706
|
graph_copy->leafs[graph_copy->n_leafs++] = input_cpy;
|
|
1600
1707
|
}
|
|
@@ -1608,20 +1715,36 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
         sched->leaf_backend_ids[graph_copy->n_leafs] = tensor_backend_id(leaf);
         graph_copy->leafs[graph_copy->n_leafs++] = leaf;
     }
-
-    sched->graph = graph_copy;
 }
 
 static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
+    bool backend_ids_changed = false;
+    for (int i = 0; i < sched->graph.n_nodes; i++) {
+        if (sched->node_backend_ids[i] != sched->prev_node_backend_ids[i] &&
+            sched->bufts[sched->node_backend_ids[i]] != sched->bufts[sched->prev_node_backend_ids[i]]) {
+            backend_ids_changed = true;
+            break;
+        }
+    }
+    if (!backend_ids_changed) {
+        for (int i = 0; i < sched->graph.n_leafs; i++) {
+            if (sched->leaf_backend_ids[i] != sched->prev_leaf_backend_ids[i] &&
+                sched->bufts[sched->leaf_backend_ids[i]] != sched->bufts[sched->prev_leaf_backend_ids[i]]) {
+                backend_ids_changed = true;
+                break;
+            }
+        }
+    }
+
     // allocate graph
-    if (!ggml_gallocr_alloc_graph(sched->galloc, sched->graph)) {
+    if (backend_ids_changed || !ggml_gallocr_alloc_graph(sched->galloc, &sched->graph)) {
         // the re-allocation may cause the split inputs to be moved to a different address
         ggml_backend_sched_synchronize(sched);
 #ifndef NDEBUG
-        fprintf(stderr, "%s: failed to allocate graph, reserving\n", __func__);
+        fprintf(stderr, "%s: failed to allocate graph, reserving (backend_ids_changed = %d)\n", __func__, backend_ids_changed);
 #endif
-        ggml_gallocr_reserve_n(sched->galloc, sched->graph, sched->node_backend_ids, sched->leaf_backend_ids);
-        if (!ggml_gallocr_alloc_graph(sched->galloc, sched->graph)) {
+        ggml_gallocr_reserve_n(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids);
+        if (!ggml_gallocr_alloc_graph(sched->galloc, &sched->graph)) {
            fprintf(stderr, "%s: failed to allocate graph\n", __func__);
            return false;
        }
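
The added `backend_ids_changed` scan is what makes the new early-out in the allocation path work: the previous run's node/leaf assignments are kept in the new `prev_*_backend_ids` arrays, and a re-reserve is forced only when a tensor moved to a different buffer type (a different backend id backed by the same buffer type can reuse the existing allocation). The comparison in isolation, with illustrative names (not the ggml source):

```c
#include <stdbool.h>
#include <stddef.h>

// returns true only when some assignment changed *and* the change implies a
// different buffer type; bufts[] maps backend id -> buffer type handle
static bool assignments_changed(const int * cur, const int * prev, int n,
                                const void * const * bufts) {
    for (int i = 0; i < n; i++) {
        if (cur[i] != prev[i] && bufts[cur[i]] != bufts[prev[i]]) {
            return true;
        }
    }
    return false;
}
```

Presumably the scheduler copies the current ids into the `prev_*` arrays once allocation succeeds; that bookkeeping sits outside this hunk.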
@@ -1642,7 +1765,7 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
         for (int j = 0; j < split->n_inputs; j++) {
             ggml_backend_t input_backend = ggml_backend_sched_get_tensor_backend(sched, split->inputs[j]);
             struct ggml_tensor * input = split->inputs[j];
-            struct ggml_tensor * input_cpy = sched->tensor_copies[hash_id(input)][split_backend_id][sched->cur_copy];
+            struct ggml_tensor * input_cpy = tensor_copy(input, split_backend_id, sched->cur_copy);
 
             if (input->flags & GGML_TENSOR_FLAG_INPUT) {
                 // inputs from the user must be copied immediately to prevent the user overwriting the data before the copy is done
@@ -1727,18 +1850,24 @@ ggml_backend_sched_t ggml_backend_sched_new(
 
     struct ggml_backend_sched * sched = calloc(1, sizeof(struct ggml_backend_sched));
 
+    sched->debug = getenv("GGML_SCHED_DEBUG") != NULL;
+    sched->n_backends = n_backends;
+    sched->n_copies = parallel ? GGML_SCHED_MAX_COPIES : 1;
+
     // initialize hash table
-    sched->hash_set          = ggml_hash_set_new(graph_size);
-    sched->tensor_backend_id = calloc(sched->hash_set.size, sizeof(sched->tensor_backend_id[0]));
-    sched->tensor_copies     = calloc(sched->hash_set.size, sizeof(sched->tensor_copies[0]));
+    // FIXME: needs to be size*2 to account for leafs (do it in graph_split instead)
+    sched->hash_set    = ggml_hash_set_new(graph_size);
+    sched->hv_tensor_backend_ids = malloc(sched->hash_set.size * sizeof(sched->hv_tensor_backend_ids[0]));
+    sched->hv_tensor_copies      = malloc(sched->hash_set.size * sched->n_backends * sched->n_copies * sizeof(struct ggml_tensor *));
 
     const size_t nodes_size = graph_size + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS*2;
-    sched->node_backend_ids = calloc(nodes_size, sizeof(sched->node_backend_ids[0]));
-    sched->leaf_backend_ids = calloc(nodes_size, sizeof(sched->leaf_backend_ids[0]));
-
-    sched->n_backends = n_backends;
+    sched->node_backend_ids = calloc(nodes_size, sizeof(sched->node_backend_ids[0]));
+    sched->leaf_backend_ids = calloc(nodes_size, sizeof(sched->leaf_backend_ids[0]));
+    sched->prev_node_backend_ids = calloc(nodes_size, sizeof(sched->prev_node_backend_ids[0]));
+    sched->prev_leaf_backend_ids = calloc(nodes_size, sizeof(sched->prev_leaf_backend_ids[0]));
 
-    sched->n_copies = parallel ? GGML_SCHED_MAX_COPIES : 1;
+    sched->context_buffer_size = GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS*2*sizeof(struct ggml_tensor) + ggml_graph_overhead_custom(graph_size, false);
+    sched->context_buffer = malloc(sched->context_buffer_size);
 
     const int initial_splits_capacity = 16;
     sched->splits = calloc(initial_splits_capacity, sizeof(sched->splits[0]));
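
`hv_tensor_copies` is now sized as `hash_set.size * n_backends * n_copies` pointers, so its footprint scales with the backends actually registered rather than with `GGML_SCHED_MAX_BACKENDS`. A back-of-envelope check with assumed numbers (`GGML_SCHED_MAX_COPIES` is taken to be 4 here, and all figures are illustrative):

```c
#include <stdio.h>

int main(void) {
    size_t hash_size  = 2048; // hash_set.size for a ~2048-tensor graph (assumed)
    int    n_backends = 2;    // e.g. one GPU backend + CPU
    int    n_copies   = 4;    // parallel ? GGML_SCHED_MAX_COPIES : 1
    size_t bytes = hash_size * n_backends * n_copies * sizeof(void *);
    printf("hv_tensor_copies: %zu KiB\n", bytes / 1024); // 128 KiB on a 64-bit target
    return 0;
}
```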
@@ -1747,7 +1876,7 @@ ggml_backend_sched_t ggml_backend_sched_new(
     for (int b = 0; b < n_backends; b++) {
         sched->backends[b] = backends[b];
         sched->bufts[b] = bufts ? bufts[b] : ggml_backend_get_default_buffer_type(backends[b]);
-        GGML_ASSERT(ggml_backend_buft_supports_backend(sched->bufts[b], backends[b]));
+        GGML_ASSERT(ggml_backend_supports_buft(backends[b], sched->bufts[b]));
         if (sched->n_copies > 1) {
             for (int c = 0; c < sched->n_copies; c++) {
                 sched->events[b][c] = ggml_backend_event_new(backends[b]);
@@ -1773,35 +1902,37 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {
     }
     ggml_gallocr_free(sched->galloc);
     ggml_free(sched->ctx);
+    ggml_hash_set_free(&sched->hash_set);
     free(sched->splits);
-    free(sched->hash_set.keys);
-    free(sched->tensor_backend_id);
-    free(sched->tensor_copies);
+    free(sched->hv_tensor_backend_ids);
+    free(sched->hv_tensor_copies);
     free(sched->node_backend_ids);
     free(sched->leaf_backend_ids);
+    free(sched->prev_node_backend_ids);
+    free(sched->prev_leaf_backend_ids);
+    free(sched->context_buffer);
+    free(sched->graph.nodes);
+    free(sched->graph.leafs);
     free(sched);
 }
 
 void ggml_backend_sched_reset(ggml_backend_sched_t sched) {
     // reset state for the next run
     if (!sched->is_reset) {
-        size_t hash_size = sched->hash_set.size;
-        memset(sched->hash_set.keys, 0, sizeof(sched->hash_set.keys[0]) * hash_size);
-        memset(sched->tensor_backend_id, -1, sizeof(sched->tensor_backend_id[0]) * hash_size);
-        memset(sched->tensor_copies, 0, sizeof(sched->tensor_copies[0]) * hash_size);
-
+        ggml_hash_set_reset(&sched->hash_set);
+        memset(sched->hv_tensor_backend_ids, -1, sched->hash_set.size * sizeof(sched->hv_tensor_backend_ids[0]));
+        memset(sched->hv_tensor_copies, 0, sched->hash_set.size * sched->n_backends * sched->n_copies * sizeof(struct ggml_tensor *));
         sched->is_reset = true;
     }
     sched->is_alloc = false;
 }
 
 bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) {
-    GGML_ASSERT((int)sched->hash_set.size >= measure_graph->n_nodes);
+    GGML_ASSERT((int)sched->hash_set.size >= measure_graph->n_nodes + measure_graph->n_leafs);
 
     ggml_backend_sched_split_graph(sched, measure_graph);
 
-
-    if (!ggml_gallocr_reserve_n(sched->galloc, sched->graph, sched->node_backend_ids, sched->leaf_backend_ids)) {
+    if (!ggml_gallocr_reserve_n(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids)) {
         return false;
     }
 
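
In `ggml_backend_sched_reset`, zeroing the hash keys is replaced by `ggml_hash_set_reset`. Assuming the bundled hash set tracks slot occupancy in a bitset (which is what a cheap reset implies), the reset only has to clear that bitset instead of the full key array; a sketch of the idea with illustrative types, not the ggml implementation:

```c
#include <stddef.h>
#include <stdint.h>
#include <string.h>

typedef uint32_t bitset_t;
#define BITSET_WORDS(n) (((n) + 31) / 32)

struct hash_set {
    size_t     size;
    bitset_t * used; // 1 bit per slot: set = occupied
    void    ** keys; // only meaningful where the bit is set
};

static void hash_set_reset(struct hash_set * hs) {
    // clearing size/32 words is far cheaper than zeroing size pointer keys;
    // stale keys are ignored because their occupancy bits are clear
    memset(hs->used, 0, BITSET_WORDS(hs->size) * sizeof(bitset_t));
}
```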
@@ -1812,10 +1943,11 @@ bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph *
 }
 
 bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
-    GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes);
+    GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes + graph->n_leafs);
 
     ggml_backend_sched_split_graph(sched, graph);
 
+
     if (!ggml_backend_sched_alloc_splits(sched)) {
         return false;
     }
@@ -1864,6 +1996,15 @@ int ggml_backend_sched_get_n_copies(ggml_backend_sched_t sched) {
     return sched->n_copies;
 }
 
+int ggml_backend_sched_get_n_backends(ggml_backend_sched_t sched) {
+    return sched->n_backends;
+}
+
+ggml_backend_t ggml_backend_sched_get_backend(ggml_backend_sched_t sched, int i) {
+    GGML_ASSERT(i >= 0 && i < sched->n_backends);
+    return sched->backends[i];
+}
+
 size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend) {
     int backend_index = ggml_backend_sched_backend_id(sched, backend);
     GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
@@ -1875,6 +2016,8 @@ void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct gg
     int backend_index = ggml_backend_sched_backend_id(sched, backend);
     GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
     tensor_backend_id(node) = backend_index;
+    SET_CAUSE(node, "usr");
+    sched->is_reset = false;
 }
 
 ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node) {
@@ -1887,15 +2030,15 @@ ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched,
 
 // utils
 
-void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
+void ggml_backend_view_init(struct ggml_tensor * tensor) {
     GGML_ASSERT(tensor->buffer == NULL);
     GGML_ASSERT(tensor->view_src != NULL);
     GGML_ASSERT(tensor->view_src->buffer != NULL);
     GGML_ASSERT(tensor->view_src->data != NULL);
 
-    tensor->buffer = buffer;
+    tensor->buffer = tensor->view_src->buffer;
     tensor->data = (char *)tensor->view_src->data + tensor->view_offs;
-    ggml_backend_buffer_init_tensor(buffer, tensor);
+    ggml_backend_buffer_init_tensor(tensor->buffer, tensor);
 }
 
 void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr) {
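
`ggml_backend_view_init` drops its buffer parameter: a view can only live in its source tensor's buffer, so the argument was redundant and is now derived from `tensor->view_src`. A toy analog of the signature change (illustrative types, not the ggml source):

```c
#include <assert.h>
#include <stddef.h>

struct buffer; // opaque, illustrative
struct tensor {
    struct tensor * view_src;
    struct buffer * buffer;
    char          * data;
    size_t          view_offs;
};

// before: void view_init(struct buffer * buffer, struct tensor * t);
// after : the buffer argument is derived from the view source instead
static void view_init(struct tensor * t) {
    assert(t->buffer == NULL && t->view_src != NULL);
    t->buffer = t->view_src->buffer;            // the only buffer a view can live in
    t->data   = t->view_src->data + t->view_offs;
}
```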
@@ -1917,9 +2060,9 @@ static struct ggml_tensor * graph_copy_dup_tensor(struct ggml_hash_set hash_set,
     GGML_ASSERT(src != NULL);
     GGML_ASSERT(src->data && "graph must be allocated");
 
-    size_t id = ggml_hash_insert(hash_set, src);
-    if (id == GGML_HASHTABLE_ALREADY_EXISTS) {
-        return node_copies[ggml_hash_find(hash_set, src)];
+    size_t id = ggml_hash_insert(&hash_set, src);
+    if (id == GGML_HASHSET_ALREADY_EXISTS) {
+        return node_copies[ggml_hash_find(&hash_set, src)];
     }
 
     struct ggml_tensor * dst = ggml_dup_tensor_layout(src->data && !src->view_src ? ctx_allocated : ctx_unallocated, src);
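
The hash-set API now takes a pointer and reports duplicates through the named `GGML_HASHSET_ALREADY_EXISTS` sentinel. The insert-or-find idiom used above, reduced to a toy linear-probing table (illustrative, not the ggml implementation):

```c
#include <stddef.h>

#define ALREADY_EXISTS ((size_t) -1)

// insert key and return its new slot, or ALREADY_EXISTS if present
// (the caller then re-finds the existing slot, as graph_copy_dup_tensor does)
static size_t set_insert(void ** keys, size_t cap, void * key) {
    size_t i = ((size_t) key / sizeof(void *)) % cap; // toy hash
    for (size_t probes = 0; probes < cap; probes++, i = (i + 1) % cap) {
        if (keys[i] == key)  { return ALREADY_EXISTS; }
        if (keys[i] == NULL) { keys[i] = key; return i; }
    }
    return ALREADY_EXISTS; // table full; a real implementation would grow or abort
}
```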
@@ -1944,7 +2087,7 @@ static struct ggml_tensor * graph_copy_dup_tensor(struct ggml_hash_set hash_set,
     return dst;
 }
 
-static void graph_copy_init_tensor(struct ggml_hash_set hash_set, struct ggml_tensor ** node_copies, bool * node_init, struct ggml_tensor * src) {
+static void graph_copy_init_tensor(struct ggml_hash_set * hash_set, struct ggml_tensor ** node_copies, bool * node_init, struct ggml_tensor * src) {
     size_t id = ggml_hash_find(hash_set, src);
     if (node_init[id]) {
         return;
@@ -1954,7 +2097,7 @@ static void graph_copy_init_tensor(struct ggml_hash_set hash_set, struct ggml_te
     struct ggml_tensor * dst = node_copies[id];
     if (dst->view_src != NULL) {
         graph_copy_init_tensor(hash_set, node_copies, node_init, src->view_src);
-        ggml_backend_view_init(dst->view_src->buffer, dst);
+        ggml_backend_view_init(dst);
     }
     else {
         ggml_backend_tensor_copy(src, dst);
@@ -1971,10 +2114,7 @@ static void graph_copy_init_tensor(struct ggml_hash_set hash_set, struct ggml_te
 }
 
 struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph) {
-    struct ggml_hash_set hash_set = {
-        /* .size = */ graph->visited_hash_table.size,
-        /* .keys = */ calloc(graph->visited_hash_table.size, sizeof(hash_set.keys[0])) // NOLINT
-    };
+    struct ggml_hash_set hash_set = ggml_hash_set_new(graph->visited_hash_set.size);
     struct ggml_tensor ** node_copies = calloc(hash_set.size, sizeof(node_copies[0])); // NOLINT
     bool * node_init = calloc(hash_set.size, sizeof(node_init[0]));
 
@@ -1989,7 +2129,7 @@ struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, s
 
     if (ctx_allocated == NULL || ctx_unallocated == NULL) {
         fprintf(stderr, "failed to allocate context for graph copy\n");
-        free(hash_set.keys);
+        ggml_hash_set_free(&hash_set);
         free(node_copies);
         free(node_init);
         ggml_free(ctx_allocated);
@@ -2012,7 +2152,7 @@ struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, s
     ggml_backend_buffer_t buffer = ggml_backend_alloc_ctx_tensors(ctx_allocated, backend);
     if (buffer == NULL) {
         fprintf(stderr, "failed to allocate buffer for graph copy\n");
-        free(hash_set.keys);
+        ggml_hash_set_free(&hash_set);
         free(node_copies);
         free(node_init);
         ggml_free(ctx_allocated);
@@ -2030,19 +2170,19 @@ struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, s
     // copy data and init views
     for (int i = 0; i < graph->n_nodes; i++) {
         struct ggml_tensor * node = graph->nodes[i];
-        graph_copy_init_tensor(hash_set, node_copies, node_init, node);
+        graph_copy_init_tensor(&hash_set, node_copies, node_init, node);
     }
 
     // build graph copy
     struct ggml_cgraph * graph_copy = ggml_new_graph_custom(ctx_allocated, graph->size, false);
     for (int i = 0; i < graph->n_nodes; i++) {
         struct ggml_tensor * node = graph->nodes[i];
-        struct ggml_tensor * node_copy = node_copies[ggml_hash_find(hash_set, node)];
+        struct ggml_tensor * node_copy = node_copies[ggml_hash_find(&hash_set, node)];
         graph_copy->nodes[i] = node_copy;
     }
     graph_copy->n_nodes = graph->n_nodes;
 
-    free(hash_set.keys);
+    ggml_hash_set_free(&hash_set);
     free(node_copies);
     free(node_init);
 