@fugood/llama.node 0.2.2 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +5 -2
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +8 -1
- package/package.json +1 -1
- package/patches/llama.patch +12 -12
- package/src/DetokenizeWorker.cpp +1 -1
- package/src/LlamaContext.cpp +33 -1
- package/src/LlamaContext.h +1 -0
- package/src/LoadSessionWorker.cpp +1 -0
- package/src/llama.cpp/.github/workflows/bench.yml +310 -0
- package/src/llama.cpp/.github/workflows/build.yml +1315 -0
- package/src/llama.cpp/.github/workflows/close-issue.yml +23 -0
- package/src/llama.cpp/.github/workflows/docker.yml +116 -0
- package/src/llama.cpp/.github/workflows/editorconfig.yml +27 -0
- package/src/llama.cpp/.github/workflows/gguf-publish.yml +44 -0
- package/src/llama.cpp/.github/workflows/labeler.yml +17 -0
- package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +65 -0
- package/src/llama.cpp/.github/workflows/nix-ci.yml +72 -0
- package/src/llama.cpp/.github/workflows/nix-flake-update.yml +22 -0
- package/src/llama.cpp/.github/workflows/nix-publish-flake.yml +36 -0
- package/src/llama.cpp/.github/workflows/python-check-requirements.yml +35 -0
- package/src/llama.cpp/.github/workflows/python-lint.yml +23 -0
- package/src/llama.cpp/.github/workflows/python-type-check.yml +38 -0
- package/src/llama.cpp/.github/workflows/server.yml +183 -0
- package/src/llama.cpp/CMakeLists.txt +91 -1245
- package/src/llama.cpp/cmake/arm64-windows-llvm.cmake +1 -1
- package/src/llama.cpp/cmake/build-info.cmake +58 -0
- package/src/llama.cpp/cmake/git-vars.cmake +22 -0
- package/src/llama.cpp/common/CMakeLists.txt +4 -3
- package/src/llama.cpp/common/build-info.cpp.in +4 -0
- package/src/llama.cpp/common/common.cpp +1116 -877
- package/src/llama.cpp/common/common.h +191 -77
- package/src/llama.cpp/common/grammar-parser.cpp +118 -31
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +346 -65
- package/src/llama.cpp/common/log.h +1 -1
- package/src/llama.cpp/common/ngram-cache.h +10 -3
- package/src/llama.cpp/common/sampling.cpp +19 -10
- package/src/llama.cpp/docs/build.md +353 -0
- package/src/llama.cpp/examples/CMakeLists.txt +22 -22
- package/src/llama.cpp/examples/baby-llama/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +6 -6
- package/src/llama.cpp/examples/batched/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/batched/batched.cpp +52 -55
- package/src/llama.cpp/examples/batched-bench/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +20 -72
- package/src/llama.cpp/examples/benchmark/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/chat-13B.bat +57 -0
- package/src/llama.cpp/examples/convert-llama2c-to-ggml/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/{finetune → cvector-generator}/CMakeLists.txt +2 -2
- package/src/llama.cpp/examples/cvector-generator/completions.txt +582 -0
- package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +503 -0
- package/src/llama.cpp/examples/cvector-generator/mean.hpp +48 -0
- package/src/llama.cpp/examples/cvector-generator/negative.txt +4 -0
- package/src/llama.cpp/examples/cvector-generator/pca.hpp +325 -0
- package/src/llama.cpp/examples/cvector-generator/positive.txt +4 -0
- package/src/llama.cpp/examples/deprecation-warning/deprecation-warning.cpp +35 -0
- package/src/llama.cpp/examples/embedding/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/embedding/embedding.cpp +94 -46
- package/src/llama.cpp/examples/eval-callback/CMakeLists.txt +2 -2
- package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +4 -6
- package/src/llama.cpp/examples/export-lora/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +344 -386
- package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +2 -2
- package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +30 -25
- package/src/llama.cpp/examples/gguf/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gguf/gguf.cpp +5 -0
- package/src/llama.cpp/examples/gguf-hash/CMakeLists.txt +15 -0
- package/src/llama.cpp/examples/gguf-hash/deps/rotate-bits/rotate-bits.h +46 -0
- package/src/llama.cpp/examples/gguf-hash/deps/sha1/sha1.c +295 -0
- package/src/llama.cpp/examples/gguf-hash/deps/sha1/sha1.h +52 -0
- package/src/llama.cpp/examples/gguf-hash/deps/sha256/sha256.c +221 -0
- package/src/llama.cpp/examples/gguf-hash/deps/sha256/sha256.h +24 -0
- package/src/llama.cpp/examples/gguf-hash/deps/xxhash/xxhash.c +42 -0
- package/src/llama.cpp/examples/gguf-hash/deps/xxhash/xxhash.h +7093 -0
- package/src/llama.cpp/examples/gguf-hash/gguf-hash.cpp +693 -0
- package/src/llama.cpp/examples/gguf-split/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +3 -3
- package/src/llama.cpp/examples/gritlm/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +6 -2
- package/src/llama.cpp/examples/imatrix/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +137 -176
- package/src/llama.cpp/examples/infill/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/infill/infill.cpp +38 -153
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +175 -94
- package/src/llama.cpp/examples/llama.android/app/build.gradle.kts +65 -0
- package/src/llama.cpp/examples/llama.android/build.gradle.kts +6 -0
- package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +68 -0
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/CMakeLists.txt +11 -7
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +2 -2
- package/src/llama.cpp/examples/llama.android/settings.gradle.kts +18 -0
- package/src/llama.cpp/examples/llava/CMakeLists.txt +6 -5
- package/src/llama.cpp/examples/llava/android/build_64.sh +8 -0
- package/src/llama.cpp/examples/llava/clip.cpp +23 -14
- package/src/llama.cpp/examples/llava/llava-cli.cpp +8 -6
- package/src/llama.cpp/examples/llava/requirements.txt +3 -2
- package/src/llama.cpp/examples/lookahead/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +2 -1
- package/src/llama.cpp/examples/lookup/CMakeLists.txt +4 -4
- package/src/llama.cpp/examples/lookup/lookup-create.cpp +2 -0
- package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
- package/src/llama.cpp/examples/lookup/lookup-stats.cpp +2 -2
- package/src/llama.cpp/examples/lookup/lookup.cpp +1 -1
- package/src/llama.cpp/examples/main/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/main/main.cpp +98 -75
- package/src/llama.cpp/examples/main-cmake-pkg/CMakeLists.txt +4 -5
- package/src/llama.cpp/examples/parallel/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/parallel/parallel.cpp +2 -1
- package/src/llama.cpp/examples/passkey/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/passkey/passkey.cpp +23 -43
- package/src/llama.cpp/examples/perplexity/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +13 -10
- package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/quantize/quantize.cpp +37 -34
- package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +1 -1
- package/src/llama.cpp/examples/retrieval/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/retrieval/retrieval.cpp +26 -77
- package/src/llama.cpp/examples/save-load-state/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +14 -7
- package/src/llama.cpp/examples/server/CMakeLists.txt +26 -2
- package/src/llama.cpp/examples/server/server.cpp +274 -671
- package/src/llama.cpp/examples/server/tests/requirements.txt +2 -2
- package/src/llama.cpp/examples/server/utils.hpp +28 -29
- package/src/llama.cpp/examples/simple/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/simple/simple.cpp +21 -29
- package/src/llama.cpp/examples/speculative/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/speculative/speculative.cpp +2 -1
- package/src/llama.cpp/examples/sycl/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/sycl/build.sh +23 -0
- package/src/llama.cpp/examples/sycl/run-llama2.sh +36 -0
- package/src/llama.cpp/examples/sycl/win-build-sycl.bat +33 -0
- package/src/llama.cpp/examples/sycl/win-run-llama2.bat +9 -0
- package/src/llama.cpp/examples/tokenize/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/tokenize/tokenize.cpp +16 -2
- package/src/llama.cpp/ggml/CMakeLists.txt +253 -0
- package/src/llama.cpp/{cmake → ggml/cmake}/FindSIMD.cmake +6 -6
- package/src/llama.cpp/{ggml-backend.h → ggml/include/ggml-backend.h} +22 -17
- package/src/llama.cpp/ggml/include/ggml-blas.h +23 -0
- package/src/llama.cpp/ggml/include/ggml-cann.h +125 -0
- package/src/llama.cpp/{ggml-cuda.h → ggml/include/ggml-cuda.h} +3 -0
- package/src/llama.cpp/{ggml-metal.h → ggml/include/ggml-metal.h} +1 -2
- package/src/llama.cpp/{ggml-sycl.h → ggml/include/ggml-sycl.h} +3 -10
- package/src/llama.cpp/{ggml.h → ggml/include/ggml.h} +80 -85
- package/src/llama.cpp/ggml/src/CMakeLists.txt +1329 -0
- package/src/llama.cpp/ggml/src/ggml-aarch64.c +2193 -0
- package/src/llama.cpp/ggml/src/ggml-aarch64.h +39 -0
- package/src/llama.cpp/{ggml-alloc.c → ggml/src/ggml-alloc.c} +100 -49
- package/src/llama.cpp/{ggml-backend-impl.h → ggml/src/ggml-backend-impl.h} +20 -8
- package/src/llama.cpp/{ggml-backend.c → ggml/src/ggml-backend.c} +307 -167
- package/src/llama.cpp/ggml/src/ggml-blas.cpp +367 -0
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +198 -0
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +230 -0
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +2944 -0
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +592 -0
- package/src/llama.cpp/ggml/src/ggml-cann/common.h +282 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +32 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +17 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +223 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +186 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +180 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +193 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +191 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +208 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +206 -0
- package/src/llama.cpp/ggml/src/ggml-cann.cpp +2023 -0
- package/src/llama.cpp/{ggml-common.h → ggml/src/ggml-common.h} +41 -7
- package/src/llama.cpp/{ggml-impl.h → ggml/src/ggml-impl.h} +113 -9
- package/src/llama.cpp/{ggml-kompute.cpp → ggml/src/ggml-kompute.cpp} +33 -18
- package/src/llama.cpp/{ggml-quants.c → ggml/src/ggml-quants.c} +1460 -940
- package/src/llama.cpp/{ggml-quants.h → ggml/src/ggml-quants.h} +19 -20
- package/src/llama.cpp/{ggml-rpc.cpp → ggml/src/ggml-rpc.cpp} +95 -72
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +27 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +53 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +355 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +195 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/concat.hpp +21 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +547 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +27 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +698 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +1023 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.hpp +27 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +3011 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +3031 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/mmq.hpp +33 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +1027 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.hpp +27 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +374 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +35 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +66 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +275 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.hpp +22 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +251 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/softmax.hpp +24 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +1140 -0
- package/src/llama.cpp/ggml/src/ggml-sycl.cpp +5314 -0
- package/src/llama.cpp/{ggml-vulkan.cpp → ggml/src/ggml-vulkan.cpp} +1781 -1868
- package/src/llama.cpp/{ggml.c → ggml/src/ggml.c} +1245 -2087
- package/src/llama.cpp/{sgemm.cpp → ggml/src/llamafile/sgemm.cpp} +21 -24
- package/src/llama.cpp/{sgemm.h → ggml/src/llamafile/sgemm.h} +1 -1
- package/src/llama.cpp/ggml/src/vulkan-shaders/CMakeLists.txt +5 -0
- package/src/llama.cpp/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp +552 -0
- package/src/llama.cpp/{llama.h → include/llama.h} +175 -100
- package/src/llama.cpp/models/.editorconfig +1 -0
- package/src/llama.cpp/models/ggml-vocab-aquila.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-baichuan.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-command-r.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-command-r.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-command-r.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-falcon.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-falcon.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-falcon.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-gpt-neox.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-mpt.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-mpt.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-mpt.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-phi-3.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-phi-3.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-phi-3.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-qwen2.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-qwen2.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-qwen2.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-refact.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-refact.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-refact.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-starcoder.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-starcoder.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-starcoder.gguf.out +46 -0
- package/src/llama.cpp/pocs/vdot/CMakeLists.txt +2 -2
- package/src/llama.cpp/requirements/requirements-all.txt +12 -0
- package/src/llama.cpp/requirements/requirements-compare-llama-bench.txt +2 -0
- package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf.txt +3 -0
- package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf_update.txt +3 -0
- package/src/llama.cpp/requirements/{requirements-convert.txt → requirements-convert_legacy_llama.txt} +1 -1
- package/src/llama.cpp/requirements/requirements-convert_llama_ggml_to_gguf.txt +1 -0
- package/src/llama.cpp/requirements/requirements-convert_lora_to_gguf.txt +2 -0
- package/src/llama.cpp/requirements/requirements-pydantic.txt +3 -0
- package/src/llama.cpp/requirements/requirements-test-tokenizer-random.txt +1 -0
- package/src/llama.cpp/requirements.txt +5 -4
- package/src/llama.cpp/scripts/build-info.sh +30 -0
- package/src/llama.cpp/scripts/install-oneapi.bat +19 -0
- package/src/llama.cpp/src/CMakeLists.txt +33 -0
- package/src/llama.cpp/src/llama-grammar.cpp +539 -0
- package/src/llama.cpp/src/llama-grammar.h +39 -0
- package/src/llama.cpp/src/llama-impl.h +26 -0
- package/src/llama.cpp/src/llama-sampling.cpp +635 -0
- package/src/llama.cpp/src/llama-sampling.h +56 -0
- package/src/llama.cpp/src/llama-vocab.cpp +1721 -0
- package/src/llama.cpp/src/llama-vocab.h +130 -0
- package/src/llama.cpp/{llama.cpp → src/llama.cpp} +5979 -5260
- package/src/llama.cpp/{unicode-data.cpp → src/unicode-data.cpp} +851 -802
- package/src/llama.cpp/{unicode.cpp → src/unicode.cpp} +52 -30
- package/src/llama.cpp/{unicode.h → src/unicode.h} +5 -1
- package/src/llama.cpp/tests/CMakeLists.txt +19 -20
- package/src/llama.cpp/tests/test-backend-ops.cpp +245 -67
- package/src/llama.cpp/tests/test-chat-template.cpp +57 -3
- package/src/llama.cpp/tests/test-double-float.cpp +2 -2
- package/src/llama.cpp/tests/test-grad0.cpp +2 -2
- package/src/llama.cpp/tests/test-grammar-integration.cpp +978 -31
- package/src/llama.cpp/tests/test-grammar-parser.cpp +423 -158
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +508 -135
- package/src/llama.cpp/tests/test-llama-grammar.cpp +15 -9
- package/src/llama.cpp/tests/test-quantize-fns.cpp +1 -1
- package/src/llama.cpp/tests/test-quantize-perf.cpp +1 -1
- package/src/llama.cpp/tests/test-rope.cpp +3 -4
- package/src/llama.cpp/tests/test-sampling.cpp +5 -5
- package/src/llama.cpp/tests/test-tokenizer-0.cpp +6 -6
- package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +20 -15
- package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +22 -11
- package/bin/darwin/arm64/default.metallib +0 -0
- package/bin/darwin/x64/default.metallib +0 -0
- package/src/llama.cpp/examples/beam-search/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/beam-search/beam-search.cpp +0 -188
- package/src/llama.cpp/examples/finetune/finetune.cpp +0 -1862
- package/src/llama.cpp/examples/llama.android/llama/CMakeLists.txt +0 -55
- package/src/llama.cpp/examples/train-text-from-scratch/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/train-text-from-scratch/train-text-from-scratch.cpp +0 -1253
- package/src/llama.cpp/ggml-opencl.cpp +0 -2305
- package/src/llama.cpp/ggml-opencl.h +0 -36
- package/src/llama.cpp/ggml-sycl.cpp +0 -17340
- package/src/llama.cpp/ggml-vulkan-shaders.hpp +0 -81211
- package/src/llama.cpp/requirements/requirements-convert-hf-to-gguf-update.txt +0 -2
- package/src/llama.cpp/requirements/requirements-convert-hf-to-gguf.txt +0 -2
- package/src/llama.cpp/requirements/requirements-convert-llama-ggml-to-gguf.txt +0 -1
- package/src/llama.cpp/scripts/gen-build-info-cpp.cmake +0 -24
- /package/src/llama.cpp/{ggml-alloc.h → ggml/include/ggml-alloc.h} +0 -0
- /package/src/llama.cpp/{ggml-kompute.h → ggml/include/ggml-kompute.h} +0 -0
- /package/src/llama.cpp/{ggml-rpc.h → ggml/include/ggml-rpc.h} +0 -0
- /package/src/llama.cpp/{ggml-vulkan.h → ggml/include/ggml-vulkan.h} +0 -0
- /package/src/llama.cpp/{unicode-data.h → src/unicode-data.h} +0 -0
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
// SPDX-FileCopyrightText: Copyright 2024 Arm Ltd.
|
|
2
|
+
#pragma once
|
|
3
|
+
|
|
4
|
+
#define GGML_COMMON_DECL_C
|
|
5
|
+
#include "ggml-common.h"
|
|
6
|
+
|
|
7
|
+
#include "ggml.h"
|
|
8
|
+
|
|
9
|
+
// GGML internal header
|
|
10
|
+
|
|
11
|
+
#ifdef __cplusplus
|
|
12
|
+
extern "C" {
|
|
13
|
+
#endif
|
|
14
|
+
|
|
15
|
+
// Quantization
|
|
16
|
+
void quantize_q8_0_4x4(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
|
17
|
+
void quantize_q8_0_4x8(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
|
18
|
+
|
|
19
|
+
void quantize_mat_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t nrows, int64_t n_per_row, int64_t blck_size_interleave);
|
|
20
|
+
|
|
21
|
+
// Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
|
|
22
|
+
size_t quantize_q4_0_4x4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
|
23
|
+
size_t quantize_q4_0_4x8(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
|
24
|
+
size_t quantize_q4_0_8x8(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
|
25
|
+
|
|
26
|
+
// GEMV
|
|
27
|
+
void ggml_gemv_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
|
28
|
+
void ggml_gemv_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
|
29
|
+
void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
|
30
|
+
|
|
31
|
+
// GEMM
|
|
32
|
+
void ggml_gemm_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
|
33
|
+
void ggml_gemm_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
|
34
|
+
void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
|
35
|
+
|
|
36
|
+
#ifdef __cplusplus
|
|
37
|
+
}
|
|
38
|
+
#endif
|
|
39
|
+
|
|
@@ -91,8 +91,7 @@ void ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_tensor * tenso
|
|
|
91
91
|
if (talloc->offset + size > ggml_backend_buffer_get_size(talloc->buffer)) {
|
|
92
92
|
fprintf(stderr, "%s: not enough space in the buffer to allocate %s (needed %zu, available %zu)\n",
|
|
93
93
|
__func__, tensor->name, size, ggml_backend_buffer_get_size(talloc->buffer) - talloc->offset);
|
|
94
|
-
|
|
95
|
-
return;
|
|
94
|
+
GGML_ABORT("not enough space in the buffer");
|
|
96
95
|
}
|
|
97
96
|
|
|
98
97
|
void * addr = (char *)ggml_backend_buffer_get_base(talloc->buffer) + talloc->offset;
|
|
@@ -133,7 +132,7 @@ static void add_allocated_tensor(struct ggml_dyn_tallocr * alloc, size_t offset,
|
|
|
133
132
|
return;
|
|
134
133
|
}
|
|
135
134
|
}
|
|
136
|
-
|
|
135
|
+
GGML_ABORT("out of allocated_tensors");
|
|
137
136
|
}
|
|
138
137
|
static void remove_allocated_tensor(struct ggml_dyn_tallocr * alloc, size_t offset, const struct ggml_tensor * tensor) {
|
|
139
138
|
for (int i = 0; i < 1024; i++) {
|
|
@@ -142,8 +141,7 @@ static void remove_allocated_tensor(struct ggml_dyn_tallocr * alloc, size_t offs
|
|
|
142
141
|
return;
|
|
143
142
|
}
|
|
144
143
|
}
|
|
145
|
-
|
|
146
|
-
GGML_ASSERT(!"tensor not found");
|
|
144
|
+
GGML_ABORT("tried to free tensor %s not found\n", tensor->name);
|
|
147
145
|
}
|
|
148
146
|
#endif
|
|
149
147
|
|
|
@@ -176,8 +174,7 @@ static size_t ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * alloc, size_t siz
|
|
|
176
174
|
// this should never happen
|
|
177
175
|
fprintf(stderr, "%s: not enough space in the buffer to allocate %zu bytes, largest block available %zu bytes\n",
|
|
178
176
|
__func__, size, max_avail);
|
|
179
|
-
|
|
180
|
-
GGML_UNREACHABLE();
|
|
177
|
+
GGML_ABORT("not enough space in the buffer");
|
|
181
178
|
}
|
|
182
179
|
}
|
|
183
180
|
|
|
@@ -339,6 +336,7 @@ struct hash_node {
|
|
|
339
336
|
};
|
|
340
337
|
|
|
341
338
|
struct tensor_alloc {
|
|
339
|
+
int buffer_id;
|
|
342
340
|
size_t offset;
|
|
343
341
|
size_t size_max; // 0 = pre-allocated, unused, or view
|
|
344
342
|
};
|
|
@@ -349,7 +347,6 @@ struct leaf_alloc {
|
|
|
349
347
|
};
|
|
350
348
|
|
|
351
349
|
struct node_alloc {
|
|
352
|
-
int buffer_id;
|
|
353
350
|
struct tensor_alloc dst;
|
|
354
351
|
struct tensor_alloc src[GGML_MAX_SRC];
|
|
355
352
|
};
|
|
@@ -377,7 +374,7 @@ ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs
|
|
|
377
374
|
galloc->bufts = calloc(n_bufs, sizeof(ggml_backend_buffer_type_t));
|
|
378
375
|
GGML_ASSERT(galloc->bufts != NULL);
|
|
379
376
|
|
|
380
|
-
galloc->buffers = calloc(n_bufs, sizeof(ggml_backend_buffer_t)
|
|
377
|
+
galloc->buffers = calloc(n_bufs, sizeof(ggml_backend_buffer_t));
|
|
381
378
|
GGML_ASSERT(galloc->buffers != NULL);
|
|
382
379
|
|
|
383
380
|
galloc->buf_tallocs = calloc(n_bufs, sizeof(struct ggml_dyn_tallocr *));
|
|
@@ -386,8 +383,19 @@ ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs
|
|
|
386
383
|
for (int i = 0; i < n_bufs; i++) {
|
|
387
384
|
galloc->bufts[i] = bufts[i];
|
|
388
385
|
galloc->buffers[i] = NULL;
|
|
389
|
-
|
|
390
|
-
|
|
386
|
+
|
|
387
|
+
// check if the same buffer type is used multiple times and reuse the same allocator
|
|
388
|
+
for (int j = 0; j < i; j++) {
|
|
389
|
+
if (bufts[i] == bufts[j]) {
|
|
390
|
+
galloc->buf_tallocs[i] = galloc->buf_tallocs[j];
|
|
391
|
+
break;
|
|
392
|
+
}
|
|
393
|
+
}
|
|
394
|
+
|
|
395
|
+
if (galloc->buf_tallocs[i] == NULL) {
|
|
396
|
+
size_t alignment = ggml_backend_buft_get_alignment(bufts[i]);
|
|
397
|
+
galloc->buf_tallocs[i] = ggml_dyn_tallocr_new(alignment);
|
|
398
|
+
}
|
|
391
399
|
}
|
|
392
400
|
galloc->n_buffers = n_bufs;
|
|
393
401
|
|
|
@@ -405,14 +413,34 @@ void ggml_gallocr_free(ggml_gallocr_t galloc) {
|
|
|
405
413
|
|
|
406
414
|
for (int i = 0; i < galloc->n_buffers; i++) {
|
|
407
415
|
if (galloc->buffers != NULL) {
|
|
408
|
-
|
|
416
|
+
// skip if already freed
|
|
417
|
+
bool freed = false;
|
|
418
|
+
for (int j = 0; j < i; j++) {
|
|
419
|
+
if (galloc->buffers[j] == galloc->buffers[i]) {
|
|
420
|
+
freed = true;
|
|
421
|
+
break;
|
|
422
|
+
}
|
|
423
|
+
}
|
|
424
|
+
if (!freed) {
|
|
425
|
+
ggml_backend_buffer_free(galloc->buffers[i]);
|
|
426
|
+
}
|
|
409
427
|
}
|
|
410
428
|
if (galloc->buf_tallocs != NULL) {
|
|
411
|
-
|
|
429
|
+
// skip if already freed
|
|
430
|
+
bool freed = false;
|
|
431
|
+
for (int j = 0; j < i; j++) {
|
|
432
|
+
if (galloc->buf_tallocs[j] == galloc->buf_tallocs[i]) {
|
|
433
|
+
freed = true;
|
|
434
|
+
break;
|
|
435
|
+
}
|
|
436
|
+
}
|
|
437
|
+
if (!freed) {
|
|
438
|
+
ggml_dyn_tallocr_free(galloc->buf_tallocs[i]);
|
|
439
|
+
}
|
|
412
440
|
}
|
|
413
441
|
}
|
|
414
442
|
|
|
415
|
-
|
|
443
|
+
ggml_hash_set_free(&galloc->hash_set);
|
|
416
444
|
free(galloc->hash_values);
|
|
417
445
|
free(galloc->bufts);
|
|
418
446
|
free(galloc->buffers);
|
|
@@ -425,7 +453,7 @@ void ggml_gallocr_free(ggml_gallocr_t galloc) {
|
|
|
425
453
|
typedef struct ggml_gallocr * ggml_gallocr_t;
|
|
426
454
|
|
|
427
455
|
static struct hash_node * ggml_gallocr_hash_get(ggml_gallocr_t galloc, struct ggml_tensor * t) {
|
|
428
|
-
size_t i = ggml_hash_find_or_insert(galloc->hash_set, t);
|
|
456
|
+
size_t i = ggml_hash_find_or_insert(&galloc->hash_set, t);
|
|
429
457
|
return &galloc->hash_values[i];
|
|
430
458
|
}
|
|
431
459
|
|
|
@@ -511,17 +539,18 @@ static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor
|
|
|
511
539
|
}
|
|
512
540
|
}
|
|
513
541
|
|
|
514
|
-
static void ggml_gallocr_free_node(ggml_gallocr_t galloc, struct ggml_tensor * node, int buffer_id) {
|
|
542
|
+
static void ggml_gallocr_free_node(ggml_gallocr_t galloc, struct ggml_tensor * node) {
|
|
515
543
|
// graph outputs are never freed
|
|
516
544
|
if (node->flags & GGML_TENSOR_FLAG_OUTPUT) {
|
|
517
545
|
AT_PRINTF("not freeing output %s\n", node->name);
|
|
518
546
|
return;
|
|
519
547
|
}
|
|
520
548
|
|
|
521
|
-
struct ggml_dyn_tallocr * alloc = galloc->buf_tallocs[buffer_id];
|
|
522
|
-
ggml_backend_buffer_type_t buft = galloc->bufts[buffer_id];
|
|
523
549
|
struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
|
|
524
550
|
size_t offset = hn->offset;
|
|
551
|
+
int buffer_id = hn->buffer_id;
|
|
552
|
+
struct ggml_dyn_tallocr * alloc = galloc->buf_tallocs[buffer_id];
|
|
553
|
+
ggml_backend_buffer_type_t buft = galloc->bufts[buffer_id];
|
|
525
554
|
size_t size = ggml_backend_buft_get_alloc_size(buft, node);
|
|
526
555
|
ggml_dyn_tallocr_free_tensor(alloc, offset, size, node);
|
|
527
556
|
hn->allocated = false;
|
|
@@ -533,8 +562,8 @@ static int get_node_buffer_id(const int * node_buffer_ids, int i) {
|
|
|
533
562
|
|
|
534
563
|
static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) {
|
|
535
564
|
// clear hash tables
|
|
536
|
-
|
|
537
|
-
memset(galloc->hash_values,
|
|
565
|
+
ggml_hash_set_reset(&galloc->hash_set);
|
|
566
|
+
memset(galloc->hash_values, 0, sizeof(struct hash_node) * galloc->hash_set.size);
|
|
538
567
|
|
|
539
568
|
// allocate leafs
|
|
540
569
|
// these may be tensors that the application is not using in the graph, but may still want to allocate for other purposes
|
|
@@ -626,11 +655,11 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
|
|
|
626
655
|
AT_PRINTF("view_src %s: %d children, %d views\n",
|
|
627
656
|
view_src->name, view_src_hn->n_children, view_src_hn->n_views);
|
|
628
657
|
if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src_hn->allocated) {
|
|
629
|
-
ggml_gallocr_free_node(galloc, view_src, buffer_id);
|
|
658
|
+
ggml_gallocr_free_node(galloc, view_src);
|
|
630
659
|
}
|
|
631
660
|
}
|
|
632
661
|
else if (p_hn->allocated) {
|
|
633
|
-
ggml_gallocr_free_node(galloc, parent, buffer_id);
|
|
662
|
+
ggml_gallocr_free_node(galloc, parent);
|
|
634
663
|
}
|
|
635
664
|
}
|
|
636
665
|
AT_PRINTF("\n");
|
|
@@ -639,21 +668,19 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
|
|
|
639
668
|
}
|
|
640
669
|
|
|
641
670
|
bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) {
|
|
642
|
-
size_t
|
|
671
|
+
size_t min_hash_size = graph->n_nodes + graph->n_leafs;
|
|
672
|
+
// add 25% margin to avoid hash collisions
|
|
673
|
+
min_hash_size += min_hash_size / 4;
|
|
643
674
|
|
|
644
675
|
// initialize hash table
|
|
645
|
-
if (galloc->hash_set.size <
|
|
646
|
-
|
|
647
|
-
|
|
648
|
-
galloc->hash_set.size = hash_size;
|
|
649
|
-
galloc->hash_set.keys = calloc(hash_size, sizeof(struct ggml_tensor *));
|
|
650
|
-
galloc->hash_values = calloc(hash_size, sizeof(struct hash_node));
|
|
676
|
+
if (galloc->hash_set.size < min_hash_size) {
|
|
677
|
+
ggml_hash_set_free(&galloc->hash_set);
|
|
678
|
+
galloc->hash_set = ggml_hash_set_new(min_hash_size);
|
|
651
679
|
GGML_ASSERT(galloc->hash_set.keys != NULL);
|
|
680
|
+
|
|
681
|
+
free(galloc->hash_values);
|
|
682
|
+
galloc->hash_values = malloc(sizeof(struct hash_node) * galloc->hash_set.size);
|
|
652
683
|
GGML_ASSERT(galloc->hash_values != NULL);
|
|
653
|
-
} else {
|
|
654
|
-
// reset hash table
|
|
655
|
-
memset(galloc->hash_set.keys, 0, sizeof(struct ggml_tensor *) * galloc->hash_set.size);
|
|
656
|
-
memset(galloc->hash_values, 0, sizeof(struct hash_node) * galloc->hash_set.size);
|
|
657
684
|
}
|
|
658
685
|
|
|
659
686
|
// reset allocators
|
|
@@ -674,22 +701,25 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
|
|
|
674
701
|
for (int i = 0; i < graph->n_nodes; i++) {
|
|
675
702
|
struct ggml_tensor * node = graph->nodes[i];
|
|
676
703
|
struct node_alloc * node_alloc = &galloc->node_allocs[i];
|
|
677
|
-
node_alloc->buffer_id = get_node_buffer_id(node_buffer_ids, i);
|
|
678
704
|
if (node->view_src || node->data) {
|
|
705
|
+
node_alloc->dst.buffer_id = -1;
|
|
679
706
|
node_alloc->dst.offset = SIZE_MAX;
|
|
680
707
|
node_alloc->dst.size_max = 0;
|
|
681
708
|
} else {
|
|
682
709
|
struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
|
|
683
|
-
node_alloc->dst.
|
|
684
|
-
node_alloc->dst.
|
|
710
|
+
node_alloc->dst.buffer_id = hn->buffer_id;
|
|
711
|
+
node_alloc->dst.offset = hn->offset;
|
|
712
|
+
node_alloc->dst.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], node);
|
|
685
713
|
}
|
|
686
714
|
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
|
687
715
|
struct ggml_tensor * src = node->src[j];
|
|
688
716
|
if (!src || src->view_src || src->data) {
|
|
717
|
+
node_alloc->src[j].buffer_id = -1;
|
|
689
718
|
node_alloc->src[j].offset = SIZE_MAX;
|
|
690
719
|
node_alloc->src[j].size_max = 0;
|
|
691
720
|
} else {
|
|
692
721
|
struct hash_node * hn = ggml_gallocr_hash_get(galloc, src);
|
|
722
|
+
node_alloc->src[j].buffer_id = hn->buffer_id;
|
|
693
723
|
node_alloc->src[j].offset = hn->offset;
|
|
694
724
|
node_alloc->src[j].size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], src);
|
|
695
725
|
}
|
|
@@ -706,9 +736,11 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
|
|
|
706
736
|
struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf);
|
|
707
737
|
galloc->leaf_allocs[i].buffer_id = hn->buffer_id;
|
|
708
738
|
if (leaf->view_src || leaf->data) {
|
|
739
|
+
galloc->leaf_allocs[i].leaf.buffer_id = -1;
|
|
709
740
|
galloc->leaf_allocs[i].leaf.offset = SIZE_MAX;
|
|
710
741
|
galloc->leaf_allocs[i].leaf.size_max = 0;
|
|
711
742
|
} else {
|
|
743
|
+
galloc->leaf_allocs[i].leaf.buffer_id = hn->buffer_id;
|
|
712
744
|
galloc->leaf_allocs[i].leaf.offset = hn->offset;
|
|
713
745
|
galloc->leaf_allocs[i].leaf.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], leaf);
|
|
714
746
|
}
|
|
@@ -716,6 +748,14 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
|
|
|
716
748
|
|
|
717
749
|
// reallocate buffers if needed
|
|
718
750
|
for (int i = 0; i < galloc->n_buffers; i++) {
|
|
751
|
+
// if the buffer type is used multiple times, we reuse the same buffer
|
|
752
|
+
for (int j = 0; j < i; j++) {
|
|
753
|
+
if (galloc->buf_tallocs[j] == galloc->buf_tallocs[i]) {
|
|
754
|
+
galloc->buffers[i] = galloc->buffers[j];
|
|
755
|
+
break;
|
|
756
|
+
}
|
|
757
|
+
}
|
|
758
|
+
|
|
719
759
|
size_t cur_size = galloc->buffers[i] ? ggml_backend_buffer_get_size(galloc->buffers[i]) : 0;
|
|
720
760
|
size_t new_size = ggml_dyn_tallocr_max_size(galloc->buf_tallocs[i]);
|
|
721
761
|
|
|
@@ -724,12 +764,14 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
|
|
|
724
764
|
#ifndef NDEBUG
|
|
725
765
|
fprintf(stderr, "%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
|
|
726
766
|
#endif
|
|
767
|
+
|
|
727
768
|
ggml_backend_buffer_free(galloc->buffers[i]);
|
|
728
769
|
galloc->buffers[i] = ggml_backend_buft_alloc_buffer(galloc->bufts[i], new_size);
|
|
729
770
|
if (galloc->buffers[i] == NULL) {
|
|
730
771
|
fprintf(stderr, "%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size);
|
|
731
772
|
return false;
|
|
732
773
|
}
|
|
774
|
+
ggml_backend_buffer_set_usage(galloc->buffers[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE);
|
|
733
775
|
}
|
|
734
776
|
}
|
|
735
777
|
|
|
@@ -740,7 +782,8 @@ bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) {
|
|
|
740
782
|
return ggml_gallocr_reserve_n(galloc, graph, NULL, NULL);
|
|
741
783
|
}
|
|
742
784
|
|
|
743
|
-
static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * tensor,
|
|
785
|
+
static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * tensor, struct tensor_alloc * tensor_alloc) {
|
|
786
|
+
int buffer_id = tensor_alloc->buffer_id;
|
|
744
787
|
assert(tensor->data || tensor->view_src || ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], tensor) <= tensor_alloc->size_max);
|
|
745
788
|
|
|
746
789
|
if (tensor->view_src != NULL) {
|
|
@@ -750,7 +793,7 @@ static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor *
|
|
|
750
793
|
// this tensor was allocated without ggml-backend
|
|
751
794
|
return;
|
|
752
795
|
}
|
|
753
|
-
ggml_backend_view_init(
|
|
796
|
+
ggml_backend_view_init(tensor);
|
|
754
797
|
}
|
|
755
798
|
} else {
|
|
756
799
|
if (tensor->data == NULL) {
|
|
@@ -768,9 +811,8 @@ static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor *
|
|
|
768
811
|
}
|
|
769
812
|
}
|
|
770
813
|
|
|
771
|
-
static bool ggml_gallocr_node_needs_realloc(ggml_gallocr_t galloc, struct ggml_tensor * node, struct
|
|
772
|
-
|
|
773
|
-
size_t node_size = (node->data || node->view_src) ? 0 : ggml_backend_buft_get_alloc_size(buft, node);
|
|
814
|
+
static bool ggml_gallocr_node_needs_realloc(ggml_gallocr_t galloc, struct ggml_tensor * node, struct tensor_alloc * talloc) {
|
|
815
|
+
size_t node_size = (node->data || node->view_src) ? 0 : ggml_backend_buft_get_alloc_size(galloc->bufts[talloc->buffer_id], node);
|
|
774
816
|
return talloc->size_max >= node_size;
|
|
775
817
|
}
|
|
776
818
|
|
|
@@ -793,7 +835,7 @@ static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph
|
|
|
793
835
|
struct ggml_tensor * node = graph->nodes[i];
|
|
794
836
|
struct node_alloc * node_alloc = &galloc->node_allocs[i];
|
|
795
837
|
|
|
796
|
-
if (!ggml_gallocr_node_needs_realloc(galloc, node,
|
|
838
|
+
if (!ggml_gallocr_node_needs_realloc(galloc, node, &node_alloc->dst)) {
|
|
797
839
|
#ifndef NDEBUG
|
|
798
840
|
fprintf(stderr, "%s: node %s is not valid\n", __func__, node->name);
|
|
799
841
|
#endif
|
|
@@ -805,7 +847,7 @@ static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph
|
|
|
805
847
|
if (src == NULL) {
|
|
806
848
|
continue;
|
|
807
849
|
}
|
|
808
|
-
if (!ggml_gallocr_node_needs_realloc(galloc, src,
|
|
850
|
+
if (!ggml_gallocr_node_needs_realloc(galloc, src, &node_alloc->src[j])) {
|
|
809
851
|
#ifndef NDEBUG
|
|
810
852
|
fprintf(stderr, "%s: src %d (%s) of node %s is not valid\n", __func__, j, src->name, node->name);
|
|
811
853
|
#endif
|
|
@@ -846,7 +888,7 @@ bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph)
|
|
|
846
888
|
for (int i = 0; i < graph->n_leafs; i++) {
|
|
847
889
|
struct ggml_tensor * leaf = graph->leafs[i];
|
|
848
890
|
struct leaf_alloc * leaf_alloc = &galloc->leaf_allocs[i];
|
|
849
|
-
ggml_gallocr_init_tensor(galloc, leaf,
|
|
891
|
+
ggml_gallocr_init_tensor(galloc, leaf, &leaf_alloc->leaf);
|
|
850
892
|
}
|
|
851
893
|
// nodes
|
|
852
894
|
for (int i = 0; i < graph->n_nodes; i++) {
|
|
@@ -857,9 +899,9 @@ bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph)
|
|
|
857
899
|
if (src == NULL) {
|
|
858
900
|
continue;
|
|
859
901
|
}
|
|
860
|
-
ggml_gallocr_init_tensor(galloc, src,
|
|
902
|
+
ggml_gallocr_init_tensor(galloc, src, &node_alloc->src[j]);
|
|
861
903
|
}
|
|
862
|
-
ggml_gallocr_init_tensor(galloc, node,
|
|
904
|
+
ggml_gallocr_init_tensor(galloc, node, &node_alloc->dst);
|
|
863
905
|
}
|
|
864
906
|
|
|
865
907
|
return true;
|
|
@@ -871,6 +913,15 @@ size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id) {
|
|
|
871
913
|
if (galloc->buffers[buffer_id] == NULL) {
|
|
872
914
|
return 0;
|
|
873
915
|
}
|
|
916
|
+
|
|
917
|
+
for (int i = 0; i < buffer_id; i++) {
|
|
918
|
+
if (galloc->buffers[i] == galloc->buffers[buffer_id]) {
|
|
919
|
+
// this buffer is the same as a previous one due to the same buffer type being used multiple times
|
|
920
|
+
// only return the buffer size the first time it appears to avoid double counting
|
|
921
|
+
return 0;
|
|
922
|
+
}
|
|
923
|
+
}
|
|
924
|
+
|
|
874
925
|
return ggml_backend_buffer_get_size(galloc->buffers[buffer_id]);
|
|
875
926
|
}
|
|
876
927
|
|
|
@@ -886,7 +937,7 @@ static bool alloc_tensor_range(struct ggml_context * ctx,
|
|
|
886
937
|
fprintf(stderr, "%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(buft), size);
|
|
887
938
|
#endif
|
|
888
939
|
for (size_t i = 0; i < *n_buffers; i++) {
|
|
889
|
-
ggml_backend_buffer_free(*buffers[i]);
|
|
940
|
+
ggml_backend_buffer_free((*buffers)[i]);
|
|
890
941
|
}
|
|
891
942
|
free(*buffers);
|
|
892
943
|
return false;
|
|
@@ -899,12 +950,12 @@ static bool alloc_tensor_range(struct ggml_context * ctx,
|
|
|
899
950
|
if (t->view_src == NULL) {
|
|
900
951
|
ggml_tallocr_alloc(&tallocr, t);
|
|
901
952
|
} else if (t->buffer == NULL) {
|
|
902
|
-
ggml_backend_view_init(
|
|
953
|
+
ggml_backend_view_init(t);
|
|
903
954
|
}
|
|
904
955
|
} else {
|
|
905
956
|
if (t->view_src != NULL && t->buffer == NULL) {
|
|
906
957
|
// view of a pre-allocated tensor
|
|
907
|
-
ggml_backend_view_init(
|
|
958
|
+
ggml_backend_view_init(t);
|
|
908
959
|
}
|
|
909
960
|
}
|
|
910
961
|
}
|
|
@@ -17,13 +17,15 @@ extern "C" {
|
|
|
17
17
|
|
|
18
18
|
struct ggml_backend_buffer_type_i {
|
|
19
19
|
const char * (*GGML_CALL get_name) (ggml_backend_buffer_type_t buft);
|
|
20
|
+
// allocate a buffer of this type
|
|
20
21
|
ggml_backend_buffer_t (*GGML_CALL alloc_buffer) (ggml_backend_buffer_type_t buft, size_t size);
|
|
21
|
-
|
|
22
|
-
size_t (*GGML_CALL
|
|
23
|
-
|
|
24
|
-
|
|
22
|
+
// tensor alignment
|
|
23
|
+
size_t (*GGML_CALL get_alignment) (ggml_backend_buffer_type_t buft);
|
|
24
|
+
// max buffer size that can be allocated
|
|
25
|
+
size_t (*GGML_CALL get_max_size) (ggml_backend_buffer_type_t buft);
|
|
26
|
+
// data size needed to allocate the tensor, including padding
|
|
27
|
+
size_t (*GGML_CALL get_alloc_size) (ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor);
|
|
25
28
|
// check if tensor data is in host memory
|
|
26
|
-
// should be equivalent to supports_backend(buft, ggml_backend_cpu_init())
|
|
27
29
|
bool (*GGML_CALL is_host) (ggml_backend_buffer_type_t buft);
|
|
28
30
|
};
|
|
29
31
|
|
|
@@ -92,27 +94,37 @@ extern "C" {
|
|
|
92
94
|
void (*GGML_CALL synchronize)(ggml_backend_t backend);
|
|
93
95
|
|
|
94
96
|
// compute graph with a plan (not used currently)
|
|
97
|
+
// create a new plan for a graph
|
|
95
98
|
ggml_backend_graph_plan_t (*GGML_CALL graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph);
|
|
96
99
|
void (*GGML_CALL graph_plan_free) (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
|
|
100
|
+
// update the plan with a new graph - this should be faster than creating a new plan when the graph has the same topology
|
|
101
|
+
void (*GGML_CALL graph_plan_update) (ggml_backend_t backend, ggml_backend_graph_plan_t plan, const struct ggml_cgraph * cgraph);
|
|
102
|
+
// compute the graph with the plan
|
|
103
|
+
enum ggml_status (*GGML_CALL graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
|
|
97
104
|
|
|
98
|
-
// compute graph with a plan
|
|
99
|
-
enum ggml_status (*GGML_CALL graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
|
|
100
105
|
// compute graph without a plan (async)
|
|
101
106
|
enum ggml_status (*GGML_CALL graph_compute) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
|
|
102
107
|
|
|
103
|
-
// check if the backend
|
|
108
|
+
// check if the backend can compute an operation
|
|
104
109
|
bool (*GGML_CALL supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
|
|
105
110
|
|
|
111
|
+
// check if the backend can use tensors allocated in a buffer type
|
|
112
|
+
bool (*GGML_CALL supports_buft)(ggml_backend_t backend, ggml_backend_buffer_type_t buft);
|
|
113
|
+
|
|
106
114
|
// check if the backend wants to run an operation, even if the weights are allocated in a CPU buffer
|
|
107
115
|
// these should be expensive operations with large batch sizes that may benefit from running on this backend
|
|
108
116
|
// even if the weight has to be copied from the CPU temporarily
|
|
109
117
|
bool (*GGML_CALL offload_op)(ggml_backend_t backend, const struct ggml_tensor * op);
|
|
110
118
|
|
|
111
119
|
// (optional) event synchronization
|
|
120
|
+
// create a new event that can record events on this backend instance
|
|
112
121
|
ggml_backend_event_t (*GGML_CALL event_new) (ggml_backend_t backend);
|
|
113
122
|
void (*GGML_CALL event_free) (ggml_backend_event_t event);
|
|
123
|
+
// record an event on the backend instance that created it
|
|
114
124
|
void (*GGML_CALL event_record) (ggml_backend_event_t event);
|
|
125
|
+
// wait for an event on a different backend instance
|
|
115
126
|
void (*GGML_CALL event_wait) (ggml_backend_t backend, ggml_backend_event_t event);
|
|
127
|
+
// block until an event is recorded
|
|
116
128
|
void (*GGML_CALL event_synchronize) (ggml_backend_event_t event);
|
|
117
129
|
};
|
|
118
130
|
|