@fugood/llama.node 0.2.3 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +6 -3
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +8 -1
- package/package.json +3 -3
- package/patches/llama.patch +12 -12
- package/src/DetokenizeWorker.cpp +1 -1
- package/src/LlamaContext.cpp +33 -1
- package/src/LlamaContext.h +1 -0
- package/src/llama.cpp/.github/workflows/bench.yml +310 -0
- package/src/llama.cpp/.github/workflows/build.yml +1315 -0
- package/src/llama.cpp/.github/workflows/close-issue.yml +23 -0
- package/src/llama.cpp/.github/workflows/docker.yml +116 -0
- package/src/llama.cpp/.github/workflows/editorconfig.yml +27 -0
- package/src/llama.cpp/.github/workflows/gguf-publish.yml +44 -0
- package/src/llama.cpp/.github/workflows/labeler.yml +17 -0
- package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +65 -0
- package/src/llama.cpp/.github/workflows/nix-ci.yml +72 -0
- package/src/llama.cpp/.github/workflows/nix-flake-update.yml +22 -0
- package/src/llama.cpp/.github/workflows/nix-publish-flake.yml +36 -0
- package/src/llama.cpp/.github/workflows/python-check-requirements.yml +35 -0
- package/src/llama.cpp/.github/workflows/python-lint.yml +23 -0
- package/src/llama.cpp/.github/workflows/python-type-check.yml +38 -0
- package/src/llama.cpp/.github/workflows/server.yml +183 -0
- package/src/llama.cpp/CMakeLists.txt +91 -1245
- package/src/llama.cpp/cmake/arm64-windows-llvm.cmake +1 -1
- package/src/llama.cpp/cmake/build-info.cmake +58 -0
- package/src/llama.cpp/cmake/git-vars.cmake +22 -0
- package/src/llama.cpp/common/CMakeLists.txt +4 -3
- package/src/llama.cpp/common/build-info.cpp.in +4 -0
- package/src/llama.cpp/common/common.cpp +1116 -877
- package/src/llama.cpp/common/common.h +191 -77
- package/src/llama.cpp/common/grammar-parser.cpp +118 -31
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +346 -65
- package/src/llama.cpp/common/log.h +1 -1
- package/src/llama.cpp/common/ngram-cache.h +10 -3
- package/src/llama.cpp/common/sampling.cpp +19 -10
- package/src/llama.cpp/docs/build.md +353 -0
- package/src/llama.cpp/examples/CMakeLists.txt +22 -22
- package/src/llama.cpp/examples/baby-llama/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +6 -6
- package/src/llama.cpp/examples/batched/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/batched/batched.cpp +52 -55
- package/src/llama.cpp/examples/batched-bench/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +20 -72
- package/src/llama.cpp/examples/benchmark/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/chat-13B.bat +57 -0
- package/src/llama.cpp/examples/convert-llama2c-to-ggml/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/{finetune → cvector-generator}/CMakeLists.txt +2 -2
- package/src/llama.cpp/examples/cvector-generator/completions.txt +582 -0
- package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +503 -0
- package/src/llama.cpp/examples/cvector-generator/mean.hpp +48 -0
- package/src/llama.cpp/examples/cvector-generator/negative.txt +4 -0
- package/src/llama.cpp/examples/cvector-generator/pca.hpp +325 -0
- package/src/llama.cpp/examples/cvector-generator/positive.txt +4 -0
- package/src/llama.cpp/examples/deprecation-warning/deprecation-warning.cpp +35 -0
- package/src/llama.cpp/examples/embedding/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/embedding/embedding.cpp +94 -46
- package/src/llama.cpp/examples/eval-callback/CMakeLists.txt +2 -2
- package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +4 -6
- package/src/llama.cpp/examples/export-lora/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +344 -386
- package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +2 -2
- package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +30 -25
- package/src/llama.cpp/examples/gguf/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gguf/gguf.cpp +5 -0
- package/src/llama.cpp/examples/gguf-hash/CMakeLists.txt +15 -0
- package/src/llama.cpp/examples/gguf-hash/deps/rotate-bits/rotate-bits.h +46 -0
- package/src/llama.cpp/examples/gguf-hash/deps/sha1/sha1.c +295 -0
- package/src/llama.cpp/examples/gguf-hash/deps/sha1/sha1.h +52 -0
- package/src/llama.cpp/examples/gguf-hash/deps/sha256/sha256.c +221 -0
- package/src/llama.cpp/examples/gguf-hash/deps/sha256/sha256.h +24 -0
- package/src/llama.cpp/examples/gguf-hash/deps/xxhash/xxhash.c +42 -0
- package/src/llama.cpp/examples/gguf-hash/deps/xxhash/xxhash.h +7093 -0
- package/src/llama.cpp/examples/gguf-hash/gguf-hash.cpp +693 -0
- package/src/llama.cpp/examples/gguf-split/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +3 -3
- package/src/llama.cpp/examples/gritlm/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +6 -2
- package/src/llama.cpp/examples/imatrix/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +137 -176
- package/src/llama.cpp/examples/infill/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/infill/infill.cpp +38 -153
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +175 -94
- package/src/llama.cpp/examples/llama.android/app/build.gradle.kts +65 -0
- package/src/llama.cpp/examples/llama.android/build.gradle.kts +6 -0
- package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +68 -0
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/CMakeLists.txt +11 -7
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +2 -2
- package/src/llama.cpp/examples/llama.android/settings.gradle.kts +18 -0
- package/src/llama.cpp/examples/llava/CMakeLists.txt +6 -5
- package/src/llama.cpp/examples/llava/android/build_64.sh +8 -0
- package/src/llama.cpp/examples/llava/clip.cpp +23 -14
- package/src/llama.cpp/examples/llava/llava-cli.cpp +8 -6
- package/src/llama.cpp/examples/llava/requirements.txt +3 -2
- package/src/llama.cpp/examples/lookahead/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +2 -1
- package/src/llama.cpp/examples/lookup/CMakeLists.txt +4 -4
- package/src/llama.cpp/examples/lookup/lookup-create.cpp +2 -0
- package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
- package/src/llama.cpp/examples/lookup/lookup-stats.cpp +2 -2
- package/src/llama.cpp/examples/lookup/lookup.cpp +1 -1
- package/src/llama.cpp/examples/main/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/main/main.cpp +98 -75
- package/src/llama.cpp/examples/main-cmake-pkg/CMakeLists.txt +4 -5
- package/src/llama.cpp/examples/parallel/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/parallel/parallel.cpp +2 -1
- package/src/llama.cpp/examples/passkey/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/passkey/passkey.cpp +23 -43
- package/src/llama.cpp/examples/perplexity/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +13 -10
- package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/quantize/quantize.cpp +37 -34
- package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +1 -1
- package/src/llama.cpp/examples/retrieval/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/retrieval/retrieval.cpp +26 -77
- package/src/llama.cpp/examples/save-load-state/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +14 -7
- package/src/llama.cpp/examples/server/CMakeLists.txt +26 -2
- package/src/llama.cpp/examples/server/server.cpp +274 -671
- package/src/llama.cpp/examples/server/tests/requirements.txt +2 -2
- package/src/llama.cpp/examples/server/utils.hpp +28 -29
- package/src/llama.cpp/examples/simple/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/simple/simple.cpp +21 -29
- package/src/llama.cpp/examples/speculative/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/speculative/speculative.cpp +2 -1
- package/src/llama.cpp/examples/sycl/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/sycl/build.sh +23 -0
- package/src/llama.cpp/examples/sycl/run-llama2.sh +36 -0
- package/src/llama.cpp/examples/sycl/win-build-sycl.bat +33 -0
- package/src/llama.cpp/examples/sycl/win-run-llama2.bat +9 -0
- package/src/llama.cpp/examples/tokenize/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/tokenize/tokenize.cpp +16 -2
- package/src/llama.cpp/ggml/CMakeLists.txt +253 -0
- package/src/llama.cpp/{cmake → ggml/cmake}/FindSIMD.cmake +6 -6
- package/src/llama.cpp/{ggml-backend.h → ggml/include/ggml-backend.h} +22 -17
- package/src/llama.cpp/ggml/include/ggml-blas.h +23 -0
- package/src/llama.cpp/ggml/include/ggml-cann.h +125 -0
- package/src/llama.cpp/{ggml-cuda.h → ggml/include/ggml-cuda.h} +3 -0
- package/src/llama.cpp/{ggml-metal.h → ggml/include/ggml-metal.h} +1 -2
- package/src/llama.cpp/{ggml-sycl.h → ggml/include/ggml-sycl.h} +3 -10
- package/src/llama.cpp/{ggml.h → ggml/include/ggml.h} +80 -85
- package/src/llama.cpp/ggml/src/CMakeLists.txt +1329 -0
- package/src/llama.cpp/ggml/src/ggml-aarch64.c +2193 -0
- package/src/llama.cpp/ggml/src/ggml-aarch64.h +39 -0
- package/src/llama.cpp/{ggml-alloc.c → ggml/src/ggml-alloc.c} +100 -49
- package/src/llama.cpp/{ggml-backend-impl.h → ggml/src/ggml-backend-impl.h} +20 -8
- package/src/llama.cpp/{ggml-backend.c → ggml/src/ggml-backend.c} +307 -167
- package/src/llama.cpp/ggml/src/ggml-blas.cpp +367 -0
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +198 -0
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +230 -0
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +2944 -0
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +592 -0
- package/src/llama.cpp/ggml/src/ggml-cann/common.h +282 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +32 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +17 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +223 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +186 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +180 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +193 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +191 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +208 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +206 -0
- package/src/llama.cpp/ggml/src/ggml-cann.cpp +2023 -0
- package/src/llama.cpp/{ggml-common.h → ggml/src/ggml-common.h} +41 -7
- package/src/llama.cpp/{ggml-impl.h → ggml/src/ggml-impl.h} +113 -9
- package/src/llama.cpp/{ggml-kompute.cpp → ggml/src/ggml-kompute.cpp} +33 -18
- package/src/llama.cpp/{ggml-quants.c → ggml/src/ggml-quants.c} +1460 -940
- package/src/llama.cpp/{ggml-quants.h → ggml/src/ggml-quants.h} +19 -20
- package/src/llama.cpp/{ggml-rpc.cpp → ggml/src/ggml-rpc.cpp} +95 -72
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +27 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +53 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +355 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +195 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/concat.hpp +21 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +547 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +27 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +698 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +1023 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.hpp +27 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +3011 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +3031 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/mmq.hpp +33 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +1027 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.hpp +27 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +374 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +35 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +66 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +275 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.hpp +22 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +251 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/softmax.hpp +24 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +1140 -0
- package/src/llama.cpp/ggml/src/ggml-sycl.cpp +5314 -0
- package/src/llama.cpp/{ggml-vulkan.cpp → ggml/src/ggml-vulkan.cpp} +1781 -1868
- package/src/llama.cpp/{ggml.c → ggml/src/ggml.c} +1245 -2087
- package/src/llama.cpp/{sgemm.cpp → ggml/src/llamafile/sgemm.cpp} +21 -24
- package/src/llama.cpp/{sgemm.h → ggml/src/llamafile/sgemm.h} +1 -1
- package/src/llama.cpp/ggml/src/vulkan-shaders/CMakeLists.txt +5 -0
- package/src/llama.cpp/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp +552 -0
- package/src/llama.cpp/{llama.h → include/llama.h} +175 -100
- package/src/llama.cpp/models/.editorconfig +1 -0
- package/src/llama.cpp/models/ggml-vocab-aquila.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-baichuan.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-command-r.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-command-r.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-command-r.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-falcon.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-falcon.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-falcon.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-gpt-neox.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-mpt.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-mpt.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-mpt.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-phi-3.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-phi-3.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-phi-3.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-qwen2.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-qwen2.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-qwen2.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-refact.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-refact.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-refact.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-starcoder.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-starcoder.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-starcoder.gguf.out +46 -0
- package/src/llama.cpp/pocs/vdot/CMakeLists.txt +2 -2
- package/src/llama.cpp/requirements/requirements-all.txt +12 -0
- package/src/llama.cpp/requirements/requirements-compare-llama-bench.txt +2 -0
- package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf.txt +3 -0
- package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf_update.txt +3 -0
- package/src/llama.cpp/requirements/{requirements-convert.txt → requirements-convert_legacy_llama.txt} +1 -1
- package/src/llama.cpp/requirements/requirements-convert_llama_ggml_to_gguf.txt +1 -0
- package/src/llama.cpp/requirements/requirements-convert_lora_to_gguf.txt +2 -0
- package/src/llama.cpp/requirements/requirements-pydantic.txt +3 -0
- package/src/llama.cpp/requirements/requirements-test-tokenizer-random.txt +1 -0
- package/src/llama.cpp/requirements.txt +5 -4
- package/src/llama.cpp/scripts/build-info.sh +30 -0
- package/src/llama.cpp/scripts/install-oneapi.bat +19 -0
- package/src/llama.cpp/src/CMakeLists.txt +33 -0
- package/src/llama.cpp/src/llama-grammar.cpp +539 -0
- package/src/llama.cpp/src/llama-grammar.h +39 -0
- package/src/llama.cpp/src/llama-impl.h +26 -0
- package/src/llama.cpp/src/llama-sampling.cpp +635 -0
- package/src/llama.cpp/src/llama-sampling.h +56 -0
- package/src/llama.cpp/src/llama-vocab.cpp +1721 -0
- package/src/llama.cpp/src/llama-vocab.h +130 -0
- package/src/llama.cpp/{llama.cpp → src/llama.cpp} +5979 -5260
- package/src/llama.cpp/{unicode-data.cpp → src/unicode-data.cpp} +851 -802
- package/src/llama.cpp/{unicode.cpp → src/unicode.cpp} +52 -30
- package/src/llama.cpp/{unicode.h → src/unicode.h} +5 -1
- package/src/llama.cpp/tests/CMakeLists.txt +19 -20
- package/src/llama.cpp/tests/test-backend-ops.cpp +245 -67
- package/src/llama.cpp/tests/test-chat-template.cpp +57 -3
- package/src/llama.cpp/tests/test-double-float.cpp +2 -2
- package/src/llama.cpp/tests/test-grad0.cpp +2 -2
- package/src/llama.cpp/tests/test-grammar-integration.cpp +978 -31
- package/src/llama.cpp/tests/test-grammar-parser.cpp +423 -158
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +508 -135
- package/src/llama.cpp/tests/test-llama-grammar.cpp +15 -9
- package/src/llama.cpp/tests/test-quantize-fns.cpp +1 -1
- package/src/llama.cpp/tests/test-quantize-perf.cpp +1 -1
- package/src/llama.cpp/tests/test-rope.cpp +3 -4
- package/src/llama.cpp/tests/test-sampling.cpp +5 -5
- package/src/llama.cpp/tests/test-tokenizer-0.cpp +6 -6
- package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +20 -15
- package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +22 -11
- package/bin/darwin/arm64/default.metallib +0 -0
- package/bin/darwin/x64/default.metallib +0 -0
- package/src/llama.cpp/examples/beam-search/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/beam-search/beam-search.cpp +0 -188
- package/src/llama.cpp/examples/finetune/finetune.cpp +0 -1862
- package/src/llama.cpp/examples/llama.android/llama/CMakeLists.txt +0 -55
- package/src/llama.cpp/examples/train-text-from-scratch/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/train-text-from-scratch/train-text-from-scratch.cpp +0 -1253
- package/src/llama.cpp/ggml-opencl.cpp +0 -2305
- package/src/llama.cpp/ggml-opencl.h +0 -36
- package/src/llama.cpp/ggml-sycl.cpp +0 -17340
- package/src/llama.cpp/ggml-vulkan-shaders.hpp +0 -81211
- package/src/llama.cpp/requirements/requirements-convert-hf-to-gguf-update.txt +0 -2
- package/src/llama.cpp/requirements/requirements-convert-hf-to-gguf.txt +0 -2
- package/src/llama.cpp/requirements/requirements-convert-llama-ggml-to-gguf.txt +0 -1
- package/src/llama.cpp/scripts/gen-build-info-cpp.cmake +0 -24
- /package/src/llama.cpp/{ggml-alloc.h → ggml/include/ggml-alloc.h} +0 -0
- /package/src/llama.cpp/{ggml-kompute.h → ggml/include/ggml-kompute.h} +0 -0
- /package/src/llama.cpp/{ggml-rpc.h → ggml/include/ggml-rpc.h} +0 -0
- /package/src/llama.cpp/{ggml-vulkan.h → ggml/include/ggml-vulkan.h} +0 -0
- /package/src/llama.cpp/{unicode-data.h → src/unicode-data.h} +0 -0
|
@@ -19,7 +19,11 @@ typedef half2 ggml_half2;
|
|
|
19
19
|
|
|
20
20
|
#define GGML_COMMON_DECL
|
|
21
21
|
#elif defined(GGML_COMMON_DECL_CUDA)
|
|
22
|
+
#if defined(GGML_COMMON_DECL_MUSA)
|
|
23
|
+
#include <musa_fp16.h>
|
|
24
|
+
#else
|
|
22
25
|
#include <cuda_fp16.h>
|
|
26
|
+
#endif
|
|
23
27
|
#include <cstdint>
|
|
24
28
|
|
|
25
29
|
typedef half ggml_half;
|
|
@@ -106,28 +110,34 @@ typedef sycl::half2 ggml_half2;
|
|
|
106
110
|
#define QR6_K 2
|
|
107
111
|
|
|
108
112
|
#define QI2_XXS (QK_K / (4*QR2_XXS))
|
|
109
|
-
#define QR2_XXS
|
|
113
|
+
#define QR2_XXS 4
|
|
110
114
|
|
|
111
115
|
#define QI2_XS (QK_K / (4*QR2_XS))
|
|
112
|
-
#define QR2_XS
|
|
116
|
+
#define QR2_XS 4
|
|
113
117
|
|
|
114
118
|
#define QI2_S (QK_K / (4*QR2_S))
|
|
115
|
-
#define QR2_S
|
|
119
|
+
#define QR2_S 4
|
|
116
120
|
|
|
117
121
|
#define QI3_XXS (QK_K / (4*QR3_XXS))
|
|
118
|
-
#define QR3_XXS
|
|
122
|
+
#define QR3_XXS 4
|
|
119
123
|
|
|
120
124
|
#define QI3_XS (QK_K / (4*QR3_XS))
|
|
121
|
-
#define QR3_XS
|
|
125
|
+
#define QR3_XS 4
|
|
122
126
|
|
|
123
127
|
#define QI1_S (QK_K / (4*QR1_S))
|
|
124
128
|
#define QR1_S 8
|
|
125
129
|
|
|
130
|
+
#define QI1_M (QK_K / (4*QR1_M))
|
|
131
|
+
#define QR1_M 8
|
|
132
|
+
|
|
126
133
|
#define QI4_NL (QK4_NL / (4*QR4_NL))
|
|
127
134
|
#define QR4_NL 2
|
|
128
135
|
|
|
129
136
|
#define QI4_XS (QK_K / (4*QR4_XS))
|
|
130
|
-
#define QR4_XS
|
|
137
|
+
#define QR4_XS 2
|
|
138
|
+
|
|
139
|
+
#define QI3_S (QK_K / (4*QR3_S))
|
|
140
|
+
#define QR3_S 4
|
|
131
141
|
|
|
132
142
|
#endif // GGML_COMMON_DECL_CUDA || GGML_COMMON_DECL_HIP
|
|
133
143
|
|
|
@@ -193,6 +203,30 @@ typedef struct {
|
|
|
193
203
|
} block_q8_1;
|
|
194
204
|
static_assert(sizeof(block_q8_1) == 2*sizeof(ggml_half) + QK8_1, "wrong q8_1 block size/padding");
|
|
195
205
|
|
|
206
|
+
typedef struct {
|
|
207
|
+
ggml_half d[4]; // deltas for 4 q4_0 blocks
|
|
208
|
+
uint8_t qs[QK4_0 * 2]; // nibbles / quants for 4 q4_0 blocks
|
|
209
|
+
} block_q4_0x4;
|
|
210
|
+
static_assert(sizeof(block_q4_0x4) == 4 * sizeof(ggml_half) + QK4_0 * 2, "wrong q4_0x4 block size/padding");
|
|
211
|
+
|
|
212
|
+
typedef struct {
|
|
213
|
+
ggml_half d[8]; // deltas for 8 q4_0 blocks
|
|
214
|
+
uint8_t qs[QK4_0 * 4]; // nibbles / quants for 8 q4_0 blocks
|
|
215
|
+
} block_q4_0x8;
|
|
216
|
+
static_assert(sizeof(block_q4_0x8) == 8 * sizeof(ggml_half) + QK4_0 * 4, "wrong q4_0x8 block size/padding");
|
|
217
|
+
|
|
218
|
+
typedef struct {
|
|
219
|
+
ggml_half d[4]; // deltas for 4 q8_0 blocks
|
|
220
|
+
int8_t qs[QK8_0 * 4]; // quants for 4 q8_0 blocks
|
|
221
|
+
} block_q8_0x4;
|
|
222
|
+
static_assert(sizeof(block_q8_0x4) == 4 * sizeof(ggml_half) + QK8_0 * 4, "wrong q8_0x4 block size/padding");
|
|
223
|
+
|
|
224
|
+
typedef struct {
|
|
225
|
+
ggml_half d[8]; // deltas for 8 q8_0 blocks
|
|
226
|
+
int8_t qs[QK8_0 * 8]; // quants for 8 q8_0 blocks
|
|
227
|
+
} block_q8_0x8;
|
|
228
|
+
static_assert(sizeof(block_q8_0x8) == 8 * sizeof(ggml_half) + QK8_0 * 8, "wrong q8_0x8 block size/padding");
|
|
229
|
+
|
|
196
230
|
//
|
|
197
231
|
// Super-block quantization structures
|
|
198
232
|
//
|
|
@@ -385,7 +419,7 @@ static_assert(sizeof(block_iq4_xs) == sizeof(ggml_half) + sizeof(uint16_t) + QK_
|
|
|
385
419
|
#define GGML_TABLE_END() };
|
|
386
420
|
|
|
387
421
|
#define GGML_COMMON_IMPL
|
|
388
|
-
#elif defined(GGML_COMMON_IMPL_CUDA) || defined(GGML_COMMON_IMPL_HIP)
|
|
422
|
+
#elif defined(GGML_COMMON_IMPL_CUDA) || defined(GGML_COMMON_IMPL_HIP) || defined(GGML_COMMON_IMPL_MUSA)
|
|
389
423
|
#include <cstdint>
|
|
390
424
|
|
|
391
425
|
#define GGML_TABLE_BEGIN(type, name, size) static const __device__ type name[size] = {
|
|
@@ -17,7 +17,7 @@
|
|
|
17
17
|
#define MIN(a, b) ((a) < (b) ? (a) : (b))
|
|
18
18
|
#define MAX(a, b) ((a) > (b) ? (a) : (b))
|
|
19
19
|
|
|
20
|
-
#if defined(
|
|
20
|
+
#if defined(_MSC_VER)
|
|
21
21
|
|
|
22
22
|
#define m512bh(p) p
|
|
23
23
|
#define m512i(p) p
|
|
@@ -609,6 +609,10 @@ static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
|
|
|
609
609
|
|
|
610
610
|
#endif // defined(__ARM_NEON) && (!defined(__MSC_VER))
|
|
611
611
|
|
|
612
|
+
#ifdef __ARM_FEATURE_SVE
|
|
613
|
+
#include <arm_sve.h>
|
|
614
|
+
#endif // __ARM_FEATURE_SVE
|
|
615
|
+
|
|
612
616
|
// precomputed f32 table for f16 (256 KB)
|
|
613
617
|
// defined in ggml.c, initialized in ggml_init()
|
|
614
618
|
extern float ggml_table_f32_f16[1 << 16];
|
|
@@ -630,21 +634,121 @@ inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
|
|
|
630
634
|
#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
|
|
631
635
|
#endif
|
|
632
636
|
|
|
633
|
-
|
|
634
|
-
|
|
637
|
+
// bitset
|
|
638
|
+
|
|
639
|
+
static_assert(sizeof(ggml_bitset_t) == 4, "bitset_t constants must be updated");
|
|
640
|
+
#define BITSET_SHR 5 // log2(sizeof(ggml_bitset_t)*8)
|
|
641
|
+
#define BITSET_MASK (sizeof(ggml_bitset_t)*8 - 1)
|
|
642
|
+
|
|
643
|
+
static size_t ggml_bitset_size(size_t n) {
|
|
644
|
+
return (n + BITSET_MASK) >> BITSET_SHR;
|
|
645
|
+
}
|
|
646
|
+
|
|
647
|
+
static inline bool ggml_bitset_get(const ggml_bitset_t * bitset, size_t i) {
|
|
648
|
+
return !!(bitset[i >> BITSET_SHR] & (1u << (i & BITSET_MASK)));
|
|
649
|
+
}
|
|
650
|
+
|
|
651
|
+
static inline void ggml_bitset_set(ggml_bitset_t * bitset, size_t i) {
|
|
652
|
+
bitset[i >> BITSET_SHR] |= (1u << (i & BITSET_MASK));
|
|
653
|
+
}
|
|
654
|
+
|
|
655
|
+
static inline void ggml_bitset_clear(ggml_bitset_t * bitset, size_t i) {
|
|
656
|
+
bitset[i >> BITSET_SHR] &= ~(1u << (i & BITSET_MASK));
|
|
657
|
+
}
|
|
658
|
+
|
|
659
|
+
// hash set
|
|
660
|
+
|
|
661
|
+
#define GGML_HASHSET_FULL ((size_t)-1)
|
|
662
|
+
#define GGML_HASHSET_ALREADY_EXISTS ((size_t)-2)
|
|
635
663
|
|
|
636
664
|
struct ggml_hash_set ggml_hash_set_new(size_t size);
|
|
665
|
+
void ggml_hash_set_free(struct ggml_hash_set * hash_set);
|
|
637
666
|
|
|
638
|
-
|
|
667
|
+
// returns the minimum size for a hash set that can hold min_sz elements
|
|
668
|
+
size_t ggml_hash_size(size_t min_sz);
|
|
639
669
|
|
|
640
|
-
//
|
|
641
|
-
|
|
670
|
+
// remove all elements from the hash set
|
|
671
|
+
void ggml_hash_set_reset(struct ggml_hash_set * hash_set);
|
|
642
672
|
|
|
643
|
-
// returns
|
|
644
|
-
|
|
673
|
+
// returns true if key is in the hash set
|
|
674
|
+
static bool ggml_hash_contains(const struct ggml_hash_set * hash_set, struct ggml_tensor * key);
|
|
675
|
+
|
|
676
|
+
// returns GGML_HASHSET_FULL if table is full, otherwise the current index of the key or where it should be inserted
|
|
677
|
+
static size_t ggml_hash_find(const struct ggml_hash_set * hash_set, struct ggml_tensor * key);
|
|
678
|
+
|
|
679
|
+
// returns GGML_HASHSET_ALREADY_EXISTS if key already exists, index otherwise, asserts if table is full
|
|
680
|
+
static size_t ggml_hash_insert(struct ggml_hash_set * hash_set, struct ggml_tensor * key);
|
|
645
681
|
|
|
646
682
|
// returns the index of key, inserting it first if it is not already present; asserts if table is full
|
|
647
|
-
size_t ggml_hash_find_or_insert(
|
|
683
|
+
static size_t ggml_hash_find_or_insert(struct ggml_hash_set * hash_set, struct ggml_tensor * key);
|
|
684
|
+
|
|
685
|
+
// hash function for ggml_tensor
|
|
686
|
+
static inline size_t ggml_hash(const struct ggml_tensor * p) {
|
|
687
|
+
// the last 4 bits are always zero due to alignment
|
|
688
|
+
return (size_t)(uintptr_t)p >> 4;
|
|
689
|
+
}
|
|
690
|
+
|
|
691
|
+
static size_t ggml_hash_find(const struct ggml_hash_set * hash_set, struct ggml_tensor * key) {
|
|
692
|
+
size_t h = ggml_hash(key) % hash_set->size;
|
|
693
|
+
|
|
694
|
+
// linear probing
|
|
695
|
+
size_t i = h;
|
|
696
|
+
while (ggml_bitset_get(hash_set->used, i) && hash_set->keys[i] != key) {
|
|
697
|
+
i = (i + 1) % hash_set->size;
|
|
698
|
+
if (i == h) {
|
|
699
|
+
// visited all hash table entries -> not found
|
|
700
|
+
return GGML_HASHSET_FULL;
|
|
701
|
+
}
|
|
702
|
+
}
|
|
703
|
+
return i;
|
|
704
|
+
}
|
|
705
|
+
|
|
706
|
+
static bool ggml_hash_contains(const struct ggml_hash_set * hash_set, struct ggml_tensor * key) {
|
|
707
|
+
size_t i = ggml_hash_find(hash_set, key);
|
|
708
|
+
return i != GGML_HASHSET_FULL && ggml_bitset_get(hash_set->used, i);
|
|
709
|
+
}
|
|
710
|
+
|
|
711
|
+
static size_t ggml_hash_insert(struct ggml_hash_set * hash_set, struct ggml_tensor * key) {
|
|
712
|
+
size_t h = ggml_hash(key) % hash_set->size;
|
|
713
|
+
|
|
714
|
+
// linear probing
|
|
715
|
+
size_t i = h;
|
|
716
|
+
do {
|
|
717
|
+
if (!ggml_bitset_get(hash_set->used, i)) {
|
|
718
|
+
ggml_bitset_set(hash_set->used, i);
|
|
719
|
+
hash_set->keys[i] = key;
|
|
720
|
+
return i;
|
|
721
|
+
}
|
|
722
|
+
if (hash_set->keys[i] == key) {
|
|
723
|
+
return GGML_HASHSET_ALREADY_EXISTS;
|
|
724
|
+
}
|
|
725
|
+
i = (i + 1) % hash_set->size;
|
|
726
|
+
} while (i != h);
|
|
727
|
+
|
|
728
|
+
// visited all hash table entries without finding the key or a free slot -> table is full
|
|
729
|
+
GGML_ABORT("fatal error");
|
|
730
|
+
}
|
|
731
|
+
|
|
732
|
+
static size_t ggml_hash_find_or_insert(struct ggml_hash_set * hash_set, struct ggml_tensor * key) {
|
|
733
|
+
size_t h = ggml_hash(key) % hash_set->size;
|
|
734
|
+
|
|
735
|
+
// linear probing
|
|
736
|
+
size_t i = h;
|
|
737
|
+
do {
|
|
738
|
+
if (!ggml_bitset_get(hash_set->used, i)) {
|
|
739
|
+
ggml_bitset_set(hash_set->used, i);
|
|
740
|
+
hash_set->keys[i] = key;
|
|
741
|
+
return i;
|
|
742
|
+
}
|
|
743
|
+
if (hash_set->keys[i] == key) {
|
|
744
|
+
return i;
|
|
745
|
+
}
|
|
746
|
+
i = (i + 1) % hash_set->size;
|
|
747
|
+
} while (i != h);
|
|
748
|
+
|
|
749
|
+
// visited all hash table entries without finding the key or a free slot -> table is full
|
|
750
|
+
GGML_ABORT("fatal error");
|
|
751
|
+
}
|
|
648
752
|
|
|
649
753
|
#ifdef __cplusplus
|
|
650
754
|
}
|
|
@@ -22,6 +22,7 @@
|
|
|
22
22
|
#include "shaderop_mul_mat_q4_1.h"
|
|
23
23
|
#include "shaderop_mul_mat_q6_k.h"
|
|
24
24
|
#include "shaderop_mul_mat_mat_f32.h"
|
|
25
|
+
#include "shaderop_getrows_f32.h"
|
|
25
26
|
#include "shaderop_getrows_f16.h"
|
|
26
27
|
#include "shaderop_getrows_q4_0.h"
|
|
27
28
|
#include "shaderop_getrows_q4_1.h"
|
|
@@ -565,7 +566,7 @@ uint32_t safe_divide(uint32_t a, uint32_t b) {
|
|
|
565
566
|
}
|
|
566
567
|
if ((a % b) != 0) {
|
|
567
568
|
fprintf(stderr, "((%u %% %u) == %u) != 0\n", a, b, a % b);
|
|
568
|
-
|
|
569
|
+
GGML_ABORT("safe_divide result would've had remainder");
|
|
569
570
|
}
|
|
570
571
|
return a / b;
|
|
571
572
|
}
|
|
@@ -1146,6 +1147,14 @@ static void ggml_vk_get_rows(
|
|
|
1146
1147
|
seq.record<kp::OpAlgoDispatch>(s_algo);
|
|
1147
1148
|
}
|
|
1148
1149
|
|
|
1150
|
+
template <typename... Args>
|
|
1151
|
+
static void ggml_vk_get_rows_f32(Args&&... args) {
|
|
1152
|
+
const static auto spirv = getSpirvShader(kp::shader_data::op_getrows_f32_comp_spv,
|
|
1153
|
+
kp::shader_data::op_getrows_f32_comp_spv_len);
|
|
1154
|
+
|
|
1155
|
+
ggml_vk_get_rows(spirv, "f32", sizeof(float), 0, std::forward<Args>(args)...);
|
|
1156
|
+
}
|
|
1157
|
+
|
|
1149
1158
|
template <typename... Args>
|
|
1150
1159
|
static void ggml_vk_get_rows_f16(Args&&... args) {
|
|
1151
1160
|
const static auto spirv = getSpirvShader(kp::shader_data::op_getrows_f16_comp_spv,
|
|
@@ -1183,7 +1192,7 @@ static void ggml_vk_rope(
|
|
|
1183
1192
|
const std::shared_ptr<kp::Tensor>& inB,
|
|
1184
1193
|
const std::shared_ptr<kp::Tensor>& out,
|
|
1185
1194
|
uint32_t inAOff, uint32_t inBOff, uint32_t outOff,
|
|
1186
|
-
ggml_type src0t, int32_t n_dims, int32_t mode, int32_t
|
|
1195
|
+
ggml_type src0t, int32_t n_dims, int32_t mode, int32_t n_ctx_orig,
|
|
1187
1196
|
float freq_base, float freq_scale, float ext_factor, float attn_factor, float beta_fast, float beta_slow,
|
|
1188
1197
|
int32_t ne01, int32_t ne02, int32_t ne03,
|
|
1189
1198
|
uint32_t nb00, uint32_t nb01, uint32_t nb02, uint32_t nb03,
|
|
@@ -1212,14 +1221,14 @@ static void ggml_vk_rope(
|
|
|
1212
1221
|
|
|
1213
1222
|
struct PushConstants {
|
|
1214
1223
|
uint32_t inAOff, inBOff, outOff;
|
|
1215
|
-
int32_t n_dims, mode,
|
|
1224
|
+
int32_t n_dims, mode, n_ctx_orig;
|
|
1216
1225
|
float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
|
|
1217
1226
|
uint32_t nb00, nb01, nb02, nb03;
|
|
1218
1227
|
int32_t ne0;
|
|
1219
1228
|
uint32_t nb0, nb1, nb2, nb3;
|
|
1220
1229
|
} pushConsts {
|
|
1221
1230
|
safe_divide(inAOff, type_size), safe_divide(inBOff, 4), safe_divide(outOff, type_size),
|
|
1222
|
-
n_dims, mode,
|
|
1231
|
+
n_dims, mode, n_ctx_orig,
|
|
1223
1232
|
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow,
|
|
1224
1233
|
nb00, nb01, nb02, nb03,
|
|
1225
1234
|
ne0,
|
|
@@ -1331,7 +1340,7 @@ static bool ggml_vk_supports_op(const struct ggml_tensor * op) {
|
|
|
1331
1340
|
case GGML_UNARY_OP_RELU:
|
|
1332
1341
|
case GGML_UNARY_OP_GELU:
|
|
1333
1342
|
case GGML_UNARY_OP_SILU:
|
|
1334
|
-
return
|
|
1343
|
+
return ggml_is_contiguous(op->src[0]);
|
|
1335
1344
|
default:
|
|
1336
1345
|
;
|
|
1337
1346
|
}
|
|
@@ -1371,6 +1380,7 @@ static bool ggml_vk_supports_op(const struct ggml_tensor * op) {
|
|
|
1371
1380
|
return op->ne[3] == 1;
|
|
1372
1381
|
case GGML_OP_GET_ROWS:
|
|
1373
1382
|
switch (op->src[0]->type) {
|
|
1383
|
+
case GGML_TYPE_F32:
|
|
1374
1384
|
case GGML_TYPE_F16:
|
|
1375
1385
|
case GGML_TYPE_Q4_0:
|
|
1376
1386
|
case GGML_TYPE_Q4_1:
|
|
@@ -1450,7 +1460,7 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml
|
|
|
1450
1460
|
|
|
1451
1461
|
if (!ggml_vk_supports_op(dst)) {
|
|
1452
1462
|
fprintf(stderr, "%s: error: unsupported op '%s'\n", __func__, ggml_op_desc(dst));
|
|
1453
|
-
|
|
1463
|
+
GGML_ABORT("unsupported op");
|
|
1454
1464
|
}
|
|
1455
1465
|
|
|
1456
1466
|
const int32_t ne00 = src0 ? src0->ne[0] : 0;
|
|
@@ -1552,7 +1562,7 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml
|
|
|
1552
1562
|
default:
|
|
1553
1563
|
{
|
|
1554
1564
|
fprintf(stderr, "%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op));
|
|
1555
|
-
|
|
1565
|
+
GGML_ABORT("fatal error");
|
|
1556
1566
|
}
|
|
1557
1567
|
}
|
|
1558
1568
|
} break;
|
|
@@ -1597,7 +1607,6 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml
|
|
|
1597
1607
|
{
|
|
1598
1608
|
GGML_ASSERT(ne00 == ne10);
|
|
1599
1609
|
|
|
1600
|
-
// TODO: assert that dim2 and dim3 are contiguous
|
|
1601
1610
|
GGML_ASSERT(ne12 % ne02 == 0);
|
|
1602
1611
|
GGML_ASSERT(ne13 % ne03 == 0);
|
|
1603
1612
|
|
|
@@ -1662,7 +1671,9 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml
|
|
|
1662
1671
|
} break;
|
|
1663
1672
|
case GGML_OP_GET_ROWS:
|
|
1664
1673
|
{
|
|
1665
|
-
if (src0t ==
|
|
1674
|
+
if (src0t == GGML_TYPE_F32) {
|
|
1675
|
+
ggml_vk_get_rows_f32(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, nb01, nb1, ggml_nelements(src1));
|
|
1676
|
+
} else if (src0t == GGML_TYPE_F16) {
|
|
1666
1677
|
ggml_vk_get_rows_f16(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, nb01, nb1, ggml_nelements(src1));
|
|
1667
1678
|
} else if (src0t == GGML_TYPE_Q4_0) {
|
|
1668
1679
|
ggml_vk_get_rows_q4_0(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, nb01, nb1, ggml_nelements(src1));
|
|
@@ -1681,13 +1692,16 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml
|
|
|
1681
1692
|
#pragma message(" https://github.com/ggerganov/llama.cpp/pull/7225")
|
|
1682
1693
|
GGML_ASSERT(dst->src[2] == nullptr && "phi3 frequency factors not implemented yet");
|
|
1683
1694
|
|
|
1695
|
+
#pragma message("TODO: update rope NORM mode to match NEOX mode")
|
|
1696
|
+
#pragma message(" https://github.com/ggerganov/llama.cpp/pull/7634")
|
|
1697
|
+
|
|
1684
1698
|
GGML_ASSERT(ne10 == ne02);
|
|
1685
1699
|
GGML_ASSERT(src0t == dstt);
|
|
1686
1700
|
// const int n_past = ((int32_t *) dst->op_params)[0];
|
|
1687
1701
|
const int n_dims = ((int32_t *) dst->op_params)[1];
|
|
1688
1702
|
const int mode = ((int32_t *) dst->op_params)[2];
|
|
1689
1703
|
// skip 3, n_ctx used in GLM RoPE, unimplemented in Vulkan
|
|
1690
|
-
const int
|
|
1704
|
+
const int n_ctx_orig = ((int32_t *) dst->op_params)[4];
|
|
1691
1705
|
|
|
1692
1706
|
float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
|
|
1693
1707
|
memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float));
|
|
@@ -1697,7 +1711,7 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml
|
|
|
1697
1711
|
memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float));
|
|
1698
1712
|
memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float));
|
|
1699
1713
|
ggml_vk_rope(
|
|
1700
|
-
seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, src0t, n_dims, mode,
|
|
1714
|
+
seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, src0t, n_dims, mode, n_ctx_orig,
|
|
1701
1715
|
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow,
|
|
1702
1716
|
ne01, ne02, ne03, nb00, nb01, nb02, nb03, ne0, nb0, nb1, nb2, nb3
|
|
1703
1717
|
);
|
|
@@ -1731,7 +1745,7 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml
|
|
|
1731
1745
|
continue;
|
|
1732
1746
|
not_implemented: {}
|
|
1733
1747
|
fprintf(stderr, "%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op));
|
|
1734
|
-
//
|
|
1748
|
+
//GGML_ABORT("fatal error");
|
|
1735
1749
|
}
|
|
1736
1750
|
|
|
1737
1751
|
// Evaluate sequence
|
|
@@ -1888,18 +1902,12 @@ static size_t ggml_backend_vk_buffer_type_get_max_size(ggml_backend_buffer_type_
|
|
|
1888
1902
|
return ctx->max_alloc;
|
|
1889
1903
|
}
|
|
1890
1904
|
|
|
1891
|
-
static bool ggml_backend_kompute_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
|
|
1892
|
-
GGML_UNUSED(buft);
|
|
1893
|
-
return ggml_backend_is_kompute(backend);
|
|
1894
|
-
}
|
|
1895
|
-
|
|
1896
1905
|
static ggml_backend_buffer_type_i ggml_backend_kompute_buffer_type_interface = {
|
|
1897
1906
|
/* .get_name = */ ggml_backend_kompute_buffer_type_get_name,
|
|
1898
1907
|
/* .alloc_buffer = */ ggml_backend_kompute_buffer_type_alloc_buffer,
|
|
1899
1908
|
/* .get_alignment = */ ggml_backend_kompute_buffer_type_get_alignment,
|
|
1900
1909
|
/* .get_max_size = */ ggml_backend_vk_buffer_type_get_max_size,
|
|
1901
1910
|
/* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
|
|
1902
|
-
/* .supports_backend = */ ggml_backend_kompute_buffer_type_supports_backend,
|
|
1903
1911
|
/* .is_host = */ NULL,
|
|
1904
1912
|
};
|
|
1905
1913
|
|
|
@@ -1959,6 +1967,11 @@ static bool ggml_backend_kompute_supports_op(ggml_backend_t backend, const struc
|
|
|
1959
1967
|
return ggml_vk_supports_op(op);
|
|
1960
1968
|
}
|
|
1961
1969
|
|
|
1970
|
+
static bool ggml_backend_kompute_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
|
|
1971
|
+
GGML_UNUSED(backend);
|
|
1972
|
+
return buft->iface.get_name == ggml_backend_kompute_buffer_type_get_name;
|
|
1973
|
+
}
|
|
1974
|
+
|
|
1962
1975
|
static struct ggml_backend_i kompute_backend_i = {
|
|
1963
1976
|
/* .get_name = */ ggml_backend_kompute_name,
|
|
1964
1977
|
/* .free = */ ggml_backend_kompute_free,
|
|
@@ -1969,9 +1982,11 @@ static struct ggml_backend_i kompute_backend_i = {
|
|
|
1969
1982
|
/* .synchronize = */ NULL,
|
|
1970
1983
|
/* .graph_plan_create = */ NULL,
|
|
1971
1984
|
/* .graph_plan_free = */ NULL,
|
|
1985
|
+
/* .graph_plan_update = */ NULL,
|
|
1972
1986
|
/* .graph_plan_compute = */ NULL,
|
|
1973
1987
|
/* .graph_compute = */ ggml_backend_kompute_graph_compute,
|
|
1974
1988
|
/* .supports_op = */ ggml_backend_kompute_supports_op,
|
|
1989
|
+
/* .supports_buft = */ ggml_backend_kompute_supports_buft,
|
|
1975
1990
|
/* .offload_op = */ NULL,
|
|
1976
1991
|
/* .event_new = */ NULL,
|
|
1977
1992
|
/* .event_free = */ NULL,
|