@fugood/llama.node 0.2.2 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +5 -2
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +8 -1
- package/package.json +1 -1
- package/patches/llama.patch +12 -12
- package/src/DetokenizeWorker.cpp +1 -1
- package/src/LlamaContext.cpp +33 -1
- package/src/LlamaContext.h +1 -0
- package/src/LoadSessionWorker.cpp +1 -0
- package/src/llama.cpp/.github/workflows/bench.yml +310 -0
- package/src/llama.cpp/.github/workflows/build.yml +1315 -0
- package/src/llama.cpp/.github/workflows/close-issue.yml +23 -0
- package/src/llama.cpp/.github/workflows/docker.yml +116 -0
- package/src/llama.cpp/.github/workflows/editorconfig.yml +27 -0
- package/src/llama.cpp/.github/workflows/gguf-publish.yml +44 -0
- package/src/llama.cpp/.github/workflows/labeler.yml +17 -0
- package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +65 -0
- package/src/llama.cpp/.github/workflows/nix-ci.yml +72 -0
- package/src/llama.cpp/.github/workflows/nix-flake-update.yml +22 -0
- package/src/llama.cpp/.github/workflows/nix-publish-flake.yml +36 -0
- package/src/llama.cpp/.github/workflows/python-check-requirements.yml +35 -0
- package/src/llama.cpp/.github/workflows/python-lint.yml +23 -0
- package/src/llama.cpp/.github/workflows/python-type-check.yml +38 -0
- package/src/llama.cpp/.github/workflows/server.yml +183 -0
- package/src/llama.cpp/CMakeLists.txt +91 -1245
- package/src/llama.cpp/cmake/arm64-windows-llvm.cmake +1 -1
- package/src/llama.cpp/cmake/build-info.cmake +58 -0
- package/src/llama.cpp/cmake/git-vars.cmake +22 -0
- package/src/llama.cpp/common/CMakeLists.txt +4 -3
- package/src/llama.cpp/common/build-info.cpp.in +4 -0
- package/src/llama.cpp/common/common.cpp +1116 -877
- package/src/llama.cpp/common/common.h +191 -77
- package/src/llama.cpp/common/grammar-parser.cpp +118 -31
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +346 -65
- package/src/llama.cpp/common/log.h +1 -1
- package/src/llama.cpp/common/ngram-cache.h +10 -3
- package/src/llama.cpp/common/sampling.cpp +19 -10
- package/src/llama.cpp/docs/build.md +353 -0
- package/src/llama.cpp/examples/CMakeLists.txt +22 -22
- package/src/llama.cpp/examples/baby-llama/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +6 -6
- package/src/llama.cpp/examples/batched/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/batched/batched.cpp +52 -55
- package/src/llama.cpp/examples/batched-bench/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +20 -72
- package/src/llama.cpp/examples/benchmark/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/chat-13B.bat +57 -0
- package/src/llama.cpp/examples/convert-llama2c-to-ggml/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/{finetune → cvector-generator}/CMakeLists.txt +2 -2
- package/src/llama.cpp/examples/cvector-generator/completions.txt +582 -0
- package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +503 -0
- package/src/llama.cpp/examples/cvector-generator/mean.hpp +48 -0
- package/src/llama.cpp/examples/cvector-generator/negative.txt +4 -0
- package/src/llama.cpp/examples/cvector-generator/pca.hpp +325 -0
- package/src/llama.cpp/examples/cvector-generator/positive.txt +4 -0
- package/src/llama.cpp/examples/deprecation-warning/deprecation-warning.cpp +35 -0
- package/src/llama.cpp/examples/embedding/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/embedding/embedding.cpp +94 -46
- package/src/llama.cpp/examples/eval-callback/CMakeLists.txt +2 -2
- package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +4 -6
- package/src/llama.cpp/examples/export-lora/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +344 -386
- package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +2 -2
- package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +30 -25
- package/src/llama.cpp/examples/gguf/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gguf/gguf.cpp +5 -0
- package/src/llama.cpp/examples/gguf-hash/CMakeLists.txt +15 -0
- package/src/llama.cpp/examples/gguf-hash/deps/rotate-bits/rotate-bits.h +46 -0
- package/src/llama.cpp/examples/gguf-hash/deps/sha1/sha1.c +295 -0
- package/src/llama.cpp/examples/gguf-hash/deps/sha1/sha1.h +52 -0
- package/src/llama.cpp/examples/gguf-hash/deps/sha256/sha256.c +221 -0
- package/src/llama.cpp/examples/gguf-hash/deps/sha256/sha256.h +24 -0
- package/src/llama.cpp/examples/gguf-hash/deps/xxhash/xxhash.c +42 -0
- package/src/llama.cpp/examples/gguf-hash/deps/xxhash/xxhash.h +7093 -0
- package/src/llama.cpp/examples/gguf-hash/gguf-hash.cpp +693 -0
- package/src/llama.cpp/examples/gguf-split/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +3 -3
- package/src/llama.cpp/examples/gritlm/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +6 -2
- package/src/llama.cpp/examples/imatrix/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +137 -176
- package/src/llama.cpp/examples/infill/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/infill/infill.cpp +38 -153
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +175 -94
- package/src/llama.cpp/examples/llama.android/app/build.gradle.kts +65 -0
- package/src/llama.cpp/examples/llama.android/build.gradle.kts +6 -0
- package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +68 -0
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/CMakeLists.txt +11 -7
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +2 -2
- package/src/llama.cpp/examples/llama.android/settings.gradle.kts +18 -0
- package/src/llama.cpp/examples/llava/CMakeLists.txt +6 -5
- package/src/llama.cpp/examples/llava/android/build_64.sh +8 -0
- package/src/llama.cpp/examples/llava/clip.cpp +23 -14
- package/src/llama.cpp/examples/llava/llava-cli.cpp +8 -6
- package/src/llama.cpp/examples/llava/requirements.txt +3 -2
- package/src/llama.cpp/examples/lookahead/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +2 -1
- package/src/llama.cpp/examples/lookup/CMakeLists.txt +4 -4
- package/src/llama.cpp/examples/lookup/lookup-create.cpp +2 -0
- package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
- package/src/llama.cpp/examples/lookup/lookup-stats.cpp +2 -2
- package/src/llama.cpp/examples/lookup/lookup.cpp +1 -1
- package/src/llama.cpp/examples/main/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/main/main.cpp +98 -75
- package/src/llama.cpp/examples/main-cmake-pkg/CMakeLists.txt +4 -5
- package/src/llama.cpp/examples/parallel/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/parallel/parallel.cpp +2 -1
- package/src/llama.cpp/examples/passkey/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/passkey/passkey.cpp +23 -43
- package/src/llama.cpp/examples/perplexity/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +13 -10
- package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/quantize/quantize.cpp +37 -34
- package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +1 -1
- package/src/llama.cpp/examples/retrieval/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/retrieval/retrieval.cpp +26 -77
- package/src/llama.cpp/examples/save-load-state/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +14 -7
- package/src/llama.cpp/examples/server/CMakeLists.txt +26 -2
- package/src/llama.cpp/examples/server/server.cpp +274 -671
- package/src/llama.cpp/examples/server/tests/requirements.txt +2 -2
- package/src/llama.cpp/examples/server/utils.hpp +28 -29
- package/src/llama.cpp/examples/simple/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/simple/simple.cpp +21 -29
- package/src/llama.cpp/examples/speculative/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/speculative/speculative.cpp +2 -1
- package/src/llama.cpp/examples/sycl/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/sycl/build.sh +23 -0
- package/src/llama.cpp/examples/sycl/run-llama2.sh +36 -0
- package/src/llama.cpp/examples/sycl/win-build-sycl.bat +33 -0
- package/src/llama.cpp/examples/sycl/win-run-llama2.bat +9 -0
- package/src/llama.cpp/examples/tokenize/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/tokenize/tokenize.cpp +16 -2
- package/src/llama.cpp/ggml/CMakeLists.txt +253 -0
- package/src/llama.cpp/{cmake → ggml/cmake}/FindSIMD.cmake +6 -6
- package/src/llama.cpp/{ggml-backend.h → ggml/include/ggml-backend.h} +22 -17
- package/src/llama.cpp/ggml/include/ggml-blas.h +23 -0
- package/src/llama.cpp/ggml/include/ggml-cann.h +125 -0
- package/src/llama.cpp/{ggml-cuda.h → ggml/include/ggml-cuda.h} +3 -0
- package/src/llama.cpp/{ggml-metal.h → ggml/include/ggml-metal.h} +1 -2
- package/src/llama.cpp/{ggml-sycl.h → ggml/include/ggml-sycl.h} +3 -10
- package/src/llama.cpp/{ggml.h → ggml/include/ggml.h} +80 -85
- package/src/llama.cpp/ggml/src/CMakeLists.txt +1329 -0
- package/src/llama.cpp/ggml/src/ggml-aarch64.c +2193 -0
- package/src/llama.cpp/ggml/src/ggml-aarch64.h +39 -0
- package/src/llama.cpp/{ggml-alloc.c → ggml/src/ggml-alloc.c} +100 -49
- package/src/llama.cpp/{ggml-backend-impl.h → ggml/src/ggml-backend-impl.h} +20 -8
- package/src/llama.cpp/{ggml-backend.c → ggml/src/ggml-backend.c} +307 -167
- package/src/llama.cpp/ggml/src/ggml-blas.cpp +367 -0
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +198 -0
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +230 -0
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +2944 -0
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +592 -0
- package/src/llama.cpp/ggml/src/ggml-cann/common.h +282 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +32 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +17 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +223 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +186 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +180 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +193 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +191 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +208 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +206 -0
- package/src/llama.cpp/ggml/src/ggml-cann.cpp +2023 -0
- package/src/llama.cpp/{ggml-common.h → ggml/src/ggml-common.h} +41 -7
- package/src/llama.cpp/{ggml-impl.h → ggml/src/ggml-impl.h} +113 -9
- package/src/llama.cpp/{ggml-kompute.cpp → ggml/src/ggml-kompute.cpp} +33 -18
- package/src/llama.cpp/{ggml-quants.c → ggml/src/ggml-quants.c} +1460 -940
- package/src/llama.cpp/{ggml-quants.h → ggml/src/ggml-quants.h} +19 -20
- package/src/llama.cpp/{ggml-rpc.cpp → ggml/src/ggml-rpc.cpp} +95 -72
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +27 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +53 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +355 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +195 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/concat.hpp +21 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +547 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +27 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +698 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +1023 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.hpp +27 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +3011 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +3031 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/mmq.hpp +33 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +1027 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.hpp +27 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +374 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +35 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +66 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +275 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.hpp +22 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +251 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/softmax.hpp +24 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +1140 -0
- package/src/llama.cpp/ggml/src/ggml-sycl.cpp +5314 -0
- package/src/llama.cpp/{ggml-vulkan.cpp → ggml/src/ggml-vulkan.cpp} +1781 -1868
- package/src/llama.cpp/{ggml.c → ggml/src/ggml.c} +1245 -2087
- package/src/llama.cpp/{sgemm.cpp → ggml/src/llamafile/sgemm.cpp} +21 -24
- package/src/llama.cpp/{sgemm.h → ggml/src/llamafile/sgemm.h} +1 -1
- package/src/llama.cpp/ggml/src/vulkan-shaders/CMakeLists.txt +5 -0
- package/src/llama.cpp/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp +552 -0
- package/src/llama.cpp/{llama.h → include/llama.h} +175 -100
- package/src/llama.cpp/models/.editorconfig +1 -0
- package/src/llama.cpp/models/ggml-vocab-aquila.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-baichuan.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-command-r.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-command-r.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-command-r.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-falcon.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-falcon.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-falcon.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-gpt-neox.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-mpt.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-mpt.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-mpt.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-phi-3.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-phi-3.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-phi-3.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-qwen2.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-qwen2.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-qwen2.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-refact.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-refact.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-refact.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-starcoder.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-starcoder.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-starcoder.gguf.out +46 -0
- package/src/llama.cpp/pocs/vdot/CMakeLists.txt +2 -2
- package/src/llama.cpp/requirements/requirements-all.txt +12 -0
- package/src/llama.cpp/requirements/requirements-compare-llama-bench.txt +2 -0
- package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf.txt +3 -0
- package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf_update.txt +3 -0
- package/src/llama.cpp/requirements/{requirements-convert.txt → requirements-convert_legacy_llama.txt} +1 -1
- package/src/llama.cpp/requirements/requirements-convert_llama_ggml_to_gguf.txt +1 -0
- package/src/llama.cpp/requirements/requirements-convert_lora_to_gguf.txt +2 -0
- package/src/llama.cpp/requirements/requirements-pydantic.txt +3 -0
- package/src/llama.cpp/requirements/requirements-test-tokenizer-random.txt +1 -0
- package/src/llama.cpp/requirements.txt +5 -4
- package/src/llama.cpp/scripts/build-info.sh +30 -0
- package/src/llama.cpp/scripts/install-oneapi.bat +19 -0
- package/src/llama.cpp/src/CMakeLists.txt +33 -0
- package/src/llama.cpp/src/llama-grammar.cpp +539 -0
- package/src/llama.cpp/src/llama-grammar.h +39 -0
- package/src/llama.cpp/src/llama-impl.h +26 -0
- package/src/llama.cpp/src/llama-sampling.cpp +635 -0
- package/src/llama.cpp/src/llama-sampling.h +56 -0
- package/src/llama.cpp/src/llama-vocab.cpp +1721 -0
- package/src/llama.cpp/src/llama-vocab.h +130 -0
- package/src/llama.cpp/{llama.cpp → src/llama.cpp} +5979 -5260
- package/src/llama.cpp/{unicode-data.cpp → src/unicode-data.cpp} +851 -802
- package/src/llama.cpp/{unicode.cpp → src/unicode.cpp} +52 -30
- package/src/llama.cpp/{unicode.h → src/unicode.h} +5 -1
- package/src/llama.cpp/tests/CMakeLists.txt +19 -20
- package/src/llama.cpp/tests/test-backend-ops.cpp +245 -67
- package/src/llama.cpp/tests/test-chat-template.cpp +57 -3
- package/src/llama.cpp/tests/test-double-float.cpp +2 -2
- package/src/llama.cpp/tests/test-grad0.cpp +2 -2
- package/src/llama.cpp/tests/test-grammar-integration.cpp +978 -31
- package/src/llama.cpp/tests/test-grammar-parser.cpp +423 -158
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +508 -135
- package/src/llama.cpp/tests/test-llama-grammar.cpp +15 -9
- package/src/llama.cpp/tests/test-quantize-fns.cpp +1 -1
- package/src/llama.cpp/tests/test-quantize-perf.cpp +1 -1
- package/src/llama.cpp/tests/test-rope.cpp +3 -4
- package/src/llama.cpp/tests/test-sampling.cpp +5 -5
- package/src/llama.cpp/tests/test-tokenizer-0.cpp +6 -6
- package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +20 -15
- package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +22 -11
- package/bin/darwin/arm64/default.metallib +0 -0
- package/bin/darwin/x64/default.metallib +0 -0
- package/src/llama.cpp/examples/beam-search/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/beam-search/beam-search.cpp +0 -188
- package/src/llama.cpp/examples/finetune/finetune.cpp +0 -1862
- package/src/llama.cpp/examples/llama.android/llama/CMakeLists.txt +0 -55
- package/src/llama.cpp/examples/train-text-from-scratch/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/train-text-from-scratch/train-text-from-scratch.cpp +0 -1253
- package/src/llama.cpp/ggml-opencl.cpp +0 -2305
- package/src/llama.cpp/ggml-opencl.h +0 -36
- package/src/llama.cpp/ggml-sycl.cpp +0 -17340
- package/src/llama.cpp/ggml-vulkan-shaders.hpp +0 -81211
- package/src/llama.cpp/requirements/requirements-convert-hf-to-gguf-update.txt +0 -2
- package/src/llama.cpp/requirements/requirements-convert-hf-to-gguf.txt +0 -2
- package/src/llama.cpp/requirements/requirements-convert-llama-ggml-to-gguf.txt +0 -1
- package/src/llama.cpp/scripts/gen-build-info-cpp.cmake +0 -24
- /package/src/llama.cpp/{ggml-alloc.h → ggml/include/ggml-alloc.h} +0 -0
- /package/src/llama.cpp/{ggml-kompute.h → ggml/include/ggml-kompute.h} +0 -0
- /package/src/llama.cpp/{ggml-rpc.h → ggml/include/ggml-rpc.h} +0 -0
- /package/src/llama.cpp/{ggml-vulkan.h → ggml/include/ggml-vulkan.h} +0 -0
- /package/src/llama.cpp/{unicode-data.h → src/unicode-data.h} +0 -0
|
@@ -1,7 +1,6 @@
|
|
|
1
1
|
#include <ggml.h>
|
|
2
2
|
#include <ggml-alloc.h>
|
|
3
3
|
#include <ggml-backend.h>
|
|
4
|
-
#include <ggml-backend-impl.h>
|
|
5
4
|
|
|
6
5
|
#include <algorithm>
|
|
7
6
|
#include <array>
|
|
@@ -80,14 +79,22 @@ static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float m
|
|
|
80
79
|
im = nullptr;
|
|
81
80
|
}
|
|
82
81
|
}
|
|
82
|
+
|
|
83
83
|
ggml_quantize_chunk(tensor->type, data.data(), dataq.data(), 0, size/tensor->ne[0], tensor->ne[0], im);
|
|
84
84
|
GGML_ASSERT(ggml_validate_row_data(tensor->type, dataq.data(), dataq.size()));
|
|
85
|
+
// TODO: other cases
|
|
86
|
+
//#pragma omp parallel for
|
|
87
|
+
//for (int i = 0; i < tensor->ne[1]; i++) {
|
|
88
|
+
// ggml_quantize_chunk(tensor->type, data.data(), dataq.data(),
|
|
89
|
+
// i * tensor->ne[0], 1, tensor->ne[0], im);
|
|
90
|
+
//}
|
|
91
|
+
|
|
85
92
|
ggml_backend_tensor_set(tensor, dataq.data(), 0, dataq.size());
|
|
86
93
|
} else if (tensor->type == GGML_TYPE_I8 || tensor->type == GGML_TYPE_I16 || tensor->type == GGML_TYPE_I32) {
|
|
87
94
|
// This is going to create some weird integers though.
|
|
88
95
|
ggml_backend_tensor_set(tensor, data.data(), 0, ggml_nbytes(tensor));
|
|
89
96
|
} else {
|
|
90
|
-
|
|
97
|
+
GGML_ABORT("fatal error");
|
|
91
98
|
}
|
|
92
99
|
}
|
|
93
100
|
|
|
@@ -125,7 +132,7 @@ static std::vector<float> tensor_to_float(const ggml_tensor * t) {
|
|
|
125
132
|
tt.to_float(&buf[i], vq.data(), bs);
|
|
126
133
|
tv.insert(tv.end(), vq.begin(), vq.end());
|
|
127
134
|
} else {
|
|
128
|
-
|
|
135
|
+
GGML_ABORT("fatal error");
|
|
129
136
|
}
|
|
130
137
|
}
|
|
131
138
|
}
|
|
@@ -642,20 +649,29 @@ struct test_case {
|
|
|
642
649
|
struct test_unary : public test_case {
|
|
643
650
|
const ggml_unary_op op;
|
|
644
651
|
const ggml_type type;
|
|
645
|
-
const std::array<int64_t, 4>
|
|
652
|
+
const std::array<int64_t, 4> ne_a;
|
|
653
|
+
int v; // view (1 : non-contiguous a)
|
|
646
654
|
|
|
647
655
|
std::string vars() override {
|
|
648
|
-
return
|
|
656
|
+
return VARS_TO_STR3(type, ne_a, v);
|
|
649
657
|
}
|
|
650
658
|
|
|
651
659
|
test_unary(ggml_unary_op op,
|
|
652
660
|
ggml_type type = GGML_TYPE_F32,
|
|
653
|
-
std::array<int64_t, 4>
|
|
654
|
-
|
|
661
|
+
std::array<int64_t, 4> ne_a = {128, 10, 10, 10},
|
|
662
|
+
int v = 0)
|
|
663
|
+
: op(op), type(type), ne_a(ne_a), v(v) {}
|
|
655
664
|
|
|
656
665
|
ggml_tensor * build_graph(ggml_context * ctx) override {
|
|
657
|
-
ggml_tensor *
|
|
658
|
-
|
|
666
|
+
ggml_tensor * a;
|
|
667
|
+
if (v & 1) {
|
|
668
|
+
auto ne = ne_a; ne[0] *= 3;
|
|
669
|
+
a = ggml_new_tensor(ctx, type, 4, ne.data());
|
|
670
|
+
a = ggml_view_4d(ctx, a, ne_a[0], ne_a[1], ne_a[2], ne_a[3], a->nb[1], a->nb[2], a->nb[3], 0);
|
|
671
|
+
} else {
|
|
672
|
+
a = ggml_new_tensor(ctx, type, 4, ne_a.data());
|
|
673
|
+
}
|
|
674
|
+
ggml_tensor * out = ggml_unary(ctx, a, op);
|
|
659
675
|
return out;
|
|
660
676
|
}
|
|
661
677
|
|
|
@@ -751,7 +767,7 @@ struct test_dup : public test_case {
|
|
|
751
767
|
}
|
|
752
768
|
|
|
753
769
|
test_dup(ggml_type type = GGML_TYPE_F32,
|
|
754
|
-
std::array<int64_t, 4> ne = {10, 10,
|
|
770
|
+
std::array<int64_t, 4> ne = {10, 10, 20, 1},
|
|
755
771
|
std::array<int64_t, 4> permute = {0, 0, 0, 0})
|
|
756
772
|
: type(type), ne(ne), permute(permute),
|
|
757
773
|
_use_permute(permute[0] + permute[1] + permute[2] + permute[3] > 0) {}
|
|
@@ -771,9 +787,15 @@ struct test_cpy : public test_case {
|
|
|
771
787
|
const ggml_type type_src;
|
|
772
788
|
const ggml_type type_dst;
|
|
773
789
|
const std::array<int64_t, 4> ne;
|
|
790
|
+
const std::array<int64_t, 4> permute;
|
|
791
|
+
bool _src_use_permute;
|
|
774
792
|
|
|
775
793
|
std::string vars() override {
|
|
776
|
-
return
|
|
794
|
+
return VARS_TO_STR4(type_src, type_dst, ne, permute);
|
|
795
|
+
}
|
|
796
|
+
|
|
797
|
+
double max_nmse_err() override {
|
|
798
|
+
return 1e-6;
|
|
777
799
|
}
|
|
778
800
|
|
|
779
801
|
size_t op_size(ggml_tensor * t) override {
|
|
@@ -781,12 +803,18 @@ struct test_cpy : public test_case {
|
|
|
781
803
|
}
|
|
782
804
|
|
|
783
805
|
test_cpy(ggml_type type_src = GGML_TYPE_F32, ggml_type type_dst = GGML_TYPE_F32,
|
|
784
|
-
std::array<int64_t, 4> ne = {10, 10, 10, 1}
|
|
785
|
-
|
|
806
|
+
std::array<int64_t, 4> ne = {10, 10, 10, 1},
|
|
807
|
+
std::array<int64_t, 4> permute = {0, 0, 0, 0},
|
|
808
|
+
bool _dst_use_permute = false)
|
|
809
|
+
: type_src(type_src), type_dst(type_dst), ne(ne), permute(permute),
|
|
810
|
+
_src_use_permute(permute[0] + permute[1] + permute[2] + permute[3] > 0) {}
|
|
786
811
|
|
|
787
812
|
ggml_tensor * build_graph(ggml_context * ctx) override {
|
|
788
813
|
ggml_tensor * src = ggml_new_tensor(ctx, type_src, 4, ne.data());
|
|
789
|
-
|
|
814
|
+
if (_src_use_permute) {
|
|
815
|
+
src = ggml_permute(ctx, src, permute[0], permute[1], permute[2], permute[3]);
|
|
816
|
+
}
|
|
817
|
+
ggml_tensor* dst = ggml_new_tensor(ctx, type_dst, 4, src->ne);
|
|
790
818
|
ggml_tensor * out = ggml_cpy(ctx, src, dst);
|
|
791
819
|
return out;
|
|
792
820
|
}
|
|
@@ -1054,6 +1082,33 @@ struct test_sqr : public test_case {
|
|
|
1054
1082
|
}
|
|
1055
1083
|
};
|
|
1056
1084
|
|
|
1085
|
+
// GGML_OP_SQRT
|
|
1086
|
+
struct test_sqrt : public test_case {
|
|
1087
|
+
const ggml_type type;
|
|
1088
|
+
const std::array<int64_t, 4> ne;
|
|
1089
|
+
|
|
1090
|
+
std::string vars() override {
|
|
1091
|
+
return VARS_TO_STR2(type, ne);
|
|
1092
|
+
}
|
|
1093
|
+
|
|
1094
|
+
test_sqrt(ggml_type type = GGML_TYPE_F32,
|
|
1095
|
+
std::array<int64_t, 4> ne = {10, 10, 10, 10})
|
|
1096
|
+
: type(type), ne(ne) {}
|
|
1097
|
+
|
|
1098
|
+
ggml_tensor * build_graph(ggml_context * ctx) override {
|
|
1099
|
+
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
|
|
1100
|
+
ggml_tensor * out = ggml_sqrt(ctx, a);
|
|
1101
|
+
return out;
|
|
1102
|
+
}
|
|
1103
|
+
|
|
1104
|
+
void initialize_tensors(ggml_context * ctx) override {
|
|
1105
|
+
// fill with positive values
|
|
1106
|
+
for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
|
|
1107
|
+
init_tensor_uniform(t, 0.0f, 100.0f);
|
|
1108
|
+
}
|
|
1109
|
+
}
|
|
1110
|
+
};
|
|
1111
|
+
|
|
1057
1112
|
// GGML_OP_CLAMP
|
|
1058
1113
|
struct test_clamp : public test_case {
|
|
1059
1114
|
const ggml_type type;
|
|
@@ -1135,29 +1190,41 @@ struct test_soft_max : public test_case {
|
|
|
1135
1190
|
}
|
|
1136
1191
|
};
|
|
1137
1192
|
|
|
1193
|
+
|
|
1138
1194
|
// GGML_OP_ROPE
|
|
1139
1195
|
struct test_rope : public test_case {
|
|
1140
1196
|
const ggml_type type;
|
|
1141
|
-
const std::array<int64_t, 4>
|
|
1197
|
+
const std::array<int64_t, 4> ne_a;
|
|
1142
1198
|
int n_dims;
|
|
1143
1199
|
int mode;
|
|
1144
|
-
int n_ctx;
|
|
1200
|
+
int n_ctx; // used to generate positions
|
|
1201
|
+
float fs; // freq_scale
|
|
1202
|
+
float ef; // ext_factor
|
|
1203
|
+
float af; // attn_factor
|
|
1145
1204
|
bool ff;
|
|
1205
|
+
int v; // view (1 : non-contiguous a)
|
|
1146
1206
|
|
|
1147
1207
|
std::string vars() override {
|
|
1148
|
-
return
|
|
1208
|
+
return VARS_TO_STR10(type, ne_a, n_dims, mode, n_ctx, fs, ef, af, ff, v);
|
|
1149
1209
|
}
|
|
1150
1210
|
|
|
1151
1211
|
test_rope(ggml_type type = GGML_TYPE_F32,
|
|
1152
|
-
std::array<int64_t, 4>
|
|
1153
|
-
int n_dims = 10, int mode = 0, int n_ctx = 512, bool ff = false)
|
|
1154
|
-
: type(type),
|
|
1212
|
+
std::array<int64_t, 4> ne_a = {10, 10, 10, 1},
|
|
1213
|
+
int n_dims = 10, int mode = 0, int n_ctx = 512, float fs = 1.0f, float ef = 0.0f, float af = 0.0f, bool ff = false, int v = 0)
|
|
1214
|
+
: type(type), ne_a(ne_a), n_dims(n_dims), mode(mode), n_ctx(n_ctx), fs(fs), ef(ef), af(af), ff(ff), v(v) {}
|
|
1155
1215
|
|
|
1156
1216
|
ggml_tensor * build_graph(ggml_context * ctx) override {
|
|
1157
|
-
ggml_tensor * a
|
|
1158
|
-
|
|
1217
|
+
ggml_tensor * a;
|
|
1218
|
+
if (v & 1) {
|
|
1219
|
+
auto ne = ne_a; ne[0] *= 2; ne[1] *= 4; ne[2] *= 3;
|
|
1220
|
+
a = ggml_new_tensor(ctx, type, 4, ne.data());
|
|
1221
|
+
a = ggml_view_4d(ctx, a, ne_a[0], ne_a[1], ne_a[2], ne_a[3], a->nb[1], a->nb[2], a->nb[3], 0);
|
|
1222
|
+
} else {
|
|
1223
|
+
a = ggml_new_tensor(ctx, type, 4, ne_a.data());
|
|
1224
|
+
}
|
|
1225
|
+
ggml_tensor * pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, ne_a[2]);
|
|
1159
1226
|
ggml_tensor * freq = ff ? ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_dims/2) : nullptr;
|
|
1160
|
-
ggml_tensor * out = ggml_rope_ext(ctx, a, pos, freq, n_dims, mode,
|
|
1227
|
+
ggml_tensor * out = ggml_rope_ext(ctx, a, pos, freq, n_dims, mode, 0, 10000.0f, fs, ef, af, 1.0f, 1.0f);
|
|
1161
1228
|
return out;
|
|
1162
1229
|
}
|
|
1163
1230
|
|
|
@@ -1165,11 +1232,11 @@ struct test_rope : public test_case {
|
|
|
1165
1232
|
for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
|
|
1166
1233
|
if (t->type == GGML_TYPE_I32) {
|
|
1167
1234
|
// pos
|
|
1168
|
-
std::vector<int> data(
|
|
1169
|
-
for (int i = 0; i <
|
|
1235
|
+
std::vector<int> data(ne_a[2]);
|
|
1236
|
+
for (int i = 0; i < ne_a[2]; i++) {
|
|
1170
1237
|
data[i] = rand() % n_ctx;
|
|
1171
1238
|
}
|
|
1172
|
-
ggml_backend_tensor_set(t, data.data(), 0,
|
|
1239
|
+
ggml_backend_tensor_set(t, data.data(), 0, ne_a[2] * sizeof(int));
|
|
1173
1240
|
} else {
|
|
1174
1241
|
if (t->ne[0] == n_dims/2) {
|
|
1175
1242
|
// frequency factors in the range [0.9f, 1.1f]
|
|
@@ -1216,6 +1283,32 @@ struct test_pool2d : public test_case {
|
|
|
1216
1283
|
}
|
|
1217
1284
|
};
|
|
1218
1285
|
|
|
1286
|
+
// GGML_OP_CONV_TRANSPOSE_1D
|
|
1287
|
+
struct test_conv_transpose_1d : public test_case {
|
|
1288
|
+
const std::array<int64_t, 4> ne_input;
|
|
1289
|
+
const std::array<int64_t, 4> ne_kernel;
|
|
1290
|
+
|
|
1291
|
+
const int s0; // stride
|
|
1292
|
+
const int p0; // padding
|
|
1293
|
+
const int d0; // dilation
|
|
1294
|
+
|
|
1295
|
+
std::string vars() override {
|
|
1296
|
+
return VARS_TO_STR5(ne_input, ne_kernel, s0, p0, d0);
|
|
1297
|
+
}
|
|
1298
|
+
|
|
1299
|
+
test_conv_transpose_1d(std::array<int64_t, 4> ne_input = {197, 32, 1, 1}, // [input_width, input_height, input_channels, 1]
|
|
1300
|
+
std::array<int64_t, 4> ne_kernel = {16, 32, 32, 1}, // [kernel_width, kernel_height, input_channels, 1]
|
|
1301
|
+
int s0 = 1, int p0 = 0, int d0 = 1)
|
|
1302
|
+
: ne_input(ne_input), ne_kernel(ne_kernel), s0(s0), p0(p0), d0(d0) {}
|
|
1303
|
+
|
|
1304
|
+
ggml_tensor * build_graph(ggml_context * ctx) override {
|
|
1305
|
+
ggml_tensor * input = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne_input.data());
|
|
1306
|
+
ggml_tensor * kernel = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne_kernel.data());
|
|
1307
|
+
ggml_tensor * out = ggml_conv_transpose_1d(ctx, kernel, input, s0, p0, d0);
|
|
1308
|
+
return out;
|
|
1309
|
+
}
|
|
1310
|
+
};
|
|
1311
|
+
|
|
1219
1312
|
// GGML_OP_IM2COL
|
|
1220
1313
|
struct test_im2col : public test_case {
|
|
1221
1314
|
const ggml_type type_input;
|
|
@@ -1229,7 +1322,7 @@ struct test_im2col : public test_case {
|
|
|
1229
1322
|
// padding
|
|
1230
1323
|
const int p0;
|
|
1231
1324
|
const int p1;
|
|
1232
|
-
//
|
|
1325
|
+
// dilation
|
|
1233
1326
|
const int d0;
|
|
1234
1327
|
const int d1;
|
|
1235
1328
|
// mode
|
|
@@ -1262,22 +1355,37 @@ struct test_concat : public test_case {
|
|
|
1262
1355
|
const std::array<int64_t, 4> ne_a;
|
|
1263
1356
|
const int64_t ne_b_d;
|
|
1264
1357
|
const int dim;
|
|
1358
|
+
const int v; // view (1 << 0: non-cont a, 1 << 1: non-cont b)
|
|
1265
1359
|
|
|
1266
1360
|
std::string vars() override {
|
|
1267
|
-
return
|
|
1361
|
+
return VARS_TO_STR5(type, ne_a, ne_b_d, dim, v);
|
|
1268
1362
|
}
|
|
1269
1363
|
|
|
1270
1364
|
test_concat(ggml_type type = GGML_TYPE_F32,
|
|
1271
1365
|
std::array<int64_t, 4> ne_a = {10, 10, 10, 10},
|
|
1272
1366
|
int64_t ne_b_d = 10,
|
|
1273
|
-
int dim = 2)
|
|
1274
|
-
: type(type), ne_a(ne_a), ne_b_d(ne_b_d), dim(dim) {}
|
|
1367
|
+
int dim = 2, int v = 0)
|
|
1368
|
+
: type(type), ne_a(ne_a), ne_b_d(ne_b_d), dim(dim), v(v) {}
|
|
1275
1369
|
|
|
1276
1370
|
ggml_tensor * build_graph(ggml_context * ctx) override {
|
|
1277
1371
|
auto ne_b = ne_a;
|
|
1278
1372
|
ne_b[dim] = ne_b_d;
|
|
1279
|
-
ggml_tensor * a
|
|
1280
|
-
|
|
1373
|
+
ggml_tensor * a;
|
|
1374
|
+
if (v & 1) {
|
|
1375
|
+
auto ne = ne_a; ne[0] *= 2; ne[1] *= 4; ne[2] *= 3;
|
|
1376
|
+
a = ggml_new_tensor(ctx, type, 4, ne.data());
|
|
1377
|
+
a = ggml_view_4d(ctx, a, ne_a[0], ne_a[1], ne_a[2], ne_a[3], a->nb[1], a->nb[2], a->nb[3], 0);
|
|
1378
|
+
} else {
|
|
1379
|
+
a = ggml_new_tensor(ctx, type, 4, ne_a.data());
|
|
1380
|
+
}
|
|
1381
|
+
ggml_tensor * b;
|
|
1382
|
+
if (v & 2) {
|
|
1383
|
+
auto ne = ne_b; ne[0] *= 3; ne[1] *= 2; ne[2] *= 4;
|
|
1384
|
+
b = ggml_new_tensor(ctx, type, 4, ne.data());
|
|
1385
|
+
b = ggml_view_4d(ctx, b, ne_b[0], ne_b[1], ne_b[2], ne_b[3], b->nb[1], b->nb[2], b->nb[3], 0);
|
|
1386
|
+
} else {
|
|
1387
|
+
b = ggml_new_tensor(ctx, type, 4, ne_b.data());
|
|
1388
|
+
}
|
|
1281
1389
|
ggml_tensor * out = ggml_concat(ctx, a, b, dim);
|
|
1282
1390
|
return out;
|
|
1283
1391
|
}
|
|
@@ -1327,7 +1435,7 @@ struct test_argsort : public test_case {
|
|
|
1327
1435
|
ggml_backend_tensor_set(t, data.data(), r * t->nb[1], t->ne[0] * sizeof(float));
|
|
1328
1436
|
}
|
|
1329
1437
|
} else {
|
|
1330
|
-
|
|
1438
|
+
GGML_ABORT("fatal error");
|
|
1331
1439
|
}
|
|
1332
1440
|
}
|
|
1333
1441
|
}
|
|
@@ -1544,21 +1652,25 @@ struct test_flash_attn_ext : public test_case {
|
|
|
1544
1652
|
|
|
1545
1653
|
const float max_bias; // ALiBi
|
|
1546
1654
|
|
|
1655
|
+
const ggml_type type_KV;
|
|
1656
|
+
|
|
1547
1657
|
std::string vars() override {
|
|
1548
|
-
return
|
|
1658
|
+
return VARS_TO_STR7(hs, nh, kv, nb, mask, max_bias, type_KV);
|
|
1549
1659
|
}
|
|
1550
1660
|
|
|
1551
1661
|
double max_nmse_err() override {
|
|
1552
1662
|
return 5e-4;
|
|
1553
1663
|
}
|
|
1554
1664
|
|
|
1555
|
-
test_flash_attn_ext(int64_t hs = 128, int64_t nh = 32, int64_t kv = 96, int64_t nb = 8, bool mask = true, float max_bias = 0.0f)
|
|
1556
|
-
: hs(hs), nh(nh), kv(kv), nb(nb), mask(mask), max_bias(max_bias) {}
|
|
1665
|
+
test_flash_attn_ext(int64_t hs = 128, int64_t nh = 32, int64_t kv = 96, int64_t nb = 8, bool mask = true, float max_bias = 0.0f, ggml_type type_KV = GGML_TYPE_F16)
|
|
1666
|
+
: hs(hs), nh(nh), kv(kv), nb(nb), mask(mask), max_bias(max_bias), type_KV(type_KV) {}
|
|
1557
1667
|
|
|
1558
1668
|
ggml_tensor * build_graph(ggml_context * ctx) override {
|
|
1559
|
-
|
|
1560
|
-
|
|
1561
|
-
ggml_tensor *
|
|
1669
|
+
const int64_t hs_padded = GGML_PAD(hs, ggml_blck_size(type_KV));
|
|
1670
|
+
|
|
1671
|
+
ggml_tensor * q = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, hs_padded, nb, nh, 1);
|
|
1672
|
+
ggml_tensor * k = ggml_new_tensor_4d(ctx, type_KV, hs_padded, kv, nh, 1);
|
|
1673
|
+
ggml_tensor * v = ggml_new_tensor_4d(ctx, type_KV, hs_padded, kv, nh, 1);
|
|
1562
1674
|
ggml_tensor * m = mask ? ggml_new_tensor_4d(ctx, GGML_TYPE_F16, kv, GGML_PAD(nb, GGML_KQ_MASK_PAD), 1, 1) : nullptr;
|
|
1563
1675
|
ggml_tensor * out = ggml_flash_attn_ext(ctx, q, k, v, m, 1.0f/sqrtf(hs), max_bias);
|
|
1564
1676
|
return out;
|
|
@@ -1585,7 +1697,7 @@ struct llama_hparams {
|
|
|
1585
1697
|
|
|
1586
1698
|
// cparams
|
|
1587
1699
|
static constexpr uint32_t n_ctx = 512; // user-specified context size
|
|
1588
|
-
static constexpr uint32_t
|
|
1700
|
+
static constexpr uint32_t n_ctx_orig = n_ctx;
|
|
1589
1701
|
|
|
1590
1702
|
// batch
|
|
1591
1703
|
int32_t n_tokens;
|
|
@@ -1776,13 +1888,13 @@ struct test_llama : public test_llm {
|
|
|
1776
1888
|
|
|
1777
1889
|
Qcur = ggml_rope_ext(
|
|
1778
1890
|
ctx, ggml_reshape_3d(ctx, Qcur, hp.n_embd_head, hp.n_head, hp.n_tokens), inp_pos, nullptr,
|
|
1779
|
-
hp.n_rot, 0,
|
|
1891
|
+
hp.n_rot, 0, hp.n_ctx_orig, freq_base, freq_scale,
|
|
1780
1892
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
1781
1893
|
);
|
|
1782
1894
|
|
|
1783
1895
|
Kcur = ggml_rope_ext(
|
|
1784
1896
|
ctx, ggml_reshape_3d(ctx, Kcur, hp.n_embd_head, hp.n_head_kv, hp.n_tokens), inp_pos, nullptr,
|
|
1785
|
-
hp.n_rot, 0,
|
|
1897
|
+
hp.n_rot, 0, hp.n_ctx_orig, freq_base, freq_scale,
|
|
1786
1898
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
1787
1899
|
);
|
|
1788
1900
|
|
|
@@ -1901,12 +2013,12 @@ struct test_falcon : public test_llm {
|
|
|
1901
2013
|
|
|
1902
2014
|
// using mode = 2 for neox mode
|
|
1903
2015
|
Qcur = ggml_rope_ext(
|
|
1904
|
-
ctx, Qcur, inp_pos, nullptr, hp.n_rot, 2,
|
|
2016
|
+
ctx, Qcur, inp_pos, nullptr, hp.n_rot, 2, hp.n_ctx_orig,
|
|
1905
2017
|
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
|
1906
2018
|
);
|
|
1907
2019
|
|
|
1908
2020
|
Kcur = ggml_rope_ext(
|
|
1909
|
-
ctx, Kcur, inp_pos, nullptr, hp.n_rot, 2,
|
|
2021
|
+
ctx, Kcur, inp_pos, nullptr, hp.n_rot, 2, hp.n_ctx_orig,
|
|
1910
2022
|
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
|
1911
2023
|
);
|
|
1912
2024
|
|
|
@@ -1983,12 +2095,15 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
|
|
|
1983
2095
|
GGML_TYPE_IQ2_XS, GGML_TYPE_IQ2_S,
|
|
1984
2096
|
GGML_TYPE_IQ3_XXS, GGML_TYPE_IQ1_S, GGML_TYPE_IQ1_M,
|
|
1985
2097
|
GGML_TYPE_IQ4_NL, GGML_TYPE_IQ3_S, GGML_TYPE_IQ4_XS,
|
|
2098
|
+
GGML_TYPE_BF16,
|
|
1986
2099
|
};
|
|
1987
2100
|
|
|
1988
2101
|
// unary ops
|
|
1989
|
-
for (int
|
|
1990
|
-
|
|
1991
|
-
|
|
2102
|
+
for (int v : {0, 1}) {
|
|
2103
|
+
for (int op = 0; op < GGML_UNARY_OP_COUNT; op++) {
|
|
2104
|
+
test_cases.emplace_back(new test_unary((ggml_unary_op) op, GGML_TYPE_F32, { 128, 10, 10, 10 }, v));
|
|
2105
|
+
test_cases.emplace_back(new test_unary((ggml_unary_op) op, GGML_TYPE_F32, { 7, 13, 19, 23 }, v));
|
|
2106
|
+
}
|
|
1992
2107
|
}
|
|
1993
2108
|
|
|
1994
2109
|
test_cases.emplace_back(new test_get_rows(GGML_TYPE_F32, 1, 8, 2, 1, false));
|
|
@@ -2026,6 +2141,16 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
|
|
|
2026
2141
|
test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F32));
|
|
2027
2142
|
test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16));
|
|
2028
2143
|
|
|
2144
|
+
test_cases.emplace_back(new test_conv_transpose_1d());
|
|
2145
|
+
test_cases.emplace_back(new test_conv_transpose_1d({3,2,1,1}, {2,3,2,1}, 3, 0, 1));
|
|
2146
|
+
test_cases.emplace_back(new test_conv_transpose_1d({3,2,1,1}, {2,3,2,1}, 2, 0, 1));
|
|
2147
|
+
test_cases.emplace_back(new test_conv_transpose_1d({3,2,1,1}, {2,3,2,1}, 1, 0, 1));
|
|
2148
|
+
test_cases.emplace_back(new test_conv_transpose_1d({3,2,1,1}, {3,2,2,1}, 2, 0, 1));
|
|
2149
|
+
test_cases.emplace_back(new test_conv_transpose_1d({3,2,1,1}, {3,2,2,1}, 1, 0, 1));
|
|
2150
|
+
test_cases.emplace_back(new test_conv_transpose_1d({3,2,1,1}, {3,1,2,1}, 1, 0, 1));
|
|
2151
|
+
test_cases.emplace_back(new test_conv_transpose_1d({2,1,1,1}, {3,1,1,1}, 1, 0, 1));
|
|
2152
|
+
|
|
2153
|
+
|
|
2029
2154
|
test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 10, 10, 10}, {1, 1, 1, 1}));
|
|
2030
2155
|
test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 10, 10, 10}, {2, 1, 1, 1}));
|
|
2031
2156
|
test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 10, 10, 10}, {1, 2, 1, 1}));
|
|
@@ -2038,12 +2163,22 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
|
|
|
2038
2163
|
test_cases.emplace_back(new test_dup(GGML_TYPE_F16));
|
|
2039
2164
|
test_cases.emplace_back(new test_dup(GGML_TYPE_I32));
|
|
2040
2165
|
test_cases.emplace_back(new test_dup(GGML_TYPE_I16));
|
|
2166
|
+
test_cases.emplace_back(new test_dup(GGML_TYPE_F32, {10, 10, 5, 1}, {0, 2, 1, 3}));
|
|
2167
|
+
test_cases.emplace_back(new test_dup(GGML_TYPE_F16, {10, 10, 5, 1}, {0, 2, 1, 3})); // dup by rows
|
|
2168
|
+
test_cases.emplace_back(new test_dup(GGML_TYPE_F32, {10, 10, 5, 1}, {1, 0, 2, 3}));
|
|
2169
|
+
test_cases.emplace_back(new test_dup(GGML_TYPE_F16, {10, 10, 5, 1}, {1, 0, 2, 3})); // dup dst not-contiguous
|
|
2041
2170
|
test_cases.emplace_back(new test_dup(GGML_TYPE_I16, {10, 8, 3, 1}, {0, 2, 1, 3}));
|
|
2042
2171
|
test_cases.emplace_back(new test_dup(GGML_TYPE_I16, {10, 8, 3, 1}, {1, 2, 0, 3}));
|
|
2043
2172
|
|
|
2044
2173
|
for (ggml_type type_src : {GGML_TYPE_F16, GGML_TYPE_F32}) {
|
|
2045
2174
|
for (ggml_type type_dst : all_types) {
|
|
2046
2175
|
test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 4, 4, 4}));
|
|
2176
|
+
test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 2, 3, 4}, {0, 2, 1, 3})); // cpy by rows
|
|
2177
|
+
}
|
|
2178
|
+
}
|
|
2179
|
+
for (ggml_type type_src : {GGML_TYPE_F16, GGML_TYPE_F32}) {
|
|
2180
|
+
for (ggml_type type_dst : {GGML_TYPE_F16, GGML_TYPE_F32}) {
|
|
2181
|
+
test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 2, 3, 4}, {1, 0, 2, 3})); // cpy not-contiguous
|
|
2047
2182
|
}
|
|
2048
2183
|
}
|
|
2049
2184
|
|
|
@@ -2093,6 +2228,7 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
|
|
|
2093
2228
|
test_cases.emplace_back(new test_rms_norm(GGML_TYPE_F32, {64, 10, 10, 10}, eps));
|
|
2094
2229
|
}
|
|
2095
2230
|
|
|
2231
|
+
#if 1
|
|
2096
2232
|
for (ggml_type type_a : base_types) {
|
|
2097
2233
|
for (ggml_type type_b : {GGML_TYPE_F32, GGML_TYPE_F16}) {
|
|
2098
2234
|
test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, { 1, 1}, {1, 1}));
|
|
@@ -2112,6 +2248,24 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
|
|
|
2112
2248
|
test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {10, 10}, {2, 2}));
|
|
2113
2249
|
}
|
|
2114
2250
|
}
|
|
2251
|
+
#else
|
|
2252
|
+
// m = a rows
|
|
2253
|
+
// n = b rows
|
|
2254
|
+
// k = cols
|
|
2255
|
+
std::uniform_int_distribution<> dist_m(1, 128);
|
|
2256
|
+
std::uniform_int_distribution<> dist_n(16, 128);
|
|
2257
|
+
std::uniform_int_distribution<> dist_k(1, 16);
|
|
2258
|
+
for (int i = 0; i < 1000; i++) {
|
|
2259
|
+
for (ggml_type type_a : all_types) {
|
|
2260
|
+
for (ggml_type type_b : {GGML_TYPE_F32}) {
|
|
2261
|
+
int m = dist_m(rng);
|
|
2262
|
+
int n = dist_n(rng);
|
|
2263
|
+
int k = dist_k(rng) * ggml_blck_size(type_a);
|
|
2264
|
+
test_cases.emplace_back(new test_mul_mat(type_a, type_b, m, n, k, { 1, 1}, {1, 1}));
|
|
2265
|
+
}
|
|
2266
|
+
}
|
|
2267
|
+
}
|
|
2268
|
+
#endif
|
|
2115
2269
|
|
|
2116
2270
|
for (ggml_type type_a : other_types) {
|
|
2117
2271
|
for (ggml_type type_b : {GGML_TYPE_F32}) {
|
|
@@ -2159,6 +2313,7 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
|
|
|
2159
2313
|
}
|
|
2160
2314
|
|
|
2161
2315
|
test_cases.emplace_back(new test_sqr());
|
|
2316
|
+
test_cases.emplace_back(new test_sqrt());
|
|
2162
2317
|
test_cases.emplace_back(new test_clamp());
|
|
2163
2318
|
|
|
2164
2319
|
test_cases.emplace_back(new test_diag_mask_inf(GGML_TYPE_F32, {10, 10, 1, 1}, 5));
|
|
@@ -2174,7 +2329,7 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
|
|
|
2174
2329
|
for (int n = 0; n < 10; ++n) {
|
|
2175
2330
|
int64_t ne0 = dist_ne0(rng);
|
|
2176
2331
|
int64_t ne1 = dist_ne1(rng);
|
|
2177
|
-
test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {ne0, ne1, 1, 1}, n/2 == 0, 0.1f, ne0 < 1000 ? 4.0f : 0.0f));
|
|
2332
|
+
test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, GGML_TYPE_F32, {ne0, ne1, 1, 1}, n/2 == 0, 0.1f, ne0 < 1000 ? 4.0f : 0.0f));
|
|
2178
2333
|
}
|
|
2179
2334
|
|
|
2180
2335
|
exponent <<= 1;
|
|
@@ -2193,31 +2348,52 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
|
|
|
2193
2348
|
}
|
|
2194
2349
|
}
|
|
2195
2350
|
}
|
|
2196
|
-
|
|
2351
|
+
test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {16, 2, 32, 1}, true, 0.1f, 0.0f));
|
|
2197
2352
|
test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {16, 2, 32, 1}, false, 0.1f, 0.0f));
|
|
2198
2353
|
test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {32, 2, 32, 1}, true, 0.1f, 0.0f));
|
|
2199
2354
|
test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {32, 2, 32, 1}, true, 0.1f, 8.0f));
|
|
2200
2355
|
|
|
2201
|
-
|
|
2202
|
-
|
|
2203
|
-
|
|
2204
|
-
|
|
2205
|
-
|
|
2206
|
-
|
|
2356
|
+
{
|
|
2357
|
+
bool all = true;
|
|
2358
|
+
|
|
2359
|
+
for (float v : { 0, 1 }) {
|
|
2360
|
+
for (float fs : { 1.0f, 1.4245f }) {
|
|
2361
|
+
for (float ef : { 0.0f, 0.7465f }) {
|
|
2362
|
+
for (float af : { 1.0f, 1.4245f }) {
|
|
2363
|
+
for (ggml_type type : {GGML_TYPE_F32, GGML_TYPE_F16}) {
|
|
2364
|
+
for (bool ff : {false, true}) { // freq_factors
|
|
2365
|
+
test_cases.emplace_back(new test_rope(type, {128, 32, 10, 1}, 128, 0, 512, fs, ef, af, ff, v)); // llama 7B
|
|
2366
|
+
|
|
2367
|
+
if (all) {
|
|
2368
|
+
test_cases.emplace_back(new test_rope(type, {128, 40, 10, 1}, 128, 0, 512, fs, ef, af, ff, v)); // llama 13B
|
|
2369
|
+
test_cases.emplace_back(new test_rope(type, {128, 52, 10, 1}, 128, 0, 512, fs, ef, af, ff, v)); // llama 30B
|
|
2370
|
+
test_cases.emplace_back(new test_rope(type, {128, 64, 10, 1}, 128, 0, 512, fs, ef, af, ff, v)); // llama 65B
|
|
2371
|
+
}
|
|
2372
|
+
|
|
2373
|
+
if (all) {
|
|
2374
|
+
test_cases.emplace_back(new test_rope(type, { 64, 1, 10, 1}, 64, 2, 512, fs, ef, af, ff, v)); // neox (falcon 7B)
|
|
2375
|
+
test_cases.emplace_back(new test_rope(type, { 64, 71, 10, 1}, 64, 2, 512, fs, ef, af, ff, v)); // neox (falcon 7B)
|
|
2376
|
+
test_cases.emplace_back(new test_rope(type, { 64, 8, 10, 1}, 64, 2, 512, fs, ef, af, ff, v)); // neox (falcon 40B)
|
|
2377
|
+
test_cases.emplace_back(new test_rope(type, { 80, 32, 10, 1}, 20, 2, 512, fs, ef, af, ff, v)); // neox (stablelm)
|
|
2378
|
+
test_cases.emplace_back(new test_rope(type, { 80, 32, 10, 1}, 32, 2, 512, fs, ef, af, ff, v)); // neox (phi-2)
|
|
2379
|
+
}
|
|
2380
|
+
|
|
2381
|
+
test_cases.emplace_back(new test_rope(type, { 64, 128, 10, 1}, 64, 2, 512, fs, ef, af, ff, v)); // neox (falcon 40B)
|
|
2382
|
+
}
|
|
2383
|
+
}
|
|
2207
2384
|
|
|
2208
|
-
|
|
2209
|
-
|
|
2210
|
-
|
|
2211
|
-
|
|
2212
|
-
test_cases.emplace_back(new test_rope(type, { 64, 128, 10, 1}, 64, 2, 512, ff)); // neox (falcon 40B)
|
|
2213
|
-
test_cases.emplace_back(new test_rope(type, { 80, 32, 10, 1}, 20, 2, 512, ff)); // neox (stablelm)
|
|
2214
|
-
test_cases.emplace_back(new test_rope(type, { 80, 32, 10, 1}, 32, 2, 512, ff)); // neox (phi-2)
|
|
2385
|
+
all = false;
|
|
2386
|
+
}
|
|
2387
|
+
}
|
|
2388
|
+
}
|
|
2215
2389
|
}
|
|
2216
2390
|
}
|
|
2217
2391
|
|
|
2218
|
-
for (int
|
|
2219
|
-
|
|
2220
|
-
|
|
2392
|
+
for (int v : { 0, 1, 2, 3 }) {
|
|
2393
|
+
for (int dim : { 0, 1, 2, 3, }) {
|
|
2394
|
+
test_cases.emplace_back(new test_concat(GGML_TYPE_F32, {11, 12, 13, 14}, 7, dim, v));
|
|
2395
|
+
test_cases.emplace_back(new test_concat(GGML_TYPE_I32, {11, 12, 13, 14}, 7, dim, v));
|
|
2396
|
+
}
|
|
2221
2397
|
}
|
|
2222
2398
|
|
|
2223
2399
|
for (ggml_sort_order order : {GGML_SORT_ORDER_ASC, GGML_SORT_ORDER_DESC}) {
|
|
@@ -2244,7 +2420,9 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
|
|
|
2244
2420
|
for (int nh : { 32, }) {
|
|
2245
2421
|
for (int kv : { 512, 1024, }) {
|
|
2246
2422
|
for (int nb : { 1, 2, 4, 8, }) {
|
|
2247
|
-
|
|
2423
|
+
for (ggml_type type_KV : {GGML_TYPE_F16, GGML_TYPE_Q8_0, GGML_TYPE_Q4_0}) {
|
|
2424
|
+
test_cases.emplace_back(new test_flash_attn_ext(hs, nh, kv, nb, mask, max_bias, type_KV));
|
|
2425
|
+
}
|
|
2248
2426
|
}
|
|
2249
2427
|
}
|
|
2250
2428
|
}
|
|
@@ -2284,7 +2462,7 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
|
|
|
2284
2462
|
return true;
|
|
2285
2463
|
}
|
|
2286
2464
|
|
|
2287
|
-
|
|
2465
|
+
GGML_ABORT("fatal error");
|
|
2288
2466
|
return false;
|
|
2289
2467
|
}
|
|
2290
2468
|
|