@fugood/llama.node 0.2.3 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +6 -3
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +8 -1
- package/package.json +3 -3
- package/patches/llama.patch +12 -12
- package/src/DetokenizeWorker.cpp +1 -1
- package/src/LlamaContext.cpp +33 -1
- package/src/LlamaContext.h +1 -0
- package/src/llama.cpp/.github/workflows/bench.yml +310 -0
- package/src/llama.cpp/.github/workflows/build.yml +1315 -0
- package/src/llama.cpp/.github/workflows/close-issue.yml +23 -0
- package/src/llama.cpp/.github/workflows/docker.yml +116 -0
- package/src/llama.cpp/.github/workflows/editorconfig.yml +27 -0
- package/src/llama.cpp/.github/workflows/gguf-publish.yml +44 -0
- package/src/llama.cpp/.github/workflows/labeler.yml +17 -0
- package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +65 -0
- package/src/llama.cpp/.github/workflows/nix-ci.yml +72 -0
- package/src/llama.cpp/.github/workflows/nix-flake-update.yml +22 -0
- package/src/llama.cpp/.github/workflows/nix-publish-flake.yml +36 -0
- package/src/llama.cpp/.github/workflows/python-check-requirements.yml +35 -0
- package/src/llama.cpp/.github/workflows/python-lint.yml +23 -0
- package/src/llama.cpp/.github/workflows/python-type-check.yml +38 -0
- package/src/llama.cpp/.github/workflows/server.yml +183 -0
- package/src/llama.cpp/CMakeLists.txt +91 -1245
- package/src/llama.cpp/cmake/arm64-windows-llvm.cmake +1 -1
- package/src/llama.cpp/cmake/build-info.cmake +58 -0
- package/src/llama.cpp/cmake/git-vars.cmake +22 -0
- package/src/llama.cpp/common/CMakeLists.txt +4 -3
- package/src/llama.cpp/common/build-info.cpp.in +4 -0
- package/src/llama.cpp/common/common.cpp +1116 -877
- package/src/llama.cpp/common/common.h +191 -77
- package/src/llama.cpp/common/grammar-parser.cpp +118 -31
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +346 -65
- package/src/llama.cpp/common/log.h +1 -1
- package/src/llama.cpp/common/ngram-cache.h +10 -3
- package/src/llama.cpp/common/sampling.cpp +19 -10
- package/src/llama.cpp/docs/build.md +353 -0
- package/src/llama.cpp/examples/CMakeLists.txt +22 -22
- package/src/llama.cpp/examples/baby-llama/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +6 -6
- package/src/llama.cpp/examples/batched/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/batched/batched.cpp +52 -55
- package/src/llama.cpp/examples/batched-bench/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +20 -72
- package/src/llama.cpp/examples/benchmark/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/chat-13B.bat +57 -0
- package/src/llama.cpp/examples/convert-llama2c-to-ggml/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/{finetune → cvector-generator}/CMakeLists.txt +2 -2
- package/src/llama.cpp/examples/cvector-generator/completions.txt +582 -0
- package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +503 -0
- package/src/llama.cpp/examples/cvector-generator/mean.hpp +48 -0
- package/src/llama.cpp/examples/cvector-generator/negative.txt +4 -0
- package/src/llama.cpp/examples/cvector-generator/pca.hpp +325 -0
- package/src/llama.cpp/examples/cvector-generator/positive.txt +4 -0
- package/src/llama.cpp/examples/deprecation-warning/deprecation-warning.cpp +35 -0
- package/src/llama.cpp/examples/embedding/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/embedding/embedding.cpp +94 -46
- package/src/llama.cpp/examples/eval-callback/CMakeLists.txt +2 -2
- package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +4 -6
- package/src/llama.cpp/examples/export-lora/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +344 -386
- package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +2 -2
- package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +30 -25
- package/src/llama.cpp/examples/gguf/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gguf/gguf.cpp +5 -0
- package/src/llama.cpp/examples/gguf-hash/CMakeLists.txt +15 -0
- package/src/llama.cpp/examples/gguf-hash/deps/rotate-bits/rotate-bits.h +46 -0
- package/src/llama.cpp/examples/gguf-hash/deps/sha1/sha1.c +295 -0
- package/src/llama.cpp/examples/gguf-hash/deps/sha1/sha1.h +52 -0
- package/src/llama.cpp/examples/gguf-hash/deps/sha256/sha256.c +221 -0
- package/src/llama.cpp/examples/gguf-hash/deps/sha256/sha256.h +24 -0
- package/src/llama.cpp/examples/gguf-hash/deps/xxhash/xxhash.c +42 -0
- package/src/llama.cpp/examples/gguf-hash/deps/xxhash/xxhash.h +7093 -0
- package/src/llama.cpp/examples/gguf-hash/gguf-hash.cpp +693 -0
- package/src/llama.cpp/examples/gguf-split/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +3 -3
- package/src/llama.cpp/examples/gritlm/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +6 -2
- package/src/llama.cpp/examples/imatrix/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +137 -176
- package/src/llama.cpp/examples/infill/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/infill/infill.cpp +38 -153
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +175 -94
- package/src/llama.cpp/examples/llama.android/app/build.gradle.kts +65 -0
- package/src/llama.cpp/examples/llama.android/build.gradle.kts +6 -0
- package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +68 -0
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/CMakeLists.txt +11 -7
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +2 -2
- package/src/llama.cpp/examples/llama.android/settings.gradle.kts +18 -0
- package/src/llama.cpp/examples/llava/CMakeLists.txt +6 -5
- package/src/llama.cpp/examples/llava/android/build_64.sh +8 -0
- package/src/llama.cpp/examples/llava/clip.cpp +23 -14
- package/src/llama.cpp/examples/llava/llava-cli.cpp +8 -6
- package/src/llama.cpp/examples/llava/requirements.txt +3 -2
- package/src/llama.cpp/examples/lookahead/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +2 -1
- package/src/llama.cpp/examples/lookup/CMakeLists.txt +4 -4
- package/src/llama.cpp/examples/lookup/lookup-create.cpp +2 -0
- package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
- package/src/llama.cpp/examples/lookup/lookup-stats.cpp +2 -2
- package/src/llama.cpp/examples/lookup/lookup.cpp +1 -1
- package/src/llama.cpp/examples/main/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/main/main.cpp +98 -75
- package/src/llama.cpp/examples/main-cmake-pkg/CMakeLists.txt +4 -5
- package/src/llama.cpp/examples/parallel/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/parallel/parallel.cpp +2 -1
- package/src/llama.cpp/examples/passkey/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/passkey/passkey.cpp +23 -43
- package/src/llama.cpp/examples/perplexity/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +13 -10
- package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/quantize/quantize.cpp +37 -34
- package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +1 -1
- package/src/llama.cpp/examples/retrieval/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/retrieval/retrieval.cpp +26 -77
- package/src/llama.cpp/examples/save-load-state/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +14 -7
- package/src/llama.cpp/examples/server/CMakeLists.txt +26 -2
- package/src/llama.cpp/examples/server/server.cpp +274 -671
- package/src/llama.cpp/examples/server/tests/requirements.txt +2 -2
- package/src/llama.cpp/examples/server/utils.hpp +28 -29
- package/src/llama.cpp/examples/simple/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/simple/simple.cpp +21 -29
- package/src/llama.cpp/examples/speculative/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/speculative/speculative.cpp +2 -1
- package/src/llama.cpp/examples/sycl/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/sycl/build.sh +23 -0
- package/src/llama.cpp/examples/sycl/run-llama2.sh +36 -0
- package/src/llama.cpp/examples/sycl/win-build-sycl.bat +33 -0
- package/src/llama.cpp/examples/sycl/win-run-llama2.bat +9 -0
- package/src/llama.cpp/examples/tokenize/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/tokenize/tokenize.cpp +16 -2
- package/src/llama.cpp/ggml/CMakeLists.txt +253 -0
- package/src/llama.cpp/{cmake → ggml/cmake}/FindSIMD.cmake +6 -6
- package/src/llama.cpp/{ggml-backend.h → ggml/include/ggml-backend.h} +22 -17
- package/src/llama.cpp/ggml/include/ggml-blas.h +23 -0
- package/src/llama.cpp/ggml/include/ggml-cann.h +125 -0
- package/src/llama.cpp/{ggml-cuda.h → ggml/include/ggml-cuda.h} +3 -0
- package/src/llama.cpp/{ggml-metal.h → ggml/include/ggml-metal.h} +1 -2
- package/src/llama.cpp/{ggml-sycl.h → ggml/include/ggml-sycl.h} +3 -10
- package/src/llama.cpp/{ggml.h → ggml/include/ggml.h} +80 -85
- package/src/llama.cpp/ggml/src/CMakeLists.txt +1329 -0
- package/src/llama.cpp/ggml/src/ggml-aarch64.c +2193 -0
- package/src/llama.cpp/ggml/src/ggml-aarch64.h +39 -0
- package/src/llama.cpp/{ggml-alloc.c → ggml/src/ggml-alloc.c} +100 -49
- package/src/llama.cpp/{ggml-backend-impl.h → ggml/src/ggml-backend-impl.h} +20 -8
- package/src/llama.cpp/{ggml-backend.c → ggml/src/ggml-backend.c} +307 -167
- package/src/llama.cpp/ggml/src/ggml-blas.cpp +367 -0
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +198 -0
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +230 -0
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +2944 -0
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +592 -0
- package/src/llama.cpp/ggml/src/ggml-cann/common.h +282 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +32 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +17 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +223 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +186 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +180 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +193 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +191 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +208 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +206 -0
- package/src/llama.cpp/ggml/src/ggml-cann.cpp +2023 -0
- package/src/llama.cpp/{ggml-common.h → ggml/src/ggml-common.h} +41 -7
- package/src/llama.cpp/{ggml-impl.h → ggml/src/ggml-impl.h} +113 -9
- package/src/llama.cpp/{ggml-kompute.cpp → ggml/src/ggml-kompute.cpp} +33 -18
- package/src/llama.cpp/{ggml-quants.c → ggml/src/ggml-quants.c} +1460 -940
- package/src/llama.cpp/{ggml-quants.h → ggml/src/ggml-quants.h} +19 -20
- package/src/llama.cpp/{ggml-rpc.cpp → ggml/src/ggml-rpc.cpp} +95 -72
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +27 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +53 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +355 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +195 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/concat.hpp +21 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +547 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +27 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +698 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +1023 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.hpp +27 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +3011 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +3031 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/mmq.hpp +33 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +1027 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.hpp +27 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +374 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +35 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +66 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +275 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.hpp +22 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +251 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/softmax.hpp +24 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +1140 -0
- package/src/llama.cpp/ggml/src/ggml-sycl.cpp +5314 -0
- package/src/llama.cpp/{ggml-vulkan.cpp → ggml/src/ggml-vulkan.cpp} +1781 -1868
- package/src/llama.cpp/{ggml.c → ggml/src/ggml.c} +1245 -2087
- package/src/llama.cpp/{sgemm.cpp → ggml/src/llamafile/sgemm.cpp} +21 -24
- package/src/llama.cpp/{sgemm.h → ggml/src/llamafile/sgemm.h} +1 -1
- package/src/llama.cpp/ggml/src/vulkan-shaders/CMakeLists.txt +5 -0
- package/src/llama.cpp/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp +552 -0
- package/src/llama.cpp/{llama.h → include/llama.h} +175 -100
- package/src/llama.cpp/models/.editorconfig +1 -0
- package/src/llama.cpp/models/ggml-vocab-aquila.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-baichuan.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-command-r.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-command-r.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-command-r.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-falcon.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-falcon.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-falcon.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-gpt-neox.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-mpt.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-mpt.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-mpt.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-phi-3.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-phi-3.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-phi-3.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-qwen2.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-qwen2.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-qwen2.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-refact.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-refact.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-refact.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-starcoder.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-starcoder.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-starcoder.gguf.out +46 -0
- package/src/llama.cpp/pocs/vdot/CMakeLists.txt +2 -2
- package/src/llama.cpp/requirements/requirements-all.txt +12 -0
- package/src/llama.cpp/requirements/requirements-compare-llama-bench.txt +2 -0
- package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf.txt +3 -0
- package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf_update.txt +3 -0
- package/src/llama.cpp/requirements/{requirements-convert.txt → requirements-convert_legacy_llama.txt} +1 -1
- package/src/llama.cpp/requirements/requirements-convert_llama_ggml_to_gguf.txt +1 -0
- package/src/llama.cpp/requirements/requirements-convert_lora_to_gguf.txt +2 -0
- package/src/llama.cpp/requirements/requirements-pydantic.txt +3 -0
- package/src/llama.cpp/requirements/requirements-test-tokenizer-random.txt +1 -0
- package/src/llama.cpp/requirements.txt +5 -4
- package/src/llama.cpp/scripts/build-info.sh +30 -0
- package/src/llama.cpp/scripts/install-oneapi.bat +19 -0
- package/src/llama.cpp/src/CMakeLists.txt +33 -0
- package/src/llama.cpp/src/llama-grammar.cpp +539 -0
- package/src/llama.cpp/src/llama-grammar.h +39 -0
- package/src/llama.cpp/src/llama-impl.h +26 -0
- package/src/llama.cpp/src/llama-sampling.cpp +635 -0
- package/src/llama.cpp/src/llama-sampling.h +56 -0
- package/src/llama.cpp/src/llama-vocab.cpp +1721 -0
- package/src/llama.cpp/src/llama-vocab.h +130 -0
- package/src/llama.cpp/{llama.cpp → src/llama.cpp} +5979 -5260
- package/src/llama.cpp/{unicode-data.cpp → src/unicode-data.cpp} +851 -802
- package/src/llama.cpp/{unicode.cpp → src/unicode.cpp} +52 -30
- package/src/llama.cpp/{unicode.h → src/unicode.h} +5 -1
- package/src/llama.cpp/tests/CMakeLists.txt +19 -20
- package/src/llama.cpp/tests/test-backend-ops.cpp +245 -67
- package/src/llama.cpp/tests/test-chat-template.cpp +57 -3
- package/src/llama.cpp/tests/test-double-float.cpp +2 -2
- package/src/llama.cpp/tests/test-grad0.cpp +2 -2
- package/src/llama.cpp/tests/test-grammar-integration.cpp +978 -31
- package/src/llama.cpp/tests/test-grammar-parser.cpp +423 -158
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +508 -135
- package/src/llama.cpp/tests/test-llama-grammar.cpp +15 -9
- package/src/llama.cpp/tests/test-quantize-fns.cpp +1 -1
- package/src/llama.cpp/tests/test-quantize-perf.cpp +1 -1
- package/src/llama.cpp/tests/test-rope.cpp +3 -4
- package/src/llama.cpp/tests/test-sampling.cpp +5 -5
- package/src/llama.cpp/tests/test-tokenizer-0.cpp +6 -6
- package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +20 -15
- package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +22 -11
- package/bin/darwin/arm64/default.metallib +0 -0
- package/bin/darwin/x64/default.metallib +0 -0
- package/src/llama.cpp/examples/beam-search/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/beam-search/beam-search.cpp +0 -188
- package/src/llama.cpp/examples/finetune/finetune.cpp +0 -1862
- package/src/llama.cpp/examples/llama.android/llama/CMakeLists.txt +0 -55
- package/src/llama.cpp/examples/train-text-from-scratch/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/train-text-from-scratch/train-text-from-scratch.cpp +0 -1253
- package/src/llama.cpp/ggml-opencl.cpp +0 -2305
- package/src/llama.cpp/ggml-opencl.h +0 -36
- package/src/llama.cpp/ggml-sycl.cpp +0 -17340
- package/src/llama.cpp/ggml-vulkan-shaders.hpp +0 -81211
- package/src/llama.cpp/requirements/requirements-convert-hf-to-gguf-update.txt +0 -2
- package/src/llama.cpp/requirements/requirements-convert-hf-to-gguf.txt +0 -2
- package/src/llama.cpp/requirements/requirements-convert-llama-ggml-to-gguf.txt +0 -1
- package/src/llama.cpp/scripts/gen-build-info-cpp.cmake +0 -24
- /package/src/llama.cpp/{ggml-alloc.h → ggml/include/ggml-alloc.h} +0 -0
- /package/src/llama.cpp/{ggml-kompute.h → ggml/include/ggml-kompute.h} +0 -0
- /package/src/llama.cpp/{ggml-rpc.h → ggml/include/ggml-rpc.h} +0 -0
- /package/src/llama.cpp/{ggml-vulkan.h → ggml/include/ggml-vulkan.h} +0 -0
- /package/src/llama.cpp/{unicode-data.h → src/unicode-data.h} +0 -0
package/src/llama.cpp/examples/cvector-generator/pca.hpp
@@ -0,0 +1,325 @@
+#include "common.h"
+#include "llama.h"
+#include "ggml.h"
+
+#ifdef GGML_USE_CUDA
+#include "ggml-cuda.h"
+#endif
+
+#ifdef GGML_USE_METAL
+#include "ggml-metal.h"
+#endif
+
+#include <cstdio>
+#include <ctime>
+#include <string>
+#include <tuple>
+#include <vector>
+#include <algorithm>
+#include <iostream>
+#include <fstream>
+
+#define DEBUG_POS 5
+
+static void print_debug_tensor(struct ggml_tensor * t, bool with_data = true) {
+    printf("%s: %s (%s): [%d, %d]\n", __func__, t->name, ggml_type_name(t->type), (int) t->ne[0], (int) t->ne[1]);
+    if (!with_data) return;
+    printf("%s: %s[0] = [", __func__, t->name);
+    for (size_t i = 0; i <= DEBUG_POS; i++) {
+        printf(" %f,", ggml_get_f32_nd(t, i, 0, 0, 0));
+    }
+    printf(" ... ]\n");
+}
+
+namespace PCA {
+
+// input params for PCA computations
+struct pca_params {
+    int n_threads = 1;
+    int n_batch = 20; // number of iterations do to in one batch. larger the batch, more memory is used
+    int n_iterations = 1000;
+    float tolerance = 1e-7;
+
+    // for debugging
+    int i_layer = 0;
+    int n_layers = 0;
+};
+
+// result from each iteration
+struct pca_result {
+    struct ggml_tensor * calculated_square = NULL;
+    std::vector<struct ggml_tensor *> eigenvectors;
+    std::vector<float> distances;
+};
+
+struct pca_model {
+    ggml_backend_t backend = NULL;
+    ggml_backend_buffer_t buffer;
+    struct ggml_context * ctx; // context to compute graph on target device
+    struct ggml_context * ctx_host; // host context to store results
+
+    // tensors on target device
+    struct ggml_tensor * dev_input;
+    struct ggml_tensor * dev_square;
+    struct ggml_tensor * dev_eigenvector;
+
+    pca_model(struct ggml_tensor * t_input) {
+#ifdef GGML_USE_CUDA
+        fprintf(stderr, "%s: using CUDA backend\n", __func__);
+        backend = ggml_backend_cuda_init(0); // init device 0
+        if (!backend) {
+            fprintf(stderr, "%s: ggml_backend_cuda_init() failed\n", __func__);
+        }
+#endif
+
+        // TODO: enable Metal support when support for GGML_OP_SQRT is added
+        // #ifdef GGML_USE_METAL
+        //     fprintf(stderr, "%s: using Metal backend\n", __func__);
+        //     backend = ggml_backend_metal_init();
+        //     if (!backend) {
+        //         fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__);
+        //     }
+        // #endif
+
+        // if there aren't GPU Backends fallback to CPU backend
+        if (!backend) {
+            backend = ggml_backend_cpu_init();
+        }
+
+        const int num_tensors = 4;
+        struct ggml_init_params params {
+            /*.mem_size =*/ ggml_tensor_overhead() * num_tensors,
+            /*.mem_buffer =*/ NULL,
+            /*.no_alloc =*/ true,
+        };
+        ctx = ggml_init(params);
+
+        auto n_samples = t_input->ne[0];
+        auto n_embd = t_input->ne[1];
+
+        dev_input = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_samples, n_embd);
+        dev_square = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd);
+        dev_eigenvector = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
+
+        ggml_set_name(dev_input, "dev_input");
+        ggml_set_name(dev_square, "dev_square");
+        ggml_set_name(dev_eigenvector, "dev_eigenvector");
+        buffer = ggml_backend_alloc_ctx_tensors(ctx, backend);
+        ggml_backend_tensor_set(dev_input, t_input->data, 0, ggml_nbytes(t_input));
+
+        // initialize eigenvector to random normalized vector
+        {
+            std::vector<float> random_vec(ggml_nelements(dev_eigenvector), 0.0);
+            std::default_random_engine generator(static_cast<unsigned int>(std::time(0)));
+            std::uniform_real_distribution<float> distribution(0.0, 1.0);
+            float sum_sqr = 0.0; // for normalizing random_vec
+            for (size_t i = 0; i < random_vec.size(); ++i) {
+                float f = distribution(generator);
+                sum_sqr += f * f;
+                random_vec[i] = f;
+            }
+            // normalize it
+            float random_vec_norm = std::sqrt(sum_sqr);
+            for (size_t i = 0; i < random_vec.size(); ++i) {
+                random_vec[i] /= random_vec_norm;
+            }
+            ggml_backend_tensor_set(dev_eigenvector, random_vec.data(), 0, ggml_nbytes(dev_eigenvector));
+        }
+    }
+
+    ~pca_model() {
+        ggml_free(ctx);
+        ggml_backend_buffer_free(buffer);
+        ggml_backend_free(backend);
+    }
+};
+
+static struct ggml_cgraph * build_graph_piter(
+        const struct pca_params & params,
+        const pca_model & model,
+        bool calc_square = false) {
+    GGML_ASSERT(params.n_batch > 0);
+    // TODO: buf_size must be able to scale with params.n_batch
+    static size_t buf_size = ggml_tensor_overhead()*GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead();
+    static std::vector<uint8_t> buf(buf_size);
+
+    struct ggml_init_params params0 = {
+        /*.mem_size =*/ buf_size,
+        /*.mem_buffer =*/ buf.data(),
+        /*.no_alloc =*/ true, // the tensors will be allocated later by ggml_allocr_alloc_graph()
+    };
+    // create a temporally context to build the graph
+    struct ggml_context * ctx0 = ggml_init(params0);
+    struct ggml_cgraph * gf = ggml_new_graph(ctx0);
+
+    // turn v_diff_original into square matrix if needed
+    struct ggml_tensor * tmp_square;
+    if (calc_square) {
+        tmp_square = ggml_mul_mat(ctx0, model.dev_input, model.dev_input);
+        ggml_set_name(tmp_square, "tmp_square");
+    }
+
+    struct ggml_tensor * b_tensor;
+    struct ggml_tensor * distance;
+    struct ggml_tensor * old_eigen = model.dev_eigenvector;
+    struct ggml_tensor * input_square = calc_square ? tmp_square : model.dev_square;
+
+    for (int i = 0; i < params.n_batch; ++i) {
+        // b_tensor = square * eigenvector^T
+        b_tensor = ggml_mul_mat(ctx0, input_square, old_eigen);
+        ggml_set_name(b_tensor, "b_tensor");
+
+        // normalize
+        b_tensor = ggml_div_inplace(ctx0,
+            b_tensor,
+            ggml_sqrt_inplace(ctx0, ggml_sum_rows(ctx0, ggml_sqr(ctx0, b_tensor)))
+        );
+        ggml_format_name(b_tensor, "b_tensor_norm_%d", i);
+
+        // calculate distance(new eigenvector - old eigenvector)
+        // we don't use ggml_sub because it may not be implemented on GPU backend
+        struct ggml_tensor * new_sub_old = ggml_add(ctx0, old_eigen, ggml_scale(ctx0, b_tensor, -1));
+        distance = ggml_sqrt_inplace(ctx0,
+            ggml_sum_rows(ctx0, ggml_sqr_inplace(ctx0, new_sub_old)));
+        ggml_format_name(distance, "distance_%d", i);
+
+        old_eigen = b_tensor;
+
+        // build operations nodes
+        ggml_build_forward_expand(gf, distance);
+    }
+
+    // delete the temporally context used to build the graph
+    ggml_free(ctx0);
+    return gf;
+}
+
+static ggml_status compute_piter(
+        const struct pca_params & params,
+        const pca_model & model,
+        struct ggml_cgraph * gf,
+        ggml_gallocr_t allocr,
+        struct pca_result & result) {
+    // allocate tensors
+    ggml_gallocr_alloc_graph(allocr, gf);
+
+    if (ggml_backend_is_cpu(model.backend)) {
+        ggml_backend_cpu_set_n_threads(model.backend, params.n_threads);
+    }
+
+    // TODO: enable GPU support when support for GGML_OP_SQRT is added
+    //#ifdef GGML_USE_METAL
+    //    if (ggml_backend_is_metal(model.backend)) {
+    //        ggml_backend_metal_set_n_cb(model.backend, params.n_threads);
+    //    }
+    //#endif
+
+    ggml_status res = ggml_backend_graph_compute(model.backend, gf);
+    if (res == GGML_STATUS_SUCCESS) {
+        auto extract_i = [](std::string prefix, std::string str) -> int {
+            int i = -1;
+            if (str.rfind(prefix, 0) == 0) {
+                sscanf(str.c_str(), (prefix + "%d").c_str(), &i);
+            }
+            return i;
+        };
+        result.calculated_square = NULL;
+        result.eigenvectors.clear();
+        result.distances.clear();
+        result.eigenvectors.resize(params.n_batch);
+        result.distances.resize(params.n_batch);
+        // get output nodes
+        for (int i = 0; i < gf->n_nodes; ++i) {
+            auto node = gf->nodes[i];
+            int iter = -1;
+            // find b_tensor (without copying data from device)
+            if ((iter = extract_i("b_tensor_norm_", node->name)) > -1) {
+                result.eigenvectors[iter] = node;
+            }
+            // find distances, then copy data from device
+            if ((iter = extract_i("distance_", node->name)) > -1) {
+                float d;
+                ggml_backend_tensor_get(node, &d, 0, sizeof(float));
+                result.distances[iter] = d;
+                // std::cout << node->name << " = " << d << "\n";
+            }
+            // find tmp_square if it exists (without copying data from device)
+            if (std::string(node->name) == "tmp_square") {
+                result.calculated_square = node;
+            }
+        }
+    }
+    return res;
+}
+
+static void power_iteration(
+        const struct pca_params & params,
+        struct ggml_tensor * input, // shape of input: [n_samples, n_embd]
+        struct ggml_tensor * output) {
+    //printf("in power iteration\n");
+    struct pca_model model(input);
+
+    ggml_gallocr_t allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(model.backend));
+    struct pca_result result;
+    struct ggml_tensor * last_eigenvector = NULL;
+
+    int n_iters = params.n_iterations / params.n_batch; // more batch, fewer iterations
+    for (int iter = 0; iter < n_iters; ++iter) {
+        bool calc_square = (iter == 0); // only need to calculate square for first iteration
+        struct ggml_cgraph * gf = build_graph_piter(params, model, calc_square);
+        // ggml_graph_dump_dot(gf, nullptr, "/tmp/_cgraph.dot");
+        compute_piter(params, model, gf, allocr, result);
+
+        for (size_t k = 0; k < result.distances.size(); ++k) {
+            last_eigenvector = result.eigenvectors[k];
+            if (result.distances[k] < params.tolerance) {
+                break; // done
+            }
+        }
+
+        if (calc_square) {
+            // copy and store the square matrix if needed
+            GGML_ASSERT(result.calculated_square != NULL);
+            ggml_backend_tensor_copy(result.calculated_square, model.dev_square);
+        }
+
+        {
+            // copy last eigen vector and store as input for next iteration
+            GGML_ASSERT(last_eigenvector != NULL);
+            ggml_backend_tensor_copy(last_eigenvector, model.dev_eigenvector);
+        }
+
+        printf("%s: layer %d/%d, iteration: %d / total: %d (batch = %d) ...\n",
+            __func__, params.i_layer+1, params.n_layers, iter+1, n_iters, params.n_batch);
+    }
+
+    // get output tensor
+    GGML_ASSERT(last_eigenvector);
+    ggml_backend_tensor_get(last_eigenvector, output->data, 0, ggml_nbytes(last_eigenvector));
+    //print_debug_tensor(output);
+    ggml_gallocr_free(allocr);
+
+    // TODO @ngxson : The output vector is randomly inverted
+    // Solution: https://github.com/ggerganov/llama.cpp/pull/8069#issuecomment-2185328171
+}
+
+static void run_pca(
+        struct pca_params & params,
+        const std::vector<struct ggml_tensor *> & v_input, // shape of v_input[0]: [n_samples, n_embd]
+        const std::vector<struct ggml_tensor *> & v_output) {
+    printf("%s: Running PCA...\n", __func__);
+    for (size_t il = 0; il < v_input.size(); ++il) {
+
+        // prepare output vector
+        struct ggml_tensor * ctrl_out = v_output[il];
+        ggml_format_name(ctrl_out, "direction.%ld", il+1);
+
+        // run power_iteration
+        params.i_layer = il;
+        params.n_layers = v_input.size();
+        power_iteration(params, v_input[il], ctrl_out);
+        printf("%s: Done layer %d / %d\n", __func__, (int) il+1, (int) v_input.size());
+    }
+}
+
+}
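For orientation (illustrative, not part of the package diff): the new pca.hpp extracts one principal direction per layer by power iteration — repeatedly multiply the current eigenvector estimate by the "square" matrix built from the input, renormalize, and stop once the distance between successive estimates drops below `tolerance`. A minimal standalone sketch of the same loop in plain C++ (toy 2x2 matrix, no ggml):

```cpp
// Power iteration on a small dense matrix, mirroring the loop in pca.hpp:
// b = normalize(A * b), stop when ||b_new - b_old|| < tolerance.
#include <cmath>
#include <cstdio>
#include <vector>

int main() {
    const std::vector<std::vector<double>> A = {{4.0, 2.0}, {2.0, 3.0}}; // symmetric toy matrix
    std::vector<double> b = {1.0, 0.0};   // initial guess (random in pca.hpp)
    const double tolerance = 1e-7;
    const int n_iterations = 1000;

    for (int iter = 0; iter < n_iterations; ++iter) {
        // b_new = A * b
        std::vector<double> b_new(b.size(), 0.0);
        for (size_t i = 0; i < A.size(); ++i) {
            for (size_t j = 0; j < b.size(); ++j) {
                b_new[i] += A[i][j] * b[j];
            }
        }
        // normalize
        double norm = 0.0;
        for (double v : b_new) norm += v * v;
        norm = std::sqrt(norm);
        for (double & v : b_new) v /= norm;
        // distance between successive estimates (the "distance_%d" nodes in pca.hpp)
        double dist = 0.0;
        for (size_t i = 0; i < b.size(); ++i) dist += (b_new[i] - b[i]) * (b_new[i] - b[i]);
        b = b_new;
        if (std::sqrt(dist) < tolerance) break;
    }
    printf("dominant eigenvector ~ [%f, %f]\n", b[0], b[1]); // ~[0.788, 0.615]
    return 0;
}
```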
package/src/llama.cpp/examples/cvector-generator/positive.txt
@@ -0,0 +1,4 @@
+<|start_header_id|>system<|end_header_id|>\n\nAct like a person who is extremely happy<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWho are you?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nI'm the happiest person in this world
+<|start_header_id|>system<|end_header_id|>\n\nAct like a person who is extremely happy<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nHello<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nHello, I'm having the best day ever!
+<|start_header_id|>system<|end_header_id|>\n\nYou are in a very good mood<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nHi<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nHi, I'm very excited to meet you
+<|start_header_id|>system<|end_header_id|>\n\nYou are the happiest person<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWhat are you feeling?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nEverything is just perfect right now!
package/src/llama.cpp/examples/deprecation-warning/deprecation-warning.cpp
@@ -0,0 +1,35 @@
+// Warns users that this filename was deprecated, and provides a link for more information.
+
+#include <cstdio>
+#include <string>
+#include <unordered_map>
+
+// Main
+int main(int argc, char** argv) {
+    std::string filename = "main";
+    if (argc >= 1) {
+        filename = argv[0];
+    }
+
+    // Get only the program name from the full path
+    auto pos = filename.find_last_of('/');
+    if (pos != std::string::npos) {
+        filename = filename.substr(pos+1);
+    }
+
+    // Append "llama-" to the beginning of filename to get the replacemnt filename
+    auto replacement_filename = "llama-" + filename;
+
+    // The exception is if the filename is "main", then our replacement filename is "llama-cli"
+    if (filename == "main") {
+        replacement_filename = "llama-cli";
+    }
+
+    fprintf(stdout, "\n");
+    fprintf(stdout, "WARNING: The binary '%s' is deprecated.\n", filename.c_str());
+    fprintf(stdout, " Please use '%s' instead.\n", replacement_filename.c_str());
+    fprintf(stdout, " See https://github.com/ggerganov/llama.cpp/tree/master/examples/deprecation-warning/README.md for more information.\n");
+    fprintf(stdout, "\n");
+
+    return EXIT_FAILURE;
+}
package/src/llama.cpp/examples/embedding/embedding.cpp
@@ -7,23 +7,30 @@
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif
 
-static std::vector<std::string> split_lines(const std::string & s) {
-    std::string line;
+static std::vector<std::string> split_lines(const std::string & s, const std::string & separator = "\n") {
     std::vector<std::string> lines;
-
-
-
+    size_t start = 0;
+    size_t end = s.find(separator);
+
+    while (end != std::string::npos) {
+        lines.push_back(s.substr(start, end - start));
+        start = end + separator.length();
+        end = s.find(separator, start);
     }
+
+    lines.push_back(s.substr(start)); // Add the last part
+
     return lines;
 }
 
-static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & tokens,
-
-
+static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & tokens, llama_seq_id seq_id) {
+    size_t n_tokens = tokens.size();
+    for (size_t i = 0; i < n_tokens; i++) {
+        llama_batch_add(batch, tokens[i], i, { seq_id }, true);
     }
 }
 
-static void batch_decode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd) {
+static void batch_decode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd, int embd_norm) {
     // clear previous kv_cache values (irrelevant for embeddings)
     llama_kv_cache_clear(ctx);
 
@@ -40,22 +47,10 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
 
         // try to get sequence embeddings - supported only when pooling_type is not NONE
         const float * embd = llama_get_embeddings_seq(ctx, batch.seq_id[i][0]);
-
-            embd = llama_get_embeddings_ith(ctx, i);
-            if (embd == NULL) {
-                fprintf(stderr, "%s: failed to get embeddings for token %d\n", __func__, i);
-                continue;
-            }
-        }
+        GGML_ASSERT(embd != NULL && "failed to get sequence embeddings");
 
         float * out = output + batch.seq_id[i][0] * n_embd;
-
-        /*fprintf(stdout, "unnormalized_embedding:");
-        for (int hh = 0; hh < n_embd; hh++) {
-            fprintf(stdout, "%9.6f ", embd[hh]);
-        }
-        fprintf(stdout, "\n");*/
-        llama_embd_normalize(embd, out, n_embd);
+        llama_embd_normalize(embd, out, n_embd, embd_norm);
     }
 }
 
@@ -63,6 +58,7 @@ int main(int argc, char ** argv) {
     gpt_params params;
 
     if (!gpt_params_parse(argc, argv, params)) {
+        gpt_params_print_usage(argc, argv, params);
         return 1;
     }
 
@@ -79,9 +75,6 @@ int main(int argc, char ** argv) {
     fprintf(stderr, "%s: seed = %u\n", __func__, params.seed);
 
     std::mt19937 rng(params.seed);
-    if (params.random_prompt) {
-        params.prompt = string_random_prompt(rng);
-    }
 
     llama_backend_init();
     llama_numa_init(params.numa);
@@ -99,6 +92,12 @@ int main(int argc, char ** argv) {
     const int n_ctx_train = llama_n_ctx_train(model);
     const int n_ctx = llama_n_ctx(ctx);
 
+    const enum llama_pooling_type pooling_type = llama_pooling_type(ctx);
+    if (pooling_type == LLAMA_POOLING_TYPE_NONE) {
+        fprintf(stderr, "%s: error: pooling type NONE not supported\n", __func__);
+        return 1;
+    }
+
     if (n_ctx > n_ctx_train) {
         fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n",
             __func__, n_ctx_train, n_ctx);
@@ -111,7 +110,7 @@ int main(int argc, char ** argv) {
     }
 
     // split the prompt into lines
-    std::vector<std::string> prompts = split_lines(params.prompt);
+    std::vector<std::string> prompts = split_lines(params.prompt, params.embd_sep);
 
     // max batch size
     const uint64_t n_batch = params.n_batch;
@@ -171,7 +170,7 @@ int main(int argc, char ** argv) {
         // encode if at capacity
         if (batch.n_tokens + n_toks > n_batch) {
             float * out = emb + p * n_embd;
-            batch_decode(ctx, batch, out, s, n_embd);
+            batch_decode(ctx, batch, out, s, n_embd, params.embd_normalize);
             llama_batch_clear(batch);
             p += s;
             s = 0;
@@ -184,29 +183,78 @@ int main(int argc, char ** argv) {
 
     // final batch
     float * out = emb + p * n_embd;
-    batch_decode(ctx, batch, out, s, n_embd);
-
-    // print the first part of the embeddings or for a single prompt, the full embedding
-    fprintf(stdout, "\n");
-    for (int j = 0; j < n_prompts; j++) {
-        fprintf(stdout, "embedding %d: ", j);
-        for (int i = 0; i < (n_prompts > 1 ? std::min(16, n_embd) : n_embd); i++) {
-            fprintf(stdout, "%9.6f ", emb[j * n_embd + i]);
-        }
-        fprintf(stdout, "\n");
-    }
+    batch_decode(ctx, batch, out, s, n_embd, params.embd_normalize);
 
-
-
+    if (params.embd_out.empty()) {
+        // print the first part of the embeddings or for a single prompt, the full embedding
         fprintf(stdout, "\n");
-
-
-        for (int
-
-
+        for (int j = 0; j < n_prompts; j++) {
+            fprintf(stdout, "embedding %d: ", j);
+            for (int i = 0; i < (n_prompts > 1 ? std::min(16, n_embd) : n_embd); i++) {
+                if (params.embd_normalize == 0) {
+                    fprintf(stdout, "%6.0f ", emb[j * n_embd + i]);
+                } else {
+                    fprintf(stdout, "%9.6f ", emb[j * n_embd + i]);
+                }
             }
             fprintf(stdout, "\n");
         }
+
+        // print cosine similarity matrix
+        if (n_prompts > 1) {
+            fprintf(stdout, "\n");
+            printf("cosine similarity matrix:\n\n");
+            for (int i = 0; i < n_prompts; i++) {
+                fprintf(stdout, "%6.6s ", prompts[i].c_str());
+            }
+            fprintf(stdout, "\n");
+            for (int i = 0; i < n_prompts; i++) {
+                for (int j = 0; j < n_prompts; j++) {
+                    float sim = llama_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd);
+                    fprintf(stdout, "%6.2f ", sim);
+                }
+                fprintf(stdout, "%1.10s", prompts[i].c_str());
+                fprintf(stdout, "\n");
+            }
+        }
+    }
+
+    if (params.embd_out == "json" || params.embd_out == "json+" || params.embd_out == "array") {
+        const bool notArray = params.embd_out != "array";
+
+        fprintf(stdout, notArray ? "{\n \"object\": \"list\",\n \"data\": [\n" : "[");
+        for (int j = 0;;) { // at least one iteration (one prompt)
+            if (notArray) fprintf(stdout, " {\n \"object\": \"embedding\",\n \"index\": %d,\n \"embedding\": ",j);
+            fprintf(stdout, "[");
+            for (int i = 0;;) { // at least one iteration (n_embd > 0)
+                fprintf(stdout, params.embd_normalize == 0 ? "%1.0f" : "%1.7f", emb[j * n_embd + i]);
+                i++;
+                if (i < n_embd) fprintf(stdout, ","); else break;
+            }
+            fprintf(stdout, notArray ? "]\n }" : "]");
+            j++;
+            if (j < n_prompts) fprintf(stdout, notArray ? ",\n" : ","); else break;
+        }
+        fprintf(stdout, notArray ? "\n ]" : "]\n");
+
+        if (params.embd_out == "json+" && n_prompts > 1) {
+            fprintf(stdout, ",\n \"cosineSimilarity\": [\n");
+            for (int i = 0;;) { // at least two iteration (n_prompts > 1)
+                fprintf(stdout, " [");
+                for (int j = 0;;) { // at least two iteration (n_prompts > 1)
+                    float sim = llama_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd);
+                    fprintf(stdout, "%6.2f", sim);
+                    j++;
+                    if (j < n_prompts) fprintf(stdout, ", "); else break;
+                }
+                fprintf(stdout, " ]");
+                i++;
+                if (i < n_prompts) fprintf(stdout, ",\n"); else break;
+            }
+            fprintf(stdout, "\n ]");
+        }
+
+        if (notArray) fprintf(stdout, "\n}\n");
     }
 
     // clean up
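For orientation (illustrative, not part of the package diff): the embedding example now splits the prompt on a configurable separator (`params.embd_sep`) instead of hard-coded newlines, passes an `embd_normalize` mode into `batch_decode`, and supports `embd_out` formats ("json", "json+", "array"). A quick standalone check of the new `split_lines` behavior — the function body is copied from the hunk above; the driver and the "<#sep#>" separator string are only an example:

```cpp
#include <cstdio>
#include <string>
#include <vector>

// same logic as the split_lines added in the diff above
static std::vector<std::string> split_lines(const std::string & s, const std::string & separator = "\n") {
    std::vector<std::string> lines;
    size_t start = 0;
    size_t end = s.find(separator);

    while (end != std::string::npos) {
        lines.push_back(s.substr(start, end - start));
        start = end + separator.length();
        end = s.find(separator, start);
    }

    lines.push_back(s.substr(start)); // Add the last part

    return lines;
}

int main() {
    // a multi-character separator keeps prompts that themselves contain newlines intact
    for (const auto & prompt : split_lines("first prompt<#sep#>second prompt", "<#sep#>")) {
        printf("[%s]\n", prompt.c_str()); // prints [first prompt] then [second prompt]
    }
    return 0;
}
```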
package/src/llama.cpp/examples/eval-callback/CMakeLists.txt
@@ -1,9 +1,9 @@
-set(TARGET eval-callback)
+set(TARGET llama-eval-callback)
 add_executable(${TARGET} eval-callback.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
 
 set(TEST_TARGET test-eval-callback)
-add_test(NAME ${TEST_TARGET} COMMAND eval-callback --hf-repo ggml-org/models --hf-file tinyllamas/stories260K.gguf --model stories260K.gguf --prompt hello --seed 42 -ngl 0)
+add_test(NAME ${TEST_TARGET} COMMAND llama-eval-callback --hf-repo ggml-org/models --hf-file tinyllamas/stories260K.gguf --model stories260K.gguf --prompt hello --seed 42 -ngl 0)
 set_property(TEST ${TEST_TARGET} PROPERTY LABELS eval-callback curl)
package/src/llama.cpp/examples/eval-callback/eval-callback.cpp
@@ -62,7 +62,7 @@ static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne
         } else if (type == GGML_TYPE_I8) {
             v = (float) *(int8_t *) &data[i];
         } else {
-
+            GGML_ABORT("fatal error");
         }
         printf("%12.4f", v);
         sum += v;
@@ -99,7 +99,7 @@ static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) {
 
     char src1_str[128] = {0};
     if (src1) {
-
+        snprintf(src1_str, sizeof(src1_str), "%s{%s}", src1->name, ggml_ne_string(src1).c_str());
     }
 
     printf("%s: %24s = (%s) %10s(%s{%s}, %s}) = {%s}\n", __func__,
@@ -140,20 +140,18 @@ static bool run(llama_context * ctx, const gpt_params & params) {
 }
 
 int main(int argc, char ** argv) {
-
     callback_data cb_data;
 
     gpt_params params;
+
     if (!gpt_params_parse(argc, argv, params)) {
+        gpt_params_print_usage(argc, argv, params);
         return 1;
     }
 
     print_build_info();
 
     std::mt19937 rng(params.seed);
-    if (params.random_prompt) {
-        params.prompt = string_random_prompt(rng);
-    }
 
     llama_backend_init();
     llama_numa_init(params.numa);