@fugood/llama.node 0.2.2 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +5 -2
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +8 -1
- package/package.json +1 -1
- package/patches/llama.patch +12 -12
- package/src/DetokenizeWorker.cpp +1 -1
- package/src/LlamaContext.cpp +33 -1
- package/src/LlamaContext.h +1 -0
- package/src/LoadSessionWorker.cpp +1 -0
- package/src/llama.cpp/.github/workflows/bench.yml +310 -0
- package/src/llama.cpp/.github/workflows/build.yml +1315 -0
- package/src/llama.cpp/.github/workflows/close-issue.yml +23 -0
- package/src/llama.cpp/.github/workflows/docker.yml +116 -0
- package/src/llama.cpp/.github/workflows/editorconfig.yml +27 -0
- package/src/llama.cpp/.github/workflows/gguf-publish.yml +44 -0
- package/src/llama.cpp/.github/workflows/labeler.yml +17 -0
- package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +65 -0
- package/src/llama.cpp/.github/workflows/nix-ci.yml +72 -0
- package/src/llama.cpp/.github/workflows/nix-flake-update.yml +22 -0
- package/src/llama.cpp/.github/workflows/nix-publish-flake.yml +36 -0
- package/src/llama.cpp/.github/workflows/python-check-requirements.yml +35 -0
- package/src/llama.cpp/.github/workflows/python-lint.yml +23 -0
- package/src/llama.cpp/.github/workflows/python-type-check.yml +38 -0
- package/src/llama.cpp/.github/workflows/server.yml +183 -0
- package/src/llama.cpp/CMakeLists.txt +91 -1245
- package/src/llama.cpp/cmake/arm64-windows-llvm.cmake +1 -1
- package/src/llama.cpp/cmake/build-info.cmake +58 -0
- package/src/llama.cpp/cmake/git-vars.cmake +22 -0
- package/src/llama.cpp/common/CMakeLists.txt +4 -3
- package/src/llama.cpp/common/build-info.cpp.in +4 -0
- package/src/llama.cpp/common/common.cpp +1116 -877
- package/src/llama.cpp/common/common.h +191 -77
- package/src/llama.cpp/common/grammar-parser.cpp +118 -31
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +346 -65
- package/src/llama.cpp/common/log.h +1 -1
- package/src/llama.cpp/common/ngram-cache.h +10 -3
- package/src/llama.cpp/common/sampling.cpp +19 -10
- package/src/llama.cpp/docs/build.md +353 -0
- package/src/llama.cpp/examples/CMakeLists.txt +22 -22
- package/src/llama.cpp/examples/baby-llama/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +6 -6
- package/src/llama.cpp/examples/batched/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/batched/batched.cpp +52 -55
- package/src/llama.cpp/examples/batched-bench/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +20 -72
- package/src/llama.cpp/examples/benchmark/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/chat-13B.bat +57 -0
- package/src/llama.cpp/examples/convert-llama2c-to-ggml/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/{finetune → cvector-generator}/CMakeLists.txt +2 -2
- package/src/llama.cpp/examples/cvector-generator/completions.txt +582 -0
- package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +503 -0
- package/src/llama.cpp/examples/cvector-generator/mean.hpp +48 -0
- package/src/llama.cpp/examples/cvector-generator/negative.txt +4 -0
- package/src/llama.cpp/examples/cvector-generator/pca.hpp +325 -0
- package/src/llama.cpp/examples/cvector-generator/positive.txt +4 -0
- package/src/llama.cpp/examples/deprecation-warning/deprecation-warning.cpp +35 -0
- package/src/llama.cpp/examples/embedding/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/embedding/embedding.cpp +94 -46
- package/src/llama.cpp/examples/eval-callback/CMakeLists.txt +2 -2
- package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +4 -6
- package/src/llama.cpp/examples/export-lora/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +344 -386
- package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +2 -2
- package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +30 -25
- package/src/llama.cpp/examples/gguf/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gguf/gguf.cpp +5 -0
- package/src/llama.cpp/examples/gguf-hash/CMakeLists.txt +15 -0
- package/src/llama.cpp/examples/gguf-hash/deps/rotate-bits/rotate-bits.h +46 -0
- package/src/llama.cpp/examples/gguf-hash/deps/sha1/sha1.c +295 -0
- package/src/llama.cpp/examples/gguf-hash/deps/sha1/sha1.h +52 -0
- package/src/llama.cpp/examples/gguf-hash/deps/sha256/sha256.c +221 -0
- package/src/llama.cpp/examples/gguf-hash/deps/sha256/sha256.h +24 -0
- package/src/llama.cpp/examples/gguf-hash/deps/xxhash/xxhash.c +42 -0
- package/src/llama.cpp/examples/gguf-hash/deps/xxhash/xxhash.h +7093 -0
- package/src/llama.cpp/examples/gguf-hash/gguf-hash.cpp +693 -0
- package/src/llama.cpp/examples/gguf-split/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +3 -3
- package/src/llama.cpp/examples/gritlm/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +6 -2
- package/src/llama.cpp/examples/imatrix/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +137 -176
- package/src/llama.cpp/examples/infill/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/infill/infill.cpp +38 -153
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +175 -94
- package/src/llama.cpp/examples/llama.android/app/build.gradle.kts +65 -0
- package/src/llama.cpp/examples/llama.android/build.gradle.kts +6 -0
- package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +68 -0
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/CMakeLists.txt +11 -7
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +2 -2
- package/src/llama.cpp/examples/llama.android/settings.gradle.kts +18 -0
- package/src/llama.cpp/examples/llava/CMakeLists.txt +6 -5
- package/src/llama.cpp/examples/llava/android/build_64.sh +8 -0
- package/src/llama.cpp/examples/llava/clip.cpp +23 -14
- package/src/llama.cpp/examples/llava/llava-cli.cpp +8 -6
- package/src/llama.cpp/examples/llava/requirements.txt +3 -2
- package/src/llama.cpp/examples/lookahead/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +2 -1
- package/src/llama.cpp/examples/lookup/CMakeLists.txt +4 -4
- package/src/llama.cpp/examples/lookup/lookup-create.cpp +2 -0
- package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
- package/src/llama.cpp/examples/lookup/lookup-stats.cpp +2 -2
- package/src/llama.cpp/examples/lookup/lookup.cpp +1 -1
- package/src/llama.cpp/examples/main/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/main/main.cpp +98 -75
- package/src/llama.cpp/examples/main-cmake-pkg/CMakeLists.txt +4 -5
- package/src/llama.cpp/examples/parallel/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/parallel/parallel.cpp +2 -1
- package/src/llama.cpp/examples/passkey/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/passkey/passkey.cpp +23 -43
- package/src/llama.cpp/examples/perplexity/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +13 -10
- package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/quantize/quantize.cpp +37 -34
- package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +1 -1
- package/src/llama.cpp/examples/retrieval/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/retrieval/retrieval.cpp +26 -77
- package/src/llama.cpp/examples/save-load-state/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +14 -7
- package/src/llama.cpp/examples/server/CMakeLists.txt +26 -2
- package/src/llama.cpp/examples/server/server.cpp +274 -671
- package/src/llama.cpp/examples/server/tests/requirements.txt +2 -2
- package/src/llama.cpp/examples/server/utils.hpp +28 -29
- package/src/llama.cpp/examples/simple/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/simple/simple.cpp +21 -29
- package/src/llama.cpp/examples/speculative/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/speculative/speculative.cpp +2 -1
- package/src/llama.cpp/examples/sycl/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/sycl/build.sh +23 -0
- package/src/llama.cpp/examples/sycl/run-llama2.sh +36 -0
- package/src/llama.cpp/examples/sycl/win-build-sycl.bat +33 -0
- package/src/llama.cpp/examples/sycl/win-run-llama2.bat +9 -0
- package/src/llama.cpp/examples/tokenize/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/tokenize/tokenize.cpp +16 -2
- package/src/llama.cpp/ggml/CMakeLists.txt +253 -0
- package/src/llama.cpp/{cmake → ggml/cmake}/FindSIMD.cmake +6 -6
- package/src/llama.cpp/{ggml-backend.h → ggml/include/ggml-backend.h} +22 -17
- package/src/llama.cpp/ggml/include/ggml-blas.h +23 -0
- package/src/llama.cpp/ggml/include/ggml-cann.h +125 -0
- package/src/llama.cpp/{ggml-cuda.h → ggml/include/ggml-cuda.h} +3 -0
- package/src/llama.cpp/{ggml-metal.h → ggml/include/ggml-metal.h} +1 -2
- package/src/llama.cpp/{ggml-sycl.h → ggml/include/ggml-sycl.h} +3 -10
- package/src/llama.cpp/{ggml.h → ggml/include/ggml.h} +80 -85
- package/src/llama.cpp/ggml/src/CMakeLists.txt +1329 -0
- package/src/llama.cpp/ggml/src/ggml-aarch64.c +2193 -0
- package/src/llama.cpp/ggml/src/ggml-aarch64.h +39 -0
- package/src/llama.cpp/{ggml-alloc.c → ggml/src/ggml-alloc.c} +100 -49
- package/src/llama.cpp/{ggml-backend-impl.h → ggml/src/ggml-backend-impl.h} +20 -8
- package/src/llama.cpp/{ggml-backend.c → ggml/src/ggml-backend.c} +307 -167
- package/src/llama.cpp/ggml/src/ggml-blas.cpp +367 -0
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +198 -0
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +230 -0
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +2944 -0
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +592 -0
- package/src/llama.cpp/ggml/src/ggml-cann/common.h +282 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +32 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +17 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +223 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +186 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +180 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +193 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +191 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +208 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +206 -0
- package/src/llama.cpp/ggml/src/ggml-cann.cpp +2023 -0
- package/src/llama.cpp/{ggml-common.h → ggml/src/ggml-common.h} +41 -7
- package/src/llama.cpp/{ggml-impl.h → ggml/src/ggml-impl.h} +113 -9
- package/src/llama.cpp/{ggml-kompute.cpp → ggml/src/ggml-kompute.cpp} +33 -18
- package/src/llama.cpp/{ggml-quants.c → ggml/src/ggml-quants.c} +1460 -940
- package/src/llama.cpp/{ggml-quants.h → ggml/src/ggml-quants.h} +19 -20
- package/src/llama.cpp/{ggml-rpc.cpp → ggml/src/ggml-rpc.cpp} +95 -72
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +27 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +53 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +355 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +195 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/concat.hpp +21 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +547 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +27 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +698 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +1023 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.hpp +27 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +3011 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +3031 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/mmq.hpp +33 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +1027 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.hpp +27 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +374 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +35 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +66 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +275 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.hpp +22 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +251 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/softmax.hpp +24 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +1140 -0
- package/src/llama.cpp/ggml/src/ggml-sycl.cpp +5314 -0
- package/src/llama.cpp/{ggml-vulkan.cpp → ggml/src/ggml-vulkan.cpp} +1781 -1868
- package/src/llama.cpp/{ggml.c → ggml/src/ggml.c} +1245 -2087
- package/src/llama.cpp/{sgemm.cpp → ggml/src/llamafile/sgemm.cpp} +21 -24
- package/src/llama.cpp/{sgemm.h → ggml/src/llamafile/sgemm.h} +1 -1
- package/src/llama.cpp/ggml/src/vulkan-shaders/CMakeLists.txt +5 -0
- package/src/llama.cpp/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp +552 -0
- package/src/llama.cpp/{llama.h → include/llama.h} +175 -100
- package/src/llama.cpp/models/.editorconfig +1 -0
- package/src/llama.cpp/models/ggml-vocab-aquila.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-baichuan.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-command-r.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-command-r.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-command-r.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-falcon.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-falcon.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-falcon.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-gpt-neox.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-mpt.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-mpt.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-mpt.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-phi-3.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-phi-3.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-phi-3.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-qwen2.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-qwen2.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-qwen2.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-refact.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-refact.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-refact.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-starcoder.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-starcoder.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-starcoder.gguf.out +46 -0
- package/src/llama.cpp/pocs/vdot/CMakeLists.txt +2 -2
- package/src/llama.cpp/requirements/requirements-all.txt +12 -0
- package/src/llama.cpp/requirements/requirements-compare-llama-bench.txt +2 -0
- package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf.txt +3 -0
- package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf_update.txt +3 -0
- package/src/llama.cpp/requirements/{requirements-convert.txt → requirements-convert_legacy_llama.txt} +1 -1
- package/src/llama.cpp/requirements/requirements-convert_llama_ggml_to_gguf.txt +1 -0
- package/src/llama.cpp/requirements/requirements-convert_lora_to_gguf.txt +2 -0
- package/src/llama.cpp/requirements/requirements-pydantic.txt +3 -0
- package/src/llama.cpp/requirements/requirements-test-tokenizer-random.txt +1 -0
- package/src/llama.cpp/requirements.txt +5 -4
- package/src/llama.cpp/scripts/build-info.sh +30 -0
- package/src/llama.cpp/scripts/install-oneapi.bat +19 -0
- package/src/llama.cpp/src/CMakeLists.txt +33 -0
- package/src/llama.cpp/src/llama-grammar.cpp +539 -0
- package/src/llama.cpp/src/llama-grammar.h +39 -0
- package/src/llama.cpp/src/llama-impl.h +26 -0
- package/src/llama.cpp/src/llama-sampling.cpp +635 -0
- package/src/llama.cpp/src/llama-sampling.h +56 -0
- package/src/llama.cpp/src/llama-vocab.cpp +1721 -0
- package/src/llama.cpp/src/llama-vocab.h +130 -0
- package/src/llama.cpp/{llama.cpp → src/llama.cpp} +5979 -5260
- package/src/llama.cpp/{unicode-data.cpp → src/unicode-data.cpp} +851 -802
- package/src/llama.cpp/{unicode.cpp → src/unicode.cpp} +52 -30
- package/src/llama.cpp/{unicode.h → src/unicode.h} +5 -1
- package/src/llama.cpp/tests/CMakeLists.txt +19 -20
- package/src/llama.cpp/tests/test-backend-ops.cpp +245 -67
- package/src/llama.cpp/tests/test-chat-template.cpp +57 -3
- package/src/llama.cpp/tests/test-double-float.cpp +2 -2
- package/src/llama.cpp/tests/test-grad0.cpp +2 -2
- package/src/llama.cpp/tests/test-grammar-integration.cpp +978 -31
- package/src/llama.cpp/tests/test-grammar-parser.cpp +423 -158
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +508 -135
- package/src/llama.cpp/tests/test-llama-grammar.cpp +15 -9
- package/src/llama.cpp/tests/test-quantize-fns.cpp +1 -1
- package/src/llama.cpp/tests/test-quantize-perf.cpp +1 -1
- package/src/llama.cpp/tests/test-rope.cpp +3 -4
- package/src/llama.cpp/tests/test-sampling.cpp +5 -5
- package/src/llama.cpp/tests/test-tokenizer-0.cpp +6 -6
- package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +20 -15
- package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +22 -11
- package/bin/darwin/arm64/default.metallib +0 -0
- package/bin/darwin/x64/default.metallib +0 -0
- package/src/llama.cpp/examples/beam-search/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/beam-search/beam-search.cpp +0 -188
- package/src/llama.cpp/examples/finetune/finetune.cpp +0 -1862
- package/src/llama.cpp/examples/llama.android/llama/CMakeLists.txt +0 -55
- package/src/llama.cpp/examples/train-text-from-scratch/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/train-text-from-scratch/train-text-from-scratch.cpp +0 -1253
- package/src/llama.cpp/ggml-opencl.cpp +0 -2305
- package/src/llama.cpp/ggml-opencl.h +0 -36
- package/src/llama.cpp/ggml-sycl.cpp +0 -17340
- package/src/llama.cpp/ggml-vulkan-shaders.hpp +0 -81211
- package/src/llama.cpp/requirements/requirements-convert-hf-to-gguf-update.txt +0 -2
- package/src/llama.cpp/requirements/requirements-convert-hf-to-gguf.txt +0 -2
- package/src/llama.cpp/requirements/requirements-convert-llama-ggml-to-gguf.txt +0 -1
- package/src/llama.cpp/scripts/gen-build-info-cpp.cmake +0 -24
- /package/src/llama.cpp/{ggml-alloc.h → ggml/include/ggml-alloc.h} +0 -0
- /package/src/llama.cpp/{ggml-kompute.h → ggml/include/ggml-kompute.h} +0 -0
- /package/src/llama.cpp/{ggml-rpc.h → ggml/include/ggml-rpc.h} +0 -0
- /package/src/llama.cpp/{ggml-vulkan.h → ggml/include/ggml-vulkan.h} +0 -0
- /package/src/llama.cpp/{unicode-data.h → src/unicode-data.h} +0 -0
package/src/llama.cpp/examples/gguf-split/gguf-split.cpp

@@ -61,10 +61,10 @@ static size_t split_str_to_n_bytes(std::string str) {
     int n;
     if (str.back() == 'M') {
         sscanf(str.c_str(), "%d", &n);
-        n_bytes = (size_t)n *
+        n_bytes = (size_t)n * 1000 * 1000; // megabytes
     } else if (str.back() == 'G') {
         sscanf(str.c_str(), "%d", &n);
-        n_bytes = (size_t)n *
+        n_bytes = (size_t)n * 1000 * 1000 * 1000; // gigabytes
     } else {
         throw std::invalid_argument("error: supported units are M (megabytes) or G (gigabytes), but got: " + std::string(1, str.back()));
     }

@@ -284,7 +284,7 @@ struct split_strategy {
                 struct ggml_tensor * t = ggml_get_tensor(ctx_meta, gguf_get_tensor_name(ctx_out, i));
                 total_size += ggml_nbytes(t);
             }
-            total_size = total_size /
+            total_size = total_size / 1000 / 1000; // convert to megabytes
            printf("split %05d: n_tensors = %d, total_size = %ldM\n", i_split + 1, gguf_get_n_tensors(ctx_out), total_size);
            i_split++;
        }
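The two hunks above move gguf-split to decimal size units: a split size of "500M" now means 500 * 1000 * 1000 bytes. A minimal standalone sketch of that parsing rule, for illustration only (parse_split_size is a hypothetical name, not code from the package):

#include <cstddef>
#include <cstdio>
#include <stdexcept>
#include <string>

// Mirrors split_str_to_n_bytes() after the change above:
// the 'M' and 'G' suffixes are decimal (1000-based) multiples.
static size_t parse_split_size(const std::string & str) {
    if (str.empty()) {
        throw std::invalid_argument("empty size string");
    }
    int n = 0;
    if (str.back() == 'M') {
        std::sscanf(str.c_str(), "%d", &n);
        return (size_t)n * 1000 * 1000;        // megabytes
    }
    if (str.back() == 'G') {
        std::sscanf(str.c_str(), "%d", &n);
        return (size_t)n * 1000 * 1000 * 1000; // gigabytes
    }
    throw std::invalid_argument("supported units are M (megabytes) or G (gigabytes)");
}

int main() {
    std::printf("%zu\n", parse_split_size("500M")); // 500000000
    std::printf("%zu\n", parse_split_size("2G"));   // 2000000000
    return 0;
}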
package/src/llama.cpp/examples/gritlm/gritlm.cpp

@@ -44,6 +44,7 @@ static std::vector<std::vector<float>> encode(llama_context * ctx, const std::ve
 
         // clear previous kv_cache values (irrelevant for embeddings)
         llama_kv_cache_clear(ctx);
+        llama_set_embeddings(ctx, true);
         llama_set_causal_attn(ctx, false);
 
         // run model

@@ -98,7 +99,9 @@ static std::string generate(llama_context * ctx, const std::string & prompt, boo
    llama_token eos_token = llama_token_eos(mdl);
 
    llama_kv_cache_clear(ctx);
+   llama_set_embeddings(ctx, false);
    llama_set_causal_attn(ctx, true);
+
    llama_batch bat = llama_batch_init(llama_n_batch(ctx), 0, 1);
 
    std::vector<llama_token> inputs = llama_tokenize(mdl, prompt, false, true);

@@ -153,7 +156,9 @@ static std::string gritlm_instruction(const std::string & instruction) {
 
 int main(int argc, char * argv[]) {
     gpt_params params;
+
     if (!gpt_params_parse(argc, argv, params)) {
+        gpt_params_print_usage(argc, argv, params);
         return 1;
     }
 

@@ -164,8 +169,7 @@ int main(int argc, char * argv[]) {
 
     llama_model * mdl = llama_load_model_from_file(params.model.c_str(), mparams);
 
-    // create
-    cparams.embeddings = true;
+    // create generation context
     llama_context * ctx = llama_new_context_with_model(mdl, cparams);
 
     // ### Embedding/Representation ###
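The gritlm hunks above stop creating the context permanently in embedding mode (the removed cparams.embeddings = true) and instead toggle modes at runtime. A minimal sketch of that pattern, assuming a llama_context created against this vendored llama.cpp revision (the helper names are illustrative, not from the package):

#include "llama.h"

// Switch one llama_context between embedding and generation phases,
// as the updated gritlm example now does.
static void begin_embedding_phase(llama_context * ctx) {
    llama_kv_cache_clear(ctx);         // previous cache is irrelevant for embeddings
    llama_set_embeddings(ctx, true);   // request embedding output
    llama_set_causal_attn(ctx, false); // bidirectional attention for representations
}

static void begin_generation_phase(llama_context * ctx) {
    llama_kv_cache_clear(ctx);
    llama_set_embeddings(ctx, false);  // back to logits
    llama_set_causal_attn(ctx, true);  // causal attention for autoregressive decoding
}

The upside of this design is that a single context (and its KV cache allocation) serves both the embedding and the generation passes, which is exactly what the example's main() relies on after this change.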
package/src/llama.cpp/examples/imatrix/imatrix.cpp

@@ -17,39 +17,37 @@
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif
 
+static void print_usage(int argc, char ** argv, const gpt_params & params) {
+    gpt_params_print_usage(argc, argv, params);
+
+    LOG_TEE("\nexample usage:\n");
+    LOG_TEE("\n %s \\\n"
+            " -m model.gguf -f some-text.txt [-o imatrix.dat] [--process-output] [--verbosity 1] \\\n"
+            " [--no-ppl] [--chunk 123] [--output-frequency 10] [--save-frequency 0] \\\n"
+            " [--in-file imatrix-prev-0.dat --in-file imatrix-prev-1.dat ...]\n" , argv[0]);
+    LOG_TEE("\n");
+}
+
 struct Stats {
     std::vector<float> values;
     std::vector<int> counts;
     int ncall = 0;
 };
 
-struct StatParams {
-    std::string dataset;
-    std::string ofile = "imatrix.dat";
-    int n_output_frequency = 10;
-    int verbosity = 1;
-    int keep_every = 0;
-    bool collect_output_weight = false;
-};
-
 class IMatrixCollector {
 public:
     IMatrixCollector() = default;
-    void
+    void set_params(gpt_params params) { m_params = std::move(params); }
     bool collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data);
-    void save_imatrix() const;
-    bool load_imatrix(const char * file_name
-    static bool load_imatrix(const char * file_name, std::unordered_map<std::string, Stats>& imatrix);
+    void save_imatrix(int ncall = -1) const;
+    bool load_imatrix(const char * file_name);
 private:
     std::unordered_map<std::string, Stats> m_stats;
-
+    gpt_params m_params;
     std::mutex m_mutex;
     int m_last_call = 0;
     std::vector<float> m_src1_data;
     std::vector<char> m_ids; // the expert ids from ggml_mul_mat_id
-    //
-    void save_imatrix(const char * file_name, const char * dataset) const;
-    void keep_imatrix(int ncall) const;
 };
 
 // remove any prefix and suffixes from the name

@@ -85,7 +83,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
         if (t->op != GGML_OP_MUL_MAT) return false;
         // why are small batches ignored (<16 tokens)?
         if (src1->ne[1] < 16 || src1->type != GGML_TYPE_F32) return false;
-        if (!(wname.substr(0, 4) == "blk." || (m_params.
+        if (!(wname.substr(0, 4) == "blk." || (m_params.process_output && wname == "output.weight"))) return false;
         return true;
     }
 

@@ -129,7 +127,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
        }
        else if (e.values.size() != (size_t)src1->ne[0]*n_as) {
            fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]*n_as);
-           exit(1); //
+           exit(1); //GGML_ABORT("fatal error");
        }
        if (m_params.verbosity > 1) {
            printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[2], (int)src1->type);

@@ -153,28 +151,32 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
                    for (int j = 0; j < (int)src1->ne[0]; ++j) {
                        e.values[e_start + j] += x[j]*x[j];
                        e.counts[e_start + j]++;
+                       if (!std::isfinite(e.values[e_start + j])) {
+                           fprintf(stderr, "%f detected in %s\n", e.values[e_start + j], wname.c_str());
+                           exit(1);
+                       }
                    }
                }
            }
            if (e.ncall > m_last_call) {
                m_last_call = e.ncall;
-               if (m_last_call % m_params.
+               if (m_last_call % m_params.n_out_freq == 0) {
                    save_imatrix();
                }
-               if (m_params.
-
+               if (m_params.n_save_freq > 0 && m_last_call%m_params.n_save_freq == 0) {
+                   save_imatrix(m_last_call);
                }
            }
        }
    } else {
-       auto& e = m_stats[wname];
+       auto & e = m_stats[wname];
        if (e.values.empty()) {
            e.values.resize(src1->ne[0], 0);
            e.counts.resize(src1->ne[0], 0);
        }
        else if (e.values.size() != (size_t)src1->ne[0]) {
            fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]);
-           exit(1); //
+           exit(1); //GGML_ABORT("fatal error");
        }
        ++e.ncall;
        if (m_params.verbosity > 1) {

@@ -185,15 +187,19 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
            for (int j = 0; j < (int)src1->ne[0]; ++j) {
                e.values[j] += x[j]*x[j];
                e.counts[j]++;
+               if (!std::isfinite(e.values[j])) {
+                   fprintf(stderr, "%f detected in %s\n", e.values[j], wname.c_str());
+                   exit(1);
+               }
            }
        }
        if (e.ncall > m_last_call) {
            m_last_call = e.ncall;
-           if (m_last_call % m_params.
+           if (m_last_call % m_params.n_out_freq == 0) {
                save_imatrix();
            }
-           if (m_params.
-
+           if (m_params.n_save_freq > 0 && m_last_call%m_params.n_save_freq == 0) {
+               save_imatrix(m_last_call);
            }
        }
    }
@@ -201,33 +207,75 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
     return true;
 }
 
-void IMatrixCollector::save_imatrix() const {
-
-
+void IMatrixCollector::save_imatrix(int ncall) const {
+    auto fname = m_params.out_file;
+    if (fname.empty()) {
+        fname = "imatrix.dat";
+    }
 
-
-
-
-
-
-
-
+    if (ncall > 0) {
+        fname += ".at_";
+        fname += std::to_string(ncall);
+    }
+
+    // avoid writing imatrix entries that do not have full data
+    // this can happen with MoE models where some of the experts end up not being exercised by the provided training data
+
+    int n_entries = 0;
+    std::vector<std::string> to_store;
+
+    bool is_first = true; // for printing
+    for (const auto & kv : m_stats) {
+        const int n_all = kv.second.counts.size();
+
+        if (n_all == 0) {
+            continue;
+        }
+
+        int n_zeros = 0;
+        for (const int c : kv.second.counts) {
+            if (c == 0) {
+                n_zeros++;
+            }
+        }
+
+        if (n_zeros != 0 && is_first) {
+            fprintf(stderr, "\n");
+            is_first = false;
+        }
+
+        if (n_zeros == n_all) {
+            fprintf(stderr, "%s: entry '%40s' has no data - skipping\n", __func__, kv.first.c_str());
+            continue;
+        }
+
+        if (n_zeros > 0) {
+            fprintf(stderr, "%s: entry '%40s' has partial data (%.2f%%) - skipping\n", __func__, kv.first.c_str(), 100.0f * (n_all - n_zeros) / n_all);
+            continue;
+        }
+
+        n_entries++;
+        to_store.push_back(kv.first);
+    }
+
+    if (to_store.size() < m_stats.size()) {
+        fprintf(stderr, "%s: warning: storing only %zu out of %zu entries\n", __func__, to_store.size(), m_stats.size());
+    }
 
-void IMatrixCollector::save_imatrix(const char * fname, const char * dataset) const {
     std::ofstream out(fname, std::ios::binary);
-    int n_entries = m_stats.size();
     out.write((const char *) &n_entries, sizeof(n_entries));
-    for (const auto &
-
+    for (const auto & name : to_store) {
+        const auto & stat = m_stats.at(name);
+        int len = name.size();
         out.write((const char *) &len, sizeof(len));
-        out.write(
-        out.write((const char *) &
-        int nval =
+        out.write(name.c_str(), len);
+        out.write((const char *) &stat.ncall, sizeof(stat.ncall));
+        int nval = stat.values.size();
         out.write((const char *) &nval, sizeof(nval));
         if (nval > 0) {
             std::vector<float> tmp(nval);
             for (int i = 0; i < nval; i++) {
-                tmp[i] = (
+                tmp[i] = (stat.values[i] / static_cast<float>(stat.counts[i])) * static_cast<float>(stat.ncall);
             }
             out.write((const char*)tmp.data(), nval*sizeof(float));
         }

@@ -236,26 +284,28 @@ void IMatrixCollector::save_imatrix(const char * fname, const char * dataset) co
     // Write the number of call the matrix was computed with
     out.write((const char *) &m_last_call, sizeof(m_last_call));
 
-    // Write the
-
-
-
+    // Write the input filename at the end of the file to later on specify it in quantize
+    {
+        int len = m_params.prompt_file.size();
+        out.write((const char *) &len, sizeof(len));
+        out.write(m_params.prompt_file.c_str(), len);
+    }
 
     if (m_params.verbosity > 0) {
-        fprintf(stderr, "\n%s: stored collected data after %d chunks in %s\n", __func__, m_last_call, fname);
+        fprintf(stderr, "\n%s: stored collected data after %d chunks in %s\n", __func__, m_last_call, fname.c_str());
     }
 }
 
-bool IMatrixCollector::load_imatrix(const char *
-    std::ifstream in(
+bool IMatrixCollector::load_imatrix(const char * fname) {
+    std::ifstream in(fname, std::ios::binary);
     if (!in) {
-        printf("%s: failed to open %s\n",__func__,
+        printf("%s: failed to open %s\n",__func__, fname);
         return false;
     }
     int n_entries;
     in.read((char*)&n_entries, sizeof(n_entries));
     if (in.fail() || n_entries < 1) {
-        printf("%s: no data in file %s\n", __func__,
+        printf("%s: no data in file %s\n", __func__, fname);
         return false;
     }
     for (int i = 0; i < n_entries; ++i) {
@@ -263,23 +313,22 @@ bool IMatrixCollector::load_imatrix(const char * imatrix_file, std::unordered_ma
         std::vector<char> name_as_vec(len+1);
         in.read((char *)name_as_vec.data(), len);
         if (in.fail()) {
-            printf("%s: failed reading name for entry %d from %s\n",__func__,i+1,
+            printf("%s: failed reading name for entry %d from %s\n",__func__,i+1, fname);
             return false;
         }
         name_as_vec[len] = 0;
         std::string name{name_as_vec.data()};
-        auto& e =
+        auto & e = m_stats[std::move(name)];
         int ncall;
         in.read((char*)&ncall, sizeof(ncall));
         int nval;
         in.read((char *)&nval, sizeof(nval));
         if (in.fail() || nval < 1) {
             printf("%s: failed reading number of values for entry %d\n",__func__,i);
-
+            m_stats = {};
             return false;
         }
 
-        // When re-called from load_imatrix() with add set, this will already be created.
         if (e.values.empty()) {
             e.values.resize(nval, 0);
             e.counts.resize(nval, 0);

@@ -289,7 +338,7 @@ bool IMatrixCollector::load_imatrix(const char * imatrix_file, std::unordered_ma
         in.read((char*)tmp.data(), nval*sizeof(float));
         if (in.fail()) {
             printf("%s: failed reading data for entry %d\n",__func__,i);
-
+            m_stats = {};
             return false;
         }
 

@@ -304,13 +353,6 @@ bool IMatrixCollector::load_imatrix(const char * imatrix_file, std::unordered_ma
     return true;
 }
 
-bool IMatrixCollector::load_imatrix(const char * file_name, bool add) {
-    if (!add) {
-        m_stats.clear();
-    }
-    return load_imatrix(file_name, m_stats);
-}
-
 static IMatrixCollector g_collector;
 
 static bool ik_collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data) {
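Taken together, the save_imatrix/load_imatrix hunks above pin down the on-disk imatrix layout: an int32 entry count, then per entry a length-prefixed name, its ncall, a value count and that many float32 values, followed by the overall last-call counter and the length-prefixed input (prompt) file name. A standalone reader sketch for orientation only (assumes a file written by the new code on a little-endian machine; this is not a tool shipped with the package):

#include <cstdio>
#include <fstream>
#include <string>
#include <vector>

// Walk an imatrix.dat file using the layout written by IMatrixCollector::save_imatrix():
//   int32 n_entries
//   n_entries times: int32 name_len, name bytes, int32 ncall, int32 nval, nval * float32
//   int32 last_call
//   int32 prompt_file_len, prompt_file bytes
int main(int argc, char ** argv) {
    if (argc < 2) { std::fprintf(stderr, "usage: %s imatrix.dat\n", argv[0]); return 1; }
    std::ifstream in(argv[1], std::ios::binary);
    if (!in) { std::fprintf(stderr, "failed to open %s\n", argv[1]); return 1; }

    int n_entries = 0;
    in.read((char *) &n_entries, sizeof(n_entries));
    for (int i = 0; i < n_entries && in; ++i) {
        int len = 0;
        in.read((char *) &len, sizeof(len));
        if (len <= 0) break;
        std::string name(len, '\0');
        in.read(&name[0], len);
        int ncall = 0, nval = 0;
        in.read((char *) &ncall, sizeof(ncall));
        in.read((char *) &nval, sizeof(nval));
        std::vector<float> vals(nval > 0 ? nval : 0);
        in.read((char *) vals.data(), vals.size() * sizeof(float));
        std::printf("%-40s ncall=%d nval=%d\n", name.c_str(), ncall, nval);
    }

    int last_call = 0;
    in.read((char *) &last_call, sizeof(last_call));
    int len = 0;
    in.read((char *) &len, sizeof(len));
    std::string prompt_file(len > 0 ? len : 0, '\0');
    if (len > 0) in.read(&prompt_file[0], len);
    std::printf("last_call=%d dataset='%s'\n", last_call, prompt_file.c_str());
    return 0;
}

The trailing dataset name is the piece the added comment refers to ("to later on specify it in quantize"): it records which text file the statistics were collected on.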
@@ -324,7 +366,7 @@ struct results_log_softmax {
     float prob;
 };
 
-static std::vector<float> softmax(const std::vector<float
+static std::vector<float> softmax(const std::vector<float> & logits) {
     std::vector<float> probs(logits.size());
     float max_logit = logits[0];
     for (float v : logits) {

@@ -358,8 +400,7 @@ static results_log_softmax log_softmax(int n_vocab, const float * logits, int to
 
 static void process_logits(
     int n_vocab, const float * logits, const int * tokens, int n_token, std::vector<std::thread> & workers,
-    double & nll, double & nll2, float * logit_history, float * prob_history
-) {
+    double & nll, double & nll2, float * logit_history, float * prob_history) {
     std::mutex mutex;
     int counter = 0;
     auto compute = [&mutex, &counter, &nll, &nll2, logit_history, prob_history, n_vocab, logits, tokens, n_token] () {

@@ -391,8 +432,7 @@ static void process_logits(
     }
 }
 
-static bool compute_imatrix(llama_context * ctx, const gpt_params & params
-
+static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
     const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
     GGML_ASSERT(llama_add_eos_token(llama_get_model(ctx)) != 1);
     const int n_ctx = llama_n_ctx(ctx);

@@ -405,13 +445,13 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool
     auto tim2 = std::chrono::high_resolution_clock::now();
     fprintf(stderr, "%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());
 
-    if (
-        if (size_t((
-            fprintf(stderr, "%s: there will be not enough tokens left after removing %d chunks\n", __func__,
+    if (params.i_chunk > 0) {
+        if (size_t((params.i_chunk + 2)*n_ctx) >= tokens.size()) {
+            fprintf(stderr, "%s: there will be not enough tokens left after removing %d chunks\n", __func__, params.i_chunk);
             return false;
         }
-        fprintf(stderr, "%s: removing initial %d chunks (%d tokens)\n", __func__,
-        tokens.erase(tokens.begin(), tokens.begin() +
+        fprintf(stderr, "%s: removing initial %d chunks (%d tokens)\n", __func__, params.i_chunk, params.i_chunk*n_ctx);
+        tokens.erase(tokens.begin(), tokens.begin() + params.i_chunk*n_ctx);
     }
 
     if (int(tokens.size()) < 2*n_ctx) {

@@ -424,7 +464,7 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool
     std::vector<float> logit_history;
     std::vector<float> prob_history;
 
-    if (compute_ppl) {
+    if (params.compute_ppl) {
         logit_history.resize(tokens.size());
         prob_history.resize(tokens.size());
     }

@@ -446,7 +486,7 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool
     const int num_batches = (n_ctx + n_batch - 1) / n_batch;
 
     std::vector<float> logits;
-    if (compute_ppl && num_batches > 1) {
+    if (params.compute_ppl && num_batches > 1) {
         logits.reserve((size_t)n_ctx * n_vocab);
     }
 

@@ -482,7 +522,7 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool
            // restore the original token in case it was set to BOS
            tokens[batch_start] = token_org;
 
-           if (compute_ppl && num_batches > 1) {
+           if (params.compute_ppl && num_batches > 1) {
                const auto * batch_logits = llama_get_logits(ctx);
                logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab);
            }

@@ -501,7 +541,7 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool
            fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0);
        }
 
-       if (compute_ppl) {
+       if (params.compute_ppl) {
           const int first = n_ctx/2;
           const auto all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx);
           process_logits(n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first,

@@ -516,7 +556,7 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool
     }
     printf("\n");
 
-    if (compute_ppl) {
+    if (params.compute_ppl) {
         nll2 /= count;
         nll /= count;
         const double ppl = exp(nll);
@@ -533,111 +573,32 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool
 }
 
 int main(int argc, char ** argv) {
-
-    StatParams sparams;
-    std::string prev_result_file;
-    std::string combine_files;
-    bool compute_ppl = true;
-    int from_chunk = 0;
-    std::vector<char*> args;
-    args.push_back(argv[0]);
-    int iarg = 1;
-    for (; iarg < argc-1; ++iarg) {
-        std::string arg{argv[iarg]};
-        if (arg == "-o" || arg == "--output-file") {
-            sparams.ofile = argv[++iarg];
-        }
-        else if (arg == "-ofreq" || arg == "--output-frequency") {
-            sparams.n_output_frequency = std::stoi(argv[++iarg]);
-        }
-        else if (arg == "-ow" || arg == "--output-weight") {
-            sparams.collect_output_weight = std::stoi(argv[++iarg]);
-        }
-        else if (arg == "--verbosity") {
-            sparams.verbosity = std::stoi(argv[++iarg]);
-        } else if (arg == "--no-ppl") {
-            compute_ppl = false;
-        } else if (arg == "--keep-imatrix") {
-            sparams.keep_every = std::stoi(argv[++iarg]);
-        } else if (arg == "--continue-from") {
-            prev_result_file = argv[++iarg];
-        } else if (arg == "--combine") {
-            combine_files = argv[++iarg];
-        }
-        else if (arg == "--from-chunk") {
-            from_chunk = std::stoi(argv[++iarg]);
-        } else {
-            args.push_back(argv[iarg]);
-        }
-    }
-    if (iarg < argc) {
-        std::string arg{argv[iarg]};
-        if (arg == "--no-ppl") {
-            compute_ppl = false;
-        } else {
-            args.push_back(argv[iarg]);
-        }
-    }
-
     gpt_params params;
-    params.n_batch = 512;
-    if (!gpt_params_parse(args.size(), args.data(), params)) {
-        return 1;
-    }
 
+    params.n_ctx = 512;
     params.logits_all = true;
-    params.
+    params.verbosity = 1;
 
-
-
-
-    params.seed = time(NULL);
+    if (!gpt_params_parse(argc, argv, params)) {
+        print_usage(argc, argv, params);
+        return 1;
     }
 
-
-
-    std::mt19937 rng(params.seed);
-    if (params.random_prompt) {
-        params.prompt = string_random_prompt(rng);
-    }
+    params.n_batch = std::min(params.n_batch, params.n_ctx);
 
-
-    g_collector.set_parameters(std::move(sparams));
+    g_collector.set_params(params);
 
-
-
-
-
-            auto new_pos = combine_files.find(',', pos);
-            if (new_pos != std::string::npos) {
-                files.emplace_back(combine_files.substr(pos, new_pos - pos));
-                pos = new_pos + 1;
-            } else {
-                files.emplace_back(combine_files.substr(pos));
-                break;
-            }
-        }
-        if (files.size() < 2) {
-            fprintf(stderr, "You must provide at least two comma separated files to use --combine\n");
+    for (const auto & in_file : params.in_files) {
+        printf("%s : loading imatrix from '%s'\n", __func__, in_file.c_str());
+        if (!g_collector.load_imatrix(in_file.c_str())) {
+            fprintf(stderr, "%s : failed to load %s\n", __func__, in_file.c_str());
            return 1;
        }
-        printf("Combining the following %d files\n", int(files.size()));
-        for (auto& file : files) {
-            printf(" %s\n", file.c_str());
-            if (!g_collector.load_imatrix(file.c_str(), true)) {
-                fprintf(stderr, "Failed to load %s\n", file.c_str());
-                return 1;
-            }
-        }
-        g_collector.save_imatrix();
-        return 0;
    }
 
-    if (
-
-
-        return 1;
-    }
+    if (params.in_files.size() > 1) {
+        printf("%s : saving combined imatrix to '%s'\n", __func__, params.out_file.c_str());
+        g_collector.save_imatrix();
    }
 
    llama_backend_init();

@@ -652,6 +613,7 @@ int main(int argc, char ** argv) {
     // init
     llama_model * model;
     llama_context * ctx;
+
     std::tie(model, ctx) = llama_init_from_gpt_params(params);
     if (model == nullptr || ctx == nullptr) {
         fprintf(stderr, "%s : failed to init\n", __func__);

@@ -670,8 +632,7 @@ int main(int argc, char ** argv) {
         fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
     }
 
-
-    if (!OK) {
+    if (!compute_imatrix(ctx, params)) {
         return 1;
     }
 
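The rewritten main() above drops the hand-rolled argument loop; every option now arrives through gpt_params and is handed to the collector via set_params(). A sketch of the fields the new code reads (field names are taken from the hunks above; the flag-to-field mapping in the comments is inferred from the printed usage text and may not be exact):

#include "common.h" // gpt_params, from the vendored llama.cpp (assumes the package's include paths)

// Illustrative only: make_imatrix_params is a hypothetical helper showing how the
// reworked example's options map onto gpt_params fields.
static gpt_params make_imatrix_params() {
    gpt_params params;
    params.n_ctx          = 512;            // set before parsing, mirroring the new main()
    params.logits_all     = true;
    params.verbosity      = 1;
    params.out_file       = "imatrix.dat";  // -o / --output-file
    params.n_out_freq     = 10;             // --output-frequency
    params.n_save_freq    = 0;              // --save-frequency (0 disables intermediate snapshots)
    params.process_output = false;          // --process-output also collects output.weight
    params.compute_ppl    = true;           // --no-ppl clears this
    params.i_chunk        = 0;              // --chunk: skip this many leading chunks
    // params.in_files = {"imatrix-prev-0.dat", ...}; // --in-file: merge previous imatrix runs
    return params;
}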
|