@fugood/llama.node 0.2.2 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +5 -2
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +8 -1
- package/package.json +1 -1
- package/patches/llama.patch +12 -12
- package/src/DetokenizeWorker.cpp +1 -1
- package/src/LlamaContext.cpp +33 -1
- package/src/LlamaContext.h +1 -0
- package/src/LoadSessionWorker.cpp +1 -0
- package/src/llama.cpp/.github/workflows/bench.yml +310 -0
- package/src/llama.cpp/.github/workflows/build.yml +1315 -0
- package/src/llama.cpp/.github/workflows/close-issue.yml +23 -0
- package/src/llama.cpp/.github/workflows/docker.yml +116 -0
- package/src/llama.cpp/.github/workflows/editorconfig.yml +27 -0
- package/src/llama.cpp/.github/workflows/gguf-publish.yml +44 -0
- package/src/llama.cpp/.github/workflows/labeler.yml +17 -0
- package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +65 -0
- package/src/llama.cpp/.github/workflows/nix-ci.yml +72 -0
- package/src/llama.cpp/.github/workflows/nix-flake-update.yml +22 -0
- package/src/llama.cpp/.github/workflows/nix-publish-flake.yml +36 -0
- package/src/llama.cpp/.github/workflows/python-check-requirements.yml +35 -0
- package/src/llama.cpp/.github/workflows/python-lint.yml +23 -0
- package/src/llama.cpp/.github/workflows/python-type-check.yml +38 -0
- package/src/llama.cpp/.github/workflows/server.yml +183 -0
- package/src/llama.cpp/CMakeLists.txt +91 -1245
- package/src/llama.cpp/cmake/arm64-windows-llvm.cmake +1 -1
- package/src/llama.cpp/cmake/build-info.cmake +58 -0
- package/src/llama.cpp/cmake/git-vars.cmake +22 -0
- package/src/llama.cpp/common/CMakeLists.txt +4 -3
- package/src/llama.cpp/common/build-info.cpp.in +4 -0
- package/src/llama.cpp/common/common.cpp +1116 -877
- package/src/llama.cpp/common/common.h +191 -77
- package/src/llama.cpp/common/grammar-parser.cpp +118 -31
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +346 -65
- package/src/llama.cpp/common/log.h +1 -1
- package/src/llama.cpp/common/ngram-cache.h +10 -3
- package/src/llama.cpp/common/sampling.cpp +19 -10
- package/src/llama.cpp/docs/build.md +353 -0
- package/src/llama.cpp/examples/CMakeLists.txt +22 -22
- package/src/llama.cpp/examples/baby-llama/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +6 -6
- package/src/llama.cpp/examples/batched/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/batched/batched.cpp +52 -55
- package/src/llama.cpp/examples/batched-bench/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +20 -72
- package/src/llama.cpp/examples/benchmark/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/chat-13B.bat +57 -0
- package/src/llama.cpp/examples/convert-llama2c-to-ggml/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/{finetune → cvector-generator}/CMakeLists.txt +2 -2
- package/src/llama.cpp/examples/cvector-generator/completions.txt +582 -0
- package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +503 -0
- package/src/llama.cpp/examples/cvector-generator/mean.hpp +48 -0
- package/src/llama.cpp/examples/cvector-generator/negative.txt +4 -0
- package/src/llama.cpp/examples/cvector-generator/pca.hpp +325 -0
- package/src/llama.cpp/examples/cvector-generator/positive.txt +4 -0
- package/src/llama.cpp/examples/deprecation-warning/deprecation-warning.cpp +35 -0
- package/src/llama.cpp/examples/embedding/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/embedding/embedding.cpp +94 -46
- package/src/llama.cpp/examples/eval-callback/CMakeLists.txt +2 -2
- package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +4 -6
- package/src/llama.cpp/examples/export-lora/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +344 -386
- package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +2 -2
- package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +30 -25
- package/src/llama.cpp/examples/gguf/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gguf/gguf.cpp +5 -0
- package/src/llama.cpp/examples/gguf-hash/CMakeLists.txt +15 -0
- package/src/llama.cpp/examples/gguf-hash/deps/rotate-bits/rotate-bits.h +46 -0
- package/src/llama.cpp/examples/gguf-hash/deps/sha1/sha1.c +295 -0
- package/src/llama.cpp/examples/gguf-hash/deps/sha1/sha1.h +52 -0
- package/src/llama.cpp/examples/gguf-hash/deps/sha256/sha256.c +221 -0
- package/src/llama.cpp/examples/gguf-hash/deps/sha256/sha256.h +24 -0
- package/src/llama.cpp/examples/gguf-hash/deps/xxhash/xxhash.c +42 -0
- package/src/llama.cpp/examples/gguf-hash/deps/xxhash/xxhash.h +7093 -0
- package/src/llama.cpp/examples/gguf-hash/gguf-hash.cpp +693 -0
- package/src/llama.cpp/examples/gguf-split/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +3 -3
- package/src/llama.cpp/examples/gritlm/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +6 -2
- package/src/llama.cpp/examples/imatrix/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +137 -176
- package/src/llama.cpp/examples/infill/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/infill/infill.cpp +38 -153
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +175 -94
- package/src/llama.cpp/examples/llama.android/app/build.gradle.kts +65 -0
- package/src/llama.cpp/examples/llama.android/build.gradle.kts +6 -0
- package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +68 -0
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/CMakeLists.txt +11 -7
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +2 -2
- package/src/llama.cpp/examples/llama.android/settings.gradle.kts +18 -0
- package/src/llama.cpp/examples/llava/CMakeLists.txt +6 -5
- package/src/llama.cpp/examples/llava/android/build_64.sh +8 -0
- package/src/llama.cpp/examples/llava/clip.cpp +23 -14
- package/src/llama.cpp/examples/llava/llava-cli.cpp +8 -6
- package/src/llama.cpp/examples/llava/requirements.txt +3 -2
- package/src/llama.cpp/examples/lookahead/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +2 -1
- package/src/llama.cpp/examples/lookup/CMakeLists.txt +4 -4
- package/src/llama.cpp/examples/lookup/lookup-create.cpp +2 -0
- package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
- package/src/llama.cpp/examples/lookup/lookup-stats.cpp +2 -2
- package/src/llama.cpp/examples/lookup/lookup.cpp +1 -1
- package/src/llama.cpp/examples/main/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/main/main.cpp +98 -75
- package/src/llama.cpp/examples/main-cmake-pkg/CMakeLists.txt +4 -5
- package/src/llama.cpp/examples/parallel/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/parallel/parallel.cpp +2 -1
- package/src/llama.cpp/examples/passkey/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/passkey/passkey.cpp +23 -43
- package/src/llama.cpp/examples/perplexity/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +13 -10
- package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/quantize/quantize.cpp +37 -34
- package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +1 -1
- package/src/llama.cpp/examples/retrieval/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/retrieval/retrieval.cpp +26 -77
- package/src/llama.cpp/examples/save-load-state/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +14 -7
- package/src/llama.cpp/examples/server/CMakeLists.txt +26 -2
- package/src/llama.cpp/examples/server/server.cpp +274 -671
- package/src/llama.cpp/examples/server/tests/requirements.txt +2 -2
- package/src/llama.cpp/examples/server/utils.hpp +28 -29
- package/src/llama.cpp/examples/simple/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/simple/simple.cpp +21 -29
- package/src/llama.cpp/examples/speculative/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/speculative/speculative.cpp +2 -1
- package/src/llama.cpp/examples/sycl/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/sycl/build.sh +23 -0
- package/src/llama.cpp/examples/sycl/run-llama2.sh +36 -0
- package/src/llama.cpp/examples/sycl/win-build-sycl.bat +33 -0
- package/src/llama.cpp/examples/sycl/win-run-llama2.bat +9 -0
- package/src/llama.cpp/examples/tokenize/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/tokenize/tokenize.cpp +16 -2
- package/src/llama.cpp/ggml/CMakeLists.txt +253 -0
- package/src/llama.cpp/{cmake → ggml/cmake}/FindSIMD.cmake +6 -6
- package/src/llama.cpp/{ggml-backend.h → ggml/include/ggml-backend.h} +22 -17
- package/src/llama.cpp/ggml/include/ggml-blas.h +23 -0
- package/src/llama.cpp/ggml/include/ggml-cann.h +125 -0
- package/src/llama.cpp/{ggml-cuda.h → ggml/include/ggml-cuda.h} +3 -0
- package/src/llama.cpp/{ggml-metal.h → ggml/include/ggml-metal.h} +1 -2
- package/src/llama.cpp/{ggml-sycl.h → ggml/include/ggml-sycl.h} +3 -10
- package/src/llama.cpp/{ggml.h → ggml/include/ggml.h} +80 -85
- package/src/llama.cpp/ggml/src/CMakeLists.txt +1329 -0
- package/src/llama.cpp/ggml/src/ggml-aarch64.c +2193 -0
- package/src/llama.cpp/ggml/src/ggml-aarch64.h +39 -0
- package/src/llama.cpp/{ggml-alloc.c → ggml/src/ggml-alloc.c} +100 -49
- package/src/llama.cpp/{ggml-backend-impl.h → ggml/src/ggml-backend-impl.h} +20 -8
- package/src/llama.cpp/{ggml-backend.c → ggml/src/ggml-backend.c} +307 -167
- package/src/llama.cpp/ggml/src/ggml-blas.cpp +367 -0
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +198 -0
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +230 -0
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +2944 -0
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +592 -0
- package/src/llama.cpp/ggml/src/ggml-cann/common.h +282 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +32 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +17 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +223 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +186 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +180 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +193 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +191 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +208 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +206 -0
- package/src/llama.cpp/ggml/src/ggml-cann.cpp +2023 -0
- package/src/llama.cpp/{ggml-common.h → ggml/src/ggml-common.h} +41 -7
- package/src/llama.cpp/{ggml-impl.h → ggml/src/ggml-impl.h} +113 -9
- package/src/llama.cpp/{ggml-kompute.cpp → ggml/src/ggml-kompute.cpp} +33 -18
- package/src/llama.cpp/{ggml-quants.c → ggml/src/ggml-quants.c} +1460 -940
- package/src/llama.cpp/{ggml-quants.h → ggml/src/ggml-quants.h} +19 -20
- package/src/llama.cpp/{ggml-rpc.cpp → ggml/src/ggml-rpc.cpp} +95 -72
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +27 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +53 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +355 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +195 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/concat.hpp +21 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +547 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +27 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +698 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +1023 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.hpp +27 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +3011 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +3031 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/mmq.hpp +33 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +1027 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.hpp +27 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +374 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +35 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +66 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +275 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.hpp +22 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +251 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/softmax.hpp +24 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +1140 -0
- package/src/llama.cpp/ggml/src/ggml-sycl.cpp +5314 -0
- package/src/llama.cpp/{ggml-vulkan.cpp → ggml/src/ggml-vulkan.cpp} +1781 -1868
- package/src/llama.cpp/{ggml.c → ggml/src/ggml.c} +1245 -2087
- package/src/llama.cpp/{sgemm.cpp → ggml/src/llamafile/sgemm.cpp} +21 -24
- package/src/llama.cpp/{sgemm.h → ggml/src/llamafile/sgemm.h} +1 -1
- package/src/llama.cpp/ggml/src/vulkan-shaders/CMakeLists.txt +5 -0
- package/src/llama.cpp/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp +552 -0
- package/src/llama.cpp/{llama.h → include/llama.h} +175 -100
- package/src/llama.cpp/models/.editorconfig +1 -0
- package/src/llama.cpp/models/ggml-vocab-aquila.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-baichuan.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-command-r.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-command-r.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-command-r.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-falcon.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-falcon.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-falcon.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-gpt-neox.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-mpt.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-mpt.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-mpt.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-phi-3.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-phi-3.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-phi-3.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-qwen2.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-qwen2.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-qwen2.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-refact.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-refact.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-refact.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-starcoder.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-starcoder.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-starcoder.gguf.out +46 -0
- package/src/llama.cpp/pocs/vdot/CMakeLists.txt +2 -2
- package/src/llama.cpp/requirements/requirements-all.txt +12 -0
- package/src/llama.cpp/requirements/requirements-compare-llama-bench.txt +2 -0
- package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf.txt +3 -0
- package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf_update.txt +3 -0
- package/src/llama.cpp/requirements/{requirements-convert.txt → requirements-convert_legacy_llama.txt} +1 -1
- package/src/llama.cpp/requirements/requirements-convert_llama_ggml_to_gguf.txt +1 -0
- package/src/llama.cpp/requirements/requirements-convert_lora_to_gguf.txt +2 -0
- package/src/llama.cpp/requirements/requirements-pydantic.txt +3 -0
- package/src/llama.cpp/requirements/requirements-test-tokenizer-random.txt +1 -0
- package/src/llama.cpp/requirements.txt +5 -4
- package/src/llama.cpp/scripts/build-info.sh +30 -0
- package/src/llama.cpp/scripts/install-oneapi.bat +19 -0
- package/src/llama.cpp/src/CMakeLists.txt +33 -0
- package/src/llama.cpp/src/llama-grammar.cpp +539 -0
- package/src/llama.cpp/src/llama-grammar.h +39 -0
- package/src/llama.cpp/src/llama-impl.h +26 -0
- package/src/llama.cpp/src/llama-sampling.cpp +635 -0
- package/src/llama.cpp/src/llama-sampling.h +56 -0
- package/src/llama.cpp/src/llama-vocab.cpp +1721 -0
- package/src/llama.cpp/src/llama-vocab.h +130 -0
- package/src/llama.cpp/{llama.cpp → src/llama.cpp} +5979 -5260
- package/src/llama.cpp/{unicode-data.cpp → src/unicode-data.cpp} +851 -802
- package/src/llama.cpp/{unicode.cpp → src/unicode.cpp} +52 -30
- package/src/llama.cpp/{unicode.h → src/unicode.h} +5 -1
- package/src/llama.cpp/tests/CMakeLists.txt +19 -20
- package/src/llama.cpp/tests/test-backend-ops.cpp +245 -67
- package/src/llama.cpp/tests/test-chat-template.cpp +57 -3
- package/src/llama.cpp/tests/test-double-float.cpp +2 -2
- package/src/llama.cpp/tests/test-grad0.cpp +2 -2
- package/src/llama.cpp/tests/test-grammar-integration.cpp +978 -31
- package/src/llama.cpp/tests/test-grammar-parser.cpp +423 -158
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +508 -135
- package/src/llama.cpp/tests/test-llama-grammar.cpp +15 -9
- package/src/llama.cpp/tests/test-quantize-fns.cpp +1 -1
- package/src/llama.cpp/tests/test-quantize-perf.cpp +1 -1
- package/src/llama.cpp/tests/test-rope.cpp +3 -4
- package/src/llama.cpp/tests/test-sampling.cpp +5 -5
- package/src/llama.cpp/tests/test-tokenizer-0.cpp +6 -6
- package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +20 -15
- package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +22 -11
- package/bin/darwin/arm64/default.metallib +0 -0
- package/bin/darwin/x64/default.metallib +0 -0
- package/src/llama.cpp/examples/beam-search/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/beam-search/beam-search.cpp +0 -188
- package/src/llama.cpp/examples/finetune/finetune.cpp +0 -1862
- package/src/llama.cpp/examples/llama.android/llama/CMakeLists.txt +0 -55
- package/src/llama.cpp/examples/train-text-from-scratch/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/train-text-from-scratch/train-text-from-scratch.cpp +0 -1253
- package/src/llama.cpp/ggml-opencl.cpp +0 -2305
- package/src/llama.cpp/ggml-opencl.h +0 -36
- package/src/llama.cpp/ggml-sycl.cpp +0 -17340
- package/src/llama.cpp/ggml-vulkan-shaders.hpp +0 -81211
- package/src/llama.cpp/requirements/requirements-convert-hf-to-gguf-update.txt +0 -2
- package/src/llama.cpp/requirements/requirements-convert-hf-to-gguf.txt +0 -2
- package/src/llama.cpp/requirements/requirements-convert-llama-ggml-to-gguf.txt +0 -1
- package/src/llama.cpp/scripts/gen-build-info-cpp.cmake +0 -24
- /package/src/llama.cpp/{ggml-alloc.h → ggml/include/ggml-alloc.h} +0 -0
- /package/src/llama.cpp/{ggml-kompute.h → ggml/include/ggml-kompute.h} +0 -0
- /package/src/llama.cpp/{ggml-rpc.h → ggml/include/ggml-rpc.h} +0 -0
- /package/src/llama.cpp/{ggml-vulkan.h → ggml/include/ggml-vulkan.h} +0 -0
- /package/src/llama.cpp/{unicode-data.h → src/unicode-data.h} +0 -0
package/src/llama.cpp/examples/export-lora/export-lora.cpp
@@ -1,462 +1,420 @@
-
 #include "common.h"
 #include "ggml.h"
 #include "ggml-alloc.h"
 
+#include <map>
 #include <vector>
 #include <string>
 #include <thread>
+#include <fstream>
 
-
-    std::string filename;
-    float scale;
-};
-
-struct export_lora_params {
-    std::string fn_model_base;
-    std::string fn_model_out;
-    std::vector<struct lora_info> lora;
-    int n_threads;
-};
+static bool g_verbose = false;
 
-struct
-
-std::
-
+static std::string get_kv_str(struct gguf_context * ctx_gguf, const std::string & key){
+    int id = gguf_find_key(ctx_gguf, key.c_str());
+    return id < 0 ? "" : std::string(gguf_get_val_str(ctx_gguf, id));
+}
 
-
-
-
+static float get_kv_f32(struct gguf_context * ctx_gguf, const std::string & key) {
+    int id = gguf_find_key(ctx_gguf, key.c_str());
+    return id < 0 ? 0.0f : gguf_get_val_f32(ctx_gguf, id);
+}
 
-
-
-
-
+static void zeros(std::ofstream & file, size_t n) {
+    char zero = 0;
+    for (size_t i = 0; i < n; ++i) {
+        file.write(&zero, 1);
+    }
+}
 
-
-
-
-
-
-
-        size = tell();
-        seek(0, SEEK_SET);
+static std::string ggml_ne_string(const ggml_tensor * t) {
+    std::string str;
+    for (int i = 0; i < GGML_MAX_DIMS; ++i) {
+        str += std::to_string(t->ne[i]);
+        if (i + 1 < GGML_MAX_DIMS) {
+            str += ", ";
         }
     }
+    return str;
+}
 
-
-
-
-
-
-
-
-
+static struct gguf_context * load_gguf(std::string & fname, struct ggml_context ** ctx_ggml) {
+    struct gguf_init_params params = {
+        /*.no_alloc = */ true,
+        /*.ctx = */ ctx_ggml,
+    };
+    struct gguf_context * ctx_gguf = gguf_init_from_file(fname.c_str(), params);
+    if (!ctx_gguf) {
+        throw std::runtime_error("failed to load input GGUF from " + fname);
     }
+    return ctx_gguf;
+}
 
-
-
-
-
-
-
-
+static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
+    std::string result;
+    for (size_t pos = 0; ; pos += search.length()) {
+        auto new_pos = s.find(search, pos);
+        if (new_pos == std::string::npos) {
+            result += s.substr(pos, s.size() - pos);
+            break;
+        }
+        result += s.substr(pos, new_pos - pos) + replace;
+        pos = new_pos;
    }
+    s = std::move(result);
+}
 
-
-
-
-
-
-
-
-
-
-    if (
-
+struct file_input {
+    struct ggml_context * ctx_meta = nullptr;
+    struct gguf_context * ctx_gguf = nullptr;
+    std::ifstream f_in;
+    std::map<std::string, ggml_tensor *> tensors;
+    float alpha;
+    float scale;
+
+    file_input(std::string & fname, float scale): f_in(fname, std::ios::binary), scale(scale) {
+        if (!f_in.is_open()) {
+            throw std::runtime_error("failed to open input gguf from " + fname);
        }
-    }
 
-
-
-
-
+        ctx_gguf = load_gguf(fname, &ctx_meta);
+        alpha = get_kv_f32(ctx_gguf, "adapter.lora.alpha");
+        printf("%s: loaded gguf from %s\n", __func__, fname.c_str());
+
+        for (ggml_tensor * cur = ggml_get_first_tensor(ctx_meta); cur; cur = ggml_get_next_tensor(ctx_meta, cur)) {
+            std::string name(cur->name);
+            tensors[name] = cur;
+            if (g_verbose) {
+                printf("%s: %s\n", __func__, cur->name);
+            }
+        }
    }
 
-
-
-
-
+    ggml_tensor * get_tensor(std::string name) {
+        if (tensors.find(name) == tensors.end()) {
+            return nullptr;
+        }
+        return tensors[name];
    }
 
-    void
-    if (
-
+    void read_tensor_data(std::string name, std::vector<uint8_t> & buf) {
+        if (tensors.find(name) == tensors.end()) {
+            throw std::runtime_error("cannot find tensor with name: " + name);
        }
-
-
-
-            die_fmt("write error: %s", strerror(errno));
+        auto len = ggml_nbytes(tensors[name]);
+        if (buf.size() < len) {
+            buf.resize(len);
        }
+        auto i_tensor_in = gguf_find_tensor(ctx_gguf, name.c_str()); // idx of tensor in the input file
+        auto offset = gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, i_tensor_in);
+        f_in.seekg(offset);
+        f_in.read((char* )buf.data(), len);
    }
 
-
-
+    ~file_input() {
+        gguf_free(ctx_gguf);
+        ggml_free(ctx_meta);
    }
+};
 
-
-
-
+struct lora_merge_ctx {
+    // input base model + adapters
+    file_input base_model;
+    std::vector<std::unique_ptr<file_input>> adapters;
 
-
-
-
+    // for computing merged tensor
+    int n_threads;
+    ggml_backend_t backend = nullptr;
+    ggml_gallocr_t allocr = nullptr;
+    std::vector<uint8_t> read_buf;
+
+    // output file
+    struct gguf_context * ctx_out;
+    struct ggml_context * ctx_out_ggml;
+    std::ofstream fout;
+
+    lora_merge_ctx(
+            std::string & base_fname,
+            std::vector<std::tuple<std::string, float>> & lora_files,
+            std::string & outfile,
+            int n_threads) : base_model(base_fname, 0), n_threads(n_threads), fout(outfile, std::ios::binary) {
+        fout.exceptions(std::ofstream::failbit); // fail fast on write errors
+
+        if (gguf_find_key(base_model.ctx_gguf, LLM_KV_SPLIT_COUNT) >= 0) {
+            throw std::runtime_error("split model is not yet supported");
        }
-    }
-};
 
-
-
-
-
-
-
-    }
+        for (auto lora_inp : lora_files) {
+            auto fname = std::get<0>(lora_inp);
+            auto scale = std::get<1>(lora_inp);
+            std::unique_ptr<file_input> adapter(new file_input(fname, scale));
+            check_metadata_lora(adapter.get());
+            adapters.push_back(std::move(adapter));
+        }
 
-
-
-
-
-
-
-
-
-
-
-    }
+        ctx_out = gguf_init_empty();
+        struct ggml_init_params params = {
+            /*.mem_size =*/ gguf_get_n_tensors(base_model.ctx_gguf)*ggml_tensor_overhead(),
+            /*.mem_buffer =*/ NULL,
+            /*.no_alloc =*/ true,
+        };
+        ctx_out_ggml = ggml_init(params);
+        backend = ggml_backend_cpu_init();
+        allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend));
+    }
 
-
-
-
-
-
+    void check_metadata_lora(file_input * adapter) {
+        auto general_type = get_kv_str(adapter->ctx_gguf, "general.type");
+        if (general_type != "adapter") {
+            throw std::runtime_error("expect general.type to be 'adapter', but got: " + general_type);
+        }
 
-
-
-
-            std::replace(arg.begin(), arg.end(), '_', '-');
+        auto adapter_type = get_kv_str(adapter->ctx_gguf, "adapter.type");
+        if (adapter_type != "lora") {
+            throw std::runtime_error("expect adapter.type to be 'lora', but got: " + adapter_type);
        }
 
-
-
-
-
-            }
-            params->fn_model_base = argv[i];
-        } else if (arg == "-o" || arg == "--model-out") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params->fn_model_out = argv[i];
-        } else if (arg == "-l" || arg == "--lora") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            struct lora_info lora;
-            lora.filename = argv[i];
-            lora.scale = 1.0f;
-            params->lora.push_back(lora);
-        } else if (arg == "-s" || arg == "--lora-scaled") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            struct lora_info lora;
-            lora.filename = argv[i];
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            lora.scale = std::stof(argv[i]);
-            params->lora.push_back(lora);
-        } else if (arg == "-t" || arg == "--threads") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params->n_threads = std::stoi(argv[i]);
-            if (params->n_threads <= 0) {
-                params->n_threads = std::thread::hardware_concurrency();
-            }
-        } else {
-            fprintf(stderr, "error: unknown argument: '%s'\n", arg.c_str());
-            export_lora_print_usage(argc, argv, &default_params);
-            exit(1);
+        auto general_arch_base = get_kv_str(base_model.ctx_gguf, "general.architecture");
+        auto general_arch_lora = get_kv_str(adapter->ctx_gguf, "general.architecture");
+        if (general_arch_base != general_arch_lora) {
+            throw std::runtime_error("model arch and LoRA arch mismatch");
        }
    }
 
-
-
-
-
-
-
-        fprintf(stderr, "error: please specify a filename for model-out.\n");
-        export_lora_print_usage(argc, argv, &default_params);
-        exit(1);
-    }
-    if (invalid_param) {
-        fprintf(stderr, "error: invalid parameter for argument: '%s'\n", arg.c_str());
-        export_lora_print_usage(argc, argv, &default_params);
-        exit(1);
+    ggml_type get_out_tensor_type(struct ggml_tensor * t) {
+        if (t->type == GGML_TYPE_F32) {
+            return GGML_TYPE_F32;
+        } else {
+            return GGML_TYPE_F16;
+        }
    }
-    return true;
-}
 
-
-
-
-
-
-
+    void run_merge() {
+        // prepare metadata
+        gguf_set_kv(ctx_out, base_model.ctx_gguf);
+        // output is forced to f16 for now
+        gguf_set_val_u32(ctx_out, "general.file_type", LLAMA_FTYPE_MOSTLY_F16);
+
+        // check if all lora adapters have the same tensors
+        // TODO: remove this when we can support merging subset of adapters. Ref: https://github.com/ggerganov/llama.cpp/pull/8607#discussion_r1686027777
+        static const char * err_no_subset_adapter = "Input adapters do not have the same list of tensors. This is not yet supported. Please merge the adapter one-by-one instead of merging all at once.";
+        if (adapters.size() > 1) {
+            for (size_t i = 1; i < adapters.size(); ++i) {
+                if (adapters[0]->tensors.size() != adapters[i]->tensors.size()) {
+                    throw std::runtime_error(err_no_subset_adapter);
+                }
+                for (auto & it : adapters[i]->tensors) {
+                    if (adapters[0]->get_tensor(it.first) == nullptr) {
+                        throw std::runtime_error(err_no_subset_adapter);
+                    }
+                }
+            }
+        }
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        // mapping base tensor to out tensor (same shape with base, but different type)
+        // if out_tensor == nullptr, we only copy it
+        std::vector<std::pair<struct ggml_tensor *, struct ggml_tensor *>> base_to_out_tensors;
+        for (auto & it : base_model.tensors) {
+            bool t_a = true;
+            bool t_b = true;
+            for (auto & adapter : adapters) {
+                t_a &= nullptr != adapter->get_tensor(it.first + ".lora_a");
+                t_b &= nullptr != adapter->get_tensor(it.first + ".lora_b");
+            }
+            auto base_tensor = it.second;
+            if (!t_a && !t_b) {
+                // only copy
+                struct ggml_tensor * cpy_tensor = ggml_dup_tensor(ctx_out_ggml, base_tensor);
+                ggml_set_name(cpy_tensor, base_tensor->name);
+                base_to_out_tensors.push_back(std::make_pair(cpy_tensor, nullptr));
+                gguf_add_tensor(ctx_out, cpy_tensor);
+            } else if (t_a && t_b) {
+                // need merging
+                struct ggml_tensor * out_tensor = ggml_new_tensor(
+                    ctx_out_ggml, get_out_tensor_type(base_tensor), GGML_MAX_DIMS, base_tensor->ne);
+                ggml_set_name(out_tensor, base_tensor->name);
+                base_to_out_tensors.push_back(std::make_pair(base_tensor, out_tensor));
+                gguf_add_tensor(ctx_out, out_tensor);
+            } else {
+                throw std::runtime_error("tensor " + it.first + " missing either lora_a or lora_b");
+            }
+        }
 
-
-
-
-
-
+        // placeholder for the meta data
+        {
+            size_t meta_size = gguf_get_meta_size(ctx_out);
+            zeros(fout, meta_size);
+        }
 
-
-
-
-
-
-
-
-
-
-        result->lora_alpha = file.read_u32();
-        // read tensor infos from file
-        std::vector<char> name_buf;
-        std::vector<struct ggml_tensor *> tensors;
-        std::vector<size_t> tensors_offset;
-        size_t total_nbytes_pad = 0;
-        while(!file.eof()) {
-            int64_t ne[4] = {1,1,1,1};
-            uint32_t n_dims = file.read_u32();
-            uint32_t namelen = file.read_u32();
-            uint32_t type = file.read_u32();
-            for (uint32_t k = 0; k < n_dims; ++k) {
-                ne[k] = (int64_t)file.read_u32();
+        // process base model tensors
+        size_t n_merged = 0;
+        for (auto & it : base_to_out_tensors) {
+            if (it.second != nullptr) {
+                merge_tensor(it.first, it.second);
+                n_merged++;
+            } else {
+                copy_tensor(it.first);
+            }
        }
-            name_buf.clear();
-            name_buf.resize(namelen + 1, '\0');
-            file.read_raw(name_buf.data(), namelen);
-            file.seek((0-file.tell()) & 31, SEEK_CUR);
-            size_t offset = file.tell();
-            struct ggml_tensor * tensor = ggml_new_tensor(result->ctx, (enum ggml_type) type, n_dims, ne);
-            ggml_set_name(tensor, name_buf.data());
-            size_t nbytes = ggml_nbytes(tensor);
-            size_t nbytes_pad = ggml_nbytes_pad(tensor);
-            total_nbytes_pad += nbytes_pad;
-            tensors.push_back(tensor);
-            tensors_offset.push_back(offset);
-            file.seek(nbytes, SEEK_CUR);
-        }
-        // read tensor data
-        result->data.resize(total_nbytes_pad);
-        size_t data_offset = 0;
-        for (size_t i = 0; i < tensors.size(); ++i) {
-            struct ggml_tensor * tensor = tensors[i];
-            size_t offset = tensors_offset[i];
-            size_t nbytes = ggml_nbytes(tensor);
-            size_t nbytes_pad = ggml_nbytes_pad(tensor);
-            file.seek(offset, SEEK_SET);
-            tensor->data = result->data.data() + data_offset;
-            file.read_raw(tensor->data, nbytes);
-            data_offset += nbytes_pad;
-        }
-        return result;
-    }
 
+        // write output metadata
+        {
+            std::vector<uint8_t> data(gguf_get_meta_size(ctx_out));
+            gguf_get_meta_data(ctx_out, data.data());
+            fout.seekp(0);
+            fout.write((const char *)data.data(), data.size());
+        }
 
-
-
-        struct ggml_tensor * tensor,
-        struct ggml_tensor * lora_a,
-        struct ggml_tensor * lora_b,
-        float scaling
-) {
-    struct ggml_tensor * ab = ggml_mul_mat(ctx, lora_a, lora_b);
-    if (scaling != 1.0f) {
-        ab = ggml_scale(ctx, ab, scaling);
+        printf("%s : merged %ld tensors with lora adapters\n", __func__, n_merged);
+        printf("%s : wrote %ld tensors to output file\n", __func__, base_to_out_tensors.size());
    }
-    struct ggml_tensor * res = ggml_add_inplace(ctx, tensor, ab);
-
-    struct ggml_cgraph * gf = ggml_new_graph(ctx);
-    ggml_build_forward_expand (gf, res);
-    return gf;
-}
 
-
-
-
-
-
-
-    std::string name_b = name + std::string(".loraB");
-    struct ggml_tensor * lora_a = ggml_get_tensor(lora->ctx, name_a.c_str());
-    struct ggml_tensor * lora_b = ggml_get_tensor(lora->ctx, name_b.c_str());
-    if (lora_a == NULL || lora_b == NULL) {
-        return false;
+    void copy_tensor(struct ggml_tensor * base) {
+        printf("%s : %s [%s]\n", __func__, base->name, ggml_ne_string(base).c_str());
+        size_t len = ggml_nbytes(base);
+        base_model.read_tensor_data(base->name, read_buf);
+        fout.write((char* )read_buf.data(), len);
+        zeros(fout, GGML_PAD(len, GGUF_DEFAULT_ALIGNMENT) - len);
    }
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    void merge_tensor(struct ggml_tensor * base, struct ggml_tensor * out) {
+        std::string name_base(base->name);
+        std::string name_lora_a = name_base + ".lora_a";
+        std::string name_lora_b = name_base + ".lora_b";
+
+        printf("%s : %s [%s]\n", __func__, base->name, ggml_ne_string(base).c_str());
+
+        // context for input tensor
+        std::vector<struct ggml_tensor *> inp_a(adapters.size());
+        std::vector<struct ggml_tensor *> inp_b(adapters.size());
+        struct ggml_init_params params {
+            /*.mem_size =*/ ggml_tensor_overhead()*(2+adapters.size()*2),
+            /*.mem_buffer =*/ NULL,
+            /*.no_alloc =*/ true,
+        };
+        struct ggml_context * ctx = ggml_init(params);
+
+        // alloc tensors
+        struct ggml_tensor * inp_base = ggml_new_tensor(ctx, GGML_TYPE_F32, GGML_MAX_DIMS, base->ne);
+        for (size_t i = 0; i < adapters.size(); ++i) {
+            auto t_a = adapters[i]->get_tensor(name_lora_a);
+            auto t_b = adapters[i]->get_tensor(name_lora_b);
+            inp_a[i] = ggml_dup_tensor(ctx, t_a);
+            inp_b[i] = ggml_dup_tensor(ctx, t_b);
+        }
+        ggml_backend_buffer_t buffer = ggml_backend_alloc_ctx_tensors(ctx, backend);
+
+        // load base tensor to backend buffer
+        base_model.read_tensor_data(name_base, read_buf);
+        if (base->type != GGML_TYPE_F32) {
+            // optionally dequantize it
+            printf("%s : + dequantize base tensor from %s to F32\n", __func__, ggml_type_name(base->type));
+            auto nels = ggml_nelements(inp_base);
+            ggml_type_traits_t qtype = ggml_internal_get_type_traits(base->type);
+            std::vector<uint8_t> dequant_buf(nels * sizeof(float));
+            qtype.to_float(read_buf.data(), (float *)dequant_buf.data(), nels);
+            ggml_backend_tensor_set(inp_base, dequant_buf.data(), 0, dequant_buf.size());
+        } else {
+            ggml_backend_tensor_set(inp_base, read_buf.data(), 0, ggml_nbytes(inp_base));
+        }
 
-
+        // load lora tensors to backend buffer
+        for (size_t i = 0; i < adapters.size(); ++i) {
+            adapters[i]->read_tensor_data(name_lora_a, read_buf);
+            ggml_backend_tensor_set(inp_a[i], read_buf.data(), 0, ggml_nbytes(inp_a[i]));
+            adapters[i]->read_tensor_data(name_lora_b, read_buf);
+            ggml_backend_tensor_set(inp_b[i], read_buf.data(), 0, ggml_nbytes(inp_b[i]));
+        }
 
-
-
-
-
+        // build graph
+        struct ggml_cgraph * gf;
+        {
+            static size_t buf_size = ggml_tensor_overhead()*GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead();
+            static std::vector<uint8_t> buf(buf_size);
+            struct ggml_init_params params0 = {
+                /*.mem_size =*/ buf_size,
+                /*.mem_buffer =*/ buf.data(),
+                /*.no_alloc =*/ true,
+            };
+            struct ggml_context * ctx0 = ggml_init(params0);
+            gf = ggml_new_graph(ctx0);
+            struct ggml_tensor * cur = inp_base;
+            for (size_t i = 0; i < adapters.size(); ++i) {
+                struct ggml_tensor * a_T = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_cast(ctx0, inp_a[i], GGML_TYPE_F32)));
+                struct ggml_tensor * delta = ggml_mul_mat(ctx0, a_T, ggml_cast(ctx0, inp_b[i], GGML_TYPE_F32));
+                // scale
+                const float alpha = adapters[i]->alpha;
+                const float rank = (float) inp_b[i]->ne[0];
+                const float scale = alpha ? adapters[i]->scale * alpha / rank : adapters[i]->scale;
+                delta = ggml_scale(ctx0, delta, scale);
+                cur = ggml_add(ctx0, delta, cur);
+                printf("%s : + merging from adapter[%ld] type=%s\n", __func__, i, ggml_type_name(inp_a[i]->type));
+                printf("%s : input_scale=%f calculated_scale=%f rank=%d\n", __func__, adapters[i]->scale, scale, (int) inp_b[i]->ne[0]);
+            }
+            cur = ggml_cast(ctx0, cur, out->type);
+            printf("%s : + output type is %s\n", __func__, ggml_type_name(out->type));
+            ggml_build_forward_expand(gf, cur);
+            ggml_free(ctx0);
+        }
 
-
-
-
-
-
-        if (lora != NULL) {
-            loras.push_back(lora);
+        // compute
+        {
+            ggml_gallocr_alloc_graph(allocr, gf);
+            ggml_backend_cpu_set_n_threads(backend, n_threads);
+            ggml_backend_graph_compute(backend, gf);
        }
-    }
-    if (loras.size() == 0) {
-        fprintf(stderr, "warning: no lora adapters will be applied.\n");
-    }
 
-
-
-
-
-
+        // write data to output file
+        {
+            auto result = gf->nodes[gf->n_nodes - 1];
+            size_t len = ggml_nbytes(result);
+            if (read_buf.size() < len) {
+                read_buf.resize(len);
+            }
+            ggml_backend_tensor_get(result, read_buf.data(), 0, len);
+            fout.write((char* )read_buf.data(), len);
+            zeros(fout, GGML_PAD(len, GGUF_DEFAULT_ALIGNMENT) - len);
+        }
 
-
-
-    struct gguf_init_params params_gguf;
-    params_gguf.no_alloc = true;
-    params_gguf.ctx = &ctx_in;
-    struct gguf_context * gguf_in = gguf_init_from_file(params->fn_model_base.c_str(), params_gguf);
-
-    // create new gguf
-    struct gguf_context * gguf_out = gguf_init_empty();
-
-    // copy meta data from base model: kv and tensors
-    gguf_set_kv(gguf_out, gguf_in);
-    int n_tensors = gguf_get_n_tensors(gguf_in);
-    for (int i=0; i < n_tensors; ++i) {
-        const char * name = gguf_get_tensor_name(gguf_in, i);
-        struct ggml_tensor * tensor = ggml_get_tensor(ctx_in, name);
-        gguf_add_tensor(gguf_out, tensor);
+        ggml_free(ctx);
+        ggml_backend_buffer_free(buffer);
    }
 
-
-
-
-
+    ~lora_merge_ctx() {
+        ggml_gallocr_free(allocr);
+        ggml_backend_free(backend);
+        gguf_free(ctx_out);
+        ggml_free(ctx_out_ggml);
    }
+};
 
-
-
-    meta.resize(gguf_get_meta_size(gguf_out));
-    gguf_get_meta_data(gguf_out, meta.data());
-    fout.write_raw(meta.data(), meta.size());
-
-    std::vector<uint8_t> data;
-    std::vector<uint8_t> padding;
-    for (int i=0; i < n_tensors; ++i) {
-        const char * name = gguf_get_tensor_name(gguf_in, i);
-        struct ggml_tensor * tensor = ggml_get_tensor(ctx_in, name);
-
-        // read tensor data
-        data.resize(ggml_nbytes(tensor));
-        tensor->data = data.data();
-        size_t offset = gguf_get_tensor_offset(gguf_in, i);
-        fin.seek(offset + meta.size(), SEEK_SET);
-        fin.read_raw(data.data(), data.size());
-
-        // apply all loras
-        for (size_t k = 0; k < loras.size(); ++k) {
-            apply_lora(tensor, loras[k], params->n_threads);
-        }
-
-        // write tensor data + padding
-        padding.clear();
-        padding.resize(GGML_PAD(data.size(), gguf_get_alignment(gguf_out)) - data.size(), 0);
-
-        GGML_ASSERT(fout.tell() == offset + meta.size());
-        // fout.seek(offset + meta.size(), SEEK_SET);
-        fout.write_raw(data.data(), data.size());
-        fout.write_raw(padding.data(), padding.size());
+static void print_usage(int argc, char ** argv, const gpt_params & params) {
+    gpt_params_print_usage(argc, argv, params);
 
-
-
-
-    }
+    printf("\nexample usage:\n");
+    printf("\n %s -m base-model.gguf --lora lora-file.gguf -o merged-model-f16.gguf\n", argv[0]);
+    printf("\nNOTE: output model is F16\n");
    printf("\n");
-
-    // close gguf
-    gguf_free(gguf_out);
-    gguf_free(gguf_in);
-
-    // free loras
-    for (size_t i = 0; i < loras.size(); ++i) {
-        free_lora(loras[i]);
-    }
 }
 
 int main(int argc, char ** argv) {
-
+    gpt_params params;
 
-    if (!
+    if (!gpt_params_parse(argc, argv, params)) {
+        print_usage(argc, argv, params);
        return 1;
    }
 
-
+    g_verbose = (params.verbosity == 1);
+    try {
+        lora_merge_ctx ctx(params.model, params.lora_adapter, params.lora_outfile, params.n_threads);
+        ctx.run_merge();
+    } catch (const std::exception & err) {
+        fprintf(stderr, "%s\n", err.what());
+        exit(EXIT_FAILURE);
+    }
+
+    printf("done, output file is %s\n", params.lora_outfile.c_str());
 
    return 0;
 }