@fugood/llama.node 0.2.2 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +5 -2
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +8 -1
- package/package.json +1 -1
- package/patches/llama.patch +12 -12
- package/src/DetokenizeWorker.cpp +1 -1
- package/src/LlamaContext.cpp +33 -1
- package/src/LlamaContext.h +1 -0
- package/src/LoadSessionWorker.cpp +1 -0
- package/src/llama.cpp/.github/workflows/bench.yml +310 -0
- package/src/llama.cpp/.github/workflows/build.yml +1315 -0
- package/src/llama.cpp/.github/workflows/close-issue.yml +23 -0
- package/src/llama.cpp/.github/workflows/docker.yml +116 -0
- package/src/llama.cpp/.github/workflows/editorconfig.yml +27 -0
- package/src/llama.cpp/.github/workflows/gguf-publish.yml +44 -0
- package/src/llama.cpp/.github/workflows/labeler.yml +17 -0
- package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +65 -0
- package/src/llama.cpp/.github/workflows/nix-ci.yml +72 -0
- package/src/llama.cpp/.github/workflows/nix-flake-update.yml +22 -0
- package/src/llama.cpp/.github/workflows/nix-publish-flake.yml +36 -0
- package/src/llama.cpp/.github/workflows/python-check-requirements.yml +35 -0
- package/src/llama.cpp/.github/workflows/python-lint.yml +23 -0
- package/src/llama.cpp/.github/workflows/python-type-check.yml +38 -0
- package/src/llama.cpp/.github/workflows/server.yml +183 -0
- package/src/llama.cpp/CMakeLists.txt +91 -1245
- package/src/llama.cpp/cmake/arm64-windows-llvm.cmake +1 -1
- package/src/llama.cpp/cmake/build-info.cmake +58 -0
- package/src/llama.cpp/cmake/git-vars.cmake +22 -0
- package/src/llama.cpp/common/CMakeLists.txt +4 -3
- package/src/llama.cpp/common/build-info.cpp.in +4 -0
- package/src/llama.cpp/common/common.cpp +1116 -877
- package/src/llama.cpp/common/common.h +191 -77
- package/src/llama.cpp/common/grammar-parser.cpp +118 -31
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +346 -65
- package/src/llama.cpp/common/log.h +1 -1
- package/src/llama.cpp/common/ngram-cache.h +10 -3
- package/src/llama.cpp/common/sampling.cpp +19 -10
- package/src/llama.cpp/docs/build.md +353 -0
- package/src/llama.cpp/examples/CMakeLists.txt +22 -22
- package/src/llama.cpp/examples/baby-llama/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +6 -6
- package/src/llama.cpp/examples/batched/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/batched/batched.cpp +52 -55
- package/src/llama.cpp/examples/batched-bench/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +20 -72
- package/src/llama.cpp/examples/benchmark/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/chat-13B.bat +57 -0
- package/src/llama.cpp/examples/convert-llama2c-to-ggml/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/{finetune → cvector-generator}/CMakeLists.txt +2 -2
- package/src/llama.cpp/examples/cvector-generator/completions.txt +582 -0
- package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +503 -0
- package/src/llama.cpp/examples/cvector-generator/mean.hpp +48 -0
- package/src/llama.cpp/examples/cvector-generator/negative.txt +4 -0
- package/src/llama.cpp/examples/cvector-generator/pca.hpp +325 -0
- package/src/llama.cpp/examples/cvector-generator/positive.txt +4 -0
- package/src/llama.cpp/examples/deprecation-warning/deprecation-warning.cpp +35 -0
- package/src/llama.cpp/examples/embedding/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/embedding/embedding.cpp +94 -46
- package/src/llama.cpp/examples/eval-callback/CMakeLists.txt +2 -2
- package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +4 -6
- package/src/llama.cpp/examples/export-lora/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +344 -386
- package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +2 -2
- package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +30 -25
- package/src/llama.cpp/examples/gguf/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gguf/gguf.cpp +5 -0
- package/src/llama.cpp/examples/gguf-hash/CMakeLists.txt +15 -0
- package/src/llama.cpp/examples/gguf-hash/deps/rotate-bits/rotate-bits.h +46 -0
- package/src/llama.cpp/examples/gguf-hash/deps/sha1/sha1.c +295 -0
- package/src/llama.cpp/examples/gguf-hash/deps/sha1/sha1.h +52 -0
- package/src/llama.cpp/examples/gguf-hash/deps/sha256/sha256.c +221 -0
- package/src/llama.cpp/examples/gguf-hash/deps/sha256/sha256.h +24 -0
- package/src/llama.cpp/examples/gguf-hash/deps/xxhash/xxhash.c +42 -0
- package/src/llama.cpp/examples/gguf-hash/deps/xxhash/xxhash.h +7093 -0
- package/src/llama.cpp/examples/gguf-hash/gguf-hash.cpp +693 -0
- package/src/llama.cpp/examples/gguf-split/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +3 -3
- package/src/llama.cpp/examples/gritlm/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +6 -2
- package/src/llama.cpp/examples/imatrix/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +137 -176
- package/src/llama.cpp/examples/infill/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/infill/infill.cpp +38 -153
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +175 -94
- package/src/llama.cpp/examples/llama.android/app/build.gradle.kts +65 -0
- package/src/llama.cpp/examples/llama.android/build.gradle.kts +6 -0
- package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +68 -0
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/CMakeLists.txt +11 -7
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +2 -2
- package/src/llama.cpp/examples/llama.android/settings.gradle.kts +18 -0
- package/src/llama.cpp/examples/llava/CMakeLists.txt +6 -5
- package/src/llama.cpp/examples/llava/android/build_64.sh +8 -0
- package/src/llama.cpp/examples/llava/clip.cpp +23 -14
- package/src/llama.cpp/examples/llava/llava-cli.cpp +8 -6
- package/src/llama.cpp/examples/llava/requirements.txt +3 -2
- package/src/llama.cpp/examples/lookahead/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +2 -1
- package/src/llama.cpp/examples/lookup/CMakeLists.txt +4 -4
- package/src/llama.cpp/examples/lookup/lookup-create.cpp +2 -0
- package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
- package/src/llama.cpp/examples/lookup/lookup-stats.cpp +2 -2
- package/src/llama.cpp/examples/lookup/lookup.cpp +1 -1
- package/src/llama.cpp/examples/main/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/main/main.cpp +98 -75
- package/src/llama.cpp/examples/main-cmake-pkg/CMakeLists.txt +4 -5
- package/src/llama.cpp/examples/parallel/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/parallel/parallel.cpp +2 -1
- package/src/llama.cpp/examples/passkey/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/passkey/passkey.cpp +23 -43
- package/src/llama.cpp/examples/perplexity/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +13 -10
- package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/quantize/quantize.cpp +37 -34
- package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +1 -1
- package/src/llama.cpp/examples/retrieval/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/retrieval/retrieval.cpp +26 -77
- package/src/llama.cpp/examples/save-load-state/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +14 -7
- package/src/llama.cpp/examples/server/CMakeLists.txt +26 -2
- package/src/llama.cpp/examples/server/server.cpp +274 -671
- package/src/llama.cpp/examples/server/tests/requirements.txt +2 -2
- package/src/llama.cpp/examples/server/utils.hpp +28 -29
- package/src/llama.cpp/examples/simple/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/simple/simple.cpp +21 -29
- package/src/llama.cpp/examples/speculative/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/speculative/speculative.cpp +2 -1
- package/src/llama.cpp/examples/sycl/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/sycl/build.sh +23 -0
- package/src/llama.cpp/examples/sycl/run-llama2.sh +36 -0
- package/src/llama.cpp/examples/sycl/win-build-sycl.bat +33 -0
- package/src/llama.cpp/examples/sycl/win-run-llama2.bat +9 -0
- package/src/llama.cpp/examples/tokenize/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/tokenize/tokenize.cpp +16 -2
- package/src/llama.cpp/ggml/CMakeLists.txt +253 -0
- package/src/llama.cpp/{cmake → ggml/cmake}/FindSIMD.cmake +6 -6
- package/src/llama.cpp/{ggml-backend.h → ggml/include/ggml-backend.h} +22 -17
- package/src/llama.cpp/ggml/include/ggml-blas.h +23 -0
- package/src/llama.cpp/ggml/include/ggml-cann.h +125 -0
- package/src/llama.cpp/{ggml-cuda.h → ggml/include/ggml-cuda.h} +3 -0
- package/src/llama.cpp/{ggml-metal.h → ggml/include/ggml-metal.h} +1 -2
- package/src/llama.cpp/{ggml-sycl.h → ggml/include/ggml-sycl.h} +3 -10
- package/src/llama.cpp/{ggml.h → ggml/include/ggml.h} +80 -85
- package/src/llama.cpp/ggml/src/CMakeLists.txt +1329 -0
- package/src/llama.cpp/ggml/src/ggml-aarch64.c +2193 -0
- package/src/llama.cpp/ggml/src/ggml-aarch64.h +39 -0
- package/src/llama.cpp/{ggml-alloc.c → ggml/src/ggml-alloc.c} +100 -49
- package/src/llama.cpp/{ggml-backend-impl.h → ggml/src/ggml-backend-impl.h} +20 -8
- package/src/llama.cpp/{ggml-backend.c → ggml/src/ggml-backend.c} +307 -167
- package/src/llama.cpp/ggml/src/ggml-blas.cpp +367 -0
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +198 -0
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +230 -0
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +2944 -0
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +592 -0
- package/src/llama.cpp/ggml/src/ggml-cann/common.h +282 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +32 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +17 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +223 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +186 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +180 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +193 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +191 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +208 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +206 -0
- package/src/llama.cpp/ggml/src/ggml-cann.cpp +2023 -0
- package/src/llama.cpp/{ggml-common.h → ggml/src/ggml-common.h} +41 -7
- package/src/llama.cpp/{ggml-impl.h → ggml/src/ggml-impl.h} +113 -9
- package/src/llama.cpp/{ggml-kompute.cpp → ggml/src/ggml-kompute.cpp} +33 -18
- package/src/llama.cpp/{ggml-quants.c → ggml/src/ggml-quants.c} +1460 -940
- package/src/llama.cpp/{ggml-quants.h → ggml/src/ggml-quants.h} +19 -20
- package/src/llama.cpp/{ggml-rpc.cpp → ggml/src/ggml-rpc.cpp} +95 -72
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +27 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +53 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +355 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +195 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/concat.hpp +21 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +547 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +27 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +698 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +1023 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.hpp +27 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +3011 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +3031 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/mmq.hpp +33 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +1027 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.hpp +27 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +374 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +35 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +66 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +275 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.hpp +22 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +251 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/softmax.hpp +24 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +1140 -0
- package/src/llama.cpp/ggml/src/ggml-sycl.cpp +5314 -0
- package/src/llama.cpp/{ggml-vulkan.cpp → ggml/src/ggml-vulkan.cpp} +1781 -1868
- package/src/llama.cpp/{ggml.c → ggml/src/ggml.c} +1245 -2087
- package/src/llama.cpp/{sgemm.cpp → ggml/src/llamafile/sgemm.cpp} +21 -24
- package/src/llama.cpp/{sgemm.h → ggml/src/llamafile/sgemm.h} +1 -1
- package/src/llama.cpp/ggml/src/vulkan-shaders/CMakeLists.txt +5 -0
- package/src/llama.cpp/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp +552 -0
- package/src/llama.cpp/{llama.h → include/llama.h} +175 -100
- package/src/llama.cpp/models/.editorconfig +1 -0
- package/src/llama.cpp/models/ggml-vocab-aquila.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-baichuan.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-command-r.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-command-r.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-command-r.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-falcon.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-falcon.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-falcon.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-gpt-neox.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-mpt.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-mpt.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-mpt.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-phi-3.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-phi-3.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-phi-3.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-qwen2.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-qwen2.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-qwen2.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-refact.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-refact.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-refact.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-starcoder.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-starcoder.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-starcoder.gguf.out +46 -0
- package/src/llama.cpp/pocs/vdot/CMakeLists.txt +2 -2
- package/src/llama.cpp/requirements/requirements-all.txt +12 -0
- package/src/llama.cpp/requirements/requirements-compare-llama-bench.txt +2 -0
- package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf.txt +3 -0
- package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf_update.txt +3 -0
- package/src/llama.cpp/requirements/{requirements-convert.txt → requirements-convert_legacy_llama.txt} +1 -1
- package/src/llama.cpp/requirements/requirements-convert_llama_ggml_to_gguf.txt +1 -0
- package/src/llama.cpp/requirements/requirements-convert_lora_to_gguf.txt +2 -0
- package/src/llama.cpp/requirements/requirements-pydantic.txt +3 -0
- package/src/llama.cpp/requirements/requirements-test-tokenizer-random.txt +1 -0
- package/src/llama.cpp/requirements.txt +5 -4
- package/src/llama.cpp/scripts/build-info.sh +30 -0
- package/src/llama.cpp/scripts/install-oneapi.bat +19 -0
- package/src/llama.cpp/src/CMakeLists.txt +33 -0
- package/src/llama.cpp/src/llama-grammar.cpp +539 -0
- package/src/llama.cpp/src/llama-grammar.h +39 -0
- package/src/llama.cpp/src/llama-impl.h +26 -0
- package/src/llama.cpp/src/llama-sampling.cpp +635 -0
- package/src/llama.cpp/src/llama-sampling.h +56 -0
- package/src/llama.cpp/src/llama-vocab.cpp +1721 -0
- package/src/llama.cpp/src/llama-vocab.h +130 -0
- package/src/llama.cpp/{llama.cpp → src/llama.cpp} +5979 -5260
- package/src/llama.cpp/{unicode-data.cpp → src/unicode-data.cpp} +851 -802
- package/src/llama.cpp/{unicode.cpp → src/unicode.cpp} +52 -30
- package/src/llama.cpp/{unicode.h → src/unicode.h} +5 -1
- package/src/llama.cpp/tests/CMakeLists.txt +19 -20
- package/src/llama.cpp/tests/test-backend-ops.cpp +245 -67
- package/src/llama.cpp/tests/test-chat-template.cpp +57 -3
- package/src/llama.cpp/tests/test-double-float.cpp +2 -2
- package/src/llama.cpp/tests/test-grad0.cpp +2 -2
- package/src/llama.cpp/tests/test-grammar-integration.cpp +978 -31
- package/src/llama.cpp/tests/test-grammar-parser.cpp +423 -158
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +508 -135
- package/src/llama.cpp/tests/test-llama-grammar.cpp +15 -9
- package/src/llama.cpp/tests/test-quantize-fns.cpp +1 -1
- package/src/llama.cpp/tests/test-quantize-perf.cpp +1 -1
- package/src/llama.cpp/tests/test-rope.cpp +3 -4
- package/src/llama.cpp/tests/test-sampling.cpp +5 -5
- package/src/llama.cpp/tests/test-tokenizer-0.cpp +6 -6
- package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +20 -15
- package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +22 -11
- package/bin/darwin/arm64/default.metallib +0 -0
- package/bin/darwin/x64/default.metallib +0 -0
- package/src/llama.cpp/examples/beam-search/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/beam-search/beam-search.cpp +0 -188
- package/src/llama.cpp/examples/finetune/finetune.cpp +0 -1862
- package/src/llama.cpp/examples/llama.android/llama/CMakeLists.txt +0 -55
- package/src/llama.cpp/examples/train-text-from-scratch/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/train-text-from-scratch/train-text-from-scratch.cpp +0 -1253
- package/src/llama.cpp/ggml-opencl.cpp +0 -2305
- package/src/llama.cpp/ggml-opencl.h +0 -36
- package/src/llama.cpp/ggml-sycl.cpp +0 -17340
- package/src/llama.cpp/ggml-vulkan-shaders.hpp +0 -81211
- package/src/llama.cpp/requirements/requirements-convert-hf-to-gguf-update.txt +0 -2
- package/src/llama.cpp/requirements/requirements-convert-hf-to-gguf.txt +0 -2
- package/src/llama.cpp/requirements/requirements-convert-llama-ggml-to-gguf.txt +0 -1
- package/src/llama.cpp/scripts/gen-build-info-cpp.cmake +0 -24
- /package/src/llama.cpp/{ggml-alloc.h → ggml/include/ggml-alloc.h} +0 -0
- /package/src/llama.cpp/{ggml-kompute.h → ggml/include/ggml-kompute.h} +0 -0
- /package/src/llama.cpp/{ggml-rpc.h → ggml/include/ggml-rpc.h} +0 -0
- /package/src/llama.cpp/{ggml-vulkan.h → ggml/include/ggml-vulkan.h} +0 -0
- /package/src/llama.cpp/{unicode-data.h → src/unicode-data.h} +0 -0
The diff hunks below are from package/src/llama.cpp/examples/infill/infill.cpp (+38 -153):

@@ -107,6 +107,7 @@ int main(int argc, char ** argv) {
     g_params = &params;
 
     if (!gpt_params_parse(argc, argv, params)) {
+        gpt_params_print_usage(argc, argv, params);
         return 1;
     }
 
@@ -139,27 +140,6 @@ int main(int argc, char ** argv) {
         LOG_TEE("%s: warning: minimum context size is 8, using minimum size.\n", __func__);
         params.n_ctx = 8;
     }
-    if (params.instruct) {
-        printf("\n************\n");
-        printf("%s: please use the 'main' tool for instruct mode\n", __func__);
-        printf("************\n\n");
-
-        return 0;
-    }
-    if (params.chatml) {
-        printf("\n************\n");
-        printf("%s: please use the 'main' tool for chatml mode\n", __func__);
-        printf("************\n\n");
-
-        return 0;
-    }
-    if (!params.antiprompt.empty()) {
-        printf("\n************\n");
-        printf("%s: please use the 'main' tool for antiprompt mode\n", __func__);
-        printf("************\n\n");
-
-        return 0;
-    }
     if (!params.interactive_first && (params.input_prefix.empty() && params.input_suffix.empty())) {
         printf("\n************\n");
         printf("%s: please use '--interactive_first' or specify '--in_prefix' and/or '--in_suffix'\n", __func__);
@@ -167,20 +147,6 @@ int main(int argc, char ** argv) {
 
         return 0;
     }
-    if (params.random_prompt) {
-        printf("\n************\n");
-        printf("%s: please use the 'main' tool for random prompt mode\n", __func__);
-        printf("************\n\n");
-
-        return 0;
-    }
-    if (!params.path_prompt_cache.empty()) {
-        printf("\n************\n");
-        printf("%s: infill does not support prompt caching\n", __func__);
-        printf("************\n\n");
-
-        return 0;
-    }
 
     if (params.rope_freq_base != 0.0) {
         LOG_TEE("%s: warning: changing RoPE frequency base to %g.\n", __func__, params.rope_freq_base);
@@ -207,17 +173,13 @@ int main(int argc, char ** argv) {
 
     llama_model * model;
     llama_context * ctx;
-
+
     g_model = &model;
     g_ctx = &ctx;
 
     // load the model and apply lora adapter, if any
     LOG("%s: load the model and apply lora adapter, if any\n", __func__);
     std::tie(model, ctx) = llama_init_from_gpt_params(params);
-    if (sparams.cfg_scale > 1.f) {
-        struct llama_context_params lparams = llama_context_params_from_gpt_params(params);
-        ctx_guidance = llama_new_context_with_model(model, lparams);
-    }
 
     if (model == NULL) {
         LOG_TEE("%s: error: unable to load model\n", __func__);
@@ -242,26 +204,28 @@ int main(int argc, char ** argv) {
     GGML_ASSERT(llama_add_eos_token(model) != 1);
     LOG("add_bos: %d\n", add_bos);
 
-    bool suff_rm_leading_spc = params.escape;
-    if (suff_rm_leading_spc && params.input_suffix.find_first_of(' ') == 0 && params.input_suffix.size() > 1) {
-        params.input_suffix.erase(0, 1);
-        suff_rm_leading_spc = false;
-    }
     std::vector<llama_token> embd_inp;
+    std::vector<llama_token> embd_end;
     std::vector<llama_token> inp_pfx = ::llama_tokenize(ctx, params.input_prefix, false);
     std::vector<llama_token> inp_sfx = ::llama_tokenize(ctx, params.input_suffix, false);
-
-
-
-
+
+    GGML_ASSERT(llama_token_prefix(model) >= 0);
+    GGML_ASSERT(llama_token_suffix(model) >= 0);
+
     inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(model));
+    inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(model));
+
+    embd_inp = params.spm_infill ? inp_sfx : inp_pfx;
+    embd_end = params.spm_infill ? inp_pfx : inp_sfx;
     if (add_bos) {
-
+        embd_inp.insert(embd_inp.begin(), llama_token_bos(model));
+    }
+    embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end());
+
+    const llama_token middle_token = llama_token_middle(model);
+    if (middle_token >= 0) {
+        embd_inp.push_back(middle_token);
     }
-    inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(model));
-    embd_inp = inp_pfx;
-    embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end());
-    embd_inp.push_back(llama_token_middle(model));
 
     LOG("prefix: \"%s\"\n", log_tostr(params.input_prefix));
     LOG("suffix: \"%s\"\n", log_tostr(params.input_suffix));
@@ -273,25 +237,6 @@ int main(int argc, char ** argv) {
         LOG("embd_inp was considered empty and bos was added: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str());
     }
 
-    // Tokenize negative prompt
-    std::vector<llama_token> guidance_inp;
-    int guidance_offset = 0;
-    int original_prompt_len = 0;
-    if (ctx_guidance) {
-        LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(sparams.cfg_negative_prompt));
-
-        guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, true);
-        LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp).c_str());
-
-        std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, true);
-        LOG("original_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, original_inp).c_str());
-
-        original_prompt_len = original_inp.size();
-        guidance_offset = (int)guidance_inp.size() - original_prompt_len;
-        LOG("original_prompt_len: %s", log_tostr(original_prompt_len));
-        LOG("guidance_offset: %s", log_tostr(guidance_offset));
-    }
-
     if ((int) embd_inp.size() > n_ctx - 4) {
         LOG_TEE("%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4);
         return 1;
@@ -319,15 +264,6 @@ int main(int argc, char ** argv) {
             LOG_TEE("%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str());
         }
 
-        if (ctx_guidance) {
-            LOG_TEE("\n");
-            LOG_TEE("%s: negative prompt: '%s'\n", __func__, sparams.cfg_negative_prompt.c_str());
-            LOG_TEE("%s: number of tokens in negative prompt = %zu\n", __func__, guidance_inp.size());
-            for (int i = 0; i < (int) guidance_inp.size(); i++) {
-                LOG_TEE("%6d -> '%s'\n", guidance_inp[i], llama_token_to_piece(ctx, guidance_inp[i]).c_str());
-            }
-        }
-
         if (params.n_keep > 0) {
             LOG_TEE("%s: static prompt based on n_keep: '", __func__);
             for (int i = 0; i < params.n_keep; i++) {
@@ -395,12 +331,11 @@ int main(int argc, char ** argv) {
         is_interacting = params.interactive_first;
     }
 
-    bool input_echo
+    bool input_echo = true;
 
-    int n_past
-    int n_remain
-    int n_consumed
-    int n_past_guidance = 0;
+    int n_past = 0;
+    int n_remain = params.n_predict;
+    int n_consumed = 0;
 
     std::vector<int> input_tokens; g_input_tokens = &input_tokens;
     std::vector<int> output_tokens; g_output_tokens = &output_tokens;
@@ -410,7 +345,6 @@ int main(int argc, char ** argv) {
     console::set_display(console::prompt);
 
     std::vector<llama_token> embd;
-    std::vector<llama_token> embd_guidance;
 
     struct llama_sampling_context * ctx_sampling = llama_sampling_init(sparams);
 
@@ -436,7 +370,7 @@ int main(int argc, char ** argv) {
             // if we run out of context:
            // - take the n_keep first tokens from the original prompt (via n_past)
            // - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches
-            if (n_past + (int) embd.size()
+            if (n_past + (int) embd.size() > n_ctx) {
                 if (params.n_predict == -2) {
                     LOG_TEE("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict);
                     break;
@@ -453,11 +387,7 @@ int main(int argc, char ** argv) {
 
                 n_past -= n_discard;
 
-
-                    n_past_guidance -= n_discard;
-                }
-
-                LOG("after swap: n_past = %d, n_past_guidance = %d\n", n_past, n_past_guidance);
+                LOG("after swap: n_past = %d\n", n_past);
 
                 LOG("embd: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str());
 
@@ -465,45 +395,6 @@ int main(int argc, char ** argv) {
 
             // evaluate tokens in batches
            // embd is typically prepared beforehand to fit within a batch, but not always
-
-            if (ctx_guidance) {
-                int input_size = 0;
-                llama_token * input_buf = NULL;
-
-                if (n_past_guidance < (int) guidance_inp.size()) {
-                    // Guidance context should have the same data with these modifications:
-                    //
-                    // * Replace the initial prompt
-                    // * Shift everything by guidance_offset
-                    embd_guidance = guidance_inp;
-                    if (embd.begin() + original_prompt_len < embd.end()) {
-                        embd_guidance.insert(
-                            embd_guidance.end(),
-                            embd.begin() + original_prompt_len,
-                            embd.end()
-                        );
-                    }
-
-                    input_buf = embd_guidance.data();
-                    input_size = embd_guidance.size();
-
-                    LOG("guidance context: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_guidance).c_str());
-                } else {
-                    input_buf = embd.data();
-                    input_size = embd.size();
-                }
-
-                for (int i = 0; i < input_size; i += params.n_batch) {
-                    int n_eval = std::min(input_size - i, params.n_batch);
-                    if (llama_decode(ctx_guidance, llama_batch_get_one(input_buf + i, n_eval, n_past_guidance, 0))) {
-                        LOG_TEE("%s : failed to eval\n", __func__);
-                        return 1;
-                    }
-
-                    n_past_guidance += n_eval;
-                }
-            }
-
             for (int i = 0; i < (int) embd.size(); i += params.n_batch) {
                 int n_eval = (int) embd.size() - i;
                 if (n_eval > params.n_batch) {
@@ -525,11 +416,9 @@ int main(int argc, char ** argv) {
         }
 
         embd.clear();
-        embd_guidance.clear();
 
         if ((int) embd_inp.size() <= n_consumed && !is_interacting) {
-
-            const llama_token id = llama_sampling_sample(ctx_sampling, ctx, ctx_guidance);
+            const llama_token id = llama_sampling_sample(ctx_sampling, ctx, nullptr);
 
             llama_sampling_accept(ctx_sampling, ctx, id, true);
 
@@ -583,7 +472,6 @@ int main(int argc, char ** argv) {
 
         // if not currently processing queued inputs;
         if ((int) embd_inp.size() <= n_consumed) {
-
            // deal with eot token in infill mode
            if ((llama_sampling_last(ctx_sampling) == llama_token_eot(model) || is_interacting) && params.interactive){
                 if (is_interacting && !params.interactive_first) {
@@ -624,27 +512,26 @@ int main(int argc, char ** argv) {
                     string_process_escapes(params.input_prefix);
                     string_process_escapes(params.input_suffix);
                 }
-
-                if (suff_rm_leading_spc && params.input_suffix.find_first_of(' ') == 0 && params.input_suffix.size() > 1) {
-                    params.input_suffix.erase(0, 1);
-                    suff_rm_leading_spc = false;
-                }
+
                 // tokenize new prefix and suffix
                 std::vector<llama_token> inp_pfx = ::llama_tokenize(ctx, params.input_prefix, false);
                 std::vector<llama_token> inp_sfx = ::llama_tokenize(ctx, params.input_suffix, false);
-
-                    inp_sfx.erase(inp_sfx.begin());
-                }
+
                 inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(model));
+                inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(model));
+
+                embd_inp = params.spm_infill ? inp_sfx : inp_pfx;
+                embd_end = params.spm_infill ? inp_pfx : inp_sfx;
                 if (add_bos) {
-
+                    embd_inp.insert(embd_inp.begin(), llama_token_bos(model));
                 }
-
-
-
-
+                embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end());
+
+                if (middle_token >= 0) {
+                    embd_inp.push_back(middle_token);
+                }
+
                 embd.clear();
-                embd_guidance.clear();
                 n_remain = params.n_predict;
                 n_past = 0;
                 n_consumed = 0;
@@ -751,7 +638,6 @@ int main(int argc, char ** argv) {
     llama_print_timings(ctx);
     write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens);
 
-    if (ctx_guidance) { llama_free(ctx_guidance); }
    llama_free(ctx);
    llama_free_model(model);
 
@@ -764,4 +650,3 @@ int main(int argc, char ** argv) {
 
     return 0;
 }
-
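The net effect of the added lines above is a new infill (fill-in-the-middle) prompt layout: prefix and suffix segments each get their FIM marker token, the segment order can be swapped via spm_infill, and the middle marker is appended only when the vocabulary defines one. The sketch below condenses that logic into one function. It is illustrative only: the llama_token_prefix/suffix/middle/bos helpers and ::llama_tokenize are the llama.cpp calls visible in the diff, while build_infill_prompt itself is a hypothetical wrapper, not part of the upstream API.

    // Minimal sketch of the new infill prompt assembly, assuming the
    // llama.cpp token helpers shown in the diff above.
    #include <vector>
    #include "llama.h"

    static std::vector<llama_token> build_infill_prompt(
            const llama_model * model,
            std::vector<llama_token> inp_pfx,   // tokenized prefix text (no special tokens)
            std::vector<llama_token> inp_sfx,   // tokenized suffix text (no special tokens)
            bool spm_infill,                    // true: suffix-prefix-middle order, false: prefix-suffix-middle
            bool add_bos) {
        // prepend the FIM prefix/suffix marker tokens to each segment
        inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(model));
        inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(model));

        // choose which segment comes first (PSM by default, SPM when requested)
        std::vector<llama_token> embd_inp = spm_infill ? inp_sfx : inp_pfx;
        std::vector<llama_token> embd_end = spm_infill ? inp_pfx : inp_sfx;

        if (add_bos) {
            embd_inp.insert(embd_inp.begin(), llama_token_bos(model));
        }
        embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end());

        // the middle marker is appended only if the vocab actually defines one
        const llama_token middle_token = llama_token_middle(model);
        if (middle_token >= 0) {
            embd_inp.push_back(middle_token);
        }
        return embd_inp;
    }

The same assembly appears twice in the diff (once for the initial prompt, once when new prefix/suffix text is entered interactively), which is why the interactive branch reuses embd_end and middle_token rather than recomputing the layout from scratch.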