@fugood/llama.node 0.2.3 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +6 -3
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +8 -1
- package/package.json +3 -3
- package/patches/llama.patch +12 -12
- package/src/DetokenizeWorker.cpp +1 -1
- package/src/LlamaContext.cpp +33 -1
- package/src/LlamaContext.h +1 -0
- package/src/llama.cpp/.github/workflows/bench.yml +310 -0
- package/src/llama.cpp/.github/workflows/build.yml +1315 -0
- package/src/llama.cpp/.github/workflows/close-issue.yml +23 -0
- package/src/llama.cpp/.github/workflows/docker.yml +116 -0
- package/src/llama.cpp/.github/workflows/editorconfig.yml +27 -0
- package/src/llama.cpp/.github/workflows/gguf-publish.yml +44 -0
- package/src/llama.cpp/.github/workflows/labeler.yml +17 -0
- package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +65 -0
- package/src/llama.cpp/.github/workflows/nix-ci.yml +72 -0
- package/src/llama.cpp/.github/workflows/nix-flake-update.yml +22 -0
- package/src/llama.cpp/.github/workflows/nix-publish-flake.yml +36 -0
- package/src/llama.cpp/.github/workflows/python-check-requirements.yml +35 -0
- package/src/llama.cpp/.github/workflows/python-lint.yml +23 -0
- package/src/llama.cpp/.github/workflows/python-type-check.yml +38 -0
- package/src/llama.cpp/.github/workflows/server.yml +183 -0
- package/src/llama.cpp/CMakeLists.txt +91 -1245
- package/src/llama.cpp/cmake/arm64-windows-llvm.cmake +1 -1
- package/src/llama.cpp/cmake/build-info.cmake +58 -0
- package/src/llama.cpp/cmake/git-vars.cmake +22 -0
- package/src/llama.cpp/common/CMakeLists.txt +4 -3
- package/src/llama.cpp/common/build-info.cpp.in +4 -0
- package/src/llama.cpp/common/common.cpp +1116 -877
- package/src/llama.cpp/common/common.h +191 -77
- package/src/llama.cpp/common/grammar-parser.cpp +118 -31
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +346 -65
- package/src/llama.cpp/common/log.h +1 -1
- package/src/llama.cpp/common/ngram-cache.h +10 -3
- package/src/llama.cpp/common/sampling.cpp +19 -10
- package/src/llama.cpp/docs/build.md +353 -0
- package/src/llama.cpp/examples/CMakeLists.txt +22 -22
- package/src/llama.cpp/examples/baby-llama/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +6 -6
- package/src/llama.cpp/examples/batched/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/batched/batched.cpp +52 -55
- package/src/llama.cpp/examples/batched-bench/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +20 -72
- package/src/llama.cpp/examples/benchmark/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/chat-13B.bat +57 -0
- package/src/llama.cpp/examples/convert-llama2c-to-ggml/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/{finetune → cvector-generator}/CMakeLists.txt +2 -2
- package/src/llama.cpp/examples/cvector-generator/completions.txt +582 -0
- package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +503 -0
- package/src/llama.cpp/examples/cvector-generator/mean.hpp +48 -0
- package/src/llama.cpp/examples/cvector-generator/negative.txt +4 -0
- package/src/llama.cpp/examples/cvector-generator/pca.hpp +325 -0
- package/src/llama.cpp/examples/cvector-generator/positive.txt +4 -0
- package/src/llama.cpp/examples/deprecation-warning/deprecation-warning.cpp +35 -0
- package/src/llama.cpp/examples/embedding/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/embedding/embedding.cpp +94 -46
- package/src/llama.cpp/examples/eval-callback/CMakeLists.txt +2 -2
- package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +4 -6
- package/src/llama.cpp/examples/export-lora/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +344 -386
- package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +2 -2
- package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +30 -25
- package/src/llama.cpp/examples/gguf/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gguf/gguf.cpp +5 -0
- package/src/llama.cpp/examples/gguf-hash/CMakeLists.txt +15 -0
- package/src/llama.cpp/examples/gguf-hash/deps/rotate-bits/rotate-bits.h +46 -0
- package/src/llama.cpp/examples/gguf-hash/deps/sha1/sha1.c +295 -0
- package/src/llama.cpp/examples/gguf-hash/deps/sha1/sha1.h +52 -0
- package/src/llama.cpp/examples/gguf-hash/deps/sha256/sha256.c +221 -0
- package/src/llama.cpp/examples/gguf-hash/deps/sha256/sha256.h +24 -0
- package/src/llama.cpp/examples/gguf-hash/deps/xxhash/xxhash.c +42 -0
- package/src/llama.cpp/examples/gguf-hash/deps/xxhash/xxhash.h +7093 -0
- package/src/llama.cpp/examples/gguf-hash/gguf-hash.cpp +693 -0
- package/src/llama.cpp/examples/gguf-split/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +3 -3
- package/src/llama.cpp/examples/gritlm/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +6 -2
- package/src/llama.cpp/examples/imatrix/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +137 -176
- package/src/llama.cpp/examples/infill/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/infill/infill.cpp +38 -153
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +175 -94
- package/src/llama.cpp/examples/llama.android/app/build.gradle.kts +65 -0
- package/src/llama.cpp/examples/llama.android/build.gradle.kts +6 -0
- package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +68 -0
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/CMakeLists.txt +11 -7
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +2 -2
- package/src/llama.cpp/examples/llama.android/settings.gradle.kts +18 -0
- package/src/llama.cpp/examples/llava/CMakeLists.txt +6 -5
- package/src/llama.cpp/examples/llava/android/build_64.sh +8 -0
- package/src/llama.cpp/examples/llava/clip.cpp +23 -14
- package/src/llama.cpp/examples/llava/llava-cli.cpp +8 -6
- package/src/llama.cpp/examples/llava/requirements.txt +3 -2
- package/src/llama.cpp/examples/lookahead/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +2 -1
- package/src/llama.cpp/examples/lookup/CMakeLists.txt +4 -4
- package/src/llama.cpp/examples/lookup/lookup-create.cpp +2 -0
- package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
- package/src/llama.cpp/examples/lookup/lookup-stats.cpp +2 -2
- package/src/llama.cpp/examples/lookup/lookup.cpp +1 -1
- package/src/llama.cpp/examples/main/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/main/main.cpp +98 -75
- package/src/llama.cpp/examples/main-cmake-pkg/CMakeLists.txt +4 -5
- package/src/llama.cpp/examples/parallel/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/parallel/parallel.cpp +2 -1
- package/src/llama.cpp/examples/passkey/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/passkey/passkey.cpp +23 -43
- package/src/llama.cpp/examples/perplexity/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +13 -10
- package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/quantize/quantize.cpp +37 -34
- package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +1 -1
- package/src/llama.cpp/examples/retrieval/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/retrieval/retrieval.cpp +26 -77
- package/src/llama.cpp/examples/save-load-state/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +14 -7
- package/src/llama.cpp/examples/server/CMakeLists.txt +26 -2
- package/src/llama.cpp/examples/server/server.cpp +274 -671
- package/src/llama.cpp/examples/server/tests/requirements.txt +2 -2
- package/src/llama.cpp/examples/server/utils.hpp +28 -29
- package/src/llama.cpp/examples/simple/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/simple/simple.cpp +21 -29
- package/src/llama.cpp/examples/speculative/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/speculative/speculative.cpp +2 -1
- package/src/llama.cpp/examples/sycl/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/sycl/build.sh +23 -0
- package/src/llama.cpp/examples/sycl/run-llama2.sh +36 -0
- package/src/llama.cpp/examples/sycl/win-build-sycl.bat +33 -0
- package/src/llama.cpp/examples/sycl/win-run-llama2.bat +9 -0
- package/src/llama.cpp/examples/tokenize/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/tokenize/tokenize.cpp +16 -2
- package/src/llama.cpp/ggml/CMakeLists.txt +253 -0
- package/src/llama.cpp/{cmake → ggml/cmake}/FindSIMD.cmake +6 -6
- package/src/llama.cpp/{ggml-backend.h → ggml/include/ggml-backend.h} +22 -17
- package/src/llama.cpp/ggml/include/ggml-blas.h +23 -0
- package/src/llama.cpp/ggml/include/ggml-cann.h +125 -0
- package/src/llama.cpp/{ggml-cuda.h → ggml/include/ggml-cuda.h} +3 -0
- package/src/llama.cpp/{ggml-metal.h → ggml/include/ggml-metal.h} +1 -2
- package/src/llama.cpp/{ggml-sycl.h → ggml/include/ggml-sycl.h} +3 -10
- package/src/llama.cpp/{ggml.h → ggml/include/ggml.h} +80 -85
- package/src/llama.cpp/ggml/src/CMakeLists.txt +1329 -0
- package/src/llama.cpp/ggml/src/ggml-aarch64.c +2193 -0
- package/src/llama.cpp/ggml/src/ggml-aarch64.h +39 -0
- package/src/llama.cpp/{ggml-alloc.c → ggml/src/ggml-alloc.c} +100 -49
- package/src/llama.cpp/{ggml-backend-impl.h → ggml/src/ggml-backend-impl.h} +20 -8
- package/src/llama.cpp/{ggml-backend.c → ggml/src/ggml-backend.c} +307 -167
- package/src/llama.cpp/ggml/src/ggml-blas.cpp +367 -0
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +198 -0
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +230 -0
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +2944 -0
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +592 -0
- package/src/llama.cpp/ggml/src/ggml-cann/common.h +282 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +32 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +17 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +223 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +186 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +180 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +193 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +191 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +208 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +206 -0
- package/src/llama.cpp/ggml/src/ggml-cann.cpp +2023 -0
- package/src/llama.cpp/{ggml-common.h → ggml/src/ggml-common.h} +41 -7
- package/src/llama.cpp/{ggml-impl.h → ggml/src/ggml-impl.h} +113 -9
- package/src/llama.cpp/{ggml-kompute.cpp → ggml/src/ggml-kompute.cpp} +33 -18
- package/src/llama.cpp/{ggml-quants.c → ggml/src/ggml-quants.c} +1460 -940
- package/src/llama.cpp/{ggml-quants.h → ggml/src/ggml-quants.h} +19 -20
- package/src/llama.cpp/{ggml-rpc.cpp → ggml/src/ggml-rpc.cpp} +95 -72
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +27 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +53 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +355 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +195 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/concat.hpp +21 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +547 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +27 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +698 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +1023 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.hpp +27 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +3011 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +3031 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/mmq.hpp +33 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +1027 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.hpp +27 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +374 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +35 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +66 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +275 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.hpp +22 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +251 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/softmax.hpp +24 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +1140 -0
- package/src/llama.cpp/ggml/src/ggml-sycl.cpp +5314 -0
- package/src/llama.cpp/{ggml-vulkan.cpp → ggml/src/ggml-vulkan.cpp} +1781 -1868
- package/src/llama.cpp/{ggml.c → ggml/src/ggml.c} +1245 -2087
- package/src/llama.cpp/{sgemm.cpp → ggml/src/llamafile/sgemm.cpp} +21 -24
- package/src/llama.cpp/{sgemm.h → ggml/src/llamafile/sgemm.h} +1 -1
- package/src/llama.cpp/ggml/src/vulkan-shaders/CMakeLists.txt +5 -0
- package/src/llama.cpp/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp +552 -0
- package/src/llama.cpp/{llama.h → include/llama.h} +175 -100
- package/src/llama.cpp/models/.editorconfig +1 -0
- package/src/llama.cpp/models/ggml-vocab-aquila.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-baichuan.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-command-r.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-command-r.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-command-r.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-falcon.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-falcon.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-falcon.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-gpt-neox.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-mpt.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-mpt.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-mpt.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-phi-3.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-phi-3.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-phi-3.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-qwen2.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-qwen2.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-qwen2.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-refact.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-refact.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-refact.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-starcoder.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-starcoder.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-starcoder.gguf.out +46 -0
- package/src/llama.cpp/pocs/vdot/CMakeLists.txt +2 -2
- package/src/llama.cpp/requirements/requirements-all.txt +12 -0
- package/src/llama.cpp/requirements/requirements-compare-llama-bench.txt +2 -0
- package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf.txt +3 -0
- package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf_update.txt +3 -0
- package/src/llama.cpp/requirements/{requirements-convert.txt → requirements-convert_legacy_llama.txt} +1 -1
- package/src/llama.cpp/requirements/requirements-convert_llama_ggml_to_gguf.txt +1 -0
- package/src/llama.cpp/requirements/requirements-convert_lora_to_gguf.txt +2 -0
- package/src/llama.cpp/requirements/requirements-pydantic.txt +3 -0
- package/src/llama.cpp/requirements/requirements-test-tokenizer-random.txt +1 -0
- package/src/llama.cpp/requirements.txt +5 -4
- package/src/llama.cpp/scripts/build-info.sh +30 -0
- package/src/llama.cpp/scripts/install-oneapi.bat +19 -0
- package/src/llama.cpp/src/CMakeLists.txt +33 -0
- package/src/llama.cpp/src/llama-grammar.cpp +539 -0
- package/src/llama.cpp/src/llama-grammar.h +39 -0
- package/src/llama.cpp/src/llama-impl.h +26 -0
- package/src/llama.cpp/src/llama-sampling.cpp +635 -0
- package/src/llama.cpp/src/llama-sampling.h +56 -0
- package/src/llama.cpp/src/llama-vocab.cpp +1721 -0
- package/src/llama.cpp/src/llama-vocab.h +130 -0
- package/src/llama.cpp/{llama.cpp → src/llama.cpp} +5979 -5260
- package/src/llama.cpp/{unicode-data.cpp → src/unicode-data.cpp} +851 -802
- package/src/llama.cpp/{unicode.cpp → src/unicode.cpp} +52 -30
- package/src/llama.cpp/{unicode.h → src/unicode.h} +5 -1
- package/src/llama.cpp/tests/CMakeLists.txt +19 -20
- package/src/llama.cpp/tests/test-backend-ops.cpp +245 -67
- package/src/llama.cpp/tests/test-chat-template.cpp +57 -3
- package/src/llama.cpp/tests/test-double-float.cpp +2 -2
- package/src/llama.cpp/tests/test-grad0.cpp +2 -2
- package/src/llama.cpp/tests/test-grammar-integration.cpp +978 -31
- package/src/llama.cpp/tests/test-grammar-parser.cpp +423 -158
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +508 -135
- package/src/llama.cpp/tests/test-llama-grammar.cpp +15 -9
- package/src/llama.cpp/tests/test-quantize-fns.cpp +1 -1
- package/src/llama.cpp/tests/test-quantize-perf.cpp +1 -1
- package/src/llama.cpp/tests/test-rope.cpp +3 -4
- package/src/llama.cpp/tests/test-sampling.cpp +5 -5
- package/src/llama.cpp/tests/test-tokenizer-0.cpp +6 -6
- package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +20 -15
- package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +22 -11
- package/bin/darwin/arm64/default.metallib +0 -0
- package/bin/darwin/x64/default.metallib +0 -0
- package/src/llama.cpp/examples/beam-search/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/beam-search/beam-search.cpp +0 -188
- package/src/llama.cpp/examples/finetune/finetune.cpp +0 -1862
- package/src/llama.cpp/examples/llama.android/llama/CMakeLists.txt +0 -55
- package/src/llama.cpp/examples/train-text-from-scratch/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/train-text-from-scratch/train-text-from-scratch.cpp +0 -1253
- package/src/llama.cpp/ggml-opencl.cpp +0 -2305
- package/src/llama.cpp/ggml-opencl.h +0 -36
- package/src/llama.cpp/ggml-sycl.cpp +0 -17340
- package/src/llama.cpp/ggml-vulkan-shaders.hpp +0 -81211
- package/src/llama.cpp/requirements/requirements-convert-hf-to-gguf-update.txt +0 -2
- package/src/llama.cpp/requirements/requirements-convert-hf-to-gguf.txt +0 -2
- package/src/llama.cpp/requirements/requirements-convert-llama-ggml-to-gguf.txt +0 -1
- package/src/llama.cpp/scripts/gen-build-info-cpp.cmake +0 -24
- /package/src/llama.cpp/{ggml-alloc.h → ggml/include/ggml-alloc.h} +0 -0
- /package/src/llama.cpp/{ggml-kompute.h → ggml/include/ggml-kompute.h} +0 -0
- /package/src/llama.cpp/{ggml-rpc.h → ggml/include/ggml-rpc.h} +0 -0
- /package/src/llama.cpp/{ggml-vulkan.h → ggml/include/ggml-vulkan.h} +0 -0
- /package/src/llama.cpp/{unicode-data.h → src/unicode-data.h} +0 -0
package/src/llama.cpp/common/common.h (+191 -77)

@@ -52,109 +52,114 @@ int32_t cpu_get_num_math();
 // CLI argument parsing
 //

+// dimensionality reduction methods, used by cvector-generator
+enum dimre_method {
+    DIMRE_METHOD_PCA,
+    DIMRE_METHOD_MEAN,
+};
+
 struct gpt_params {
     uint32_t seed = LLAMA_DEFAULT_SEED; // RNG seed

     int32_t n_threads = cpu_get_num_math();
-    int32_t n_threads_draft =
-    int32_t n_threads_batch =
-    int32_t n_threads_batch_draft =
-    int32_t n_predict =
-    int32_t n_ctx =
-    int32_t n_batch =
-    int32_t n_ubatch =
-    int32_t n_keep =
-    int32_t n_draft =
-    int32_t n_chunks =
-    int32_t n_parallel =
-    int32_t n_sequences =
-    float   p_split =
-    int32_t n_gpu_layers =
-    int32_t n_gpu_layers_draft =
-
-
-
-    int32_t
-    int32_t
-
-
-    float   rope_freq_base = 0.0f; // RoPE base frequency
-    float   rope_freq_scale = 0.0f; // RoPE frequency scaling factor
+    int32_t n_threads_draft = -1;
+    int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads)
+    int32_t n_threads_batch_draft = -1;
+    int32_t n_predict = -1; // new tokens to predict
+    int32_t n_ctx = 0; // context size
+    int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
+    int32_t n_ubatch = 512; // physical batch size for prompt processing (must be >=32 to use BLAS)
+    int32_t n_keep = 0; // number of tokens to keep from initial prompt
+    int32_t n_draft = 5; // number of tokens to draft during speculative decoding
+    int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited)
+    int32_t n_parallel = 1; // number of parallel sequences to decode
+    int32_t n_sequences = 1; // number of sequences to decode
+    float   p_split = 0.1f; // speculative decoding split probability
+    int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
+    int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
+    int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
+    float   tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
+    int32_t grp_attn_n = 1; // group-attention factor
+    int32_t grp_attn_w = 512; // group-attention width
+    int32_t n_print = -1; // print token count every n tokens (-1 = disabled)
+    float   rope_freq_base = 0.0f; // RoPE base frequency
+    float   rope_freq_scale = 0.0f; // RoPE frequency scaling factor
     float   yarn_ext_factor = -1.0f; // YaRN extrapolation mix factor
-    float   yarn_attn_factor =
+    float   yarn_attn_factor = 1.0f; // YaRN magnitude scaling factor
     float   yarn_beta_fast = 32.0f; // YaRN low correction dim
-    float   yarn_beta_slow =
-    int32_t yarn_orig_ctx =
+    float   yarn_beta_slow = 1.0f; // YaRN high correction dim
+    int32_t yarn_orig_ctx = 0; // YaRN original context length
     float   defrag_thold = -1.0f; // KV cache defragmentation threshold
-    std::string rpc_servers = ""; // comma separated list of RPC servers

     ggml_backend_sched_eval_callback cb_eval = nullptr;
     void * cb_eval_user_data = nullptr;

     ggml_numa_strategy numa = GGML_NUMA_STRATEGY_DISABLED;

+    enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
     enum llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
     enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
+    enum llama_attention_type attention_type = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings

     // // sampling parameters
     struct llama_sampling_params sparams;

-    std::string model = "";
-    std::string model_draft = "";
+    std::string model = ""; // model path
+    std::string model_draft = ""; // draft model for speculative decoding
     std::string model_alias = "unknown"; // model alias
-    std::string model_url = "";
-    std::string
-    std::string
+    std::string model_url = ""; // model url to download
+    std::string hf_token = ""; // HF token
+    std::string hf_repo = ""; // HF repo
+    std::string hf_file = ""; // HF file
     std::string prompt = "";
-    std::string prompt_file = "";
-    std::string path_prompt_cache = "";
-    std::string input_prefix = "";
-    std::string input_suffix = "";
-    std::
-    std::string logdir = ""; // directory in which to save YAML log files
+    std::string prompt_file = ""; // store the external prompt file name
+    std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state
+    std::string input_prefix = ""; // string to prefix user inputs with
+    std::string input_suffix = ""; // string to suffix user inputs with
+    std::string logdir = ""; // directory in which to save YAML log files
     std::string lookup_cache_static = ""; // path of static ngram cache file for lookup decoding
     std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding
-    std::string logits_file = "";
+    std::string logits_file = ""; // file for saving *all* logits
+    std::string rpc_servers = ""; // comma separated list of RPC servers

+    std::vector<std::string> in_files; // all input files
+    std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
     std::vector<llama_model_kv_override> kv_overrides;

     // TODO: avoid tuple, use struct
     std::vector<std::tuple<std::string, float>> lora_adapter; // lora adapter path with user defined scale
-    std::string lora_base = ""; // base model path for the lora adapter

     std::vector<llama_control_vector_load_info> control_vectors; // control vector with user defined scale

+    int32_t verbosity = 0;
     int32_t control_vector_layer_start = -1; // layer range for control vector
     int32_t control_vector_layer_end = -1; // layer range for control vector

-
-
-
-
-    bool hellaswag
-    size_t hellaswag_tasks
+    int32_t ppl_stride = 0; // stride for perplexity calculations. If left at 0, the pre-existing approach will be used.
+    int32_t ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
+    //                               (which is more convenient to use for plotting)
+    //
+    bool   hellaswag = false; // compute HellaSwag score over random tasks from datafile supplied in prompt
+    size_t hellaswag_tasks = 400; // number of tasks to use when computing the HellaSwag score

-    bool winogrande
-    size_t winogrande_tasks= 0; // number of tasks to use when computing the Winogrande score. If 0, all tasks will be computed
+    bool   winogrande = false; // compute Winogrande score over random tasks from datafile supplied in prompt
+    size_t winogrande_tasks = 0; // number of tasks to use when computing the Winogrande score. If 0, all tasks will be computed

-    bool multiple_choice
-    size_t multiple_choice_tasks = 0;
+    bool   multiple_choice = false; // compute TruthfulQA score over random tasks from datafile supplied in prompt
+    size_t multiple_choice_tasks = 0; // number of tasks to use when computing the TruthfulQA score. If 0, all tasks will be computed

-    bool kl_divergence
+    bool   kl_divergence = false; // compute KL divergence

-    bool
+    bool usage = false; // print usage
     bool use_color = false; // use color to distinguish generations and inputs
-    bool interactive = false; // interactive mode
-    bool interactive_specials = false; // whether to allow special tokens from user, during interactive mode
     bool special = false; // enable special token output
+    bool interactive = false; // interactive mode
+    bool interactive_first = false; // wait for user input immediately
     bool conversation = false; // conversation mode (does not print special tokens and suffix/prefix)
-    bool chatml = false; // chatml mode (used for models trained on chatml syntax)
     bool prompt_cache_all = false; // save user input and generations to prompt cache
     bool prompt_cache_ro = false; // open the prompt cache read-only and do not update it

-    bool
-    bool escape = false; // escape "\n", "\r", "\t", "\'", "\"", and "\\"
-    bool interactive_first = false; // wait for user input immediately
+    bool escape = true; // escape "\n", "\r", "\t", "\'", "\"", and "\\"
     bool multiline_input = false; // reverse the usage of `\`
     bool simple_io = false; // improves compatibility with subprocesses and limited consoles
     bool cont_batching = true; // insert new sequences for decoding on-the-fly

@@ -162,7 +167,6 @@ struct gpt_params {

     bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
     bool ignore_eos = false; // ignore generated EOS tokens
-    bool instruct = false; // instruction mode (used for Alpaca models)
     bool logits_all = false; // return logits for all tokens in the batch
     bool use_mmap = true; // use mmap for faster loads
     bool use_mlock = false; // use mlock to keep model in memory

@@ -180,8 +184,81 @@ struct gpt_params {
     // multimodal models (see examples/llava)
     std::string mmproj = ""; // path to multimodal projector
     std::vector<std::string> image; // path to image file(s)
+
+    // embedding
+    bool embedding = false; // get only sentence embedding
+    int32_t embd_normalize = 2; // normalisation for embendings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
+    std::string embd_out = ""; // empty = default, "array" = [[],[]...], "json" = openai style, "json+" = same "json" + cosine similarity matrix
+    std::string embd_sep = "\n"; // separator of embendings
+
+    // server params
+    int32_t port = 8080; // server listens on this network port
+    int32_t timeout_read = 600; // http read timeout in seconds
+    int32_t timeout_write = timeout_read; // http write timeout in seconds
+    int32_t n_threads_http = -1; // number of threads to process HTTP requests
+
+    std::string hostname = "127.0.0.1";
+    std::string public_path = "";
+    std::string chat_template = "";
+    std::string system_prompt = "";
+    bool enable_chat_template = true;
+
+    std::vector<std::string> api_keys;
+
+    std::string ssl_file_key = "";
+    std::string ssl_file_cert = "";
+
+    bool endpoint_slots = true;
+    bool endpoint_metrics = false;
+
+    bool log_json = false;
+
+    std::string slot_save_path;
+
+    float slot_prompt_similarity = 0.5f;
+
+    // batched-bench params
+    bool is_pp_shared = false;
+
+    std::vector<int32_t> n_pp;
+    std::vector<int32_t> n_tg;
+    std::vector<int32_t> n_pl;
+
+    // retrieval params
+    std::vector<std::string> context_files; // context files to embed
+
+    int32_t chunk_size = 64; // chunk size for context embedding
+
+    std::string chunk_separator = "\n"; // chunk separator for context embedding
+
+    // passkey params
+    int32_t n_junk = 250; // number of times to repeat the junk text
+    int32_t i_pos = -1; // position of the passkey in the junk text
+
+    // imatrix params
+    std::string out_file = "imatrix.dat"; // save the resulting imatrix to this file
+
+    int32_t n_out_freq = 10; // output the imatrix every n_out_freq iterations
+    int32_t n_save_freq = 0; // save the imatrix every n_save_freq iterations
+    int32_t i_chunk = 0; // start processing from this chunk
+
+    bool process_output = false; // collect data for the output tensor
+    bool compute_ppl = true; // whether to compute perplexity
+
+    // cvector-generator params
+    int n_pca_batch = 100;
+    int n_pca_iterations = 1000;
+    dimre_method cvector_dimre_method = DIMRE_METHOD_PCA;
+    std::string cvector_outfile = "control_vector.gguf";
+    std::string cvector_positive_file = "examples/cvector-generator/positive.txt";
+    std::string cvector_negative_file = "examples/cvector-generator/negative.txt";
+
+    bool spm_infill = false; // suffix/prefix/middle pattern for infill
+
+    std::string lora_outfile = "ggml-lora-merged-f16.gguf";
 };

+void gpt_params_handle_hf_token(gpt_params & params);
 void gpt_params_handle_model_default(gpt_params & params);

 bool gpt_params_parse_ex (int argc, char ** argv, gpt_params & params);

@@ -199,7 +276,20 @@ std::vector<std::string> string_split(std::string input, char separator);

 std::string string_strip(const std::string & str);
 std::string string_get_sortable_timestamp();
-
+
+template<class T>
+static std::vector<T> string_split(const std::string & str, char delim) {
+    std::vector<T> values;
+    std::istringstream str_stream(str);
+    std::string token;
+    while (std::getline(str_stream, token, delim)) {
+        T value;
+        std::istringstream token_stream(token);
+        token_stream >> value;
+        values.push_back(value);
+    }
+    return values;
+}

 bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
 void string_process_escapes(std::string & input);

@@ -212,6 +302,7 @@ bool fs_validate_filename(const std::string & filename);

 bool fs_create_directory_with_parents(const std::string & path);

 std::string fs_get_cache_directory();
+std::string fs_get_cache_file(const std::string & filename);

 //
 // Model utils

@@ -223,8 +314,8 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par

 struct llama_model_params llama_model_params_from_gpt_params (const gpt_params & params);
 struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params);

-struct llama_model * llama_load_model_from_url(const char * model_url, const char * path_model, const struct llama_model_params & params);
-struct llama_model * llama_load_model_from_hf(const char * repo, const char * file, const char * path_model, const struct llama_model_params & params);
+struct llama_model * llama_load_model_from_url(const char * model_url, const char * path_model, const char * hf_token, const struct llama_model_params & params);
+struct llama_model * llama_load_model_from_hf(const char * repo, const char * file, const char * path_model, const char * hf_token, const struct llama_model_params & params);

 // Batch utils

@@ -262,26 +353,50 @@ std::string llama_token_to_piece(
         llama_token token,
         bool special = true);

-// TODO: these should be moved in llama.h C-style API under single `llama_detokenize` function
-// that takes into account the tokenizer type and decides how to handle the leading space
-//
-// detokenizes a vector of tokens into a string
-// should work similar to Python's `tokenizer.decode`
-// removes the leading space from the first non-BOS token
-std::string llama_detokenize_spm(
-        llama_context * ctx,
-        const std::vector<llama_token> & tokens);
-
 // detokenizes a vector of tokens into a string
 // should work similar to Python's `tokenizer.decode`
-
+// optionally renders special/control tokens
+std::string llama_detokenize(
         llama_context * ctx,
-        const std::vector<llama_token> & tokens
+        const std::vector<llama_token> & tokens,
+        bool special = true);

 // Uses the value from the model metadata if possible, otherwise
 // defaults to true when model type is SPM, otherwise false.
 bool llama_should_add_bos_token(const llama_model * model);

+//
+// Chat template utils
+//
+
+// same with llama_chat_message, but uses std::string
+struct llama_chat_msg {
+    std::string role;
+    std::string content;
+};
+
+// Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
+bool llama_chat_verify_template(const std::string & tmpl);
+
+// CPP wrapper for llama_chat_apply_template
+// If the built-in template is not supported, we default to chatml
+// If the custom "tmpl" is not supported, we throw an error
+std::string llama_chat_apply_template(const struct llama_model * model,
+        const std::string & tmpl,
+        const std::vector<llama_chat_msg> & chat,
+        bool add_ass);
+
+// Format single message, while taking into account the position of that message in chat history
+std::string llama_chat_format_single(const struct llama_model * model,
+        const std::string & tmpl,
+        const std::vector<llama_chat_msg> & past_msg,
+        const llama_chat_msg & new_msg,
+        bool add_ass);
+
+// Returns an example of formatted chat
+std::string llama_chat_format_example(const struct llama_model * model,
+        const std::string & tmpl);
+
 //
 // KV cache utils
 //

@@ -296,7 +411,7 @@ void llama_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_siz
 // Embedding utils
 //

-void llama_embd_normalize(const float * inp, float * out, int n);
+void llama_embd_normalize(const float * inp, float * out, int n, int embd_norm = 2);

 float llama_embd_similarity_cos(const float * embd1, const float * embd2, int n);

@@ -340,4 +455,3 @@ void yaml_dump_string_multiline(FILE * stream, const char * prop_name, const cha
 void yaml_dump_non_result_info(
     FILE * stream, const gpt_params & params, const llama_context * lctx,
     const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc);
-
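One of the additions above is a templated `string_split<T>` helper in `common/common.h`. A minimal sketch of how it can be used is shown below; the template body is copied from the diff, while the `main()` harness and the sample `"128,256,512"` input are illustrative only (for example, the kind of comma-separated lists that feed the new `n_pp`/`n_tg` vectors).

```cpp
// Sketch only: exercising the new string_split<T> helper from common/common.h.
// The template body matches the diff above; the harness below is hypothetical.
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

template<class T>
static std::vector<T> string_split(const std::string & str, char delim) {
    std::vector<T> values;
    std::istringstream str_stream(str);
    std::string token;
    while (std::getline(str_stream, token, delim)) {
        T value;
        std::istringstream token_stream(token);
        token_stream >> value;
        values.push_back(value);
    }
    return values;
}

int main() {
    // Illustrative input: a comma-separated list of batch sizes.
    const std::string arg = "128,256,512";
    for (int32_t n : string_split<int32_t>(arg, ',')) {
        std::cout << n << '\n';
    }
    return 0;
}
```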
package/src/llama.cpp/common/grammar-parser.cpp (+118 -31)

@@ -46,8 +46,12 @@ namespace grammar_parser {
         state.rules[rule_id] = rule;
     }

+    static bool is_digit_char(char c) {
+        return '0' <= c && c <= '9';
+    }
+
     static bool is_word_char(char c) {
-        return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || c == '-' || (
+        return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || c == '-' || is_digit_char(c);
     }

     static std::pair<uint32_t, const char *> parse_hex(const char * src, int size) {

@@ -99,6 +103,17 @@ namespace grammar_parser {
         return pos;
     }

+    static const char * parse_int(const char * src) {
+        const char * pos = src;
+        while (is_digit_char(*pos)) {
+            pos++;
+        }
+        if (pos == src) {
+            throw std::runtime_error(std::string("expecting integer at ") + src);
+        }
+        return pos;
+    }
+
     static std::pair<uint32_t, const char *> parse_char(const char * src) {
         if (*src == '\\') {
             switch (src[1]) {

@@ -137,6 +152,60 @@ namespace grammar_parser {
             bool is_nested) {
         size_t last_sym_start = out_elements.size();
         const char * pos = src;
+
+        auto handle_repetitions = [&](int min_times, int max_times) {
+
+            if (last_sym_start == out_elements.size()) {
+                throw std::runtime_error(std::string("expecting preceding item to */+/?/{ at ") + pos);
+            }
+
+            // apply transformation to previous symbol (last_sym_start to end) according to
+            // the following rewrite rules:
+            // S{m,n} --> S S S (m times) S'(n-m)
+            //            S'(x)   ::= S S'(x-1) |
+            //            (... n-m definitions of these S' rules ...)
+            //            S'(1)   ::= S |
+            // S{m,}  --> S S S (m times) S'
+            //            S'     ::= S S' |
+            // S*     --> S{0,}
+            //        --> S'     ::= S S' |
+            // S+     --> S{1,}
+            //        --> S S'
+            //            S'     ::= S S' |
+            // S?     --> S{0,1}
+            //        --> S'
+            //            S'     ::= S |
+
+            std::vector<llama_grammar_element> previous_elements(out_elements.begin() + last_sym_start, out_elements.end());
+            if (min_times == 0) {
+                out_elements.resize(last_sym_start);
+            } else {
+                // Repeat the previous elements (min_times - 1) times
+                for (int i = 1; i < min_times; i++) {
+                    out_elements.insert(out_elements.end(), previous_elements.begin(), previous_elements.end());
+                }
+            }
+
+            uint32_t last_rec_rule_id = 0;
+            auto n_opt = max_times < 0 ? 1 : max_times - min_times;
+
+            std::vector<llama_grammar_element> rec_rule(previous_elements);
+            for (int i = 0; i < n_opt; i++) {
+                rec_rule.resize(previous_elements.size());
+                uint32_t rec_rule_id = generate_symbol_id(state, rule_name);
+                if (i > 0 || max_times < 0) {
+                    rec_rule.push_back({LLAMA_GRETYPE_RULE_REF, max_times < 0 ? rec_rule_id : last_rec_rule_id});
+                }
+                rec_rule.push_back({LLAMA_GRETYPE_ALT, 0});
+                rec_rule.push_back({LLAMA_GRETYPE_END, 0});
+                add_rule(state, rec_rule_id, rec_rule);
+                last_rec_rule_id = rec_rule_id;
+            }
+            if (n_opt > 0) {
+                out_elements.push_back({LLAMA_GRETYPE_RULE_REF, last_rec_rule_id});
+            }
+        };
+
         while (*pos) {
             if (*pos == '"') { // literal string
                 pos++;

@@ -197,40 +266,51 @@ namespace grammar_parser {
                    throw std::runtime_error(std::string("expecting ')' at ") + pos);
                }
                pos = parse_space(pos + 1, is_nested);
-            } else if (*pos == '
-
-
-
+            } else if (*pos == '.') { // any char
+                last_sym_start = out_elements.size();
+                out_elements.push_back({LLAMA_GRETYPE_CHAR_ANY, 0});
+                pos = parse_space(pos + 1, is_nested);
+            } else if (*pos == '*') {
+                pos = parse_space(pos + 1, is_nested);
+                handle_repetitions(0, -1);
+            } else if (*pos == '+') {
+                pos = parse_space(pos + 1, is_nested);
+                handle_repetitions(1, -1);
+            } else if (*pos == '?') {
+                pos = parse_space(pos + 1, is_nested);
+                handle_repetitions(0, 1);
+            } else if (*pos == '{') {
+                pos = parse_space(pos + 1, is_nested);

-
-
-                // S* --> S' ::= S S' |
-                // S+ --> S' ::= S S' | S
-                // S? --> S' ::= S |
-                uint32_t sub_rule_id = generate_symbol_id(state, rule_name);
-                std::vector<llama_grammar_element> sub_rule;
-                // add preceding symbol to generated rule
-                sub_rule.insert(
-                    sub_rule.end(), out_elements.begin() + last_sym_start, out_elements.end());
-                if (*pos == '*' || *pos == '+') {
-                    // cause generated rule to recurse
-                    sub_rule.push_back({LLAMA_GRETYPE_RULE_REF, sub_rule_id});
+                if (!is_digit_char(*pos)) {
+                    throw std::runtime_error(std::string("expecting an int at ") + pos);
                }
-
-
-
-                // add preceding symbol as alternate only for '+' (otherwise empty)
-                sub_rule.insert(
-                    sub_rule.end(), out_elements.begin() + last_sym_start, out_elements.end());
-                }
-                sub_rule.push_back({LLAMA_GRETYPE_END, 0});
-                add_rule(state, sub_rule_id, sub_rule);
+                const char * int_end = parse_int(pos);
+                int min_times = std::stoul(std::string(pos, int_end - pos));
+                pos = parse_space(int_end, is_nested);

-
-                out_elements.resize(last_sym_start);
-                out_elements.push_back({LLAMA_GRETYPE_RULE_REF, sub_rule_id});
+                int max_times = -1;

-
+                if (*pos == '}') {
+                    max_times = min_times;
+                    pos = parse_space(pos + 1, is_nested);
+                } else if (*pos == ',') {
+                    pos = parse_space(pos + 1, is_nested);
+
+                    if (is_digit_char(*pos)) {
+                        const char * int_end = parse_int(pos);
+                        max_times = std::stoul(std::string(pos, int_end - pos));
+                        pos = parse_space(int_end, is_nested);
+                    }
+
+                    if (*pos != '}') {
+                        throw std::runtime_error(std::string("expecting '}' at ") + pos);
+                    }
+                    pos = parse_space(pos + 1, is_nested);
+                } else {
+                    throw std::runtime_error(std::string("expecting ',' at ") + pos);
+                }
+                handle_repetitions(min_times, max_times);
            } else {
                break;
            }

@@ -325,6 +405,7 @@ namespace grammar_parser {
            case LLAMA_GRETYPE_CHAR_NOT:       return true;
            case LLAMA_GRETYPE_CHAR_ALT:       return true;
            case LLAMA_GRETYPE_CHAR_RNG_UPPER: return true;
+            case LLAMA_GRETYPE_CHAR_ANY:       return true;
            default:                           return false;
        }
    }

@@ -339,6 +420,7 @@ namespace grammar_parser {
            case LLAMA_GRETYPE_CHAR_NOT:       fprintf(file, "CHAR_NOT"); break;
            case LLAMA_GRETYPE_CHAR_RNG_UPPER: fprintf(file, "CHAR_RNG_UPPER"); break;
            case LLAMA_GRETYPE_CHAR_ALT:       fprintf(file, "CHAR_ALT"); break;
+            case LLAMA_GRETYPE_CHAR_ANY:       fprintf(file, "CHAR_ANY"); break;
        }
        switch (elem.type) {
            case LLAMA_GRETYPE_END:

@@ -350,6 +432,7 @@ namespace grammar_parser {
            case LLAMA_GRETYPE_CHAR_NOT:
            case LLAMA_GRETYPE_CHAR_RNG_UPPER:
            case LLAMA_GRETYPE_CHAR_ALT:
+            case LLAMA_GRETYPE_CHAR_ANY:
                fprintf(file, "(\"");
                print_grammar_char(file, elem.value);
                fprintf(file, "\") ");

@@ -407,11 +490,15 @@ namespace grammar_parser {
            }
            print_grammar_char(file, elem.value);
            break;
+        case LLAMA_GRETYPE_CHAR_ANY:
+            fprintf(file, ".");
+            break;
        }
        if (is_char_element(elem)) {
            switch (rule[i + 1].type) {
                case LLAMA_GRETYPE_CHAR_ALT:
                case LLAMA_GRETYPE_CHAR_RNG_UPPER:
+                case LLAMA_GRETYPE_CHAR_ANY:
                    break;
                default:
                    fprintf(file, "] ");