@fugood/llama.node 0.2.2 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +5 -2
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +8 -1
- package/package.json +1 -1
- package/patches/llama.patch +12 -12
- package/src/DetokenizeWorker.cpp +1 -1
- package/src/LlamaContext.cpp +33 -1
- package/src/LlamaContext.h +1 -0
- package/src/LoadSessionWorker.cpp +1 -0
- package/src/llama.cpp/.github/workflows/bench.yml +310 -0
- package/src/llama.cpp/.github/workflows/build.yml +1315 -0
- package/src/llama.cpp/.github/workflows/close-issue.yml +23 -0
- package/src/llama.cpp/.github/workflows/docker.yml +116 -0
- package/src/llama.cpp/.github/workflows/editorconfig.yml +27 -0
- package/src/llama.cpp/.github/workflows/gguf-publish.yml +44 -0
- package/src/llama.cpp/.github/workflows/labeler.yml +17 -0
- package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +65 -0
- package/src/llama.cpp/.github/workflows/nix-ci.yml +72 -0
- package/src/llama.cpp/.github/workflows/nix-flake-update.yml +22 -0
- package/src/llama.cpp/.github/workflows/nix-publish-flake.yml +36 -0
- package/src/llama.cpp/.github/workflows/python-check-requirements.yml +35 -0
- package/src/llama.cpp/.github/workflows/python-lint.yml +23 -0
- package/src/llama.cpp/.github/workflows/python-type-check.yml +38 -0
- package/src/llama.cpp/.github/workflows/server.yml +183 -0
- package/src/llama.cpp/CMakeLists.txt +91 -1245
- package/src/llama.cpp/cmake/arm64-windows-llvm.cmake +1 -1
- package/src/llama.cpp/cmake/build-info.cmake +58 -0
- package/src/llama.cpp/cmake/git-vars.cmake +22 -0
- package/src/llama.cpp/common/CMakeLists.txt +4 -3
- package/src/llama.cpp/common/build-info.cpp.in +4 -0
- package/src/llama.cpp/common/common.cpp +1116 -877
- package/src/llama.cpp/common/common.h +191 -77
- package/src/llama.cpp/common/grammar-parser.cpp +118 -31
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +346 -65
- package/src/llama.cpp/common/log.h +1 -1
- package/src/llama.cpp/common/ngram-cache.h +10 -3
- package/src/llama.cpp/common/sampling.cpp +19 -10
- package/src/llama.cpp/docs/build.md +353 -0
- package/src/llama.cpp/examples/CMakeLists.txt +22 -22
- package/src/llama.cpp/examples/baby-llama/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +6 -6
- package/src/llama.cpp/examples/batched/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/batched/batched.cpp +52 -55
- package/src/llama.cpp/examples/batched-bench/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +20 -72
- package/src/llama.cpp/examples/benchmark/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/chat-13B.bat +57 -0
- package/src/llama.cpp/examples/convert-llama2c-to-ggml/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/{finetune → cvector-generator}/CMakeLists.txt +2 -2
- package/src/llama.cpp/examples/cvector-generator/completions.txt +582 -0
- package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +503 -0
- package/src/llama.cpp/examples/cvector-generator/mean.hpp +48 -0
- package/src/llama.cpp/examples/cvector-generator/negative.txt +4 -0
- package/src/llama.cpp/examples/cvector-generator/pca.hpp +325 -0
- package/src/llama.cpp/examples/cvector-generator/positive.txt +4 -0
- package/src/llama.cpp/examples/deprecation-warning/deprecation-warning.cpp +35 -0
- package/src/llama.cpp/examples/embedding/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/embedding/embedding.cpp +94 -46
- package/src/llama.cpp/examples/eval-callback/CMakeLists.txt +2 -2
- package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +4 -6
- package/src/llama.cpp/examples/export-lora/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +344 -386
- package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +2 -2
- package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +30 -25
- package/src/llama.cpp/examples/gguf/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gguf/gguf.cpp +5 -0
- package/src/llama.cpp/examples/gguf-hash/CMakeLists.txt +15 -0
- package/src/llama.cpp/examples/gguf-hash/deps/rotate-bits/rotate-bits.h +46 -0
- package/src/llama.cpp/examples/gguf-hash/deps/sha1/sha1.c +295 -0
- package/src/llama.cpp/examples/gguf-hash/deps/sha1/sha1.h +52 -0
- package/src/llama.cpp/examples/gguf-hash/deps/sha256/sha256.c +221 -0
- package/src/llama.cpp/examples/gguf-hash/deps/sha256/sha256.h +24 -0
- package/src/llama.cpp/examples/gguf-hash/deps/xxhash/xxhash.c +42 -0
- package/src/llama.cpp/examples/gguf-hash/deps/xxhash/xxhash.h +7093 -0
- package/src/llama.cpp/examples/gguf-hash/gguf-hash.cpp +693 -0
- package/src/llama.cpp/examples/gguf-split/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +3 -3
- package/src/llama.cpp/examples/gritlm/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +6 -2
- package/src/llama.cpp/examples/imatrix/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +137 -176
- package/src/llama.cpp/examples/infill/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/infill/infill.cpp +38 -153
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +175 -94
- package/src/llama.cpp/examples/llama.android/app/build.gradle.kts +65 -0
- package/src/llama.cpp/examples/llama.android/build.gradle.kts +6 -0
- package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +68 -0
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/CMakeLists.txt +11 -7
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +2 -2
- package/src/llama.cpp/examples/llama.android/settings.gradle.kts +18 -0
- package/src/llama.cpp/examples/llava/CMakeLists.txt +6 -5
- package/src/llama.cpp/examples/llava/android/build_64.sh +8 -0
- package/src/llama.cpp/examples/llava/clip.cpp +23 -14
- package/src/llama.cpp/examples/llava/llava-cli.cpp +8 -6
- package/src/llama.cpp/examples/llava/requirements.txt +3 -2
- package/src/llama.cpp/examples/lookahead/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +2 -1
- package/src/llama.cpp/examples/lookup/CMakeLists.txt +4 -4
- package/src/llama.cpp/examples/lookup/lookup-create.cpp +2 -0
- package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
- package/src/llama.cpp/examples/lookup/lookup-stats.cpp +2 -2
- package/src/llama.cpp/examples/lookup/lookup.cpp +1 -1
- package/src/llama.cpp/examples/main/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/main/main.cpp +98 -75
- package/src/llama.cpp/examples/main-cmake-pkg/CMakeLists.txt +4 -5
- package/src/llama.cpp/examples/parallel/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/parallel/parallel.cpp +2 -1
- package/src/llama.cpp/examples/passkey/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/passkey/passkey.cpp +23 -43
- package/src/llama.cpp/examples/perplexity/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +13 -10
- package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/quantize/quantize.cpp +37 -34
- package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +1 -1
- package/src/llama.cpp/examples/retrieval/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/retrieval/retrieval.cpp +26 -77
- package/src/llama.cpp/examples/save-load-state/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +14 -7
- package/src/llama.cpp/examples/server/CMakeLists.txt +26 -2
- package/src/llama.cpp/examples/server/server.cpp +274 -671
- package/src/llama.cpp/examples/server/tests/requirements.txt +2 -2
- package/src/llama.cpp/examples/server/utils.hpp +28 -29
- package/src/llama.cpp/examples/simple/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/simple/simple.cpp +21 -29
- package/src/llama.cpp/examples/speculative/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/speculative/speculative.cpp +2 -1
- package/src/llama.cpp/examples/sycl/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/sycl/build.sh +23 -0
- package/src/llama.cpp/examples/sycl/run-llama2.sh +36 -0
- package/src/llama.cpp/examples/sycl/win-build-sycl.bat +33 -0
- package/src/llama.cpp/examples/sycl/win-run-llama2.bat +9 -0
- package/src/llama.cpp/examples/tokenize/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/tokenize/tokenize.cpp +16 -2
- package/src/llama.cpp/ggml/CMakeLists.txt +253 -0
- package/src/llama.cpp/{cmake → ggml/cmake}/FindSIMD.cmake +6 -6
- package/src/llama.cpp/{ggml-backend.h → ggml/include/ggml-backend.h} +22 -17
- package/src/llama.cpp/ggml/include/ggml-blas.h +23 -0
- package/src/llama.cpp/ggml/include/ggml-cann.h +125 -0
- package/src/llama.cpp/{ggml-cuda.h → ggml/include/ggml-cuda.h} +3 -0
- package/src/llama.cpp/{ggml-metal.h → ggml/include/ggml-metal.h} +1 -2
- package/src/llama.cpp/{ggml-sycl.h → ggml/include/ggml-sycl.h} +3 -10
- package/src/llama.cpp/{ggml.h → ggml/include/ggml.h} +80 -85
- package/src/llama.cpp/ggml/src/CMakeLists.txt +1329 -0
- package/src/llama.cpp/ggml/src/ggml-aarch64.c +2193 -0
- package/src/llama.cpp/ggml/src/ggml-aarch64.h +39 -0
- package/src/llama.cpp/{ggml-alloc.c → ggml/src/ggml-alloc.c} +100 -49
- package/src/llama.cpp/{ggml-backend-impl.h → ggml/src/ggml-backend-impl.h} +20 -8
- package/src/llama.cpp/{ggml-backend.c → ggml/src/ggml-backend.c} +307 -167
- package/src/llama.cpp/ggml/src/ggml-blas.cpp +367 -0
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +198 -0
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +230 -0
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +2944 -0
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +592 -0
- package/src/llama.cpp/ggml/src/ggml-cann/common.h +282 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +32 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +17 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +223 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +186 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +180 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +193 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +191 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +208 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +206 -0
- package/src/llama.cpp/ggml/src/ggml-cann.cpp +2023 -0
- package/src/llama.cpp/{ggml-common.h → ggml/src/ggml-common.h} +41 -7
- package/src/llama.cpp/{ggml-impl.h → ggml/src/ggml-impl.h} +113 -9
- package/src/llama.cpp/{ggml-kompute.cpp → ggml/src/ggml-kompute.cpp} +33 -18
- package/src/llama.cpp/{ggml-quants.c → ggml/src/ggml-quants.c} +1460 -940
- package/src/llama.cpp/{ggml-quants.h → ggml/src/ggml-quants.h} +19 -20
- package/src/llama.cpp/{ggml-rpc.cpp → ggml/src/ggml-rpc.cpp} +95 -72
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +27 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +53 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +355 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +195 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/concat.hpp +21 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +547 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +27 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +698 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +1023 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.hpp +27 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +3011 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +3031 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/mmq.hpp +33 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +1027 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.hpp +27 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +374 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +35 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +66 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +275 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.hpp +22 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +251 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/softmax.hpp +24 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +1140 -0
- package/src/llama.cpp/ggml/src/ggml-sycl.cpp +5314 -0
- package/src/llama.cpp/{ggml-vulkan.cpp → ggml/src/ggml-vulkan.cpp} +1781 -1868
- package/src/llama.cpp/{ggml.c → ggml/src/ggml.c} +1245 -2087
- package/src/llama.cpp/{sgemm.cpp → ggml/src/llamafile/sgemm.cpp} +21 -24
- package/src/llama.cpp/{sgemm.h → ggml/src/llamafile/sgemm.h} +1 -1
- package/src/llama.cpp/ggml/src/vulkan-shaders/CMakeLists.txt +5 -0
- package/src/llama.cpp/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp +552 -0
- package/src/llama.cpp/{llama.h → include/llama.h} +175 -100
- package/src/llama.cpp/models/.editorconfig +1 -0
- package/src/llama.cpp/models/ggml-vocab-aquila.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-baichuan.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-command-r.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-command-r.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-command-r.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-falcon.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-falcon.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-falcon.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-gpt-neox.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-mpt.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-mpt.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-mpt.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-phi-3.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-phi-3.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-phi-3.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-qwen2.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-qwen2.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-qwen2.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-refact.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-refact.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-refact.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-starcoder.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-starcoder.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-starcoder.gguf.out +46 -0
- package/src/llama.cpp/pocs/vdot/CMakeLists.txt +2 -2
- package/src/llama.cpp/requirements/requirements-all.txt +12 -0
- package/src/llama.cpp/requirements/requirements-compare-llama-bench.txt +2 -0
- package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf.txt +3 -0
- package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf_update.txt +3 -0
- package/src/llama.cpp/requirements/{requirements-convert.txt → requirements-convert_legacy_llama.txt} +1 -1
- package/src/llama.cpp/requirements/requirements-convert_llama_ggml_to_gguf.txt +1 -0
- package/src/llama.cpp/requirements/requirements-convert_lora_to_gguf.txt +2 -0
- package/src/llama.cpp/requirements/requirements-pydantic.txt +3 -0
- package/src/llama.cpp/requirements/requirements-test-tokenizer-random.txt +1 -0
- package/src/llama.cpp/requirements.txt +5 -4
- package/src/llama.cpp/scripts/build-info.sh +30 -0
- package/src/llama.cpp/scripts/install-oneapi.bat +19 -0
- package/src/llama.cpp/src/CMakeLists.txt +33 -0
- package/src/llama.cpp/src/llama-grammar.cpp +539 -0
- package/src/llama.cpp/src/llama-grammar.h +39 -0
- package/src/llama.cpp/src/llama-impl.h +26 -0
- package/src/llama.cpp/src/llama-sampling.cpp +635 -0
- package/src/llama.cpp/src/llama-sampling.h +56 -0
- package/src/llama.cpp/src/llama-vocab.cpp +1721 -0
- package/src/llama.cpp/src/llama-vocab.h +130 -0
- package/src/llama.cpp/{llama.cpp → src/llama.cpp} +5979 -5260
- package/src/llama.cpp/{unicode-data.cpp → src/unicode-data.cpp} +851 -802
- package/src/llama.cpp/{unicode.cpp → src/unicode.cpp} +52 -30
- package/src/llama.cpp/{unicode.h → src/unicode.h} +5 -1
- package/src/llama.cpp/tests/CMakeLists.txt +19 -20
- package/src/llama.cpp/tests/test-backend-ops.cpp +245 -67
- package/src/llama.cpp/tests/test-chat-template.cpp +57 -3
- package/src/llama.cpp/tests/test-double-float.cpp +2 -2
- package/src/llama.cpp/tests/test-grad0.cpp +2 -2
- package/src/llama.cpp/tests/test-grammar-integration.cpp +978 -31
- package/src/llama.cpp/tests/test-grammar-parser.cpp +423 -158
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +508 -135
- package/src/llama.cpp/tests/test-llama-grammar.cpp +15 -9
- package/src/llama.cpp/tests/test-quantize-fns.cpp +1 -1
- package/src/llama.cpp/tests/test-quantize-perf.cpp +1 -1
- package/src/llama.cpp/tests/test-rope.cpp +3 -4
- package/src/llama.cpp/tests/test-sampling.cpp +5 -5
- package/src/llama.cpp/tests/test-tokenizer-0.cpp +6 -6
- package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +20 -15
- package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +22 -11
- package/bin/darwin/arm64/default.metallib +0 -0
- package/bin/darwin/x64/default.metallib +0 -0
- package/src/llama.cpp/examples/beam-search/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/beam-search/beam-search.cpp +0 -188
- package/src/llama.cpp/examples/finetune/finetune.cpp +0 -1862
- package/src/llama.cpp/examples/llama.android/llama/CMakeLists.txt +0 -55
- package/src/llama.cpp/examples/train-text-from-scratch/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/train-text-from-scratch/train-text-from-scratch.cpp +0 -1253
- package/src/llama.cpp/ggml-opencl.cpp +0 -2305
- package/src/llama.cpp/ggml-opencl.h +0 -36
- package/src/llama.cpp/ggml-sycl.cpp +0 -17340
- package/src/llama.cpp/ggml-vulkan-shaders.hpp +0 -81211
- package/src/llama.cpp/requirements/requirements-convert-hf-to-gguf-update.txt +0 -2
- package/src/llama.cpp/requirements/requirements-convert-hf-to-gguf.txt +0 -2
- package/src/llama.cpp/requirements/requirements-convert-llama-ggml-to-gguf.txt +0 -1
- package/src/llama.cpp/scripts/gen-build-info-cpp.cmake +0 -24
- /package/src/llama.cpp/{ggml-alloc.h → ggml/include/ggml-alloc.h} +0 -0
- /package/src/llama.cpp/{ggml-kompute.h → ggml/include/ggml-kompute.h} +0 -0
- /package/src/llama.cpp/{ggml-rpc.h → ggml/include/ggml-rpc.h} +0 -0
- /package/src/llama.cpp/{ggml-vulkan.h → ggml/include/ggml-vulkan.h} +0 -0
- /package/src/llama.cpp/{unicode-data.h → src/unicode-data.h} +0 -0

package/src/llama.cpp/tests/test-llama-grammar.cpp

@@ -2,10 +2,12 @@
 #undef NDEBUG
 #endif
 
-#
+#define LLAMA_API_INTERNAL
+#include "llama.h"
 #include "grammar-parser.h"
 
 #include <cassert>
+#include <stdexcept>
 
 int main()
 {
@@ -112,10 +114,14 @@ int main()
     }
     }
 
-    llama_grammar *grammar = NULL;
+    llama_grammar * grammar = NULL;
     std::vector<const llama_grammar_element *> grammar_rules(parsed_grammar.c_rules());
-
-
+
+    grammar = llama_grammar_init(grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
+    if (grammar == nullptr)
+    {
+        throw std::runtime_error("Failed to initialize llama_grammar");
+    }
 
     std::vector<std::vector<llama_grammar_element>> expected_stacks = {
     {
@@ -168,7 +174,7 @@ int main()
     }};
 
     auto index = 0;
-    for (auto stack : grammar
+    for (auto stack : llama_grammar_get_stacks(grammar))
     {
         // compare stack to expected_stack
         for (uint32_t i = 0; i < stack.size(); i++)
@@ -370,13 +376,13 @@ int main()
     },
     };
 
-    std::vector<llama_grammar_candidate> rejects = llama_grammar_reject_candidates_for_stack(grammar
+    std::vector<llama_grammar_candidate> rejects = llama_grammar_reject_candidates_for_stack(llama_grammar_get_rules(grammar), llama_grammar_get_stacks(grammar)[0], next_candidates);
 
     std::vector<std::vector<llama_grammar_candidate>> all_rejects;
 
-    for (std::size_t count = 0; count < grammar
+    for (std::size_t count = 0; count < llama_grammar_get_stacks(grammar).size(); ++count)
     {
-        rejects = llama_grammar_reject_candidates_for_stack(grammar
+        rejects = llama_grammar_reject_candidates_for_stack(llama_grammar_get_rules(grammar), llama_grammar_get_stacks(grammar)[count], next_candidates);
         all_rejects.push_back(rejects);
     }
 
@@ -397,6 +403,6 @@ int main()
         delete[] candidate.code_points;
         candidate.code_points = nullptr;
     }
-
+    llama_grammar_free(grammar);
     return 0;
 }

package/src/llama.cpp/tests/test-quantize-fns.cpp

@@ -60,7 +60,7 @@ static float reference_quantization_error(ggml_type_traits_t & qfns, size_t test
     qfns.from_float(test_data, tmp_q.data(), test_size);
     qfns.to_float(tmp_q.data(), tmp_out.data(), test_size);
 
-    qfns.
+    qfns.from_float_ref(test_data, tmp_q.data(), test_size);
     qfns.to_float(tmp_q.data(), tmp_out_ref.data(), test_size);
 
     return array_rmse(tmp_out.data(), tmp_out_ref.data(), test_size);

package/src/llama.cpp/tests/test-quantize-perf.cpp

@@ -285,7 +285,7 @@ int main(int argc, char * argv[]) {
        for (size_t size : params.test_sizes) {
            printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
            auto quantize_fn = [&](void) -> float {
-               qfns.
+               qfns.from_float_ref(test_data1, test_q1, size);
                return test_q1[0];
            };
            size_t quantized_size = ggml_row_size(type, size);

package/src/llama.cpp/tests/test-rope.cpp

@@ -162,12 +162,12 @@ int main(int /*argc*/, const char ** /*argv*/) {
        x = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
 
        // 100, 101, 102, ..., 172
-       struct ggml_tensor * r0 = ggml_rope(ctx0, x, p0, n_rot, mode
+       struct ggml_tensor * r0 = ggml_rope(ctx0, x, p0, n_rot, mode);
        // -67, -67, -67, ..., -67
-       struct ggml_tensor * r1 = ggml_rope(ctx0, r0, p1, n_rot, mode
+       struct ggml_tensor * r1 = ggml_rope(ctx0, r0, p1, n_rot, mode); // "context swap", i.e. forget n_past_0 - n_past_2 tokens
 
        // 33, 34, 35, ..., 105
-       struct ggml_tensor * r2 = ggml_rope(ctx0, x, p2, n_rot, mode
+       struct ggml_tensor * r2 = ggml_rope(ctx0, x, p2, n_rot, mode);
 
        ggml_cgraph * gf = ggml_new_graph(ctx0);
 
@@ -218,4 +218,3 @@ int main(int /*argc*/, const char ** /*argv*/) {
 
    return 0;
 }
-

package/src/llama.cpp/tests/test-sampling.cpp

@@ -166,12 +166,12 @@ static void test_sampler_queue(
    for (auto s : samplers_sequence) {
        switch (s){
            case 'k': llama_sample_top_k (nullptr, &candidates_p, top_k, 1); break;
-           case 'f':
-           case 'y':
+           case 'f': GGML_ABORT("tail_free test not implemented"); break;
+           case 'y': GGML_ABORT("typical test not implemented"); break;
            case 'p': llama_sample_top_p (nullptr, &candidates_p, top_p, 1); break;
            case 'm': llama_sample_min_p (nullptr, &candidates_p, min_p, 1); break;
-           case 't':
-           default :
+           case 't': GGML_ABORT("temperature test not implemented"); break;
+           default : GGML_ABORT("Unknown sampler"); break;
        }
 
        llama_sample_softmax(nullptr, &candidates_p); // make sure tokens are sorted for tests
@@ -222,7 +222,7 @@ static void test_sampler_queue(
        GGML_ASSERT(candidates_p.data[0].id == max_token_id);
        GGML_ASSERT(candidates_p.data[expected_size-1].id == min_token_id);
    } else {
-
+       GGML_ABORT("fatal error");
    }
 }
 

package/src/llama.cpp/tests/test-tokenizer-0.cpp

@@ -195,11 +195,11 @@ int main(int argc, char **argv) {
    const bool add_special = false;
 
    for (const auto & test_kv : k_tests) {
-       const std::vector<llama_token> res = llama_tokenize(ctx, test_kv.first, add_special);
+       const std::vector<llama_token> res = llama_tokenize(ctx, test_kv.first, add_special, false);
 
        printf("\n");
        printf("src: '%s'\n", test_kv.first.c_str());
-       printf("res: '%s'\n",
+       printf("res: '%s'\n", llama_detokenize(ctx, res).c_str());
        printf("tok: ");
        for (const auto & tok : res) {
            printf("%d ", tok);
@@ -216,8 +216,8 @@ int main(int argc, char **argv) {
        if (!correct) {
            fprintf(stderr, "%s : failed test: '%s'\n", __func__, test_kv.first.c_str());
            fprintf(stderr, "%s : detokenized to: '%s' instead of '%s'\n", __func__,
-
-
+               llama_detokenize(ctx, res).c_str(),
+               llama_detokenize(ctx, test_kv.second).c_str());
            fprintf(stderr, "%s : expected tokens: ", __func__);
            for (const auto & t : test_kv.second) {
                fprintf(stderr, "%6d '%s', ", t, llama_token_to_piece(ctx, t).c_str());
@@ -253,7 +253,7 @@ int main(int argc, char **argv) {
        {
            const auto t_start = ggml_time_us();
 
-           res = llama_tokenize(ctx, text, add_special);
+           res = llama_tokenize(ctx, text, add_special, false);
 
            const auto t_end = ggml_time_us();
 
@@ -272,7 +272,7 @@ int main(int argc, char **argv) {
        }
 
        for (const auto & tok : res) {
-           //ofs << tok << " '" << string_strip(
+           //ofs << tok << " '" << string_strip(llama_detokenize(ctx, std::vector<int>{tok})) << "'" << std::endl;
            ofs << tok << "\n";
        }
    }

package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp

@@ -11,6 +11,7 @@
 #include <string>
 #include <thread>
 #include <vector>
+#include <atomic>
 
 int main(int argc, char **argv) {
    if (argc < 2 || argc > 3) {
@@ -63,7 +64,10 @@ int main(int argc, char **argv) {
        }
    }
 
-   GGML_ASSERT(llama_vocab_type(model) == LLAMA_VOCAB_TYPE_BPE);
+   //GGML_ASSERT(llama_vocab_type(model) == LLAMA_VOCAB_TYPE_BPE);
+   if (llama_vocab_type(model) != LLAMA_VOCAB_TYPE_BPE) {
+       return 99;
+   }
 
 #ifdef _WIN32
    // We need this for unicode console support
@@ -74,7 +78,7 @@ int main(int argc, char **argv) {
    const int n_vocab = llama_n_vocab(model);
 
    for (int i = 0; i < n_vocab; ++i) {
-       std::string str =
+       std::string str = llama_detokenize(ctx, std::vector<int>(1, i));
        try {
            auto cps = unicode_cpts_from_utf8(str);
            std::vector<llama_token> tokens = llama_tokenize(ctx, str, false, true);
@@ -90,7 +94,7 @@ int main(int argc, char **argv) {
            fprintf(stderr, "]\n");
            return 2;
        }
-       std::string check =
+       std::string check = llama_detokenize(ctx, tokens);
        if (check != str) {
            fprintf(stderr, "%s : error: token %d detokenizes to '%s'(%zu) but tokenization of this detokenizes to '%s'(%zu)\n",
                __func__, i, str.c_str(), str.length(), check.c_str(), check.length());
@@ -108,26 +112,23 @@ int main(int argc, char **argv) {
 
        std::vector<std::thread> threads(nthread);
 
+       std::atomic_int errcode = {};
+
        for (int i = 0; i < nthread; ++i) {
-           threads[i] = std::thread([i, nthread, ctx]() {
-               for (uint32_t cp = i; cp <
-                   if (
-
-                       (cp < 0x13 || cp > 0x17) && cp != 0x19 &&
-                       (cp < 0x1c || cp > 0x1e) &&
-                       (cp < 0xd800 || cp > 0xdfff) &&
-                       (cp < 0x00040000 || cp >= 0x000e0000)
-                       )) {
+           threads[i] = std::thread([i, nthread, ctx, &errcode]() {
+               for (uint32_t cp = i; !errcode && cp < 0x00110000; cp += nthread) {
+                   if ((0x0000D800 <= cp && cp <= 0x0000DFFF) || // surrogates \p{Cs}
+                       (0x00040000 <= cp && cp <= 0x000E0000)) { // undefined \p{Cn}
                        continue;
                    }
 
                    std::string str = unicode_cpt_to_utf8(cp);
                    std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);
-                   std::string check =
+                   std::string check = llama_detokenize(ctx, tokens);
                    if (cp != 9601 && str != check) {
-                       fprintf(stderr, "error: codepoint %x detokenizes to '%s'(%zu) instead of '%s'(%zu)\n",
+                       fprintf(stderr, "error: codepoint 0x%x detokenizes to '%s'(%zu) instead of '%s'(%zu)\n",
                            cp, check.c_str(), check.length(), str.c_str(), str.length());
-
+                       errcode = 3;
                    }
                }
            });
@@ -136,6 +137,10 @@ int main(int argc, char **argv) {
        for (auto & t : threads) {
            t.join();
        }
+
+       if (errcode) {
+           return errcode;
+       }
    }
 
    llama_free_model(model);

package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp

@@ -11,6 +11,7 @@
 #include <string>
 #include <thread>
 #include <vector>
+#include <atomic>
 
 int main(int argc, char ** argv) {
    if (argc < 2) {
@@ -51,7 +52,10 @@ int main(int argc, char ** argv) {
        }
    }
 
-   GGML_ASSERT(llama_vocab_type(model) == LLAMA_VOCAB_TYPE_SPM);
+   //GGML_ASSERT(llama_vocab_type(model) == LLAMA_VOCAB_TYPE_SPM);
+   if (llama_vocab_type(model) != LLAMA_VOCAB_TYPE_SPM) {
+       return 99;
+   }
 
 #ifdef _WIN32
    // We need this for unicode console support
@@ -62,9 +66,9 @@ int main(int argc, char ** argv) {
    const int n_vocab = llama_n_vocab(model);
 
    for (int i = 0; i < n_vocab; ++i) {
-       std::string str =
-       std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);
-       std::string check =
+       std::string str = llama_detokenize(ctx, std::vector<int>(1, i), true);
+       std::vector<llama_token> tokens = llama_tokenize(ctx, str, false, true);
+       std::string check = llama_detokenize(ctx, tokens);
        if (check != str) {
            fprintf(stderr, "%s : error: token %d detokenizes to '%s'(%zu) but tokenization of this detokenizes to '%s'(%zu)\n",
                __func__, i, str.c_str(), str.length(), check.c_str(), check.length());
@@ -78,20 +82,23 @@ int main(int argc, char ** argv) {
 
        std::vector<std::thread> threads(nthread);
 
+       std::atomic_int errcode = {};
+
        for (int i = 0; i < nthread; ++i) {
-           threads[i] = std::thread([i, nthread, ctx]() {
-               for (uint32_t cp = i; cp <
-                   if (
+           threads[i] = std::thread([i, nthread, ctx, &errcode]() {
+               for (uint32_t cp = i; !errcode && cp < 0x00110000; cp += nthread) {
+                   if ((0x0000D800 <= cp && cp <= 0x0000DFFF) || // surrogates \p{Cs}
+                       (0x00040000 <= cp && cp <= 0x000E0000)) { // undefined \p{Cn}
                        continue;
                    }
 
                    std::string str = unicode_cpt_to_utf8(cp);
-                   std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);
-                   std::string check =
+                   std::vector<llama_token> tokens = llama_tokenize(ctx, str, false, true);
+                   std::string check = llama_detokenize(ctx, tokens);
                    if (cp != 9601 && str != check) {
-                       fprintf(stderr, "error: codepoint %x detokenizes to '%s'(%zu) instead of '%s'(%zu)\n",
+                       fprintf(stderr, "error: codepoint 0x%x detokenizes to '%s'(%zu) instead of '%s'(%zu)\n",
                            cp, check.c_str(), check.length(), str.c_str(), str.length());
-
+                       errcode = 3;
                    }
                }
            });
@@ -100,6 +107,10 @@ int main(int argc, char ** argv) {
        for (auto & t : threads) {
            t.join();
        }
+
+       if(errcode) {
+           return errcode;
+       }
    }
 
    llama_free_model(model);

package/bin/darwin/arm64/default.metallib: Binary file
package/bin/darwin/x64/default.metallib: Binary file

package/src/llama.cpp/examples/beam-search/beam-search.cpp

@@ -1,188 +0,0 @@
-#include "common.h"
-#include "llama.h"
-
-#include <cassert>
-#include <cinttypes>
-#include <cmath>
-#include <cstdio>
-#include <cstring>
-#include <ctime>
-#include <fstream>
-#include <iostream>
-#include <string>
-#include <vector>
-
-#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
-#include <signal.h>
-#include <unistd.h>
-#elif defined (_WIN32)
-#define WIN32_LEAN_AND_MEAN
-#ifndef NOMINMAX
-# define NOMINMAX
-#endif
-#include <windows.h>
-#include <signal.h>
-#endif
-
-// Used for debugging to print out beam tokens.
-struct ostream_beam_view {
-    llama_context * ctx;
-    llama_beam_view beam_view;
-};
-
-static std::ostream & operator<<(std::ostream & os, const ostream_beam_view & obv) {
-    os << "p(" << obv.beam_view.p << ") eob(" << std::boolalpha << obv.beam_view.eob << ") tokens(";
-    for (size_t i = 0 ; i < obv.beam_view.n_tokens ; ++i) {
-        os << llama_token_to_piece(obv.ctx, obv.beam_view.tokens[i]);
-    }
-    return os << ')';
-}
-
-// Put here anything you want back in beam_search_callback().
-struct beam_search_callback_data {
-    llama_context * ctx;
-    std::vector<llama_token> response;
-};
-
-// In this case, end-of-beam (eob) is equivalent to end-of-sentence (eos) but this need not always be the same.
-// For example, eob can be flagged due to maximum token length, stop words, etc.
-static bool is_at_eob(const beam_search_callback_data & callback_data, const llama_token * tokens, size_t n_tokens) {
-    return n_tokens && llama_token_is_eog(llama_get_model(callback_data.ctx), tokens[n_tokens-1]);
-}
-
-// Function matching type llama_beam_search_callback_fn_t.
-// Custom callback example is called each time the beams lengths increase:
-// * Show progress by printing ',' following by number of convergent beam tokens if any.
-// * When all beams converge to a common prefix, they are made available in beams_state.beams[0].
-// This is also called when the stop condition is met.
-// Collect tokens into std::vector<llama_token> response which is pointed to by callback_data.
-static void beam_search_callback(void * callback_data_ptr, llama_beams_state beams_state) {
-    auto& callback_data = *static_cast<beam_search_callback_data*>(callback_data_ptr);
-    // Mark beams as EOS as needed.
-    for (size_t i = 0 ; i < beams_state.n_beams ; ++i) {
-        llama_beam_view& beam_view = beams_state.beam_views[i];
-        if (!beam_view.eob && is_at_eob(callback_data, beam_view.tokens, beam_view.n_tokens)) {
-            beam_view.eob = true;
-        }
-    }
-    printf(","); // Show progress
-    if (const size_t n = beams_state.common_prefix_length) {
-        callback_data.response.resize(callback_data.response.size() + n);
-        assert(0u < beams_state.n_beams);
-        const llama_token * tokens = beams_state.beam_views[0].tokens;
-        std::copy(tokens, tokens + n, callback_data.response.end() - n);
-        printf("%zu", n);
-    }
-    fflush(stdout);
-#if 1 // DEBUG: print current beams for this iteration
-    std::cout << "\n\nCurrent beams (last_call=" << beams_state.last_call << "):\n";
-    for (size_t i = 0 ; i < beams_state.n_beams ; ++i) {
-        std::cout << "beams["<<i<<"]: " << ostream_beam_view{callback_data.ctx,beams_state.beam_views[i]} << std::endl;
-    }
-#endif
-}
-
-int main(int argc, char ** argv)
-{
-    gpt_params params;
-    //params.n_gpu_layers = 200;
-
-    //---------------------------------
-    // Print help :
-    //---------------------------------
-
-    if ( argc < 2 || argv[1][0] == '-' )
-    {
-        printf( "Usage: %s MODEL_PATH [BEAM_WIDTH=2] [PROMPT]\n" , argv[0] );
-        return 1 ;
-    }
-
-    //---------------------------------
-    // Load parameters :
-    //---------------------------------
-
-    params.model = argv[1];
-
-    params.n_beams = 2 < argc ? std::stoi(argv[2]) : 2;
-
-    if ( argc > 3 )
-    {
-        params.prompt = argv[3];
-    }
-
-    if ( params.prompt.empty() )
-    {
-        params.prompt = "### Request:\nHow many countries are there?\n\n### Response:\n";
-    }
-
-    //---------------------------------
-    // Init LLM :
-    //---------------------------------
-
-    llama_backend_init();
-    llama_numa_init(params.numa);
-
-    llama_model * model;
-    llama_context * ctx;
-
-    std::tie(model, ctx) = llama_init_from_gpt_params( params );
-
-    if ( model == NULL )
-    {
-        fprintf( stderr , "%s: error: unable to load model\n" , __func__ );
-        return 1;
-    }
-
-    //---------------------------------
-    // Tokenize the prompt :
-    //---------------------------------
-
-    std::vector<llama_token> tokens_list = llama_tokenize(ctx, params.prompt, true);
-
-    const size_t max_context_size = llama_n_ctx( ctx );
-    const size_t max_tokens_list_size = max_context_size - 4 ;
-
-    if (tokens_list.size() > max_tokens_list_size)
-    {
-        fprintf( stderr , "%s: error: prompt too long (%zu tokens, max %zu)\n" ,
-            __func__ , tokens_list.size() , max_tokens_list_size );
-        return 1;
-    }
-
-    fprintf( stderr, "\n\n" );
-
-    // Print the tokens from the prompt :
-
-    for( auto id : tokens_list )
-    {
-        std::cout << llama_token_to_piece(ctx, id);
-    }
-    std::cout << std::flush;
-
-    int n_past = 0;
-
-    if (llama_decode(ctx, llama_batch_get_one(tokens_list.data(), tokens_list.size(), n_past, 0)))
-    {
-        fprintf(stderr, "%s : failed to eval prompt.\n" , __func__ );
-        return 1;
-    }
-    n_past += tokens_list.size();
-
-    beam_search_callback_data callback_data{ctx, {}};
-    size_t const beam_width = static_cast<size_t>(params.n_beams);
-    int const n_predict = 256;
-    llama_beam_search(ctx, beam_search_callback, &callback_data, beam_width, n_past, n_predict);
-
-    std::cout << "\n\n";
-    for (llama_token const token_id : callback_data.response) {
-        std::cout << llama_token_to_piece(ctx,token_id);
-    }
-    std::cout << std::endl;
-
-    llama_free( ctx );
-    llama_free_model( model );
-
-    llama_backend_free();
-
-    return 0;
-}