@fugood/llama.node 0.2.2 → 0.3.0
This diff shows the changes between publicly released versions of the package as they appear in their respective public registries, and is provided for informational purposes only.
- package/CMakeLists.txt +5 -2
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +8 -1
- package/package.json +1 -1
- package/patches/llama.patch +12 -12
- package/src/DetokenizeWorker.cpp +1 -1
- package/src/LlamaContext.cpp +33 -1
- package/src/LlamaContext.h +1 -0
- package/src/LoadSessionWorker.cpp +1 -0
- package/src/llama.cpp/.github/workflows/bench.yml +310 -0
- package/src/llama.cpp/.github/workflows/build.yml +1315 -0
- package/src/llama.cpp/.github/workflows/close-issue.yml +23 -0
- package/src/llama.cpp/.github/workflows/docker.yml +116 -0
- package/src/llama.cpp/.github/workflows/editorconfig.yml +27 -0
- package/src/llama.cpp/.github/workflows/gguf-publish.yml +44 -0
- package/src/llama.cpp/.github/workflows/labeler.yml +17 -0
- package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +65 -0
- package/src/llama.cpp/.github/workflows/nix-ci.yml +72 -0
- package/src/llama.cpp/.github/workflows/nix-flake-update.yml +22 -0
- package/src/llama.cpp/.github/workflows/nix-publish-flake.yml +36 -0
- package/src/llama.cpp/.github/workflows/python-check-requirements.yml +35 -0
- package/src/llama.cpp/.github/workflows/python-lint.yml +23 -0
- package/src/llama.cpp/.github/workflows/python-type-check.yml +38 -0
- package/src/llama.cpp/.github/workflows/server.yml +183 -0
- package/src/llama.cpp/CMakeLists.txt +91 -1245
- package/src/llama.cpp/cmake/arm64-windows-llvm.cmake +1 -1
- package/src/llama.cpp/cmake/build-info.cmake +58 -0
- package/src/llama.cpp/cmake/git-vars.cmake +22 -0
- package/src/llama.cpp/common/CMakeLists.txt +4 -3
- package/src/llama.cpp/common/build-info.cpp.in +4 -0
- package/src/llama.cpp/common/common.cpp +1116 -877
- package/src/llama.cpp/common/common.h +191 -77
- package/src/llama.cpp/common/grammar-parser.cpp +118 -31
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +346 -65
- package/src/llama.cpp/common/log.h +1 -1
- package/src/llama.cpp/common/ngram-cache.h +10 -3
- package/src/llama.cpp/common/sampling.cpp +19 -10
- package/src/llama.cpp/docs/build.md +353 -0
- package/src/llama.cpp/examples/CMakeLists.txt +22 -22
- package/src/llama.cpp/examples/baby-llama/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +6 -6
- package/src/llama.cpp/examples/batched/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/batched/batched.cpp +52 -55
- package/src/llama.cpp/examples/batched-bench/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +20 -72
- package/src/llama.cpp/examples/benchmark/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/chat-13B.bat +57 -0
- package/src/llama.cpp/examples/convert-llama2c-to-ggml/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/{finetune → cvector-generator}/CMakeLists.txt +2 -2
- package/src/llama.cpp/examples/cvector-generator/completions.txt +582 -0
- package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +503 -0
- package/src/llama.cpp/examples/cvector-generator/mean.hpp +48 -0
- package/src/llama.cpp/examples/cvector-generator/negative.txt +4 -0
- package/src/llama.cpp/examples/cvector-generator/pca.hpp +325 -0
- package/src/llama.cpp/examples/cvector-generator/positive.txt +4 -0
- package/src/llama.cpp/examples/deprecation-warning/deprecation-warning.cpp +35 -0
- package/src/llama.cpp/examples/embedding/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/embedding/embedding.cpp +94 -46
- package/src/llama.cpp/examples/eval-callback/CMakeLists.txt +2 -2
- package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +4 -6
- package/src/llama.cpp/examples/export-lora/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +344 -386
- package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +2 -2
- package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +30 -25
- package/src/llama.cpp/examples/gguf/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gguf/gguf.cpp +5 -0
- package/src/llama.cpp/examples/gguf-hash/CMakeLists.txt +15 -0
- package/src/llama.cpp/examples/gguf-hash/deps/rotate-bits/rotate-bits.h +46 -0
- package/src/llama.cpp/examples/gguf-hash/deps/sha1/sha1.c +295 -0
- package/src/llama.cpp/examples/gguf-hash/deps/sha1/sha1.h +52 -0
- package/src/llama.cpp/examples/gguf-hash/deps/sha256/sha256.c +221 -0
- package/src/llama.cpp/examples/gguf-hash/deps/sha256/sha256.h +24 -0
- package/src/llama.cpp/examples/gguf-hash/deps/xxhash/xxhash.c +42 -0
- package/src/llama.cpp/examples/gguf-hash/deps/xxhash/xxhash.h +7093 -0
- package/src/llama.cpp/examples/gguf-hash/gguf-hash.cpp +693 -0
- package/src/llama.cpp/examples/gguf-split/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +3 -3
- package/src/llama.cpp/examples/gritlm/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +6 -2
- package/src/llama.cpp/examples/imatrix/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +137 -176
- package/src/llama.cpp/examples/infill/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/infill/infill.cpp +38 -153
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +175 -94
- package/src/llama.cpp/examples/llama.android/app/build.gradle.kts +65 -0
- package/src/llama.cpp/examples/llama.android/build.gradle.kts +6 -0
- package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +68 -0
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/CMakeLists.txt +11 -7
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +2 -2
- package/src/llama.cpp/examples/llama.android/settings.gradle.kts +18 -0
- package/src/llama.cpp/examples/llava/CMakeLists.txt +6 -5
- package/src/llama.cpp/examples/llava/android/build_64.sh +8 -0
- package/src/llama.cpp/examples/llava/clip.cpp +23 -14
- package/src/llama.cpp/examples/llava/llava-cli.cpp +8 -6
- package/src/llama.cpp/examples/llava/requirements.txt +3 -2
- package/src/llama.cpp/examples/lookahead/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +2 -1
- package/src/llama.cpp/examples/lookup/CMakeLists.txt +4 -4
- package/src/llama.cpp/examples/lookup/lookup-create.cpp +2 -0
- package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
- package/src/llama.cpp/examples/lookup/lookup-stats.cpp +2 -2
- package/src/llama.cpp/examples/lookup/lookup.cpp +1 -1
- package/src/llama.cpp/examples/main/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/main/main.cpp +98 -75
- package/src/llama.cpp/examples/main-cmake-pkg/CMakeLists.txt +4 -5
- package/src/llama.cpp/examples/parallel/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/parallel/parallel.cpp +2 -1
- package/src/llama.cpp/examples/passkey/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/passkey/passkey.cpp +23 -43
- package/src/llama.cpp/examples/perplexity/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +13 -10
- package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/quantize/quantize.cpp +37 -34
- package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +1 -1
- package/src/llama.cpp/examples/retrieval/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/retrieval/retrieval.cpp +26 -77
- package/src/llama.cpp/examples/save-load-state/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +14 -7
- package/src/llama.cpp/examples/server/CMakeLists.txt +26 -2
- package/src/llama.cpp/examples/server/server.cpp +274 -671
- package/src/llama.cpp/examples/server/tests/requirements.txt +2 -2
- package/src/llama.cpp/examples/server/utils.hpp +28 -29
- package/src/llama.cpp/examples/simple/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/simple/simple.cpp +21 -29
- package/src/llama.cpp/examples/speculative/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/speculative/speculative.cpp +2 -1
- package/src/llama.cpp/examples/sycl/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/sycl/build.sh +23 -0
- package/src/llama.cpp/examples/sycl/run-llama2.sh +36 -0
- package/src/llama.cpp/examples/sycl/win-build-sycl.bat +33 -0
- package/src/llama.cpp/examples/sycl/win-run-llama2.bat +9 -0
- package/src/llama.cpp/examples/tokenize/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/tokenize/tokenize.cpp +16 -2
- package/src/llama.cpp/ggml/CMakeLists.txt +253 -0
- package/src/llama.cpp/{cmake → ggml/cmake}/FindSIMD.cmake +6 -6
- package/src/llama.cpp/{ggml-backend.h → ggml/include/ggml-backend.h} +22 -17
- package/src/llama.cpp/ggml/include/ggml-blas.h +23 -0
- package/src/llama.cpp/ggml/include/ggml-cann.h +125 -0
- package/src/llama.cpp/{ggml-cuda.h → ggml/include/ggml-cuda.h} +3 -0
- package/src/llama.cpp/{ggml-metal.h → ggml/include/ggml-metal.h} +1 -2
- package/src/llama.cpp/{ggml-sycl.h → ggml/include/ggml-sycl.h} +3 -10
- package/src/llama.cpp/{ggml.h → ggml/include/ggml.h} +80 -85
- package/src/llama.cpp/ggml/src/CMakeLists.txt +1329 -0
- package/src/llama.cpp/ggml/src/ggml-aarch64.c +2193 -0
- package/src/llama.cpp/ggml/src/ggml-aarch64.h +39 -0
- package/src/llama.cpp/{ggml-alloc.c → ggml/src/ggml-alloc.c} +100 -49
- package/src/llama.cpp/{ggml-backend-impl.h → ggml/src/ggml-backend-impl.h} +20 -8
- package/src/llama.cpp/{ggml-backend.c → ggml/src/ggml-backend.c} +307 -167
- package/src/llama.cpp/ggml/src/ggml-blas.cpp +367 -0
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +198 -0
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +230 -0
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +2944 -0
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +592 -0
- package/src/llama.cpp/ggml/src/ggml-cann/common.h +282 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +32 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +17 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +223 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +186 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +180 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +193 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +191 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +208 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +206 -0
- package/src/llama.cpp/ggml/src/ggml-cann.cpp +2023 -0
- package/src/llama.cpp/{ggml-common.h → ggml/src/ggml-common.h} +41 -7
- package/src/llama.cpp/{ggml-impl.h → ggml/src/ggml-impl.h} +113 -9
- package/src/llama.cpp/{ggml-kompute.cpp → ggml/src/ggml-kompute.cpp} +33 -18
- package/src/llama.cpp/{ggml-quants.c → ggml/src/ggml-quants.c} +1460 -940
- package/src/llama.cpp/{ggml-quants.h → ggml/src/ggml-quants.h} +19 -20
- package/src/llama.cpp/{ggml-rpc.cpp → ggml/src/ggml-rpc.cpp} +95 -72
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +27 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +53 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +355 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +195 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/concat.hpp +21 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +547 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +27 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +698 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +1023 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.hpp +27 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +3011 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +3031 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/mmq.hpp +33 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +1027 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.hpp +27 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +374 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +35 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +66 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +275 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.hpp +22 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +251 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/softmax.hpp +24 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +1140 -0
- package/src/llama.cpp/ggml/src/ggml-sycl.cpp +5314 -0
- package/src/llama.cpp/{ggml-vulkan.cpp → ggml/src/ggml-vulkan.cpp} +1781 -1868
- package/src/llama.cpp/{ggml.c → ggml/src/ggml.c} +1245 -2087
- package/src/llama.cpp/{sgemm.cpp → ggml/src/llamafile/sgemm.cpp} +21 -24
- package/src/llama.cpp/{sgemm.h → ggml/src/llamafile/sgemm.h} +1 -1
- package/src/llama.cpp/ggml/src/vulkan-shaders/CMakeLists.txt +5 -0
- package/src/llama.cpp/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp +552 -0
- package/src/llama.cpp/{llama.h → include/llama.h} +175 -100
- package/src/llama.cpp/models/.editorconfig +1 -0
- package/src/llama.cpp/models/ggml-vocab-aquila.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-baichuan.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-command-r.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-command-r.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-command-r.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-falcon.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-falcon.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-falcon.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-gpt-neox.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-mpt.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-mpt.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-mpt.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-phi-3.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-phi-3.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-phi-3.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-qwen2.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-qwen2.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-qwen2.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-refact.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-refact.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-refact.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-starcoder.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-starcoder.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-starcoder.gguf.out +46 -0
- package/src/llama.cpp/pocs/vdot/CMakeLists.txt +2 -2
- package/src/llama.cpp/requirements/requirements-all.txt +12 -0
- package/src/llama.cpp/requirements/requirements-compare-llama-bench.txt +2 -0
- package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf.txt +3 -0
- package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf_update.txt +3 -0
- package/src/llama.cpp/requirements/{requirements-convert.txt → requirements-convert_legacy_llama.txt} +1 -1
- package/src/llama.cpp/requirements/requirements-convert_llama_ggml_to_gguf.txt +1 -0
- package/src/llama.cpp/requirements/requirements-convert_lora_to_gguf.txt +2 -0
- package/src/llama.cpp/requirements/requirements-pydantic.txt +3 -0
- package/src/llama.cpp/requirements/requirements-test-tokenizer-random.txt +1 -0
- package/src/llama.cpp/requirements.txt +5 -4
- package/src/llama.cpp/scripts/build-info.sh +30 -0
- package/src/llama.cpp/scripts/install-oneapi.bat +19 -0
- package/src/llama.cpp/src/CMakeLists.txt +33 -0
- package/src/llama.cpp/src/llama-grammar.cpp +539 -0
- package/src/llama.cpp/src/llama-grammar.h +39 -0
- package/src/llama.cpp/src/llama-impl.h +26 -0
- package/src/llama.cpp/src/llama-sampling.cpp +635 -0
- package/src/llama.cpp/src/llama-sampling.h +56 -0
- package/src/llama.cpp/src/llama-vocab.cpp +1721 -0
- package/src/llama.cpp/src/llama-vocab.h +130 -0
- package/src/llama.cpp/{llama.cpp → src/llama.cpp} +5979 -5260
- package/src/llama.cpp/{unicode-data.cpp → src/unicode-data.cpp} +851 -802
- package/src/llama.cpp/{unicode.cpp → src/unicode.cpp} +52 -30
- package/src/llama.cpp/{unicode.h → src/unicode.h} +5 -1
- package/src/llama.cpp/tests/CMakeLists.txt +19 -20
- package/src/llama.cpp/tests/test-backend-ops.cpp +245 -67
- package/src/llama.cpp/tests/test-chat-template.cpp +57 -3
- package/src/llama.cpp/tests/test-double-float.cpp +2 -2
- package/src/llama.cpp/tests/test-grad0.cpp +2 -2
- package/src/llama.cpp/tests/test-grammar-integration.cpp +978 -31
- package/src/llama.cpp/tests/test-grammar-parser.cpp +423 -158
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +508 -135
- package/src/llama.cpp/tests/test-llama-grammar.cpp +15 -9
- package/src/llama.cpp/tests/test-quantize-fns.cpp +1 -1
- package/src/llama.cpp/tests/test-quantize-perf.cpp +1 -1
- package/src/llama.cpp/tests/test-rope.cpp +3 -4
- package/src/llama.cpp/tests/test-sampling.cpp +5 -5
- package/src/llama.cpp/tests/test-tokenizer-0.cpp +6 -6
- package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +20 -15
- package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +22 -11
- package/bin/darwin/arm64/default.metallib +0 -0
- package/bin/darwin/x64/default.metallib +0 -0
- package/src/llama.cpp/examples/beam-search/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/beam-search/beam-search.cpp +0 -188
- package/src/llama.cpp/examples/finetune/finetune.cpp +0 -1862
- package/src/llama.cpp/examples/llama.android/llama/CMakeLists.txt +0 -55
- package/src/llama.cpp/examples/train-text-from-scratch/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/train-text-from-scratch/train-text-from-scratch.cpp +0 -1253
- package/src/llama.cpp/ggml-opencl.cpp +0 -2305
- package/src/llama.cpp/ggml-opencl.h +0 -36
- package/src/llama.cpp/ggml-sycl.cpp +0 -17340
- package/src/llama.cpp/ggml-vulkan-shaders.hpp +0 -81211
- package/src/llama.cpp/requirements/requirements-convert-hf-to-gguf-update.txt +0 -2
- package/src/llama.cpp/requirements/requirements-convert-hf-to-gguf.txt +0 -2
- package/src/llama.cpp/requirements/requirements-convert-llama-ggml-to-gguf.txt +0 -1
- package/src/llama.cpp/scripts/gen-build-info-cpp.cmake +0 -24
- /package/src/llama.cpp/{ggml-alloc.h → ggml/include/ggml-alloc.h} +0 -0
- /package/src/llama.cpp/{ggml-kompute.h → ggml/include/ggml-kompute.h} +0 -0
- /package/src/llama.cpp/{ggml-rpc.h → ggml/include/ggml-rpc.h} +0 -0
- /package/src/llama.cpp/{ggml-vulkan.h → ggml/include/ggml-vulkan.h} +0 -0
- /package/src/llama.cpp/{unicode-data.h → src/unicode-data.h} +0 -0
package/src/llama.cpp/examples/main/main.cpp

@@ -37,14 +37,15 @@ static gpt_params * g_params;
 static std::vector<llama_token> * g_input_tokens;
 static std::ostringstream * g_output_ss;
 static std::vector<llama_token> * g_output_tokens;
-static bool is_interacting
+static bool is_interacting = false;
+static bool need_insert_eot = false;

-static bool file_exists(const std::string &path) {
+static bool file_exists(const std::string & path) {
     std::ifstream f(path.c_str());
     return f.good();
 }

-static bool file_is_empty(const std::string &path) {
+static bool file_is_empty(const std::string & path) {
     std::ifstream f;
     f.exceptions(std::ifstream::failbit | std::ifstream::badbit);
     f.open(path.c_str(), std::ios::in | std::ios::binary | std::ios::ate);
@@ -99,7 +100,8 @@ static void write_logfile(
 static void sigint_handler(int signo) {
     if (signo == SIGINT) {
         if (!is_interacting && g_params->interactive) {
-            is_interacting
+            is_interacting = true;
+            need_insert_eot = true;
         } else {
             console::cleanup();
             printf("\n");
@@ -117,13 +119,24 @@ static void llama_log_callback_logTee(ggml_log_level level, const char * text, v
     LOG_TEE("%s", text);
 }

+static std::string chat_add_and_format(struct llama_model * model, std::vector<llama_chat_msg> & chat_msgs, std::string role, std::string content) {
+    llama_chat_msg new_msg{role, content};
+    auto formatted = llama_chat_format_single(
+        model, g_params->chat_template, chat_msgs, new_msg, role == "user");
+    chat_msgs.push_back({role, content});
+    LOG("formatted: %s\n", formatted.c_str());
+    return formatted;
+}
+
 int main(int argc, char ** argv) {
     gpt_params params;
     g_params = &params;

     if (!gpt_params_parse(argc, argv, params)) {
+        gpt_params_print_usage(argc, argv, params);
         return 1;
     }
+
     llama_sampling_params & sparams = params.sparams;

 #ifndef LOG_DISABLE_LOGS
@@ -180,9 +193,6 @@ int main(int argc, char ** argv) {
     LOG_TEE("%s: seed = %u\n", __func__, params.seed);

     std::mt19937 rng(params.seed);
-    if (params.random_prompt) {
-        params.prompt = string_random_prompt(rng);
-    }

     LOG("%s: llama backend init\n", __func__);
     llama_backend_init();
@@ -191,6 +201,7 @@ int main(int argc, char ** argv) {
     llama_model * model;
     llama_context * ctx;
     llama_context * ctx_guidance = NULL;
+    std::vector<llama_chat_msg> chat_msgs;
     g_model = &model;
     g_ctx = &ctx;

@@ -216,6 +227,15 @@ int main(int argc, char ** argv) {
                 __func__, n_ctx_train, n_ctx);
     }

+    // print chat template example in conversation mode
+    if (params.conversation) {
+        if (params.enable_chat_template) {
+            LOG_TEE("%s: chat template example: %s\n", __func__, llama_chat_format_example(model, params.chat_template).c_str());
+        } else {
+            LOG_TEE("%s: in-suffix/prefix is specified, chat template will be disabled\n", __func__);
+        }
+    }
+
     // print system information
     {
         LOG_TEE("\n");
@@ -245,29 +265,38 @@ int main(int argc, char ** argv) {
     }

     const bool add_bos = llama_should_add_bos_token(model);
-
+    if (!llama_model_has_encoder(model)) {
+        GGML_ASSERT(llama_add_eos_token(model) != 1);
+    }
     LOG("add_bos: %d\n", add_bos);

     std::vector<llama_token> embd_inp;

-
-
-
-
+    {
+        auto prompt = (params.conversation && params.enable_chat_template && !params.prompt.empty())
+            ? chat_add_and_format(model, chat_msgs, "system", params.prompt) // format the system prompt in conversation mode
+            : params.prompt;
+        if (params.interactive_first || !params.prompt.empty() || session_tokens.empty()) {
+            LOG("tokenize the prompt\n");
+            embd_inp = ::llama_tokenize(ctx, prompt, true, true);
+        } else {
+            LOG("use session tokens\n");
+            embd_inp = session_tokens;
         }
-        embd_inp = ::llama_tokenize(ctx, params.prompt, true, true);
-    } else {
-        LOG("use session tokens\n");
-        embd_inp = session_tokens;
-    }

-
-
+        LOG("prompt: \"%s\"\n", log_tostr(prompt));
+        LOG("tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str());
+    }

     // Should not run without any tokens
     if (embd_inp.empty()) {
-
-
+        if (add_bos) {
+            embd_inp.push_back(llama_token_bos(model));
+            LOG("embd_inp was considered empty and bos was added: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str());
+        } else {
+            LOG_TEE("error: input is empty\n");
+            return -1;
+        }
     }

     // Tokenize negative prompt
@@ -332,37 +361,13 @@ int main(int argc, char ** argv) {
     }

     // number of tokens to keep when resetting context
-    if (params.n_keep < 0 || params.n_keep > (int) embd_inp.size()
+    if (params.n_keep < 0 || params.n_keep > (int) embd_inp.size()) {
         params.n_keep = (int)embd_inp.size();
     } else {
         params.n_keep += add_bos; // always keep the BOS token
     }

-
-    const auto inp_pfx = ::llama_tokenize(ctx, "\n\n### Instruction:\n\n", true, true);
-    const auto inp_sfx = ::llama_tokenize(ctx, "\n\n### Response:\n\n", false, true);
-
-    LOG("inp_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_pfx).c_str());
-    LOG("inp_sfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_sfx).c_str());
-
-    // chatml prefix & suffix
-    const auto cml_pfx = ::llama_tokenize(ctx, "\n<|im_start|>user\n", true, true);
-    const auto cml_sfx = ::llama_tokenize(ctx, "<|im_end|>\n<|im_start|>assistant\n", false, true);
-
-    LOG("cml_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, cml_pfx).c_str());
-    LOG("cml_sfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, cml_sfx).c_str());
-
-    // in instruct mode, we inject a prefix and a suffix to each input by the user
-    if (params.instruct) {
-        params.interactive_first = true;
-        params.antiprompt.emplace_back("### Instruction:\n\n");
-    }
-    // similar for chatml mode
-    else if (params.chatml) {
-        params.interactive_first = true;
-        params.antiprompt.emplace_back("<|im_start|>user\n");
-    }
-    else if (params.conversation) {
+    if (params.conversation) {
         params.interactive_first = true;
     }

@@ -506,6 +511,7 @@ int main(int argc, char ** argv) {
     std::vector<int> input_tokens; g_input_tokens = &input_tokens;
     std::vector<int> output_tokens; g_output_tokens = &output_tokens;
     std::ostringstream output_ss; g_output_ss = &output_ss;
+    std::ostringstream assistant_ss; // for storing current assistant message, used in conversation mode

     // the first thing we will do is to output the prompt, so set color accordingly
     console::set_display(console::prompt);
@@ -528,6 +534,24 @@ int main(int argc, char ** argv) {
         exit(1);
     }

+    if (llama_model_has_encoder(model)) {
+        int enc_input_size = embd_inp.size();
+        llama_token * enc_input_buf = embd_inp.data();
+
+        if (llama_encode(ctx, llama_batch_get_one(enc_input_buf, enc_input_size, 0, 0))) {
+            LOG_TEE("%s : failed to eval\n", __func__);
+            return 1;
+        }
+
+        llama_token decoder_start_token_id = llama_model_decoder_start_token(model);
+        if (decoder_start_token_id == -1) {
+            decoder_start_token_id = llama_token_bos(model);
+        }
+
+        embd_inp.clear();
+        embd_inp.push_back(decoder_start_token_id);
+    }
+
     while ((n_remain != 0 && !is_antiprompt) || params.interactive) {
         // predict
         if (!embd.empty()) {
@@ -821,17 +845,24 @@ int main(int argc, char ** argv) {
                    is_antiprompt = true;
                }

+                if (params.enable_chat_template) {
+                    chat_add_and_format(model, chat_msgs, "assistant", assistant_ss.str());
+                }
                is_interacting = true;
                printf("\n");
-            } else if (params.instruct || params.chatml) {
-                is_interacting = true;
            }
        }

+        // if current token is not EOG, we add it to current assistant message
+        if (params.conversation) {
+            auto id = llama_sampling_last(ctx_sampling);
+            assistant_ss << llama_token_to_piece(ctx, id, false);
+        }
+
        if (n_past > 0 && is_interacting) {
            LOG("waiting for user input\n");

-            if (params.conversation
+            if (params.conversation) {
                printf("\n> ");
            }

@@ -874,49 +905,41 @@ int main(int argc, char ** argv) {

            const size_t original_size = embd_inp.size();

-            // instruct mode: insert instruction prefix
-            if (params.instruct && !is_antiprompt) {
-                LOG("inserting instruction prefix\n");
-                n_consumed = embd_inp.size();
-                embd_inp.insert(embd_inp.end(), inp_pfx.begin(), inp_pfx.end());
-            }
-            // chatml mode: insert user chat prefix
-            if (params.chatml && !is_antiprompt) {
-                LOG("inserting chatml prefix\n");
-                n_consumed = embd_inp.size();
-                embd_inp.insert(embd_inp.end(), cml_pfx.begin(), cml_pfx.end());
-            }
            if (params.escape) {
                string_process_escapes(buffer);
            }

+            bool format_chat = params.conversation && params.enable_chat_template;
+            std::string user_inp = format_chat
+                ? chat_add_and_format(model, chat_msgs, "user", std::move(buffer))
+                : std::move(buffer);
+            // TODO: one inconvenient of current chat template implementation is that we can't distinguish between user input and special tokens (prefix/postfix)
            const auto line_pfx = ::llama_tokenize(ctx, params.input_prefix, false, true);
-            const auto line_inp = ::llama_tokenize(ctx,
+            const auto line_inp = ::llama_tokenize(ctx, user_inp, false, format_chat);
            const auto line_sfx = ::llama_tokenize(ctx, params.input_suffix, false, true);

            LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp).c_str());

+            // if user stop generation mid-way, we must add EOT to finish model's last response
+            if (need_insert_eot && format_chat) {
+                llama_token eot = llama_token_eot(model);
+                embd_inp.push_back(eot == -1 ? llama_token_eos(model) : eot);
+                need_insert_eot = false;
+            }
+
            embd_inp.insert(embd_inp.end(), line_pfx.begin(), line_pfx.end());
            embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end());
            embd_inp.insert(embd_inp.end(), line_sfx.begin(), line_sfx.end());

-            // instruct mode: insert response suffix
-            if (params.instruct) {
-                LOG("inserting instruction suffix\n");
-                embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end());
-            }
-            // chatml mode: insert assistant chat suffix
-            if (params.chatml) {
-                LOG("inserting chatml suffix\n");
-                embd_inp.insert(embd_inp.end(), cml_sfx.begin(), cml_sfx.end());
-            }
-
            for (size_t i = original_size; i < embd_inp.size(); ++i) {
                const llama_token token = embd_inp[i];
                output_tokens.push_back(token);
                output_ss << llama_token_to_piece(ctx, token);
            }

+            // reset assistant message
+            assistant_ss.str("");
+
            n_remain -= line_inp.size();
            LOG("n_remain: %d\n", n_remain);
        } else {
@@ -935,7 +958,7 @@ int main(int argc, char ** argv) {
        }

        // end of generation
-        if (!embd.empty() && llama_token_is_eog(model, embd.back()) && !(params.
+        if (!embd.empty() && llama_token_is_eog(model, embd.back()) && !(params.interactive)) {
            LOG_TEE(" [end of text]\n");
            break;
        }
package/src/llama.cpp/examples/main-cmake-pkg/CMakeLists.txt

@@ -1,12 +1,12 @@
 cmake_minimum_required(VERSION 3.12)
-project("
-set(TARGET
+project("llama-cli-cmake-pkg" C CXX)
+set(TARGET llama-cli-cmake-pkg)

 find_package(Llama 0.0.1 REQUIRED)

 # Bake common functionality in with target. Because applications
 # using the relocatable Llama package should be outside of the
-# source tree,
+# source tree, llama-cli-cmake-pkg pretends the dependencies are built-in.
 set(_common_path "${CMAKE_CURRENT_LIST_DIR}/../../common")
 add_library(common OBJECT)
 file(GLOB _common_files
@@ -15,7 +15,7 @@ file(GLOB _common_files
 )
 target_sources(common PRIVATE ${_common_files})

-# If the common project was part of "
+# If the common project was part of "llama-cli-cmake-pkg" the transient
 # defines would automatically be attached. Because the common func-
 # tionality is separate, but dependent upon the defines, it must be
 # explicitly extracted from the "llama" target.
@@ -30,4 +30,3 @@ target_include_directories(${TARGET} PRIVATE ${_common_path})
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
-
package/src/llama.cpp/examples/passkey/passkey.cpp

@@ -6,46 +6,32 @@
 #include <string>
 #include <vector>

-
-
-
-    if (argc == 1 || argv[1][0] == '-') {
-        printf("usage: %s MODEL_PATH N_JUNK N_GRP I_POS SEED\n" , argv[0]);
-        return 1 ;
-    }
-
-    int seed = -1;
+static void print_usage(int argc, char ** argv, const gpt_params & params) {
+    gpt_params_print_usage(argc, argv, params);

-
-
-
-
-
-    if (argc >= 2) {
-        params.model = argv[1];
-    }
-
-    if (argc >= 3) {
-        n_junk = std::stoi(argv[2]);
-    }
+    LOG_TEE("\nexample usage:\n");
+    LOG_TEE("\n    %s -m model.gguf --junk 250 --pos 90 --keep 32 --grp-attn-n 2 [--seed 1234]\n", argv[0]);
+    LOG_TEE("\n");
+}

-
-
-    }
+int main(int argc, char ** argv) {
+    gpt_params params;

-
-
-
+    params.n_junk = 250;
+    params.n_keep = 32;
+    params.i_pos = -1;

-    if (argc
-
+    if (!gpt_params_parse(argc, argv, params)) {
+        print_usage(argc, argv, params);
+        return 1;
     }

-
-        seed = time(NULL);
-    }
+    srand(params.seed == LLAMA_DEFAULT_SEED ? time(NULL) : params.seed);

-
+    int n_junk = params.n_junk;
+    int n_keep = params.n_keep;
+    int n_grp = params.grp_attn_n;
+    int i_pos = params.i_pos;

     if (i_pos == -1) {
         i_pos = rand() % n_junk;
@@ -76,9 +62,7 @@ int main(int argc, char ** argv) {

     // initialize the model

-    llama_model_params model_params =
-
-    model_params.n_gpu_layers = 99; // offload all layers to the GPU
+    llama_model_params model_params = llama_model_params_from_gpt_params(params);

     llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);

@@ -89,13 +73,9 @@ int main(int argc, char ** argv) {

     // initialize the context

-    llama_context_params ctx_params =
+    llama_context_params ctx_params = llama_context_params_from_gpt_params(params);

-    ctx_params.
-    ctx_params.n_ctx = llama_n_ctx_train(model)*n_grp + n_keep;
-    ctx_params.n_batch = 512;
-    ctx_params.n_threads = params.n_threads;
-    ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
+    ctx_params.n_ctx = llama_n_ctx_train(model)*n_grp + n_keep;

     GGML_ASSERT(ctx_params.n_batch % n_grp == 0 && "n_batch must be divisible by n_grp");

@@ -135,7 +115,7 @@ int main(int argc, char ** argv) {
     LOG_TEE("prompt tokens: %d\n", n_tokens_all);
     //LOG_TEE("prompt: %s\n", params.prompt.c_str());

-    llama_batch batch = llama_batch_init(
+    llama_batch batch = llama_batch_init(params.n_batch, 0, 1);

     int n_past = 0;
package/src/llama.cpp/examples/perplexity/perplexity.cpp

@@ -476,7 +476,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
 }

 // Download: https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
-// Run `./perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw`
+// Run `./llama-perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw`
 // Output: `perplexity: 13.5106 [114/114]`
 // BOS tokens will be added for each chunk before eval

@@ -1032,7 +1032,7 @@ struct winogrande_entry {
     std::vector<llama_token> seq_tokens[2];
 };

-static std::vector<winogrande_entry> load_winogrande_from_csv(const std::string& prompt) {
+static std::vector<winogrande_entry> load_winogrande_from_csv(const std::string & prompt) {
     std::vector<winogrande_entry> result;
     std::istringstream in(prompt);
     std::string line;
@@ -1964,12 +1964,14 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
 int main(int argc, char ** argv) {
     gpt_params params;

+    params.n_ctx = 512;
+    params.logits_all = true;
+
     if (!gpt_params_parse(argc, argv, params)) {
+        gpt_params_print_usage(argc, argv, params);
         return 1;
     }

-    params.logits_all = true;
-
     const int32_t n_ctx = params.n_ctx;

     if (n_ctx <= 0) {
@@ -1989,6 +1991,12 @@ int main(int argc, char ** argv) {
         params.n_batch = std::min(params.n_batch, n_kv);
     } else {
         params.n_batch = std::min(params.n_batch, params.n_ctx);
+        if (params.kl_divergence) {
+            params.n_parallel = 1;
+        } else {
+            // ensure there's at least enough seq_ids for HellaSwag
+            params.n_parallel = std::max(4, params.n_parallel);
+        }
     }

     if (params.ppl_stride > 0) {
@@ -2006,9 +2014,6 @@ int main(int argc, char ** argv) {
     fprintf(stderr, "%s: seed = %u\n", __func__, params.seed);

     std::mt19937 rng(params.seed);
-    if (params.random_prompt) {
-        params.prompt = string_random_prompt(rng);
-    }

     llama_backend_init();
     llama_numa_init(params.numa);
@@ -2016,9 +2021,6 @@ int main(int argc, char ** argv) {
     llama_model * model;
     llama_context * ctx;

-    // ensure there's at least enough seq_ids for HellaSwag
-    params.n_parallel = std::max(4, params.n_parallel);
-
     // load the model and apply lora adapter, if any
     std::tie(model, ctx) = llama_init_from_gpt_params(params);
     if (model == NULL) {
@@ -2027,6 +2029,7 @@ int main(int argc, char ** argv) {
     }

     const int n_ctx_train = llama_n_ctx_train(model);
+
     if (params.n_ctx > n_ctx_train) {
         fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n",
                 __func__, n_ctx_train, params.n_ctx);
package/src/llama.cpp/examples/quantize/quantize.cpp

@@ -16,41 +16,44 @@ struct quant_option {
 };

 static const std::vector<struct quant_option> QUANT_OPTIONS = {
-    { "Q4_0",
-    { "Q4_1",
-    { "Q5_0",
-    { "Q5_1",
-    { "IQ2_XXS",LLAMA_FTYPE_MOSTLY_IQ2_XXS," 2.06 bpw quantization", },
-    { "IQ2_XS",
-    { "IQ2_S",
-    { "IQ2_M",
-    { "IQ1_S",
-    { "IQ1_M",
-    { "Q2_K",
-    { "Q2_K_S",
-    { "IQ3_XXS",LLAMA_FTYPE_MOSTLY_IQ3_XXS," 3.06 bpw quantization", },
-    { "IQ3_S",
-    { "IQ3_M",
-    { "Q3_K",
-    { "IQ3_XS",
-    { "Q3_K_S",
-    { "Q3_K_M",
-    { "Q3_K_L",
-    { "IQ4_NL",
-    { "IQ4_XS",
-    { "Q4_K",
-    { "Q4_K_S",
-    { "Q4_K_M",
-    { "Q5_K",
-    { "Q5_K_S",
-    { "Q5_K_M",
-    { "Q6_K",
-    { "Q8_0",
-    { "
-    { "
-    { "
+    { "Q4_0", LLAMA_FTYPE_MOSTLY_Q4_0, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
+    { "Q4_1", LLAMA_FTYPE_MOSTLY_Q4_1, " 4.78G, +0.4511 ppl @ Llama-3-8B", },
+    { "Q5_0", LLAMA_FTYPE_MOSTLY_Q5_0, " 5.21G, +0.1316 ppl @ Llama-3-8B", },
+    { "Q5_1", LLAMA_FTYPE_MOSTLY_Q5_1, " 5.65G, +0.1062 ppl @ Llama-3-8B", },
+    { "IQ2_XXS", LLAMA_FTYPE_MOSTLY_IQ2_XXS, " 2.06 bpw quantization", },
+    { "IQ2_XS", LLAMA_FTYPE_MOSTLY_IQ2_XS, " 2.31 bpw quantization", },
+    { "IQ2_S", LLAMA_FTYPE_MOSTLY_IQ2_S, " 2.5 bpw quantization", },
+    { "IQ2_M", LLAMA_FTYPE_MOSTLY_IQ2_M, " 2.7 bpw quantization", },
+    { "IQ1_S", LLAMA_FTYPE_MOSTLY_IQ1_S, " 1.56 bpw quantization", },
+    { "IQ1_M", LLAMA_FTYPE_MOSTLY_IQ1_M, " 1.75 bpw quantization", },
+    { "Q2_K", LLAMA_FTYPE_MOSTLY_Q2_K, " 2.96G, +3.5199 ppl @ Llama-3-8B", },
+    { "Q2_K_S", LLAMA_FTYPE_MOSTLY_Q2_K_S, " 2.96G, +3.1836 ppl @ Llama-3-8B", },
+    { "IQ3_XXS", LLAMA_FTYPE_MOSTLY_IQ3_XXS, " 3.06 bpw quantization", },
+    { "IQ3_S", LLAMA_FTYPE_MOSTLY_IQ3_S, " 3.44 bpw quantization", },
+    { "IQ3_M", LLAMA_FTYPE_MOSTLY_IQ3_M, " 3.66 bpw quantization mix", },
+    { "Q3_K", LLAMA_FTYPE_MOSTLY_Q3_K_M, "alias for Q3_K_M" },
+    { "IQ3_XS", LLAMA_FTYPE_MOSTLY_IQ3_XS, " 3.3 bpw quantization", },
+    { "Q3_K_S", LLAMA_FTYPE_MOSTLY_Q3_K_S, " 3.41G, +1.6321 ppl @ Llama-3-8B", },
+    { "Q3_K_M", LLAMA_FTYPE_MOSTLY_Q3_K_M, " 3.74G, +0.6569 ppl @ Llama-3-8B", },
+    { "Q3_K_L", LLAMA_FTYPE_MOSTLY_Q3_K_L, " 4.03G, +0.5562 ppl @ Llama-3-8B", },
+    { "IQ4_NL", LLAMA_FTYPE_MOSTLY_IQ4_NL, " 4.50 bpw non-linear quantization", },
+    { "IQ4_XS", LLAMA_FTYPE_MOSTLY_IQ4_XS, " 4.25 bpw non-linear quantization", },
+    { "Q4_K", LLAMA_FTYPE_MOSTLY_Q4_K_M, "alias for Q4_K_M", },
+    { "Q4_K_S", LLAMA_FTYPE_MOSTLY_Q4_K_S, " 4.37G, +0.2689 ppl @ Llama-3-8B", },
+    { "Q4_K_M", LLAMA_FTYPE_MOSTLY_Q4_K_M, " 4.58G, +0.1754 ppl @ Llama-3-8B", },
+    { "Q5_K", LLAMA_FTYPE_MOSTLY_Q5_K_M, "alias for Q5_K_M", },
+    { "Q5_K_S", LLAMA_FTYPE_MOSTLY_Q5_K_S, " 5.21G, +0.1049 ppl @ Llama-3-8B", },
+    { "Q5_K_M", LLAMA_FTYPE_MOSTLY_Q5_K_M, " 5.33G, +0.0569 ppl @ Llama-3-8B", },
+    { "Q6_K", LLAMA_FTYPE_MOSTLY_Q6_K, " 6.14G, +0.0217 ppl @ Llama-3-8B", },
+    { "Q8_0", LLAMA_FTYPE_MOSTLY_Q8_0, " 7.96G, +0.0026 ppl @ Llama-3-8B", },
+    { "Q4_0_4_4", LLAMA_FTYPE_MOSTLY_Q4_0_4_4, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
+    { "Q4_0_4_8", LLAMA_FTYPE_MOSTLY_Q4_0_4_8, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
+    { "Q4_0_8_8", LLAMA_FTYPE_MOSTLY_Q4_0_8_8, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
+    { "F16", LLAMA_FTYPE_MOSTLY_F16, "14.00G, +0.0020 ppl @ Mistral-7B", },
+    { "BF16", LLAMA_FTYPE_MOSTLY_BF16, "14.00G, -0.0050 ppl @ Mistral-7B", },
+    { "F32", LLAMA_FTYPE_ALL_F32, "26.00G @ 7B", },
     // Note: Ensure COPY comes after F32 to avoid ftype 0 from matching.
-    { "COPY",
+    { "COPY", LLAMA_FTYPE_ALL_F32, "only copy tensors, no quantizing", },
 };

 static const char * const LLM_KV_QUANTIZE_IMATRIX_FILE = "quantize.imatrix.file";