@fugood/llama.node 0.2.2 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +5 -2
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +8 -1
- package/package.json +1 -1
- package/patches/llama.patch +12 -12
- package/src/DetokenizeWorker.cpp +1 -1
- package/src/LlamaContext.cpp +33 -1
- package/src/LlamaContext.h +1 -0
- package/src/LoadSessionWorker.cpp +1 -0
- package/src/llama.cpp/.github/workflows/bench.yml +310 -0
- package/src/llama.cpp/.github/workflows/build.yml +1315 -0
- package/src/llama.cpp/.github/workflows/close-issue.yml +23 -0
- package/src/llama.cpp/.github/workflows/docker.yml +116 -0
- package/src/llama.cpp/.github/workflows/editorconfig.yml +27 -0
- package/src/llama.cpp/.github/workflows/gguf-publish.yml +44 -0
- package/src/llama.cpp/.github/workflows/labeler.yml +17 -0
- package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +65 -0
- package/src/llama.cpp/.github/workflows/nix-ci.yml +72 -0
- package/src/llama.cpp/.github/workflows/nix-flake-update.yml +22 -0
- package/src/llama.cpp/.github/workflows/nix-publish-flake.yml +36 -0
- package/src/llama.cpp/.github/workflows/python-check-requirements.yml +35 -0
- package/src/llama.cpp/.github/workflows/python-lint.yml +23 -0
- package/src/llama.cpp/.github/workflows/python-type-check.yml +38 -0
- package/src/llama.cpp/.github/workflows/server.yml +183 -0
- package/src/llama.cpp/CMakeLists.txt +91 -1245
- package/src/llama.cpp/cmake/arm64-windows-llvm.cmake +1 -1
- package/src/llama.cpp/cmake/build-info.cmake +58 -0
- package/src/llama.cpp/cmake/git-vars.cmake +22 -0
- package/src/llama.cpp/common/CMakeLists.txt +4 -3
- package/src/llama.cpp/common/build-info.cpp.in +4 -0
- package/src/llama.cpp/common/common.cpp +1116 -877
- package/src/llama.cpp/common/common.h +191 -77
- package/src/llama.cpp/common/grammar-parser.cpp +118 -31
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +346 -65
- package/src/llama.cpp/common/log.h +1 -1
- package/src/llama.cpp/common/ngram-cache.h +10 -3
- package/src/llama.cpp/common/sampling.cpp +19 -10
- package/src/llama.cpp/docs/build.md +353 -0
- package/src/llama.cpp/examples/CMakeLists.txt +22 -22
- package/src/llama.cpp/examples/baby-llama/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +6 -6
- package/src/llama.cpp/examples/batched/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/batched/batched.cpp +52 -55
- package/src/llama.cpp/examples/batched-bench/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +20 -72
- package/src/llama.cpp/examples/benchmark/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/chat-13B.bat +57 -0
- package/src/llama.cpp/examples/convert-llama2c-to-ggml/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/{finetune → cvector-generator}/CMakeLists.txt +2 -2
- package/src/llama.cpp/examples/cvector-generator/completions.txt +582 -0
- package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +503 -0
- package/src/llama.cpp/examples/cvector-generator/mean.hpp +48 -0
- package/src/llama.cpp/examples/cvector-generator/negative.txt +4 -0
- package/src/llama.cpp/examples/cvector-generator/pca.hpp +325 -0
- package/src/llama.cpp/examples/cvector-generator/positive.txt +4 -0
- package/src/llama.cpp/examples/deprecation-warning/deprecation-warning.cpp +35 -0
- package/src/llama.cpp/examples/embedding/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/embedding/embedding.cpp +94 -46
- package/src/llama.cpp/examples/eval-callback/CMakeLists.txt +2 -2
- package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +4 -6
- package/src/llama.cpp/examples/export-lora/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +344 -386
- package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +2 -2
- package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +30 -25
- package/src/llama.cpp/examples/gguf/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gguf/gguf.cpp +5 -0
- package/src/llama.cpp/examples/gguf-hash/CMakeLists.txt +15 -0
- package/src/llama.cpp/examples/gguf-hash/deps/rotate-bits/rotate-bits.h +46 -0
- package/src/llama.cpp/examples/gguf-hash/deps/sha1/sha1.c +295 -0
- package/src/llama.cpp/examples/gguf-hash/deps/sha1/sha1.h +52 -0
- package/src/llama.cpp/examples/gguf-hash/deps/sha256/sha256.c +221 -0
- package/src/llama.cpp/examples/gguf-hash/deps/sha256/sha256.h +24 -0
- package/src/llama.cpp/examples/gguf-hash/deps/xxhash/xxhash.c +42 -0
- package/src/llama.cpp/examples/gguf-hash/deps/xxhash/xxhash.h +7093 -0
- package/src/llama.cpp/examples/gguf-hash/gguf-hash.cpp +693 -0
- package/src/llama.cpp/examples/gguf-split/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +3 -3
- package/src/llama.cpp/examples/gritlm/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +6 -2
- package/src/llama.cpp/examples/imatrix/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +137 -176
- package/src/llama.cpp/examples/infill/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/infill/infill.cpp +38 -153
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +175 -94
- package/src/llama.cpp/examples/llama.android/app/build.gradle.kts +65 -0
- package/src/llama.cpp/examples/llama.android/build.gradle.kts +6 -0
- package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +68 -0
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/CMakeLists.txt +11 -7
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +2 -2
- package/src/llama.cpp/examples/llama.android/settings.gradle.kts +18 -0
- package/src/llama.cpp/examples/llava/CMakeLists.txt +6 -5
- package/src/llama.cpp/examples/llava/android/build_64.sh +8 -0
- package/src/llama.cpp/examples/llava/clip.cpp +23 -14
- package/src/llama.cpp/examples/llava/llava-cli.cpp +8 -6
- package/src/llama.cpp/examples/llava/requirements.txt +3 -2
- package/src/llama.cpp/examples/lookahead/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +2 -1
- package/src/llama.cpp/examples/lookup/CMakeLists.txt +4 -4
- package/src/llama.cpp/examples/lookup/lookup-create.cpp +2 -0
- package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
- package/src/llama.cpp/examples/lookup/lookup-stats.cpp +2 -2
- package/src/llama.cpp/examples/lookup/lookup.cpp +1 -1
- package/src/llama.cpp/examples/main/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/main/main.cpp +98 -75
- package/src/llama.cpp/examples/main-cmake-pkg/CMakeLists.txt +4 -5
- package/src/llama.cpp/examples/parallel/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/parallel/parallel.cpp +2 -1
- package/src/llama.cpp/examples/passkey/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/passkey/passkey.cpp +23 -43
- package/src/llama.cpp/examples/perplexity/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +13 -10
- package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/quantize/quantize.cpp +37 -34
- package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +1 -1
- package/src/llama.cpp/examples/retrieval/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/retrieval/retrieval.cpp +26 -77
- package/src/llama.cpp/examples/save-load-state/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +14 -7
- package/src/llama.cpp/examples/server/CMakeLists.txt +26 -2
- package/src/llama.cpp/examples/server/server.cpp +274 -671
- package/src/llama.cpp/examples/server/tests/requirements.txt +2 -2
- package/src/llama.cpp/examples/server/utils.hpp +28 -29
- package/src/llama.cpp/examples/simple/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/simple/simple.cpp +21 -29
- package/src/llama.cpp/examples/speculative/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/speculative/speculative.cpp +2 -1
- package/src/llama.cpp/examples/sycl/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/sycl/build.sh +23 -0
- package/src/llama.cpp/examples/sycl/run-llama2.sh +36 -0
- package/src/llama.cpp/examples/sycl/win-build-sycl.bat +33 -0
- package/src/llama.cpp/examples/sycl/win-run-llama2.bat +9 -0
- package/src/llama.cpp/examples/tokenize/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/tokenize/tokenize.cpp +16 -2
- package/src/llama.cpp/ggml/CMakeLists.txt +253 -0
- package/src/llama.cpp/{cmake → ggml/cmake}/FindSIMD.cmake +6 -6
- package/src/llama.cpp/{ggml-backend.h → ggml/include/ggml-backend.h} +22 -17
- package/src/llama.cpp/ggml/include/ggml-blas.h +23 -0
- package/src/llama.cpp/ggml/include/ggml-cann.h +125 -0
- package/src/llama.cpp/{ggml-cuda.h → ggml/include/ggml-cuda.h} +3 -0
- package/src/llama.cpp/{ggml-metal.h → ggml/include/ggml-metal.h} +1 -2
- package/src/llama.cpp/{ggml-sycl.h → ggml/include/ggml-sycl.h} +3 -10
- package/src/llama.cpp/{ggml.h → ggml/include/ggml.h} +80 -85
- package/src/llama.cpp/ggml/src/CMakeLists.txt +1329 -0
- package/src/llama.cpp/ggml/src/ggml-aarch64.c +2193 -0
- package/src/llama.cpp/ggml/src/ggml-aarch64.h +39 -0
- package/src/llama.cpp/{ggml-alloc.c → ggml/src/ggml-alloc.c} +100 -49
- package/src/llama.cpp/{ggml-backend-impl.h → ggml/src/ggml-backend-impl.h} +20 -8
- package/src/llama.cpp/{ggml-backend.c → ggml/src/ggml-backend.c} +307 -167
- package/src/llama.cpp/ggml/src/ggml-blas.cpp +367 -0
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +198 -0
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +230 -0
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +2944 -0
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +592 -0
- package/src/llama.cpp/ggml/src/ggml-cann/common.h +282 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +32 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +17 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +223 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +186 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +180 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +193 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +191 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +208 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +206 -0
- package/src/llama.cpp/ggml/src/ggml-cann.cpp +2023 -0
- package/src/llama.cpp/{ggml-common.h → ggml/src/ggml-common.h} +41 -7
- package/src/llama.cpp/{ggml-impl.h → ggml/src/ggml-impl.h} +113 -9
- package/src/llama.cpp/{ggml-kompute.cpp → ggml/src/ggml-kompute.cpp} +33 -18
- package/src/llama.cpp/{ggml-quants.c → ggml/src/ggml-quants.c} +1460 -940
- package/src/llama.cpp/{ggml-quants.h → ggml/src/ggml-quants.h} +19 -20
- package/src/llama.cpp/{ggml-rpc.cpp → ggml/src/ggml-rpc.cpp} +95 -72
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +27 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +53 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +355 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +195 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/concat.hpp +21 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +547 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +27 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +698 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +1023 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.hpp +27 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +3011 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +3031 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/mmq.hpp +33 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +1027 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.hpp +27 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +374 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +35 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +66 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +275 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.hpp +22 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +251 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/softmax.hpp +24 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +1140 -0
- package/src/llama.cpp/ggml/src/ggml-sycl.cpp +5314 -0
- package/src/llama.cpp/{ggml-vulkan.cpp → ggml/src/ggml-vulkan.cpp} +1781 -1868
- package/src/llama.cpp/{ggml.c → ggml/src/ggml.c} +1245 -2087
- package/src/llama.cpp/{sgemm.cpp → ggml/src/llamafile/sgemm.cpp} +21 -24
- package/src/llama.cpp/{sgemm.h → ggml/src/llamafile/sgemm.h} +1 -1
- package/src/llama.cpp/ggml/src/vulkan-shaders/CMakeLists.txt +5 -0
- package/src/llama.cpp/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp +552 -0
- package/src/llama.cpp/{llama.h → include/llama.h} +175 -100
- package/src/llama.cpp/models/.editorconfig +1 -0
- package/src/llama.cpp/models/ggml-vocab-aquila.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-baichuan.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-command-r.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-command-r.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-command-r.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-falcon.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-falcon.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-falcon.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-gpt-neox.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-mpt.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-mpt.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-mpt.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-phi-3.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-phi-3.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-phi-3.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-qwen2.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-qwen2.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-qwen2.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-refact.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-refact.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-refact.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-starcoder.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-starcoder.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-starcoder.gguf.out +46 -0
- package/src/llama.cpp/pocs/vdot/CMakeLists.txt +2 -2
- package/src/llama.cpp/requirements/requirements-all.txt +12 -0
- package/src/llama.cpp/requirements/requirements-compare-llama-bench.txt +2 -0
- package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf.txt +3 -0
- package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf_update.txt +3 -0
- package/src/llama.cpp/requirements/{requirements-convert.txt → requirements-convert_legacy_llama.txt} +1 -1
- package/src/llama.cpp/requirements/requirements-convert_llama_ggml_to_gguf.txt +1 -0
- package/src/llama.cpp/requirements/requirements-convert_lora_to_gguf.txt +2 -0
- package/src/llama.cpp/requirements/requirements-pydantic.txt +3 -0
- package/src/llama.cpp/requirements/requirements-test-tokenizer-random.txt +1 -0
- package/src/llama.cpp/requirements.txt +5 -4
- package/src/llama.cpp/scripts/build-info.sh +30 -0
- package/src/llama.cpp/scripts/install-oneapi.bat +19 -0
- package/src/llama.cpp/src/CMakeLists.txt +33 -0
- package/src/llama.cpp/src/llama-grammar.cpp +539 -0
- package/src/llama.cpp/src/llama-grammar.h +39 -0
- package/src/llama.cpp/src/llama-impl.h +26 -0
- package/src/llama.cpp/src/llama-sampling.cpp +635 -0
- package/src/llama.cpp/src/llama-sampling.h +56 -0
- package/src/llama.cpp/src/llama-vocab.cpp +1721 -0
- package/src/llama.cpp/src/llama-vocab.h +130 -0
- package/src/llama.cpp/{llama.cpp → src/llama.cpp} +5979 -5260
- package/src/llama.cpp/{unicode-data.cpp → src/unicode-data.cpp} +851 -802
- package/src/llama.cpp/{unicode.cpp → src/unicode.cpp} +52 -30
- package/src/llama.cpp/{unicode.h → src/unicode.h} +5 -1
- package/src/llama.cpp/tests/CMakeLists.txt +19 -20
- package/src/llama.cpp/tests/test-backend-ops.cpp +245 -67
- package/src/llama.cpp/tests/test-chat-template.cpp +57 -3
- package/src/llama.cpp/tests/test-double-float.cpp +2 -2
- package/src/llama.cpp/tests/test-grad0.cpp +2 -2
- package/src/llama.cpp/tests/test-grammar-integration.cpp +978 -31
- package/src/llama.cpp/tests/test-grammar-parser.cpp +423 -158
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +508 -135
- package/src/llama.cpp/tests/test-llama-grammar.cpp +15 -9
- package/src/llama.cpp/tests/test-quantize-fns.cpp +1 -1
- package/src/llama.cpp/tests/test-quantize-perf.cpp +1 -1
- package/src/llama.cpp/tests/test-rope.cpp +3 -4
- package/src/llama.cpp/tests/test-sampling.cpp +5 -5
- package/src/llama.cpp/tests/test-tokenizer-0.cpp +6 -6
- package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +20 -15
- package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +22 -11
- package/bin/darwin/arm64/default.metallib +0 -0
- package/bin/darwin/x64/default.metallib +0 -0
- package/src/llama.cpp/examples/beam-search/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/beam-search/beam-search.cpp +0 -188
- package/src/llama.cpp/examples/finetune/finetune.cpp +0 -1862
- package/src/llama.cpp/examples/llama.android/llama/CMakeLists.txt +0 -55
- package/src/llama.cpp/examples/train-text-from-scratch/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/train-text-from-scratch/train-text-from-scratch.cpp +0 -1253
- package/src/llama.cpp/ggml-opencl.cpp +0 -2305
- package/src/llama.cpp/ggml-opencl.h +0 -36
- package/src/llama.cpp/ggml-sycl.cpp +0 -17340
- package/src/llama.cpp/ggml-vulkan-shaders.hpp +0 -81211
- package/src/llama.cpp/requirements/requirements-convert-hf-to-gguf-update.txt +0 -2
- package/src/llama.cpp/requirements/requirements-convert-hf-to-gguf.txt +0 -2
- package/src/llama.cpp/requirements/requirements-convert-llama-ggml-to-gguf.txt +0 -1
- package/src/llama.cpp/scripts/gen-build-info-cpp.cmake +0 -24
- package/src/llama.cpp/{ggml-alloc.h → ggml/include/ggml-alloc.h} +0 -0
- package/src/llama.cpp/{ggml-kompute.h → ggml/include/ggml-kompute.h} +0 -0
- package/src/llama.cpp/{ggml-rpc.h → ggml/include/ggml-rpc.h} +0 -0
- package/src/llama.cpp/{ggml-vulkan.h → ggml/include/ggml-vulkan.h} +0 -0
- package/src/llama.cpp/{unicode-data.h → src/unicode-data.h} +0 -0
|
@@ -1,3 +1,7 @@
|
|
|
1
|
+
#if defined(_MSC_VER)
|
|
2
|
+
#define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING
|
|
3
|
+
#endif
|
|
4
|
+
|
|
1
5
|
#include "common.h"
|
|
2
6
|
// Change JSON_ASSERT from assert() to GGML_ASSERT:
|
|
3
7
|
#define JSON_ASSERT GGML_ASSERT
|
|
@@ -6,21 +10,21 @@
|
|
|
6
10
|
#include "llama.h"
|
|
7
11
|
|
|
8
12
|
#include <algorithm>
|
|
9
|
-
#include <
|
|
13
|
+
#include <cinttypes>
|
|
10
14
|
#include <cmath>
|
|
15
|
+
#include <codecvt>
|
|
16
|
+
#include <cstdarg>
|
|
11
17
|
#include <cstring>
|
|
12
18
|
#include <ctime>
|
|
13
19
|
#include <fstream>
|
|
14
|
-
#include <iterator>
|
|
15
20
|
#include <iostream>
|
|
21
|
+
#include <iterator>
|
|
16
22
|
#include <regex>
|
|
17
23
|
#include <sstream>
|
|
18
24
|
#include <string>
|
|
19
25
|
#include <unordered_map>
|
|
20
26
|
#include <unordered_set>
|
|
21
27
|
#include <vector>
|
|
22
|
-
#include <cinttypes>
|
|
23
|
-
#include <codecvt>
|
|
24
28
|
|
|
25
29
|
#if defined(__APPLE__) && defined(__MACH__)
|
|
26
30
|
#include <sys/types.h>
|
|
@@ -190,6 +194,12 @@ int32_t cpu_get_num_math() {
|
|
|
190
194
|
// CLI argument parsing
|
|
191
195
|
//
|
|
192
196
|
|
|
197
|
+
void gpt_params_handle_hf_token(gpt_params & params) {
|
|
198
|
+
if (params.hf_token.empty() && std::getenv("HF_TOKEN")) {
|
|
199
|
+
params.hf_token = std::getenv("HF_TOKEN");
|
|
200
|
+
}
|
|
201
|
+
}
|
|
202
|
+
|
|
193
203
|
void gpt_params_handle_model_default(gpt_params & params) {
|
|
194
204
|
if (!params.hf_repo.empty()) {
|
|
195
205
|
// short-hand to avoid specifying --hf-file -> default it to --model
|
|
@@ -199,19 +209,13 @@ void gpt_params_handle_model_default(gpt_params & params) {
|
|
|
199
209
|
}
|
|
200
210
|
params.hf_file = params.model;
|
|
201
211
|
} else if (params.model.empty()) {
|
|
202
|
-
|
|
203
|
-
const bool success = fs_create_directory_with_parents(cache_directory);
|
|
204
|
-
if (!success) {
|
|
205
|
-
throw std::runtime_error("failed to create cache directory: " + cache_directory);
|
|
206
|
-
}
|
|
207
|
-
params.model = cache_directory + string_split(params.hf_file, '/').back();
|
|
212
|
+
params.model = fs_get_cache_file(string_split(params.hf_file, '/').back());
|
|
208
213
|
}
|
|
209
214
|
} else if (!params.model_url.empty()) {
|
|
210
215
|
if (params.model.empty()) {
|
|
211
216
|
auto f = string_split(params.model_url, '#').front();
|
|
212
217
|
f = string_split(f, '?').front();
|
|
213
|
-
|
|
214
|
-
params.model = "models/" + f;
|
|
218
|
+
params.model = fs_get_cache_file(string_split(f, '/').back());
|
|
215
219
|
}
|
|
216
220
|
} else if (params.model.empty()) {
|
|
217
221
|
params.model = DEFAULT_MODEL_PATH;
|
|
@@ -237,15 +241,14 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
|
|
|
237
241
|
}
|
|
238
242
|
}
|
|
239
243
|
|
|
240
|
-
if (params.prompt_cache_all &&
|
|
241
|
-
(params.interactive || params.interactive_first ||
|
|
242
|
-
params.instruct)) {
|
|
243
|
-
|
|
244
|
+
if (params.prompt_cache_all && (params.interactive || params.interactive_first)) {
|
|
244
245
|
throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n");
|
|
245
246
|
}
|
|
246
247
|
|
|
247
248
|
gpt_params_handle_model_default(params);
|
|
248
249
|
|
|
250
|
+
gpt_params_handle_hf_token(params);
|
|
251
|
+
|
|
249
252
|
if (params.escape) {
|
|
250
253
|
string_process_escapes(params.prompt);
|
|
251
254
|
string_process_escapes(params.input_prefix);
|
|
@@ -265,39 +268,39 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
|
|
|
265
268
|
}
|
|
266
269
|
|
|
267
270
|
bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
|
|
268
|
-
|
|
271
|
+
const auto params_org = params; // the example can modify the default params
|
|
272
|
+
|
|
269
273
|
try {
|
|
270
|
-
if (!gpt_params_parse_ex(argc, argv, params)) {
|
|
271
|
-
|
|
272
|
-
|
|
274
|
+
if (!gpt_params_parse_ex(argc, argv, params) || params.usage) {
|
|
275
|
+
params = params_org;
|
|
276
|
+
params.usage = true;
|
|
277
|
+
return false;
|
|
273
278
|
}
|
|
274
|
-
}
|
|
275
|
-
catch (const std::invalid_argument & ex) {
|
|
279
|
+
} catch (const std::invalid_argument & ex) {
|
|
276
280
|
fprintf(stderr, "%s\n", ex.what());
|
|
277
|
-
|
|
278
|
-
|
|
281
|
+
params = params_org;
|
|
282
|
+
return false;
|
|
279
283
|
}
|
|
280
|
-
|
|
284
|
+
|
|
285
|
+
return true;
|
|
281
286
|
}
|
|
282
287
|
|
|
288
|
+
#define CHECK_ARG if (++i >= argc) { invalid_param = true; return true; }
|
|
289
|
+
|
|
283
290
|
bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param) {
|
|
291
|
+
const char split_delim = ',';
|
|
292
|
+
|
|
284
293
|
llama_sampling_params & sparams = params.sparams;
|
|
285
294
|
|
|
286
295
|
if (arg == "-s" || arg == "--seed") {
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
return true;
|
|
290
|
-
}
|
|
291
|
-
// This is temporary, in the future the samplign state will be moved fully to llama_sampling_context.
|
|
296
|
+
CHECK_ARG
|
|
297
|
+
// TODO: this is temporary, in the future the sampling state will be moved fully to llama_sampling_context.
|
|
292
298
|
params.seed = std::stoul(argv[i]);
|
|
293
299
|
sparams.seed = std::stoul(argv[i]);
|
|
294
300
|
return true;
|
|
295
301
|
}
|
|
296
302
|
if (arg == "-t" || arg == "--threads") {
|
|
297
|
-
|
|
298
|
-
invalid_param = true;
|
|
299
|
-
return true;
|
|
300
|
-
}
|
|
303
|
+
CHECK_ARG
|
|
301
304
|
params.n_threads = std::stoi(argv[i]);
|
|
302
305
|
if (params.n_threads <= 0) {
|
|
303
306
|
params.n_threads = std::thread::hardware_concurrency();
|
|
@@ -305,10 +308,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
|
|
|
305
308
|
return true;
|
|
306
309
|
}
|
|
307
310
|
if (arg == "-tb" || arg == "--threads-batch") {
|
|
308
|
-
|
|
309
|
-
invalid_param = true;
|
|
310
|
-
return true;
|
|
311
|
-
}
|
|
311
|
+
CHECK_ARG
|
|
312
312
|
params.n_threads_batch = std::stoi(argv[i]);
|
|
313
313
|
if (params.n_threads_batch <= 0) {
|
|
314
314
|
params.n_threads_batch = std::thread::hardware_concurrency();
|
|
@@ -316,10 +316,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
|
|
|
316
316
|
return true;
|
|
317
317
|
}
|
|
318
318
|
if (arg == "-td" || arg == "--threads-draft") {
|
|
319
|
-
|
|
320
|
-
invalid_param = true;
|
|
321
|
-
return true;
|
|
322
|
-
}
|
|
319
|
+
CHECK_ARG
|
|
323
320
|
params.n_threads_draft = std::stoi(argv[i]);
|
|
324
321
|
if (params.n_threads_draft <= 0) {
|
|
325
322
|
params.n_threads_draft = std::thread::hardware_concurrency();
|
|
@@ -327,10 +324,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
|
|
|
327
324
|
return true;
|
|
328
325
|
}
|
|
329
326
|
if (arg == "-tbd" || arg == "--threads-batch-draft") {
|
|
330
|
-
|
|
331
|
-
invalid_param = true;
|
|
332
|
-
return true;
|
|
333
|
-
}
|
|
327
|
+
CHECK_ARG
|
|
334
328
|
params.n_threads_batch_draft = std::stoi(argv[i]);
|
|
335
329
|
if (params.n_threads_batch_draft <= 0) {
|
|
336
330
|
params.n_threads_batch_draft = std::thread::hardware_concurrency();
|
|
@@ -338,10 +332,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
|
|
|
338
332
|
return true;
|
|
339
333
|
}
|
|
340
334
|
if (arg == "-p" || arg == "--prompt") {
|
|
341
|
-
|
|
342
|
-
invalid_param = true;
|
|
343
|
-
return true;
|
|
344
|
-
}
|
|
335
|
+
CHECK_ARG
|
|
345
336
|
params.prompt = argv[i];
|
|
346
337
|
return true;
|
|
347
338
|
}
|
|
@@ -349,11 +340,12 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
|
|
|
349
340
|
params.escape = true;
|
|
350
341
|
return true;
|
|
351
342
|
}
|
|
343
|
+
if (arg == "--no-escape") {
|
|
344
|
+
params.escape = false;
|
|
345
|
+
return true;
|
|
346
|
+
}
|
|
352
347
|
if (arg == "--prompt-cache") {
|
|
353
|
-
|
|
354
|
-
invalid_param = true;
|
|
355
|
-
return true;
|
|
356
|
-
}
|
|
348
|
+
CHECK_ARG
|
|
357
349
|
params.path_prompt_cache = argv[i];
|
|
358
350
|
return true;
|
|
359
351
|
}
|
|
@@ -366,10 +358,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
|
|
|
366
358
|
return true;
|
|
367
359
|
}
|
|
368
360
|
if (arg == "-bf" || arg == "--binary-file") {
|
|
369
|
-
|
|
370
|
-
invalid_param = true;
|
|
371
|
-
return true;
|
|
372
|
-
}
|
|
361
|
+
CHECK_ARG
|
|
373
362
|
std::ifstream file(argv[i], std::ios::binary);
|
|
374
363
|
if (!file) {
|
|
375
364
|
fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
|
|
@@ -385,10 +374,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
|
|
|
385
374
|
return true;
|
|
386
375
|
}
|
|
387
376
|
if (arg == "-f" || arg == "--file") {
|
|
388
|
-
|
|
389
|
-
invalid_param = true;
|
|
390
|
-
return true;
|
|
391
|
-
}
|
|
377
|
+
CHECK_ARG
|
|
392
378
|
std::ifstream file(argv[i]);
|
|
393
379
|
if (!file) {
|
|
394
380
|
fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
|
|
@@ -403,67 +389,54 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
|
|
|
403
389
|
}
|
|
404
390
|
return true;
|
|
405
391
|
}
|
|
406
|
-
if (arg == "
|
|
407
|
-
|
|
392
|
+
if (arg == "--in-file") {
|
|
393
|
+
CHECK_ARG
|
|
394
|
+
std::ifstream file(argv[i]);
|
|
395
|
+
if (!file) {
|
|
396
|
+
fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
|
|
408
397
|
invalid_param = true;
|
|
409
398
|
return true;
|
|
410
399
|
}
|
|
400
|
+
params.in_files.push_back(argv[i]);
|
|
401
|
+
return true;
|
|
402
|
+
}
|
|
403
|
+
if (arg == "-n" || arg == "--predict" || arg == "--n-predict") {
|
|
404
|
+
CHECK_ARG
|
|
411
405
|
params.n_predict = std::stoi(argv[i]);
|
|
412
406
|
return true;
|
|
413
407
|
}
|
|
414
408
|
if (arg == "--top-k") {
|
|
415
|
-
|
|
416
|
-
invalid_param = true;
|
|
417
|
-
return true;
|
|
418
|
-
}
|
|
409
|
+
CHECK_ARG
|
|
419
410
|
sparams.top_k = std::stoi(argv[i]);
|
|
420
411
|
return true;
|
|
421
412
|
}
|
|
422
413
|
if (arg == "-c" || arg == "--ctx-size") {
|
|
423
|
-
|
|
424
|
-
invalid_param = true;
|
|
425
|
-
return true;
|
|
426
|
-
}
|
|
414
|
+
CHECK_ARG
|
|
427
415
|
params.n_ctx = std::stoi(argv[i]);
|
|
428
416
|
return true;
|
|
429
417
|
}
|
|
430
418
|
if (arg == "--grp-attn-n" || arg == "-gan") {
|
|
431
|
-
|
|
432
|
-
invalid_param = true;
|
|
433
|
-
return true;
|
|
434
|
-
}
|
|
419
|
+
CHECK_ARG
|
|
435
420
|
params.grp_attn_n = std::stoi(argv[i]);
|
|
436
421
|
return true;
|
|
437
422
|
}
|
|
438
423
|
if (arg == "--grp-attn-w" || arg == "-gaw") {
|
|
439
|
-
|
|
440
|
-
invalid_param = true;
|
|
441
|
-
return true;
|
|
442
|
-
}
|
|
424
|
+
CHECK_ARG
|
|
443
425
|
params.grp_attn_w = std::stoi(argv[i]);
|
|
444
426
|
return true;
|
|
445
427
|
}
|
|
446
428
|
if (arg == "--rope-freq-base") {
|
|
447
|
-
|
|
448
|
-
invalid_param = true;
|
|
449
|
-
return true;
|
|
450
|
-
}
|
|
429
|
+
CHECK_ARG
|
|
451
430
|
params.rope_freq_base = std::stof(argv[i]);
|
|
452
431
|
return true;
|
|
453
432
|
}
|
|
454
433
|
if (arg == "--rope-freq-scale") {
|
|
455
|
-
|
|
456
|
-
invalid_param = true;
|
|
457
|
-
return true;
|
|
458
|
-
}
|
|
434
|
+
CHECK_ARG
|
|
459
435
|
params.rope_freq_scale = std::stof(argv[i]);
|
|
460
436
|
return true;
|
|
461
437
|
}
|
|
462
438
|
if (arg == "--rope-scaling") {
|
|
463
|
-
|
|
464
|
-
invalid_param = true;
|
|
465
|
-
return true;
|
|
466
|
-
}
|
|
439
|
+
CHECK_ARG
|
|
467
440
|
std::string value(argv[i]);
|
|
468
441
|
/**/ if (value == "none") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_NONE; }
|
|
469
442
|
else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_LINEAR; }
|
|
@@ -472,217 +445,148 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
|
|
|
472
445
|
return true;
|
|
473
446
|
}
|
|
474
447
|
if (arg == "--rope-scale") {
|
|
475
|
-
|
|
476
|
-
invalid_param = true;
|
|
477
|
-
return true;
|
|
478
|
-
}
|
|
448
|
+
CHECK_ARG
|
|
479
449
|
params.rope_freq_scale = 1.0f / std::stof(argv[i]);
|
|
480
450
|
return true;
|
|
481
451
|
}
|
|
482
452
|
if (arg == "--yarn-orig-ctx") {
|
|
483
|
-
|
|
484
|
-
invalid_param = true;
|
|
485
|
-
return true;
|
|
486
|
-
}
|
|
453
|
+
CHECK_ARG
|
|
487
454
|
params.yarn_orig_ctx = std::stoi(argv[i]);
|
|
488
455
|
return true;
|
|
489
456
|
}
|
|
490
457
|
if (arg == "--yarn-ext-factor") {
|
|
491
|
-
|
|
492
|
-
invalid_param = true;
|
|
493
|
-
return true;
|
|
494
|
-
}
|
|
458
|
+
CHECK_ARG
|
|
495
459
|
params.yarn_ext_factor = std::stof(argv[i]);
|
|
496
460
|
return true;
|
|
497
461
|
}
|
|
498
462
|
if (arg == "--yarn-attn-factor") {
|
|
499
|
-
|
|
500
|
-
invalid_param = true;
|
|
501
|
-
return true;
|
|
502
|
-
}
|
|
463
|
+
CHECK_ARG
|
|
503
464
|
params.yarn_attn_factor = std::stof(argv[i]);
|
|
504
465
|
return true;
|
|
505
466
|
}
|
|
506
467
|
if (arg == "--yarn-beta-fast") {
|
|
507
|
-
|
|
508
|
-
invalid_param = true;
|
|
509
|
-
return true;
|
|
510
|
-
}
|
|
468
|
+
CHECK_ARG
|
|
511
469
|
params.yarn_beta_fast = std::stof(argv[i]);
|
|
512
470
|
return true;
|
|
513
471
|
}
|
|
514
472
|
if (arg == "--yarn-beta-slow") {
|
|
515
|
-
|
|
516
|
-
invalid_param = true;
|
|
517
|
-
return true;
|
|
518
|
-
}
|
|
473
|
+
CHECK_ARG
|
|
519
474
|
params.yarn_beta_slow = std::stof(argv[i]);
|
|
520
475
|
return true;
|
|
521
476
|
}
|
|
522
477
|
if (arg == "--pooling") {
|
|
523
|
-
|
|
524
|
-
invalid_param = true;
|
|
525
|
-
return true;
|
|
526
|
-
}
|
|
478
|
+
CHECK_ARG
|
|
527
479
|
std::string value(argv[i]);
|
|
528
480
|
/**/ if (value == "none") { params.pooling_type = LLAMA_POOLING_TYPE_NONE; }
|
|
529
481
|
else if (value == "mean") { params.pooling_type = LLAMA_POOLING_TYPE_MEAN; }
|
|
530
482
|
else if (value == "cls") { params.pooling_type = LLAMA_POOLING_TYPE_CLS; }
|
|
483
|
+
else if (value == "last") { params.pooling_type = LLAMA_POOLING_TYPE_LAST; }
|
|
484
|
+
else { invalid_param = true; }
|
|
485
|
+
return true;
|
|
486
|
+
}
|
|
487
|
+
if (arg == "--attention") {
|
|
488
|
+
CHECK_ARG
|
|
489
|
+
std::string value(argv[i]);
|
|
490
|
+
/**/ if (value == "causal") { params.attention_type = LLAMA_ATTENTION_TYPE_CAUSAL; }
|
|
491
|
+
else if (value == "non-causal") { params.attention_type = LLAMA_ATTENTION_TYPE_NON_CAUSAL; }
|
|
531
492
|
else { invalid_param = true; }
|
|
532
493
|
return true;
|
|
533
494
|
}
|
|
534
495
|
if (arg == "--defrag-thold" || arg == "-dt") {
|
|
535
|
-
|
|
536
|
-
invalid_param = true;
|
|
537
|
-
return true;
|
|
538
|
-
}
|
|
496
|
+
CHECK_ARG
|
|
539
497
|
params.defrag_thold = std::stof(argv[i]);
|
|
540
498
|
return true;
|
|
541
499
|
}
|
|
542
500
|
if (arg == "--samplers") {
|
|
543
|
-
|
|
544
|
-
invalid_param = true;
|
|
545
|
-
return true;
|
|
546
|
-
}
|
|
501
|
+
CHECK_ARG
|
|
547
502
|
const auto sampler_names = string_split(argv[i], ';');
|
|
548
503
|
sparams.samplers_sequence = llama_sampling_types_from_names(sampler_names, true);
|
|
549
504
|
return true;
|
|
550
505
|
}
|
|
551
506
|
if (arg == "--sampling-seq") {
|
|
552
|
-
|
|
553
|
-
invalid_param = true;
|
|
554
|
-
return true;
|
|
555
|
-
}
|
|
507
|
+
CHECK_ARG
|
|
556
508
|
sparams.samplers_sequence = llama_sampling_types_from_chars(argv[i]);
|
|
557
509
|
return true;
|
|
558
510
|
}
|
|
559
511
|
if (arg == "--top-p") {
|
|
560
|
-
|
|
561
|
-
invalid_param = true;
|
|
562
|
-
return true;
|
|
563
|
-
}
|
|
512
|
+
CHECK_ARG
|
|
564
513
|
sparams.top_p = std::stof(argv[i]);
|
|
565
514
|
return true;
|
|
566
515
|
}
|
|
567
516
|
if (arg == "--min-p") {
|
|
568
|
-
|
|
569
|
-
invalid_param = true;
|
|
570
|
-
return true;
|
|
571
|
-
}
|
|
517
|
+
CHECK_ARG
|
|
572
518
|
sparams.min_p = std::stof(argv[i]);
|
|
573
519
|
return true;
|
|
574
520
|
}
|
|
575
521
|
if (arg == "--temp") {
|
|
576
|
-
|
|
577
|
-
invalid_param = true;
|
|
578
|
-
return true;
|
|
579
|
-
}
|
|
522
|
+
CHECK_ARG
|
|
580
523
|
sparams.temp = std::stof(argv[i]);
|
|
581
524
|
sparams.temp = std::max(sparams.temp, 0.0f);
|
|
582
525
|
return true;
|
|
583
526
|
}
|
|
584
527
|
if (arg == "--tfs") {
|
|
585
|
-
|
|
586
|
-
invalid_param = true;
|
|
587
|
-
return true;
|
|
588
|
-
}
|
|
528
|
+
CHECK_ARG
|
|
589
529
|
sparams.tfs_z = std::stof(argv[i]);
|
|
590
530
|
return true;
|
|
591
531
|
}
|
|
592
532
|
if (arg == "--typical") {
|
|
593
|
-
|
|
594
|
-
invalid_param = true;
|
|
595
|
-
return true;
|
|
596
|
-
}
|
|
533
|
+
CHECK_ARG
|
|
597
534
|
sparams.typical_p = std::stof(argv[i]);
|
|
598
535
|
return true;
|
|
599
536
|
}
|
|
600
537
|
if (arg == "--repeat-last-n") {
|
|
601
|
-
|
|
602
|
-
invalid_param = true;
|
|
603
|
-
return true;
|
|
604
|
-
}
|
|
538
|
+
CHECK_ARG
|
|
605
539
|
sparams.penalty_last_n = std::stoi(argv[i]);
|
|
606
540
|
sparams.n_prev = std::max(sparams.n_prev, sparams.penalty_last_n);
|
|
607
541
|
return true;
|
|
608
542
|
}
|
|
609
543
|
if (arg == "--repeat-penalty") {
|
|
610
|
-
|
|
611
|
-
invalid_param = true;
|
|
612
|
-
return true;
|
|
613
|
-
}
|
|
544
|
+
CHECK_ARG
|
|
614
545
|
sparams.penalty_repeat = std::stof(argv[i]);
|
|
615
546
|
return true;
|
|
616
547
|
}
|
|
617
548
|
if (arg == "--frequency-penalty") {
|
|
618
|
-
|
|
619
|
-
invalid_param = true;
|
|
620
|
-
return true;
|
|
621
|
-
}
|
|
549
|
+
CHECK_ARG
|
|
622
550
|
sparams.penalty_freq = std::stof(argv[i]);
|
|
623
551
|
return true;
|
|
624
552
|
}
|
|
625
553
|
if (arg == "--presence-penalty") {
|
|
626
|
-
|
|
627
|
-
invalid_param = true;
|
|
628
|
-
return true;
|
|
629
|
-
}
|
|
554
|
+
CHECK_ARG
|
|
630
555
|
sparams.penalty_present = std::stof(argv[i]);
|
|
631
556
|
return true;
|
|
632
557
|
}
|
|
633
558
|
if (arg == "--dynatemp-range") {
|
|
634
|
-
|
|
635
|
-
invalid_param = true;
|
|
636
|
-
return true;
|
|
637
|
-
}
|
|
559
|
+
CHECK_ARG
|
|
638
560
|
sparams.dynatemp_range = std::stof(argv[i]);
|
|
639
561
|
return true;
|
|
640
562
|
}
|
|
641
563
|
if (arg == "--dynatemp-exp") {
|
|
642
|
-
|
|
643
|
-
invalid_param = true;
|
|
644
|
-
return true;
|
|
645
|
-
}
|
|
564
|
+
CHECK_ARG
|
|
646
565
|
sparams.dynatemp_exponent = std::stof(argv[i]);
|
|
647
566
|
return true;
|
|
648
567
|
}
|
|
649
568
|
if (arg == "--mirostat") {
|
|
650
|
-
|
|
651
|
-
invalid_param = true;
|
|
652
|
-
return true;
|
|
653
|
-
}
|
|
569
|
+
CHECK_ARG
|
|
654
570
|
sparams.mirostat = std::stoi(argv[i]);
|
|
655
571
|
return true;
|
|
656
572
|
}
|
|
657
573
|
if (arg == "--mirostat-lr") {
|
|
658
|
-
|
|
659
|
-
invalid_param = true;
|
|
660
|
-
return true;
|
|
661
|
-
}
|
|
574
|
+
CHECK_ARG
|
|
662
575
|
sparams.mirostat_eta = std::stof(argv[i]);
|
|
663
576
|
return true;
|
|
664
577
|
}
|
|
665
578
|
if (arg == "--mirostat-ent") {
|
|
666
|
-
|
|
667
|
-
invalid_param = true;
|
|
668
|
-
return true;
|
|
669
|
-
}
|
|
579
|
+
CHECK_ARG
|
|
670
580
|
sparams.mirostat_tau = std::stof(argv[i]);
|
|
671
581
|
return true;
|
|
672
582
|
}
|
|
673
583
|
if (arg == "--cfg-negative-prompt") {
|
|
674
|
-
|
|
675
|
-
invalid_param = true;
|
|
676
|
-
return true;
|
|
677
|
-
}
|
|
584
|
+
CHECK_ARG
|
|
678
585
|
sparams.cfg_negative_prompt = argv[i];
|
|
679
586
|
return true;
|
|
680
587
|
}
|
|
681
588
|
if (arg == "--cfg-negative-prompt-file") {
|
|
682
|
-
|
|
683
|
-
invalid_param = true;
|
|
684
|
-
return true;
|
|
685
|
-
}
|
|
589
|
+
CHECK_ARG
|
|
686
590
|
std::ifstream file(argv[i]);
|
|
687
591
|
if (!file) {
|
|
688
592
|
fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
|
|
@@ -696,203 +600,126 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
|
|
|
696
600
|
return true;
|
|
697
601
|
}
|
|
698
602
|
if (arg == "--cfg-scale") {
|
|
699
|
-
|
|
700
|
-
invalid_param = true;
|
|
701
|
-
return true;
|
|
702
|
-
}
|
|
603
|
+
CHECK_ARG
|
|
703
604
|
sparams.cfg_scale = std::stof(argv[i]);
|
|
704
605
|
return true;
|
|
705
606
|
}
|
|
706
607
|
if (arg == "-b" || arg == "--batch-size") {
|
|
707
|
-
|
|
708
|
-
invalid_param = true;
|
|
709
|
-
return true;
|
|
710
|
-
}
|
|
608
|
+
CHECK_ARG
|
|
711
609
|
params.n_batch = std::stoi(argv[i]);
|
|
712
610
|
return true;
|
|
713
611
|
}
|
|
714
612
|
if (arg == "-ub" || arg == "--ubatch-size") {
|
|
715
|
-
|
|
716
|
-
invalid_param = true;
|
|
717
|
-
return true;
|
|
718
|
-
}
|
|
613
|
+
CHECK_ARG
|
|
719
614
|
params.n_ubatch = std::stoi(argv[i]);
|
|
720
615
|
return true;
|
|
721
616
|
}
|
|
722
617
|
if (arg == "--keep") {
|
|
723
|
-
|
|
724
|
-
invalid_param = true;
|
|
725
|
-
return true;
|
|
726
|
-
}
|
|
618
|
+
CHECK_ARG
|
|
727
619
|
params.n_keep = std::stoi(argv[i]);
|
|
728
620
|
return true;
|
|
729
621
|
}
|
|
730
622
|
if (arg == "--draft") {
|
|
731
|
-
|
|
732
|
-
invalid_param = true;
|
|
733
|
-
return true;
|
|
734
|
-
}
|
|
623
|
+
CHECK_ARG
|
|
735
624
|
params.n_draft = std::stoi(argv[i]);
|
|
736
625
|
return true;
|
|
737
626
|
}
|
|
738
627
|
if (arg == "--chunks") {
|
|
739
|
-
|
|
740
|
-
invalid_param = true;
|
|
741
|
-
return true;
|
|
742
|
-
}
|
|
628
|
+
CHECK_ARG
|
|
743
629
|
params.n_chunks = std::stoi(argv[i]);
|
|
744
630
|
return true;
|
|
745
631
|
}
|
|
746
632
|
if (arg == "-np" || arg == "--parallel") {
|
|
747
|
-
|
|
748
|
-
invalid_param = true;
|
|
749
|
-
return true;
|
|
750
|
-
}
|
|
633
|
+
CHECK_ARG
|
|
751
634
|
params.n_parallel = std::stoi(argv[i]);
|
|
752
635
|
return true;
|
|
753
636
|
}
|
|
754
637
|
if (arg == "-ns" || arg == "--sequences") {
|
|
755
|
-
|
|
756
|
-
invalid_param = true;
|
|
757
|
-
return true;
|
|
758
|
-
}
|
|
638
|
+
CHECK_ARG
|
|
759
639
|
params.n_sequences = std::stoi(argv[i]);
|
|
760
640
|
return true;
|
|
761
641
|
}
|
|
762
642
|
if (arg == "--p-split" || arg == "-ps") {
|
|
763
|
-
|
|
764
|
-
invalid_param = true;
|
|
765
|
-
return true;
|
|
766
|
-
}
|
|
643
|
+
CHECK_ARG
|
|
767
644
|
params.p_split = std::stof(argv[i]);
|
|
768
645
|
return true;
|
|
769
646
|
}
|
|
770
647
|
if (arg == "-m" || arg == "--model") {
|
|
771
|
-
|
|
772
|
-
invalid_param = true;
|
|
773
|
-
return true;
|
|
774
|
-
}
|
|
648
|
+
CHECK_ARG
|
|
775
649
|
params.model = argv[i];
|
|
776
650
|
return true;
|
|
777
651
|
}
|
|
778
652
|
if (arg == "-md" || arg == "--model-draft") {
|
|
779
|
-
|
|
780
|
-
invalid_param = true;
|
|
781
|
-
return true;
|
|
782
|
-
}
|
|
653
|
+
CHECK_ARG
|
|
783
654
|
params.model_draft = argv[i];
|
|
784
655
|
return true;
|
|
785
656
|
}
|
|
786
657
|
if (arg == "-a" || arg == "--alias") {
|
|
787
|
-
|
|
788
|
-
invalid_param = true;
|
|
789
|
-
return true;
|
|
790
|
-
}
|
|
658
|
+
CHECK_ARG
|
|
791
659
|
params.model_alias = argv[i];
|
|
792
660
|
return true;
|
|
793
661
|
}
|
|
794
662
|
if (arg == "-mu" || arg == "--model-url") {
|
|
795
|
-
|
|
796
|
-
invalid_param = true;
|
|
797
|
-
return true;
|
|
798
|
-
}
|
|
663
|
+
CHECK_ARG
|
|
799
664
|
params.model_url = argv[i];
|
|
800
665
|
return true;
|
|
801
666
|
}
|
|
802
|
-
if (arg == "-
|
|
667
|
+
if (arg == "-hft" || arg == "--hf-token") {
|
|
803
668
|
if (++i >= argc) {
|
|
804
|
-
|
|
805
|
-
|
|
669
|
+
invalid_param = true;
|
|
670
|
+
return true;
|
|
806
671
|
}
|
|
672
|
+
params.hf_token = argv[i];
|
|
673
|
+
return true;
|
|
674
|
+
}
|
|
675
|
+
if (arg == "-hfr" || arg == "--hf-repo") {
|
|
676
|
+
CHECK_ARG
|
|
807
677
|
params.hf_repo = argv[i];
|
|
808
678
|
return true;
|
|
809
679
|
}
|
|
810
680
|
if (arg == "-hff" || arg == "--hf-file") {
|
|
811
|
-
|
|
812
|
-
invalid_param = true;
|
|
813
|
-
return true;
|
|
814
|
-
}
|
|
681
|
+
CHECK_ARG
|
|
815
682
|
params.hf_file = argv[i];
|
|
816
683
|
return true;
|
|
817
684
|
}
|
|
818
685
|
if (arg == "--lora") {
|
|
819
|
-
|
|
820
|
-
invalid_param = true;
|
|
821
|
-
return true;
|
|
822
|
-
}
|
|
686
|
+
CHECK_ARG
|
|
823
687
|
params.lora_adapter.emplace_back(argv[i], 1.0f);
|
|
824
|
-
params.use_mmap = false;
|
|
825
688
|
return true;
|
|
826
689
|
}
|
|
827
690
|
if (arg == "--lora-scaled") {
|
|
828
|
-
|
|
829
|
-
invalid_param = true;
|
|
830
|
-
return true;
|
|
831
|
-
}
|
|
691
|
+
CHECK_ARG
|
|
832
692
|
const char* lora_adapter = argv[i];
|
|
833
|
-
|
|
834
|
-
invalid_param = true;
|
|
835
|
-
return true;
|
|
836
|
-
}
|
|
693
|
+
CHECK_ARG
|
|
837
694
|
params.lora_adapter.emplace_back(lora_adapter, std::stof(argv[i]));
|
|
838
|
-
params.use_mmap = false;
|
|
839
|
-
return true;
|
|
840
|
-
}
|
|
841
|
-
if (arg == "--lora-base") {
|
|
842
|
-
if (++i >= argc) {
|
|
843
|
-
invalid_param = true;
|
|
844
|
-
return true;
|
|
845
|
-
}
|
|
846
|
-
params.lora_base = argv[i];
|
|
847
695
|
return true;
|
|
848
696
|
}
|
|
849
697
|
if (arg == "--control-vector") {
|
|
850
|
-
|
|
851
|
-
invalid_param = true;
|
|
852
|
-
return true;
|
|
853
|
-
}
|
|
698
|
+
CHECK_ARG
|
|
854
699
|
params.control_vectors.push_back({ 1.0f, argv[i], });
|
|
855
700
|
return true;
|
|
856
701
|
}
|
|
857
702
|
if (arg == "--control-vector-scaled") {
|
|
858
|
-
|
|
859
|
-
invalid_param = true;
|
|
860
|
-
return true;
|
|
861
|
-
}
|
|
703
|
+
CHECK_ARG
|
|
862
704
|
const char* fname = argv[i];
|
|
863
|
-
|
|
864
|
-
invalid_param = true;
|
|
865
|
-
return true;
|
|
866
|
-
}
|
|
705
|
+
CHECK_ARG
|
|
867
706
|
params.control_vectors.push_back({ std::stof(argv[i]), fname, });
|
|
868
707
|
return true;
|
|
869
708
|
}
|
|
870
709
|
if (arg == "--control-vector-layer-range") {
|
|
871
|
-
|
|
872
|
-
invalid_param = true;
|
|
873
|
-
return true;
|
|
874
|
-
}
|
|
710
|
+
CHECK_ARG
|
|
875
711
|
params.control_vector_layer_start = std::stoi(argv[i]);
|
|
876
|
-
|
|
877
|
-
invalid_param = true;
|
|
878
|
-
return true;
|
|
879
|
-
}
|
|
712
|
+
CHECK_ARG
|
|
880
713
|
params.control_vector_layer_end = std::stoi(argv[i]);
|
|
881
714
|
return true;
|
|
882
715
|
}
|
|
883
716
|
if (arg == "--mmproj") {
|
|
884
|
-
|
|
885
|
-
invalid_param = true;
|
|
886
|
-
return true;
|
|
887
|
-
}
|
|
717
|
+
CHECK_ARG
|
|
888
718
|
params.mmproj = argv[i];
|
|
889
719
|
return true;
|
|
890
720
|
}
|
|
891
721
|
if (arg == "--image") {
|
|
892
|
-
|
|
893
|
-
invalid_param = true;
|
|
894
|
-
return true;
|
|
895
|
-
}
|
|
722
|
+
CHECK_ARG
|
|
896
723
|
params.image.emplace_back(argv[i]);
|
|
897
724
|
return true;
|
|
898
725
|
}
|
|
@@ -900,32 +727,35 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
|
|
|
900
727
|
params.interactive = true;
|
|
901
728
|
return true;
|
|
902
729
|
}
|
|
903
|
-
if (arg == "
|
|
904
|
-
params.interactive_specials = true;
|
|
905
|
-
return true;
|
|
906
|
-
}
|
|
907
|
-
if (arg == "--special") {
|
|
730
|
+
if (arg == "-sp" || arg == "--special") {
|
|
908
731
|
params.special = true;
|
|
909
732
|
return true;
|
|
910
733
|
}
|
|
911
|
-
if (arg == "--embedding") {
|
|
734
|
+
if (arg == "--embedding" || arg == "--embeddings") {
|
|
912
735
|
params.embedding = true;
|
|
913
736
|
return true;
|
|
914
737
|
}
|
|
915
|
-
if (arg == "--
|
|
916
|
-
|
|
738
|
+
if (arg == "--embd-normalize") {
|
|
739
|
+
CHECK_ARG
|
|
740
|
+
params.embd_normalize = std::stoi(argv[i]);
|
|
917
741
|
return true;
|
|
918
742
|
}
|
|
919
|
-
if (arg == "-
|
|
920
|
-
|
|
743
|
+
if (arg == "--embd-output-format") {
|
|
744
|
+
CHECK_ARG
|
|
745
|
+
params.embd_out = argv[i];
|
|
921
746
|
return true;
|
|
922
747
|
}
|
|
923
|
-
if (arg == "-
|
|
924
|
-
|
|
748
|
+
if (arg == "--embd-separator") {
|
|
749
|
+
CHECK_ARG
|
|
750
|
+
params.embd_sep = argv[i];
|
|
925
751
|
return true;
|
|
926
752
|
}
|
|
927
|
-
if (arg == "-
|
|
928
|
-
params.
|
|
753
|
+
if (arg == "-if" || arg == "--interactive-first") {
|
|
754
|
+
params.interactive_first = true;
|
|
755
|
+
return true;
|
|
756
|
+
}
|
|
757
|
+
if (arg == "-cnv" || arg == "--conversation") {
|
|
758
|
+
params.conversation = true;
|
|
929
759
|
return true;
|
|
930
760
|
}
|
|
931
761
|
if (arg == "--infill") {
|
|
@@ -948,7 +778,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
|
|
|
948
778
|
params.cache_type_v = argv[++i];
|
|
949
779
|
return true;
|
|
950
780
|
}
|
|
951
|
-
if (arg == "--multiline-input") {
|
|
781
|
+
if (arg == "-mli" || arg == "--multiline-input") {
|
|
952
782
|
params.multiline_input = true;
|
|
953
783
|
return true;
|
|
954
784
|
}
|
|
@@ -960,11 +790,15 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
|
|
|
960
790
|
params.cont_batching = true;
|
|
961
791
|
return true;
|
|
962
792
|
}
|
|
793
|
+
if (arg == "-nocb" || arg == "--no-cont-batching") {
|
|
794
|
+
params.cont_batching = false;
|
|
795
|
+
return true;
|
|
796
|
+
}
|
|
963
797
|
if (arg == "-fa" || arg == "--flash-attn") {
|
|
964
798
|
params.flash_attn = true;
|
|
965
799
|
return true;
|
|
966
800
|
}
|
|
967
|
-
if (arg == "--color") {
|
|
801
|
+
if (arg == "-co" || arg == "--color") {
|
|
968
802
|
params.use_color = true;
|
|
969
803
|
return true;
|
|
970
804
|
}
|
|
@@ -972,46 +806,34 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
|
|
|
972
806
|
params.use_mlock = true;
|
|
973
807
|
return true;
|
|
974
808
|
}
|
|
975
|
-
if (arg == "
|
|
976
|
-
|
|
977
|
-
invalid_param = true;
|
|
978
|
-
return true;
|
|
979
|
-
}
|
|
809
|
+
if (arg == "-ngl" || arg == "--gpu-layers" || arg == "--n-gpu-layers") {
|
|
810
|
+
CHECK_ARG
|
|
980
811
|
params.n_gpu_layers = std::stoi(argv[i]);
|
|
981
812
|
if (!llama_supports_gpu_offload()) {
|
|
982
|
-
fprintf(stderr, "warning: not compiled with GPU offload support, --
|
|
813
|
+
fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers option will be ignored\n");
|
|
983
814
|
fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
|
|
984
815
|
}
|
|
985
816
|
return true;
|
|
986
817
|
}
|
|
987
|
-
if (arg == "
|
|
988
|
-
|
|
989
|
-
invalid_param = true;
|
|
990
|
-
return true;
|
|
991
|
-
}
|
|
818
|
+
if (arg == "-ngld" || arg == "--gpu-layers-draft" || arg == "--gpu-layers-draft") {
|
|
819
|
+
CHECK_ARG
|
|
992
820
|
params.n_gpu_layers_draft = std::stoi(argv[i]);
|
|
993
821
|
if (!llama_supports_gpu_offload()) {
|
|
994
|
-
fprintf(stderr, "warning: not compiled with GPU offload support, --
|
|
822
|
+
fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers-draft option will be ignored\n");
|
|
995
823
|
fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
|
|
996
824
|
}
|
|
997
825
|
return true;
|
|
998
826
|
}
|
|
999
827
|
if (arg == "--main-gpu" || arg == "-mg") {
|
|
1000
|
-
|
|
1001
|
-
invalid_param = true;
|
|
1002
|
-
return true;
|
|
1003
|
-
}
|
|
828
|
+
CHECK_ARG
|
|
1004
829
|
params.main_gpu = std::stoi(argv[i]);
|
|
1005
|
-
#ifndef
|
|
1006
|
-
fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL. Setting the main GPU has no effect.\n");
|
|
1007
|
-
#endif //
|
|
830
|
+
#ifndef GGML_USE_CUDA_SYCL_VULKAN
|
|
831
|
+
fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL/Vulkan. Setting the main GPU has no effect.\n");
|
|
832
|
+
#endif // GGML_USE_CUDA_SYCL_VULKAN
|
|
1008
833
|
return true;
|
|
1009
834
|
}
|
|
1010
835
|
if (arg == "--split-mode" || arg == "-sm") {
|
|
1011
|
-
|
|
1012
|
-
invalid_param = true;
|
|
1013
|
-
return true;
|
|
1014
|
-
}
|
|
836
|
+
CHECK_ARG
|
|
1015
837
|
std::string arg_next = argv[i];
|
|
1016
838
|
if (arg_next == "none") {
|
|
1017
839
|
params.split_mode = LLAMA_SPLIT_MODE_NONE;
|
|
@@ -1030,16 +852,13 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
|
|
|
1030
852
|
invalid_param = true;
|
|
1031
853
|
return true;
|
|
1032
854
|
}
|
|
1033
|
-
#ifndef
|
|
1034
|
-
fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL. Setting the split mode has no effect.\n");
|
|
1035
|
-
#endif //
|
|
855
|
+
#ifndef GGML_USE_CUDA_SYCL_VULKAN
|
|
856
|
+
fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL/Vulkan. Setting the split mode has no effect.\n");
|
|
857
|
+
#endif // GGML_USE_CUDA_SYCL_VULKAN
|
|
1036
858
|
return true;
|
|
1037
859
|
}
|
|
1038
860
|
if (arg == "--tensor-split" || arg == "-ts") {
|
|
1039
|
-
|
|
1040
|
-
invalid_param = true;
|
|
1041
|
-
return true;
|
|
1042
|
-
}
|
|
861
|
+
CHECK_ARG
|
|
1043
862
|
std::string arg_next = argv[i];
|
|
1044
863
|
|
|
1045
864
|
// split string by , and /
|
|
@@ -1064,10 +883,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
|
|
|
1064
883
|
return true;
|
|
1065
884
|
}
|
|
1066
885
|
if (arg == "--rpc") {
|
|
1067
|
-
|
|
1068
|
-
invalid_param = true;
|
|
1069
|
-
return true;
|
|
1070
|
-
}
|
|
886
|
+
CHECK_ARG
|
|
1071
887
|
params.rpc_servers = argv[i];
|
|
1072
888
|
return true;
|
|
1073
889
|
}
|
|
@@ -1076,10 +892,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
|
|
|
1076
892
|
return true;
|
|
1077
893
|
}
|
|
1078
894
|
if (arg == "--numa") {
|
|
1079
|
-
|
|
1080
|
-
invalid_param = true;
|
|
1081
|
-
return true;
|
|
1082
|
-
}
|
|
895
|
+
CHECK_ARG
|
|
1083
896
|
std::string value(argv[i]);
|
|
1084
897
|
/**/ if (value == "distribute" || value == "") { params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; }
|
|
1085
898
|
else if (value == "isolate") { params.numa = GGML_NUMA_STRATEGY_ISOLATE; }
|
|
@@ -1087,6 +900,15 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
|
|
|
1087
900
|
else { invalid_param = true; }
|
|
1088
901
|
return true;
|
|
1089
902
|
}
|
|
903
|
+
if (arg == "-v" || arg == "--verbose") {
|
|
904
|
+
params.verbosity = 1;
|
|
905
|
+
return true;
|
|
906
|
+
}
|
|
907
|
+
if (arg == "--verbosity") {
|
|
908
|
+
CHECK_ARG
|
|
909
|
+
params.verbosity = std::stoi(argv[i]);
|
|
910
|
+
return true;
|
|
911
|
+
}
|
|
1090
912
|
if (arg == "--verbose-prompt") {
|
|
1091
913
|
params.verbose_prompt = true;
|
|
1092
914
|
return true;
|
|
@@ -1096,18 +918,12 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
|
|
|
1096
918
|
return true;
|
|
1097
919
|
}
|
|
1098
920
|
if (arg == "-r" || arg == "--reverse-prompt") {
|
|
1099
|
-
|
|
1100
|
-
invalid_param = true;
|
|
1101
|
-
return true;
|
|
1102
|
-
}
|
|
921
|
+
CHECK_ARG
|
|
1103
922
|
params.antiprompt.emplace_back(argv[i]);
|
|
1104
923
|
return true;
|
|
1105
924
|
}
|
|
1106
925
|
if (arg == "-ld" || arg == "--logdir") {
|
|
1107
|
-
|
|
1108
|
-
invalid_param = true;
|
|
1109
|
-
return true;
|
|
1110
|
-
}
|
|
926
|
+
CHECK_ARG
|
|
1111
927
|
params.logdir = argv[i];
|
|
1112
928
|
|
|
1113
929
|
if (params.logdir.back() != DIRECTORY_SEPARATOR) {
|
|
@@ -1116,209 +932,400 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_params & params) {
         return true;
     }
     if (arg == "-lcs" || arg == "--lookup-cache-static") {
-        if (++i >= argc) {
+        CHECK_ARG
+        params.lookup_cache_static = argv[i];
+        return true;
+    }
+    if (arg == "-lcd" || arg == "--lookup-cache-dynamic") {
+        CHECK_ARG
+        params.lookup_cache_dynamic = argv[i];
+        return true;
+    }
+    if (arg == "--save-all-logits" || arg == "--kl-divergence-base") {
+        CHECK_ARG
+        params.logits_file = argv[i];
+        return true;
+    }
+    if (arg == "--perplexity" || arg == "--all-logits") {
+        params.logits_all = true;
+        return true;
+    }
+    if (arg == "--ppl-stride") {
+        CHECK_ARG
+        params.ppl_stride = std::stoi(argv[i]);
+        return true;
+    }
+    if (arg == "--ppl-output-type") {
+        CHECK_ARG
+        params.ppl_output_type = std::stoi(argv[i]);
+        return true;
+    }
+    if (arg == "-ptc" || arg == "--print-token-count") {
+        CHECK_ARG
+        params.n_print = std::stoi(argv[i]);
+        return true;
+    }
+    if (arg == "--check-tensors") {
+        params.check_tensors = true;
+        return true;
+    }
+    if (arg == "--hellaswag") {
+        params.hellaswag = true;
+        return true;
+    }
+    if (arg == "--hellaswag-tasks") {
+        CHECK_ARG
+        params.hellaswag_tasks = std::stoi(argv[i]);
+        return true;
+    }
+    if (arg == "--winogrande") {
+        params.winogrande = true;
+        return true;
+    }
+    if (arg == "--winogrande-tasks") {
+        CHECK_ARG
+        params.winogrande_tasks = std::stoi(argv[i]);
+        return true;
+    }
+    if (arg == "--multiple-choice") {
+        params.multiple_choice = true;
+        return true;
+    }
+    if (arg == "--multiple-choice-tasks") {
+        CHECK_ARG
+        params.multiple_choice_tasks = std::stoi(argv[i]);
+        return true;
+    }
+    if (arg == "--kl-divergence") {
+        params.kl_divergence = true;
+        return true;
+    }
+    if (arg == "--ignore-eos") {
+        params.ignore_eos = true;
+        return true;
+    }
+    if (arg == "--penalize-nl") {
+        sparams.penalize_nl = true;
+        return true;
+    }
+    if (arg == "-l" || arg == "--logit-bias") {
+        CHECK_ARG
+        std::stringstream ss(argv[i]);
+        llama_token key;
+        char sign;
+        std::string value_str;
+        try {
+            if (ss >> key && ss >> sign && std::getline(ss, value_str) && (sign == '+' || sign == '-')) {
+                sparams.logit_bias[key] = std::stof(value_str) * ((sign == '-') ? -1.0f : 1.0f);
+            }
+            else {
+                throw std::exception();
+            }
+        }
+        catch (const std::exception&) {
             invalid_param = true;
             return true;
         }
-        params.lookup_cache_static = argv[i];
         return true;
     }
-    if (arg == "-lcd" || arg == "--lookup-cache-dynamic") {
-        if (++i >= argc) {
+    if (arg == "-h" || arg == "--help" || arg == "--usage" ) {
+        params.usage = true;
+        return true;
+    }
+    if (arg == "--version") {
+        fprintf(stderr, "version: %d (%s)\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT);
+        fprintf(stderr, "built with %s for %s\n", LLAMA_COMPILER, LLAMA_BUILD_TARGET);
+        exit(0);
+    }
+    if (arg == "--in-prefix-bos") {
+        params.input_prefix_bos = true;
+        params.enable_chat_template = false;
+        return true;
+    }
+    if (arg == "--in-prefix") {
+        CHECK_ARG
+        params.input_prefix = argv[i];
+        params.enable_chat_template = false;
+        return true;
+    }
+    if (arg == "--in-suffix") {
+        CHECK_ARG
+        params.input_suffix = argv[i];
+        params.enable_chat_template = false;
+        return true;
+    }
+    if (arg == "--spm-infill") {
+        params.spm_infill = true;
+        return true;
+    }
+    if (arg == "--grammar") {
+        CHECK_ARG
+        sparams.grammar = argv[i];
+        return true;
+    }
+    if (arg == "--grammar-file") {
+        CHECK_ARG
+        std::ifstream file(argv[i]);
+        if (!file) {
+            fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
             invalid_param = true;
             return true;
         }
-        params.lookup_cache_dynamic = argv[i];
+        std::copy(
+            std::istreambuf_iterator<char>(file),
+            std::istreambuf_iterator<char>(),
+            std::back_inserter(sparams.grammar)
+        );
         return true;
     }
-    if (arg == "--save-all-logits" || arg == "--kl-divergence-base") {
-        if (++i >= argc) {
+    if (arg == "-j" || arg == "--json-schema") {
+        CHECK_ARG
+        sparams.grammar = json_schema_to_grammar(json::parse(argv[i]));
+        return true;
+    }
+    if (arg == "--override-kv") {
+        CHECK_ARG
+        if (!string_parse_kv_override(argv[i], params.kv_overrides)) {
+            fprintf(stderr, "error: Invalid type for KV override: %s\n", argv[i]);
             invalid_param = true;
             return true;
         }
-        params.logits_file = argv[i];
         return true;
     }
-    if (arg == "--perplexity" || arg == "--all-logits") {
-        params.logits_all = true;
+    if (arg == "--host") {
+        CHECK_ARG
+        params.hostname = argv[i];
         return true;
     }
-    if (arg == "--ppl-stride") {
-        if (++i >= argc) {
+    if (arg == "--port") {
+        CHECK_ARG
+        params.port = std::stoi(argv[i]);
+        return true;
+    }
+    if (arg == "--path") {
+        CHECK_ARG
+        params.public_path = argv[i];
+        return true;
+    }
+    if (arg == "--api-key") {
+        CHECK_ARG
+        params.api_keys.push_back(argv[i]);
+        return true;
+    }
+    if (arg == "--api-key-file") {
+        CHECK_ARG
+        std::ifstream key_file(argv[i]);
+        if (!key_file) {
+            fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
             invalid_param = true;
             return true;
         }
-        params.ppl_stride = std::stoi(argv[i]);
+        std::string key;
+        while (std::getline(key_file, key)) {
+            if (!key.empty()) {
+                params.api_keys.push_back(key);
+            }
+        }
+        key_file.close();
+        return true;
+    }
+    if (arg == "--ssl-key-file") {
+        CHECK_ARG
+        params.ssl_file_key = argv[i];
+        return true;
+    }
+    if (arg == "--ssl-cert-file") {
+        CHECK_ARG
+        params.ssl_file_cert = argv[i];
+        return true;
+    }
+    if (arg == "--timeout" || arg == "-to") {
+        CHECK_ARG
+        params.timeout_read = std::stoi(argv[i]);
+        params.timeout_write = std::stoi(argv[i]);
+        return true;
+    }
+    if (arg == "--threads-http") {
+        CHECK_ARG
+        params.n_threads_http = std::stoi(argv[i]);
+        return true;
+    }
+    if (arg == "-spf" || arg == "--system-prompt-file") {
+        CHECK_ARG
+        std::ifstream file(argv[i]);
+        if (!file) {
+            fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
+            invalid_param = true;
+            return true;
+        }
+        std::string system_prompt;
+        std::copy(
+            std::istreambuf_iterator<char>(file),
+            std::istreambuf_iterator<char>(),
+            std::back_inserter(system_prompt)
+        );
+        params.system_prompt = system_prompt;
+        return true;
+    }
+    if (arg == "--log-format") {
+        CHECK_ARG
+        if (std::strcmp(argv[i], "json") == 0) {
+            params.log_json = true;
+        } else if (std::strcmp(argv[i], "text") == 0) {
+            params.log_json = false;
+        } else {
+            invalid_param = true;
+            return true;
+        }
+        return true;
+    }
+    if (arg == "--no-slots") {
+        params.endpoint_slots = false;
+        return true;
+    }
+    if (arg == "--metrics") {
+        params.endpoint_metrics = true;
+        return true;
+    }
+    if (arg == "--slot-save-path") {
+        CHECK_ARG
+        params.slot_save_path = argv[i];
+        // if doesn't end with DIRECTORY_SEPARATOR, add it
+        if (!params.slot_save_path.empty() && params.slot_save_path[params.slot_save_path.size() - 1] != DIRECTORY_SEPARATOR) {
+            params.slot_save_path += DIRECTORY_SEPARATOR;
+        }
         return true;
     }
-    if (arg == "-ptc" || arg == "--print-token-count") {
-        if (++i >= argc) {
+    if (arg == "--chat-template") {
+        CHECK_ARG
+        if (!llama_chat_verify_template(argv[i])) {
+            fprintf(stderr, "error: the supplied chat template is not supported: %s\n", argv[i]);
+            fprintf(stderr, "note: llama.cpp does not use jinja parser, we only support commonly used templates\n");
             invalid_param = true;
             return true;
         }
-        params.n_print = std::stoi(argv[i]);
+        params.chat_template = argv[i];
         return true;
     }
-    if (arg == "--check-tensors") {
-        params.check_tensors = true;
+    if (arg == "--slot-prompt-similarity" || arg == "-sps") {
+        CHECK_ARG
+        params.slot_prompt_similarity = std::stof(argv[i]);
         return true;
     }
-    if (arg == "--ppl-output-type") {
-        if (++i >= argc) {
-            invalid_param = true;
-            return true;
-        }
-        params.ppl_output_type = std::stoi(argv[i]);
+    if (arg == "-pps") {
+        params.is_pp_shared = true;
        return true;
     }
-    if (arg == "--hellaswag") {
-        params.hellaswag = true;
+    if (arg == "-npp") {
+        CHECK_ARG
+        auto p = string_split<int>(argv[i], split_delim);
+        params.n_pp.insert(params.n_pp.end(), p.begin(), p.end());
         return true;
     }
-    if (arg == "--hellaswag-tasks") {
-        if (++i >= argc) {
-            invalid_param = true;
-            return true;
-        }
-        params.hellaswag_tasks = std::stoi(argv[i]);
+    if (arg == "-ntg") {
+        CHECK_ARG
+        auto p = string_split<int>(argv[i], split_delim);
+        params.n_tg.insert(params.n_tg.end(), p.begin(), p.end());
         return true;
     }
-    if (arg == "--winogrande") {
-        params.winogrande = true;
+    if (arg == "-npl") {
+        CHECK_ARG
+        auto p = string_split<int>(argv[i], split_delim);
+        params.n_pl.insert(params.n_pl.end(), p.begin(), p.end());
         return true;
     }
-    if (arg == "--winogrande-tasks") {
-        if (++i >= argc) {
+    if (arg == "--context-file") {
+        CHECK_ARG
+        std::ifstream file(argv[i], std::ios::binary);
+        if (!file) {
+            fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
             invalid_param = true;
             return true;
         }
-        params.winogrande_tasks = std::stoi(argv[i]);
+        params.context_files.push_back(argv[i]);
         return true;
     }
-    if (arg == "--multiple-choice") {
-        params.multiple_choice = true;
+    if (arg == "--chunk-size") {
+        CHECK_ARG
+        params.chunk_size = std::stoi(argv[i]);
         return true;
     }
-    if (arg == "--multiple-choice-tasks") {
-        if (++i >= argc) {
-            invalid_param = true;
-            return true;
-        }
-        params.multiple_choice_tasks = std::stoi(argv[i]);
+    if (arg == "--chunk-separator") {
+        CHECK_ARG
+        params.chunk_separator = argv[i];
         return true;
     }
-    if (arg == "--kl-divergence") {
-        params.kl_divergence = true;
+    if (arg == "--junk") {
+        CHECK_ARG
+        params.n_junk = std::stoi(argv[i]);
         return true;
     }
-    if (arg == "--ignore-eos") {
-        params.ignore_eos = true;
+    if (arg == "--pos") {
+        CHECK_ARG
+        params.i_pos = std::stoi(argv[i]);
         return true;
     }
-    if (arg == "--penalize-nl") {
-        sparams.penalize_nl = true;
+    if (arg == "-o" || arg == "--output" || arg == "--output-file") {
+        CHECK_ARG
+        params.out_file = argv[i];
+        params.cvector_outfile = argv[i];
+        params.lora_outfile = argv[i];
         return true;
     }
-    if (arg == "-l" || arg == "--logit-bias") {
-        if (++i >= argc) {
-            invalid_param = true;
-            return true;
-        }
-        std::stringstream ss(argv[i]);
-        llama_token key;
-        char sign;
-        std::string value_str;
-        try {
-            if (ss >> key && ss >> sign && std::getline(ss, value_str) && (sign == '+' || sign == '-')) {
-                sparams.logit_bias[key] = std::stof(value_str) * ((sign == '-') ? -1.0f : 1.0f);
-            }
-            else {
-                throw std::exception();
-            }
-        }
-        catch (const std::exception&) {
-            invalid_param = true;
-            return true;
-        }
+    if (arg == "-ofreq" || arg == "--output-frequency") {
+        CHECK_ARG
+        params.n_out_freq = std::stoi(argv[i]);
         return true;
     }
-    if (arg == "-h" || arg == "--help") {
-
-
+    if (arg == "--save-frequency") {
+        CHECK_ARG
+        params.n_save_freq = std::stoi(argv[i]);
+        return true;
     }
-    if (arg == "--version") {
-        fprintf(stderr, "version: %d (%s)\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT);
-        fprintf(stderr, "built with %s for %s\n", LLAMA_COMPILER, LLAMA_BUILD_TARGET);
-        exit(0);
+    if (arg == "--process-output") {
+        params.process_output = true;
+        return true;
     }
-    if (arg == "--random-prompt") {
-        params.random_prompt = true;
+    if (arg == "--no-ppl") {
+        params.compute_ppl = false;
         return true;
     }
-    if (arg == "--in-prefix-bos") {
-        params.input_prefix_bos = true;
+    if (arg == "--chunk" || arg == "--from-chunk") {
+        CHECK_ARG
+        params.i_chunk = std::stoi(argv[i]);
         return true;
     }
-    if (arg == "--in-prefix") {
-        if (++i >= argc) {
-            invalid_param = true;
-            return true;
-        }
-        params.input_prefix = argv[i];
+    // cvector params
+    if (arg == "--positive-file") {
+        CHECK_ARG
+        params.cvector_positive_file = argv[i];
        return true;
    }
-    if (arg == "--in-suffix") {
-        if (++i >= argc) {
-            invalid_param = true;
-            return true;
-        }
-        params.input_suffix = argv[i];
+    if (arg == "--negative-file") {
+        CHECK_ARG
+        params.cvector_negative_file = argv[i];
        return true;
    }
-    if (arg == "--grammar") {
-        if (++i >= argc) {
-            invalid_param = true;
-            return true;
-        }
-        sparams.grammar = argv[i];
+    if (arg == "--pca-batch") {
+        CHECK_ARG
+        params.n_pca_batch = std::stoi(argv[i]);
        return true;
    }
-    if (arg == "--grammar-file") {
-        if (++i >= argc) {
-            invalid_param = true;
-            return true;
-        }
-        std::ifstream file(argv[i]);
-        if (!file) {
-            fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
-            invalid_param = true;
-            return true;
-        }
-        std::copy(
-            std::istreambuf_iterator<char>(file),
-            std::istreambuf_iterator<char>(),
-            std::back_inserter(sparams.grammar)
-        );
+    if (arg == "--pca-iter") {
+        CHECK_ARG
+        params.n_pca_iterations = std::stoi(argv[i]);
        return true;
    }
-    if (arg == "-j" || arg == "--json-schema") {
-        if (++i >= argc) {
-            invalid_param = true;
-            return true;
-        }
-        sparams.grammar = json_schema_to_grammar(json::parse(argv[i]));
+    if (arg == "--method") {
+        CHECK_ARG
+        std::string value(argv[i]);
+        /**/ if (value == "pca") { params.cvector_dimre_method = DIMRE_METHOD_PCA; }
+        else if (value == "mean") { params.cvector_dimre_method = DIMRE_METHOD_MEAN; }
+        else { invalid_param = true; }
        return true;
    }
-    if (arg == "--override-kv") {
-        if (++i >= argc) {
-            invalid_param = true;
-            return true;
-        }
-        if (!string_parse_kv_override(argv[i], params.kv_overrides)) {
-            fprintf(stderr, "error: Invalid type for KV override: %s\n", argv[i]);
-            invalid_param = true;
-            return true;
-        }
+    if (arg == "--no-warmup") {
+        params.warmup = false;
        return true;
    }
    #ifndef LOG_DISABLE_LOGS
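The `--logit-bias` handler moved in the hunk above but its parsing is unchanged: a token id, a mandatory `+` or `-` sign, then the bias magnitude, all extracted from one argument string. A minimal standalone repro of just that extraction (standard library only; `llama_token` is an integer id in llama.h):

```cpp
#include <iostream>
#include <sstream>
#include <string>

int main() {
    // same extraction logic as the handler above, for "--logit-bias 15043+1.5"
    std::stringstream ss("15043+1.5");
    int key; char sign; std::string value_str;
    if (ss >> key && ss >> sign && std::getline(ss, value_str) && (sign == '+' || sign == '-')) {
        float bias = std::stof(value_str) * ((sign == '-') ? -1.0f : 1.0f);
        std::cout << key << " -> " << bias << "\n"; // prints: 15043 -> 1.5
    }
    return 0;
}
```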
@@ -1332,10 +1339,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_params & params) {
     // We have a matching known parameter requiring an argument,
     // now we need to check if there is anything after this argv
     // and flag invalid_param or parse it.
-    if (++i >= argc) {
-        invalid_param = true;
-        return true;
-    }
+    CHECK_ARG
     if (!log_param_pair_parse( /*check_but_dont_parse*/ false, argv[i - 1], argv[i])) {
         invalid_param = true;
         return true;
@@ -1348,6 +1352,16 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_params & params) {
     return false;
 }
 
+#ifdef __GNUC__
+#ifdef __MINGW32__
+#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
+#else
+#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
+#endif
+#else
+#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...)
+#endif
+
 void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     const llama_sampling_params & sparams = params.sparams;
 
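`LLAMA_COMMON_ATTRIBUTE_FORMAT` lets GCC and Clang type-check the printf-style arguments of the variadic `option_info` constructor introduced in the next hunk; MinGW needs `gnu_printf` because its default `printf` checking follows MSVCRT conventions. The indices `(4, 5)` used there count the implicit `this` as parameter 1. A reduced illustration with hypothetical names:

```cpp
// sketch only: MY_FORMAT and log_line are illustrative, not from the diff
#ifdef __GNUC__
#define MY_FORMAT(fmt_idx, first_arg_idx) __attribute__((format(printf, fmt_idx, first_arg_idx)))
#else
#define MY_FORMAT(fmt_idx, first_arg_idx)
#endif

MY_FORMAT(1, 2)
void log_line(const char * fmt, ...);

// log_line("%d", "oops");   // with the attribute, this mismatch is a compile-time warning
```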
@@ -1359,198 +1373,340 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     }
     sampler_type_names.pop_back();
 
- [114 removed lines: the old printf-based usage listing; its text is not preserved in this diff view]
+    struct option_info {
+        LLAMA_COMMON_ATTRIBUTE_FORMAT(4, 5)
+        option_info(const std::string & tags, const char * args, const char * desc, ...) : tags(tags), args(args), desc(desc) {
+            va_list args_list;
+            va_start(args_list, desc);
+            char buffer[1024];
+            vsnprintf(buffer, sizeof(buffer), desc, args_list);
+            va_end(args_list);
+            this->desc = buffer;
+        }
+
+        option_info(const std::string & grp) : grp(grp) {}
+
+        std::string tags;
+        std::string args;
+        std::string desc;
+        std::string grp;
+    };
+
+    std::vector<option_info> options;
+
+    // TODO: filter by tags
+
+    options.push_back({ "general" });
+    options.push_back({ "*", "-h, --help, --usage", "print usage and exit" });
+    options.push_back({ "*", " --version", "show version and build info" });
+    options.push_back({ "*", "-v, --verbose", "print verbose information" });
+    options.push_back({ "*", " --verbosity N", "set specific verbosity level (default: %d)", params.verbosity });
+    options.push_back({ "*", " --verbose-prompt", "print a verbose prompt before generation (default: %s)", params.verbose_prompt ? "true" : "false" });
+    options.push_back({ "*", " --no-display-prompt", "don't print prompt at generation (default: %s)", !params.display_prompt ? "true" : "false" });
+    options.push_back({ "*", "-co, --color", "colorise output to distinguish prompt and user input from generations (default: %s)", params.use_color ? "true" : "false" });
+    options.push_back({ "*", "-s, --seed SEED", "RNG seed (default: %d, use random seed for < 0)", params.seed });
+    options.push_back({ "*", "-t, --threads N", "number of threads to use during generation (default: %d)", params.n_threads });
+    options.push_back({ "*", "-tb, --threads-batch N", "number of threads to use during batch and prompt processing (default: same as --threads)" });
+    options.push_back({ "speculative", "-td, --threads-draft N", "number of threads to use during generation (default: same as --threads)" });
+    options.push_back({ "speculative", "-tbd, --threads-batch-draft N",
+                        "number of threads to use during batch and prompt processing (default: same as --threads-draft)" });
+    options.push_back({ "speculative", " --draft N", "number of tokens to draft for speculative decoding (default: %d)", params.n_draft });
+    options.push_back({ "speculative", "-ps, --p-split N", "speculative decoding split probability (default: %.1f)", (double)params.p_split });
+    options.push_back({ "*", "-lcs, --lookup-cache-static FNAME",
+                        "path to static lookup cache to use for lookup decoding (not updated by generation)" });
+    options.push_back({ "*", "-lcd, --lookup-cache-dynamic FNAME",
+                        "path to dynamic lookup cache to use for lookup decoding (updated by generation)" });
+
+    options.push_back({ "*", "-c, --ctx-size N", "size of the prompt context (default: %d, 0 = loaded from model)", params.n_ctx });
+    options.push_back({ "*", "-n, --predict N", "number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)", params.n_predict });
+    options.push_back({ "*", "-b, --batch-size N", "logical maximum batch size (default: %d)", params.n_batch });
+    options.push_back({ "*", "-ub, --ubatch-size N", "physical maximum batch size (default: %d)", params.n_ubatch });
+    options.push_back({ "*", " --keep N", "number of tokens to keep from the initial prompt (default: %d, -1 = all)", params.n_keep });
+    options.push_back({ "*", " --chunks N", "max number of chunks to process (default: %d, -1 = all)", params.n_chunks });
+    options.push_back({ "*", "-fa, --flash-attn", "enable Flash Attention (default: %s)", params.flash_attn ? "enabled" : "disabled" });
+    options.push_back({ "*", "-p, --prompt PROMPT", "prompt to start generation with\n"
+                        "in conversation mode, this will be used as system prompt\n"
+                        "(default: '%s')", params.prompt.c_str() });
+    options.push_back({ "*", "-f, --file FNAME", "a file containing the prompt (default: none)" });
+    options.push_back({ "*", " --in-file FNAME", "an input file (repeat to specify multiple files)" });
+    options.push_back({ "*", "-bf, --binary-file FNAME", "binary file containing the prompt (default: none)" });
+    options.push_back({ "*", "-e, --escape", "process escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\) (default: %s)", params.escape ? "true" : "false" });
+    options.push_back({ "*", " --no-escape", "do not process escape sequences" });
+    options.push_back({ "main", "-ptc, --print-token-count N", "print token count every N tokens (default: %d)", params.n_print });
+    options.push_back({ "main", " --prompt-cache FNAME", "file to cache prompt state for faster startup (default: none)" });
+    options.push_back({ "main", " --prompt-cache-all", "if specified, saves user input and generations to cache as well\n"
+                        "not supported with --interactive or other interactive options" });
+    options.push_back({ "main", " --prompt-cache-ro", "if specified, uses the prompt cache but does not update it" });
+    options.push_back({ "main", "-r, --reverse-prompt PROMPT",
+                        "halt generation at PROMPT, return control in interactive mode\n"
+                        "can be specified more than once for multiple prompts" });
+    options.push_back({ "main", "-sp, --special", "special tokens output enabled (default: %s)", params.special ? "true" : "false" });
+    options.push_back({ "main", "-cnv, --conversation", "run in conversation mode, does not print special tokens and suffix/prefix\n"
+                        "if suffix/prefix are not specified, default chat template will be used\n"
+                        "(default: %s)", params.conversation ? "true" : "false" });
+    options.push_back({ "main infill", "-i, --interactive", "run in interactive mode (default: %s)", params.interactive ? "true" : "false" });
+    options.push_back({ "main infill", "-if, --interactive-first", "run in interactive mode and wait for input right away (default: %s)", params.interactive_first ? "true" : "false" });
+    options.push_back({ "main infill", "-mli, --multiline-input", "allows you to write or paste multiple lines without ending each in '\\'" });
+    options.push_back({ "main infill", " --in-prefix-bos", "prefix BOS to user inputs, preceding the `--in-prefix` string" });
+    options.push_back({ "main infill", " --in-prefix STRING", "string to prefix user inputs with (default: empty)" });
+    options.push_back({ "main infill", " --in-suffix STRING", "string to suffix after user inputs with (default: empty)" });
+    options.push_back({ "main", " --no-warmup", "skip warming up the model with an empty run" });
+    options.push_back({ "server infill",
+                        " --spm-infill", "use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: %s)", params.spm_infill ? "enabled" : "disabled" });
+
+    options.push_back({ "sampling" });
+    options.push_back({ "*", " --samplers SAMPLERS", "samplers that will be used for generation in the order, separated by \';\'\n"
+                        "(default: %s)", sampler_type_names.c_str() });
+    options.push_back({ "*", " --sampling-seq SEQUENCE",
+                        "simplified sequence for samplers that will be used (default: %s)", sampler_type_chars.c_str() });
+    options.push_back({ "*", " --ignore-eos", "ignore end of stream token and continue generating (implies --logit-bias EOS-inf)" });
+    options.push_back({ "*", " --penalize-nl", "penalize newline tokens (default: %s)", sparams.penalize_nl ? "true" : "false" });
+    options.push_back({ "*", " --temp N", "temperature (default: %.1f)", (double)sparams.temp });
+    options.push_back({ "*", " --top-k N", "top-k sampling (default: %d, 0 = disabled)", sparams.top_k });
+    options.push_back({ "*", " --top-p N", "top-p sampling (default: %.1f, 1.0 = disabled)", (double)sparams.top_p });
+    options.push_back({ "*", " --min-p N", "min-p sampling (default: %.1f, 0.0 = disabled)", (double)sparams.min_p });
+    options.push_back({ "*", " --tfs N", "tail free sampling, parameter z (default: %.1f, 1.0 = disabled)", (double)sparams.tfs_z });
+    options.push_back({ "*", " --typical N", "locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)", (double)sparams.typical_p });
+    options.push_back({ "*", " --repeat-last-n N", "last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)", sparams.penalty_last_n });
+    options.push_back({ "*", " --repeat-penalty N", "penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)", (double)sparams.penalty_repeat });
+    options.push_back({ "*", " --presence-penalty N", "repeat alpha presence penalty (default: %.1f, 0.0 = disabled)", (double)sparams.penalty_present });
+    options.push_back({ "*", " --frequency-penalty N", "repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)", (double)sparams.penalty_freq });
+    options.push_back({ "*", " --dynatemp-range N", "dynamic temperature range (default: %.1f, 0.0 = disabled)", (double)sparams.dynatemp_range });
+    options.push_back({ "*", " --dynatemp-exp N", "dynamic temperature exponent (default: %.1f)", (double)sparams.dynatemp_exponent });
+    options.push_back({ "*", " --mirostat N", "use Mirostat sampling.\n"
+                        "Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n"
+                        "(default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)", sparams.mirostat });
+    options.push_back({ "*", " --mirostat-lr N", "Mirostat learning rate, parameter eta (default: %.1f)", (double)sparams.mirostat_eta });
+    options.push_back({ "*", " --mirostat-ent N", "Mirostat target entropy, parameter tau (default: %.1f)", (double)sparams.mirostat_tau });
+    options.push_back({ "*", " -l TOKEN_ID(+/-)BIAS", "modifies the likelihood of token appearing in the completion,\n"
+                        "i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',\n"
+                        "or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'" });
+    options.push_back({ "main", " --cfg-negative-prompt PROMPT",
+                        "negative prompt to use for guidance (default: '%s')", sparams.cfg_negative_prompt.c_str() });
+    options.push_back({ "main", " --cfg-negative-prompt-file FNAME",
+                        "negative prompt file to use for guidance" });
+    options.push_back({ "main", " --cfg-scale N", "strength of guidance (default: %.1f, 1.0 = disable)", (double)sparams.cfg_scale });
+    options.push_back({ "main", " --chat-template JINJA_TEMPLATE",
+                        "set custom jinja chat template (default: template taken from model's metadata)\n"
+                        "if suffix/prefix are specified, template will be disabled\n"
+                        "only commonly used templates are accepted:\n"
+                        "https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template" });
+    options.push_back({ "grammar" });
+    options.push_back({ "*", " --grammar GRAMMAR", "BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '%s')", sparams.grammar.c_str() });
+    options.push_back({ "*", " --grammar-file FNAME", "file to read grammar from" });
+    options.push_back({ "*", "-j, --json-schema SCHEMA",
+                        "JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object\n"
+                        "For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead" });
+
+    options.push_back({ "embedding" });
+    options.push_back({ "embedding", " --pooling {none,mean,cls,last}",
+                        "pooling type for embeddings, use model default if unspecified" });
+    options.push_back({ "embedding", " --attention {causal,non-causal}",
+                        "attention type for embeddings, use model default if unspecified" });
+
+    options.push_back({ "context hacking" });
+    options.push_back({ "*", " --rope-scaling {none,linear,yarn}",
+                        "RoPE frequency scaling method, defaults to linear unless specified by the model" });
+    options.push_back({ "*", " --rope-scale N", "RoPE context scaling factor, expands context by a factor of N" });
+    options.push_back({ "*", " --rope-freq-base N", "RoPE base frequency, used by NTK-aware scaling (default: loaded from model)" });
+    options.push_back({ "*", " --rope-freq-scale N", "RoPE frequency scaling factor, expands context by a factor of 1/N" });
+    options.push_back({ "*", " --yarn-orig-ctx N", "YaRN: original context size of model (default: %d = model training context size)", params.yarn_orig_ctx });
+    options.push_back({ "*", " --yarn-ext-factor N", "YaRN: extrapolation mix factor (default: %.1f, 0.0 = full interpolation)", (double)params.yarn_ext_factor });
+    options.push_back({ "*", " --yarn-attn-factor N", "YaRN: scale sqrt(t) or attention magnitude (default: %.1f)", (double)params.yarn_attn_factor });
+    options.push_back({ "*", " --yarn-beta-slow N", "YaRN: high correction dim or alpha (default: %.1f)", (double)params.yarn_beta_slow });
+    options.push_back({ "*", " --yarn-beta-fast N", "YaRN: low correction dim or beta (default: %.1f)", (double)params.yarn_beta_fast });
+    options.push_back({ "*", "-gan, --grp-attn-n N", "group-attention factor (default: %d)", params.grp_attn_n });
+    options.push_back({ "*", "-gaw, --grp-attn-w N", "group-attention width (default: %.1f)", (double)params.grp_attn_w });
+    options.push_back({ "*", "-dkvc, --dump-kv-cache", "verbose print of the KV cache" });
+    options.push_back({ "*", "-nkvo, --no-kv-offload", "disable KV offload" });
+    options.push_back({ "*", "-ctk, --cache-type-k TYPE", "KV cache data type for K (default: %s)", params.cache_type_k.c_str() });
+    options.push_back({ "*", "-ctv, --cache-type-v TYPE", "KV cache data type for V (default: %s)", params.cache_type_v.c_str() });
+
+    options.push_back({ "perplexity" });
+    options.push_back({ "perplexity", " --all-logits", "return logits for all tokens in the batch (default: %s)", params.logits_all ? "true" : "false" });
+    options.push_back({ "perplexity", " --hellaswag", "compute HellaSwag score over random tasks from datafile supplied with -f" });
+    options.push_back({ "perplexity", " --hellaswag-tasks N", "number of tasks to use when computing the HellaSwag score (default: %zu)", params.hellaswag_tasks });
+    options.push_back({ "perplexity", " --winogrande", "compute Winogrande score over random tasks from datafile supplied with -f" });
+    options.push_back({ "perplexity", " --winogrande-tasks N", "number of tasks to use when computing the Winogrande score (default: %zu)", params.winogrande_tasks });
+    options.push_back({ "perplexity", " --multiple-choice", "compute multiple choice score over random tasks from datafile supplied with -f" });
+    options.push_back({ "perplexity", " --multiple-choice-tasks N",
+                        "number of tasks to use when computing the multiple choice score (default: %zu)", params.multiple_choice_tasks });
+    options.push_back({ "perplexity", " --kl-divergence", "computes KL-divergence to logits provided via --kl-divergence-base" });
+    options.push_back({ "perplexity", " --ppl-stride N", "stride for perplexity calculation (default: %d)", params.ppl_stride });
+    options.push_back({ "perplexity", " --ppl-output-type {0,1}",
+                        "output type for perplexity calculation (default: %d)", params.ppl_output_type });
+
+    options.push_back({ "parallel" });
+    options.push_back({ "*", "-dt, --defrag-thold N", "KV cache defragmentation threshold (default: %.1f, < 0 - disabled)", (double)params.defrag_thold });
+    options.push_back({ "*", "-np, --parallel N", "number of parallel sequences to decode (default: %d)", params.n_parallel });
+    options.push_back({ "*", "-ns, --sequences N", "number of sequences to decode (default: %d)", params.n_sequences });
+    options.push_back({ "*", "-cb, --cont-batching", "enable continuous batching (a.k.a dynamic batching) (default: %s)", params.cont_batching ? "enabled" : "disabled" });
+    options.push_back({ "*", "-nocb, --no-cont-batching", "disable continuous batching" });
+
+    options.push_back({ "multi-modality" });
+    options.push_back({ "*", " --mmproj FILE", "path to a multimodal projector file for LLaVA. see examples/llava/README.md" });
+    options.push_back({ "*", " --image FILE", "path to an image file. use with multimodal models. Specify multiple times for batching" });
+
+    options.push_back({ "backend" });
+    options.push_back({ "*", " --rpc SERVERS", "comma separated list of RPC servers" });
+
     if (llama_supports_mlock()) {
-
+        options.push_back({ "*", " --mlock", "force system to keep model in RAM rather than swapping or compressing" });
     }
     if (llama_supports_mmap()) {
-
-    }
- [6 removed lines of the old printf-based usage text; content not preserved in this diff view]
+        options.push_back({ "*", " --no-mmap", "do not memory-map model (slower load but may reduce pageouts if not using mlock)" });
+    }
+    options.push_back({ "*", " --numa TYPE", "attempt optimizations that help on some NUMA systems\n"
+                        " - distribute: spread execution evenly over all nodes\n"
+                        " - isolate: only spawn threads on CPUs on the node that execution started on\n"
+                        " - numactl: use the CPU map provided by numactl\n"
+                        "if run without this previously, it is recommended to drop the system page cache before using this\n"
+                        "see https://github.com/ggerganov/llama.cpp/issues/1437" });
+
     if (llama_supports_gpu_offload()) {
- [13 removed lines of the old printf-based usage text; content not preserved in this diff view]
-    }
- [48 removed lines of the old printf-based usage text; content not preserved in this diff view]
+        options.push_back({ "*", "-ngl, --gpu-layers N",
+                            "number of layers to store in VRAM" });
+        options.push_back({ "*", "-ngld, --gpu-layers-draft N",
+                            "number of layers to store in VRAM for the draft model" });
+        options.push_back({ "*", "-sm, --split-mode SPLIT_MODE",
+                            "how to split the model across multiple GPUs, one of:\n"
+                            " - none: use one GPU only\n"
+                            " - layer (default): split layers and KV across GPUs\n"
+                            " - row: split rows across GPUs" });
+        options.push_back({ "*", "-ts, --tensor-split SPLIT",
+                            "fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1" });
+        options.push_back({ "*", "-mg, --main-gpu i", "the GPU to use for the model (with split-mode = none),\n"
+                            "or for intermediate results and KV (with split-mode = row) (default: %d)", params.main_gpu });
+    }
+
+    options.push_back({ "model" });
+    options.push_back({ "*", " --check-tensors", "check model tensor data for invalid values (default: %s)", params.check_tensors ? "true" : "false" });
+    options.push_back({ "*", " --override-kv KEY=TYPE:VALUE",
+                        "advanced option to override model metadata by key. may be specified multiple times.\n"
+                        "types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false" });
+    options.push_back({ "*", " --lora FNAME", "apply LoRA adapter (can be repeated to use multiple adapters)" });
+    options.push_back({ "*", " --lora-scaled FNAME S", "apply LoRA adapter with user defined scaling S (can be repeated to use multiple adapters)" });
+    options.push_back({ "*", " --control-vector FNAME", "add a control vector\n"
+                        "note: this argument can be repeated to add multiple control vectors" });
+    options.push_back({ "*", " --control-vector-scaled FNAME SCALE",
+                        "add a control vector with user defined scaling SCALE\n"
+                        "note: this argument can be repeated to add multiple scaled control vectors" });
+    options.push_back({ "*", " --control-vector-layer-range START END",
+                        "layer range to apply the control vector(s) to, start and end inclusive" });
+    options.push_back({ "*", "-m, --model FNAME", "model path (default: models/$filename with filename from --hf-file\n"
+                        "or --model-url if set, otherwise %s)", DEFAULT_MODEL_PATH });
+    options.push_back({ "*", "-md, --model-draft FNAME", "draft model for speculative decoding (default: unused)" });
+    options.push_back({ "*", "-mu, --model-url MODEL_URL", "model download url (default: unused)" });
+    options.push_back({ "*", "-hfr, --hf-repo REPO", "Hugging Face model repository (default: unused)" });
+    options.push_back({ "*", "-hff, --hf-file FILE", "Hugging Face model file (default: unused)" });
+    options.push_back({ "*", "-hft, --hf-token TOKEN", "Hugging Face access token (default: value from HF_TOKEN environment variable)" });
+
+    options.push_back({ "retrieval" });
+    options.push_back({ "retrieval", " --context-file FNAME", "file to load context from (repeat to specify multiple files)" });
+    options.push_back({ "retrieval", " --chunk-size N", "minimum length of embedded text chunks (default: %d)", params.chunk_size });
+    options.push_back({ "retrieval", " --chunk-separator STRING",
+                        "separator between chunks (default: '%s')", params.chunk_separator.c_str() });
+
+    options.push_back({ "passkey" });
+    options.push_back({ "passkey", " --junk N", "number of times to repeat the junk text (default: %d)", params.n_junk });
+    options.push_back({ "passkey", " --pos N", "position of the passkey in the junk text (default: %d)", params.i_pos });
+
+    options.push_back({ "imatrix" });
+    options.push_back({ "imatrix", "-o, --output FNAME", "output file (default: '%s')", params.out_file.c_str() });
+    options.push_back({ "imatrix", " --output-frequency N", "output the imatrix every N iterations (default: %d)", params.n_out_freq });
+    options.push_back({ "imatrix", " --save-frequency N", "save an imatrix copy every N iterations (default: %d)", params.n_save_freq });
+    options.push_back({ "imatrix", " --process-output", "collect data for the output tensor (default: %s)", params.process_output ? "true" : "false" });
+    options.push_back({ "imatrix", " --no-ppl", "do not compute perplexity (default: %s)", params.compute_ppl ? "true" : "false" });
+    options.push_back({ "imatrix", " --chunk N", "start processing the input from chunk N (default: %d)", params.i_chunk });
+
+    options.push_back({ "bench" });
+    options.push_back({ "bench", "-pps", "is the prompt shared across parallel sequences (default: %s)", params.is_pp_shared ? "true" : "false" });
+    options.push_back({ "bench", "-npp n0,n1,...", "number of prompt tokens" });
+    options.push_back({ "bench", "-ntg n0,n1,...", "number of text generation tokens" });
+    options.push_back({ "bench", "-npl n0,n1,...", "number of parallel prompts" });
+
+    options.push_back({ "embedding" });
+    options.push_back({ "embedding", " --embd-normalize", "normalisation for embendings (default: %d) (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)", params.embd_normalize });
+    options.push_back({ "embedding", " --embd-output-format", "empty = default, \"array\" = [[],[]...], \"json\" = openai style, \"json+\" = same \"json\" + cosine similarity matrix" });
+    options.push_back({ "embedding", " --embd-separator", "separator of embendings (default \\n) for example \"<#sep#>\"" });
+
+    options.push_back({ "server" });
+    options.push_back({ "server", " --host HOST", "ip address to listen (default: %s)", params.hostname.c_str() });
+    options.push_back({ "server", " --port PORT", "port to listen (default: %d)", params.port });
+    options.push_back({ "server", " --path PATH", "path to serve static files from (default: %s)", params.public_path.c_str() });
+    options.push_back({ "server", " --embedding(s)", "enable embedding endpoint (default: %s)", params.embedding ? "enabled" : "disabled" });
+    options.push_back({ "server", " --api-key KEY", "API key to use for authentication (default: none)" });
+    options.push_back({ "server", " --api-key-file FNAME", "path to file containing API keys (default: none)" });
+    options.push_back({ "server", " --ssl-key-file FNAME", "path to file a PEM-encoded SSL private key" });
+    options.push_back({ "server", " --ssl-cert-file FNAME", "path to file a PEM-encoded SSL certificate" });
+    options.push_back({ "server", " --timeout N", "server read/write timeout in seconds (default: %d)", params.timeout_read });
+    options.push_back({ "server", " --threads-http N", "number of threads used to process HTTP requests (default: %d)", params.n_threads_http });
+    options.push_back({ "server", " --system-prompt-file FNAME",
+                        "set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications" });
+    options.push_back({ "server", " --log-format {text,json}",
+                        "log output format: json or text (default: json)" });
+    options.push_back({ "server", " --metrics", "enable prometheus compatible metrics endpoint (default: %s)", params.endpoint_metrics ? "enabled" : "disabled" });
+    options.push_back({ "server", " --no-slots", "disables slots monitoring endpoint (default: %s)", params.endpoint_slots ? "enabled" : "disabled" });
+    options.push_back({ "server", " --slot-save-path PATH", "path to save slot kv cache (default: disabled)" });
+    options.push_back({ "server", " --chat-template JINJA_TEMPLATE",
+                        "set custom jinja chat template (default: template taken from model's metadata)\n"
+                        "only commonly used templates are accepted:\n"
+                        "https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template" });
+    options.push_back({ "server", "-sps, --slot-prompt-similarity SIMILARITY",
+                        "how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity });
+
 #ifndef LOG_DISABLE_LOGS
-
+    options.push_back({ "logging" });
+    options.push_back({ "*", " --simple-io", "use basic IO for better compatibility in subprocesses and limited consoles" });
+    options.push_back({ "*", "-ld, --logdir LOGDIR", "path under which to save YAML logs (no logging if unset)" });
+    options.push_back({ "logging", " --log-test", "Run simple logging test" });
+    options.push_back({ "logging", " --log-disable", "Disable trace logs" });
+    options.push_back({ "logging", " --log-enable", "Enable trace logs" });
+    options.push_back({ "logging", " --log-file FNAME", "Specify a log filename (without extension)" });
+    options.push_back({ "logging", " --log-new", "Create a separate new log file on start. "
+                        "Each log file will have unique name: \"<name>.<ID>.log\"" });
+    options.push_back({ "logging", " --log-append", "Don't truncate the old log file." });
 #endif // LOG_DISABLE_LOGS
+
+    options.push_back({ "cvector" });
+    options.push_back({ "cvector", "-o, --output FNAME", "output file (default: '%s')", params.cvector_outfile.c_str() });
+    options.push_back({ "cvector", " --positive-file FNAME", "positive prompts file, one prompt per line (default: '%s')", params.cvector_positive_file.c_str() });
+    options.push_back({ "cvector", " --negative-file FNAME", "negative prompts file, one prompt per line (default: '%s')", params.cvector_negative_file.c_str() });
+    options.push_back({ "cvector", " --pca-batch N", "batch size used for PCA. Larger batch runs faster, but uses more memory (default: %d)", params.n_pca_batch });
+    options.push_back({ "cvector", " --pca-iter N", "number of iterations used for PCA (default: %d)", params.n_pca_iterations });
+    options.push_back({ "cvector", " --method {pca,mean}", "dimensionality reduction method to be used (default: pca)" });
+
+    options.push_back({ "export-lora" });
+    options.push_back({ "export-lora", "-m, --model", "model path from which to load base model (default '%s')", params.model.c_str() });
+    options.push_back({ "export-lora", " --lora FNAME", "path to LoRA adapter (can be repeated to use multiple adapters)" });
+    options.push_back({ "export-lora", " --lora-scaled FNAME S", "path to LoRA adapter with user defined scaling S (can be repeated to use multiple adapters)" });
+    options.push_back({ "*", "-t, --threads N", "number of threads to use during computation (default: %d)", params.n_threads });
+    options.push_back({ "export-lora", "-o, --output FNAME", "output file (default: '%s')", params.lora_outfile.c_str() });
+
+    printf("usage: %s [options]\n", argv[0]);
+
+    for (const auto & o : options) {
+        if (!o.grp.empty()) {
+            printf("\n%s:\n\n", o.grp.c_str());
+            continue;
+        }
+        printf("  %-32s", o.args.c_str());
+        if (o.args.length() > 30) {
+            printf("\n%34s", "");
+        }
+
+        const auto desc = o.desc;
+        size_t start = 0;
+        size_t end = desc.find('\n');
+        while (end != std::string::npos) {
+            printf("%s\n%34s", desc.substr(start, end - start).c_str(), "");
+            start = end + 1;
+            end = desc.find('\n', start);
+        }
+
+        printf("%s\n", desc.substr(start).c_str());
+    }
+    printf("\n");
 }
 
 std::string gpt_params_get_system_info(const gpt_params & params) {
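The replacement usage printer is data-driven: a flat `options` vector, group markers printed as headings, a 32-column flag field, and embedded `\n` in descriptions wrapped onto a 34-space hanging indent. A self-contained sketch of just the layout loop, simplified from the code above:

```cpp
#include <cstdio>
#include <string>

// mirrors the column logic of the printer above
static void print_option(const std::string & args, const std::string & desc) {
    printf("  %-32s", args.c_str());
    if (args.length() > 30) {
        printf("\n%34s", "");   // flag column overflowed: start the description on its own line
    }
    size_t start = 0;
    size_t end = desc.find('\n');
    while (end != std::string::npos) {
        printf("%s\n%34s", desc.substr(start, end - start).c_str(), "");
        start = end + 1;
        end = desc.find('\n', start);
    }
    printf("%s\n", desc.substr(start).c_str());
}

int main() {
    print_option("-s, --seed SEED", "RNG seed (default: -1, use random seed for < 0)");
    print_option("--rope-scaling {none,linear,yarn}",
                 "RoPE frequency scaling method\nsecond line lands under the first");
    return 0;
}
```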
@@ -1610,24 +1766,6 @@ std::string string_get_sortable_timestamp() {
     return std::string(timestamp_no_ns) + "." + std::string(timestamp_ns);
 }
 
-std::string string_random_prompt(std::mt19937 & rng) {
-    const int r = rng() % 10;
-    switch (r) {
-        case 0: return "So";
-        case 1: return "Once upon a time";
-        case 2: return "When";
-        case 3: return "The";
-        case 4: return "After";
-        case 5: return "If";
-        case 6: return "import";
-        case 7: return "He";
-        case 8: return "She";
-        case 9: return "They";
-    }
-
-    GGML_UNREACHABLE();
-}
-
 void string_process_escapes(std::string & input) {
     std::size_t input_len = input.length();
     std::size_t output_idx = 0;
@@ -1887,6 +2025,16 @@ std::string fs_get_cache_directory() {
     return ensure_trailing_slash(cache_directory);
 }
 
+std::string fs_get_cache_file(const std::string & filename) {
+    GGML_ASSERT(filename.find(DIRECTORY_SEPARATOR) == std::string::npos);
+    std::string cache_directory = fs_get_cache_directory();
+    const bool success = fs_create_directory_with_parents(cache_directory);
+    if (!success) {
+        throw std::runtime_error("failed to create cache directory: " + cache_directory);
+    }
+    return cache_directory + filename;
+}
+
 
 //
 // Model utils
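A hedged usage sketch for the new helper: it only accepts a bare filename (the `GGML_ASSERT` rejects anything containing a directory separator), creates the cache directory on demand, and throws if that fails. The filename and path in the comment are illustrative, not from the diff:

```cpp
// hypothetical caller; the result lands under whatever fs_get_cache_directory()
// returns (e.g. ~/.cache/llama.cpp/ on a typical Linux setup)
try {
    std::string etag_path = fs_get_cache_file("model.gguf.etag");
    // ... read/write the cached etag file ...
} catch (const std::runtime_error & e) {
    fprintf(stderr, "cache unavailable: %s\n", e.what());
}
```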
@@ -1898,9 +2046,9 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(gpt_params & params) {
     llama_model * model = nullptr;
 
     if (!params.hf_repo.empty() && !params.hf_file.empty()) {
-        model = llama_load_model_from_hf(params.hf_repo.c_str(), params.hf_file.c_str(), params.model.c_str(), mparams);
+        model = llama_load_model_from_hf(params.hf_repo.c_str(), params.hf_file.c_str(), params.model.c_str(), params.hf_token.c_str(), mparams);
     } else if (!params.model_url.empty()) {
-        model = llama_load_model_from_url(params.model_url.c_str(), params.model.c_str(), mparams);
+        model = llama_load_model_from_url(params.model_url.c_str(), params.model.c_str(), params.hf_token.c_str(), mparams);
     } else {
         model = llama_load_model_from_file(params.model.c_str(), mparams);
     }
@@ -1946,19 +2094,14 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(gpt_params & params) {
     for (unsigned int i = 0; i < params.lora_adapter.size(); ++i) {
         const std::string & lora_adapter = std::get<0>(params.lora_adapter[i]);
         float lora_scale = std::get<1>(params.lora_adapter[i]);
-        int err = llama_model_apply_lora_from_file(model,
-                                                   lora_adapter.c_str(),
-                                                   lora_scale,
-                                                   ((i > 0) || params.lora_base.empty())
-                                                       ? NULL
-                                                       : params.lora_base.c_str(),
-                                                   params.n_threads);
-        if (err != 0) {
+        auto adapter = llama_lora_adapter_init(model, lora_adapter.c_str());
+        if (adapter == nullptr) {
             fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
             llama_free(lctx);
             llama_free_model(model);
             return std::make_tuple(nullptr, nullptr);
         }
+        llama_lora_adapter_set(lctx, adapter, lora_scale);
     }
 
     if (params.ignore_eos) {
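The LoRA flow changes shape here: instead of baking an adapter into the weights at load time (the old call also took an optional `--lora-base` model and a thread count), an adapter is now loaded once against the model and attached to a context with a scale. A sketch of the new sequence, with a hypothetical adapter path and error handling elided:

```cpp
// "adapter.gguf" is a placeholder path, not from the diff
llama_lora_adapter * adapter = llama_lora_adapter_init(model, "adapter.gguf");
if (adapter != nullptr) {
    llama_lora_adapter_set(lctx, adapter, 0.8f); // attach to this context, scaled by 0.8
}
```

One consequence of this design: the same loaded adapter can be attached to several contexts, or attached with different scales, without reloading the file.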
@@ -1968,7 +2111,24 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(gpt_params & params) {
     if (params.warmup) {
         LOG("warming up the model with an empty run\n");
 
-        std::vector<llama_token> tmp = { llama_token_bos(model), llama_token_eos(model), };
+        std::vector<llama_token> tmp;
+        llama_token bos = llama_token_bos(model);
+        llama_token eos = llama_token_eos(model);
+        // some models (e.g. T5) don't have a BOS token
+        if (bos != -1) {
+            tmp.push_back(bos);
+        }
+        tmp.push_back(eos);
+
+        if (llama_model_has_encoder(model)) {
+            llama_encode(lctx, llama_batch_get_one(tmp.data(), tmp.size(), 0, 0));
+            llama_token decoder_start_token_id = llama_model_decoder_start_token(model);
+            if (decoder_start_token_id == -1) {
+                decoder_start_token_id = bos;
+            }
+            tmp.clear();
+            tmp.push_back(decoder_start_token_id);
+        }
         llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, 0));
         llama_kv_cache_clear(lctx);
         llama_synchronize(lctx);
@@ -2051,6 +2211,7 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params) {
     cparams.yarn_beta_slow = params.yarn_beta_slow;
     cparams.yarn_orig_ctx = params.yarn_orig_ctx;
     cparams.pooling_type = params.pooling_type;
+    cparams.attention_type = params.attention_type;
     cparams.defrag_thold = params.defrag_thold;
     cparams.cb_eval = params.cb_eval;
     cparams.cb_eval_user_data = params.cb_eval_user_data;
@@ -2070,7 +2231,7 @@ static bool starts_with(const std::string & str, const std::string & prefix) {
     return str.rfind(prefix, 0) == 0;
 }
 
-static bool llama_download_file(const std::string & url, const std::string & path) {
+static bool llama_download_file(const std::string & url, const std::string & path, const std::string & hf_token) {
 
     // Initialize libcurl
     std::unique_ptr<CURL, decltype(&curl_easy_cleanup)> curl(curl_easy_init(), &curl_easy_cleanup);
@@ -2085,6 +2246,15 @@ static bool llama_download_file(const std::string & url, const std::string & pat
     curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str());
     curl_easy_setopt(curl.get(), CURLOPT_FOLLOWLOCATION, 1L);
 
+    // Check if hf-token or bearer-token was specified
+    if (!hf_token.empty()) {
+        std::string auth_header = "Authorization: Bearer ";
+        auth_header += hf_token.c_str();
+        struct curl_slist *http_headers = NULL;
+        http_headers = curl_slist_append(http_headers, auth_header.c_str());
+        curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers);
+    }
+
 #if defined(_WIN32)
     // CURLSSLOPT_NATIVE_CA tells libcurl to use standard certificate store of
     // operating system. Currently implemented under MS-Windows.
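The new block attaches a standard `Authorization: Bearer <token>` header whenever a token is present. One detail worth flagging: the list returned by `curl_slist_append` is never passed to `curl_slist_free_all` within this hunk, so each authenticated download appears to leak one small allocation. A self-contained sketch of the same pattern with the cleanup included (URL and token are hypothetical):

```cpp
// Standalone bearer-token sketch; build with: g++ demo.cpp -lcurl
#include <curl/curl.h>
#include <string>

int main() {
    curl_global_init(CURL_GLOBAL_DEFAULT);
    CURL * curl = curl_easy_init();
    if (!curl) return 1;

    const std::string token = "hf_xxx";  // hypothetical token
    const std::string auth_header = "Authorization: Bearer " + token;

    curl_slist * headers = curl_slist_append(nullptr, auth_header.c_str());
    curl_easy_setopt(curl, CURLOPT_URL, "https://example.com/resource");
    curl_easy_setopt(curl, CURLOPT_HTTPHEADER, headers);
    curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);

    const CURLcode rc = curl_easy_perform(curl);

    curl_slist_free_all(headers);  // the list must stay alive for the whole transfer
    curl_easy_cleanup(curl);
    curl_global_cleanup();
    return rc == CURLE_OK ? 0 : 1;
}
```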
@@ -2201,7 +2371,14 @@ static bool llama_download_file(const std::string & url, const std::string & pat
     }
 
     // Set the output file
-    std::unique_ptr<FILE, decltype(&fclose)> outfile(fopen(path_temporary.c_str(), "wb"), fclose);
+
+    struct FILE_deleter {
+        void operator()(FILE * f) const {
+            fclose(f);
+        }
+    };
+
+    std::unique_ptr<FILE, FILE_deleter> outfile(fopen(path_temporary.c_str(), "wb"));
     if (!outfile) {
         fprintf(stderr, "%s: error opening local file for writing: %s\n", __func__, path.c_str());
         return false;
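Switching from a captured `&fclose` to an empty `FILE_deleter` type means the deleter is encoded in the type itself: nothing has to be passed at construction or stored in the object, and an empty deleter occupies no space on mainstream ABIs. A standalone illustration of the difference:

```cpp
// Standalone sketch comparing the two deleter styles.
#include <cstdio>
#include <memory>

struct FILE_deleter {
    void operator()(FILE * f) const { fclose(f); }
};

int main() {
    // function-pointer deleter: the pointer must be supplied and stored
    std::unique_ptr<FILE, int (*)(FILE *)> a(fopen("a.tmp", "wb"), &fclose);

    // stateless deleter: nothing to pass, nothing to store
    std::unique_ptr<FILE, FILE_deleter> b(fopen("b.tmp", "wb"));

    // on common ABIs: sizeof(a) is two pointers, sizeof(b) is one
    std::printf("%zu vs %zu\n", sizeof(a), sizeof(b));
    return (a && b) ? 0 : 1;
}
```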
@@ -2273,6 +2450,7 @@ static bool llama_download_file(const std::string & url, const std::string & pat
 struct llama_model * llama_load_model_from_url(
         const char * model_url,
         const char * path_model,
+        const char * hf_token,
         const struct llama_model_params & params) {
     // Basic validation of the model_url
     if (!model_url || strlen(model_url) == 0) {
@@ -2280,7 +2458,7 @@ struct llama_model * llama_load_model_from_url(
         return NULL;
     }
 
-    if (!llama_download_file(model_url, path_model)) {
+    if (!llama_download_file(model_url, path_model, hf_token)) {
         return NULL;
     }
 
@@ -2328,14 +2506,14 @@ struct llama_model * llama_load_model_from_url(
     // Prepare download in parallel
     std::vector<std::future<bool>> futures_download;
     for (int idx = 1; idx < n_split; idx++) {
-        futures_download.push_back(std::async(std::launch::async, [&split_prefix, &split_url_prefix, &n_split](int download_idx) -> bool {
+        futures_download.push_back(std::async(std::launch::async, [&split_prefix, &split_url_prefix, &n_split, hf_token](int download_idx) -> bool {
             char split_path[PATH_MAX] = {0};
             llama_split_path(split_path, sizeof(split_path), split_prefix, download_idx, n_split);
 
             char split_url[LLAMA_CURL_MAX_URL_LENGTH] = {0};
             llama_split_path(split_url, sizeof(split_url), split_url_prefix, download_idx, n_split);
 
-            return llama_download_file(split_url, split_path);
+            return llama_download_file(split_url, split_path, hf_token);
         }, idx));
     }
 
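Note the capture list: the split prefixes and `n_split` stay captured by reference, while the new `hf_token` (a `const char *` at this point) is captured by value, so each worker holds its own copy. A standalone sketch of the same fan-out shape with `std::async` (`fake_download` stands in for `llama_download_file`):

```cpp
// Standalone sketch: parallel part downloads sharing one token.
#include <future>
#include <iostream>
#include <string>
#include <vector>

static bool fake_download(const std::string & url, const std::string & token) {
    std::cout << "GET " + url + (token.empty() ? "" : " [auth]") + "\n";
    return true;
}

int main() {
    const std::string hf_token = "tok";  // hypothetical
    const int n_split = 4;

    std::vector<std::future<bool>> futures_download;
    for (int idx = 1; idx < n_split; idx++) {
        // token captured by value: each task owns a copy, independent of the loop
        futures_download.push_back(std::async(std::launch::async, [hf_token](int i) {
            return fake_download("https://example.com/part-" + std::to_string(i), hf_token);
        }, idx));
    }

    bool ok = true;
    for (auto & f : futures_download) {
        ok = f.get() && ok;
    }
    return ok ? 0 : 1;
}
```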
@@ -2354,6 +2532,7 @@ struct llama_model * llama_load_model_from_hf(
         const char * repo,
         const char * model,
         const char * path_model,
+        const char * hf_token,
         const struct llama_model_params & params) {
     // construct hugging face model url:
     //
@@ -2369,7 +2548,7 @@ struct llama_model * llama_load_model_from_hf(
     model_url += "/resolve/main/";
     model_url += model;
 
-    return llama_load_model_from_url(model_url.c_str(), path_model, params);
+    return llama_load_model_from_url(model_url.c_str(), path_model, hf_token, params);
 }
 
 #else
@@ -2377,6 +2556,7 @@ struct llama_model * llama_load_model_from_hf(
 struct llama_model * llama_load_model_from_url(
         const char * /*model_url*/,
         const char * /*path_model*/,
+        const char * /*hf_token*/,
         const struct llama_model_params & /*params*/) {
     fprintf(stderr, "%s: llama.cpp built without libcurl, downloading from an url not supported.\n", __func__);
     return nullptr;
@@ -2386,6 +2566,7 @@ struct llama_model * llama_load_model_from_hf(
         const char * /*repo*/,
         const char * /*model*/,
         const char * /*path_model*/,
+        const char * /*hf_token*/,
         const struct llama_model_params & /*params*/) {
     fprintf(stderr, "%s: llama.cpp built without libcurl, downloading from Hugging Face not supported.\n", __func__);
     return nullptr;
@@ -2450,57 +2631,126 @@ std::vector<llama_token> llama_tokenize(
 }
 
 std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) {
-    std::
-
-
-
-
-
-
-
+    std::string piece;
+    piece.resize(piece.capacity()); // using string internal cache, 15 bytes + '\n'
+    const int n_chars = llama_token_to_piece(llama_get_model(ctx), token, &piece[0], piece.size(), 0, special);
+    if (n_chars < 0) {
+        piece.resize(-n_chars);
+        int check = llama_token_to_piece(llama_get_model(ctx), token, &piece[0], piece.size(), 0, special);
+        GGML_ASSERT(check == -n_chars);
+    }
+    else {
+        piece.resize(n_chars);
     }
 
-    return
+    return piece;
 }
 
-std::string
-
-
-
-
+std::string llama_detokenize(llama_context * ctx, const std::vector<llama_token> & tokens, bool special) {
+    std::string text;
+    text.resize(std::max(text.capacity(), tokens.size()));
+    int32_t n_chars = llama_detokenize(llama_get_model(ctx), tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
+    if (n_chars < 0) {
+        text.resize(-n_chars);
+        n_chars = llama_detokenize(llama_get_model(ctx), tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
+        GGML_ASSERT(n_chars <= (int32_t)text.size()); // whitespace trimming is performed after per-token detokenization
+    }
 
-
-        piece = llama_token_to_piece(ctx, tokens[i]);
+    text.resize(n_chars);
 
-
-
-
-    }
+    // NOTE: the original tokenizer decodes bytes after collecting the pieces.
+    return text;
+}
 
-
-
+bool llama_should_add_bos_token(const llama_model * model) {
+    const int add_bos = llama_add_bos_token(model);
 
-    return
+    return add_bos != -1 ? bool(add_bos) : (llama_vocab_type(model) == LLAMA_VOCAB_TYPE_SPM);
 }
 
-
-
-
+//
+// Chat template utils
+//
+
+bool llama_chat_verify_template(const std::string & tmpl) {
+    llama_chat_message chat[] = {{"user", "test"}};
+    int res = llama_chat_apply_template(nullptr, tmpl.c_str(), chat, 1, true, nullptr, 0);
+    return res >= 0;
+}
 
-
-
+std::string llama_chat_apply_template(const struct llama_model * model,
+        const std::string & tmpl,
+        const std::vector<llama_chat_msg> & msgs,
+        bool add_ass) {
+    int alloc_size = 0;
+    bool fallback = false; // indicate if we must fallback to default chatml
+    std::vector<llama_chat_message> chat;
+    for (auto & msg : msgs) {
+        chat.push_back({msg.role.c_str(), msg.content.c_str()});
+        alloc_size += (msg.role.size() + msg.content.size()) * 1.25;
+    }
+
+    const char * ptr_tmpl = tmpl.empty() ? nullptr : tmpl.c_str();
+    std::vector<char> buf(alloc_size);
+
+    // run the first time to get the total output length
+    int32_t res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), add_ass, buf.data(), buf.size());
+
+    // error: chat template is not supported
+    if (res < 0) {
+        if (ptr_tmpl != nullptr) {
+            // if the custom "tmpl" is not supported, we throw an error
+            // this is a bit redundant (for good), since we're not sure if user validated the custom template with llama_chat_verify_template()
+            throw std::runtime_error("this custom template is not supported");
+        } else {
+            // If the built-in template is not supported, we default to chatml
+            res = llama_chat_apply_template(nullptr, "chatml", chat.data(), chat.size(), add_ass, buf.data(), buf.size());
+            fallback = true;
+        }
+    }
 
-
+    // if it turns out that our buffer is too small, we resize it
+    if ((size_t) res > buf.size()) {
+        buf.resize(res);
+        res = llama_chat_apply_template(
+            fallback ? nullptr : model,
+            fallback ? "chatml" : ptr_tmpl,
+            chat.data(), chat.size(), add_ass, buf.data(), buf.size());
     }
 
-
-    return
+    std::string formatted_chat(buf.data(), res);
+    return formatted_chat;
 }
 
-
-
+std::string llama_chat_format_single(const struct llama_model * model,
+        const std::string & tmpl,
+        const std::vector<llama_chat_msg> & past_msg,
+        const llama_chat_msg & new_msg,
+        bool add_ass) {
+    std::ostringstream ss;
+    auto fmt_past_msg = past_msg.empty() ? "" : llama_chat_apply_template(model, tmpl, past_msg, false);
+    std::vector<llama_chat_msg> chat_new(past_msg);
+    // if the past_msg ends with a newline, we must preserve it in the formatted version
+    if (add_ass && !fmt_past_msg.empty() && fmt_past_msg.back() == '\n') {
+        ss << "\n";
+    };
+    // format chat with new_msg
+    chat_new.push_back(new_msg);
+    auto fmt_new_msg = llama_chat_apply_template(model, tmpl, chat_new, add_ass);
+    // get the diff part
+    ss << fmt_new_msg.substr(fmt_past_msg.size(), fmt_new_msg.size() - fmt_past_msg.size());
+    return ss.str();
+}
 
-
+std::string llama_chat_format_example(const struct llama_model * model,
+        const std::string & tmpl) {
+    std::vector<llama_chat_msg> msgs = {
+        {"system", "You are a helpful assistant"},
+        {"user", "Hello"},
+        {"assistant", "Hi there"},
+        {"user", "How are you?"},
+    };
+    return llama_chat_apply_template(model, tmpl, msgs, true);
 }
 
 //
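The rewritten `llama_token_to_piece`, the new `llama_detokenize` wrapper, and `llama_chat_apply_template` all share one calling convention: call the C API with whatever buffer is at hand, and if the return value signals a larger requirement, resize and call again. A self-contained mock of that probe-then-resize protocol (`mock_render` is a stand-in, not a llama.cpp function):

```cpp
// Standalone mock of the "probe then resize" convention used above.
#include <cassert>
#include <cstring>
#include <string>

// Stand-in for the C API: copies src into buf if it fits, else returns -(needed).
static int mock_render(const char * src, char * buf, int buf_size) {
    const int needed = (int) std::strlen(src);
    if (needed > buf_size) {
        return -needed;
    }
    std::memcpy(buf, src, needed);
    return needed;
}

static std::string render(const char * src) {
    std::string out;
    out.resize(out.capacity());   // start from the small-string buffer, like llama_token_to_piece
    int n = mock_render(src, &out[0], (int) out.size());
    if (n < 0) {
        out.resize(-n);           // grow to the reported requirement...
        n = mock_render(src, &out[0], (int) out.size());
        assert(n == (int) out.size());  // ...and the retry must now fit exactly
    }
    out.resize(n);
    return out;
}

int main() {
    assert(render("hi") == "hi");
    assert(render("a piece long enough to overflow the small-string buffer") ==
           "a piece long enough to overflow the small-string buffer");
    return 0;
}
```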
@@ -2582,14 +2832,34 @@ void llama_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_siz
 // Embedding utils
 //
 
-void llama_embd_normalize(const float * inp, float * out, int n) {
+void llama_embd_normalize(const float * inp, float * out, int n, int embd_norm) {
     double sum = 0.0;
-    for (int i = 0; i < n; i++) {
-        sum += inp[i] * inp[i];
+
+    switch (embd_norm) {
+        case -1: // no normalisation
+            sum = 1.0;
+            break;
+        case 0: // max absolute
+            for (int i = 0; i < n; i++) {
+                if (sum < std::abs(inp[i])) sum = std::abs(inp[i]);
+            }
+            sum /= 32760.0; // make an int16 range
+            break;
+        case 2: // euclidean
+            for (int i = 0; i < n; i++) {
+                sum += inp[i] * inp[i];
+            }
+            sum = std::sqrt(sum);
+            break;
+        default: // p-norm (euclidean is p-norm p=2)
+            for (int i = 0; i < n; i++) {
+                sum += std::pow(std::abs(inp[i]), embd_norm);
+            }
+            sum = std::pow(sum, 1.0 / embd_norm);
+            break;
     }
-    sum = sqrt(sum);
 
-    const float norm = sum > 0.0 ? 1.0f / sum : 0.0f;
+    const float norm = sum > 0.0 ? 1.0 / sum : 0.0f;
 
     for (int i = 0; i < n; i++) {
         out[i] = inp[i] * norm;
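The extended signature selects the scaling divisor by `embd_norm`: `-1` disables normalisation, `0` divides by the maximum absolute value scaled into an int16-friendly range, `2` keeps the old Euclidean behaviour, and any other positive value is treated as a general p-norm. A standalone restatement of the dispatch:

```cpp
// Same dispatch as llama_embd_normalize above, on std::vector for clarity.
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

static std::vector<float> normalize(const std::vector<float> & inp, int embd_norm) {
    double sum = 0.0;
    switch (embd_norm) {
        case -1:                                 // no normalisation
            sum = 1.0;
            break;
        case 0:                                  // max-absolute, scaled to int16 range
            for (float v : inp) sum = std::max(sum, (double) std::fabs(v));
            sum /= 32760.0;
            break;
        case 2:                                  // euclidean (the old behaviour)
            for (float v : inp) sum += (double) v * v;
            sum = std::sqrt(sum);
            break;
        default:                                 // general p-norm
            for (float v : inp) sum += std::pow(std::fabs((double) v), embd_norm);
            sum = std::pow(sum, 1.0 / embd_norm);
            break;
    }
    const float norm = sum > 0.0 ? (float) (1.0 / sum) : 0.0f;

    std::vector<float> out(inp.size());
    for (size_t i = 0; i < inp.size(); i++) {
        out[i] = inp[i] * norm;
    }
    return out;
}

int main() {
    const auto v = normalize({3.0f, 4.0f}, 2);  // L2 norm is 5 -> {0.6, 0.8}
    std::printf("%.1f %.1f\n", v[0], v[1]);
    return 0;
}
```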
@@ -2607,6 +2877,14 @@ float llama_embd_similarity_cos(const float * embd1, const float * embd2, int n)
         sum2 += embd2[i] * embd2[i];
     }
 
+    // Handle the case where one or both vectors are zero vectors
+    if (sum1 == 0.0 || sum2 == 0.0) {
+        if (sum1 == 0.0 && sum2 == 0.0) {
+            return 1.0f; // two zero vectors are similar
+        }
+        return 0.0f;
+    }
+
     return sum / (sqrt(sum1) * sqrt(sum2));
 }
 
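Without this guard a zero embedding puts 0 in the denominator and the function returns NaN; the convention adopted is that two zero vectors are maximally similar (1.0) while a zero vector against a non-zero one scores 0.0. A minimal standalone check:

```cpp
// Standalone version of the zero-vector convention above.
#include <cassert>
#include <cmath>

static float cos_sim(const float * a, const float * b, int n) {
    double dot = 0.0, na = 0.0, nb = 0.0;
    for (int i = 0; i < n; i++) {
        dot += a[i] * b[i];
        na  += a[i] * a[i];
        nb  += b[i] * b[i];
    }
    if (na == 0.0 || nb == 0.0) {
        return (na == 0.0 && nb == 0.0) ? 1.0f : 0.0f;  // avoid 0/0 -> NaN
    }
    return (float) (dot / (std::sqrt(na) * std::sqrt(nb)));
}

int main() {
    const float z[2] = {0, 0}, x[2] = {1, 0};
    assert(cos_sim(z, z, 2) == 1.0f);
    assert(cos_sim(z, x, 2) == 0.0f);
    assert(std::fabs(cos_sim(x, x, 2) - 1.0f) < 1e-6f);
    return 0;
}
```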
@@ -2615,125 +2893,87 @@ float llama_embd_similarity_cos(const float * embd1, const float * embd2, int n)
 //
 
 static llama_control_vector_data llama_control_vector_load_one(const llama_control_vector_load_info & load_info) {
-    int32_t n_tensors;
-
-    size_t n_bytes = 0;
-
-    uint32_t max_direction_layer = 0;
-
     llama_control_vector_data result = { -1, {} };
 
-
-    {
-
-
-
-
-
-
-
-            /* .no_alloc = */ true,
-            /* .ctx      = */ &meta_ctx,
-        };
-        struct gguf_context * meta_ctx_gguf = gguf_init_from_file(load_info.fname.c_str(), meta_gguf_params);
-        if (!meta_ctx_gguf) {
-            fprintf(stderr, "%s: failed to load control vector from %s\n", __func__, load_info.fname.c_str());
-            ggml_free(meta_ctx);
-            return result;
-        }
-
-        n_tensors = gguf_get_n_tensors(meta_ctx_gguf);
-        for (int i = 0; i < n_tensors; i++) {
-            std::string name = gguf_get_tensor_name(meta_ctx_gguf, i);
-
-            // split on '.'
-            size_t dotpos = name.find('.');
-            if (dotpos != std::string::npos && name.substr(0, dotpos) == "direction") {
-                try {
-                    uint32_t layer = std::stoi(name.substr(dotpos + 1));
-                    if (layer == 0) {
-                        fprintf(stderr, "%s: direction tensor invalid in %s\n", __func__, load_info.fname.c_str());
-                        ggml_free(meta_ctx);
-                        gguf_free(meta_ctx_gguf);
-                        return result;
-                    }
-                    if (layer > max_direction_layer) {
-                        max_direction_layer = layer;
-                    }
-                } catch (...) {
-                    fprintf(stderr, "%s: direction tensor invalid in %s\n", __func__, load_info.fname.c_str());
-                    ggml_free(meta_ctx);
-                    gguf_free(meta_ctx_gguf);
-                    return result;
-                }
-            }
-
-            struct ggml_tensor * tensor_meta = ggml_get_tensor(meta_ctx, name.c_str());
-            if (tensor_meta->type != GGML_TYPE_F32 || ggml_n_dims(tensor_meta) != 1) {
-                fprintf(stderr, "%s: direction tensor invalid in %s\n", __func__, load_info.fname.c_str());
-                ggml_free(meta_ctx);
-                gguf_free(meta_ctx_gguf);
-                return result;
-            }
-            if (result.n_embd == -1) {
-                result.n_embd = ggml_nelements(tensor_meta);
-            } else if (ggml_nelements(tensor_meta) != result.n_embd) {
-                fprintf(stderr, "%s: direction tensor sizes mismatched in %s\n", __func__, load_info.fname.c_str());
-                ggml_free(meta_ctx);
-                gguf_free(meta_ctx_gguf);
-                return result;
-            }
-            n_bytes += ggml_nbytes(tensor_meta);
-        }
-        ggml_free(meta_ctx);
-        gguf_free(meta_ctx_gguf);
+    ggml_context * ctx = nullptr;
+    struct gguf_init_params meta_gguf_params = {
+        /* .no_alloc = */ false,
+        /* .ctx      = */ &ctx,
+    };
+    struct gguf_context * ctx_gguf = gguf_init_from_file(load_info.fname.c_str(), meta_gguf_params);
+    if (!ctx_gguf) {
+        fprintf(stderr, "%s: failed to load control vector file from %s\n", __func__, load_info.fname.c_str());
+        return result;
     }
 
+    int32_t n_tensors = gguf_get_n_tensors(ctx_gguf);
     if (n_tensors == 0) {
         fprintf(stderr, "%s: no direction tensors found in %s\n", __func__, load_info.fname.c_str());
-        return result;
     }
 
-
-
-        /* .mem_size   = */ ggml_tensor_overhead() * n_tensors + n_bytes,
-        /* .mem_buffer = */ nullptr,
-        /* .no_alloc   = */ false,
-    };
-    struct ggml_context * ctx = ggml_init(ggml_params);
+    for (int i = 0; i < n_tensors; i++) {
+        std::string name = gguf_get_tensor_name(ctx_gguf, i);
 
-
-
-
-
-
-
-
-
-
-
+        int layer_idx = -1;
+
+        // split on '.'
+        size_t dotpos = name.find('.');
+        if (dotpos != std::string::npos && name.substr(0, dotpos) == "direction") {
+            try {
+                layer_idx = std::stoi(name.substr(dotpos + 1));
+            } catch (...) {
+                layer_idx = -1;
+            }
+        }
+        if (layer_idx < 0) {
+            fprintf(stderr, "%s: invalid/unparsable direction tensor layer index in %s\n", __func__, load_info.fname.c_str());
+            result.n_embd = -1;
+            break;
+        } else if (layer_idx == 0) {
+            fprintf(stderr, "%s: invalid (zero) direction tensor layer index in %s\n", __func__, load_info.fname.c_str());
+            result.n_embd = -1;
+            break;
+        }
 
-
-
+        struct ggml_tensor * tensor = ggml_get_tensor(ctx, name.c_str());
+        if (tensor->type != GGML_TYPE_F32) {
+            fprintf(stderr, "%s: invalid (non-F32) direction tensor type in %s\n", __func__, load_info.fname.c_str());
+            result.n_embd = -1;
+            break;
+        }
+        if (ggml_n_dims(tensor) != 1) {
+            fprintf(stderr, "%s: invalid (non-1D) direction tensor shape in %s\n", __func__, load_info.fname.c_str());
+            result.n_embd = -1;
+            break;
+        }
 
-
-
-
+        if (result.n_embd == -1) {
+            result.n_embd = ggml_nelements(tensor);
+        } else if (ggml_nelements(tensor) != result.n_embd) {
+            fprintf(stderr, "%s: direction tensor in %s does not match previous dimensions\n", __func__, load_info.fname.c_str());
+            result.n_embd = -1;
+            break;
+        }
 
-
+        // extend if necessary - do not store data for layer 0 (it's not used)
+        result.data.resize(std::max(result.data.size(), static_cast<size_t>(result.n_embd * layer_idx)), 0.0f);
 
-
-
-
-
-            }
-            } else {
-                for (int j = 0; j < result.n_embd; j++) {
-                    dst[j] = 0.0f;
-                }
+        const float * src = (const float *) tensor->data;
+        float * dst = result.data.data() + result.n_embd * (layer_idx - 1); // layer 1 at [0]
+        for (int j = 0; j < result.n_embd; j++) {
+            dst[j] += src[j] * load_info.strength; // allows multiple directions for same layer in same file
         }
+
     }
 
+    if (result.n_embd == -1) {
+        fprintf(stderr, "%s: skipping %s due to invalid direction tensors\n", __func__, load_info.fname.c_str());
+        result.data.clear();
+    }
+
+    gguf_free(ctx_gguf);
+    ggml_free(ctx);
+
     return result;
 }
 
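The rewritten loader reads tensor data in a single pass (`.no_alloc = false`) and accumulates every `direction.<layer>` tensor into one flat buffer, with layer 1 stored at offset 0 since layer 0 is never used; accumulating with `+=` is what permits several direction tensors for the same layer within one file. A standalone sketch of the indexing and accumulation rule (made-up tensors stand in for the GGUF contents):

```cpp
// Standalone sketch of the flat layout used by llama_control_vector_data:
// data[(layer_idx - 1) * n_embd + j], accumulated with a per-file strength.
#include <algorithm>
#include <cassert>
#include <vector>

struct direction_tensor {        // stand-in for a "direction.<layer>" GGUF tensor
    int layer_idx;
    std::vector<float> values;
};

int main() {
    const int n_embd = 4;
    const float strength = 0.5f;
    const std::vector<direction_tensor> tensors = {
        {1, {1, 2, 3, 4}},
        {3, {4, 3, 2, 1}},
        {3, {1, 1, 1, 1}},       // second direction for layer 3: summed, not replaced
    };

    std::vector<float> data;
    for (const auto & t : tensors) {
        // extend if necessary - layer 0 is not stored, so layer 1 lands at [0]
        data.resize(std::max(data.size(), (size_t) (n_embd * t.layer_idx)), 0.0f);
        float * dst = data.data() + n_embd * (t.layer_idx - 1);
        for (int j = 0; j < n_embd; j++) {
            dst[j] += t.values[j] * strength;
        }
    }

    assert(data.size() == 3u * n_embd);   // layers 1..3
    assert(data[0] == 0.5f);              // layer 1, j = 0
    assert(data[n_embd] == 0.0f);         // layer 2 never written, zero-filled
    assert(data[2 * n_embd] == 2.5f);     // layer 3: 4*0.5 + 1*0.5
    return 0;
}
```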
@@ -2744,16 +2984,19 @@ llama_control_vector_data llama_control_vector_load(const std::vector<llama_cont
         auto cur = llama_control_vector_load_one(info);
 
         if (cur.n_embd == -1) {
-            return result;
+            result.n_embd = -1;
+            break;
         }
-        if (result.n_embd != -1 &&
-            fprintf(stderr, "%s: control
-            return result;
+        if (result.n_embd != -1 && result.n_embd != cur.n_embd) {
+            fprintf(stderr, "%s: control vectors in %s does not match previous dimensions\n", __func__, info.fname.c_str());
+            result.n_embd = -1;
+            break;
         }
 
         if (result.n_embd == -1) {
             result = std::move(cur);
         } else {
+            result.data.resize(std::max(result.data.size(), cur.data.size()), 0.0f); // extend if necessary
             for (size_t i = 0; i < cur.data.size(); i++) {
                 result.data[i] += cur.data[i];
             }
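When several control-vector files are merged, the result buffer is first grown to the larger of the two sizes, so files that export different layer ranges still combine cleanly; absent layers simply contribute zeros. A small standalone check of that merge rule:

```cpp
// Standalone sketch of the multi-file merge rule above: resize to the larger
// buffer, then element-wise add (missing layers behave as zeros).
#include <algorithm>
#include <cassert>
#include <vector>

static void merge(std::vector<float> & result, const std::vector<float> & cur) {
    result.resize(std::max(result.size(), cur.size()), 0.0f); // extend if necessary
    for (size_t i = 0; i < cur.size(); i++) {
        result[i] += cur[i];
    }
}

int main() {
    std::vector<float> result = {1, 1};        // file A: layers up to 1
    merge(result, {2, 2, 2, 2});               // file B: layers up to 2
    assert(result.size() == 4);
    assert(result[0] == 3 && result[3] == 2);  // overlap summed, tail kept
    return 0;
}
```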
@@ -2761,7 +3004,8 @@ llama_control_vector_data llama_control_vector_load(const std::vector<llama_cont
     }
 
     if (result.n_embd == -1) {
-        fprintf(stderr, "%s: no
+        fprintf(stderr, "%s: no valid control vector files passed\n", __func__);
+        result.data.clear();
     }
 
     return result;
@@ -2844,7 +3088,6 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l
     fprintf(stream, "cpu_has_avx512_vnni: %s\n", ggml_cpu_has_avx512_vnni() ? "true" : "false");
     fprintf(stream, "cpu_has_cuda: %s\n", ggml_cpu_has_cuda() ? "true" : "false");
     fprintf(stream, "cpu_has_vulkan: %s\n", ggml_cpu_has_vulkan() ? "true" : "false");
-    fprintf(stream, "cpu_has_clblast: %s\n", ggml_cpu_has_clblast() ? "true" : "false");
     fprintf(stream, "cpu_has_kompute: %s\n", ggml_cpu_has_kompute() ? "true" : "false");
     fprintf(stream, "cpu_has_fma: %s\n", ggml_cpu_has_fma() ? "true" : "false");
     fprintf(stream, "cpu_has_gpublas: %s\n", ggml_cpu_has_gpublas() ? "true" : "false");
@@ -2903,9 +3146,7 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l
     yaml_dump_string_multiline(stream, "in_prefix", params.input_prefix.c_str());
     fprintf(stream, "in_prefix_bos: %s # default: false\n", params.input_prefix_bos ? "true" : "false");
     yaml_dump_string_multiline(stream, "in_suffix", params.input_prefix.c_str());
-    fprintf(stream, "instruct: %s # default: false\n", params.instruct ? "true" : "false");
     fprintf(stream, "interactive: %s # default: false\n", params.interactive ? "true" : "false");
-    fprintf(stream, "interactive_specials: %s # default: false\n", params.interactive_specials ? "true" : "false");
     fprintf(stream, "interactive_first: %s # default: false\n", params.interactive_first ? "true" : "false");
     fprintf(stream, "keep: %d # default: 0\n", params.n_keep);
     fprintf(stream, "logdir: %s # default: unset (no logging)\n", params.logdir.c_str());
@@ -2932,7 +3173,6 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l
         }
         fprintf(stream, "  - %s: %f\n", std::get<0>(la).c_str(), std::get<1>(la));
     }
-    fprintf(stream, "lora_base: %s\n", params.lora_base.c_str());
     fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu);
     fprintf(stream, "min_keep: %d # default: 0 (disabled)\n", sparams.min_keep);
     fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", sparams.mirostat);
@@ -2955,7 +3195,6 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l
     fprintf(stream, "prompt_cache_all: %s # default: false\n", params.prompt_cache_all ? "true" : "false");
     fprintf(stream, "prompt_cache_ro: %s # default: false\n", params.prompt_cache_ro ? "true" : "false");
     yaml_dump_vector_int(stream, "prompt_tokens", prompt_tokens);
-    fprintf(stream, "random_prompt: %s # default: false\n", params.random_prompt ? "true" : "false");
     fprintf(stream, "repeat_penalty: %f # default: 1.1\n", sparams.penalty_repeat);
 
     fprintf(stream, "reverse_prompt:\n");