@fugood/llama.node 0.3.2 → 0.3.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +7 -0
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +18 -1
- package/package.json +1 -1
- package/src/DetokenizeWorker.cpp +1 -1
- package/src/EmbeddingWorker.cpp +17 -7
- package/src/EmbeddingWorker.h +2 -1
- package/src/LlamaCompletionWorker.cpp +8 -8
- package/src/LlamaCompletionWorker.h +2 -2
- package/src/LlamaContext.cpp +89 -27
- package/src/LlamaContext.h +2 -0
- package/src/TokenizeWorker.cpp +1 -1
- package/src/common.hpp +4 -4
- package/src/llama.cpp/.github/workflows/build.yml +240 -168
- package/src/llama.cpp/.github/workflows/docker.yml +8 -8
- package/src/llama.cpp/.github/workflows/python-lint.yml +8 -1
- package/src/llama.cpp/.github/workflows/server.yml +21 -14
- package/src/llama.cpp/CMakeLists.txt +14 -6
- package/src/llama.cpp/Sources/llama/llama.h +4 -0
- package/src/llama.cpp/cmake/arm64-apple-clang.cmake +16 -0
- package/src/llama.cpp/cmake/common.cmake +33 -0
- package/src/llama.cpp/cmake/x64-windows-llvm.cmake +11 -0
- package/src/llama.cpp/common/CMakeLists.txt +6 -4
- package/src/llama.cpp/common/arg.cpp +986 -770
- package/src/llama.cpp/common/arg.h +22 -22
- package/src/llama.cpp/common/common.cpp +212 -351
- package/src/llama.cpp/common/common.h +204 -117
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +1 -1
- package/src/llama.cpp/common/log.cpp +50 -50
- package/src/llama.cpp/common/log.h +18 -18
- package/src/llama.cpp/common/ngram-cache.cpp +36 -36
- package/src/llama.cpp/common/ngram-cache.h +19 -19
- package/src/llama.cpp/common/sampling.cpp +163 -121
- package/src/llama.cpp/common/sampling.h +41 -20
- package/src/llama.cpp/common/speculative.cpp +274 -0
- package/src/llama.cpp/common/speculative.h +28 -0
- package/src/llama.cpp/docs/build.md +134 -161
- package/src/llama.cpp/examples/CMakeLists.txt +33 -14
- package/src/llama.cpp/examples/batched/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/batched/batched.cpp +19 -18
- package/src/llama.cpp/examples/batched-bench/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +10 -11
- package/src/llama.cpp/examples/convert-llama2c-to-ggml/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +1 -1
- package/src/llama.cpp/examples/cvector-generator/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +9 -9
- package/src/llama.cpp/examples/deprecation-warning/deprecation-warning.cpp +1 -1
- package/src/llama.cpp/examples/embedding/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/embedding/embedding.cpp +12 -12
- package/src/llama.cpp/examples/eval-callback/CMakeLists.txt +3 -2
- package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +8 -8
- package/src/llama.cpp/examples/export-lora/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +5 -5
- package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +4 -7
- package/src/llama.cpp/examples/gen-docs/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +7 -7
- package/src/llama.cpp/examples/gguf/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gguf-hash/CMakeLists.txt +8 -1
- package/src/llama.cpp/examples/gguf-split/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +2 -2
- package/src/llama.cpp/examples/gritlm/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +18 -18
- package/src/llama.cpp/examples/imatrix/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +31 -13
- package/src/llama.cpp/examples/infill/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/infill/infill.cpp +41 -87
- package/src/llama.cpp/examples/llama-bench/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +439 -459
- package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +2 -0
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +11 -14
- package/src/llama.cpp/examples/llava/CMakeLists.txt +10 -3
- package/src/llama.cpp/examples/llava/clip.cpp +263 -66
- package/src/llama.cpp/examples/llava/clip.h +8 -2
- package/src/llama.cpp/examples/llava/llava-cli.cpp +23 -23
- package/src/llama.cpp/examples/llava/llava.cpp +83 -22
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +21 -21
- package/src/llama.cpp/examples/llava/qwen2vl-cli.cpp +581 -0
- package/src/llama.cpp/examples/lookahead/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +26 -26
- package/src/llama.cpp/examples/lookup/CMakeLists.txt +4 -4
- package/src/llama.cpp/examples/lookup/lookup-create.cpp +7 -7
- package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
- package/src/llama.cpp/examples/lookup/lookup-stats.cpp +16 -15
- package/src/llama.cpp/examples/lookup/lookup.cpp +30 -30
- package/src/llama.cpp/examples/main/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/main/main.cpp +73 -114
- package/src/llama.cpp/examples/main-cmake-pkg/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/parallel/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/parallel/parallel.cpp +18 -19
- package/src/llama.cpp/examples/passkey/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/passkey/passkey.cpp +14 -14
- package/src/llama.cpp/examples/perplexity/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +99 -120
- package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/quantize/quantize.cpp +0 -3
- package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +10 -9
- package/src/llama.cpp/examples/retrieval/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/retrieval/retrieval.cpp +16 -16
- package/src/llama.cpp/examples/rpc/rpc-server.cpp +3 -1
- package/src/llama.cpp/examples/run/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/run/run.cpp +911 -0
- package/src/llama.cpp/examples/save-load-state/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +38 -21
- package/src/llama.cpp/examples/server/CMakeLists.txt +3 -16
- package/src/llama.cpp/examples/server/server.cpp +2073 -1339
- package/src/llama.cpp/examples/server/tests/requirements.txt +2 -2
- package/src/llama.cpp/examples/server/utils.hpp +354 -277
- package/src/llama.cpp/examples/simple/CMakeLists.txt +2 -2
- package/src/llama.cpp/examples/simple/simple.cpp +130 -94
- package/src/llama.cpp/examples/simple-chat/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +200 -0
- package/src/llama.cpp/examples/speculative/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/speculative/speculative.cpp +68 -64
- package/src/llama.cpp/examples/speculative-simple/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +265 -0
- package/src/llama.cpp/examples/tokenize/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/tokenize/tokenize.cpp +3 -3
- package/src/llama.cpp/examples/tts/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/tts/tts.cpp +932 -0
- package/src/llama.cpp/ggml/CMakeLists.txt +54 -36
- package/src/llama.cpp/ggml/include/ggml-backend.h +63 -34
- package/src/llama.cpp/ggml/include/ggml-blas.h +5 -3
- package/src/llama.cpp/ggml/include/ggml-cann.h +9 -7
- package/src/llama.cpp/ggml/include/ggml-cpp.h +38 -0
- package/src/llama.cpp/ggml/include/ggml-cpu.h +135 -0
- package/src/llama.cpp/ggml/include/ggml-cuda.h +12 -12
- package/src/llama.cpp/ggml/include/ggml-kompute.h +7 -3
- package/src/llama.cpp/ggml/include/ggml-metal.h +11 -7
- package/src/llama.cpp/ggml/include/ggml-opencl.h +26 -0
- package/src/llama.cpp/ggml/include/ggml-opt.h +216 -0
- package/src/llama.cpp/ggml/include/ggml-rpc.h +9 -5
- package/src/llama.cpp/ggml/include/ggml-sycl.h +18 -11
- package/src/llama.cpp/ggml/include/ggml-vulkan.h +10 -8
- package/src/llama.cpp/ggml/include/ggml.h +159 -417
- package/src/llama.cpp/ggml/src/CMakeLists.txt +121 -1155
- package/src/llama.cpp/ggml/src/ggml-alloc.c +23 -28
- package/src/llama.cpp/ggml/src/ggml-backend-impl.h +57 -36
- package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +552 -0
- package/src/llama.cpp/ggml/src/ggml-backend.cpp +306 -867
- package/src/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +87 -0
- package/src/llama.cpp/ggml/src/{ggml-blas.cpp → ggml-blas/ggml-blas.cpp} +216 -65
- package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +76 -0
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +456 -111
- package/src/llama.cpp/ggml/src/ggml-cann/common.h +6 -3
- package/src/llama.cpp/ggml/src/{ggml-cann.cpp → ggml-cann/ggml-cann.cpp} +343 -177
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -5
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +22 -9
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +24 -13
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +23 -13
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +11 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +10 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +10 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +17 -0
- package/src/llama.cpp/ggml/src/ggml-common.h +42 -42
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +336 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +220 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.h +8 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/common.h +91 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp +2511 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/mmq.h +10 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +323 -0
- package/src/llama.cpp/ggml/src/{ggml-aarch64.c → ggml-cpu/ggml-cpu-aarch64.cpp} +1299 -246
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +8 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-hbm.cpp +55 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-hbm.h +8 -0
- package/src/llama.cpp/ggml/src/{ggml-cpu-impl.h → ggml-cpu/ggml-cpu-impl.h} +14 -242
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +10835 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.h +63 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-traits.cpp +36 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-traits.h +38 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +14123 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +628 -0
- package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.cpp +666 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +152 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +8 -0
- package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +104 -0
- package/src/llama.cpp/ggml/src/ggml-impl.h +393 -22
- package/src/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +166 -0
- package/src/llama.cpp/ggml/src/{ggml-kompute.cpp → ggml-kompute/ggml-kompute.cpp} +360 -127
- package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +105 -0
- package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +288 -0
- package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +107 -0
- package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +147 -0
- package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +4004 -0
- package/src/llama.cpp/ggml/src/ggml-opt.cpp +854 -0
- package/src/llama.cpp/ggml/src/ggml-quants.c +188 -10702
- package/src/llama.cpp/ggml/src/ggml-quants.h +78 -125
- package/src/llama.cpp/ggml/src/ggml-rpc/CMakeLists.txt +9 -0
- package/src/llama.cpp/ggml/src/{ggml-rpc.cpp → ggml-rpc/ggml-rpc.cpp} +478 -300
- package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +84 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +3 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +36 -5
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +259 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +3 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +5 -5
- package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +34 -35
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +1030 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +76 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +4 -4
- package/src/llama.cpp/ggml/src/{ggml-sycl.cpp → ggml-sycl/ggml-sycl.cpp} +3638 -4151
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +3 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +6 -6
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +75 -87
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +7 -6
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +56 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.hpp +11 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +6 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +4 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +7 -7
- package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +1 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +4 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +141 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +10 -0
- package/src/llama.cpp/ggml/src/ggml-threading.cpp +12 -0
- package/src/llama.cpp/ggml/src/ggml-threading.h +14 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +92 -0
- package/src/llama.cpp/ggml/src/{ggml-vulkan.cpp → ggml-vulkan/ggml-vulkan.cpp} +2138 -887
- package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/CMakeLists.txt +3 -1
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +593 -0
- package/src/llama.cpp/ggml/src/ggml.c +4427 -20125
- package/src/llama.cpp/include/llama-cpp.h +25 -0
- package/src/llama.cpp/include/llama.h +93 -52
- package/src/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.out +46 -0
- package/src/llama.cpp/pocs/CMakeLists.txt +3 -1
- package/src/llama.cpp/pocs/vdot/CMakeLists.txt +2 -2
- package/src/llama.cpp/pocs/vdot/q8dot.cpp +4 -3
- package/src/llama.cpp/pocs/vdot/vdot.cpp +8 -7
- package/src/llama.cpp/src/CMakeLists.txt +4 -8
- package/src/llama.cpp/src/llama-grammar.cpp +15 -15
- package/src/llama.cpp/src/llama-grammar.h +2 -5
- package/src/llama.cpp/src/llama-sampling.cpp +779 -194
- package/src/llama.cpp/src/llama-sampling.h +21 -2
- package/src/llama.cpp/src/llama-vocab.cpp +55 -10
- package/src/llama.cpp/src/llama-vocab.h +35 -11
- package/src/llama.cpp/src/llama.cpp +4317 -2979
- package/src/llama.cpp/src/unicode-data.cpp +2 -2
- package/src/llama.cpp/src/unicode.cpp +62 -51
- package/src/llama.cpp/src/unicode.h +9 -10
- package/src/llama.cpp/tests/CMakeLists.txt +48 -38
- package/src/llama.cpp/tests/test-arg-parser.cpp +15 -15
- package/src/llama.cpp/tests/test-backend-ops.cpp +324 -80
- package/src/llama.cpp/tests/test-barrier.cpp +1 -0
- package/src/llama.cpp/tests/test-chat-template.cpp +59 -9
- package/src/llama.cpp/tests/test-gguf.cpp +1303 -0
- package/src/llama.cpp/tests/test-grammar-integration.cpp +3 -6
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +17 -4
- package/src/llama.cpp/tests/test-llama-grammar.cpp +2 -4
- package/src/llama.cpp/tests/test-log.cpp +2 -2
- package/src/llama.cpp/tests/test-opt.cpp +853 -142
- package/src/llama.cpp/tests/test-quantize-fns.cpp +24 -21
- package/src/llama.cpp/tests/test-quantize-perf.cpp +16 -14
- package/src/llama.cpp/tests/test-rope.cpp +62 -20
- package/src/llama.cpp/tests/test-sampling.cpp +163 -138
- package/src/llama.cpp/tests/test-tokenizer-0.cpp +7 -7
- package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +5 -5
- package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +5 -5
- package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +0 -72
- package/src/llama.cpp/.github/workflows/nix-ci.yml +0 -79
- package/src/llama.cpp/.github/workflows/nix-flake-update.yml +0 -22
- package/src/llama.cpp/.github/workflows/nix-publish-flake.yml +0 -36
- package/src/llama.cpp/common/train.cpp +0 -1515
- package/src/llama.cpp/common/train.h +0 -233
- package/src/llama.cpp/examples/baby-llama/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +0 -1639
- package/src/llama.cpp/ggml/src/ggml-aarch64.h +0 -39
- package/src/llama.cpp/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp +0 -600
- package/src/llama.cpp/tests/test-grad0.cpp +0 -1683
- /package/src/llama.cpp/ggml/{cmake → src/ggml-cpu/cmake}/FindSIMD.cmake +0 -0
- /package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.h +0 -0
|
@@ -17,27 +17,27 @@
|
|
|
17
17
|
|
|
18
18
|
using json = nlohmann::ordered_json;
|
|
19
19
|
|
|
20
|
-
|
|
20
|
+
common_arg & common_arg::set_examples(std::initializer_list<enum llama_example> examples) {
|
|
21
21
|
this->examples = std::move(examples);
|
|
22
22
|
return *this;
|
|
23
23
|
}
|
|
24
24
|
|
|
25
|
-
|
|
25
|
+
common_arg & common_arg::set_env(const char * env) {
|
|
26
26
|
help = help + "\n(env: " + env + ")";
|
|
27
27
|
this->env = env;
|
|
28
28
|
return *this;
|
|
29
29
|
}
|
|
30
30
|
|
|
31
|
-
|
|
31
|
+
common_arg & common_arg::set_sparam() {
|
|
32
32
|
is_sparam = true;
|
|
33
33
|
return *this;
|
|
34
34
|
}
|
|
35
35
|
|
|
36
|
-
bool
|
|
36
|
+
bool common_arg::in_example(enum llama_example ex) {
|
|
37
37
|
return examples.find(ex) != examples.end();
|
|
38
38
|
}
|
|
39
39
|
|
|
40
|
-
bool
|
|
40
|
+
bool common_arg::get_value_from_env(std::string & output) {
|
|
41
41
|
if (env == nullptr) return false;
|
|
42
42
|
char * value = std::getenv(env);
|
|
43
43
|
if (value) {
|
|
@@ -47,7 +47,7 @@ bool llama_arg::get_value_from_env(std::string & output) {
|
|
|
47
47
|
return false;
|
|
48
48
|
}
|
|
49
49
|
|
|
50
|
-
bool
|
|
50
|
+
bool common_arg::has_value_from_env() {
|
|
51
51
|
return env != nullptr && std::getenv(env);
|
|
52
52
|
}
|
|
53
53
|
|
|
@@ -78,7 +78,7 @@ static std::vector<std::string> break_str_into_lines(std::string input, size_t m
|
|
|
78
78
|
return result;
|
|
79
79
|
}
|
|
80
80
|
|
|
81
|
-
std::string
|
|
81
|
+
std::string common_arg::to_string() {
|
|
82
82
|
// params for printing to console
|
|
83
83
|
const static int n_leading_spaces = 40;
|
|
84
84
|
const static int n_char_per_line_help = 70; // TODO: detect this based on current console
|
|
@@ -119,64 +119,75 @@ std::string llama_arg::to_string() {
|
|
|
119
119
|
// utils
|
|
120
120
|
//
|
|
121
121
|
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
#else
|
|
129
|
-
#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...)
|
|
130
|
-
#endif
|
|
131
|
-
|
|
132
|
-
LLAMA_COMMON_ATTRIBUTE_FORMAT(1, 2)
|
|
133
|
-
static std::string format(const char * fmt, ...) {
|
|
134
|
-
va_list ap;
|
|
135
|
-
va_list ap2;
|
|
136
|
-
va_start(ap, fmt);
|
|
137
|
-
va_copy(ap2, ap);
|
|
138
|
-
int size = vsnprintf(NULL, 0, fmt, ap);
|
|
139
|
-
GGML_ASSERT(size >= 0 && size < INT_MAX); // NOLINT
|
|
140
|
-
std::vector<char> buf(size + 1);
|
|
141
|
-
int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
|
|
142
|
-
GGML_ASSERT(size2 == size);
|
|
143
|
-
va_end(ap2);
|
|
144
|
-
va_end(ap);
|
|
145
|
-
return std::string(buf.data(), size);
|
|
146
|
-
}
|
|
147
|
-
|
|
148
|
-
static void gpt_params_handle_model_default(gpt_params & params) {
|
|
149
|
-
if (!params.hf_repo.empty()) {
|
|
122
|
+
static void common_params_handle_model_default(
|
|
123
|
+
std::string & model,
|
|
124
|
+
std::string & model_url,
|
|
125
|
+
std::string & hf_repo,
|
|
126
|
+
std::string & hf_file) {
|
|
127
|
+
if (!hf_repo.empty()) {
|
|
150
128
|
// short-hand to avoid specifying --hf-file -> default it to --model
|
|
151
|
-
if (
|
|
152
|
-
if (
|
|
129
|
+
if (hf_file.empty()) {
|
|
130
|
+
if (model.empty()) {
|
|
153
131
|
throw std::invalid_argument("error: --hf-repo requires either --hf-file or --model\n");
|
|
154
132
|
}
|
|
155
|
-
|
|
156
|
-
} else if (
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
133
|
+
hf_file = model;
|
|
134
|
+
} else if (model.empty()) {
|
|
135
|
+
// this is to avoid different repo having same file name, or same file name in different subdirs
|
|
136
|
+
std::string filename = hf_repo + "_" + hf_file;
|
|
137
|
+
// to make sure we don't have any slashes in the filename
|
|
138
|
+
string_replace_all(filename, "/", "_");
|
|
139
|
+
model = fs_get_cache_file(filename);
|
|
140
|
+
}
|
|
141
|
+
} else if (!model_url.empty()) {
|
|
142
|
+
if (model.empty()) {
|
|
143
|
+
auto f = string_split<std::string>(model_url, '#').front();
|
|
144
|
+
f = string_split<std::string>(f, '?').front();
|
|
145
|
+
model = fs_get_cache_file(string_split<std::string>(f, '/').back());
|
|
146
|
+
}
|
|
147
|
+
} else if (model.empty()) {
|
|
148
|
+
model = DEFAULT_MODEL_PATH;
|
|
149
|
+
}
|
|
150
|
+
}
|
|
151
|
+
|
|
152
|
+
const std::vector<ggml_type> kv_cache_types = {
|
|
153
|
+
GGML_TYPE_F32,
|
|
154
|
+
GGML_TYPE_F16,
|
|
155
|
+
GGML_TYPE_BF16,
|
|
156
|
+
GGML_TYPE_Q8_0,
|
|
157
|
+
GGML_TYPE_Q4_0,
|
|
158
|
+
GGML_TYPE_Q4_1,
|
|
159
|
+
GGML_TYPE_IQ4_NL,
|
|
160
|
+
GGML_TYPE_Q5_0,
|
|
161
|
+
GGML_TYPE_Q5_1,
|
|
162
|
+
};
|
|
163
|
+
|
|
164
|
+
static ggml_type kv_cache_type_from_str(const std::string & s) {
|
|
165
|
+
for (const auto & type : kv_cache_types) {
|
|
166
|
+
if (ggml_type_name(type) == s) {
|
|
167
|
+
return type;
|
|
168
|
+
}
|
|
169
|
+
}
|
|
170
|
+
throw std::runtime_error("Unsupported cache type: " + s);
|
|
171
|
+
}
|
|
172
|
+
|
|
173
|
+
static std::string get_all_kv_cache_types() {
|
|
174
|
+
std::ostringstream msg;
|
|
175
|
+
for (const auto & type : kv_cache_types) {
|
|
176
|
+
msg << ggml_type_name(type) << (&type == &kv_cache_types.back() ? "" : ", ");
|
|
167
177
|
}
|
|
178
|
+
return msg.str();
|
|
168
179
|
}
|
|
169
180
|
|
|
170
181
|
//
|
|
171
182
|
// CLI argument parsing functions
|
|
172
183
|
//
|
|
173
184
|
|
|
174
|
-
static bool
|
|
185
|
+
static bool common_params_parse_ex(int argc, char ** argv, common_params_context & ctx_arg) {
|
|
175
186
|
std::string arg;
|
|
176
187
|
const std::string arg_prefix = "--";
|
|
177
|
-
|
|
188
|
+
common_params & params = ctx_arg.params;
|
|
178
189
|
|
|
179
|
-
std::unordered_map<std::string,
|
|
190
|
+
std::unordered_map<std::string, common_arg *> arg_to_options;
|
|
180
191
|
for (auto & opt : ctx_arg.options) {
|
|
181
192
|
for (const auto & arg : opt.args) {
|
|
182
193
|
arg_to_options[arg] = &opt;
|
|
@@ -199,7 +210,7 @@ static bool gpt_params_parse_ex(int argc, char ** argv, gpt_params_context & ctx
|
|
|
199
210
|
continue;
|
|
200
211
|
}
|
|
201
212
|
} catch (std::exception & e) {
|
|
202
|
-
throw std::invalid_argument(
|
|
213
|
+
throw std::invalid_argument(string_format(
|
|
203
214
|
"error while handling environment variable \"%s\": %s\n\n", opt.env, e.what()));
|
|
204
215
|
}
|
|
205
216
|
}
|
|
@@ -220,7 +231,7 @@ static bool gpt_params_parse_ex(int argc, char ** argv, gpt_params_context & ctx
|
|
|
220
231
|
std::replace(arg.begin(), arg.end(), '_', '-');
|
|
221
232
|
}
|
|
222
233
|
if (arg_to_options.find(arg) == arg_to_options.end()) {
|
|
223
|
-
throw std::invalid_argument(
|
|
234
|
+
throw std::invalid_argument(string_format("error: invalid argument: %s", arg.c_str()));
|
|
224
235
|
}
|
|
225
236
|
auto opt = *arg_to_options[arg];
|
|
226
237
|
if (opt.has_value_from_env()) {
|
|
@@ -252,23 +263,26 @@ static bool gpt_params_parse_ex(int argc, char ** argv, gpt_params_context & ctx
|
|
|
252
263
|
continue;
|
|
253
264
|
}
|
|
254
265
|
} catch (std::exception & e) {
|
|
255
|
-
throw std::invalid_argument(
|
|
266
|
+
throw std::invalid_argument(string_format(
|
|
256
267
|
"error while handling argument \"%s\": %s\n\n"
|
|
257
268
|
"usage:\n%s\n\nto show complete usage, run with -h",
|
|
258
269
|
arg.c_str(), e.what(), arg_to_options[arg]->to_string().c_str()));
|
|
259
270
|
}
|
|
260
271
|
}
|
|
261
272
|
|
|
262
|
-
postprocess_cpu_params(params.cpuparams,
|
|
273
|
+
postprocess_cpu_params(params.cpuparams, nullptr);
|
|
263
274
|
postprocess_cpu_params(params.cpuparams_batch, ¶ms.cpuparams);
|
|
264
|
-
|
|
265
|
-
postprocess_cpu_params(params.
|
|
275
|
+
|
|
276
|
+
postprocess_cpu_params(params.speculative.cpuparams, ¶ms.cpuparams);
|
|
277
|
+
postprocess_cpu_params(params.speculative.cpuparams_batch, ¶ms.cpuparams_batch);
|
|
266
278
|
|
|
267
279
|
if (params.prompt_cache_all && (params.interactive || params.interactive_first)) {
|
|
268
280
|
throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n");
|
|
269
281
|
}
|
|
270
282
|
|
|
271
|
-
|
|
283
|
+
// TODO: refactor model params in a common struct
|
|
284
|
+
common_params_handle_model_default(params.model, params.model_url, params.hf_repo, params.hf_file);
|
|
285
|
+
common_params_handle_model_default(params.vocoder.model, params.vocoder.model_url, params.vocoder.hf_repo, params.vocoder.hf_file);
|
|
272
286
|
|
|
273
287
|
if (params.escape) {
|
|
274
288
|
string_process_escapes(params.prompt);
|
|
@@ -277,6 +291,9 @@ static bool gpt_params_parse_ex(int argc, char ** argv, gpt_params_context & ctx
|
|
|
277
291
|
for (auto & antiprompt : params.antiprompt) {
|
|
278
292
|
string_process_escapes(antiprompt);
|
|
279
293
|
}
|
|
294
|
+
for (auto & seq_breaker : params.sampling.dry_sequence_breakers) {
|
|
295
|
+
string_process_escapes(seq_breaker);
|
|
296
|
+
}
|
|
280
297
|
}
|
|
281
298
|
|
|
282
299
|
if (!params.kv_overrides.empty()) {
|
|
@@ -291,16 +308,16 @@ static bool gpt_params_parse_ex(int argc, char ** argv, gpt_params_context & ctx
|
|
|
291
308
|
return true;
|
|
292
309
|
}
|
|
293
310
|
|
|
294
|
-
static void
|
|
295
|
-
auto print_options = [](std::vector<
|
|
296
|
-
for (
|
|
311
|
+
static void common_params_print_usage(common_params_context & ctx_arg) {
|
|
312
|
+
auto print_options = [](std::vector<common_arg *> & options) {
|
|
313
|
+
for (common_arg * opt : options) {
|
|
297
314
|
printf("%s", opt->to_string().c_str());
|
|
298
315
|
}
|
|
299
316
|
};
|
|
300
317
|
|
|
301
|
-
std::vector<
|
|
302
|
-
std::vector<
|
|
303
|
-
std::vector<
|
|
318
|
+
std::vector<common_arg *> common_options;
|
|
319
|
+
std::vector<common_arg *> sparam_options;
|
|
320
|
+
std::vector<common_arg *> specific_options;
|
|
304
321
|
for (auto & opt : ctx_arg.options) {
|
|
305
322
|
// in case multiple LLAMA_EXAMPLE_* are set, we prioritize the LLAMA_EXAMPLE_* matching current example
|
|
306
323
|
if (opt.is_sparam) {
|
|
@@ -320,17 +337,38 @@ static void gpt_params_print_usage(gpt_params_context & ctx_arg) {
|
|
|
320
337
|
print_options(specific_options);
|
|
321
338
|
}
|
|
322
339
|
|
|
323
|
-
|
|
324
|
-
|
|
325
|
-
|
|
340
|
+
static std::vector<ggml_backend_dev_t> parse_device_list(const std::string & value) {
|
|
341
|
+
std::vector<ggml_backend_dev_t> devices;
|
|
342
|
+
auto dev_names = string_split<std::string>(value, ',');
|
|
343
|
+
if (dev_names.empty()) {
|
|
344
|
+
throw std::invalid_argument("no devices specified");
|
|
345
|
+
}
|
|
346
|
+
if (dev_names.size() == 1 && dev_names[0] == "none") {
|
|
347
|
+
devices.push_back(nullptr);
|
|
348
|
+
} else {
|
|
349
|
+
for (const auto & device : dev_names) {
|
|
350
|
+
auto * dev = ggml_backend_dev_by_name(device.c_str());
|
|
351
|
+
if (!dev || ggml_backend_dev_type(dev) != GGML_BACKEND_DEVICE_TYPE_GPU) {
|
|
352
|
+
throw std::invalid_argument(string_format("invalid device: %s", device.c_str()));
|
|
353
|
+
}
|
|
354
|
+
devices.push_back(dev);
|
|
355
|
+
}
|
|
356
|
+
devices.push_back(nullptr);
|
|
357
|
+
}
|
|
358
|
+
return devices;
|
|
359
|
+
}
|
|
360
|
+
|
|
361
|
+
bool common_params_parse(int argc, char ** argv, common_params & params, llama_example ex, void(*print_usage)(int, char **)) {
|
|
362
|
+
auto ctx_arg = common_params_parser_init(params, ex, print_usage);
|
|
363
|
+
const common_params params_org = ctx_arg.params; // the example can modify the default params
|
|
326
364
|
|
|
327
365
|
try {
|
|
328
|
-
if (!
|
|
366
|
+
if (!common_params_parse_ex(argc, argv, ctx_arg)) {
|
|
329
367
|
ctx_arg.params = params_org;
|
|
330
368
|
return false;
|
|
331
369
|
}
|
|
332
370
|
if (ctx_arg.params.usage) {
|
|
333
|
-
|
|
371
|
+
common_params_print_usage(ctx_arg);
|
|
334
372
|
if (ctx_arg.print_usage) {
|
|
335
373
|
ctx_arg.print_usage(argc, argv);
|
|
336
374
|
}
|
|
@@ -345,16 +383,31 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params, llama_example
|
|
|
345
383
|
return true;
|
|
346
384
|
}
|
|
347
385
|
|
|
348
|
-
|
|
349
|
-
|
|
386
|
+
static std::string list_builtin_chat_templates() {
|
|
387
|
+
std::vector<const char *> supported_tmpl;
|
|
388
|
+
int32_t res = llama_chat_builtin_templates(nullptr, 0);
|
|
389
|
+
supported_tmpl.resize(res);
|
|
390
|
+
res = llama_chat_builtin_templates(supported_tmpl.data(), supported_tmpl.size());
|
|
391
|
+
std::ostringstream msg;
|
|
392
|
+
for (auto & tmpl : supported_tmpl) {
|
|
393
|
+
msg << tmpl << (&tmpl == &supported_tmpl.back() ? "" : ", ");
|
|
394
|
+
}
|
|
395
|
+
return msg.str();
|
|
396
|
+
}
|
|
397
|
+
|
|
398
|
+
common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **)) {
|
|
399
|
+
// load dynamic backends
|
|
400
|
+
ggml_backend_load_all();
|
|
401
|
+
|
|
402
|
+
common_params_context ctx_arg(params);
|
|
350
403
|
ctx_arg.print_usage = print_usage;
|
|
351
404
|
ctx_arg.ex = ex;
|
|
352
405
|
|
|
353
406
|
std::string sampler_type_chars;
|
|
354
407
|
std::string sampler_type_names;
|
|
355
|
-
for (const auto & sampler : params.
|
|
356
|
-
sampler_type_chars +=
|
|
357
|
-
sampler_type_names +=
|
|
408
|
+
for (const auto & sampler : params.sampling.samplers) {
|
|
409
|
+
sampler_type_chars += common_sampler_type_to_chr(sampler);
|
|
410
|
+
sampler_type_names += common_sampler_type_to_str(sampler) + ";";
|
|
358
411
|
}
|
|
359
412
|
sampler_type_names.pop_back();
|
|
360
413
|
|
|
@@ -366,374 +419,252 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
|
|
|
366
419
|
* - if LLAMA_EXAMPLE_* is set (other than COMMON), we only show the option in the corresponding example
|
|
367
420
|
* - if both {LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_*,} are set, we will prioritize the LLAMA_EXAMPLE_* matching current example
|
|
368
421
|
*/
|
|
369
|
-
auto add_opt = [&](
|
|
422
|
+
auto add_opt = [&](common_arg arg) {
|
|
370
423
|
if (arg.in_example(ex) || arg.in_example(LLAMA_EXAMPLE_COMMON)) {
|
|
371
424
|
ctx_arg.options.push_back(std::move(arg));
|
|
372
425
|
}
|
|
373
426
|
};
|
|
374
427
|
|
|
375
428
|
|
|
376
|
-
add_opt(
|
|
429
|
+
add_opt(common_arg(
|
|
377
430
|
{"-h", "--help", "--usage"},
|
|
378
431
|
"print usage and exit",
|
|
379
|
-
[](
|
|
432
|
+
[](common_params & params) {
|
|
380
433
|
params.usage = true;
|
|
381
434
|
}
|
|
382
435
|
));
|
|
383
|
-
add_opt(
|
|
436
|
+
add_opt(common_arg(
|
|
384
437
|
{"--version"},
|
|
385
438
|
"show version and build info",
|
|
386
|
-
[](
|
|
439
|
+
[](common_params &) {
|
|
387
440
|
fprintf(stderr, "version: %d (%s)\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT);
|
|
388
441
|
fprintf(stderr, "built with %s for %s\n", LLAMA_COMPILER, LLAMA_BUILD_TARGET);
|
|
389
442
|
exit(0);
|
|
390
443
|
}
|
|
391
444
|
));
|
|
392
|
-
add_opt(
|
|
445
|
+
add_opt(common_arg(
|
|
393
446
|
{"--verbose-prompt"},
|
|
394
|
-
|
|
395
|
-
[](
|
|
447
|
+
string_format("print a verbose prompt before generation (default: %s)", params.verbose_prompt ? "true" : "false"),
|
|
448
|
+
[](common_params & params) {
|
|
396
449
|
params.verbose_prompt = true;
|
|
397
450
|
}
|
|
398
451
|
));
|
|
399
|
-
add_opt(
|
|
452
|
+
add_opt(common_arg(
|
|
400
453
|
{"--no-display-prompt"},
|
|
401
|
-
|
|
402
|
-
[](
|
|
454
|
+
string_format("don't print prompt at generation (default: %s)", !params.display_prompt ? "true" : "false"),
|
|
455
|
+
[](common_params & params) {
|
|
403
456
|
params.display_prompt = false;
|
|
404
457
|
}
|
|
405
458
|
).set_examples({LLAMA_EXAMPLE_MAIN}));
|
|
406
|
-
add_opt(
|
|
459
|
+
add_opt(common_arg(
|
|
407
460
|
{"-co", "--color"},
|
|
408
|
-
|
|
409
|
-
[](
|
|
461
|
+
string_format("colorise output to distinguish prompt and user input from generations (default: %s)", params.use_color ? "true" : "false"),
|
|
462
|
+
[](common_params & params) {
|
|
410
463
|
params.use_color = true;
|
|
411
464
|
}
|
|
412
465
|
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL, LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP}));
|
|
413
|
-
add_opt(
|
|
466
|
+
add_opt(common_arg(
|
|
414
467
|
{"-t", "--threads"}, "N",
|
|
415
|
-
|
|
416
|
-
[](
|
|
468
|
+
string_format("number of threads to use during generation (default: %d)", params.cpuparams.n_threads),
|
|
469
|
+
[](common_params & params, int value) {
|
|
417
470
|
params.cpuparams.n_threads = value;
|
|
418
471
|
if (params.cpuparams.n_threads <= 0) {
|
|
419
472
|
params.cpuparams.n_threads = std::thread::hardware_concurrency();
|
|
420
473
|
}
|
|
421
474
|
}
|
|
422
475
|
).set_env("LLAMA_ARG_THREADS"));
|
|
423
|
-
add_opt(
|
|
476
|
+
add_opt(common_arg(
|
|
424
477
|
{"-tb", "--threads-batch"}, "N",
|
|
425
478
|
"number of threads to use during batch and prompt processing (default: same as --threads)",
|
|
426
|
-
[](
|
|
479
|
+
[](common_params & params, int value) {
|
|
427
480
|
params.cpuparams_batch.n_threads = value;
|
|
428
481
|
if (params.cpuparams_batch.n_threads <= 0) {
|
|
429
482
|
params.cpuparams_batch.n_threads = std::thread::hardware_concurrency();
|
|
430
483
|
}
|
|
431
484
|
}
|
|
432
485
|
));
|
|
433
|
-
add_opt(
|
|
434
|
-
{"-td", "--threads-draft"}, "N",
|
|
435
|
-
"number of threads to use during generation (default: same as --threads)",
|
|
436
|
-
[](gpt_params & params, int value) {
|
|
437
|
-
params.draft_cpuparams.n_threads = value;
|
|
438
|
-
if (params.draft_cpuparams.n_threads <= 0) {
|
|
439
|
-
params.draft_cpuparams.n_threads = std::thread::hardware_concurrency();
|
|
440
|
-
}
|
|
441
|
-
}
|
|
442
|
-
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
|
443
|
-
add_opt(llama_arg(
|
|
444
|
-
{"-tbd", "--threads-batch-draft"}, "N",
|
|
445
|
-
"number of threads to use during batch and prompt processing (default: same as --threads-draft)",
|
|
446
|
-
[](gpt_params & params, int value) {
|
|
447
|
-
params.draft_cpuparams_batch.n_threads = value;
|
|
448
|
-
if (params.draft_cpuparams_batch.n_threads <= 0) {
|
|
449
|
-
params.draft_cpuparams_batch.n_threads = std::thread::hardware_concurrency();
|
|
450
|
-
}
|
|
451
|
-
}
|
|
452
|
-
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
|
453
|
-
add_opt(llama_arg(
|
|
486
|
+
add_opt(common_arg(
|
|
454
487
|
{"-C", "--cpu-mask"}, "M",
|
|
455
488
|
"CPU affinity mask: arbitrarily long hex. Complements cpu-range (default: \"\")",
|
|
456
|
-
[](
|
|
489
|
+
[](common_params & params, const std::string & mask) {
|
|
457
490
|
params.cpuparams.mask_valid = true;
|
|
458
491
|
if (!parse_cpu_mask(mask, params.cpuparams.cpumask)) {
|
|
459
492
|
throw std::invalid_argument("invalid cpumask");
|
|
460
493
|
}
|
|
461
494
|
}
|
|
462
495
|
));
|
|
463
|
-
add_opt(
|
|
496
|
+
add_opt(common_arg(
|
|
464
497
|
{"-Cr", "--cpu-range"}, "lo-hi",
|
|
465
498
|
"range of CPUs for affinity. Complements --cpu-mask",
|
|
466
|
-
[](
|
|
499
|
+
[](common_params & params, const std::string & range) {
|
|
467
500
|
params.cpuparams.mask_valid = true;
|
|
468
501
|
if (!parse_cpu_range(range, params.cpuparams.cpumask)) {
|
|
469
502
|
throw std::invalid_argument("invalid range");
|
|
470
503
|
}
|
|
471
504
|
}
|
|
472
505
|
));
|
|
473
|
-
add_opt(
|
|
506
|
+
add_opt(common_arg(
|
|
474
507
|
{"--cpu-strict"}, "<0|1>",
|
|
475
|
-
|
|
476
|
-
[](
|
|
508
|
+
string_format("use strict CPU placement (default: %u)\n", (unsigned) params.cpuparams.strict_cpu),
|
|
509
|
+
[](common_params & params, const std::string & value) {
|
|
477
510
|
params.cpuparams.strict_cpu = std::stoul(value);
|
|
478
511
|
}
|
|
479
512
|
));
|
|
480
|
-
add_opt(
|
|
513
|
+
add_opt(common_arg(
|
|
481
514
|
{"--prio"}, "N",
|
|
482
|
-
|
|
483
|
-
[](
|
|
515
|
+
string_format("set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.cpuparams.priority),
|
|
516
|
+
[](common_params & params, int prio) {
|
|
484
517
|
if (prio < 0 || prio > 3) {
|
|
485
518
|
throw std::invalid_argument("invalid value");
|
|
486
519
|
}
|
|
487
520
|
params.cpuparams.priority = (enum ggml_sched_priority) prio;
|
|
488
521
|
}
|
|
489
522
|
));
|
|
490
|
-
add_opt(
|
|
523
|
+
add_opt(common_arg(
|
|
491
524
|
{"--poll"}, "<0...100>",
|
|
492
|
-
|
|
493
|
-
[](
|
|
525
|
+
string_format("use polling level to wait for work (0 - no polling, default: %u)\n", (unsigned) params.cpuparams.poll),
|
|
526
|
+
[](common_params & params, const std::string & value) {
|
|
494
527
|
params.cpuparams.poll = std::stoul(value);
|
|
495
528
|
}
|
|
496
529
|
));
|
|
497
|
-
add_opt(
|
|
530
|
+
add_opt(common_arg(
|
|
498
531
|
{"-Cb", "--cpu-mask-batch"}, "M",
|
|
499
532
|
"CPU affinity mask: arbitrarily long hex. Complements cpu-range-batch (default: same as --cpu-mask)",
|
|
500
|
-
[](
|
|
533
|
+
[](common_params & params, const std::string & mask) {
|
|
501
534
|
params.cpuparams_batch.mask_valid = true;
|
|
502
535
|
if (!parse_cpu_mask(mask, params.cpuparams_batch.cpumask)) {
|
|
503
536
|
throw std::invalid_argument("invalid cpumask");
|
|
504
537
|
}
|
|
505
538
|
}
|
|
506
539
|
));
|
|
507
|
-
add_opt(
|
|
540
|
+
add_opt(common_arg(
|
|
508
541
|
{"-Crb", "--cpu-range-batch"}, "lo-hi",
|
|
509
542
|
"ranges of CPUs for affinity. Complements --cpu-mask-batch",
|
|
510
|
-
[](
|
|
543
|
+
[](common_params & params, const std::string & range) {
|
|
511
544
|
params.cpuparams_batch.mask_valid = true;
|
|
512
545
|
if (!parse_cpu_range(range, params.cpuparams_batch.cpumask)) {
|
|
513
546
|
throw std::invalid_argument("invalid range");
|
|
514
547
|
}
|
|
515
548
|
}
|
|
516
549
|
));
|
|
517
|
-
add_opt(
|
|
550
|
+
add_opt(common_arg(
|
|
518
551
|
{"--cpu-strict-batch"}, "<0|1>",
|
|
519
552
|
"use strict CPU placement (default: same as --cpu-strict)",
|
|
520
|
-
[](
|
|
553
|
+
[](common_params & params, int value) {
|
|
521
554
|
params.cpuparams_batch.strict_cpu = value;
|
|
522
555
|
}
|
|
523
556
|
));
|
|
524
|
-
add_opt(
|
|
557
|
+
add_opt(common_arg(
|
|
525
558
|
{"--prio-batch"}, "N",
|
|
526
|
-
|
|
527
|
-
[](
|
|
559
|
+
string_format("set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.cpuparams_batch.priority),
|
|
560
|
+
[](common_params & params, int prio) {
|
|
528
561
|
if (prio < 0 || prio > 3) {
|
|
529
562
|
throw std::invalid_argument("invalid value");
|
|
530
563
|
}
|
|
531
564
|
params.cpuparams_batch.priority = (enum ggml_sched_priority) prio;
|
|
532
565
|
}
|
|
533
566
|
));
|
|
534
|
-
add_opt(
|
|
567
|
+
add_opt(common_arg(
|
|
535
568
|
{"--poll-batch"}, "<0|1>",
|
|
536
569
|
"use polling to wait for work (default: same as --poll)",
|
|
537
|
-
[](
|
|
570
|
+
[](common_params & params, int value) {
|
|
538
571
|
params.cpuparams_batch.poll = value;
|
|
539
572
|
}
|
|
540
573
|
));
|
|
541
|
-
add_opt(
|
|
542
|
-
{"-Cd", "--cpu-mask-draft"}, "M",
|
|
543
|
-
"Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask)",
|
|
544
|
-
[](gpt_params & params, const std::string & mask) {
|
|
545
|
-
params.draft_cpuparams.mask_valid = true;
|
|
546
|
-
if (!parse_cpu_mask(mask, params.draft_cpuparams.cpumask)) {
|
|
547
|
-
throw std::invalid_argument("invalid cpumask");
|
|
548
|
-
}
|
|
549
|
-
}
|
|
550
|
-
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
|
551
|
-
add_opt(llama_arg(
|
|
552
|
-
{"-Crd", "--cpu-range-draft"}, "lo-hi",
|
|
553
|
-
"Ranges of CPUs for affinity. Complements --cpu-mask-draft",
|
|
554
|
-
[](gpt_params & params, const std::string & range) {
|
|
555
|
-
params.draft_cpuparams.mask_valid = true;
|
|
556
|
-
if (!parse_cpu_range(range, params.draft_cpuparams.cpumask)) {
|
|
557
|
-
throw std::invalid_argument("invalid range");
|
|
558
|
-
}
|
|
559
|
-
}
|
|
560
|
-
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
|
561
|
-
add_opt(llama_arg(
|
|
562
|
-
{"--cpu-strict-draft"}, "<0|1>",
|
|
563
|
-
"Use strict CPU placement for draft model (default: same as --cpu-strict)",
|
|
564
|
-
[](gpt_params & params, int value) {
|
|
565
|
-
params.draft_cpuparams.strict_cpu = value;
|
|
566
|
-
}
|
|
567
|
-
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
|
568
|
-
add_opt(llama_arg(
|
|
569
|
-
{"--prio-draft"}, "N",
|
|
570
|
-
format("set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.draft_cpuparams.priority),
|
|
571
|
-
[](gpt_params & params, int prio) {
|
|
572
|
-
if (prio < 0 || prio > 3) {
|
|
573
|
-
throw std::invalid_argument("invalid value");
|
|
574
|
-
}
|
|
575
|
-
params.draft_cpuparams.priority = (enum ggml_sched_priority) prio;
|
|
576
|
-
}
|
|
577
|
-
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
|
578
|
-
add_opt(llama_arg(
|
|
579
|
-
{"--poll-draft"}, "<0|1>",
|
|
580
|
-
"Use polling to wait for draft model work (default: same as --poll])",
|
|
581
|
-
[](gpt_params & params, int value) {
|
|
582
|
-
params.draft_cpuparams.poll = value;
|
|
583
|
-
}
|
|
584
|
-
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
|
585
|
-
add_opt(llama_arg(
|
|
586
|
-
{"-Cbd", "--cpu-mask-batch-draft"}, "M",
|
|
587
|
-
"Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask)",
|
|
588
|
-
[](gpt_params & params, const std::string & mask) {
|
|
589
|
-
params.draft_cpuparams_batch.mask_valid = true;
|
|
590
|
-
if (!parse_cpu_mask(mask, params.draft_cpuparams_batch.cpumask)) {
|
|
591
|
-
throw std::invalid_argument("invalid cpumask");
|
|
592
|
-
}
|
|
593
|
-
}
|
|
594
|
-
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
|
595
|
-
add_opt(llama_arg(
|
|
596
|
-
{"-Crbd", "--cpu-range-batch-draft"}, "lo-hi",
|
|
597
|
-
"Ranges of CPUs for affinity. Complements --cpu-mask-draft-batch)",
|
|
598
|
-
[](gpt_params & params, const std::string & range) {
|
|
599
|
-
params.draft_cpuparams_batch.mask_valid = true;
|
|
600
|
-
if (!parse_cpu_range(range, params.draft_cpuparams_batch.cpumask)) {
|
|
601
|
-
throw std::invalid_argument("invalid cpumask");
|
|
602
|
-
}
|
|
603
|
-
}
|
|
604
|
-
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
|
605
|
-
add_opt(llama_arg(
|
|
606
|
-
{"--cpu-strict-batch-draft"}, "<0|1>",
|
|
607
|
-
"Use strict CPU placement for draft model (default: --cpu-strict-draft)",
|
|
608
|
-
[](gpt_params & params, int value) {
|
|
609
|
-
params.draft_cpuparams_batch.strict_cpu = value;
|
|
610
|
-
}
|
|
611
|
-
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
|
612
|
-
add_opt(llama_arg(
|
|
613
|
-
{"--prio-batch-draft"}, "N",
|
|
614
|
-
format("set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.draft_cpuparams_batch.priority),
|
|
615
|
-
[](gpt_params & params, int prio) {
|
|
616
|
-
if (prio < 0 || prio > 3) {
|
|
617
|
-
throw std::invalid_argument("invalid value");
|
|
618
|
-
}
|
|
619
|
-
params.draft_cpuparams_batch.priority = (enum ggml_sched_priority) prio;
|
|
620
|
-
}
|
|
621
|
-
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
|
622
|
-
add_opt(llama_arg(
|
|
623
|
-
{"--poll-batch-draft"}, "<0|1>",
|
|
624
|
-
"Use polling to wait for draft model work (default: --poll-draft)",
|
|
625
|
-
[](gpt_params & params, int value) {
|
|
626
|
-
params.draft_cpuparams_batch.poll = value;
|
|
627
|
-
}
|
|
628
|
-
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
|
629
|
-
add_opt(llama_arg(
|
|
630
|
-
{"--draft"}, "N",
|
|
631
|
-
format("number of tokens to draft for speculative decoding (default: %d)", params.n_draft),
|
|
632
|
-
[](gpt_params & params, int value) {
|
|
633
|
-
params.n_draft = value;
|
|
634
|
-
}
|
|
635
|
-
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP}));
|
|
636
|
-
add_opt(llama_arg(
|
|
637
|
-
{"-ps", "--p-split"}, "N",
|
|
638
|
-
format("speculative decoding split probability (default: %.1f)", (double)params.p_split),
|
|
639
|
-
[](gpt_params & params, const std::string & value) {
|
|
640
|
-
params.p_split = std::stof(value);
|
|
641
|
-
}
|
|
642
|
-
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
|
643
|
-
add_opt(llama_arg(
|
|
574
|
+
add_opt(common_arg(
|
|
644
575
|
{"-lcs", "--lookup-cache-static"}, "FNAME",
|
|
645
576
|
"path to static lookup cache to use for lookup decoding (not updated by generation)",
|
|
646
|
-
[](
|
|
577
|
+
[](common_params & params, const std::string & value) {
|
|
647
578
|
params.lookup_cache_static = value;
|
|
648
579
|
}
|
|
649
580
|
).set_examples({LLAMA_EXAMPLE_LOOKUP}));
|
|
650
|
-
add_opt(
|
|
581
|
+
add_opt(common_arg(
|
|
651
582
|
{"-lcd", "--lookup-cache-dynamic"}, "FNAME",
|
|
652
583
|
"path to dynamic lookup cache to use for lookup decoding (updated by generation)",
|
|
653
|
-
[](
|
|
584
|
+
[](common_params & params, const std::string & value) {
|
|
654
585
|
params.lookup_cache_dynamic = value;
|
|
655
586
|
}
|
|
656
587
|
).set_examples({LLAMA_EXAMPLE_LOOKUP}));
|
|
657
|
-
add_opt(
|
|
588
|
+
add_opt(common_arg(
|
|
658
589
|
{"-c", "--ctx-size"}, "N",
|
|
659
|
-
|
|
660
|
-
[](
|
|
590
|
+
string_format("size of the prompt context (default: %d, 0 = loaded from model)", params.n_ctx),
|
|
591
|
+
[](common_params & params, int value) {
|
|
661
592
|
params.n_ctx = value;
|
|
662
593
|
}
|
|
663
594
|
).set_env("LLAMA_ARG_CTX_SIZE"));
|
|
664
|
-
add_opt(
|
|
595
|
+
add_opt(common_arg(
|
|
665
596
|
{"-n", "--predict", "--n-predict"}, "N",
|
|
666
|
-
|
|
667
|
-
[](
|
|
597
|
+
string_format("number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)", params.n_predict),
|
|
598
|
+
[](common_params & params, int value) {
|
|
668
599
|
params.n_predict = value;
|
|
669
600
|
}
|
|
670
601
|
).set_env("LLAMA_ARG_N_PREDICT"));
|
|
671
|
-
add_opt(
|
|
602
|
+
add_opt(common_arg(
|
|
672
603
|
{"-b", "--batch-size"}, "N",
|
|
673
|
-
|
|
674
|
-
[](
|
|
604
|
+
string_format("logical maximum batch size (default: %d)", params.n_batch),
|
|
605
|
+
[](common_params & params, int value) {
|
|
675
606
|
params.n_batch = value;
|
|
676
607
|
}
|
|
677
608
|
).set_env("LLAMA_ARG_BATCH"));
|
|
678
|
-
add_opt(
|
|
609
|
+
add_opt(common_arg(
|
|
679
610
|
{"-ub", "--ubatch-size"}, "N",
|
|
680
|
-
|
|
681
|
-
[](
|
|
611
|
+
string_format("physical maximum batch size (default: %d)", params.n_ubatch),
|
|
612
|
+
[](common_params & params, int value) {
|
|
682
613
|
params.n_ubatch = value;
|
|
683
614
|
}
|
|
684
615
|
).set_env("LLAMA_ARG_UBATCH"));
|
|
685
|
-
add_opt(
|
|
616
|
+
add_opt(common_arg(
|
|
686
617
|
{"--keep"}, "N",
|
|
687
|
-
|
|
688
|
-
[](
|
|
618
|
+
string_format("number of tokens to keep from the initial prompt (default: %d, -1 = all)", params.n_keep),
|
|
619
|
+
[](common_params & params, int value) {
|
|
689
620
|
params.n_keep = value;
|
|
690
621
|
}
|
|
691
622
|
));
|
|
692
|
-
add_opt(
|
|
623
|
+
add_opt(common_arg(
|
|
693
624
|
{"--no-context-shift"},
|
|
694
|
-
|
|
695
|
-
[](
|
|
625
|
+
string_format("disables context shift on inifinite text generation (default: %s)", params.ctx_shift ? "disabled" : "enabled"),
|
|
626
|
+
[](common_params & params) {
|
|
696
627
|
params.ctx_shift = false;
|
|
697
628
|
}
|
|
698
|
-
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_CONTEXT_SHIFT"));
|
|
699
|
-
add_opt(
|
|
629
|
+
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY}).set_env("LLAMA_ARG_NO_CONTEXT_SHIFT"));
|
|
630
|
+
add_opt(common_arg(
|
|
700
631
|
{"--chunks"}, "N",
|
|
701
|
-
|
|
702
|
-
[](
|
|
632
|
+
string_format("max number of chunks to process (default: %d, -1 = all)", params.n_chunks),
|
|
633
|
+
[](common_params & params, int value) {
|
|
703
634
|
params.n_chunks = value;
|
|
704
635
|
}
|
|
705
636
|
).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_RETRIEVAL}));
|
|
706
|
-
add_opt(
|
|
637
|
+
add_opt(common_arg(
|
|
707
638
|
{"-fa", "--flash-attn"},
|
|
708
|
-
|
|
709
|
-
[](
|
|
639
|
+
string_format("enable Flash Attention (default: %s)", params.flash_attn ? "enabled" : "disabled"),
|
|
640
|
+
[](common_params & params) {
|
|
710
641
|
params.flash_attn = true;
|
|
711
642
|
}
|
|
712
643
|
).set_env("LLAMA_ARG_FLASH_ATTN"));
|
|
713
|
-
add_opt(
|
|
644
|
+
add_opt(common_arg(
|
|
714
645
|
{"-p", "--prompt"}, "PROMPT",
|
|
715
646
|
ex == LLAMA_EXAMPLE_MAIN
|
|
716
647
|
? "prompt to start generation with\nif -cnv is set, this will be used as system prompt"
|
|
717
648
|
: "prompt to start generation with",
|
|
718
|
-
[](
|
|
649
|
+
[](common_params & params, const std::string & value) {
|
|
719
650
|
params.prompt = value;
|
|
720
651
|
}
|
|
721
652
|
));
|
|
722
|
-
add_opt(
|
|
653
|
+
add_opt(common_arg(
|
|
723
654
|
{"--no-perf"},
|
|
724
|
-
|
|
725
|
-
[](
|
|
655
|
+
string_format("disable internal libllama performance timings (default: %s)", params.no_perf ? "true" : "false"),
|
|
656
|
+
[](common_params & params) {
|
|
726
657
|
params.no_perf = true;
|
|
727
|
-
params.
|
|
658
|
+
params.sampling.no_perf = true;
|
|
728
659
|
}
|
|
729
660
|
).set_env("LLAMA_ARG_NO_PERF"));
|
|
730
|
-
add_opt(
|
|
661
|
+
add_opt(common_arg(
|
|
731
662
|
{"-f", "--file"}, "FNAME",
|
|
732
663
|
"a file containing the prompt (default: none)",
|
|
733
|
-
[](
|
|
664
|
+
[](common_params & params, const std::string & value) {
|
|
734
665
|
std::ifstream file(value);
|
|
735
666
|
if (!file) {
|
|
736
|
-
throw std::runtime_error(
|
|
667
|
+
throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str()));
|
|
737
668
|
}
|
|
738
669
|
// store the external file name in params
|
|
739
670
|
params.prompt_file = value;
|
|
@@ -743,24 +674,24 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
|
|
|
743
674
|
}
|
|
744
675
|
}
|
|
745
676
|
));
|
|
746
|
-
add_opt(
|
|
677
|
+
add_opt(common_arg(
|
|
747
678
|
{"--in-file"}, "FNAME",
|
|
748
679
|
"an input file (repeat to specify multiple files)",
|
|
749
|
-
[](
|
|
680
|
+
[](common_params & params, const std::string & value) {
|
|
750
681
|
std::ifstream file(value);
|
|
751
682
|
if (!file) {
|
|
752
|
-
throw std::runtime_error(
|
|
683
|
+
throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str()));
|
|
753
684
|
}
|
|
754
685
|
params.in_files.push_back(value);
|
|
755
686
|
}
|
|
756
687
|
).set_examples({LLAMA_EXAMPLE_IMATRIX}));
|
|
757
|
-
add_opt(
|
|
688
|
+
add_opt(common_arg(
|
|
758
689
|
{"-bf", "--binary-file"}, "FNAME",
|
|
759
690
|
"binary file containing the prompt (default: none)",
|
|
760
|
-
[](
|
|
691
|
+
[](common_params & params, const std::string & value) {
|
|
761
692
|
std::ifstream file(value, std::ios::binary);
|
|
762
693
|
if (!file) {
|
|
763
|
-
throw std::runtime_error(
|
|
694
|
+
throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str()));
|
|
764
695
|
}
|
|
765
696
|
// store the external file name in params
|
|
766
697
|
params.prompt_file = value;
|
|
@@ -770,287 +701,351 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
|
|
|
770
701
|
fprintf(stderr, "Read %zu bytes from binary file %s\n", params.prompt.size(), value.c_str());
|
|
771
702
|
}
|
|
772
703
|
));
|
|
773
|
-
add_opt(
|
|
704
|
+
add_opt(common_arg(
|
|
774
705
|
{"-e", "--escape"},
|
|
775
|
-
|
|
776
|
-
[](
|
|
706
|
+
string_format("process escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\) (default: %s)", params.escape ? "true" : "false"),
|
|
707
|
+
[](common_params & params) {
|
|
777
708
|
params.escape = true;
|
|
778
709
|
}
|
|
779
710
|
));
|
|
780
|
-
add_opt(
|
|
711
|
+
add_opt(common_arg(
|
|
781
712
|
{"--no-escape"},
|
|
782
713
|
"do not process escape sequences",
|
|
783
|
-
[](
|
|
714
|
+
[](common_params & params) {
|
|
784
715
|
params.escape = false;
|
|
785
716
|
}
|
|
786
717
|
));
|
|
787
|
-
add_opt(
|
|
718
|
+
add_opt(common_arg(
|
|
788
719
|
{"-ptc", "--print-token-count"}, "N",
|
|
789
|
-
|
|
790
|
-
[](
|
|
720
|
+
string_format("print token count every N tokens (default: %d)", params.n_print),
|
|
721
|
+
[](common_params & params, int value) {
|
|
791
722
|
params.n_print = value;
|
|
792
723
|
}
|
|
793
724
|
).set_examples({LLAMA_EXAMPLE_MAIN}));
|
|
794
|
-
add_opt(
|
|
725
|
+
add_opt(common_arg(
|
|
795
726
|
{"--prompt-cache"}, "FNAME",
|
|
796
727
|
"file to cache prompt state for faster startup (default: none)",
|
|
797
|
-
[](
|
|
728
|
+
[](common_params & params, const std::string & value) {
|
|
798
729
|
params.path_prompt_cache = value;
|
|
799
730
|
}
|
|
800
731
|
).set_examples({LLAMA_EXAMPLE_MAIN}));
|
|
801
|
-
add_opt(
|
|
732
|
+
add_opt(common_arg(
|
|
802
733
|
{"--prompt-cache-all"},
|
|
803
734
|
"if specified, saves user input and generations to cache as well\n",
|
|
804
|
-
[](
|
|
735
|
+
[](common_params & params) {
|
|
805
736
|
params.prompt_cache_all = true;
|
|
806
737
|
}
|
|
807
738
|
).set_examples({LLAMA_EXAMPLE_MAIN}));
|
|
808
|
-
add_opt(
|
|
739
|
+
add_opt(common_arg(
|
|
809
740
|
{"--prompt-cache-ro"},
|
|
810
741
|
"if specified, uses the prompt cache but does not update it",
|
|
811
|
-
[](
|
|
742
|
+
[](common_params & params) {
|
|
812
743
|
params.prompt_cache_ro = true;
|
|
813
744
|
}
|
|
814
745
|
).set_examples({LLAMA_EXAMPLE_MAIN}));
|
|
815
|
-
add_opt(
|
|
746
|
+
add_opt(common_arg(
|
|
816
747
|
{"-r", "--reverse-prompt"}, "PROMPT",
|
|
817
748
|
"halt generation at PROMPT, return control in interactive mode\n",
|
|
818
|
-
[](
|
|
749
|
+
[](common_params & params, const std::string & value) {
|
|
819
750
|
params.antiprompt.emplace_back(value);
|
|
820
751
|
}
|
|
821
752
|
).set_examples({LLAMA_EXAMPLE_MAIN}));
|
|
822
|
-
add_opt(
|
|
753
|
+
add_opt(common_arg(
|
|
823
754
|
{"-sp", "--special"},
|
|
824
|
-
|
|
825
|
-
[](
|
|
755
|
+
string_format("special tokens output enabled (default: %s)", params.special ? "true" : "false"),
|
|
756
|
+
[](common_params & params) {
|
|
826
757
|
params.special = true;
|
|
827
758
|
}
|
|
828
759
|
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}));
|
|
829
|
-
add_opt(
|
|
760
|
+
add_opt(common_arg(
|
|
830
761
|
{"-cnv", "--conversation"},
|
|
831
|
-
|
|
762
|
+
string_format(
|
|
832
763
|
"run in conversation mode:\n"
|
|
833
764
|
"- does not print special tokens and suffix/prefix\n"
|
|
834
765
|
"- interactive mode is also enabled\n"
|
|
835
766
|
"(default: %s)",
|
|
836
767
|
params.conversation ? "true" : "false"
|
|
837
768
|
),
|
|
838
|
-
[](
|
|
769
|
+
[](common_params & params) {
|
|
839
770
|
params.conversation = true;
|
|
840
771
|
}
|
|
841
772
|
).set_examples({LLAMA_EXAMPLE_MAIN}));
|
|
842
|
-
add_opt(
|
|
773
|
+
add_opt(common_arg(
|
|
843
774
|
{"-i", "--interactive"},
|
|
844
|
-
|
|
845
|
-
[](
|
|
775
|
+
string_format("run in interactive mode (default: %s)", params.interactive ? "true" : "false"),
|
|
776
|
+
[](common_params & params) {
|
|
846
777
|
params.interactive = true;
|
|
847
778
|
}
|
|
848
779
|
).set_examples({LLAMA_EXAMPLE_MAIN}));
|
|
849
|
-
add_opt(
|
|
780
|
+
add_opt(common_arg(
|
|
850
781
|
{"-if", "--interactive-first"},
|
|
851
|
-
|
|
852
|
-
[](
|
|
782
|
+
string_format("run in interactive mode and wait for input right away (default: %s)", params.interactive_first ? "true" : "false"),
|
|
783
|
+
[](common_params & params) {
|
|
853
784
|
params.interactive_first = true;
|
|
854
785
|
}
|
|
855
786
|
).set_examples({LLAMA_EXAMPLE_MAIN}));
|
|
856
|
-
add_opt(
|
|
787
|
+
add_opt(common_arg(
|
|
857
788
|
{"-mli", "--multiline-input"},
|
|
858
789
|
"allows you to write or paste multiple lines without ending each in '\\'",
|
|
859
|
-
[](
|
|
790
|
+
[](common_params & params) {
|
|
860
791
|
params.multiline_input = true;
|
|
861
792
|
}
|
|
862
793
|
).set_examples({LLAMA_EXAMPLE_MAIN}));
|
|
863
|
-
add_opt(
|
|
794
|
+
add_opt(common_arg(
|
|
864
795
|
{"--in-prefix-bos"},
|
|
865
796
|
"prefix BOS to user inputs, preceding the `--in-prefix` string",
|
|
866
|
-
[](
|
|
797
|
+
[](common_params & params) {
|
|
867
798
|
params.input_prefix_bos = true;
|
|
868
799
|
params.enable_chat_template = false;
|
|
869
800
|
}
|
|
870
801
|
).set_examples({LLAMA_EXAMPLE_MAIN}));
|
|
871
|
-
add_opt(
|
|
802
|
+
add_opt(common_arg(
|
|
872
803
|
{"--in-prefix"}, "STRING",
|
|
873
804
|
"string to prefix user inputs with (default: empty)",
|
|
874
|
-
[](
|
|
805
|
+
[](common_params & params, const std::string & value) {
|
|
875
806
|
params.input_prefix = value;
|
|
876
807
|
params.enable_chat_template = false;
|
|
877
808
|
}
|
|
878
809
|
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL}));
|
|
879
|
-
add_opt(
|
|
810
|
+
add_opt(common_arg(
|
|
880
811
|
{"--in-suffix"}, "STRING",
|
|
881
812
|
"string to suffix after user inputs with (default: empty)",
|
|
882
|
-
[](
|
|
813
|
+
[](common_params & params, const std::string & value) {
|
|
883
814
|
params.input_suffix = value;
|
|
884
815
|
params.enable_chat_template = false;
|
|
885
816
|
}
|
|
886
817
|
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL}));
|
|
887
|
-
add_opt(
|
|
818
|
+
add_opt(common_arg(
|
|
888
819
|
{"--no-warmup"},
|
|
889
820
|
"skip warming up the model with an empty run",
|
|
890
|
-
[](
|
|
821
|
+
[](common_params & params) {
|
|
891
822
|
params.warmup = false;
|
|
892
823
|
}
|
|
893
|
-
).set_examples({LLAMA_EXAMPLE_MAIN}));
|
|
894
|
-
add_opt(
|
|
824
|
+
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}));
|
|
825
|
+
add_opt(common_arg(
|
|
895
826
|
{"--spm-infill"},
|
|
896
|
-
|
|
827
|
+
string_format(
|
|
897
828
|
"use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: %s)",
|
|
898
829
|
params.spm_infill ? "enabled" : "disabled"
|
|
899
830
|
),
|
|
900
|
-
[](
|
|
831
|
+
[](common_params & params) {
|
|
901
832
|
params.spm_infill = true;
|
|
902
833
|
}
|
|
903
834
|
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_INFILL}));
|
|
-    add_opt(
+    add_opt(common_arg(
         {"--samplers"}, "SAMPLERS",
-
-        [](
-            const auto sampler_names = string_split(value, ';');
-            params.
+        string_format("samplers that will be used for generation in the order, separated by \';\'\n(default: %s)", sampler_type_names.c_str()),
+        [](common_params & params, const std::string & value) {
+            const auto sampler_names = string_split<std::string>(value, ';');
+            params.sampling.samplers = common_sampler_types_from_names(sampler_names, true);
         }
     ).set_sparam());
-    add_opt(
+    add_opt(common_arg(
         {"-s", "--seed"}, "SEED",
-
-        [](
-            params.
+        string_format("RNG seed (default: %d, use random seed for %d)", params.sampling.seed, LLAMA_DEFAULT_SEED),
+        [](common_params & params, const std::string & value) {
+            params.sampling.seed = std::stoul(value);
         }
     ).set_sparam());
-    add_opt(
-        {"--sampling-seq"}, "SEQUENCE",
-
-        [](
-            params.
+    add_opt(common_arg(
+        {"--sampling-seq", "--sampler-seq"}, "SEQUENCE",
+        string_format("simplified sequence for samplers that will be used (default: %s)", sampler_type_chars.c_str()),
+        [](common_params & params, const std::string & value) {
+            params.sampling.samplers = common_sampler_types_from_chars(value);
         }
     ).set_sparam());
-    add_opt(
+    add_opt(common_arg(
         {"--ignore-eos"},
         "ignore end of stream token and continue generating (implies --logit-bias EOS-inf)",
-        [](
-            params.
-        }
-    ).set_sparam());
-    add_opt(llama_arg(
-        {"--penalize-nl"},
-        format("penalize newline tokens (default: %s)", params.sparams.penalize_nl ? "true" : "false"),
-        [](gpt_params & params) {
-            params.sparams.penalize_nl = true;
+        [](common_params & params) {
+            params.sampling.ignore_eos = true;
         }
     ).set_sparam());
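Throughout this block the sampler state moves from `params.sparams` to `params.sampling`, the `--penalize-nl` option is dropped, and the sampler helpers gain a `common_` prefix (`common_sampler_types_from_names`, `common_sampler_types_from_chars`). A minimal sketch of driving the same fields from code, assuming the `common.h`/`sampling.h` headers of this llama.cpp revision are on the include path:

```cpp
#include <algorithm>
#include <string>
#include "common.h"    // common_params (assumed header layout for this revision)
#include "sampling.h"  // common_sampler_types_from_chars (assumed)

// Configure sampling the same way the CLI handlers above do.
static void configure_sampling(common_params & params) {
    params.sampling.seed = 1234;                  // fixed RNG seed, as --seed would set
    float temp = 0.8f;
    params.sampling.temp = std::max(temp, 0.0f);  // the --temp handler clamps negatives to 0
    // Shorthand characters as accepted by --sampling-seq; the exact letter-to-sampler
    // mapping is defined by the sampler table in common/sampling.cpp.
    params.sampling.samplers = common_sampler_types_from_chars("kp");
}
```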
940
|
-
add_opt(
|
|
864
|
+
add_opt(common_arg(
|
|
941
865
|
{"--temp"}, "N",
|
|
942
|
-
|
|
943
|
-
[](
|
|
944
|
-
params.
|
|
945
|
-
params.
|
|
866
|
+
string_format("temperature (default: %.1f)", (double)params.sampling.temp),
|
|
867
|
+
[](common_params & params, const std::string & value) {
|
|
868
|
+
params.sampling.temp = std::stof(value);
|
|
869
|
+
params.sampling.temp = std::max(params.sampling.temp, 0.0f);
|
|
946
870
|
}
|
|
947
871
|
).set_sparam());
|
|
948
|
-
add_opt(
|
|
872
|
+
add_opt(common_arg(
|
|
949
873
|
{"--top-k"}, "N",
|
|
950
|
-
|
|
951
|
-
[](
|
|
952
|
-
params.
|
|
874
|
+
string_format("top-k sampling (default: %d, 0 = disabled)", params.sampling.top_k),
|
|
875
|
+
[](common_params & params, int value) {
|
|
876
|
+
params.sampling.top_k = value;
|
|
953
877
|
}
|
|
954
878
|
).set_sparam());
|
|
955
|
-
add_opt(
|
|
879
|
+
add_opt(common_arg(
|
|
956
880
|
{"--top-p"}, "N",
|
|
957
|
-
|
|
958
|
-
[](
|
|
959
|
-
params.
|
|
881
|
+
string_format("top-p sampling (default: %.1f, 1.0 = disabled)", (double)params.sampling.top_p),
|
|
882
|
+
[](common_params & params, const std::string & value) {
|
|
883
|
+
params.sampling.top_p = std::stof(value);
|
|
960
884
|
}
|
|
961
885
|
).set_sparam());
|
|
962
|
-
add_opt(
|
|
886
|
+
add_opt(common_arg(
|
|
963
887
|
{"--min-p"}, "N",
|
|
964
|
-
|
|
965
|
-
[](
|
|
966
|
-
params.
|
|
888
|
+
string_format("min-p sampling (default: %.1f, 0.0 = disabled)", (double)params.sampling.min_p),
|
|
889
|
+
[](common_params & params, const std::string & value) {
|
|
890
|
+
params.sampling.min_p = std::stof(value);
|
|
967
891
|
}
|
|
968
892
|
).set_sparam());
|
|
-    add_opt(
-        {"--
-
-        [](
-            params.
+    add_opt(common_arg(
+        {"--xtc-probability"}, "N",
+        string_format("xtc probability (default: %.1f, 0.0 = disabled)", (double)params.sampling.xtc_probability),
+        [](common_params & params, const std::string & value) {
+            params.sampling.xtc_probability = std::stof(value);
         }
     ).set_sparam());
-    add_opt(
+    add_opt(common_arg(
+        {"--xtc-threshold"}, "N",
+        string_format("xtc threshold (default: %.1f, 1.0 = disabled)", (double)params.sampling.xtc_threshold),
+        [](common_params & params, const std::string & value) {
+            params.sampling.xtc_threshold = std::stof(value);
+        }
+    ).set_sparam());
907
|
+
add_opt(common_arg(
|
|
977
908
|
{"--typical"}, "N",
|
|
978
|
-
|
|
979
|
-
[](
|
|
980
|
-
params.
|
|
909
|
+
string_format("locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)", (double)params.sampling.typ_p),
|
|
910
|
+
[](common_params & params, const std::string & value) {
|
|
911
|
+
params.sampling.typ_p = std::stof(value);
|
|
981
912
|
}
|
|
982
913
|
).set_sparam());
|
|
983
|
-
add_opt(
|
|
914
|
+
add_opt(common_arg(
|
|
984
915
|
{"--repeat-last-n"}, "N",
|
|
985
|
-
|
|
986
|
-
[](
|
|
987
|
-
|
|
988
|
-
|
|
916
|
+
string_format("last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)", params.sampling.penalty_last_n),
|
|
917
|
+
[](common_params & params, int value) {
|
|
918
|
+
if (value < -1) {
|
|
919
|
+
throw std::runtime_error(string_format("error: invalid repeat-last-n = %d\n", value));
|
|
920
|
+
}
|
|
921
|
+
params.sampling.penalty_last_n = value;
|
|
922
|
+
params.sampling.n_prev = std::max(params.sampling.n_prev, params.sampling.penalty_last_n);
|
|
989
923
|
}
|
|
990
924
|
).set_sparam());
|
|
991
|
-
add_opt(
|
|
925
|
+
add_opt(common_arg(
|
|
992
926
|
{"--repeat-penalty"}, "N",
|
|
993
|
-
|
|
994
|
-
[](
|
|
995
|
-
params.
|
|
927
|
+
string_format("penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)", (double)params.sampling.penalty_repeat),
|
|
928
|
+
[](common_params & params, const std::string & value) {
|
|
929
|
+
params.sampling.penalty_repeat = std::stof(value);
|
|
996
930
|
}
|
|
997
931
|
).set_sparam());
|
|
998
|
-
add_opt(
|
|
932
|
+
add_opt(common_arg(
|
|
999
933
|
{"--presence-penalty"}, "N",
|
|
1000
|
-
|
|
1001
|
-
[](
|
|
1002
|
-
params.
|
|
934
|
+
string_format("repeat alpha presence penalty (default: %.1f, 0.0 = disabled)", (double)params.sampling.penalty_present),
|
|
935
|
+
[](common_params & params, const std::string & value) {
|
|
936
|
+
params.sampling.penalty_present = std::stof(value);
|
|
1003
937
|
}
|
|
1004
938
|
).set_sparam());
|
|
-    add_opt(
+    add_opt(common_arg(
         {"--frequency-penalty"}, "N",
-
-        [](
-            params.
+        string_format("repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)", (double)params.sampling.penalty_freq),
+        [](common_params & params, const std::string & value) {
+            params.sampling.penalty_freq = std::stof(value);
+        }
+    ).set_sparam());
+    add_opt(common_arg(
+        {"--dry-multiplier"}, "N",
+        string_format("set DRY sampling multiplier (default: %.1f, 0.0 = disabled)", (double)params.sampling.dry_multiplier),
+        [](common_params & params, const std::string & value) {
+            params.sampling.dry_multiplier = std::stof(value);
+        }
+    ).set_sparam());
+    add_opt(common_arg(
+        {"--dry-base"}, "N",
+        string_format("set DRY sampling base value (default: %.2f)", (double)params.sampling.dry_base),
+        [](common_params & params, const std::string & value) {
+            float potential_base = std::stof(value);
+            if (potential_base >= 1.0f)
+            {
+                params.sampling.dry_base = potential_base;
+            }
+        }
+    ).set_sparam());
+    add_opt(common_arg(
+        {"--dry-allowed-length"}, "N",
+        string_format("set allowed length for DRY sampling (default: %d)", params.sampling.dry_allowed_length),
+        [](common_params & params, int value) {
+            params.sampling.dry_allowed_length = value;
+        }
+    ).set_sparam());
+    add_opt(common_arg(
+        {"--dry-penalty-last-n"}, "N",
+        string_format("set DRY penalty for the last n tokens (default: %d, 0 = disable, -1 = context size)", params.sampling.dry_penalty_last_n),
+        [](common_params & params, int value) {
+            if (value < -1) {
+                throw std::runtime_error(string_format("error: invalid dry-penalty-last-n = %d\n", value));
+            }
+            params.sampling.dry_penalty_last_n = value;
+        }
+    ).set_sparam());
+    add_opt(common_arg(
+        {"--dry-sequence-breaker"}, "STRING",
+        string_format("add sequence breaker for DRY sampling, clearing out default breakers (%s) in the process; use \"none\" to not use any sequence breakers\n",
+            params.sampling.dry_sequence_breakers.empty() ? "none" :
+            std::accumulate(std::next(params.sampling.dry_sequence_breakers.begin()),
+                params.sampling.dry_sequence_breakers.end(),
+                std::string("'") + (params.sampling.dry_sequence_breakers[0] == "\n" ? "\\n" : params.sampling.dry_sequence_breakers[0]) + "'",
+                [](const std::string& a, const std::string& b) {
+                    std::string formatted_b = (b == "\n") ? "\\n" : b;
+                    return a + ", '" + formatted_b + "'";
+                }).c_str()),
+        [](common_params & params, const std::string & value) {
+            static bool defaults_cleared = false;
+
+            if (!defaults_cleared) {
+                params.sampling.dry_sequence_breakers.clear();
+                defaults_cleared = true;
+            }
+
+            if (value == "none") {
+                params.sampling.dry_sequence_breakers.clear();
+            } else {
+                params.sampling.dry_sequence_breakers.emplace_back(value);
+            }
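The new DRY (don't-repeat-yourself) options above validate their input before writing into `params.sampling`: `--dry-base` silently ignores values below 1.0 and `--dry-penalty-last-n` rejects anything below -1. A small sketch mirroring those checks, assuming the same `common_params` struct from this revision's `common.h`:

```cpp
#include <stdexcept>
#include "common.h"  // common_params (assumed header for this revision)

// Apply DRY settings with the same validation the CLI handlers above perform.
// The concrete values passed in are purely illustrative.
static void set_dry_params(common_params & params, float base, int penalty_last_n) {
    if (base >= 1.0f) {
        params.sampling.dry_base = base;        // --dry-base only accepts values >= 1.0
    }
    if (penalty_last_n < -1) {
        throw std::invalid_argument("dry-penalty-last-n must be >= -1 (-1 = context size)");
    }
    params.sampling.dry_penalty_last_n = penalty_last_n;
    params.sampling.dry_multiplier     = 0.8f;  // example value; 0.0 disables DRY entirely
}
```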
1010
1005
|
}
|
|
1011
1006
|
).set_sparam());
|
|
1012
|
-
add_opt(
|
|
1007
|
+
add_opt(common_arg(
|
|
1013
1008
|
{"--dynatemp-range"}, "N",
|
|
1014
|
-
|
|
1015
|
-
[](
|
|
1016
|
-
params.
|
|
1009
|
+
string_format("dynamic temperature range (default: %.1f, 0.0 = disabled)", (double)params.sampling.dynatemp_range),
|
|
1010
|
+
[](common_params & params, const std::string & value) {
|
|
1011
|
+
params.sampling.dynatemp_range = std::stof(value);
|
|
1017
1012
|
}
|
|
1018
1013
|
).set_sparam());
|
|
1019
|
-
add_opt(
|
|
1014
|
+
add_opt(common_arg(
|
|
1020
1015
|
{"--dynatemp-exp"}, "N",
|
|
1021
|
-
|
|
1022
|
-
[](
|
|
1023
|
-
params.
|
|
1016
|
+
string_format("dynamic temperature exponent (default: %.1f)", (double)params.sampling.dynatemp_exponent),
|
|
1017
|
+
[](common_params & params, const std::string & value) {
|
|
1018
|
+
params.sampling.dynatemp_exponent = std::stof(value);
|
|
1024
1019
|
}
|
|
1025
1020
|
).set_sparam());
|
|
1026
|
-
add_opt(
|
|
1021
|
+
add_opt(common_arg(
|
|
1027
1022
|
{"--mirostat"}, "N",
|
|
1028
|
-
|
|
1029
|
-
"(default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)", params.
|
|
1030
|
-
[](
|
|
1031
|
-
params.
|
|
1023
|
+
string_format("use Mirostat sampling.\nTop K, Nucleus and Locally Typical samplers are ignored if used.\n"
|
|
1024
|
+
"(default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)", params.sampling.mirostat),
|
|
1025
|
+
[](common_params & params, int value) {
|
|
1026
|
+
params.sampling.mirostat = value;
|
|
1032
1027
|
}
|
|
1033
1028
|
).set_sparam());
|
|
1034
|
-
add_opt(
|
|
1029
|
+
add_opt(common_arg(
|
|
1035
1030
|
{"--mirostat-lr"}, "N",
|
|
1036
|
-
|
|
1037
|
-
[](
|
|
1038
|
-
params.
|
|
1031
|
+
string_format("Mirostat learning rate, parameter eta (default: %.1f)", (double)params.sampling.mirostat_eta),
|
|
1032
|
+
[](common_params & params, const std::string & value) {
|
|
1033
|
+
params.sampling.mirostat_eta = std::stof(value);
|
|
1039
1034
|
}
|
|
1040
1035
|
).set_sparam());
|
|
1041
|
-
add_opt(
|
|
1036
|
+
add_opt(common_arg(
|
|
1042
1037
|
{"--mirostat-ent"}, "N",
|
|
1043
|
-
|
|
1044
|
-
[](
|
|
1045
|
-
params.
|
|
1038
|
+
string_format("Mirostat target entropy, parameter tau (default: %.1f)", (double)params.sampling.mirostat_tau),
|
|
1039
|
+
[](common_params & params, const std::string & value) {
|
|
1040
|
+
params.sampling.mirostat_tau = std::stof(value);
|
|
1046
1041
|
}
|
|
1047
1042
|
).set_sparam());
|
|
1048
|
-
add_opt(
|
|
1043
|
+
add_opt(common_arg(
|
|
1049
1044
|
{"-l", "--logit-bias"}, "TOKEN_ID(+/-)BIAS",
|
|
1050
1045
|
"modifies the likelihood of token appearing in the completion,\n"
|
|
1051
1046
|
"i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',\n"
|
|
1052
1047
|
"or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'",
|
|
1053
|
-
[](
|
|
1048
|
+
[](common_params & params, const std::string & value) {
|
|
1054
1049
|
std::stringstream ss(value);
|
|
1055
1050
|
llama_token key;
|
|
1056
1051
|
char sign;
|
|
@@ -1058,7 +1053,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
|
|
|
1058
1053
|
try {
|
|
1059
1054
|
if (ss >> key && ss >> sign && std::getline(ss, value_str) && (sign == '+' || sign == '-')) {
|
|
1060
1055
|
const float bias = std::stof(value_str) * ((sign == '-') ? -1.0f : 1.0f);
|
|
1061
|
-
params.
|
|
1056
|
+
params.sampling.logit_bias.push_back({key, bias});
|
|
1062
1057
|
} else {
|
|
1063
1058
|
throw std::invalid_argument("invalid input format");
|
|
1064
1059
|
}
|
|
@@ -1067,39 +1062,39 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
|
|
|
1067
1062
|
}
|
|
1068
1063
|
}
|
|
1069
1064
|
).set_sparam());
|
|
1070
|
-
add_opt(
|
|
1065
|
+
add_opt(common_arg(
|
|
1071
1066
|
{"--grammar"}, "GRAMMAR",
|
|
1072
|
-
|
|
1073
|
-
[](
|
|
1074
|
-
params.
|
|
1067
|
+
string_format("BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '%s')", params.sampling.grammar.c_str()),
|
|
1068
|
+
[](common_params & params, const std::string & value) {
|
|
1069
|
+
params.sampling.grammar = value;
|
|
1075
1070
|
}
|
|
1076
1071
|
).set_sparam());
|
|
1077
|
-
add_opt(
|
|
1072
|
+
add_opt(common_arg(
|
|
1078
1073
|
{"--grammar-file"}, "FNAME",
|
|
1079
1074
|
"file to read grammar from",
|
|
1080
|
-
[](
|
|
1075
|
+
[](common_params & params, const std::string & value) {
|
|
1081
1076
|
std::ifstream file(value);
|
|
1082
1077
|
if (!file) {
|
|
1083
|
-
throw std::runtime_error(
|
|
1078
|
+
throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str()));
|
|
1084
1079
|
}
|
|
1085
1080
|
std::copy(
|
|
1086
1081
|
std::istreambuf_iterator<char>(file),
|
|
1087
1082
|
std::istreambuf_iterator<char>(),
|
|
1088
|
-
std::back_inserter(params.
|
|
1083
|
+
std::back_inserter(params.sampling.grammar)
|
|
1089
1084
|
);
|
|
1090
1085
|
}
|
|
1091
1086
|
).set_sparam());
|
|
1092
|
-
add_opt(
|
|
1087
|
+
add_opt(common_arg(
|
|
1093
1088
|
{"-j", "--json-schema"}, "SCHEMA",
|
|
1094
1089
|
"JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object\nFor schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead",
|
|
1095
|
-
[](
|
|
1096
|
-
params.
|
|
1090
|
+
[](common_params & params, const std::string & value) {
|
|
1091
|
+
params.sampling.grammar = json_schema_to_grammar(json::parse(value));
|
|
1097
1092
|
}
|
|
1098
1093
|
).set_sparam());
|
|
1099
|
-
add_opt(
|
|
1094
|
+
add_opt(common_arg(
|
|
1100
1095
|
{"--pooling"}, "{none,mean,cls,last,rank}",
|
|
1101
1096
|
"pooling type for embeddings, use model default if unspecified",
|
|
1102
|
-
[](
|
|
1097
|
+
[](common_params & params, const std::string & value) {
|
|
1103
1098
|
/**/ if (value == "none") { params.pooling_type = LLAMA_POOLING_TYPE_NONE; }
|
|
1104
1099
|
else if (value == "mean") { params.pooling_type = LLAMA_POOLING_TYPE_MEAN; }
|
|
1105
1100
|
else if (value == "cls") { params.pooling_type = LLAMA_POOLING_TYPE_CLS; }
|
|
@@ -1108,275 +1103,285 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
|
|
|
1108
1103
|
else { throw std::invalid_argument("invalid value"); }
|
|
1109
1104
|
}
|
|
1110
1105
|
).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_POOLING"));
|
|
1111
|
-
add_opt(
|
|
1112
|
-
{"--attention"}, "{causal,non
|
|
1106
|
+
add_opt(common_arg(
|
|
1107
|
+
{"--attention"}, "{causal,non-causal}",
|
|
1113
1108
|
"attention type for embeddings, use model default if unspecified",
|
|
1114
|
-
[](
|
|
1109
|
+
[](common_params & params, const std::string & value) {
|
|
1115
1110
|
/**/ if (value == "causal") { params.attention_type = LLAMA_ATTENTION_TYPE_CAUSAL; }
|
|
1116
1111
|
else if (value == "non-causal") { params.attention_type = LLAMA_ATTENTION_TYPE_NON_CAUSAL; }
|
|
1117
1112
|
else { throw std::invalid_argument("invalid value"); }
|
|
1118
1113
|
}
|
|
1119
1114
|
).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
|
|
1120
|
-
add_opt(
|
|
1115
|
+
add_opt(common_arg(
|
|
1121
1116
|
{"--rope-scaling"}, "{none,linear,yarn}",
|
|
1122
1117
|
"RoPE frequency scaling method, defaults to linear unless specified by the model",
|
|
1123
|
-
[](
|
|
1118
|
+
[](common_params & params, const std::string & value) {
|
|
1124
1119
|
/**/ if (value == "none") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_NONE; }
|
|
1125
1120
|
else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_LINEAR; }
|
|
1126
1121
|
else if (value == "yarn") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_YARN; }
|
|
1127
1122
|
else { throw std::invalid_argument("invalid value"); }
|
|
1128
1123
|
}
|
|
1129
1124
|
).set_env("LLAMA_ARG_ROPE_SCALING_TYPE"));
|
|
1130
|
-
add_opt(
|
|
1125
|
+
add_opt(common_arg(
|
|
1131
1126
|
{"--rope-scale"}, "N",
|
|
1132
1127
|
"RoPE context scaling factor, expands context by a factor of N",
|
|
1133
|
-
[](
|
|
1128
|
+
[](common_params & params, const std::string & value) {
|
|
1134
1129
|
params.rope_freq_scale = 1.0f / std::stof(value);
|
|
1135
1130
|
}
|
|
1136
1131
|
).set_env("LLAMA_ARG_ROPE_SCALE"));
|
|
1137
|
-
add_opt(
|
|
1132
|
+
add_opt(common_arg(
|
|
1138
1133
|
{"--rope-freq-base"}, "N",
|
|
1139
1134
|
"RoPE base frequency, used by NTK-aware scaling (default: loaded from model)",
|
|
1140
|
-
[](
|
|
1135
|
+
[](common_params & params, const std::string & value) {
|
|
1141
1136
|
params.rope_freq_base = std::stof(value);
|
|
1142
1137
|
}
|
|
1143
1138
|
).set_env("LLAMA_ARG_ROPE_FREQ_BASE"));
|
|
1144
|
-
add_opt(
|
|
1139
|
+
add_opt(common_arg(
|
|
1145
1140
|
{"--rope-freq-scale"}, "N",
|
|
1146
1141
|
"RoPE frequency scaling factor, expands context by a factor of 1/N",
|
|
1147
|
-
[](
|
|
1142
|
+
[](common_params & params, const std::string & value) {
|
|
1148
1143
|
params.rope_freq_scale = std::stof(value);
|
|
1149
1144
|
}
|
|
1150
1145
|
).set_env("LLAMA_ARG_ROPE_FREQ_SCALE"));
|
|
1151
|
-
add_opt(
|
|
1146
|
+
add_opt(common_arg(
|
|
1152
1147
|
{"--yarn-orig-ctx"}, "N",
|
|
1153
|
-
|
|
1154
|
-
[](
|
|
1148
|
+
string_format("YaRN: original context size of model (default: %d = model training context size)", params.yarn_orig_ctx),
|
|
1149
|
+
[](common_params & params, int value) {
|
|
1155
1150
|
params.yarn_orig_ctx = value;
|
|
1156
1151
|
}
|
|
1157
1152
|
).set_env("LLAMA_ARG_YARN_ORIG_CTX"));
|
|
1158
|
-
add_opt(
|
|
1153
|
+
add_opt(common_arg(
|
|
1159
1154
|
{"--yarn-ext-factor"}, "N",
|
|
1160
|
-
|
|
1161
|
-
[](
|
|
1155
|
+
string_format("YaRN: extrapolation mix factor (default: %.1f, 0.0 = full interpolation)", (double)params.yarn_ext_factor),
|
|
1156
|
+
[](common_params & params, const std::string & value) {
|
|
1162
1157
|
params.yarn_ext_factor = std::stof(value);
|
|
1163
1158
|
}
|
|
1164
1159
|
).set_env("LLAMA_ARG_YARN_EXT_FACTOR"));
|
|
1165
|
-
add_opt(
|
|
1160
|
+
add_opt(common_arg(
|
|
1166
1161
|
{"--yarn-attn-factor"}, "N",
|
|
1167
|
-
|
|
1168
|
-
[](
|
|
1162
|
+
string_format("YaRN: scale sqrt(t) or attention magnitude (default: %.1f)", (double)params.yarn_attn_factor),
|
|
1163
|
+
[](common_params & params, const std::string & value) {
|
|
1169
1164
|
params.yarn_attn_factor = std::stof(value);
|
|
1170
1165
|
}
|
|
1171
1166
|
).set_env("LLAMA_ARG_YARN_ATTN_FACTOR"));
|
|
1172
|
-
add_opt(
|
|
1167
|
+
add_opt(common_arg(
|
|
1173
1168
|
{"--yarn-beta-slow"}, "N",
|
|
1174
|
-
|
|
1175
|
-
[](
|
|
1169
|
+
string_format("YaRN: high correction dim or alpha (default: %.1f)", (double)params.yarn_beta_slow),
|
|
1170
|
+
[](common_params & params, const std::string & value) {
|
|
1176
1171
|
params.yarn_beta_slow = std::stof(value);
|
|
1177
1172
|
}
|
|
1178
1173
|
).set_env("LLAMA_ARG_YARN_BETA_SLOW"));
|
|
1179
|
-
add_opt(
|
|
1174
|
+
add_opt(common_arg(
|
|
1180
1175
|
{"--yarn-beta-fast"}, "N",
|
|
1181
|
-
|
|
1182
|
-
[](
|
|
1176
|
+
string_format("YaRN: low correction dim or beta (default: %.1f)", (double)params.yarn_beta_fast),
|
|
1177
|
+
[](common_params & params, const std::string & value) {
|
|
1183
1178
|
params.yarn_beta_fast = std::stof(value);
|
|
1184
1179
|
}
|
|
1185
1180
|
).set_env("LLAMA_ARG_YARN_BETA_FAST"));
|
|
1186
|
-
add_opt(
|
|
1181
|
+
add_opt(common_arg(
|
|
1187
1182
|
{"-gan", "--grp-attn-n"}, "N",
|
|
1188
|
-
|
|
1189
|
-
[](
|
|
1183
|
+
string_format("group-attention factor (default: %d)", params.grp_attn_n),
|
|
1184
|
+
[](common_params & params, int value) {
|
|
1190
1185
|
params.grp_attn_n = value;
|
|
1191
1186
|
}
|
|
1192
|
-
).set_env("LLAMA_ARG_GRP_ATTN_N"));
|
|
1193
|
-
add_opt(
|
|
1187
|
+
).set_env("LLAMA_ARG_GRP_ATTN_N").set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_PASSKEY}));
|
|
1188
|
+
add_opt(common_arg(
|
|
1194
1189
|
{"-gaw", "--grp-attn-w"}, "N",
|
|
1195
|
-
|
|
1196
|
-
[](
|
|
1190
|
+
string_format("group-attention width (default: %d)", params.grp_attn_w),
|
|
1191
|
+
[](common_params & params, int value) {
|
|
1197
1192
|
params.grp_attn_w = value;
|
|
1198
1193
|
}
|
|
1199
|
-
).set_env("LLAMA_ARG_GRP_ATTN_W"));
|
|
1200
|
-
add_opt(
|
|
1194
|
+
).set_env("LLAMA_ARG_GRP_ATTN_W").set_examples({LLAMA_EXAMPLE_MAIN}));
|
|
1195
|
+
add_opt(common_arg(
|
|
1201
1196
|
{"-dkvc", "--dump-kv-cache"},
|
|
1202
1197
|
"verbose print of the KV cache",
|
|
1203
|
-
[](
|
|
1198
|
+
[](common_params & params) {
|
|
1204
1199
|
params.dump_kv_cache = true;
|
|
1205
1200
|
}
|
|
1206
1201
|
));
|
|
1207
|
-
add_opt(
|
|
1202
|
+
add_opt(common_arg(
|
|
1208
1203
|
{"-nkvo", "--no-kv-offload"},
|
|
1209
1204
|
"disable KV offload",
|
|
1210
|
-
[](
|
|
1205
|
+
[](common_params & params) {
|
|
1211
1206
|
params.no_kv_offload = true;
|
|
1212
1207
|
}
|
|
1213
1208
|
).set_env("LLAMA_ARG_NO_KV_OFFLOAD"));
|
|
-    add_opt(
+    add_opt(common_arg(
         {"-ctk", "--cache-type-k"}, "TYPE",
-
-
-
-
+        string_format(
+            "KV cache data type for K\n"
+            "allowed values: %s\n"
+            "(default: %s)",
+            get_all_kv_cache_types().c_str(),
+            ggml_type_name(params.cache_type_k)
+        ),
+        [](common_params & params, const std::string & value) {
+            params.cache_type_k = kv_cache_type_from_str(value);
         }
     ).set_env("LLAMA_ARG_CACHE_TYPE_K"));
-    add_opt(
+    add_opt(common_arg(
         {"-ctv", "--cache-type-v"}, "TYPE",
-
-
-
-
+        string_format(
+            "KV cache data type for V\n"
+            "allowed values: %s\n"
+            "(default: %s)",
+            get_all_kv_cache_types().c_str(),
+            ggml_type_name(params.cache_type_v)
+        ),
+        [](common_params & params, const std::string & value) {
+            params.cache_type_v = kv_cache_type_from_str(value);
         }
     ).set_env("LLAMA_ARG_CACHE_TYPE_V"));
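The `-ctk`/`-ctv` help text is now generated from the list of supported cache types, and the values are parsed with `kv_cache_type_from_str` into `ggml_type` fields instead of being kept as strings. A hedged sketch of the same round-trip (header names and the throwing behaviour of the parser are assumptions based on this revision):

```cpp
#include <cstdio>
#include <string>
#include "common.h"  // common_params, kv_cache_type_from_str, get_all_kv_cache_types (assumed)
#include "ggml.h"    // ggml_type_name

// Select a quantized KV cache type by name, the way -ctk/-ctv do.
static void set_kv_cache_types(common_params & params, const std::string & type_name) {
    params.cache_type_k = kv_cache_type_from_str(type_name);  // assumed to throw on unknown names
    params.cache_type_v = params.cache_type_k;                // keep K and V caches symmetric
    printf("KV cache type: %s (supported: %s)\n",
           ggml_type_name(params.cache_type_k), get_all_kv_cache_types().c_str());
}
```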
1230
|
-
add_opt(
|
|
1235
|
+
add_opt(common_arg(
|
|
1231
1236
|
{"--perplexity", "--all-logits"},
|
|
1232
|
-
|
|
1233
|
-
[](
|
|
1237
|
+
string_format("return logits for all tokens in the batch (default: %s)", params.logits_all ? "true" : "false"),
|
|
1238
|
+
[](common_params & params) {
|
|
1234
1239
|
params.logits_all = true;
|
|
1235
1240
|
}
|
|
1236
1241
|
).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
|
|
1237
|
-
add_opt(
|
|
1242
|
+
add_opt(common_arg(
|
|
1238
1243
|
{"--hellaswag"},
|
|
1239
1244
|
"compute HellaSwag score over random tasks from datafile supplied with -f",
|
|
1240
|
-
[](
|
|
1245
|
+
[](common_params & params) {
|
|
1241
1246
|
params.hellaswag = true;
|
|
1242
1247
|
}
|
|
1243
1248
|
).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
|
|
1244
|
-
add_opt(
|
|
1249
|
+
add_opt(common_arg(
|
|
1245
1250
|
{"--hellaswag-tasks"}, "N",
|
|
1246
|
-
|
|
1247
|
-
[](
|
|
1251
|
+
string_format("number of tasks to use when computing the HellaSwag score (default: %zu)", params.hellaswag_tasks),
|
|
1252
|
+
[](common_params & params, int value) {
|
|
1248
1253
|
params.hellaswag_tasks = value;
|
|
1249
1254
|
}
|
|
1250
1255
|
).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
|
|
1251
|
-
add_opt(
|
|
1256
|
+
add_opt(common_arg(
|
|
1252
1257
|
{"--winogrande"},
|
|
1253
1258
|
"compute Winogrande score over random tasks from datafile supplied with -f",
|
|
1254
|
-
[](
|
|
1259
|
+
[](common_params & params) {
|
|
1255
1260
|
params.winogrande = true;
|
|
1256
1261
|
}
|
|
1257
1262
|
).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
|
|
1258
|
-
add_opt(
|
|
1263
|
+
add_opt(common_arg(
|
|
1259
1264
|
{"--winogrande-tasks"}, "N",
|
|
1260
|
-
|
|
1261
|
-
[](
|
|
1265
|
+
string_format("number of tasks to use when computing the Winogrande score (default: %zu)", params.winogrande_tasks),
|
|
1266
|
+
[](common_params & params, int value) {
|
|
1262
1267
|
params.winogrande_tasks = value;
|
|
1263
1268
|
}
|
|
1264
1269
|
).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
|
|
1265
|
-
add_opt(
|
|
1270
|
+
add_opt(common_arg(
|
|
1266
1271
|
{"--multiple-choice"},
|
|
1267
1272
|
"compute multiple choice score over random tasks from datafile supplied with -f",
|
|
1268
|
-
[](
|
|
1273
|
+
[](common_params & params) {
|
|
1269
1274
|
params.multiple_choice = true;
|
|
1270
1275
|
}
|
|
1271
1276
|
).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
|
|
1272
|
-
add_opt(
|
|
1277
|
+
add_opt(common_arg(
|
|
1273
1278
|
{"--multiple-choice-tasks"}, "N",
|
|
1274
|
-
|
|
1275
|
-
[](
|
|
1279
|
+
string_format("number of tasks to use when computing the multiple choice score (default: %zu)", params.multiple_choice_tasks),
|
|
1280
|
+
[](common_params & params, int value) {
|
|
1276
1281
|
params.multiple_choice_tasks = value;
|
|
1277
1282
|
}
|
|
1278
1283
|
).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
|
|
1279
|
-
add_opt(
|
|
1284
|
+
add_opt(common_arg(
|
|
1280
1285
|
{"--kl-divergence"},
|
|
1281
1286
|
"computes KL-divergence to logits provided via --kl-divergence-base",
|
|
1282
|
-
[](
|
|
1287
|
+
[](common_params & params) {
|
|
1283
1288
|
params.kl_divergence = true;
|
|
1284
1289
|
}
|
|
1285
1290
|
).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
|
|
1286
|
-
add_opt(
|
|
1291
|
+
add_opt(common_arg(
|
|
1287
1292
|
{"--save-all-logits", "--kl-divergence-base"}, "FNAME",
|
|
1288
1293
|
"set logits file",
|
|
1289
|
-
[](
|
|
1294
|
+
[](common_params & params, const std::string & value) {
|
|
1290
1295
|
params.logits_file = value;
|
|
1291
1296
|
}
|
|
1292
1297
|
).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
|
|
1293
|
-
add_opt(
|
|
1298
|
+
add_opt(common_arg(
|
|
1294
1299
|
{"--ppl-stride"}, "N",
|
|
1295
|
-
|
|
1296
|
-
[](
|
|
1300
|
+
string_format("stride for perplexity calculation (default: %d)", params.ppl_stride),
|
|
1301
|
+
[](common_params & params, int value) {
|
|
1297
1302
|
params.ppl_stride = value;
|
|
1298
1303
|
}
|
|
1299
1304
|
).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
|
|
1300
|
-
add_opt(
|
|
1305
|
+
add_opt(common_arg(
|
|
1301
1306
|
{"--ppl-output-type"}, "<0|1>",
|
|
1302
|
-
|
|
1303
|
-
[](
|
|
1307
|
+
string_format("output type for perplexity calculation (default: %d)", params.ppl_output_type),
|
|
1308
|
+
[](common_params & params, int value) {
|
|
1304
1309
|
params.ppl_output_type = value;
|
|
1305
1310
|
}
|
|
1306
1311
|
).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
|
|
1307
|
-
add_opt(
|
|
1312
|
+
add_opt(common_arg(
|
|
1308
1313
|
{"-dt", "--defrag-thold"}, "N",
|
|
1309
|
-
|
|
1310
|
-
[](
|
|
1314
|
+
string_format("KV cache defragmentation threshold (default: %.1f, < 0 - disabled)", (double)params.defrag_thold),
|
|
1315
|
+
[](common_params & params, const std::string & value) {
|
|
1311
1316
|
params.defrag_thold = std::stof(value);
|
|
1312
1317
|
}
|
|
1313
1318
|
).set_env("LLAMA_ARG_DEFRAG_THOLD"));
|
|
1314
|
-
add_opt(
|
|
1319
|
+
add_opt(common_arg(
|
|
1315
1320
|
{"-np", "--parallel"}, "N",
|
|
1316
|
-
|
|
1317
|
-
[](
|
|
1321
|
+
string_format("number of parallel sequences to decode (default: %d)", params.n_parallel),
|
|
1322
|
+
[](common_params & params, int value) {
|
|
1318
1323
|
params.n_parallel = value;
|
|
1319
1324
|
}
|
|
1320
1325
|
).set_env("LLAMA_ARG_N_PARALLEL"));
|
|
1321
|
-
add_opt(
|
|
1326
|
+
add_opt(common_arg(
|
|
1322
1327
|
{"-ns", "--sequences"}, "N",
|
|
1323
|
-
|
|
1324
|
-
[](
|
|
1328
|
+
string_format("number of sequences to decode (default: %d)", params.n_sequences),
|
|
1329
|
+
[](common_params & params, int value) {
|
|
1325
1330
|
params.n_sequences = value;
|
|
1326
1331
|
}
|
|
1327
1332
|
).set_examples({LLAMA_EXAMPLE_PARALLEL}));
|
|
1328
|
-
add_opt(
|
|
1333
|
+
add_opt(common_arg(
|
|
1329
1334
|
{"-cb", "--cont-batching"},
|
|
1330
|
-
|
|
1331
|
-
[](
|
|
1335
|
+
string_format("enable continuous batching (a.k.a dynamic batching) (default: %s)", params.cont_batching ? "enabled" : "disabled"),
|
|
1336
|
+
[](common_params & params) {
|
|
1332
1337
|
params.cont_batching = true;
|
|
1333
1338
|
}
|
|
1334
1339
|
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CONT_BATCHING"));
|
|
1335
|
-
add_opt(
|
|
1340
|
+
add_opt(common_arg(
|
|
1336
1341
|
{"-nocb", "--no-cont-batching"},
|
|
1337
1342
|
"disable continuous batching",
|
|
1338
|
-
[](
|
|
1343
|
+
[](common_params & params) {
|
|
1339
1344
|
params.cont_batching = false;
|
|
1340
1345
|
}
|
|
1341
1346
|
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_CONT_BATCHING"));
|
|
1342
|
-
add_opt(
|
|
1347
|
+
add_opt(common_arg(
|
|
1343
1348
|
{"--mmproj"}, "FILE",
|
|
1344
1349
|
"path to a multimodal projector file for LLaVA. see examples/llava/README.md",
|
|
1345
|
-
[](
|
|
1350
|
+
[](common_params & params, const std::string & value) {
|
|
1346
1351
|
params.mmproj = value;
|
|
1347
1352
|
}
|
|
1348
1353
|
).set_examples({LLAMA_EXAMPLE_LLAVA}));
|
|
1349
|
-
add_opt(
|
|
1354
|
+
add_opt(common_arg(
|
|
1350
1355
|
{"--image"}, "FILE",
|
|
1351
1356
|
"path to an image file. use with multimodal models. Specify multiple times for batching",
|
|
1352
|
-
[](
|
|
1357
|
+
[](common_params & params, const std::string & value) {
|
|
1353
1358
|
params.image.emplace_back(value);
|
|
1354
1359
|
}
|
|
1355
1360
|
).set_examples({LLAMA_EXAMPLE_LLAVA}));
|
|
-
-
-
-
-
-
-
-
-
-    add_opt(
+    if (llama_supports_rpc()) {
+        add_opt(common_arg(
+            {"--rpc"}, "SERVERS",
+            "comma separated list of RPC servers",
+            [](common_params & params, const std::string & value) {
+                params.rpc_servers = value;
+            }
+        ).set_env("LLAMA_ARG_RPC"));
+    }
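The `--rpc` option is now only registered when the build actually supports RPC, checked at runtime with `llama_supports_rpc()`. A short sketch of the same guard (header names assumed; the server string is illustrative):

```cpp
#include <string>
#include "common.h"  // common_params (assumed)
#include "llama.h"   // llama_supports_rpc

// Only record RPC servers when this build can use them, mirroring the guard above.
static bool set_rpc_servers(common_params & params, const std::string & servers) {
    if (!llama_supports_rpc()) {
        return false;               // built without the RPC backend; option not available
    }
    params.rpc_servers = servers;   // comma separated "host:port" list (example values)
    return true;
}
```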
1370
|
+
add_opt(common_arg(
|
|
1366
1371
|
{"--mlock"},
|
|
1367
1372
|
"force system to keep model in RAM rather than swapping or compressing",
|
|
1368
|
-
[](
|
|
1373
|
+
[](common_params & params) {
|
|
1369
1374
|
params.use_mlock = true;
|
|
1370
1375
|
}
|
|
1371
1376
|
).set_env("LLAMA_ARG_MLOCK"));
|
|
1372
|
-
add_opt(
|
|
1377
|
+
add_opt(common_arg(
|
|
1373
1378
|
{"--no-mmap"},
|
|
1374
1379
|
"do not memory-map model (slower load but may reduce pageouts if not using mlock)",
|
|
1375
|
-
[](
|
|
1380
|
+
[](common_params & params) {
|
|
1376
1381
|
params.use_mmap = false;
|
|
1377
1382
|
}
|
|
1378
1383
|
).set_env("LLAMA_ARG_NO_MMAP"));
|
|
1379
|
-
add_opt(
|
|
1384
|
+
add_opt(common_arg(
|
|
1380
1385
|
{"--numa"}, "TYPE",
|
|
1381
1386
|
"attempt optimizations that help on some NUMA systems\n"
|
|
1382
1387
|
"- distribute: spread execution evenly over all nodes\n"
|
|
@@ -1384,52 +1389,62 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
|
|
|
1384
1389
|
"- numactl: use the CPU map provided by numactl\n"
|
|
1385
1390
|
"if run without this previously, it is recommended to drop the system page cache before using this\n"
|
|
1386
1391
|
"see https://github.com/ggerganov/llama.cpp/issues/1437",
|
|
1387
|
-
[](
|
|
1392
|
+
[](common_params & params, const std::string & value) {
|
|
1388
1393
|
/**/ if (value == "distribute" || value == "") { params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; }
|
|
1389
1394
|
else if (value == "isolate") { params.numa = GGML_NUMA_STRATEGY_ISOLATE; }
|
|
1390
1395
|
else if (value == "numactl") { params.numa = GGML_NUMA_STRATEGY_NUMACTL; }
|
|
1391
1396
|
else { throw std::invalid_argument("invalid value"); }
|
|
1392
1397
|
}
|
|
1393
1398
|
).set_env("LLAMA_ARG_NUMA"));
|
|
-    add_opt(
+    add_opt(common_arg(
+        {"-dev", "--device"}, "<dev1,dev2,..>",
+        "comma-separated list of devices to use for offloading (none = don't offload)\n"
+        "use --list-devices to see a list of available devices",
+        [](common_params & params, const std::string & value) {
+            params.devices = parse_device_list(value);
+        }
+    ).set_env("LLAMA_ARG_DEVICE"));
+    add_opt(common_arg(
+        {"--list-devices"},
+        "print list of available devices and exit",
+        [](common_params &) {
+            printf("Available devices:\n");
+            for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
+                auto * dev = ggml_backend_dev_get(i);
+                if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_GPU) {
+                    size_t free, total;
+                    ggml_backend_dev_memory(dev, &free, &total);
+                    printf("  %s: %s (%zu MiB, %zu MiB free)\n", ggml_backend_dev_name(dev), ggml_backend_dev_description(dev), total / 1024 / 1024, free / 1024 / 1024);
+                }
+            }
+            exit(0);
+        }
+    ));
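`--list-devices` enumerates GPU backends through the ggml device API rather than a compile-time list. The same calls can be used directly; a minimal standalone sketch, assuming the ggml headers are on the include path:

```cpp
#include <cstdio>
#include "ggml-backend.h"  // ggml_backend_dev_* API

// Print every GPU device ggml knows about, mirroring the --list-devices handler above.
int main() {
    for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
        ggml_backend_dev_t dev = ggml_backend_dev_get(i);
        if (ggml_backend_dev_type(dev) != GGML_BACKEND_DEVICE_TYPE_GPU) {
            continue; // skip CPU/accelerator devices, as the handler does
        }
        size_t free_mem = 0, total_mem = 0;
        ggml_backend_dev_memory(dev, &free_mem, &total_mem);
        printf("%s: %s (%zu MiB total, %zu MiB free)\n",
               ggml_backend_dev_name(dev), ggml_backend_dev_description(dev),
               total_mem / 1024 / 1024, free_mem / 1024 / 1024);
    }
    return 0;
}
```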
1423
|
+
add_opt(common_arg(
|
|
1395
1424
|
{"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N",
|
|
1396
1425
|
"number of layers to store in VRAM",
|
|
1397
|
-
[](
|
|
1426
|
+
[](common_params & params, int value) {
|
|
1398
1427
|
params.n_gpu_layers = value;
|
|
1399
1428
|
if (!llama_supports_gpu_offload()) {
|
|
1400
|
-
fprintf(stderr, "warning:
|
|
1401
|
-
fprintf(stderr, "warning:
|
|
1429
|
+
fprintf(stderr, "warning: no usable GPU found, --gpu-layers option will be ignored\n");
|
|
1430
|
+
fprintf(stderr, "warning: one possible reason is that llama.cpp was compiled without GPU support\n");
|
|
1431
|
+
fprintf(stderr, "warning: consult docs/build.md for compilation instructions\n");
|
|
1402
1432
|
}
|
|
1403
1433
|
}
|
|
1404
1434
|
).set_env("LLAMA_ARG_N_GPU_LAYERS"));
|
|
-    add_opt(
-        {"-ngld", "--gpu-layers-draft", "--n-gpu-layers-draft"}, "N",
-        "number of layers to store in VRAM for the draft model",
-        [](gpt_params & params, int value) {
-            params.n_gpu_layers_draft = value;
-            if (!llama_supports_gpu_offload()) {
-                fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers-draft option will be ignored\n");
-                fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
-            }
-        }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
-    add_opt(llama_arg(
+    add_opt(common_arg(
1417
1436
|
{"-sm", "--split-mode"}, "{none,layer,row}",
|
|
1418
1437
|
"how to split the model across multiple GPUs, one of:\n"
|
|
1419
1438
|
"- none: use one GPU only\n"
|
|
1420
1439
|
"- layer (default): split layers and KV across GPUs\n"
|
|
1421
1440
|
"- row: split rows across GPUs",
|
|
1422
|
-
[](
|
|
1441
|
+
[](common_params & params, const std::string & value) {
|
|
1423
1442
|
std::string arg_next = value;
|
|
1424
1443
|
if (arg_next == "none") {
|
|
1425
1444
|
params.split_mode = LLAMA_SPLIT_MODE_NONE;
|
|
1426
1445
|
} else if (arg_next == "layer") {
|
|
1427
1446
|
params.split_mode = LLAMA_SPLIT_MODE_LAYER;
|
|
1428
1447
|
} else if (arg_next == "row") {
|
|
1429
|
-
#ifdef GGML_USE_SYCL
|
|
1430
|
-
fprintf(stderr, "warning: The split mode value:[row] is not supported by llama.cpp with SYCL. It's developing.\nExit!\n");
|
|
1431
|
-
exit(1);
|
|
1432
|
-
#endif // GGML_USE_SYCL
|
|
1433
1448
|
params.split_mode = LLAMA_SPLIT_MODE_ROW;
|
|
1434
1449
|
} else {
|
|
1435
1450
|
throw std::invalid_argument("invalid value");
|
|
@@ -1439,10 +1454,10 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
|
|
|
1439
1454
|
}
|
|
1440
1455
|
}
|
|
1441
1456
|
).set_env("LLAMA_ARG_SPLIT_MODE"));
|
|
1442
|
-
add_opt(
|
|
1457
|
+
add_opt(common_arg(
|
|
1443
1458
|
{"-ts", "--tensor-split"}, "N0,N1,N2,...",
|
|
1444
1459
|
"fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1",
|
|
1445
|
-
[](
|
|
1460
|
+
[](common_params & params, const std::string & value) {
|
|
1446
1461
|
std::string arg_next = value;
|
|
1447
1462
|
|
|
1448
1463
|
// split string by , and /
|
|
@@ -1451,7 +1466,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
|
|
|
1451
1466
|
std::vector<std::string> split_arg{ it, {} };
|
|
1452
1467
|
if (split_arg.size() >= llama_max_devices()) {
|
|
1453
1468
|
throw std::invalid_argument(
|
|
1454
|
-
|
|
1469
|
+
string_format("got %d input configs, but system only has %d devices", (int)split_arg.size(), (int)llama_max_devices())
|
|
1455
1470
|
);
|
|
1456
1471
|
}
|
|
1457
1472
|
for (size_t i = 0; i < llama_max_devices(); ++i) {
|
|
@@ -1466,315 +1481,329 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
|
|
|
1466
1481
|
}
|
|
1467
1482
|
}
|
|
1468
1483
|
).set_env("LLAMA_ARG_TENSOR_SPLIT"));
|
|
1469
|
-
add_opt(
|
|
1484
|
+
add_opt(common_arg(
|
|
1470
1485
|
{"-mg", "--main-gpu"}, "INDEX",
|
|
1471
|
-
|
|
1472
|
-
[](
|
|
1486
|
+
string_format("the GPU to use for the model (with split-mode = none), or for intermediate results and KV (with split-mode = row) (default: %d)", params.main_gpu),
|
|
1487
|
+
[](common_params & params, int value) {
|
|
1473
1488
|
params.main_gpu = value;
|
|
1474
1489
|
if (!llama_supports_gpu_offload()) {
|
|
1475
1490
|
fprintf(stderr, "warning: llama.cpp was compiled without support for GPU offload. Setting the main GPU has no effect.\n");
|
|
1476
1491
|
}
|
|
1477
1492
|
}
|
|
1478
1493
|
).set_env("LLAMA_ARG_MAIN_GPU"));
|
|
1479
|
-
add_opt(
|
|
1494
|
+
add_opt(common_arg(
|
|
1480
1495
|
{"--check-tensors"},
|
|
1481
|
-
|
|
1482
|
-
[](
|
|
1496
|
+
string_format("check model tensor data for invalid values (default: %s)", params.check_tensors ? "true" : "false"),
|
|
1497
|
+
[](common_params & params) {
|
|
1483
1498
|
params.check_tensors = true;
|
|
1484
1499
|
}
|
|
1485
1500
|
));
|
|
1486
|
-
add_opt(
|
|
1501
|
+
add_opt(common_arg(
|
|
1487
1502
|
{"--override-kv"}, "KEY=TYPE:VALUE",
|
|
1488
1503
|
"advanced option to override model metadata by key. may be specified multiple times.\n"
|
|
1489
1504
|
"types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false",
|
|
1490
|
-
[](
|
|
1505
|
+
[](common_params & params, const std::string & value) {
|
|
1491
1506
|
if (!string_parse_kv_override(value.c_str(), params.kv_overrides)) {
|
|
1492
|
-
throw std::runtime_error(
|
|
1507
|
+
throw std::runtime_error(string_format("error: Invalid type for KV override: %s\n", value.c_str()));
|
|
1493
1508
|
}
|
|
1494
1509
|
}
|
|
1495
1510
|
));
|
|
1496
|
-
add_opt(
|
|
1511
|
+
add_opt(common_arg(
|
|
1497
1512
|
{"--lora"}, "FNAME",
|
|
1498
1513
|
"path to LoRA adapter (can be repeated to use multiple adapters)",
|
|
1499
|
-
[](
|
|
1514
|
+
[](common_params & params, const std::string & value) {
|
|
1500
1515
|
params.lora_adapters.push_back({ std::string(value), 1.0 });
|
|
1501
1516
|
}
|
|
1502
1517
|
// we define this arg on both COMMON and EXPORT_LORA, so when showing help message of export-lora, it will be categorized as "example-specific" arg
|
|
1503
1518
|
).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}));
|
|
1504
|
-
add_opt(
|
|
1519
|
+
add_opt(common_arg(
|
|
1505
1520
|
{"--lora-scaled"}, "FNAME", "SCALE",
|
|
1506
1521
|
"path to LoRA adapter with user defined scaling (can be repeated to use multiple adapters)",
|
|
1507
|
-
[](
|
|
1522
|
+
[](common_params & params, const std::string & fname, const std::string & scale) {
|
|
1508
1523
|
params.lora_adapters.push_back({ fname, std::stof(scale) });
|
|
1509
1524
|
}
|
|
1510
1525
|
// we define this arg on both COMMON and EXPORT_LORA, so when showing help message of export-lora, it will be categorized as "example-specific" arg
|
|
1511
1526
|
).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}));
|
|
1512
|
-
add_opt(
|
|
1527
|
+
add_opt(common_arg(
|
|
1513
1528
|
{"--control-vector"}, "FNAME",
|
|
1514
1529
|
"add a control vector\nnote: this argument can be repeated to add multiple control vectors",
|
|
1515
|
-
[](
|
|
1530
|
+
[](common_params & params, const std::string & value) {
|
|
1516
1531
|
params.control_vectors.push_back({ 1.0f, value, });
|
|
1517
1532
|
}
|
|
1518
1533
|
));
|
|
1519
|
-
add_opt(
|
|
1534
|
+
add_opt(common_arg(
|
|
1520
1535
|
{"--control-vector-scaled"}, "FNAME", "SCALE",
|
|
1521
1536
|
"add a control vector with user defined scaling SCALE\n"
|
|
1522
1537
|
"note: this argument can be repeated to add multiple scaled control vectors",
|
|
1523
|
-
[](
|
|
1538
|
+
[](common_params & params, const std::string & fname, const std::string & scale) {
|
|
1524
1539
|
params.control_vectors.push_back({ std::stof(scale), fname });
|
|
1525
1540
|
}
|
|
1526
1541
|
));
|
|
1527
|
-
add_opt(
|
|
1542
|
+
add_opt(common_arg(
|
|
1528
1543
|
{"--control-vector-layer-range"}, "START", "END",
|
|
1529
1544
|
"layer range to apply the control vector(s) to, start and end inclusive",
|
|
1530
|
-
[](
|
|
1545
|
+
[](common_params & params, const std::string & start, const std::string & end) {
|
|
1531
1546
|
params.control_vector_layer_start = std::stoi(start);
|
|
1532
1547
|
params.control_vector_layer_end = std::stoi(end);
|
|
1533
1548
|
}
|
|
1534
1549
|
));
|
|
1535
|
-
add_opt(
|
|
1550
|
+
add_opt(common_arg(
|
|
1536
1551
|
{"-a", "--alias"}, "STRING",
|
|
1537
1552
|
"set alias for model name (to be used by REST API)",
|
|
1538
|
-
[](
|
|
1553
|
+
[](common_params & params, const std::string & value) {
|
|
1539
1554
|
params.model_alias = value;
|
|
1540
1555
|
}
|
|
1541
1556
|
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ALIAS"));
|
|
1542
|
-
add_opt(
|
|
1557
|
+
add_opt(common_arg(
|
|
1543
1558
|
{"-m", "--model"}, "FNAME",
|
|
1544
1559
|
ex == LLAMA_EXAMPLE_EXPORT_LORA
|
|
1545
1560
|
? std::string("model path from which to load base model")
|
|
1546
|
-
:
|
|
1561
|
+
: string_format(
|
|
1547
1562
|
"model path (default: `models/$filename` with filename from `--hf-file` "
|
|
1548
1563
|
"or `--model-url` if set, otherwise %s)", DEFAULT_MODEL_PATH
|
|
1549
1564
|
),
|
|
1550
|
-
[](
|
|
1565
|
+
[](common_params & params, const std::string & value) {
|
|
1551
1566
|
params.model = value;
|
|
1552
1567
|
}
|
|
1553
1568
|
).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}).set_env("LLAMA_ARG_MODEL"));
|
|
1554
|
-
add_opt(
|
|
1555
|
-
{"-md", "--model-draft"}, "FNAME",
|
|
1556
|
-
"draft model for speculative decoding (default: unused)",
|
|
1557
|
-
[](gpt_params & params, const std::string & value) {
|
|
1558
|
-
params.model_draft = value;
|
|
1559
|
-
}
|
|
1560
|
-
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
|
1561
|
-
add_opt(llama_arg(
|
|
1569
|
+
add_opt(common_arg(
|
|
1562
1570
|
{"-mu", "--model-url"}, "MODEL_URL",
|
|
1563
1571
|
"model download url (default: unused)",
|
|
1564
|
-
[](
|
|
1572
|
+
[](common_params & params, const std::string & value) {
|
|
1565
1573
|
params.model_url = value;
|
|
1566
1574
|
}
|
|
1567
1575
|
).set_env("LLAMA_ARG_MODEL_URL"));
|
|
1568
|
-
add_opt(
|
|
1576
|
+
add_opt(common_arg(
|
|
1569
1577
|
{"-hfr", "--hf-repo"}, "REPO",
|
|
1570
1578
|
"Hugging Face model repository (default: unused)",
|
|
1571
|
-
[](
|
|
1579
|
+
[](common_params & params, const std::string & value) {
|
|
1572
1580
|
params.hf_repo = value;
|
|
1573
1581
|
}
|
|
1574
1582
|
).set_env("LLAMA_ARG_HF_REPO"));
|
|
1575
|
-
add_opt(
|
|
1583
|
+
add_opt(common_arg(
|
|
1576
1584
|
{"-hff", "--hf-file"}, "FILE",
|
|
1577
1585
|
"Hugging Face model file (default: unused)",
|
|
1578
|
-
[](
|
|
1586
|
+
[](common_params & params, const std::string & value) {
|
|
1579
1587
|
params.hf_file = value;
|
|
1580
1588
|
}
|
|
1581
1589
|
).set_env("LLAMA_ARG_HF_FILE"));
|
|
1582
|
-
add_opt(
|
|
1590
|
+
add_opt(common_arg(
|
|
1591
|
+
{"-hfrv", "--hf-repo-v"}, "REPO",
|
|
1592
|
+
"Hugging Face model repository for the vocoder model (default: unused)",
|
|
1593
|
+
[](common_params & params, const std::string & value) {
|
|
1594
|
+
params.vocoder.hf_repo = value;
|
|
1595
|
+
}
|
|
1596
|
+
).set_env("LLAMA_ARG_HF_REPO_V"));
|
|
1597
|
+
add_opt(common_arg(
|
|
1598
|
+
{"-hffv", "--hf-file-v"}, "FILE",
|
|
1599
|
+
"Hugging Face model file for the vocoder model (default: unused)",
|
|
1600
|
+
[](common_params & params, const std::string & value) {
|
|
1601
|
+
params.vocoder.hf_file = value;
|
|
1602
|
+
}
|
|
1603
|
+
).set_env("LLAMA_ARG_HF_FILE_V"));
|
|
1604
|
+
add_opt(common_arg(
|
|
1583
1605
|
{"-hft", "--hf-token"}, "TOKEN",
|
|
1584
1606
|
"Hugging Face access token (default: value from HF_TOKEN environment variable)",
|
|
1585
|
-
[](
|
|
1607
|
+
[](common_params & params, const std::string & value) {
|
|
1586
1608
|
params.hf_token = value;
|
|
1587
1609
|
}
|
|
1588
1610
|
).set_env("HF_TOKEN"));
|
|
1589
|
-
add_opt(
|
|
1611
|
+
add_opt(common_arg(
|
|
1590
1612
|
{"--context-file"}, "FNAME",
|
|
1591
1613
|
"file to load context from (repeat to specify multiple files)",
|
|
1592
|
-
[](
|
|
1614
|
+
[](common_params & params, const std::string & value) {
|
|
1593
1615
|
std::ifstream file(value, std::ios::binary);
|
|
1594
1616
|
if (!file) {
|
|
1595
|
-
throw std::runtime_error(
|
|
1617
|
+
throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str()));
|
|
1596
1618
|
}
|
|
1597
1619
|
params.context_files.push_back(value);
|
|
1598
1620
|
}
|
|
1599
1621
|
).set_examples({LLAMA_EXAMPLE_RETRIEVAL}));
|
|
1600
|
-
add_opt(
|
|
1622
|
+
add_opt(common_arg(
|
|
1601
1623
|
{"--chunk-size"}, "N",
|
|
1602
|
-
|
|
1603
|
-
[](
|
|
1624
|
+
string_format("minimum length of embedded text chunks (default: %d)", params.chunk_size),
|
|
1625
|
+
[](common_params & params, int value) {
|
|
1604
1626
|
params.chunk_size = value;
|
|
1605
1627
|
}
|
|
1606
1628
|
).set_examples({LLAMA_EXAMPLE_RETRIEVAL}));
|
|
1607
|
-
add_opt(
|
|
1629
|
+
add_opt(common_arg(
|
|
1608
1630
|
{"--chunk-separator"}, "STRING",
|
|
1609
|
-
|
|
1610
|
-
[](
|
|
1631
|
+
string_format("separator between chunks (default: '%s')", params.chunk_separator.c_str()),
|
|
1632
|
+
[](common_params & params, const std::string & value) {
|
|
1611
1633
|
params.chunk_separator = value;
|
|
1612
1634
|
}
|
|
1613
1635
|
).set_examples({LLAMA_EXAMPLE_RETRIEVAL}));
|
|
1614
|
-
add_opt(
|
|
1636
|
+
add_opt(common_arg(
|
|
1615
1637
|
{"--junk"}, "N",
|
|
1616
|
-
|
|
1617
|
-
[](
|
|
1638
|
+
string_format("number of times to repeat the junk text (default: %d)", params.n_junk),
|
|
1639
|
+
[](common_params & params, int value) {
|
|
1618
1640
|
params.n_junk = value;
|
|
1619
1641
|
}
|
|
1620
1642
|
).set_examples({LLAMA_EXAMPLE_PASSKEY}));
|
|
1621
|
-
add_opt(
|
|
1643
|
+
add_opt(common_arg(
|
|
1622
1644
|
{"--pos"}, "N",
|
|
1623
|
-
|
|
1624
|
-
[](
|
|
1645
|
+
string_format("position of the passkey in the junk text (default: %d)", params.i_pos),
|
|
1646
|
+
[](common_params & params, int value) {
|
|
1625
1647
|
params.i_pos = value;
|
|
1626
1648
|
}
|
|
1627
1649
|
).set_examples({LLAMA_EXAMPLE_PASSKEY}));
|
|
1628
|
-
add_opt(
|
|
1650
|
+
add_opt(common_arg(
|
|
1629
1651
|
{"-o", "--output", "--output-file"}, "FNAME",
|
|
1630
|
-
|
|
1652
|
+
string_format("output file (default: '%s')",
|
|
1631
1653
|
ex == LLAMA_EXAMPLE_EXPORT_LORA
|
|
1632
1654
|
? params.lora_outfile.c_str()
|
|
1633
1655
|
: ex == LLAMA_EXAMPLE_CVECTOR_GENERATOR
|
|
1634
1656
|
? params.cvector_outfile.c_str()
|
|
1635
1657
|
: params.out_file.c_str()),
|
|
1636
|
-
[](
|
|
1658
|
+
[](common_params & params, const std::string & value) {
|
|
1637
1659
|
params.out_file = value;
|
|
1638
1660
|
params.cvector_outfile = value;
|
|
1639
1661
|
params.lora_outfile = value;
|
|
1640
1662
|
}
|
|
1641
1663
|
).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA}));
|
|
1642
|
-
add_opt(
|
|
1664
|
+
add_opt(common_arg(
|
|
1643
1665
|
{"-ofreq", "--output-frequency"}, "N",
|
|
1644
|
-
|
|
1645
|
-
[](
|
|
1666
|
+
string_format("output the imatrix every N iterations (default: %d)", params.n_out_freq),
|
|
1667
|
+
[](common_params & params, int value) {
|
|
1646
1668
|
params.n_out_freq = value;
|
|
1647
1669
|
}
|
|
1648
1670
|
).set_examples({LLAMA_EXAMPLE_IMATRIX}));
|
|
1649
|
-
add_opt(
|
|
1671
|
+
add_opt(common_arg(
|
|
1650
1672
|
{"--save-frequency"}, "N",
|
|
1651
|
-
|
|
1652
|
-
[](
|
|
1673
|
+
string_format("save an imatrix copy every N iterations (default: %d)", params.n_save_freq),
|
|
1674
|
+
[](common_params & params, int value) {
|
|
1653
1675
|
params.n_save_freq = value;
|
|
1654
1676
|
}
|
|
1655
1677
|
).set_examples({LLAMA_EXAMPLE_IMATRIX}));
|
|
1656
|
-
add_opt(
|
|
1678
|
+
add_opt(common_arg(
|
|
1657
1679
|
{"--process-output"},
|
|
1658
|
-
|
|
1659
|
-
[](
|
|
1680
|
+
string_format("collect data for the output tensor (default: %s)", params.process_output ? "true" : "false"),
|
|
1681
|
+
[](common_params & params) {
|
|
1660
1682
|
params.process_output = true;
|
|
1661
1683
|
}
|
|
1662
1684
|
).set_examples({LLAMA_EXAMPLE_IMATRIX}));
|
|
1663
|
-
add_opt(
|
|
1685
|
+
add_opt(common_arg(
|
|
1664
1686
|
{"--no-ppl"},
|
|
1665
|
-
|
|
1666
|
-
[](
|
|
1687
|
+
string_format("do not compute perplexity (default: %s)", params.compute_ppl ? "true" : "false"),
|
|
1688
|
+
[](common_params & params) {
|
|
1667
1689
|
params.compute_ppl = false;
|
|
1668
1690
|
}
|
|
1669
1691
|
).set_examples({LLAMA_EXAMPLE_IMATRIX}));
|
|
1670
|
-
add_opt(
|
|
1692
|
+
add_opt(common_arg(
|
|
1671
1693
|
{"--chunk", "--from-chunk"}, "N",
|
|
1672
|
-
|
|
1673
|
-
[](
|
|
1694
|
+
string_format("start processing the input from chunk N (default: %d)", params.i_chunk),
|
|
1695
|
+
[](common_params & params, int value) {
|
|
1674
1696
|
params.i_chunk = value;
|
|
1675
1697
|
}
|
|
1676
1698
|
).set_examples({LLAMA_EXAMPLE_IMATRIX}));
|
|
1677
|
-
add_opt(
|
|
1699
|
+
add_opt(common_arg(
|
|
1678
1700
|
{"-pps"},
|
|
1679
|
-
|
|
1680
|
-
[](
|
|
1701
|
+
string_format("is the prompt shared across parallel sequences (default: %s)", params.is_pp_shared ? "true" : "false"),
|
|
1702
|
+
[](common_params & params) {
|
|
1681
1703
|
params.is_pp_shared = true;
|
|
1682
1704
|
}
|
|
1683
1705
|
).set_examples({LLAMA_EXAMPLE_BENCH}));
|
|
1684
|
-
add_opt(
|
|
1706
|
+
add_opt(common_arg(
|
|
1685
1707
|
{"-npp"}, "n0,n1,...",
|
|
1686
1708
|
"number of prompt tokens",
|
|
1687
|
-
[](
|
|
1709
|
+
[](common_params & params, const std::string & value) {
|
|
1688
1710
|
auto p = string_split<int>(value, ',');
|
|
1689
1711
|
params.n_pp.insert(params.n_pp.end(), p.begin(), p.end());
|
|
1690
1712
|
}
|
|
1691
1713
|
).set_examples({LLAMA_EXAMPLE_BENCH}));
|
|
1692
|
-
add_opt(
|
|
1714
|
+
add_opt(common_arg(
|
|
1693
1715
|
{"-ntg"}, "n0,n1,...",
|
|
1694
1716
|
"number of text generation tokens",
|
|
1695
|
-
[](
|
|
1717
|
+
[](common_params & params, const std::string & value) {
|
|
1696
1718
|
auto p = string_split<int>(value, ',');
|
|
1697
1719
|
params.n_tg.insert(params.n_tg.end(), p.begin(), p.end());
|
|
1698
1720
|
}
|
|
1699
1721
|
).set_examples({LLAMA_EXAMPLE_BENCH}));
|
|
1700
|
-
add_opt(
|
|
1722
|
+
add_opt(common_arg(
|
|
1701
1723
|
{"-npl"}, "n0,n1,...",
|
|
1702
1724
|
"number of parallel prompts",
|
|
1703
|
-
[](
|
|
1725
|
+
[](common_params & params, const std::string & value) {
|
|
1704
1726
|
auto p = string_split<int>(value, ',');
|
|
1705
1727
|
params.n_pl.insert(params.n_pl.end(), p.begin(), p.end());
|
|
1706
1728
|
}
|
|
1707
1729
|
).set_examples({LLAMA_EXAMPLE_BENCH}));
|
|
1708
|
-
add_opt(
|
|
1730
|
+
add_opt(common_arg(
|
|
1709
1731
|
{"--embd-normalize"}, "N",
|
|
1710
|
-
|
|
1711
|
-
[](
|
|
1732
|
+
string_format("normalisation for embeddings (default: %d) (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)", params.embd_normalize),
|
|
1733
|
+
[](common_params & params, int value) {
|
|
1712
1734
|
params.embd_normalize = value;
|
|
1713
1735
|
}
|
|
1714
1736
|
).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
|
|
1715
|
-
add_opt(
|
|
1737
|
+
add_opt(common_arg(
|
|
1716
1738
|
{"--embd-output-format"}, "FORMAT",
|
|
1717
1739
|
"empty = default, \"array\" = [[],[]...], \"json\" = openai style, \"json+\" = same \"json\" + cosine similarity matrix",
|
|
1718
|
-
[](
|
|
1740
|
+
[](common_params & params, const std::string & value) {
|
|
1719
1741
|
params.embd_out = value;
|
|
1720
1742
|
}
|
|
1721
1743
|
).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
|
|
1722
|
-
add_opt(
|
|
1744
|
+
add_opt(common_arg(
|
|
1723
1745
|
{"--embd-separator"}, "STRING",
|
|
1724
|
-
"separator of
|
|
1725
|
-
[](
|
|
1746
|
+
"separator of embeddings (default \\n) for example \"<#sep#>\"",
|
|
1747
|
+
[](common_params & params, const std::string & value) {
|
|
1726
1748
|
params.embd_sep = value;
|
|
1727
1749
|
}
|
|
1728
1750
|
).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
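The pattern repeated throughout this hunk is always the same: a list of option names, a help string, and a handler lambda that mutates the params struct. Below is a small standalone sketch of that registration pattern; `Arg` and `Params` are simplified stand-ins for illustration only, not the actual `common_arg` / `common_params` types.

```cpp
// Standalone sketch of the option-registration pattern shown above; names are simplified stand-ins.
#include <functional>
#include <iostream>
#include <stdexcept>
#include <string>
#include <vector>

struct Params {
    int  i_chunk        = 0;
    bool process_output = false;
};

struct Arg {
    std::vector<std::string> names;                       // e.g. {"--chunk", "--from-chunk"}
    bool takes_value;                                     // does the flag consume the next token?
    std::string help;
    std::function<void(Params &, const std::string &)> handler;
};

int main() {
    Params params;
    std::vector<Arg> options;
    auto add_opt = [&](Arg arg) { options.push_back(std::move(arg)); };

    add_opt({{"--chunk", "--from-chunk"}, true, "start processing the input from chunk N",
             [](Params & p, const std::string & v) { p.i_chunk = std::stoi(v); }});
    add_opt({{"--process-output"}, false, "collect data for the output tensor",
             [](Params & p, const std::string &) { p.process_output = true; }});

    // a tiny argv-style walk over a fixed argument list
    const std::vector<std::string> argv = {"--chunk", "4", "--process-output"};
    for (size_t i = 0; i < argv.size(); ++i) {
        const Arg * match = nullptr;
        for (const auto & opt : options) {
            for (const auto & name : opt.names) {
                if (argv[i] == name) { match = &opt; break; }
            }
            if (match) break;
        }
        if (!match) {
            throw std::invalid_argument("unknown option: " + argv[i]);
        }
        std::string value;
        if (match->takes_value) {
            if (i + 1 >= argv.size()) throw std::invalid_argument("missing value for " + argv[i]);
            value = argv[++i];
        }
        match->handler(params, value);
    }

    std::cout << "i_chunk=" << params.i_chunk
              << " process_output=" << std::boolalpha << params.process_output << "\n";
}
```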
```diff
-    add_opt(
+    add_opt(common_arg(
         {"--host"}, "HOST",
-
-        [](
+        string_format("ip address to listen (default: %s)", params.hostname.c_str()),
+        [](common_params & params, const std::string & value) {
             params.hostname = value;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_HOST"));
-    add_opt(
+    add_opt(common_arg(
         {"--port"}, "PORT",
-
-        [](
+        string_format("port to listen (default: %d)", params.port),
+        [](common_params & params, int value) {
             params.port = value;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_PORT"));
-    add_opt(
+    add_opt(common_arg(
         {"--path"}, "PATH",
-
-        [](
+        string_format("path to serve static files from (default: %s)", params.public_path.c_str()),
+        [](common_params & params, const std::string & value) {
             params.public_path = value;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_STATIC_PATH"));
-    add_opt(
+    add_opt(common_arg(
+        {"--no-webui"},
+        string_format("Disable the Web UI (default: %s)", params.webui ? "enabled" : "disabled"),
+        [](common_params & params) {
+            params.webui = false;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_WEBUI"));
+    add_opt(common_arg(
         {"--embedding", "--embeddings"},
-
-        [](
+        string_format("restrict to only support embedding use case; use only with dedicated embedding models (default: %s)", params.embedding ? "enabled" : "disabled"),
+        [](common_params & params) {
             params.embedding = true;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_EMBEDDINGS"));
-    add_opt(
+    add_opt(common_arg(
         {"--reranking", "--rerank"},
-
-        [](
+        string_format("enable reranking endpoint on server (default: %s)", params.reranking ? "enabled" : "disabled"),
+        [](common_params & params) {
             params.reranking = true;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_RERANKING"));
-    add_opt(
+    add_opt(common_arg(
         {"--api-key"}, "KEY",
         "API key to use for authentication (default: none)",
-        [](
+        [](common_params & params, const std::string & value) {
             params.api_keys.push_back(value);
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_API_KEY"));
-    add_opt(
+    add_opt(common_arg(
         {"--api-key-file"}, "FNAME",
         "path to file containing API keys (default: none)",
-        [](
+        [](common_params & params, const std::string & value) {
             std::ifstream key_file(value);
             if (!key_file) {
-                throw std::runtime_error(
+                throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str()));
             }
             std::string key;
             while (std::getline(key_file, key)) {
```
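The server options above are also bound to environment variables via `.set_env(...)` (for example `LLAMA_ARG_HOST` and `LLAMA_ARG_PORT`). The sketch below only illustrates the general env-var fallback idea; `getenv_or` and `ServerParams` are illustrative names, and the real precedence handling lives in the llama.cpp argument parser, not here.

```cpp
// Standalone sketch of env-var fallback for server options; names are illustrative only.
#include <cstdlib>
#include <iostream>
#include <string>

struct ServerParams {
    std::string hostname = "127.0.0.1";
    int         port     = 8080;
};

// return the environment variable if set, otherwise the provided default
static std::string getenv_or(const char * name, const std::string & fallback) {
    const char * v = std::getenv(name);
    return v ? std::string(v) : fallback;
}

int main() {
    ServerParams params;
    // the idea behind set_env("LLAMA_ARG_HOST") / set_env("LLAMA_ARG_PORT"):
    // the environment only fills a field when no explicit CLI value was given
    params.hostname = getenv_or("LLAMA_ARG_HOST", params.hostname);
    params.port     = std::stoi(getenv_or("LLAMA_ARG_PORT", std::to_string(params.port)));
    std::cout << "listening on " << params.hostname << ":" << params.port << "\n";
}
```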
```diff
@@ -1785,70 +1814,74 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
             key_file.close();
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}));
-    add_opt(
+    add_opt(common_arg(
         {"--ssl-key-file"}, "FNAME",
         "path to file a PEM-encoded SSL private key",
-        [](
+        [](common_params & params, const std::string & value) {
             params.ssl_file_key = value;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_SSL_KEY_FILE"));
-    add_opt(
+    add_opt(common_arg(
         {"--ssl-cert-file"}, "FNAME",
         "path to file a PEM-encoded SSL certificate",
-        [](
+        [](common_params & params, const std::string & value) {
             params.ssl_file_cert = value;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_SSL_CERT_FILE"));
-    add_opt(
+    add_opt(common_arg(
         {"-to", "--timeout"}, "N",
-
-        [](
+        string_format("server read/write timeout in seconds (default: %d)", params.timeout_read),
+        [](common_params & params, int value) {
             params.timeout_read = value;
             params.timeout_write = value;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_TIMEOUT"));
-    add_opt(
+    add_opt(common_arg(
         {"--threads-http"}, "N",
-
-        [](
+        string_format("number of threads used to process HTTP requests (default: %d)", params.n_threads_http),
+        [](common_params & params, int value) {
             params.n_threads_http = value;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_THREADS_HTTP"));
-    add_opt(
-        {"
-        "
-        [](
-
-
-
-
-            std::string system_prompt;
-            std::copy(
-                std::istreambuf_iterator<char>(file),
-                std::istreambuf_iterator<char>(),
-                std::back_inserter(system_prompt)
-            );
-            params.system_prompt = system_prompt;
-        }
-    ).set_examples({LLAMA_EXAMPLE_SERVER}));
-    add_opt(llama_arg(
+    add_opt(common_arg(
+        {"--cache-reuse"}, "N",
+        string_format("min chunk size to attempt reusing from the cache via KV shifting (default: %d)", params.n_cache_reuse),
+        [](common_params & params, int value) {
+            params.n_cache_reuse = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CACHE_REUSE"));
+    add_opt(common_arg(
         {"--metrics"},
-
-        [](
+        string_format("enable prometheus compatible metrics endpoint (default: %s)", params.endpoint_metrics ? "enabled" : "disabled"),
+        [](common_params & params) {
             params.endpoint_metrics = true;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_METRICS"));
-    add_opt(
+    add_opt(common_arg(
+        {"--slots"},
+        string_format("enable slots monitoring endpoint (default: %s)", params.endpoint_slots ? "enabled" : "disabled"),
+        [](common_params & params) {
+            params.endpoint_slots = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_SLOTS"));
+    add_opt(common_arg(
+        {"--props"},
+        string_format("enable changing global properties via POST /props (default: %s)", params.endpoint_props ? "enabled" : "disabled"),
+        [](common_params & params) {
+            params.endpoint_props = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_PROPS"));
+    add_opt(common_arg(
         {"--no-slots"},
-
-        [](
+        "disables slots monitoring endpoint",
+        [](common_params & params) {
             params.endpoint_slots = false;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_ENDPOINT_SLOTS"));
-    add_opt(
+    add_opt(common_arg(
         {"--slot-save-path"}, "PATH",
         "path to save slot kv cache (default: disabled)",
-        [](
+        [](common_params & params, const std::string & value) {
             params.slot_save_path = value;
             // if doesn't end with DIRECTORY_SEPARATOR, add it
             if (!params.slot_save_path.empty() && params.slot_save_path[params.slot_save_path.size() - 1] != DIRECTORY_SEPARATOR) {
```
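The `--slot-save-path` handler above appends the directory separator when it is missing. A minimal standalone version of that normalization is shown below, with a hard-coded `'/'` standing in for the platform-dependent `DIRECTORY_SEPARATOR` macro.

```cpp
// Standalone sketch of the trailing-separator normalization used by --slot-save-path.
// '/' stands in for the platform-dependent DIRECTORY_SEPARATOR macro.
#include <iostream>
#include <string>

static std::string ensure_trailing_separator(std::string path) {
    const char separator = '/';
    if (!path.empty() && path.back() != separator) {
        path += separator;
    }
    return path;
}

int main() {
    std::cout << ensure_trailing_separator("cache/slots")  << "\n"; // -> "cache/slots/"
    std::cout << ensure_trailing_separator("cache/slots/") << "\n"; // unchanged
}
```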
```diff
@@ -1856,14 +1889,16 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
             }
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}));
-    add_opt(
+    add_opt(common_arg(
         {"--chat-template"}, "JINJA_TEMPLATE",
-
-
-
-
-
-
+        string_format(
+            "set custom jinja chat template (default: template taken from model's metadata)\n"
+            "if suffix/prefix are specified, template will be disabled\n"
+            "list of built-in templates:\n%s", list_builtin_chat_templates().c_str()
+        ),
+        [](common_params & params, const std::string & value) {
+            if (!common_chat_verify_template(value)) {
+                throw std::runtime_error(string_format(
                     "error: the supplied chat template is not supported: %s\n"
                     "note: llama.cpp does not use jinja parser, we only support commonly used templates\n",
                     value.c_str()
```
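The new `--chat-template` handler validates the value with `common_chat_verify_template()` before accepting it. The sketch below reproduces only the validate-or-throw shape of that handler; `looks_like_chat_template` is a placeholder heuristic, not the actual verification llama.cpp performs.

```cpp
// Standalone sketch of the validate-or-throw pattern; the check itself is a placeholder.
#include <iostream>
#include <stdexcept>
#include <string>

static bool looks_like_chat_template(const std::string & tmpl) {
    // placeholder heuristic: the real common_chat_verify_template() does far more
    return !tmpl.empty() && tmpl.find("{{") != std::string::npos;
}

static void set_chat_template(std::string & target, const std::string & value) {
    if (!looks_like_chat_template(value)) {
        throw std::runtime_error("error: the supplied chat template is not supported: " + value);
    }
    target = value;
}

int main() {
    std::string chat_template;
    set_chat_template(chat_template, "{{ messages }}"); // accepted
    try {
        set_chat_template(chat_template, "not a template");
    } catch (const std::exception & e) {
        std::cerr << e.what() << "\n";
    }
}
```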
```diff
@@ -1872,135 +1907,316 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
             params.chat_template = value;
         }
     ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CHAT_TEMPLATE"));
-    add_opt(
+    add_opt(common_arg(
         {"-sps", "--slot-prompt-similarity"}, "SIMILARITY",
-
-        [](
+        string_format("how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity),
+        [](common_params & params, const std::string & value) {
             params.slot_prompt_similarity = std::stof(value);
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}));
-    add_opt(
+    add_opt(common_arg(
         {"--lora-init-without-apply"},
-
-        [](
+        string_format("load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: %s)", params.lora_init_without_apply ? "enabled" : "disabled"),
+        [](common_params & params) {
             params.lora_init_without_apply = true;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}));
-    add_opt(
+    add_opt(common_arg(
         {"--simple-io"},
         "use basic IO for better compatibility in subprocesses and limited consoles",
-        [](
+        [](common_params & params) {
             params.simple_io = true;
         }
     ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL}));
-    add_opt(
-        {"-ld", "--logdir"}, "LOGDIR",
-        "path under which to save YAML logs (no logging if unset)",
-        [](gpt_params & params, const std::string & value) {
-            params.logdir = value;
-
-            if (params.logdir.back() != DIRECTORY_SEPARATOR) {
-                params.logdir += DIRECTORY_SEPARATOR;
-            }
-        }
-    ));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"--positive-file"}, "FNAME",
-
-        [](
+        string_format("positive prompts file, one prompt per line (default: '%s')", params.cvector_positive_file.c_str()),
+        [](common_params & params, const std::string & value) {
             params.cvector_positive_file = value;
         }
     ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR}));
-    add_opt(
+    add_opt(common_arg(
         {"--negative-file"}, "FNAME",
-
-        [](
+        string_format("negative prompts file, one prompt per line (default: '%s')", params.cvector_negative_file.c_str()),
+        [](common_params & params, const std::string & value) {
             params.cvector_negative_file = value;
         }
     ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR}));
-    add_opt(
+    add_opt(common_arg(
         {"--pca-batch"}, "N",
-
-        [](
+        string_format("batch size used for PCA. Larger batch runs faster, but uses more memory (default: %d)", params.n_pca_batch),
+        [](common_params & params, int value) {
             params.n_pca_batch = value;
         }
     ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR}));
-    add_opt(
+    add_opt(common_arg(
         {"--pca-iter"}, "N",
-
-        [](
+        string_format("number of iterations used for PCA (default: %d)", params.n_pca_iterations),
+        [](common_params & params, int value) {
             params.n_pca_iterations = value;
         }
     ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR}));
-    add_opt(
+    add_opt(common_arg(
         {"--method"}, "{pca, mean}",
         "dimensionality reduction method to be used (default: pca)",
-        [](
+        [](common_params & params, const std::string & value) {
             /**/ if (value == "pca") { params.cvector_dimre_method = DIMRE_METHOD_PCA; }
             else if (value == "mean") { params.cvector_dimre_method = DIMRE_METHOD_MEAN; }
             else { throw std::invalid_argument("invalid value"); }
         }
     ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR}));
-    add_opt(
+    add_opt(common_arg(
         {"--output-format"}, "{md,jsonl}",
         "output format for batched-bench results (default: md)",
-        [](
+        [](common_params & params, const std::string & value) {
             /**/ if (value == "jsonl") { params.batched_bench_output_jsonl = true; }
             else if (value == "md") { params.batched_bench_output_jsonl = false; }
             else { std::invalid_argument("invalid value"); }
         }
     ).set_examples({LLAMA_EXAMPLE_BENCH}));
```
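The `--method` and `--output-format` handlers above map a string onto an enum or flag and reject anything else. (Note that in the `--output-format` fallback branch, `std::invalid_argument` is constructed but not thrown, so an unrecognized value is effectively ignored there.) A minimal standalone equivalent of the string-to-enum dispatch, with simplified names:

```cpp
// Standalone sketch of the string-to-enum dispatch used by --method; names are simplified.
#include <iostream>
#include <stdexcept>
#include <string>

enum class DimreMethod { PCA, MEAN };

static DimreMethod parse_dimre_method(const std::string & value) {
    if (value == "pca")  { return DimreMethod::PCA;  }
    if (value == "mean") { return DimreMethod::MEAN; }
    throw std::invalid_argument("invalid value: " + value);
}

int main() {
    std::cout << (parse_dimre_method("mean") == DimreMethod::MEAN) << "\n"; // prints 1
    try {
        parse_dimre_method("svd");
    } catch (const std::invalid_argument & e) {
        std::cerr << e.what() << "\n";
    }
}
```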
```diff
-    add_opt(
+    add_opt(common_arg(
         {"--log-disable"},
         "Log disable",
-        [](
-
+        [](common_params &) {
+            common_log_pause(common_log_main());
         }
     ));
-    add_opt(
+    add_opt(common_arg(
         {"--log-file"}, "FNAME",
         "Log to file",
-        [](
-
+        [](common_params &, const std::string & value) {
+            common_log_set_file(common_log_main(), value.c_str());
         }
     ));
-    add_opt(
+    add_opt(common_arg(
         {"--log-colors"},
         "Enable colored logging",
-        [](
-
+        [](common_params &) {
+            common_log_set_colors(common_log_main(), true);
         }
     ).set_env("LLAMA_LOG_COLORS"));
-    add_opt(
+    add_opt(common_arg(
         {"-v", "--verbose", "--log-verbose"},
         "Set verbosity level to infinity (i.e. log all messages, useful for debugging)",
-        [](
+        [](common_params & params) {
             params.verbosity = INT_MAX;
-
+            common_log_set_verbosity_thold(INT_MAX);
         }
     ));
-    add_opt(
+    add_opt(common_arg(
         {"-lv", "--verbosity", "--log-verbosity"}, "N",
         "Set the verbosity threshold. Messages with a higher verbosity will be ignored.",
-        [](
+        [](common_params & params, int value) {
             params.verbosity = value;
-
+            common_log_set_verbosity_thold(value);
         }
     ).set_env("LLAMA_LOG_VERBOSITY"));
-    add_opt(
+    add_opt(common_arg(
         {"--log-prefix"},
         "Enable prefx in log messages",
-        [](
-
+        [](common_params &) {
+            common_log_set_prefix(common_log_main(), true);
        }
     ).set_env("LLAMA_LOG_PREFIX"));
-    add_opt(
+    add_opt(common_arg(
         {"--log-timestamps"},
         "Enable timestamps in log messages",
-        [](
-
+        [](common_params &) {
+            common_log_set_timestamps(common_log_main(), true);
         }
     ).set_env("LLAMA_LOG_TIMESTAMPS"));

```
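`-v` and `-lv` above set both `params.verbosity` and the logger's threshold via `common_log_set_verbosity_thold()`. The sketch below shows the general threshold-filtering idea with a toy logger; it is simplified and is not the `common_log` implementation.

```cpp
// Standalone sketch of verbosity-threshold filtering; simplified, not the common_log API.
#include <climits>
#include <iostream>
#include <string>

struct TinyLog {
    int verbosity_thold = 0; // messages above this level are dropped

    void log(int level, const std::string & msg) const {
        if (level > verbosity_thold) {
            return;
        }
        std::cerr << msg << "\n";
    }
};

int main() {
    TinyLog log;
    log.log(1, "debug detail");          // dropped: 1 > 0

    log.verbosity_thold = INT_MAX;       // what "-v / --log-verbose" effectively requests
    log.log(1, "debug detail");          // now printed

    log.verbosity_thold = 2;             // what "-lv 2 / --log-verbosity 2" requests
    log.log(3, "very chatty message");   // dropped again
}
```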
```diff
+    // speculative parameters
+    add_opt(common_arg(
+        {"-td", "--threads-draft"}, "N",
+        "number of threads to use during generation (default: same as --threads)",
+        [](common_params & params, int value) {
+            params.speculative.cpuparams.n_threads = value;
+            if (params.speculative.cpuparams.n_threads <= 0) {
+                params.speculative.cpuparams.n_threads = std::thread::hardware_concurrency();
+            }
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+    add_opt(common_arg(
+        {"-tbd", "--threads-batch-draft"}, "N",
+        "number of threads to use during batch and prompt processing (default: same as --threads-draft)",
+        [](common_params & params, int value) {
+            params.speculative.cpuparams_batch.n_threads = value;
+            if (params.speculative.cpuparams_batch.n_threads <= 0) {
+                params.speculative.cpuparams_batch.n_threads = std::thread::hardware_concurrency();
+            }
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+    add_opt(common_arg(
+        {"-Cd", "--cpu-mask-draft"}, "M",
+        "Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask)",
+        [](common_params & params, const std::string & mask) {
+            params.speculative.cpuparams.mask_valid = true;
+            if (!parse_cpu_mask(mask, params.speculative.cpuparams.cpumask)) {
+                throw std::invalid_argument("invalid cpumask");
+            }
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+    add_opt(common_arg(
+        {"-Crd", "--cpu-range-draft"}, "lo-hi",
+        "Ranges of CPUs for affinity. Complements --cpu-mask-draft",
+        [](common_params & params, const std::string & range) {
+            params.speculative.cpuparams.mask_valid = true;
+            if (!parse_cpu_range(range, params.speculative.cpuparams.cpumask)) {
+                throw std::invalid_argument("invalid range");
+            }
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+    add_opt(common_arg(
+        {"--cpu-strict-draft"}, "<0|1>",
+        "Use strict CPU placement for draft model (default: same as --cpu-strict)",
+        [](common_params & params, int value) {
+            params.speculative.cpuparams.strict_cpu = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+    add_opt(common_arg(
+        {"--prio-draft"}, "N",
+        string_format("set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.speculative.cpuparams.priority),
+        [](common_params & params, int prio) {
+            if (prio < 0 || prio > 3) {
+                throw std::invalid_argument("invalid value");
+            }
+            params.speculative.cpuparams.priority = (enum ggml_sched_priority) prio;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+    add_opt(common_arg(
+        {"--poll-draft"}, "<0|1>",
+        "Use polling to wait for draft model work (default: same as --poll])",
+        [](common_params & params, int value) {
+            params.speculative.cpuparams.poll = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+    add_opt(common_arg(
+        {"-Cbd", "--cpu-mask-batch-draft"}, "M",
+        "Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask)",
+        [](common_params & params, const std::string & mask) {
+            params.speculative.cpuparams_batch.mask_valid = true;
+            if (!parse_cpu_mask(mask, params.speculative.cpuparams_batch.cpumask)) {
+                throw std::invalid_argument("invalid cpumask");
+            }
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+    add_opt(common_arg(
+        {"-Crbd", "--cpu-range-batch-draft"}, "lo-hi",
+        "Ranges of CPUs for affinity. Complements --cpu-mask-draft-batch)",
+        [](common_params & params, const std::string & range) {
+            params.speculative.cpuparams_batch.mask_valid = true;
+            if (!parse_cpu_range(range, params.speculative.cpuparams_batch.cpumask)) {
+                throw std::invalid_argument("invalid cpumask");
+            }
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+    add_opt(common_arg(
+        {"--cpu-strict-batch-draft"}, "<0|1>",
+        "Use strict CPU placement for draft model (default: --cpu-strict-draft)",
+        [](common_params & params, int value) {
+            params.speculative.cpuparams_batch.strict_cpu = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+    add_opt(common_arg(
+        {"--prio-batch-draft"}, "N",
+        string_format("set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.speculative.cpuparams_batch.priority),
+        [](common_params & params, int prio) {
+            if (prio < 0 || prio > 3) {
+                throw std::invalid_argument("invalid value");
+            }
+            params.speculative.cpuparams_batch.priority = (enum ggml_sched_priority) prio;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+    add_opt(common_arg(
+        {"--poll-batch-draft"}, "<0|1>",
+        "Use polling to wait for draft model work (default: --poll-draft)",
+        [](common_params & params, int value) {
+            params.speculative.cpuparams_batch.poll = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+    add_opt(common_arg(
+        {"--draft-max", "--draft", "--draft-n"}, "N",
+        string_format("number of tokens to draft for speculative decoding (default: %d)", params.speculative.n_max),
+        [](common_params & params, int value) {
+            params.speculative.n_max = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_MAX"));
+    add_opt(common_arg(
+        {"--draft-min", "--draft-n-min"}, "N",
+        string_format("minimum number of draft tokens to use for speculative decoding (default: %d)", params.speculative.n_min),
+        [](common_params & params, int value) {
+            params.speculative.n_min = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_MIN"));
+    add_opt(common_arg(
+        {"--draft-p-split"}, "P",
+        string_format("speculative decoding split probability (default: %.1f)", (double)params.speculative.p_split),
+        [](common_params & params, const std::string & value) {
+            params.speculative.p_split = std::stof(value);
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}).set_env("LLAMA_ARG_DRAFT_P_SPLIT"));
+    add_opt(common_arg(
+        {"--draft-p-min"}, "P",
+        string_format("minimum speculative decoding probability (greedy) (default: %.1f)", (double)params.speculative.p_min),
+        [](common_params & params, const std::string & value) {
+            params.speculative.p_min = std::stof(value);
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_P_MIN"));
+    add_opt(common_arg(
+        {"-cd", "--ctx-size-draft"}, "N",
+        string_format("size of the prompt context for the draft model (default: %d, 0 = loaded from model)", params.speculative.n_ctx),
+        [](common_params & params, int value) {
+            params.speculative.n_ctx = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CTX_SIZE_DRAFT"));
+    add_opt(common_arg(
+        {"-devd", "--device-draft"}, "<dev1,dev2,..>",
+        "comma-separated list of devices to use for offloading the draft model (none = don't offload)\n"
+        "use --list-devices to see a list of available devices",
+        [](common_params & params, const std::string & value) {
+            params.speculative.devices = parse_device_list(value);
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
+    add_opt(common_arg(
+        {"-ngld", "--gpu-layers-draft", "--n-gpu-layers-draft"}, "N",
+        "number of layers to store in VRAM for the draft model",
+        [](common_params & params, int value) {
+            params.speculative.n_gpu_layers = value;
+            if (!llama_supports_gpu_offload()) {
+                fprintf(stderr, "warning: no usable GPU found, --gpu-layers-draft option will be ignored\n");
+                fprintf(stderr, "warning: one possible reason is that llama.cpp was compiled without GPU support\n");
+                fprintf(stderr, "warning: consult docs/build.md for compilation instructions\n");
+            }
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_N_GPU_LAYERS_DRAFT"));
+    add_opt(common_arg(
+        {"-md", "--model-draft"}, "FNAME",
+        "draft model for speculative decoding (default: unused)",
+        [](common_params & params, const std::string & value) {
+            params.speculative.model = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODEL_DRAFT"));
```
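`--cpu-range-draft` and `--cpu-range-batch-draft` above pass a `"lo-hi"` string to `parse_cpu_range()`. The sketch below shows one way such a range can be parsed into a fixed-size mask; `parse_range_into_mask` and `MAX_CPUS` are illustrative stand-ins, not the llama.cpp helpers.

```cpp
// Standalone sketch of "lo-hi" range parsing into a CPU mask; stand-in for parse_cpu_range().
#include <array>
#include <iostream>
#include <stdexcept>
#include <string>

constexpr size_t MAX_CPUS = 512;

static bool parse_range_into_mask(const std::string & range, std::array<bool, MAX_CPUS> & mask) {
    const size_t dash = range.find('-');
    if (dash == std::string::npos) {
        return false;
    }
    const size_t lo = std::stoul(range.substr(0, dash));
    const size_t hi = std::stoul(range.substr(dash + 1));
    if (lo > hi || hi >= MAX_CPUS) {
        return false;
    }
    for (size_t i = lo; i <= hi; ++i) {
        mask[i] = true;
    }
    return true;
}

int main() {
    std::array<bool, MAX_CPUS> mask{};
    if (!parse_range_into_mask("2-5", mask)) {
        throw std::invalid_argument("invalid range");
    }
    for (size_t i = 0; i < 8; ++i) {
        std::cout << mask[i];   // prints 00111100
    }
    std::cout << "\n";
}
```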
```diff
+
+    add_opt(common_arg(
+        {"-mv", "--model-vocoder"}, "FNAME",
+        "vocoder model for audio generation (default: unused)",
+        [](common_params & params, const std::string & value) {
+            params.vocoder.model = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_SERVER}));
+
+    // model-specific
+    add_opt(common_arg(
+        {"--tts-oute-default"},
+        string_format("use default OuteTTS models (note: can download weights from the internet)"),
+        [](common_params & params) {
+            params.hf_repo = "OuteAI/OuteTTS-0.2-500M-GGUF";
+            params.hf_file = "OuteTTS-0.2-500M-Q8_0.gguf";
+            params.vocoder.hf_repo = "ggml-org/WavTokenizer";
+            params.vocoder.hf_file = "WavTokenizer-Large-75-F16.gguf";
+        }
+    ).set_examples({LLAMA_EXAMPLE_TTS}));
+
     return ctx_arg;
 }
```
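The new `--draft-max`, `--draft-min`, and `--draft-p-min` options registered above only set knobs in `params.speculative`. The sketch below is a purely conceptual illustration of what those knobs control during a drafting step; it is not llama.cpp's speculative-decoding implementation, and the confidence values are made up for the example.

```cpp
// Conceptual sketch of what the --draft-max / --draft-min / --draft-p-min knobs control.
// NOT llama.cpp's speculative-decoding implementation, just the draft/stop shape.
#include <cstdio>
#include <vector>

struct SpeculativeParams {
    int   n_max = 16;    // --draft-max: upper bound on drafted tokens per step
    int   n_min = 5;     // --draft-min: below this, drafting is not attempted
    float p_min = 0.9f;  // --draft-p-min: stop drafting when confidence drops below this
};

// pretend per-token confidences coming from a draft model
static std::vector<float> fake_draft_confidences() {
    return {0.99f, 0.97f, 0.95f, 0.93f, 0.91f, 0.88f, 0.97f};
}

int main() {
    SpeculativeParams spec;
    const auto conf = fake_draft_confidences();

    int drafted = 0;
    for (float p : conf) {
        if (drafted >= spec.n_max || p < spec.p_min) {
            break;  // stop drafting: budget exhausted or confidence too low
        }
        ++drafted;
    }
    if (drafted < spec.n_min) {
        drafted = 0;  // too few tokens to be worth verifying as a batch
    }
    std::printf("drafted %d token(s) this step\n", drafted);
}
```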