@fugood/llama.node 0.2.3 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +6 -3
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +8 -1
- package/package.json +3 -3
- package/patches/llama.patch +12 -12
- package/src/DetokenizeWorker.cpp +1 -1
- package/src/LlamaContext.cpp +33 -1
- package/src/LlamaContext.h +1 -0
- package/src/llama.cpp/.github/workflows/bench.yml +310 -0
- package/src/llama.cpp/.github/workflows/build.yml +1315 -0
- package/src/llama.cpp/.github/workflows/close-issue.yml +23 -0
- package/src/llama.cpp/.github/workflows/docker.yml +116 -0
- package/src/llama.cpp/.github/workflows/editorconfig.yml +27 -0
- package/src/llama.cpp/.github/workflows/gguf-publish.yml +44 -0
- package/src/llama.cpp/.github/workflows/labeler.yml +17 -0
- package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +65 -0
- package/src/llama.cpp/.github/workflows/nix-ci.yml +72 -0
- package/src/llama.cpp/.github/workflows/nix-flake-update.yml +22 -0
- package/src/llama.cpp/.github/workflows/nix-publish-flake.yml +36 -0
- package/src/llama.cpp/.github/workflows/python-check-requirements.yml +35 -0
- package/src/llama.cpp/.github/workflows/python-lint.yml +23 -0
- package/src/llama.cpp/.github/workflows/python-type-check.yml +38 -0
- package/src/llama.cpp/.github/workflows/server.yml +183 -0
- package/src/llama.cpp/CMakeLists.txt +91 -1245
- package/src/llama.cpp/cmake/arm64-windows-llvm.cmake +1 -1
- package/src/llama.cpp/cmake/build-info.cmake +58 -0
- package/src/llama.cpp/cmake/git-vars.cmake +22 -0
- package/src/llama.cpp/common/CMakeLists.txt +4 -3
- package/src/llama.cpp/common/build-info.cpp.in +4 -0
- package/src/llama.cpp/common/common.cpp +1116 -877
- package/src/llama.cpp/common/common.h +191 -77
- package/src/llama.cpp/common/grammar-parser.cpp +118 -31
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +346 -65
- package/src/llama.cpp/common/log.h +1 -1
- package/src/llama.cpp/common/ngram-cache.h +10 -3
- package/src/llama.cpp/common/sampling.cpp +19 -10
- package/src/llama.cpp/docs/build.md +353 -0
- package/src/llama.cpp/examples/CMakeLists.txt +22 -22
- package/src/llama.cpp/examples/baby-llama/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +6 -6
- package/src/llama.cpp/examples/batched/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/batched/batched.cpp +52 -55
- package/src/llama.cpp/examples/batched-bench/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +20 -72
- package/src/llama.cpp/examples/benchmark/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/chat-13B.bat +57 -0
- package/src/llama.cpp/examples/convert-llama2c-to-ggml/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/{finetune → cvector-generator}/CMakeLists.txt +2 -2
- package/src/llama.cpp/examples/cvector-generator/completions.txt +582 -0
- package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +503 -0
- package/src/llama.cpp/examples/cvector-generator/mean.hpp +48 -0
- package/src/llama.cpp/examples/cvector-generator/negative.txt +4 -0
- package/src/llama.cpp/examples/cvector-generator/pca.hpp +325 -0
- package/src/llama.cpp/examples/cvector-generator/positive.txt +4 -0
- package/src/llama.cpp/examples/deprecation-warning/deprecation-warning.cpp +35 -0
- package/src/llama.cpp/examples/embedding/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/embedding/embedding.cpp +94 -46
- package/src/llama.cpp/examples/eval-callback/CMakeLists.txt +2 -2
- package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +4 -6
- package/src/llama.cpp/examples/export-lora/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +344 -386
- package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +2 -2
- package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +30 -25
- package/src/llama.cpp/examples/gguf/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gguf/gguf.cpp +5 -0
- package/src/llama.cpp/examples/gguf-hash/CMakeLists.txt +15 -0
- package/src/llama.cpp/examples/gguf-hash/deps/rotate-bits/rotate-bits.h +46 -0
- package/src/llama.cpp/examples/gguf-hash/deps/sha1/sha1.c +295 -0
- package/src/llama.cpp/examples/gguf-hash/deps/sha1/sha1.h +52 -0
- package/src/llama.cpp/examples/gguf-hash/deps/sha256/sha256.c +221 -0
- package/src/llama.cpp/examples/gguf-hash/deps/sha256/sha256.h +24 -0
- package/src/llama.cpp/examples/gguf-hash/deps/xxhash/xxhash.c +42 -0
- package/src/llama.cpp/examples/gguf-hash/deps/xxhash/xxhash.h +7093 -0
- package/src/llama.cpp/examples/gguf-hash/gguf-hash.cpp +693 -0
- package/src/llama.cpp/examples/gguf-split/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +3 -3
- package/src/llama.cpp/examples/gritlm/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +6 -2
- package/src/llama.cpp/examples/imatrix/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +137 -176
- package/src/llama.cpp/examples/infill/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/infill/infill.cpp +38 -153
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +175 -94
- package/src/llama.cpp/examples/llama.android/app/build.gradle.kts +65 -0
- package/src/llama.cpp/examples/llama.android/build.gradle.kts +6 -0
- package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +68 -0
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/CMakeLists.txt +11 -7
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +2 -2
- package/src/llama.cpp/examples/llama.android/settings.gradle.kts +18 -0
- package/src/llama.cpp/examples/llava/CMakeLists.txt +6 -5
- package/src/llama.cpp/examples/llava/android/build_64.sh +8 -0
- package/src/llama.cpp/examples/llava/clip.cpp +23 -14
- package/src/llama.cpp/examples/llava/llava-cli.cpp +8 -6
- package/src/llama.cpp/examples/llava/requirements.txt +3 -2
- package/src/llama.cpp/examples/lookahead/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +2 -1
- package/src/llama.cpp/examples/lookup/CMakeLists.txt +4 -4
- package/src/llama.cpp/examples/lookup/lookup-create.cpp +2 -0
- package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
- package/src/llama.cpp/examples/lookup/lookup-stats.cpp +2 -2
- package/src/llama.cpp/examples/lookup/lookup.cpp +1 -1
- package/src/llama.cpp/examples/main/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/main/main.cpp +98 -75
- package/src/llama.cpp/examples/main-cmake-pkg/CMakeLists.txt +4 -5
- package/src/llama.cpp/examples/parallel/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/parallel/parallel.cpp +2 -1
- package/src/llama.cpp/examples/passkey/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/passkey/passkey.cpp +23 -43
- package/src/llama.cpp/examples/perplexity/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +13 -10
- package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/quantize/quantize.cpp +37 -34
- package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +1 -1
- package/src/llama.cpp/examples/retrieval/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/retrieval/retrieval.cpp +26 -77
- package/src/llama.cpp/examples/save-load-state/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +14 -7
- package/src/llama.cpp/examples/server/CMakeLists.txt +26 -2
- package/src/llama.cpp/examples/server/server.cpp +274 -671
- package/src/llama.cpp/examples/server/tests/requirements.txt +2 -2
- package/src/llama.cpp/examples/server/utils.hpp +28 -29
- package/src/llama.cpp/examples/simple/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/simple/simple.cpp +21 -29
- package/src/llama.cpp/examples/speculative/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/speculative/speculative.cpp +2 -1
- package/src/llama.cpp/examples/sycl/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/sycl/build.sh +23 -0
- package/src/llama.cpp/examples/sycl/run-llama2.sh +36 -0
- package/src/llama.cpp/examples/sycl/win-build-sycl.bat +33 -0
- package/src/llama.cpp/examples/sycl/win-run-llama2.bat +9 -0
- package/src/llama.cpp/examples/tokenize/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/tokenize/tokenize.cpp +16 -2
- package/src/llama.cpp/ggml/CMakeLists.txt +253 -0
- package/src/llama.cpp/{cmake → ggml/cmake}/FindSIMD.cmake +6 -6
- package/src/llama.cpp/{ggml-backend.h → ggml/include/ggml-backend.h} +22 -17
- package/src/llama.cpp/ggml/include/ggml-blas.h +23 -0
- package/src/llama.cpp/ggml/include/ggml-cann.h +125 -0
- package/src/llama.cpp/{ggml-cuda.h → ggml/include/ggml-cuda.h} +3 -0
- package/src/llama.cpp/{ggml-metal.h → ggml/include/ggml-metal.h} +1 -2
- package/src/llama.cpp/{ggml-sycl.h → ggml/include/ggml-sycl.h} +3 -10
- package/src/llama.cpp/{ggml.h → ggml/include/ggml.h} +80 -85
- package/src/llama.cpp/ggml/src/CMakeLists.txt +1329 -0
- package/src/llama.cpp/ggml/src/ggml-aarch64.c +2193 -0
- package/src/llama.cpp/ggml/src/ggml-aarch64.h +39 -0
- package/src/llama.cpp/{ggml-alloc.c → ggml/src/ggml-alloc.c} +100 -49
- package/src/llama.cpp/{ggml-backend-impl.h → ggml/src/ggml-backend-impl.h} +20 -8
- package/src/llama.cpp/{ggml-backend.c → ggml/src/ggml-backend.c} +307 -167
- package/src/llama.cpp/ggml/src/ggml-blas.cpp +367 -0
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +198 -0
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +230 -0
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +2944 -0
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +592 -0
- package/src/llama.cpp/ggml/src/ggml-cann/common.h +282 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +32 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +17 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +223 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +186 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +180 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +193 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +191 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +208 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +206 -0
- package/src/llama.cpp/ggml/src/ggml-cann.cpp +2023 -0
- package/src/llama.cpp/{ggml-common.h → ggml/src/ggml-common.h} +41 -7
- package/src/llama.cpp/{ggml-impl.h → ggml/src/ggml-impl.h} +113 -9
- package/src/llama.cpp/{ggml-kompute.cpp → ggml/src/ggml-kompute.cpp} +33 -18
- package/src/llama.cpp/{ggml-quants.c → ggml/src/ggml-quants.c} +1460 -940
- package/src/llama.cpp/{ggml-quants.h → ggml/src/ggml-quants.h} +19 -20
- package/src/llama.cpp/{ggml-rpc.cpp → ggml/src/ggml-rpc.cpp} +95 -72
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +27 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +53 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +355 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +195 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/concat.hpp +21 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +547 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +27 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +698 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +1023 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.hpp +27 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +3011 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +3031 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/mmq.hpp +33 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +1027 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.hpp +27 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +374 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +35 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +66 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +275 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.hpp +22 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +251 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/softmax.hpp +24 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +1140 -0
- package/src/llama.cpp/ggml/src/ggml-sycl.cpp +5314 -0
- package/src/llama.cpp/{ggml-vulkan.cpp → ggml/src/ggml-vulkan.cpp} +1781 -1868
- package/src/llama.cpp/{ggml.c → ggml/src/ggml.c} +1245 -2087
- package/src/llama.cpp/{sgemm.cpp → ggml/src/llamafile/sgemm.cpp} +21 -24
- package/src/llama.cpp/{sgemm.h → ggml/src/llamafile/sgemm.h} +1 -1
- package/src/llama.cpp/ggml/src/vulkan-shaders/CMakeLists.txt +5 -0
- package/src/llama.cpp/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp +552 -0
- package/src/llama.cpp/{llama.h → include/llama.h} +175 -100
- package/src/llama.cpp/models/.editorconfig +1 -0
- package/src/llama.cpp/models/ggml-vocab-aquila.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-baichuan.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-command-r.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-command-r.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-command-r.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-falcon.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-falcon.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-falcon.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-gpt-neox.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-mpt.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-mpt.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-mpt.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-phi-3.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-phi-3.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-phi-3.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-qwen2.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-qwen2.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-qwen2.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-refact.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-refact.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-refact.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-starcoder.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-starcoder.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-starcoder.gguf.out +46 -0
- package/src/llama.cpp/pocs/vdot/CMakeLists.txt +2 -2
- package/src/llama.cpp/requirements/requirements-all.txt +12 -0
- package/src/llama.cpp/requirements/requirements-compare-llama-bench.txt +2 -0
- package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf.txt +3 -0
- package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf_update.txt +3 -0
- package/src/llama.cpp/requirements/{requirements-convert.txt → requirements-convert_legacy_llama.txt} +1 -1
- package/src/llama.cpp/requirements/requirements-convert_llama_ggml_to_gguf.txt +1 -0
- package/src/llama.cpp/requirements/requirements-convert_lora_to_gguf.txt +2 -0
- package/src/llama.cpp/requirements/requirements-pydantic.txt +3 -0
- package/src/llama.cpp/requirements/requirements-test-tokenizer-random.txt +1 -0
- package/src/llama.cpp/requirements.txt +5 -4
- package/src/llama.cpp/scripts/build-info.sh +30 -0
- package/src/llama.cpp/scripts/install-oneapi.bat +19 -0
- package/src/llama.cpp/src/CMakeLists.txt +33 -0
- package/src/llama.cpp/src/llama-grammar.cpp +539 -0
- package/src/llama.cpp/src/llama-grammar.h +39 -0
- package/src/llama.cpp/src/llama-impl.h +26 -0
- package/src/llama.cpp/src/llama-sampling.cpp +635 -0
- package/src/llama.cpp/src/llama-sampling.h +56 -0
- package/src/llama.cpp/src/llama-vocab.cpp +1721 -0
- package/src/llama.cpp/src/llama-vocab.h +130 -0
- package/src/llama.cpp/{llama.cpp → src/llama.cpp} +5979 -5260
- package/src/llama.cpp/{unicode-data.cpp → src/unicode-data.cpp} +851 -802
- package/src/llama.cpp/{unicode.cpp → src/unicode.cpp} +52 -30
- package/src/llama.cpp/{unicode.h → src/unicode.h} +5 -1
- package/src/llama.cpp/tests/CMakeLists.txt +19 -20
- package/src/llama.cpp/tests/test-backend-ops.cpp +245 -67
- package/src/llama.cpp/tests/test-chat-template.cpp +57 -3
- package/src/llama.cpp/tests/test-double-float.cpp +2 -2
- package/src/llama.cpp/tests/test-grad0.cpp +2 -2
- package/src/llama.cpp/tests/test-grammar-integration.cpp +978 -31
- package/src/llama.cpp/tests/test-grammar-parser.cpp +423 -158
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +508 -135
- package/src/llama.cpp/tests/test-llama-grammar.cpp +15 -9
- package/src/llama.cpp/tests/test-quantize-fns.cpp +1 -1
- package/src/llama.cpp/tests/test-quantize-perf.cpp +1 -1
- package/src/llama.cpp/tests/test-rope.cpp +3 -4
- package/src/llama.cpp/tests/test-sampling.cpp +5 -5
- package/src/llama.cpp/tests/test-tokenizer-0.cpp +6 -6
- package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +20 -15
- package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +22 -11
- package/bin/darwin/arm64/default.metallib +0 -0
- package/bin/darwin/x64/default.metallib +0 -0
- package/src/llama.cpp/examples/beam-search/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/beam-search/beam-search.cpp +0 -188
- package/src/llama.cpp/examples/finetune/finetune.cpp +0 -1862
- package/src/llama.cpp/examples/llama.android/llama/CMakeLists.txt +0 -55
- package/src/llama.cpp/examples/train-text-from-scratch/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/train-text-from-scratch/train-text-from-scratch.cpp +0 -1253
- package/src/llama.cpp/ggml-opencl.cpp +0 -2305
- package/src/llama.cpp/ggml-opencl.h +0 -36
- package/src/llama.cpp/ggml-sycl.cpp +0 -17340
- package/src/llama.cpp/ggml-vulkan-shaders.hpp +0 -81211
- package/src/llama.cpp/requirements/requirements-convert-hf-to-gguf-update.txt +0 -2
- package/src/llama.cpp/requirements/requirements-convert-hf-to-gguf.txt +0 -2
- package/src/llama.cpp/requirements/requirements-convert-llama-ggml-to-gguf.txt +0 -1
- package/src/llama.cpp/scripts/gen-build-info-cpp.cmake +0 -24
- /package/src/llama.cpp/{ggml-alloc.h → ggml/include/ggml-alloc.h} +0 -0
- /package/src/llama.cpp/{ggml-kompute.h → ggml/include/ggml-kompute.h} +0 -0
- /package/src/llama.cpp/{ggml-rpc.h → ggml/include/ggml-rpc.h} +0 -0
- /package/src/llama.cpp/{ggml-vulkan.h → ggml/include/ggml-vulkan.h} +0 -0
- /package/src/llama.cpp/{unicode-data.h → src/unicode-data.h} +0 -0
--- package/src/llama.cpp/examples/server/server.cpp (0.2.3)
+++ package/src/llama.cpp/examples/server/server.cpp (0.3.1)
@@ -17,9 +17,20 @@
 #include "json.hpp"
 
 // auto generated files (update with ./deps.sh)
+#include "colorthemes.css.hpp"
+#include "style.css.hpp"
+#include "theme-beeninorder.css.hpp"
+#include "theme-ketivah.css.hpp"
+#include "theme-mangotango.css.hpp"
+#include "theme-playground.css.hpp"
+#include "theme-polarnight.css.hpp"
+#include "theme-snowstorm.css.hpp"
 #include "index.html.hpp"
+#include "index-new.html.hpp"
 #include "index.js.hpp"
 #include "completion.js.hpp"
+#include "system-prompts.js.hpp"
+#include "prompt-formats.js.hpp"
 #include "json-schema-to-grammar.mjs.hpp"
 
 #include <atomic>
@@ -112,29 +123,6 @@ struct slot_params {
     json input_suffix;
 };
 
-struct server_params {
-    int32_t port           = 8080;
-    int32_t read_timeout   = 600;
-    int32_t write_timeout  = 600;
-    int32_t n_threads_http = -1;
-
-    std::string hostname      = "127.0.0.1";
-    std::string public_path   = "";
-    std::string chat_template = "";
-    std::string system_prompt = "";
-
-    std::vector<std::string> api_keys;
-
-#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
-    std::string ssl_key_file = "";
-    std::string ssl_cert_file = "";
-#endif
-
-    bool slots_endpoint   = true;
-    bool metrics_endpoint = false;
-    std::string slot_save_path;
-};
-
 struct server_slot {
     int id;
     int id_task = -1;
@@ -159,7 +147,7 @@ struct server_slot {
     int32_t n_prompt_tokens           = 0;
     int32_t n_prompt_tokens_processed = 0;
 
-    json prompt;
+    json prompt; // can be either a string, array of strings or array of token ids
 
     // when a task is submitted, we first tokenize the prompt and store it here
     std::vector<llama_token> prompt_tokens;
@@ -659,6 +647,9 @@ struct server_context {
 
     server_metrics metrics;
 
+    // Necessary similarity of prompt for slot selection
+    float slot_prompt_similarity = 0.0f;
+
     ~server_context() {
         if (ctx) {
             llama_free(ctx);
@@ -746,6 +737,8 @@ struct server_context {
             slot.ga_n = ga_n;
             slot.ga_w = ga_w;
 
+            slot.sparams = params.sparams;
+
             slot.reset();
 
             slots.push_back(slot);
@@ -807,29 +800,94 @@ struct server_context {
         return prompt_tokens;
     }
 
-    server_slot * get_slot(int id) {
-        int64_t t_last = ggml_time_us();
-
-        server_slot * last_used = nullptr;
-
+    server_slot * get_slot_by_id(int id) {
         for (server_slot & slot : slots) {
-            if (slot.id == id && slot.available()) {
+            if (slot.id == id) {
                 return &slot;
             }
+        }
+
+        return nullptr;
+    }
+
+    server_slot * get_available_slot(const std::string & prompt) {
+        server_slot * ret = nullptr;
+
+        // find the slot that has at least n% prompt similarity
+        if (ret == nullptr && slot_prompt_similarity != 0.0f && !prompt.empty()) {
+            int max_lcp_len = 0;
+            float similarity = 0;
+
+            for (server_slot & slot : slots) {
+                // skip the slot if it is not available
+                if (!slot.available()) {
+                    continue;
+                }
+
+                // skip the slot if it does not contains prompt
+                if (!slot.prompt.is_string()) {
+                    continue;
+                }
+
+                // current slot's prompt
+                std::string slot_prompt = slot.prompt.get<std::string>();
+
+                // length of the current slot's prompt
+                int slot_prompt_len = slot_prompt.size();
+
+                // length of the Longest Common Prefix between the current slot's prompt and the input prompt
+                int lcp_len = common_part(slot_prompt, prompt);
+
+                // fraction of the common substring length compared to the current slot's prompt length
+                similarity = static_cast<float>(lcp_len) / slot_prompt_len;
+
+                // select the current slot if the criteria match
+                if (lcp_len > max_lcp_len && similarity > slot_prompt_similarity) {
+                    max_lcp_len = lcp_len;
+                    ret = &slot;
+                }
+            }
+
+            if (ret != nullptr) {
+                LOG_VERBOSE("selected slot by lcp similarity", {
+                    {"id_slot", ret->id},
+                    {"max_lcp_len", max_lcp_len},
+                    {"similarity", similarity},
+                });
+            }
+        }
+
+        // find the slot that has been least recently used
+        if (ret == nullptr) {
+            int64_t t_last = ggml_time_us();
+            for (server_slot & slot : slots) {
+                // skip the slot if it is not available
+                if (!slot.available()) {
+                    continue;
+                }
+
+                // select the current slot if the criteria match
+                if (slot.t_last_used < t_last) {
+                    t_last = slot.t_last_used;
+                    ret = &slot;
+                }
+            }
 
-            // among all available slots, find the one that has been least recently used
-            if (slot.available() && slot.t_last_used < t_last) {
-                last_used = &slot;
-                t_last    = slot.t_last_used;
+            if (ret != nullptr) {
+                LOG_VERBOSE("selected slot by lru", {
+                    {"id_slot", ret->id},
+                    {"t_last", t_last},
+                });
             }
         }
 
-        return last_used;
+        return ret;
     }
 
     bool launch_slot_with_task(server_slot & slot, const server_task & task) {
         slot_params default_params;
-        llama_sampling_params default_sparams;
+        // Sampling parameter defaults are loaded from the global server context (but individual requests can still override them)
+        llama_sampling_params default_sparams = params.sparams;
         auto & data = task.data;
 
         if (data.count("__oaicompat") != 0) {
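
The interesting part of the hunk above is the new get_available_slot: when slot_prompt_similarity is non-zero, the server prefers to reuse the slot whose cached prompt shares the longest common prefix with the incoming prompt (so its KV cache can be reused), and only falls back to least-recently-used selection otherwise. A minimal standalone C++ sketch of that scoring rule follows; the Slot struct is a hypothetical stand-in for server_slot, and lcp_len plays the role of the server's common_part helper:

#include <cstddef>
#include <string>
#include <vector>

struct Slot {
    int         id;
    std::string cached_prompt; // prompt whose KV cache this slot still holds
    bool        available;
};

// length of the longest common prefix of two strings
static std::size_t lcp_len(const std::string & a, const std::string & b) {
    std::size_t n = 0;
    while (n < a.size() && n < b.size() && a[n] == b[n]) {
        n++;
    }
    return n;
}

// pick the available slot whose cached prompt best matches `prompt`,
// requiring similarity = lcp / cached_prompt_len to exceed `min_similarity`
static Slot * pick_slot(std::vector<Slot> & slots, const std::string & prompt, float min_similarity) {
    Slot *      best     = nullptr;
    std::size_t best_lcp = 0;
    for (Slot & s : slots) {
        if (!s.available || s.cached_prompt.empty()) {
            continue;
        }
        const std::size_t lcp        = lcp_len(s.cached_prompt, prompt);
        const float       similarity = float(lcp) / float(s.cached_prompt.size());
        if (lcp > best_lcp && similarity > min_similarity) {
            best_lcp = lcp;
            best     = &s;
        }
    }
    return best; // caller falls back to LRU selection when this is nullptr
}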
@@ -900,16 +958,19 @@ struct server_context {
         slot.params.input_suffix = json_value(data, "input_suffix", default_params.input_suffix);
 
         // get prompt
-        {
+        if (!task.infill) {
             const auto & prompt = data.find("prompt");
             if (prompt == data.end()) {
-                send_error(task, "Either \"prompt\" or \"messages\" must be provided", ERROR_TYPE_INVALID_REQUEST);
+                send_error(task, "\"prompt\" must be provided", ERROR_TYPE_INVALID_REQUEST);
                 return false;
-            } else {
-                slot.prompt = *prompt;
             }
-            if (slot.prompt.is_array() && slot.prompt.size() == 0) {
-                send_error(task, "\"prompt\" cannot be an empty array", ERROR_TYPE_INVALID_REQUEST);
+
+            if ((prompt->is_string()) ||
+                (prompt->is_array() && prompt->size() == 1 && prompt->at(0).is_string()) ||
+                (prompt->is_array() && !prompt->empty() && prompt->at(0).is_number_integer())) {
+                slot.prompt = *prompt;
+            } else {
+                send_error(task, "\"prompt\" must be a string or an array of integers", ERROR_TYPE_INVALID_REQUEST);
                 return false;
             }
         }
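
Worth noting from the hunk above: the rewritten check accepts exactly three prompt shapes — a plain string, a one-element array containing a string, or a non-empty array of integer token ids — and rejects anything else (for example an array of several strings) with an invalid-request error. Infill tasks now skip this block entirely, since their prompt is assembled from input_prefix/input_suffix instead.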
@@ -1121,7 +1182,7 @@ struct server_context {
 
     bool process_token(completion_token_output & result, server_slot & slot) {
         // remember which tokens were sampled - used for repetition penalties during sampling
-        const std::string token_str = llama_token_to_piece(ctx, result.tok, false);
+        const std::string token_str = llama_token_to_piece(ctx, result.tok, params.special);
         slot.sampled = result.tok;
 
         // search stop word and delete it
@@ -1250,7 +1311,7 @@ struct server_context {
     }
 
     json get_formated_generation(const server_slot & slot) const {
-        const auto eos_bias …
+        const auto eos_bias = slot.sparams.logit_bias.find(llama_token_eos(model));
         const bool ignore_eos = eos_bias != slot.sparams.logit_bias.end() && eos_bias->second < 0.0f && std::isinf(eos_bias->second);
 
         std::vector<std::string> samplers_sequence;
@@ -1527,13 +1588,33 @@ struct server_context {
         switch (task.type) {
             case SERVER_TASK_TYPE_COMPLETION:
                 {
-                    server_slot * slot = get_slot(json_value(task.data, "id_slot", -1));
+                    const int id_slot = json_value(task.data, "id_slot", -1);
+
+                    server_slot * slot;
+
+                    if (id_slot != -1) {
+                        slot = get_slot_by_id(id_slot);
+                    } else {
+                        std::string prompt;
+                        if (task.data.contains("prompt") && task.data.at("prompt").is_string()) {
+                            prompt = json_value(task.data, "prompt", std::string());
+                        }
+
+                        slot = get_available_slot(prompt);
+                    }
+
                     if (slot == nullptr) {
                         // if no slot is available, we defer this task for processing later
                         LOG_VERBOSE("no slot is available", {{"id_task", task.id}});
                         queue_tasks.defer(task);
                         break;
                     }
+                    if (!slot->available()) {
+                        // if requested slot is unavailable, we defer this task for processing later
+                        LOG_VERBOSE("requested slot is unavailable", {{"id_task", task.id}});
+                        queue_tasks.defer(task);
+                        break;
+                    }
 
                     if (task.data.contains("system_prompt")) {
                         std::string sys_prompt = json_value(task.data, "system_prompt", std::string());
@@ -1650,11 +1731,17 @@ struct server_context {
             case SERVER_TASK_TYPE_SLOT_SAVE:
                 {
                     int id_slot = task.data.at("id_slot");
-                    server_slot * slot = get_slot(id_slot);
+                    server_slot * slot = get_slot_by_id(id_slot);
                     if (slot == nullptr) {
                         send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST);
                         break;
                     }
+                    if (!slot->available()) {
+                        // if requested slot is unavailable, we defer this task for processing later
+                        LOG_VERBOSE("requested slot is unavailable", {{"id_task", task.id}});
+                        queue_tasks.defer(task);
+                        break;
+                    }
 
                     const size_t token_count = slot->cache_tokens.size();
                     const int64_t t_start = ggml_time_us();
@@ -1685,11 +1772,17 @@ struct server_context {
             case SERVER_TASK_TYPE_SLOT_RESTORE:
                 {
                     int id_slot = task.data.at("id_slot");
-                    server_slot * slot = get_slot(id_slot);
+                    server_slot * slot = get_slot_by_id(id_slot);
                     if (slot == nullptr) {
                         send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST);
                         break;
                     }
+                    if (!slot->available()) {
+                        // if requested slot is unavailable, we defer this task for processing later
+                        LOG_VERBOSE("requested slot is unavailable", {{"id_task", task.id}});
+                        queue_tasks.defer(task);
+                        break;
+                    }
 
                     const int64_t t_start = ggml_time_us();
 
@@ -1727,11 +1820,17 @@ struct server_context {
             case SERVER_TASK_TYPE_SLOT_ERASE:
                 {
                     int id_slot = task.data.at("id_slot");
-                    server_slot * slot = get_slot(id_slot);
+                    server_slot * slot = get_slot_by_id(id_slot);
                     if (slot == nullptr) {
                         send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST);
                         break;
                     }
+                    if (!slot->available()) {
+                        // if requested slot is unavailable, we defer this task for processing later
+                        LOG_VERBOSE("requested slot is unavailable", {{"id_task", task.id}});
+                        queue_tasks.defer(task);
+                        break;
+                    }
 
                     // Erase token cache
                     const size_t n_erased = slot->cache_tokens.size();
@@ -1906,6 +2005,11 @@ struct server_context {
         int32_t n_batch  = llama_n_batch(ctx);
         int32_t n_ubatch = llama_n_ubatch(ctx);
 
+        // track if this is an embedding or non-embedding batch
+        // if we've added sampled tokens above, we are in non-embedding mode
+        // -1: none, 0: non-embedding, 1: embedding
+        int32_t batch_type = batch.n_tokens > 0 ? 0 : -1;
+
         // next, batch any pending prompts without exceeding n_batch
         if (params.cont_batching || batch.n_tokens == 0) {
             for (auto & slot : slots) {
@@ -1924,6 +2028,7 @@ struct server_context {
                 slot.t_start_generation = 0;
 
                 if (slot.infill) {
+                    const bool add_bos = llama_should_add_bos_token(model);
                     bool suff_rm_leading_spc = true;
                     if (params.input_suffix.find_first_of(' ') == 0 && params.input_suffix.size() > 1) {
                         params.input_suffix.erase(0, 1);
@@ -1939,11 +2044,21 @@ struct server_context {
                     }
 
                     prefix_tokens.insert(prefix_tokens.begin(), llama_token_prefix(model));
-                    prefix_tokens.insert(prefix_tokens.begin(), llama_token_bos(model)); // always add BOS
-                    prefix_tokens.insert(prefix_tokens.end(),   llama_token_suffix(model));
-                    prefix_tokens.insert(prefix_tokens.end(),   suffix_tokens.begin(), suffix_tokens.end());
-                    prefix_tokens.push_back(llama_token_middle(model));
-                    prompt_tokens = prefix_tokens;
+                    suffix_tokens.insert(suffix_tokens.begin(), llama_token_suffix(model));
+
+                    auto embd_inp = params.spm_infill ? suffix_tokens : prefix_tokens;
+                    auto embd_end = params.spm_infill ? prefix_tokens : suffix_tokens;
+                    if (add_bos) {
+                        embd_inp.insert(embd_inp.begin(), llama_token_bos(model));
+                    }
+                    embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end());
+
+                    const llama_token middle_token = llama_token_middle(model);
+                    if (middle_token >= 0) {
+                        embd_inp.push_back(middle_token);
+                    }
+
+                    prompt_tokens = embd_inp;
                 } else {
                     prompt_tokens = tokenize(slot.prompt, system_prompt.empty()); // add BOS if there isn't system prompt
                 }
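
This hunk generalizes the fill-in-the-middle prompt layout: instead of always emitting prefix-first (PSM) order, the server can now emit suffix-first (SPM) order when params.spm_infill is set, and it only appends the middle sentinel when the vocabulary actually defines one. A sketch of the resulting token layout, with placeholder sentinel ids since the real ones come from the model vocabulary via llama_token_bos/prefix/suffix/middle(model):

#include <cstdint>
#include <vector>

using token = std::int32_t;

// hypothetical sentinel ids for illustration only
enum : token { TOK_BOS = 1, TOK_PRE = 32007, TOK_SUF = 32008 };

// assemble a FIM prompt from already-tokenized prefix and suffix
std::vector<token> build_fim_prompt(std::vector<token> prefix,
                                    std::vector<token> suffix,
                                    bool  spm_order,    // true: <SUF>suffix<PRE>prefix<MID>
                                    bool  add_bos,
                                    token middle_token) // pass -1 if the vocab has no middle token
{
    prefix.insert(prefix.begin(), TOK_PRE);
    suffix.insert(suffix.begin(), TOK_SUF);

    std::vector<token> out  = spm_order ? suffix : prefix; // first section
    std::vector<token> tail = spm_order ? prefix : suffix; // second section

    if (add_bos) {
        out.insert(out.begin(), TOK_BOS);
    }
    out.insert(out.end(), tail.begin(), tail.end());

    if (middle_token >= 0) {
        out.push_back(middle_token); // generation continues from the "middle" position
    }
    return out;
}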
@@ -2065,6 +2180,14 @@ struct server_context {
                     }
                 }
 
+                // check that we are in the right batch_type, if not defer the slot
+                bool slot_type = slot.embedding ? 1 : 0;
+                if (batch_type == -1) {
+                    batch_type = slot_type;
+                } else if (batch_type != slot_type) {
+                    continue;
+                }
+
                 // keep only the common part
                 int p0 = (int) system_tokens.size() + slot.n_past;
                 if (!llama_kv_cache_seq_rm(ctx, slot.id + 1, p0, -1)) {
@@ -2166,6 +2289,9 @@ struct server_context {
             {"n_tokens", batch.n_tokens},
         });
 
+        // make sure we're in the right embedding mode
+        llama_set_embeddings(ctx, batch_type == 1);
+
         // process the created batch of tokens
         for (int32_t i = 0; i < batch.n_tokens; i += n_batch) {
            const int32_t n_tokens = std::min(n_batch, batch.n_tokens - i);
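
Taken together, the batch_type hunks keep embedding and completion work out of the same decode batch: the first slot scheduled into a batch fixes its type, slots of the other type are deferred to a later batch, and llama_set_embeddings toggles the context accordingly before decoding. A minimal sketch of that scheduling rule; the Req struct and queue are illustrative, not the server's actual types:

#include <deque>
#include <vector>

struct Req {
    bool is_embedding; // embedding request vs. text-completion request
};

// drain the queue into homogeneous batches: a batch never mixes
// embedding and non-embedding requests
std::vector<std::vector<Req>> schedule(std::deque<Req> pending) {
    std::vector<std::vector<Req>> batches;
    while (!pending.empty()) {
        std::vector<Req> batch;
        int batch_type = -1; // -1: none, 0: non-embedding, 1: embedding
        std::deque<Req> deferred;
        for (Req & r : pending) {
            const int t = r.is_embedding ? 1 : 0;
            if (batch_type == -1) {
                batch_type = t; // first request fixes the batch type
            }
            if (t == batch_type) {
                batch.push_back(r);
            } else {
                deferred.push_back(r); // wrong type: wait for a later batch
            }
        }
        // before decoding, the server flips the context into the right mode:
        // llama_set_embeddings(ctx, batch_type == 1);
        batches.push_back(std::move(batch));
        pending = std::move(deferred);
    }
    return batches;
}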
@@ -2323,561 +2449,6 @@ struct server_context {
     }
 };
 
-static void server_print_usage(const char * argv0, const gpt_params & params, const server_params & sparams) {
-    printf("usage: %s [options]\n", argv0);
-    printf("\n");
-    printf("options:\n");
-    printf("  -h, --help                show this help message and exit\n");
-    printf("  -v, --verbose             verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled");
-    printf("  -t N, --threads N         number of threads to use during computation (default: %d)\n", params.n_threads);
-    printf("  -tb N, --threads-batch N  number of threads to use during batch and prompt processing (default: same as --threads)\n");
-    printf("  --threads-http N          number of threads in the http server pool to process requests (default: max(hardware concurrency - 1, --parallel N + 2))\n");
-    printf("  -c N, --ctx-size N        size of the prompt context (default: %d)\n", params.n_ctx);
-    printf("  --rope-scaling {none,linear,yarn}\n");
-    printf("                            RoPE frequency scaling method, defaults to linear unless specified by the model\n");
-    printf("  --rope-freq-base N        RoPE base frequency (default: loaded from model)\n");
-    printf("  --rope-freq-scale N       RoPE frequency scaling factor, expands context by a factor of 1/N\n");
-    printf("  --yarn-ext-factor N       YaRN: extrapolation mix factor (default: 1.0, 0.0 = full interpolation)\n");
-    printf("  --yarn-attn-factor N      YaRN: scale sqrt(t) or attention magnitude (default: 1.0)\n");
-    printf("  --yarn-beta-slow N        YaRN: high correction dim or alpha (default: %.1f)\n", params.yarn_beta_slow);
-    printf("  --yarn-beta-fast N        YaRN: low correction dim or beta (default: %.1f)\n", params.yarn_beta_fast);
-    printf("  --pooling {none,mean,cls} pooling type for embeddings, use model default if unspecified\n");
-    printf("  -dt N, --defrag-thold N\n");
-    printf("                            KV cache defragmentation threshold (default: %.1f, < 0 - disabled)\n", params.defrag_thold);
-    printf("  -b N, --batch-size N      logical maximum batch size (default: %d)\n", params.n_batch);
-    printf("  -ub N, --ubatch-size N    physical maximum batch size (default: %d)\n", params.n_ubatch);
-    if (llama_supports_mlock()) {
-        printf("  --mlock                   force system to keep model in RAM rather than swapping or compressing\n");
-    }
-    if (llama_supports_mmap()) {
-        printf("  --no-mmap                 do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
-    }
-    printf("  --numa TYPE               attempt optimizations that help on some NUMA systems\n");
-    printf("                              - distribute: spread execution evenly over all nodes\n");
-    printf("                              - isolate: only spawn threads on CPUs on the node that execution started on\n");
-    printf("                              - numactl: use the CPU map provided my numactl\n");
-    if (llama_supports_gpu_offload()) {
-        printf("  -ngl N, --n-gpu-layers N\n");
-        printf("                            number of layers to store in VRAM\n");
-        printf("  -sm SPLIT_MODE, --split-mode SPLIT_MODE\n");
-        printf("                            how to split the model across multiple GPUs, one of:\n");
-        printf("                              - none: use one GPU only\n");
-        printf("                              - layer (default): split layers and KV across GPUs\n");
-        printf("                              - row: split rows across GPUs\n");
-        printf("  -ts SPLIT --tensor-split SPLIT\n");
-        printf("                            fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1\n");
-        printf("  -mg i, --main-gpu i       the GPU to use for the model (with split-mode = none),\n");
-        printf("                            or for intermediate results and KV (with split-mode = row)\n");
-        printf("  -nkvo, --no-kv-offload\n");
-        printf("                            disable KV offload\n");
-    }
-    printf("  -m FNAME, --model FNAME\n");
-    printf("                            model path (default: models/$filename with filename from --hf-file or --model-url if set, otherwise %s)\n", DEFAULT_MODEL_PATH);
-    printf("  -mu MODEL_URL, --model-url MODEL_URL\n");
-    printf("                            model download url (default: unused)\n");
-    printf("  -hfr REPO, --hf-repo REPO\n");
-    printf("                            Hugging Face model repository (default: unused)\n");
-    printf("  -hff FILE, --hf-file FILE\n");
-    printf("                            Hugging Face model file (default: unused)\n");
-    printf("  -a ALIAS, --alias ALIAS\n");
-    printf("                            set an alias for the model, will be added as `model` field in completion response\n");
-    printf("  --lora FNAME              apply LoRA adapter (implies --no-mmap)\n");
-    printf("  --lora-base FNAME         optional model to use as a base for the layers modified by the LoRA adapter\n");
-    printf("  --host                    ip address to listen (default  (default: %s)\n", sparams.hostname.c_str());
-    printf("  --port PORT               port to listen (default  (default: %d)\n", sparams.port);
-    printf("  --rpc SERVERS             comma separated list of RPC servers\n");
-    printf("  --path PUBLIC_PATH        path from which to serve static files (default: disabled)\n");
-    printf("  --api-key API_KEY         optional api key to enhance server security. If set, requests must include this key for access.\n");
-    printf("  --api-key-file FNAME      path to file containing api keys delimited by new lines. If set, requests must include one of the keys for access.\n");
-#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
-    printf("  --ssl-key-file FNAME      path to file a PEM-encoded SSL private key\n");
-    printf("  --ssl-cert-file FNAME     path to file a PEM-encoded SSL certificate\n");
-#endif
-    printf("  -to N, --timeout N        server read/write timeout in seconds (default: %d)\n", sparams.read_timeout);
-    printf("  --embeddings              enable embedding vector output (default: %s)\n", params.embedding ? "enabled" : "disabled");
-    printf("  -np N, --parallel N       number of slots for process requests (default: %d)\n", params.n_parallel);
-    printf("  -cb, --cont-batching      enable continuous batching (a.k.a dynamic batching) (default: enabled)\n");
-    printf("  -fa, --flash-attn         enable Flash Attention (default: %s)\n", params.flash_attn ? "enabled" : "disabled");
-    printf("  -spf FNAME, --system-prompt-file FNAME\n");
-    printf("                            set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications.\n");
-    printf("  -ctk TYPE, --cache-type-k TYPE\n");
-    printf("                            KV cache data type for K (default: f16)\n");
-    printf("  -ctv TYPE, --cache-type-v TYPE\n");
-    printf("                            KV cache data type for V (default: f16)\n");
-    printf("  --log-format              log output format: json or text (default: json)\n");
-    printf("  --log-disable             disables logging to a file.\n");
-    printf("  --slots-endpoint-disable  disables slots monitoring endpoint.\n");
-    printf("  --metrics                 enable prometheus compatible metrics endpoint (default: %s).\n", sparams.metrics_endpoint ? "enabled" : "disabled");
-    printf("  --slot-save-path PATH     path to save slot kv cache (default: disabled)\n");
-    printf("\n");
-    printf("  -n, --n-predict           maximum tokens to predict (default: %d)\n", params.n_predict);
-    printf("  --override-kv KEY=TYPE:VALUE\n");
-    printf("                            advanced option to override model metadata by key. may be specified multiple times.\n");
-    printf("                            types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n");
-    printf("  -gan N, --grp-attn-n N    set the group attention factor to extend context size through self-extend(default: 1=disabled), used together with group attention width `--grp-attn-w`\n");
-    printf("  -gaw N, --grp-attn-w N    set the group attention width to extend context size through self-extend(default: 512), used together with group attention factor `--grp-attn-n`\n");
-    printf("  --chat-template JINJA_TEMPLATE\n");
-    printf("                            set custom jinja chat template (default: template taken from model's metadata)\n");
-    printf("                            only commonly used templates are accepted:\n");
-    printf("                            https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template\n");
-    printf("\n");
-}
-
-static void server_params_parse(int argc, char ** argv, server_params & sparams, gpt_params & params) {
-    gpt_params    default_params;
-    server_params default_sparams;
-
-    std::string arg;
-    bool invalid_param = false;
-
-    for (int i = 1; i < argc; i++) {
-        arg = argv[i];
-        if (arg == "--port") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            sparams.port = std::stoi(argv[i]);
-        } else if (arg == "--rpc") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.rpc_servers = argv[i];
-        } else if (arg == "--host") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            sparams.hostname = argv[i];
-        } else if (arg == "--path") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            sparams.public_path = argv[i];
-        } else if (arg == "--api-key") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            sparams.api_keys.push_back(argv[i]);
-        } else if (arg == "--api-key-file") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            std::ifstream key_file(argv[i]);
-            if (!key_file) {
-                fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
-                invalid_param = true;
-                break;
-            }
-            std::string key;
-            while (std::getline(key_file, key)) {
-                if (key.size() > 0) {
-                    sparams.api_keys.push_back(key);
-                }
-            }
-            key_file.close();
-
-        }
-#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
-        else if (arg == "--ssl-key-file") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            sparams.ssl_key_file = argv[i];
-        } else if (arg == "--ssl-cert-file") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            sparams.ssl_cert_file = argv[i];
-        }
-#endif
-        else if (arg == "--timeout" || arg == "-to") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            sparams.read_timeout  = std::stoi(argv[i]);
-            sparams.write_timeout = std::stoi(argv[i]);
-        } else if (arg == "-m" || arg == "--model") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.model = argv[i];
-        } else if (arg == "-mu" || arg == "--model-url") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.model_url = argv[i];
-        } else if (arg == "-hfr" || arg == "--hf-repo") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.hf_repo = argv[i];
-        } else if (arg == "-hff" || arg == "--hf-file") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.hf_file = argv[i];
-        } else if (arg == "-a" || arg == "--alias") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.model_alias = argv[i];
-        } else if (arg == "-h" || arg == "--help") {
-            server_print_usage(argv[0], default_params, default_sparams);
-            exit(0);
-        } else if (arg == "-c" || arg == "--ctx-size" || arg == "--ctx_size") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.n_ctx = std::stoi(argv[i]);
-        } else if (arg == "--rope-scaling") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            std::string value(argv[i]);
-            /**/ if (value == "none")   { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_NONE; }
-            else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_LINEAR; }
-            else if (value == "yarn")   { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_YARN; }
-            else { invalid_param = true; break; }
-        } else if (arg == "--rope-freq-base") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.rope_freq_base = std::stof(argv[i]);
-        } else if (arg == "--rope-freq-scale") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.rope_freq_scale = std::stof(argv[i]);
-        } else if (arg == "--yarn-ext-factor") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.yarn_ext_factor = std::stof(argv[i]);
-        }
-        else if (arg == "--yarn-attn-factor") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.yarn_attn_factor = std::stof(argv[i]);
-        } else if (arg == "--yarn-beta-fast") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.yarn_beta_fast = std::stof(argv[i]);
-        } else if (arg == "--yarn-beta-slow") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.yarn_beta_slow = std::stof(argv[i]);
-        } else if (arg == "--pooling") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            std::string value(argv[i]);
-            /**/ if (value == "none") { params.pooling_type = LLAMA_POOLING_TYPE_NONE; }
-            else if (value == "mean") { params.pooling_type = LLAMA_POOLING_TYPE_MEAN; }
-            else if (value == "cls")  { params.pooling_type = LLAMA_POOLING_TYPE_CLS; }
-            else { invalid_param = true; break; }
-        } else if (arg == "--defrag-thold" || arg == "-dt") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.defrag_thold = std::stof(argv[i]);
-        } else if (arg == "--threads" || arg == "-t") {
-            if (++i >= argc)
-            {
-                invalid_param = true;
-                break;
-            }
-            params.n_threads = std::stoi(argv[i]);
-        } else if (arg == "--grp-attn-n" || arg == "-gan") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-
-            params.grp_attn_n = std::stoi(argv[i]);
-        } else if (arg == "--grp-attn-w" || arg == "-gaw") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-
-            params.grp_attn_w = std::stoi(argv[i]);
-        } else if (arg == "--threads-batch" || arg == "-tb") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.n_threads_batch = std::stoi(argv[i]);
-        } else if (arg == "--threads-http") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            sparams.n_threads_http = std::stoi(argv[i]);
-        } else if (arg == "-b" || arg == "--batch-size") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.n_batch = std::stoi(argv[i]);
-        } else if (arg == "-ub" || arg == "--ubatch-size") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.n_ubatch = std::stoi(argv[i]);
-        } else if (arg == "--gpu-layers" || arg == "-ngl" || arg == "--n-gpu-layers") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            if (llama_supports_gpu_offload()) {
-                params.n_gpu_layers = std::stoi(argv[i]);
-            } else {
-                LOG_WARNING(
-                    "Not compiled with GPU offload support, --n-gpu-layers option will be ignored. "
-                    "See main README.md for information on enabling GPU BLAS support",
-                    {{"n_gpu_layers", params.n_gpu_layers}});
-            }
-        } else if (arg == "-nkvo" || arg == "--no-kv-offload") {
-            params.no_kv_offload = true;
-        } else if (arg == "--split-mode" || arg == "-sm") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            std::string arg_next = argv[i];
-            if (arg_next == "none") {
-                params.split_mode = LLAMA_SPLIT_MODE_NONE;
-            } else if (arg_next == "layer") {
-                params.split_mode = LLAMA_SPLIT_MODE_LAYER;
-            } else if (arg_next == "row") {
-                params.split_mode = LLAMA_SPLIT_MODE_ROW;
-            } else {
-                invalid_param = true;
-                break;
-            }
-#ifndef GGML_USE_CUDA
-            fprintf(stderr, "warning: llama.cpp was compiled without CUDA. Setting the split mode has no effect.\n");
-#endif // GGML_USE_CUDA
-        } else if (arg == "--tensor-split" || arg == "-ts") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-#if defined(GGML_USE_CUDA) || defined(GGML_USE_SYCL)
-            std::string arg_next = argv[i];
-
-            // split string by , and /
-            const std::regex regex{R"([,/]+)"};
-            std::sregex_token_iterator it{arg_next.begin(), arg_next.end(), regex, -1};
-            std::vector<std::string> split_arg{it, {}};
-            GGML_ASSERT(split_arg.size() <= llama_max_devices());
-
-            for (size_t i_device = 0; i_device < llama_max_devices(); ++i_device) {
-                if (i_device < split_arg.size()) {
-                    params.tensor_split[i_device] = std::stof(split_arg[i_device]);
-                } else {
-                    params.tensor_split[i_device] = 0.0f;
-                }
-            }
-#else
-            LOG_WARNING("llama.cpp was compiled without CUDA. It is not possible to set a tensor split.\n", {});
-#endif // GGML_USE_CUDA
-        } else if (arg == "--main-gpu" || arg == "-mg") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-#if defined(GGML_USE_CUDA) || defined(GGML_USE_SYCL)
-            params.main_gpu = std::stoi(argv[i]);
-#else
-            LOG_WARNING("llama.cpp was compiled without CUDA. It is not possible to set a main GPU.", {});
-#endif
-        } else if (arg == "--lora") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.lora_adapter.emplace_back(argv[i], 1.0f);
-            params.use_mmap = false;
-        } else if (arg == "--lora-scaled") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            const char * lora_adapter = argv[i];
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.lora_adapter.emplace_back(lora_adapter, std::stof(argv[i]));
-            params.use_mmap = false;
-        } else if (arg == "--lora-base") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.lora_base = argv[i];
-        } else if (arg == "-v" || arg == "--verbose") {
-#if SERVER_VERBOSE != 1
-            LOG_WARNING("server.cpp is not built with verbose logging.", {});
-#else
-            server_verbose = true;
-#endif
-        } else if (arg == "--mlock") {
-            params.use_mlock = true;
-        } else if (arg == "--no-mmap") {
-            params.use_mmap = false;
-        } else if (arg == "--numa") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            } else {
-                std::string value(argv[i]);
-                /**/ if (value == "distribute" || value == "" ) { params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; }
-                else if (value == "isolate") { params.numa = GGML_NUMA_STRATEGY_ISOLATE; }
-                else if (value == "numactl") { params.numa = GGML_NUMA_STRATEGY_NUMACTL; }
-                else { invalid_param = true; break; }
-            }
-        } else if (arg == "--embedding" || arg == "--embeddings") {
-            params.embedding = true;
-        } else if (arg == "-cb" || arg == "--cont-batching") {
-            params.cont_batching = true;
-        } else if (arg == "-fa" || arg == "--flash-attn") {
-            params.flash_attn = true;
-        } else if (arg == "-np" || arg == "--parallel") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.n_parallel = std::stoi(argv[i]);
-        } else if (arg == "-n" || arg == "--n-predict") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.n_predict = std::stoi(argv[i]);
-        } else if (arg == "-spf" || arg == "--system-prompt-file") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            std::ifstream file(argv[i]);
-            if (!file) {
-                fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
-                invalid_param = true;
-                break;
-            }
-            std::string system_prompt;
-            std::copy(
-                std::istreambuf_iterator<char>(file),
-                std::istreambuf_iterator<char>(),
-                std::back_inserter(system_prompt)
-            );
-            sparams.system_prompt = system_prompt;
-        } else if (arg == "-ctk" || arg == "--cache-type-k") {
-            params.cache_type_k = argv[++i];
-        } else if (arg == "-ctv" || arg == "--cache-type-v") {
-            params.cache_type_v = argv[++i];
-        } else if (arg == "--log-format") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            if (std::strcmp(argv[i], "json") == 0) {
-                server_log_json = true;
-            } else if (std::strcmp(argv[i], "text") == 0) {
-                server_log_json = false;
-            } else {
-                invalid_param = true;
-                break;
-            }
-        } else if (arg == "--log-disable") {
-            log_set_target(stdout);
-            LOG_INFO("logging to file is disabled.", {});
-        } else if (arg == "--slots-endpoint-disable") {
|
|
2825
|
-
sparams.slots_endpoint = false;
|
|
2826
|
-
} else if (arg == "--metrics") {
|
|
2827
|
-
sparams.metrics_endpoint = true;
|
|
2828
|
-
} else if (arg == "--slot-save-path") {
|
|
2829
|
-
if (++i >= argc) {
|
|
2830
|
-
invalid_param = true;
|
|
2831
|
-
break;
|
|
2832
|
-
}
|
|
2833
|
-
sparams.slot_save_path = argv[i];
|
|
2834
|
-
// if doesn't end with DIRECTORY_SEPARATOR, add it
|
|
2835
|
-
if (!sparams.slot_save_path.empty() && sparams.slot_save_path[sparams.slot_save_path.size() - 1] != DIRECTORY_SEPARATOR) {
|
|
2836
|
-
sparams.slot_save_path += DIRECTORY_SEPARATOR;
|
|
2837
|
-
}
|
|
2838
|
-
} else if (arg == "--chat-template") {
|
|
2839
|
-
if (++i >= argc) {
|
|
2840
|
-
invalid_param = true;
|
|
2841
|
-
break;
|
|
2842
|
-
}
|
|
2843
|
-
if (!verify_custom_template(argv[i])) {
|
|
2844
|
-
fprintf(stderr, "error: the supplied chat template is not supported: %s\n", argv[i]);
|
|
2845
|
-
fprintf(stderr, "note: llama.cpp does not use jinja parser, we only support commonly used templates\n");
|
|
2846
|
-
invalid_param = true;
|
|
2847
|
-
break;
|
|
2848
|
-
}
|
|
2849
|
-
sparams.chat_template = argv[i];
|
|
2850
|
-
} else if (arg == "--override-kv") {
|
|
2851
|
-
if (++i >= argc) {
|
|
2852
|
-
invalid_param = true;
|
|
2853
|
-
break;
|
|
2854
|
-
}
|
|
2855
|
-
if (!string_parse_kv_override(argv[i], params.kv_overrides)) {
|
|
2856
|
-
fprintf(stderr, "error: Invalid type for KV override: %s\n", argv[i]);
|
|
2857
|
-
invalid_param = true;
|
|
2858
|
-
break;
|
|
2859
|
-
}
|
|
2860
|
-
} else {
|
|
2861
|
-
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
|
|
2862
|
-
server_print_usage(argv[0], default_params, default_sparams);
|
|
2863
|
-
exit(1);
|
|
2864
|
-
}
|
|
2865
|
-
}
|
|
2866
|
-
|
|
2867
|
-
gpt_params_handle_model_default(params);
|
|
2868
|
-
|
|
2869
|
-
if (!params.kv_overrides.empty()) {
|
|
2870
|
-
params.kv_overrides.emplace_back();
|
|
2871
|
-
params.kv_overrides.back().key[0] = 0;
|
|
2872
|
-
}
|
|
2873
|
-
|
|
2874
|
-
if (invalid_param) {
|
|
2875
|
-
fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
|
|
2876
|
-
server_print_usage(argv[0], default_params, default_sparams);
|
|
2877
|
-
exit(1);
|
|
2878
|
-
}
|
|
2879
|
-
}
|
|
2880
|
-
|
|
 static void log_server_request(const httplib::Request & req, const httplib::Response & res) {
     // skip GH copilot requests when using default port
     if (req.path == "/v1/health" || req.path == "/v1/completions") {
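
The removed `--tensor-split` branch above is the usual regex-token split: ratios separated by `,` or `/`, parsed to floats, remaining devices zeroed. A self-contained sketch of that pattern (the `max_devices` parameter stands in for `llama_max_devices()`):

```cpp
#include <cstdio>
#include <regex>
#include <string>
#include <vector>

// Split "3,1" or "3/1" into per-device ratios, zero-filling the rest --
// the same std::sregex_token_iterator pattern the deleted branch used.
static std::vector<float> parse_tensor_split(const std::string & arg, size_t max_devices) {
    const std::regex sep{R"([,/]+)"};
    std::sregex_token_iterator it{arg.begin(), arg.end(), sep, -1};
    std::vector<std::string> parts{it, {}};

    std::vector<float> split(max_devices, 0.0f);
    for (size_t i = 0; i < max_devices && i < parts.size(); ++i) {
        split[i] = std::stof(parts[i]);
    }
    return split;
}

int main() {
    for (float f : parse_tensor_split("3,1", 4)) {
        printf("%.1f ", f); // prints: 3.0 1.0 0.0 0.0
    }
    printf("\n");
}
```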
@@ -2918,16 +2489,22 @@ int main(int argc, char ** argv) {
     log_disable();
 #endif
     // own arguments required by this example
-    gpt_params
-
+    gpt_params params;
+
+    if (!gpt_params_parse(argc, argv, params)) {
+        gpt_params_print_usage(argc, argv, params);
+        return 1;
+    }
+
+    // TODO: not great to use extern vars
+    server_log_json = params.log_json;
+    server_verbose = params.verbosity > 0;
 
     // struct that contains llama context and inference
     server_context ctx_server;
 
-
-
-    if (!sparams.system_prompt.empty()) {
-        ctx_server.system_prompt_set(sparams.system_prompt);
+    if (!params.system_prompt.empty()) {
+        ctx_server.system_prompt_set(params.system_prompt);
     }
 
     if (params.model_alias == "unknown") {
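
The replacement hunk collapses the whole hand-rolled parser into `gpt_params_parse` / `gpt_params_print_usage` from common. A minimal sketch of the same parse-or-print-usage shape, with hypothetical `app_params`/`parse_args` stand-ins rather than the real common API:

```cpp
#include <cstdio>
#include <string>

// Hypothetical stand-ins for gpt_params / gpt_params_parse.
struct app_params {
    std::string model;
    int n_parallel = 1;
};

static bool parse_args(int argc, char ** argv, app_params & params) {
    for (int i = 1; i < argc; i++) {
        std::string arg = argv[i];
        if (arg == "-m" && i + 1 < argc) {
            params.model = argv[++i];
        } else if (arg == "-np" && i + 1 < argc) {
            params.n_parallel = std::stoi(argv[++i]);
        } else {
            return false; // unknown or incomplete argument
        }
    }
    return !params.model.empty();
}

int main(int argc, char ** argv) {
    app_params params;
    // Same shape as the new hunk: parse, and on failure print usage and exit non-zero.
    if (!parse_args(argc, argv, params)) {
        fprintf(stderr, "usage: %s -m MODEL [-np N]\n", argv[0]);
        return 1;
    }
    printf("model=%s n_parallel=%d\n", params.model.c_str(), params.n_parallel);
    return 0;
}
```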
@@ -2951,10 +2528,10 @@ int main(int argc, char ** argv) {
 
     std::unique_ptr<httplib::Server> svr;
 #ifdef CPPHTTPLIB_OPENSSL_SUPPORT
-    if (
-        LOG_INFO("Running with SSL", {{"key",
+    if (params.ssl_file_key != "" && params.ssl_file_cert != "") {
+        LOG_INFO("Running with SSL", {{"key", params.ssl_file_key}, {"cert", params.ssl_file_cert}});
         svr.reset(
-            new httplib::SSLServer(
+            new httplib::SSLServer(params.ssl_file_cert.c_str(), params.ssl_file_key.c_str())
         );
     } else {
         LOG_INFO("Running without SSL", {});
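
For context, the SSL branch above picks `httplib::SSLServer` only when both PEM paths are configured. A sketch of that selection, assuming cpp-httplib is built with `CPPHTTPLIB_OPENSSL_SUPPORT`:

```cpp
// Pick httplib::SSLServer when both PEM files are configured, else a plain
// httplib::Server -- the same selection as the hunk above. Requires OpenSSL
// to be available at build time.
#define CPPHTTPLIB_OPENSSL_SUPPORT
#include "httplib.h"

#include <memory>
#include <string>

std::unique_ptr<httplib::Server> make_server(const std::string & cert, const std::string & key) {
    if (!cert.empty() && !key.empty()) {
        // SSLServer takes (cert_path, private_key_path), as in the hunk above.
        return std::unique_ptr<httplib::Server>(new httplib::SSLServer(cert.c_str(), key.c_str()));
    }
    return std::unique_ptr<httplib::Server>(new httplib::Server());
}
```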
@@ -3008,26 +2585,29 @@ int main(int argc, char ** argv) {
     });
 
     // set timeouts and change hostname and port
-    svr->set_read_timeout (
-    svr->set_write_timeout(
+    svr->set_read_timeout (params.timeout_read);
+    svr->set_write_timeout(params.timeout_write);
 
-    if (!svr->bind_to_port(
-        fprintf(stderr, "\ncouldn't bind to server socket: hostname=%s port=%d\n\n",
+    if (!svr->bind_to_port(params.hostname, params.port)) {
+        fprintf(stderr, "\ncouldn't bind to server socket: hostname=%s port=%d\n\n", params.hostname.c_str(), params.port);
         return 1;
     }
 
     std::unordered_map<std::string, std::string> log_data;
 
-    log_data["hostname"] =
-    log_data["port"] = std::to_string(
+    log_data["hostname"] = params.hostname;
+    log_data["port"] = std::to_string(params.port);
 
-    if (
-        auto key =
+    if (params.api_keys.size() == 1) {
+        auto key = params.api_keys[0];
         log_data["api_key"] = "api_key: ****" + key.substr(std::max((int)(key.length() - 4), 0));
-    } else if (
-        log_data["api_key"] = "api_key: " + std::to_string(
+    } else if (params.api_keys.size() > 1) {
+        log_data["api_key"] = "api_key: " + std::to_string(params.api_keys.size()) + " keys loaded";
     }
 
+    // Necessary similarity of prompt for slot selection
+    ctx_server.slot_prompt_similarity = params.slot_prompt_similarity;
+
     // load the model
     if (!ctx_server.load_model(params)) {
         state.store(SERVER_STATE_ERROR);
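
The key-masking line kept as context above is worth pulling out: only the last four characters of an API key ever reach the log. A standalone sketch of the same trick:

```cpp
#include <algorithm>
#include <cstdio>
#include <string>

// Mask an API key for logging, keeping only the last four characters --
// the same substr/max pattern used in the hunk above. std::max guards
// against keys shorter than four characters.
static std::string mask_api_key(const std::string & key) {
    return "****" + key.substr(std::max((int) key.length() - 4, 0));
}

int main() {
    printf("%s\n", mask_api_key("sk-abcdef123456").c_str()); // ****3456
    printf("%s\n", mask_api_key("ab").c_str());              // ****ab
}
```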
@@ -3042,26 +2622,18 @@ int main(int argc, char ** argv) {
     const auto model_meta = ctx_server.model_meta();
 
     // if a custom chat template is not supplied, we will use the one that comes with the model (if any)
-    if (
+    if (params.chat_template.empty()) {
         if (!ctx_server.validate_model_chat_template()) {
-
-
+            LOG_WARNING("The chat template that comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses", {});
+            params.chat_template = "chatml";
         }
     }
 
     // print sample chat example to make it clear which template is used
     {
-        json chat;
-        chat.push_back({{"role", "system"}, {"content", "You are a helpful assistant"}});
-        chat.push_back({{"role", "user"}, {"content", "Hello"}});
-        chat.push_back({{"role", "assistant"}, {"content", "Hi there"}});
-        chat.push_back({{"role", "user"}, {"content", "How are you?"}});
-
-        const std::string chat_example = format_chat(ctx_server.model, sparams.chat_template, chat);
-
         LOG_INFO("chat template", {
-            {"chat_example",
-            {"built_in",
+            {"chat_example", llama_chat_format_example(ctx_server.model, params.chat_template)},
+            {"built_in", params.chat_template.empty()},
         });
     }
 
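
The new code delegates the sample transcript to `llama_chat_format_example` instead of building a `json chat` by hand. For reference, the "chatml" fallback named above wraps each turn in `<|im_start|>`/`<|im_end|>` markers; a minimal formatter sketch (not the llama.cpp implementation):

```cpp
#include <cstdio>
#include <string>
#include <vector>

struct chat_msg { std::string role, content; };

// Minimal illustration of chatml-style formatting: each turn is wrapped in
// <|im_start|>role ... <|im_end|>, then the next assistant turn is opened.
static std::string format_chatml(const std::vector<chat_msg> & chat) {
    std::string out;
    for (const auto & m : chat) {
        out += "<|im_start|>" + m.role + "\n" + m.content + "<|im_end|>\n";
    }
    out += "<|im_start|>assistant\n"; // prompt the next assistant turn
    return out;
}

int main() {
    std::string s = format_chatml({{"system", "You are a helpful assistant"}, {"user", "Hello"}});
    printf("%s", s.c_str());
}
```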
@@ -3069,7 +2641,7 @@ int main(int argc, char ** argv) {
     // Middlewares
     //
 
-    auto middleware_validate_api_key = [&
+    auto middleware_validate_api_key = [&params, &res_error](const httplib::Request & req, httplib::Response & res) {
         // TODO: should we apply API key to all endpoints, including "/health" and "/models"?
         static const std::set<std::string> protected_endpoints = {
             "/props",
@@ -3087,7 +2659,7 @@ int main(int argc, char ** argv) {
         };
 
         // If API key is not set, skip validation
-        if (
+        if (params.api_keys.empty()) {
             return true;
         }
 
@@ -3102,7 +2674,7 @@ int main(int argc, char ** argv) {
         std::string prefix = "Bearer ";
         if (auth_header.substr(0, prefix.size()) == prefix) {
             std::string received_api_key = auth_header.substr(prefix.size());
-            if (std::find(
+            if (std::find(params.api_keys.begin(), params.api_keys.end(), received_api_key) != params.api_keys.end()) {
                 return true; // API key is valid
             }
         }
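
The middleware now reads its key list straight from `params.api_keys`. A condensed sketch of the Bearer-token check it performs (`auth_header` would come from `req.get_header_value("Authorization")` in the real handler):

```cpp
#include <algorithm>
#include <string>
#include <vector>

// Strip the "Bearer " prefix, then look the remainder up in the configured
// key list -- the same logic as middleware_validate_api_key above.
static bool api_key_ok(const std::string & auth_header, const std::vector<std::string> & api_keys) {
    if (api_keys.empty()) {
        return true; // no keys configured -> validation disabled
    }
    const std::string prefix = "Bearer ";
    if (auth_header.compare(0, prefix.size(), prefix) != 0) {
        return false; // missing or malformed Authorization header
    }
    const std::string received = auth_header.substr(prefix.size());
    return std::find(api_keys.begin(), api_keys.end(), received) != api_keys.end();
}
```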
@@ -3157,7 +2729,7 @@ int main(int argc, char ** argv) {
                    };
 
                    res.status = 200; // HTTP OK
-                    if (
+                    if (params.endpoint_slots && req.has_param("include_slots")) {
                        health["slots"] = result.data.at("slots");
                    }
 
@@ -3183,7 +2755,7 @@ int main(int argc, char ** argv) {
    };
 
    const auto handle_slots = [&](const httplib::Request &, httplib::Response & res) {
-        if (!
+        if (!params.endpoint_slots) {
            res_error(res, format_error_response("This server does not support slots endpoint.", ERROR_TYPE_NOT_SUPPORTED));
            return;
        }
@@ -3207,7 +2779,7 @@ int main(int argc, char ** argv) {
    };
 
    const auto handle_metrics = [&](const httplib::Request &, httplib::Response & res) {
-        if (!
+        if (!params.endpoint_metrics) {
            res_error(res, format_error_response("This server does not support metrics endpoint.", ERROR_TYPE_NOT_SUPPORTED));
            return;
        }
@@ -3307,14 +2879,14 @@ int main(int argc, char ** argv) {
        res.status = 200; // HTTP OK
    };
 
-    const auto handle_slots_save = [&ctx_server, &res_error, &
+    const auto handle_slots_save = [&ctx_server, &res_error, &params](const httplib::Request & req, httplib::Response & res, int id_slot) {
        json request_data = json::parse(req.body);
        std::string filename = request_data.at("filename");
        if (!fs_validate_filename(filename)) {
            res_error(res, format_error_response("Invalid filename", ERROR_TYPE_INVALID_REQUEST));
            return;
        }
-        std::string filepath =
+        std::string filepath = params.slot_save_path + filename;
 
        server_task task;
        task.type = SERVER_TASK_TYPE_SLOT_SAVE;
@@ -3337,14 +2909,14 @@ int main(int argc, char ** argv) {
        }
    };
 
-    const auto handle_slots_restore = [&ctx_server, &res_error, &
+    const auto handle_slots_restore = [&ctx_server, &res_error, &params](const httplib::Request & req, httplib::Response & res, int id_slot) {
        json request_data = json::parse(req.body);
        std::string filename = request_data.at("filename");
        if (!fs_validate_filename(filename)) {
            res_error(res, format_error_response("Invalid filename", ERROR_TYPE_INVALID_REQUEST));
            return;
        }
-        std::string filepath =
+        std::string filepath = params.slot_save_path + filename;
 
        server_task task;
        task.type = SERVER_TASK_TYPE_SLOT_RESTORE;
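
Both slot handlers still gate the request on `fs_validate_filename` before concatenating onto `params.slot_save_path`. A simplified sketch of that kind of traversal guard (not the actual llama.cpp implementation):

```cpp
#include <cstdio>
#include <string>

// Simplified sketch of a filename guard: reject empty names, path
// separators, and ".." so "filepath = slot_save_path + filename" cannot
// escape the configured directory.
static bool filename_ok(const std::string & name) {
    if (name.empty() || name == "." || name == "..") {
        return false;
    }
    if (name.find('/') != std::string::npos || name.find('\\') != std::string::npos) {
        return false;
    }
    return name.find("..") == std::string::npos;
}

int main() {
    printf("%d %d %d\n", filename_ok("slot1.bin"), filename_ok("../etc/passwd"), filename_ok("a/b"));
    // prints: 1 0 0
}
```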
@@ -3414,17 +2986,31 @@ int main(int argc, char ** argv) {
     };
 
     const auto handle_props = [&ctx_server](const httplib::Request & req, httplib::Response & res) {
+        std::string template_key = "tokenizer.chat_template", curr_tmpl;
+        int32_t tlen = llama_model_meta_val_str(ctx_server.model, template_key.c_str(), nullptr, 0);
+        if (tlen > 0) {
+            std::vector<char> curr_tmpl_buf(tlen + 1, 0);
+            if (llama_model_meta_val_str(ctx_server.model, template_key.c_str(), curr_tmpl_buf.data(), curr_tmpl_buf.size()) == tlen) {
+                curr_tmpl = std::string(curr_tmpl_buf.data(), tlen);
+            }
+        }
         res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
         json data = {
             { "system_prompt", ctx_server.system_prompt.c_str() },
             { "default_generation_settings", ctx_server.default_generation_settings_for_props },
-            { "total_slots", ctx_server.params.n_parallel }
+            { "total_slots", ctx_server.params.n_parallel },
+            { "chat_template", curr_tmpl.c_str() }
         };
 
         res.set_content(data.dump(), "application/json; charset=utf-8");
     };
 
     const auto handle_completions = [&ctx_server, &res_error](const httplib::Request & req, httplib::Response & res) {
+        if (ctx_server.params.embedding) {
+            res_error(res, format_error_response("This server does not support completions. Start it without `--embeddings`", ERROR_TYPE_NOT_SUPPORTED));
+            return;
+        }
+
         res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
 
         json data = json::parse(req.body);
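
The `handle_props` addition uses the common two-call pattern for `llama_model_meta_val_str`: query the value length with a null buffer, then allocate and fetch. Extracted as a helper sketch against the real llama.h signature:

```cpp
#include "llama.h"

#include <string>
#include <vector>

// Two-call metadata read: the first llama_model_meta_val_str call with a
// null buffer returns the value length, the second fills the buffer.
static std::string model_meta_str(const llama_model * model, const char * key) {
    const int32_t len = llama_model_meta_val_str(model, key, nullptr, 0);
    if (len <= 0) {
        return ""; // key missing (or empty value)
    }
    std::vector<char> buf(len + 1, 0);
    if (llama_model_meta_val_str(model, key, buf.data(), buf.size()) != len) {
        return "";
    }
    return std::string(buf.data(), len);
}

// usage: model_meta_str(model, "tokenizer.chat_template")
```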
@@ -3519,9 +3105,14 @@ int main(int argc, char ** argv) {
         res.set_content(models.dump(), "application/json; charset=utf-8");
     };
 
-    const auto handle_chat_completions = [&ctx_server, &
+    const auto handle_chat_completions = [&ctx_server, &params, &res_error](const httplib::Request & req, httplib::Response & res) {
+        if (ctx_server.params.embedding) {
+            res_error(res, format_error_response("This server does not support chat completions. Start it without `--embeddings`", ERROR_TYPE_NOT_SUPPORTED));
+            return;
+        }
+
         res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
-        json data = oaicompat_completion_params_parse(ctx_server.model, json::parse(req.body),
+        json data = oaicompat_completion_params_parse(ctx_server.model, json::parse(req.body), params.chat_template);
 
         const int id_task = ctx_server.queue_tasks.get_new_id();
 
@@ -3592,6 +3183,11 @@ int main(int argc, char ** argv) {
     };
 
     const auto handle_infill = [&ctx_server, &res_error](const httplib::Request & req, httplib::Response & res) {
+        if (ctx_server.params.embedding) {
+            res_error(res, format_error_response("This server does not support infill. Start it without `--embeddings`", ERROR_TYPE_NOT_SUPPORTED));
+            return;
+        }
+
         res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
 
         json data = json::parse(req.body);
@@ -3678,13 +3274,8 @@ int main(int argc, char ** argv) {
         return res.set_content(data.dump(), "application/json; charset=utf-8");
     };
 
-    const auto handle_embeddings = [&
+    const auto handle_embeddings = [&ctx_server, &res_error](const httplib::Request & req, httplib::Response & res) {
         res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
-        if (!params.embedding) {
-            res.status = 501;
-            res.set_content("This server does not support embeddings. Start it with `--embeddings`", "text/plain; charset=utf-8");
-            return;
-        }
 
         const json body = json::parse(req.body);
         bool is_openai = false;
@@ -3746,17 +3337,29 @@ int main(int argc, char ** argv) {
     //
 
     // register static assets routes
-    if (!
+    if (!params.public_path.empty()) {
         // Set the base directory for serving static files
-        svr->set_base_dir(
+        svr->set_base_dir(params.public_path);
     }
 
     // using embedded static files
-    svr->Get("/",
-    svr->Get("/index.js",
-    svr->Get("/completion.js",
-    svr->Get("/json-schema-to-grammar.mjs", handle_static_file(
-
+    svr->Get("/", handle_static_file(index_html, index_html_len, "text/html; charset=utf-8"));
+    svr->Get("/index.js", handle_static_file(index_js, index_js_len, "text/javascript; charset=utf-8"));
+    svr->Get("/completion.js", handle_static_file(completion_js, completion_js_len, "text/javascript; charset=utf-8"));
+    svr->Get("/json-schema-to-grammar.mjs", handle_static_file(json_schema_to_grammar_mjs, json_schema_to_grammar_mjs_len, "text/javascript; charset=utf-8"));
+
+    // add new-ui files
+    svr->Get("/colorthemes.css", handle_static_file(colorthemes_css, colorthemes_css_len, "text/css; charset=utf-8"));
+    svr->Get("/style.css", handle_static_file(style_css, style_css_len, "text/css; charset=utf-8"));
+    svr->Get("/theme-beeninorder.css", handle_static_file(theme_beeninorder_css, theme_beeninorder_css_len, "text/css; charset=utf-8"));
+    svr->Get("/theme-ketivah.css", handle_static_file(theme_ketivah_css, theme_ketivah_css_len, "text/css; charset=utf-8"));
+    svr->Get("/theme-mangotango.css", handle_static_file(theme_mangotango_css, theme_mangotango_css_len, "text/css; charset=utf-8"));
+    svr->Get("/theme-playground.css", handle_static_file(theme_playground_css, theme_playground_css_len, "text/css; charset=utf-8"));
+    svr->Get("/theme-polarnight.css", handle_static_file(theme_polarnight_css, theme_polarnight_css_len, "text/css; charset=utf-8"));
+    svr->Get("/theme-snowstorm.css", handle_static_file(theme_snowstorm_css, theme_snowstorm_css_len, "text/css; charset=utf-8"));
+    svr->Get("/index-new.html", handle_static_file(index_new_html, index_new_html_len, "text/html; charset=utf-8"));
+    svr->Get("/system-prompts.js", handle_static_file(system_prompts_js, system_prompts_js_len, "text/javascript; charset=utf-8"));
+    svr->Get("/prompt-formats.js", handle_static_file(prompt_formats_js, prompt_formats_js_len, "text/javascript; charset=utf-8"));
 
     // register API routes
     svr->Get ("/health", handle_health);
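
The embedded-asset routes all go through `handle_static_file`, which pairs a compiled-in buffer with a MIME type. A plausible sketch of such a helper (buffer names are placeholders for the generated `index_html`/`index_html_len` symbols, not a copy of the real one):

```cpp
#include "httplib.h"

// Returns an httplib handler that serves a compiled-in buffer with a fixed
// MIME type, mirroring the embedded-asset routes registered above.
static httplib::Server::Handler serve_embedded(const unsigned char * data, size_t len, const char * mime) {
    return [data, len, mime](const httplib::Request &, httplib::Response & res) {
        res.set_content(reinterpret_cast<const char *>(data), len, mime);
    };
}

// usage:
//   svr->Get("/", serve_embedded(index_html, index_html_len, "text/html; charset=utf-8"));
```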
@@ -3775,7 +3378,7 @@ int main(int argc, char ** argv) {
     svr->Post("/v1/embeddings", handle_embeddings);
     svr->Post("/tokenize", handle_tokenize);
     svr->Post("/detokenize", handle_detokenize);
-    if (!
+    if (!params.slot_save_path.empty()) {
         // only enable slot endpoints if slot_save_path is set
         svr->Post("/slots/:id_slot", handle_slots_action);
     }
@@ -3783,12 +3386,12 @@ int main(int argc, char ** argv) {
     //
     // Start the server
     //
-    if (
+    if (params.n_threads_http < 1) {
         // +2 threads for monitoring endpoints
-
+        params.n_threads_http = std::max(params.n_parallel + 2, (int32_t) std::thread::hardware_concurrency() - 1);
     }
-    log_data["n_threads_http"] = std::to_string(
-    svr->new_task_queue = [&
+    log_data["n_threads_http"] = std::to_string(params.n_threads_http);
+    svr->new_task_queue = [&params] { return new httplib::ThreadPool(params.n_threads_http); };
 
     LOG_INFO("HTTP server listening", log_data);
 
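
The final hunk moves the HTTP thread-pool sizing onto `params.n_threads_http`: when no explicit count is configured, take whichever is larger of `n_parallel + 2` (one worker per slot plus two monitoring requests) or `hardware_concurrency() - 1`. The rule in isolation:

```cpp
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <thread>

// Same sizing rule as the hunk above: keep an explicit setting if given,
// otherwise pick the larger of (slots + 2 monitoring workers) and
// (hardware threads - 1).
static int32_t pick_n_threads_http(int32_t n_threads_http, int32_t n_parallel) {
    if (n_threads_http < 1) {
        n_threads_http = std::max<int32_t>(n_parallel + 2, (int32_t) std::thread::hardware_concurrency() - 1);
    }
    return n_threads_http;
}

int main() {
    printf("%d\n", pick_n_threads_http(-1, 4)); // at least 6 on most machines
}
```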