@fugood/llama.node 0.3.1 → 0.3.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +1 -8
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/package.json +4 -2
- package/src/DetokenizeWorker.cpp +1 -1
- package/src/EmbeddingWorker.cpp +2 -2
- package/src/LlamaCompletionWorker.cpp +10 -10
- package/src/LlamaCompletionWorker.h +2 -2
- package/src/LlamaContext.cpp +14 -17
- package/src/TokenizeWorker.cpp +1 -1
- package/src/common.hpp +5 -4
- package/src/llama.cpp/.github/workflows/build.yml +137 -29
- package/src/llama.cpp/.github/workflows/close-issue.yml +5 -0
- package/src/llama.cpp/.github/workflows/docker.yml +46 -34
- package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +7 -0
- package/src/llama.cpp/.github/workflows/nix-ci.yml +7 -0
- package/src/llama.cpp/.github/workflows/python-check-requirements.yml +2 -4
- package/src/llama.cpp/.github/workflows/python-type-check.yml +3 -1
- package/src/llama.cpp/.github/workflows/server.yml +7 -0
- package/src/llama.cpp/CMakeLists.txt +26 -11
- package/src/llama.cpp/cmake/arm64-apple-clang.cmake +16 -0
- package/src/llama.cpp/common/CMakeLists.txt +10 -10
- package/src/llama.cpp/common/arg.cpp +2041 -0
- package/src/llama.cpp/common/arg.h +77 -0
- package/src/llama.cpp/common/common.cpp +523 -1861
- package/src/llama.cpp/common/common.h +234 -106
- package/src/llama.cpp/common/console.cpp +3 -0
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +1 -1
- package/src/llama.cpp/common/log.cpp +401 -0
- package/src/llama.cpp/common/log.h +66 -698
- package/src/llama.cpp/common/ngram-cache.cpp +39 -36
- package/src/llama.cpp/common/ngram-cache.h +19 -19
- package/src/llama.cpp/common/sampling.cpp +356 -350
- package/src/llama.cpp/common/sampling.h +62 -139
- package/src/llama.cpp/common/stb_image.h +5990 -6398
- package/src/llama.cpp/docs/build.md +72 -17
- package/src/llama.cpp/examples/CMakeLists.txt +1 -2
- package/src/llama.cpp/examples/batched/batched.cpp +49 -65
- package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +42 -53
- package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +55 -52
- package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +22 -22
- package/src/llama.cpp/examples/cvector-generator/pca.hpp +3 -13
- package/src/llama.cpp/examples/embedding/embedding.cpp +147 -91
- package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +37 -37
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +39 -38
- package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +14 -39
- package/src/llama.cpp/examples/{baby-llama → gen-docs}/CMakeLists.txt +2 -2
- package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +83 -0
- package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +58 -39
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +46 -39
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +75 -69
- package/src/llama.cpp/examples/infill/infill.cpp +131 -192
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +276 -178
- package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +40 -36
- package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
- package/src/llama.cpp/examples/llava/clip.cpp +686 -150
- package/src/llama.cpp/examples/llava/clip.h +11 -2
- package/src/llama.cpp/examples/llava/llava-cli.cpp +60 -71
- package/src/llama.cpp/examples/llava/llava.cpp +146 -26
- package/src/llama.cpp/examples/llava/llava.h +2 -3
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +323 -0
- package/src/llama.cpp/examples/llava/requirements.txt +1 -0
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +55 -56
- package/src/llama.cpp/examples/lookup/lookup-create.cpp +15 -13
- package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
- package/src/llama.cpp/examples/lookup/lookup-stats.cpp +34 -33
- package/src/llama.cpp/examples/lookup/lookup.cpp +60 -63
- package/src/llama.cpp/examples/main/main.cpp +216 -313
- package/src/llama.cpp/examples/parallel/parallel.cpp +58 -59
- package/src/llama.cpp/examples/passkey/passkey.cpp +53 -61
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +277 -311
- package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/quantize/quantize.cpp +27 -9
- package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +12 -12
- package/src/llama.cpp/examples/retrieval/retrieval.cpp +57 -52
- package/src/llama.cpp/examples/rpc/rpc-server.cpp +27 -2
- package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +60 -46
- package/src/llama.cpp/examples/server/CMakeLists.txt +7 -18
- package/src/llama.cpp/examples/server/server.cpp +1347 -1531
- package/src/llama.cpp/examples/server/tests/requirements.txt +2 -1
- package/src/llama.cpp/examples/server/utils.hpp +396 -107
- package/src/llama.cpp/examples/simple/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/simple/simple.cpp +132 -106
- package/src/llama.cpp/examples/simple-chat/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +197 -0
- package/src/llama.cpp/examples/speculative/speculative.cpp +153 -124
- package/src/llama.cpp/examples/sycl/run-llama2.sh +10 -19
- package/src/llama.cpp/examples/sycl/win-run-llama2.bat +1 -1
- package/src/llama.cpp/examples/tokenize/tokenize.cpp +27 -29
- package/src/llama.cpp/ggml/CMakeLists.txt +29 -12
- package/src/llama.cpp/ggml/include/ggml-alloc.h +3 -3
- package/src/llama.cpp/ggml/include/ggml-amx.h +25 -0
- package/src/llama.cpp/ggml/include/ggml-backend.h +166 -68
- package/src/llama.cpp/ggml/include/ggml-blas.h +5 -3
- package/src/llama.cpp/ggml/include/ggml-cann.h +17 -19
- package/src/llama.cpp/ggml/include/ggml-cpp.h +38 -0
- package/src/llama.cpp/ggml/include/ggml-cpu.h +177 -0
- package/src/llama.cpp/ggml/include/ggml-cuda.h +17 -17
- package/src/llama.cpp/ggml/include/ggml-kompute.h +7 -3
- package/src/llama.cpp/ggml/include/ggml-metal.h +13 -12
- package/src/llama.cpp/ggml/include/ggml-opt.h +216 -0
- package/src/llama.cpp/ggml/include/ggml-rpc.h +9 -5
- package/src/llama.cpp/ggml/include/ggml-sycl.h +18 -11
- package/src/llama.cpp/ggml/include/ggml-vulkan.h +10 -8
- package/src/llama.cpp/ggml/include/ggml.h +272 -505
- package/src/llama.cpp/ggml/src/CMakeLists.txt +69 -1110
- package/src/llama.cpp/ggml/src/ggml-aarch64.c +52 -2116
- package/src/llama.cpp/ggml/src/ggml-aarch64.h +0 -20
- package/src/llama.cpp/ggml/src/ggml-alloc.c +29 -27
- package/src/llama.cpp/ggml/src/ggml-amx/CMakeLists.txt +107 -0
- package/src/llama.cpp/ggml/src/ggml-amx/common.h +94 -0
- package/src/llama.cpp/ggml/src/ggml-amx/ggml-amx.cpp +446 -0
- package/src/llama.cpp/ggml/src/ggml-amx/mmq.cpp +2510 -0
- package/src/llama.cpp/ggml/src/ggml-amx/mmq.h +17 -0
- package/src/llama.cpp/ggml/src/ggml-backend-impl.h +144 -81
- package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +195 -0
- package/src/llama.cpp/ggml/src/{ggml-backend.c → ggml-backend.cpp} +394 -635
- package/src/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +91 -0
- package/src/llama.cpp/ggml/src/{ggml-blas.cpp → ggml-blas/ggml-blas.cpp} +217 -70
- package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +46 -0
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +4 -27
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +32 -4
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +179 -41
- package/src/llama.cpp/ggml/src/ggml-cann/common.h +1 -0
- package/src/llama.cpp/ggml/src/{ggml-cann.cpp → ggml-cann/ggml-cann.cpp} +458 -353
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -1
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +2 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +278 -0
- package/src/llama.cpp/ggml/src/ggml-common.h +20 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +261 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.c +3560 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +30 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +371 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +10822 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.h +63 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +13970 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +663 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +1885 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +155 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +14 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +178 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +134 -0
- package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +106 -0
- package/src/llama.cpp/ggml/src/ggml-impl.h +380 -584
- package/src/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +162 -0
- package/src/llama.cpp/ggml/src/{ggml-kompute.cpp → ggml-kompute/ggml-kompute.cpp} +233 -87
- package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +108 -0
- package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +249 -0
- package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +100 -0
- package/src/llama.cpp/ggml/src/ggml-opt.cpp +867 -0
- package/src/llama.cpp/ggml/src/ggml-quants.c +369 -9994
- package/src/llama.cpp/ggml/src/ggml-quants.h +78 -110
- package/src/llama.cpp/ggml/src/ggml-rpc/CMakeLists.txt +11 -0
- package/src/llama.cpp/ggml/src/{ggml-rpc.cpp → ggml-rpc/ggml-rpc.cpp} +560 -335
- package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +81 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +6 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +51 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +310 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +1 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/conv.cpp +99 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/conv.hpp +21 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +57 -57
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +106 -106
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +4 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +18 -25
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +1011 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +76 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +101 -0
- package/src/llama.cpp/ggml/src/{ggml-sycl.cpp → ggml-sycl/ggml-sycl.cpp} +3350 -3980
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +125 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +23 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +70 -68
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +9 -6
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +56 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.hpp +11 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +8 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +71 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.hpp +21 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +4 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +138 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +10 -0
- package/src/llama.cpp/ggml/src/ggml-threading.cpp +12 -0
- package/src/llama.cpp/ggml/src/ggml-threading.h +12 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +78 -0
- package/src/llama.cpp/ggml/src/{ggml-vulkan.cpp → ggml-vulkan/ggml-vulkan.cpp} +2034 -1718
- package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/CMakeLists.txt +2 -0
- package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/vulkan-shaders-gen.cpp +152 -185
- package/src/llama.cpp/ggml/src/ggml.c +2075 -16579
- package/src/llama.cpp/include/llama.h +296 -285
- package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.out +46 -0
- package/src/llama.cpp/pocs/vdot/q8dot.cpp +4 -3
- package/src/llama.cpp/pocs/vdot/vdot.cpp +8 -7
- package/src/llama.cpp/requirements/requirements-convert_legacy_llama.txt +1 -1
- package/src/llama.cpp/src/CMakeLists.txt +2 -1
- package/src/llama.cpp/src/llama-grammar.cpp +721 -122
- package/src/llama.cpp/src/llama-grammar.h +120 -15
- package/src/llama.cpp/src/llama-impl.h +156 -1
- package/src/llama.cpp/src/llama-sampling.cpp +2058 -346
- package/src/llama.cpp/src/llama-sampling.h +39 -47
- package/src/llama.cpp/src/llama-vocab.cpp +390 -127
- package/src/llama.cpp/src/llama-vocab.h +60 -20
- package/src/llama.cpp/src/llama.cpp +6215 -3263
- package/src/llama.cpp/src/unicode-data.cpp +6 -4
- package/src/llama.cpp/src/unicode-data.h +4 -4
- package/src/llama.cpp/src/unicode.cpp +15 -7
- package/src/llama.cpp/tests/CMakeLists.txt +4 -2
- package/src/llama.cpp/tests/test-arg-parser.cpp +131 -0
- package/src/llama.cpp/tests/test-backend-ops.cpp +1725 -297
- package/src/llama.cpp/tests/test-barrier.cpp +94 -0
- package/src/llama.cpp/tests/test-chat-template.cpp +9 -5
- package/src/llama.cpp/tests/test-grammar-integration.cpp +23 -38
- package/src/llama.cpp/tests/test-grammar-parser.cpp +6 -4
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +23 -8
- package/src/llama.cpp/tests/test-llama-grammar.cpp +9 -8
- package/src/llama.cpp/tests/test-log.cpp +39 -0
- package/src/llama.cpp/tests/test-opt.cpp +853 -142
- package/src/llama.cpp/tests/test-quantize-fns.cpp +28 -19
- package/src/llama.cpp/tests/test-quantize-perf.cpp +16 -14
- package/src/llama.cpp/tests/test-rope.cpp +2 -1
- package/src/llama.cpp/tests/test-sampling.cpp +226 -142
- package/src/llama.cpp/tests/test-tokenizer-0.cpp +56 -36
- package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +5 -5
- package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +5 -5
- package/patches/llama.patch +0 -22
- package/src/llama.cpp/.github/workflows/bench.yml +0 -310
- package/src/llama.cpp/common/grammar-parser.cpp +0 -536
- package/src/llama.cpp/common/grammar-parser.h +0 -29
- package/src/llama.cpp/common/train.cpp +0 -1513
- package/src/llama.cpp/common/train.h +0 -233
- package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +0 -1640
- package/src/llama.cpp/examples/benchmark/CMakeLists.txt +0 -6
- package/src/llama.cpp/examples/benchmark/benchmark-matmult.cpp +0 -275
- package/src/llama.cpp/ggml/src/llamafile/sgemm.cpp +0 -1027
- package/src/llama.cpp/tests/test-grad0.cpp +0 -1566
- /package/src/llama.cpp/ggml/{cmake → src/ggml-cpu/cmake}/FindSIMD.cmake +0 -0
- /package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.h +0 -0
|
@@ -3,6 +3,7 @@
|
|
|
3
3
|
#endif
|
|
4
4
|
|
|
5
5
|
#include "common.h"
|
|
6
|
+
#include "log.h"
|
|
6
7
|
// Change JSON_ASSERT from assert() to GGML_ASSERT:
|
|
7
8
|
#define JSON_ASSERT GGML_ASSERT
|
|
8
9
|
#include "json.hpp"
|
|
@@ -11,6 +12,7 @@
|
|
|
11
12
|
|
|
12
13
|
#include <algorithm>
|
|
13
14
|
#include <cinttypes>
|
|
15
|
+
#include <climits>
|
|
14
16
|
#include <cmath>
|
|
15
17
|
#include <codecvt>
|
|
16
18
|
#include <cstdarg>
|
|
@@ -22,6 +24,7 @@
|
|
|
22
24
|
#include <regex>
|
|
23
25
|
#include <sstream>
|
|
24
26
|
#include <string>
|
|
27
|
+
#include <thread>
|
|
25
28
|
#include <unordered_map>
|
|
26
29
|
#include <unordered_set>
|
|
27
30
|
#include <vector>
|
|
@@ -48,7 +51,6 @@
|
|
|
48
51
|
#if defined(LLAMA_USE_CURL)
|
|
49
52
|
#include <curl/curl.h>
|
|
50
53
|
#include <curl/easy.h>
|
|
51
|
-
#include <thread>
|
|
52
54
|
#include <future>
|
|
53
55
|
#endif
|
|
54
56
|
|
|
@@ -56,14 +58,6 @@
|
|
|
56
58
|
#pragma warning(disable: 4244 4267) // possible loss of data
|
|
57
59
|
#endif
|
|
58
60
|
|
|
59
|
-
#if (defined(GGML_USE_CUDA) || defined(GGML_USE_SYCL))
|
|
60
|
-
#define GGML_USE_CUDA_SYCL
|
|
61
|
-
#endif
|
|
62
|
-
|
|
63
|
-
#if (defined(GGML_USE_CUDA) || defined(GGML_USE_SYCL)) || defined(GGML_USE_VULKAN)
|
|
64
|
-
#define GGML_USE_CUDA_SYCL_VULKAN
|
|
65
|
-
#endif
|
|
66
|
-
|
|
67
61
|
#if defined(LLAMA_USE_CURL)
|
|
68
62
|
#ifdef __linux__
|
|
69
63
|
#include <linux/limits.h>
|
|
@@ -110,8 +104,34 @@ int32_t cpu_get_num_physical_cores() {
|
|
|
110
104
|
if (result == 0) {
|
|
111
105
|
return num_physical_cores;
|
|
112
106
|
}
|
|
113
|
-
#elif defined(_WIN32)
|
|
114
|
-
//TODO:
|
|
107
|
+
#elif defined(_WIN32) && (_WIN32_WINNT >= 0x0601) && !defined(__MINGW64__) // windows 7 and later
|
|
108
|
+
// TODO: windows + arm64 + mingw64
|
|
109
|
+
unsigned int n_threads_win = std::thread::hardware_concurrency();
|
|
110
|
+
unsigned int default_threads = n_threads_win > 0 ? (n_threads_win <= 4 ? n_threads_win : n_threads_win / 2) : 4;
|
|
111
|
+
|
|
112
|
+
DWORD buffer_size = 0;
|
|
113
|
+
if (!GetLogicalProcessorInformationEx(RelationProcessorCore, nullptr, &buffer_size)) {
|
|
114
|
+
if (GetLastError() != ERROR_INSUFFICIENT_BUFFER) {
|
|
115
|
+
return default_threads;
|
|
116
|
+
}
|
|
117
|
+
}
|
|
118
|
+
|
|
119
|
+
std::vector<char> buffer(buffer_size);
|
|
120
|
+
if (!GetLogicalProcessorInformationEx(RelationProcessorCore, reinterpret_cast<PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX>(buffer.data()), &buffer_size)) {
|
|
121
|
+
return default_threads;
|
|
122
|
+
}
|
|
123
|
+
|
|
124
|
+
int32_t num_physical_cores = 0;
|
|
125
|
+
PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX info = reinterpret_cast<PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX>(buffer.data());
|
|
126
|
+
while (buffer_size > 0) {
|
|
127
|
+
if (info->Relationship == RelationProcessorCore) {
|
|
128
|
+
num_physical_cores += info->Processor.GroupCount;
|
|
129
|
+
}
|
|
130
|
+
buffer_size -= info->Size;
|
|
131
|
+
info = reinterpret_cast<PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX>(reinterpret_cast<char*>(info) + info->Size);
|
|
132
|
+
}
|
|
133
|
+
|
|
134
|
+
return num_physical_cores > 0 ? num_physical_cores : default_threads;
|
|
115
135
|
#endif
|
|
116
136
|
unsigned int n_threads = std::thread::hardware_concurrency();
|
|
117
137
|
return n_threads > 0 ? (n_threads <= 4 ? n_threads : n_threads / 2) : 4;
|
|
@@ -186,1537 +206,193 @@ int32_t cpu_get_num_math() {
|
|
|
186
206
|
}
|
|
187
207
|
}
|
|
188
208
|
}
|
|
189
|
-
#endif
|
|
190
|
-
return cpu_get_num_physical_cores();
|
|
191
|
-
}
|
|
192
|
-
|
|
193
|
-
//
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
if (
|
|
199
|
-
params.hf_token = std::getenv("HF_TOKEN");
|
|
200
|
-
}
|
|
201
|
-
}
|
|
202
|
-
|
|
203
|
-
void gpt_params_handle_model_default(gpt_params & params) {
|
|
204
|
-
if (!params.hf_repo.empty()) {
|
|
205
|
-
// short-hand to avoid specifying --hf-file -> default it to --model
|
|
206
|
-
if (params.hf_file.empty()) {
|
|
207
|
-
if (params.model.empty()) {
|
|
208
|
-
throw std::invalid_argument("error: --hf-repo requires either --hf-file or --model\n");
|
|
209
|
-
}
|
|
210
|
-
params.hf_file = params.model;
|
|
211
|
-
} else if (params.model.empty()) {
|
|
212
|
-
params.model = fs_get_cache_file(string_split(params.hf_file, '/').back());
|
|
213
|
-
}
|
|
214
|
-
} else if (!params.model_url.empty()) {
|
|
215
|
-
if (params.model.empty()) {
|
|
216
|
-
auto f = string_split(params.model_url, '#').front();
|
|
217
|
-
f = string_split(f, '?').front();
|
|
218
|
-
params.model = fs_get_cache_file(string_split(f, '/').back());
|
|
219
|
-
}
|
|
220
|
-
} else if (params.model.empty()) {
|
|
221
|
-
params.model = DEFAULT_MODEL_PATH;
|
|
222
|
-
}
|
|
223
|
-
}
|
|
224
|
-
|
|
225
|
-
bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
|
|
226
|
-
bool invalid_param = false;
|
|
227
|
-
std::string arg;
|
|
228
|
-
const std::string arg_prefix = "--";
|
|
229
|
-
llama_sampling_params & sparams = params.sparams;
|
|
230
|
-
|
|
231
|
-
for (int i = 1; i < argc; i++) {
|
|
232
|
-
arg = argv[i];
|
|
233
|
-
if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
|
|
234
|
-
std::replace(arg.begin(), arg.end(), '_', '-');
|
|
235
|
-
}
|
|
236
|
-
if (!gpt_params_find_arg(argc, argv, arg, params, i, invalid_param)) {
|
|
237
|
-
throw std::invalid_argument("error: unknown argument: " + arg);
|
|
238
|
-
}
|
|
239
|
-
if (invalid_param) {
|
|
240
|
-
throw std::invalid_argument("error: invalid parameter for argument: " + arg);
|
|
241
|
-
}
|
|
242
|
-
}
|
|
243
|
-
|
|
244
|
-
if (params.prompt_cache_all && (params.interactive || params.interactive_first)) {
|
|
245
|
-
throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n");
|
|
246
|
-
}
|
|
247
|
-
|
|
248
|
-
gpt_params_handle_model_default(params);
|
|
249
|
-
|
|
250
|
-
gpt_params_handle_hf_token(params);
|
|
251
|
-
|
|
252
|
-
if (params.escape) {
|
|
253
|
-
string_process_escapes(params.prompt);
|
|
254
|
-
string_process_escapes(params.input_prefix);
|
|
255
|
-
string_process_escapes(params.input_suffix);
|
|
256
|
-
string_process_escapes(sparams.cfg_negative_prompt);
|
|
257
|
-
for (auto & antiprompt : params.antiprompt) {
|
|
258
|
-
string_process_escapes(antiprompt);
|
|
259
|
-
}
|
|
260
|
-
}
|
|
261
|
-
|
|
262
|
-
if (!params.kv_overrides.empty()) {
|
|
263
|
-
params.kv_overrides.emplace_back();
|
|
264
|
-
params.kv_overrides.back().key[0] = 0;
|
|
265
|
-
}
|
|
266
|
-
|
|
267
|
-
return true;
|
|
268
|
-
}
|
|
269
|
-
|
|
270
|
-
bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
|
|
271
|
-
const auto params_org = params; // the example can modify the default params
|
|
272
|
-
|
|
273
|
-
try {
|
|
274
|
-
if (!gpt_params_parse_ex(argc, argv, params) || params.usage) {
|
|
275
|
-
params = params_org;
|
|
276
|
-
params.usage = true;
|
|
277
|
-
return false;
|
|
278
|
-
}
|
|
279
|
-
} catch (const std::invalid_argument & ex) {
|
|
280
|
-
fprintf(stderr, "%s\n", ex.what());
|
|
281
|
-
params = params_org;
|
|
282
|
-
return false;
|
|
283
|
-
}
|
|
284
|
-
|
|
285
|
-
return true;
|
|
286
|
-
}
|
|
287
|
-
|
|
288
|
-
#define CHECK_ARG if (++i >= argc) { invalid_param = true; return true; }
|
|
289
|
-
|
|
290
|
-
bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param) {
|
|
291
|
-
const char split_delim = ',';
|
|
292
|
-
|
|
293
|
-
llama_sampling_params & sparams = params.sparams;
|
|
294
|
-
|
|
295
|
-
if (arg == "-s" || arg == "--seed") {
|
|
296
|
-
CHECK_ARG
|
|
297
|
-
// TODO: this is temporary, in the future the sampling state will be moved fully to llama_sampling_context.
|
|
298
|
-
params.seed = std::stoul(argv[i]);
|
|
299
|
-
sparams.seed = std::stoul(argv[i]);
|
|
300
|
-
return true;
|
|
301
|
-
}
|
|
302
|
-
if (arg == "-t" || arg == "--threads") {
|
|
303
|
-
CHECK_ARG
|
|
304
|
-
params.n_threads = std::stoi(argv[i]);
|
|
305
|
-
if (params.n_threads <= 0) {
|
|
306
|
-
params.n_threads = std::thread::hardware_concurrency();
|
|
307
|
-
}
|
|
308
|
-
return true;
|
|
309
|
-
}
|
|
310
|
-
if (arg == "-tb" || arg == "--threads-batch") {
|
|
311
|
-
CHECK_ARG
|
|
312
|
-
params.n_threads_batch = std::stoi(argv[i]);
|
|
313
|
-
if (params.n_threads_batch <= 0) {
|
|
314
|
-
params.n_threads_batch = std::thread::hardware_concurrency();
|
|
315
|
-
}
|
|
316
|
-
return true;
|
|
317
|
-
}
|
|
318
|
-
if (arg == "-td" || arg == "--threads-draft") {
|
|
319
|
-
CHECK_ARG
|
|
320
|
-
params.n_threads_draft = std::stoi(argv[i]);
|
|
321
|
-
if (params.n_threads_draft <= 0) {
|
|
322
|
-
params.n_threads_draft = std::thread::hardware_concurrency();
|
|
323
|
-
}
|
|
324
|
-
return true;
|
|
325
|
-
}
|
|
326
|
-
if (arg == "-tbd" || arg == "--threads-batch-draft") {
|
|
327
|
-
CHECK_ARG
|
|
328
|
-
params.n_threads_batch_draft = std::stoi(argv[i]);
|
|
329
|
-
if (params.n_threads_batch_draft <= 0) {
|
|
330
|
-
params.n_threads_batch_draft = std::thread::hardware_concurrency();
|
|
331
|
-
}
|
|
332
|
-
return true;
|
|
333
|
-
}
|
|
334
|
-
if (arg == "-p" || arg == "--prompt") {
|
|
335
|
-
CHECK_ARG
|
|
336
|
-
params.prompt = argv[i];
|
|
337
|
-
return true;
|
|
338
|
-
}
|
|
339
|
-
if (arg == "-e" || arg == "--escape") {
|
|
340
|
-
params.escape = true;
|
|
341
|
-
return true;
|
|
342
|
-
}
|
|
343
|
-
if (arg == "--no-escape") {
|
|
344
|
-
params.escape = false;
|
|
345
|
-
return true;
|
|
346
|
-
}
|
|
347
|
-
if (arg == "--prompt-cache") {
|
|
348
|
-
CHECK_ARG
|
|
349
|
-
params.path_prompt_cache = argv[i];
|
|
350
|
-
return true;
|
|
351
|
-
}
|
|
352
|
-
if (arg == "--prompt-cache-all") {
|
|
353
|
-
params.prompt_cache_all = true;
|
|
354
|
-
return true;
|
|
355
|
-
}
|
|
356
|
-
if (arg == "--prompt-cache-ro") {
|
|
357
|
-
params.prompt_cache_ro = true;
|
|
358
|
-
return true;
|
|
359
|
-
}
|
|
360
|
-
if (arg == "-bf" || arg == "--binary-file") {
|
|
361
|
-
CHECK_ARG
|
|
362
|
-
std::ifstream file(argv[i], std::ios::binary);
|
|
363
|
-
if (!file) {
|
|
364
|
-
fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
|
|
365
|
-
invalid_param = true;
|
|
366
|
-
return true;
|
|
367
|
-
}
|
|
368
|
-
// store the external file name in params
|
|
369
|
-
params.prompt_file = argv[i];
|
|
370
|
-
std::ostringstream ss;
|
|
371
|
-
ss << file.rdbuf();
|
|
372
|
-
params.prompt = ss.str();
|
|
373
|
-
fprintf(stderr, "Read %zu bytes from binary file %s\n", params.prompt.size(), argv[i]);
|
|
374
|
-
return true;
|
|
375
|
-
}
|
|
376
|
-
if (arg == "-f" || arg == "--file") {
|
|
377
|
-
CHECK_ARG
|
|
378
|
-
std::ifstream file(argv[i]);
|
|
379
|
-
if (!file) {
|
|
380
|
-
fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
|
|
381
|
-
invalid_param = true;
|
|
382
|
-
return true;
|
|
383
|
-
}
|
|
384
|
-
// store the external file name in params
|
|
385
|
-
params.prompt_file = argv[i];
|
|
386
|
-
std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.prompt));
|
|
387
|
-
if (!params.prompt.empty() && params.prompt.back() == '\n') {
|
|
388
|
-
params.prompt.pop_back();
|
|
389
|
-
}
|
|
390
|
-
return true;
|
|
391
|
-
}
|
|
392
|
-
if (arg == "--in-file") {
|
|
393
|
-
CHECK_ARG
|
|
394
|
-
std::ifstream file(argv[i]);
|
|
395
|
-
if (!file) {
|
|
396
|
-
fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
|
|
397
|
-
invalid_param = true;
|
|
398
|
-
return true;
|
|
399
|
-
}
|
|
400
|
-
params.in_files.push_back(argv[i]);
|
|
401
|
-
return true;
|
|
402
|
-
}
|
|
403
|
-
if (arg == "-n" || arg == "--predict" || arg == "--n-predict") {
|
|
404
|
-
CHECK_ARG
|
|
405
|
-
params.n_predict = std::stoi(argv[i]);
|
|
406
|
-
return true;
|
|
407
|
-
}
|
|
408
|
-
if (arg == "--top-k") {
|
|
409
|
-
CHECK_ARG
|
|
410
|
-
sparams.top_k = std::stoi(argv[i]);
|
|
411
|
-
return true;
|
|
412
|
-
}
|
|
413
|
-
if (arg == "-c" || arg == "--ctx-size") {
|
|
414
|
-
CHECK_ARG
|
|
415
|
-
params.n_ctx = std::stoi(argv[i]);
|
|
416
|
-
return true;
|
|
417
|
-
}
|
|
418
|
-
if (arg == "--grp-attn-n" || arg == "-gan") {
|
|
419
|
-
CHECK_ARG
|
|
420
|
-
params.grp_attn_n = std::stoi(argv[i]);
|
|
421
|
-
return true;
|
|
422
|
-
}
|
|
423
|
-
if (arg == "--grp-attn-w" || arg == "-gaw") {
|
|
424
|
-
CHECK_ARG
|
|
425
|
-
params.grp_attn_w = std::stoi(argv[i]);
|
|
426
|
-
return true;
|
|
427
|
-
}
|
|
428
|
-
if (arg == "--rope-freq-base") {
|
|
429
|
-
CHECK_ARG
|
|
430
|
-
params.rope_freq_base = std::stof(argv[i]);
|
|
431
|
-
return true;
|
|
432
|
-
}
|
|
433
|
-
if (arg == "--rope-freq-scale") {
|
|
434
|
-
CHECK_ARG
|
|
435
|
-
params.rope_freq_scale = std::stof(argv[i]);
|
|
436
|
-
return true;
|
|
437
|
-
}
|
|
438
|
-
if (arg == "--rope-scaling") {
|
|
439
|
-
CHECK_ARG
|
|
440
|
-
std::string value(argv[i]);
|
|
441
|
-
/**/ if (value == "none") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_NONE; }
|
|
442
|
-
else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_LINEAR; }
|
|
443
|
-
else if (value == "yarn") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_YARN; }
|
|
444
|
-
else { invalid_param = true; }
|
|
445
|
-
return true;
|
|
446
|
-
}
|
|
447
|
-
if (arg == "--rope-scale") {
|
|
448
|
-
CHECK_ARG
|
|
449
|
-
params.rope_freq_scale = 1.0f / std::stof(argv[i]);
|
|
450
|
-
return true;
|
|
451
|
-
}
|
|
452
|
-
if (arg == "--yarn-orig-ctx") {
|
|
453
|
-
CHECK_ARG
|
|
454
|
-
params.yarn_orig_ctx = std::stoi(argv[i]);
|
|
455
|
-
return true;
|
|
456
|
-
}
|
|
457
|
-
if (arg == "--yarn-ext-factor") {
|
|
458
|
-
CHECK_ARG
|
|
459
|
-
params.yarn_ext_factor = std::stof(argv[i]);
|
|
460
|
-
return true;
|
|
461
|
-
}
|
|
462
|
-
if (arg == "--yarn-attn-factor") {
|
|
463
|
-
CHECK_ARG
|
|
464
|
-
params.yarn_attn_factor = std::stof(argv[i]);
|
|
465
|
-
return true;
|
|
466
|
-
}
|
|
467
|
-
if (arg == "--yarn-beta-fast") {
|
|
468
|
-
CHECK_ARG
|
|
469
|
-
params.yarn_beta_fast = std::stof(argv[i]);
|
|
470
|
-
return true;
|
|
471
|
-
}
|
|
472
|
-
if (arg == "--yarn-beta-slow") {
|
|
473
|
-
CHECK_ARG
|
|
474
|
-
params.yarn_beta_slow = std::stof(argv[i]);
|
|
475
|
-
return true;
|
|
476
|
-
}
|
|
477
|
-
if (arg == "--pooling") {
|
|
478
|
-
CHECK_ARG
|
|
479
|
-
std::string value(argv[i]);
|
|
480
|
-
/**/ if (value == "none") { params.pooling_type = LLAMA_POOLING_TYPE_NONE; }
|
|
481
|
-
else if (value == "mean") { params.pooling_type = LLAMA_POOLING_TYPE_MEAN; }
|
|
482
|
-
else if (value == "cls") { params.pooling_type = LLAMA_POOLING_TYPE_CLS; }
|
|
483
|
-
else if (value == "last") { params.pooling_type = LLAMA_POOLING_TYPE_LAST; }
|
|
484
|
-
else { invalid_param = true; }
|
|
485
|
-
return true;
|
|
486
|
-
}
|
|
487
|
-
if (arg == "--attention") {
|
|
488
|
-
CHECK_ARG
|
|
489
|
-
std::string value(argv[i]);
|
|
490
|
-
/**/ if (value == "causal") { params.attention_type = LLAMA_ATTENTION_TYPE_CAUSAL; }
|
|
491
|
-
else if (value == "non-causal") { params.attention_type = LLAMA_ATTENTION_TYPE_NON_CAUSAL; }
|
|
492
|
-
else { invalid_param = true; }
|
|
493
|
-
return true;
|
|
494
|
-
}
|
|
495
|
-
if (arg == "--defrag-thold" || arg == "-dt") {
|
|
496
|
-
CHECK_ARG
|
|
497
|
-
params.defrag_thold = std::stof(argv[i]);
|
|
498
|
-
return true;
|
|
499
|
-
}
|
|
500
|
-
if (arg == "--samplers") {
|
|
501
|
-
CHECK_ARG
|
|
502
|
-
const auto sampler_names = string_split(argv[i], ';');
|
|
503
|
-
sparams.samplers_sequence = llama_sampling_types_from_names(sampler_names, true);
|
|
504
|
-
return true;
|
|
505
|
-
}
|
|
506
|
-
if (arg == "--sampling-seq") {
|
|
507
|
-
CHECK_ARG
|
|
508
|
-
sparams.samplers_sequence = llama_sampling_types_from_chars(argv[i]);
|
|
509
|
-
return true;
|
|
510
|
-
}
|
|
511
|
-
if (arg == "--top-p") {
|
|
512
|
-
CHECK_ARG
|
|
513
|
-
sparams.top_p = std::stof(argv[i]);
|
|
514
|
-
return true;
|
|
515
|
-
}
|
|
516
|
-
if (arg == "--min-p") {
|
|
517
|
-
CHECK_ARG
|
|
518
|
-
sparams.min_p = std::stof(argv[i]);
|
|
519
|
-
return true;
|
|
520
|
-
}
|
|
521
|
-
if (arg == "--temp") {
|
|
522
|
-
CHECK_ARG
|
|
523
|
-
sparams.temp = std::stof(argv[i]);
|
|
524
|
-
sparams.temp = std::max(sparams.temp, 0.0f);
|
|
525
|
-
return true;
|
|
526
|
-
}
|
|
527
|
-
if (arg == "--tfs") {
|
|
528
|
-
CHECK_ARG
|
|
529
|
-
sparams.tfs_z = std::stof(argv[i]);
|
|
530
|
-
return true;
|
|
531
|
-
}
|
|
532
|
-
if (arg == "--typical") {
|
|
533
|
-
CHECK_ARG
|
|
534
|
-
sparams.typical_p = std::stof(argv[i]);
|
|
535
|
-
return true;
|
|
536
|
-
}
|
|
537
|
-
if (arg == "--repeat-last-n") {
|
|
538
|
-
CHECK_ARG
|
|
539
|
-
sparams.penalty_last_n = std::stoi(argv[i]);
|
|
540
|
-
sparams.n_prev = std::max(sparams.n_prev, sparams.penalty_last_n);
|
|
541
|
-
return true;
|
|
542
|
-
}
|
|
543
|
-
if (arg == "--repeat-penalty") {
|
|
544
|
-
CHECK_ARG
|
|
545
|
-
sparams.penalty_repeat = std::stof(argv[i]);
|
|
546
|
-
return true;
|
|
547
|
-
}
|
|
548
|
-
if (arg == "--frequency-penalty") {
|
|
549
|
-
CHECK_ARG
|
|
550
|
-
sparams.penalty_freq = std::stof(argv[i]);
|
|
551
|
-
return true;
|
|
552
|
-
}
|
|
553
|
-
if (arg == "--presence-penalty") {
|
|
554
|
-
CHECK_ARG
|
|
555
|
-
sparams.penalty_present = std::stof(argv[i]);
|
|
556
|
-
return true;
|
|
557
|
-
}
|
|
558
|
-
if (arg == "--dynatemp-range") {
|
|
559
|
-
CHECK_ARG
|
|
560
|
-
sparams.dynatemp_range = std::stof(argv[i]);
|
|
561
|
-
return true;
|
|
562
|
-
}
|
|
563
|
-
if (arg == "--dynatemp-exp") {
|
|
564
|
-
CHECK_ARG
|
|
565
|
-
sparams.dynatemp_exponent = std::stof(argv[i]);
|
|
566
|
-
return true;
|
|
567
|
-
}
|
|
568
|
-
if (arg == "--mirostat") {
|
|
569
|
-
CHECK_ARG
|
|
570
|
-
sparams.mirostat = std::stoi(argv[i]);
|
|
571
|
-
return true;
|
|
572
|
-
}
|
|
573
|
-
if (arg == "--mirostat-lr") {
|
|
574
|
-
CHECK_ARG
|
|
575
|
-
sparams.mirostat_eta = std::stof(argv[i]);
|
|
576
|
-
return true;
|
|
577
|
-
}
|
|
578
|
-
if (arg == "--mirostat-ent") {
|
|
579
|
-
CHECK_ARG
|
|
580
|
-
sparams.mirostat_tau = std::stof(argv[i]);
|
|
581
|
-
return true;
|
|
582
|
-
}
|
|
583
|
-
if (arg == "--cfg-negative-prompt") {
|
|
584
|
-
CHECK_ARG
|
|
585
|
-
sparams.cfg_negative_prompt = argv[i];
|
|
586
|
-
return true;
|
|
587
|
-
}
|
|
588
|
-
if (arg == "--cfg-negative-prompt-file") {
|
|
589
|
-
CHECK_ARG
|
|
590
|
-
std::ifstream file(argv[i]);
|
|
591
|
-
if (!file) {
|
|
592
|
-
fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
|
|
593
|
-
invalid_param = true;
|
|
594
|
-
return true;
|
|
595
|
-
}
|
|
596
|
-
std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(sparams.cfg_negative_prompt));
|
|
597
|
-
if (!sparams.cfg_negative_prompt.empty() && sparams.cfg_negative_prompt.back() == '\n') {
|
|
598
|
-
sparams.cfg_negative_prompt.pop_back();
|
|
599
|
-
}
|
|
600
|
-
return true;
|
|
601
|
-
}
|
|
602
|
-
if (arg == "--cfg-scale") {
|
|
603
|
-
CHECK_ARG
|
|
604
|
-
sparams.cfg_scale = std::stof(argv[i]);
|
|
605
|
-
return true;
|
|
606
|
-
}
|
|
607
|
-
if (arg == "-b" || arg == "--batch-size") {
|
|
608
|
-
CHECK_ARG
|
|
609
|
-
params.n_batch = std::stoi(argv[i]);
|
|
610
|
-
return true;
|
|
611
|
-
}
|
|
612
|
-
if (arg == "-ub" || arg == "--ubatch-size") {
|
|
613
|
-
CHECK_ARG
|
|
614
|
-
params.n_ubatch = std::stoi(argv[i]);
|
|
615
|
-
return true;
|
|
616
|
-
}
|
|
617
|
-
if (arg == "--keep") {
|
|
618
|
-
CHECK_ARG
|
|
619
|
-
params.n_keep = std::stoi(argv[i]);
|
|
620
|
-
return true;
|
|
621
|
-
}
|
|
622
|
-
if (arg == "--draft") {
|
|
623
|
-
CHECK_ARG
|
|
624
|
-
params.n_draft = std::stoi(argv[i]);
|
|
625
|
-
return true;
|
|
626
|
-
}
|
|
627
|
-
if (arg == "--chunks") {
|
|
628
|
-
CHECK_ARG
|
|
629
|
-
params.n_chunks = std::stoi(argv[i]);
|
|
630
|
-
return true;
|
|
631
|
-
}
|
|
632
|
-
if (arg == "-np" || arg == "--parallel") {
|
|
633
|
-
CHECK_ARG
|
|
634
|
-
params.n_parallel = std::stoi(argv[i]);
|
|
635
|
-
return true;
|
|
636
|
-
}
|
|
637
|
-
if (arg == "-ns" || arg == "--sequences") {
|
|
638
|
-
CHECK_ARG
|
|
639
|
-
params.n_sequences = std::stoi(argv[i]);
|
|
640
|
-
return true;
|
|
641
|
-
}
|
|
642
|
-
if (arg == "--p-split" || arg == "-ps") {
|
|
643
|
-
CHECK_ARG
|
|
644
|
-
params.p_split = std::stof(argv[i]);
|
|
645
|
-
return true;
|
|
646
|
-
}
|
|
647
|
-
if (arg == "-m" || arg == "--model") {
|
|
648
|
-
CHECK_ARG
|
|
649
|
-
params.model = argv[i];
|
|
650
|
-
return true;
|
|
651
|
-
}
|
|
652
|
-
if (arg == "-md" || arg == "--model-draft") {
|
|
653
|
-
CHECK_ARG
|
|
654
|
-
params.model_draft = argv[i];
|
|
655
|
-
return true;
|
|
656
|
-
}
|
|
657
|
-
if (arg == "-a" || arg == "--alias") {
|
|
658
|
-
CHECK_ARG
|
|
659
|
-
params.model_alias = argv[i];
|
|
660
|
-
return true;
|
|
661
|
-
}
|
|
662
|
-
if (arg == "-mu" || arg == "--model-url") {
|
|
663
|
-
CHECK_ARG
|
|
664
|
-
params.model_url = argv[i];
|
|
665
|
-
return true;
|
|
666
|
-
}
|
|
667
|
-
if (arg == "-hft" || arg == "--hf-token") {
|
|
668
|
-
if (++i >= argc) {
|
|
669
|
-
invalid_param = true;
|
|
670
|
-
return true;
|
|
671
|
-
}
|
|
672
|
-
params.hf_token = argv[i];
|
|
673
|
-
return true;
|
|
674
|
-
}
|
|
675
|
-
if (arg == "-hfr" || arg == "--hf-repo") {
|
|
676
|
-
CHECK_ARG
|
|
677
|
-
params.hf_repo = argv[i];
|
|
678
|
-
return true;
|
|
679
|
-
}
|
|
680
|
-
if (arg == "-hff" || arg == "--hf-file") {
|
|
681
|
-
CHECK_ARG
|
|
682
|
-
params.hf_file = argv[i];
|
|
683
|
-
return true;
|
|
684
|
-
}
|
|
685
|
-
if (arg == "--lora") {
|
|
686
|
-
CHECK_ARG
|
|
687
|
-
params.lora_adapter.emplace_back(argv[i], 1.0f);
|
|
688
|
-
return true;
|
|
689
|
-
}
|
|
690
|
-
if (arg == "--lora-scaled") {
|
|
691
|
-
CHECK_ARG
|
|
692
|
-
const char* lora_adapter = argv[i];
|
|
693
|
-
CHECK_ARG
|
|
694
|
-
params.lora_adapter.emplace_back(lora_adapter, std::stof(argv[i]));
|
|
695
|
-
return true;
|
|
696
|
-
}
|
|
697
|
-
if (arg == "--control-vector") {
|
|
698
|
-
CHECK_ARG
|
|
699
|
-
params.control_vectors.push_back({ 1.0f, argv[i], });
|
|
700
|
-
return true;
|
|
701
|
-
}
|
|
702
|
-
if (arg == "--control-vector-scaled") {
|
|
703
|
-
CHECK_ARG
|
|
704
|
-
const char* fname = argv[i];
|
|
705
|
-
CHECK_ARG
|
|
706
|
-
params.control_vectors.push_back({ std::stof(argv[i]), fname, });
|
|
707
|
-
return true;
|
|
708
|
-
}
|
|
709
|
-
if (arg == "--control-vector-layer-range") {
|
|
710
|
-
CHECK_ARG
|
|
711
|
-
params.control_vector_layer_start = std::stoi(argv[i]);
|
|
712
|
-
CHECK_ARG
|
|
713
|
-
params.control_vector_layer_end = std::stoi(argv[i]);
|
|
714
|
-
return true;
|
|
715
|
-
}
|
|
716
|
-
if (arg == "--mmproj") {
|
|
717
|
-
CHECK_ARG
|
|
718
|
-
params.mmproj = argv[i];
|
|
719
|
-
return true;
|
|
720
|
-
}
|
|
721
|
-
if (arg == "--image") {
|
|
722
|
-
CHECK_ARG
|
|
723
|
-
params.image.emplace_back(argv[i]);
|
|
724
|
-
return true;
|
|
725
|
-
}
|
|
726
|
-
if (arg == "-i" || arg == "--interactive") {
|
|
727
|
-
params.interactive = true;
|
|
728
|
-
return true;
|
|
729
|
-
}
|
|
730
|
-
if (arg == "-sp" || arg == "--special") {
|
|
731
|
-
params.special = true;
|
|
732
|
-
return true;
|
|
733
|
-
}
|
|
734
|
-
if (arg == "--embedding" || arg == "--embeddings") {
|
|
735
|
-
params.embedding = true;
|
|
736
|
-
return true;
|
|
737
|
-
}
|
|
738
|
-
if (arg == "--embd-normalize") {
|
|
739
|
-
CHECK_ARG
|
|
740
|
-
params.embd_normalize = std::stoi(argv[i]);
|
|
741
|
-
return true;
|
|
742
|
-
}
|
|
743
|
-
if (arg == "--embd-output-format") {
|
|
744
|
-
CHECK_ARG
|
|
745
|
-
params.embd_out = argv[i];
|
|
746
|
-
return true;
|
|
747
|
-
}
|
|
748
|
-
if (arg == "--embd-separator") {
|
|
749
|
-
CHECK_ARG
|
|
750
|
-
params.embd_sep = argv[i];
|
|
751
|
-
return true;
|
|
752
|
-
}
|
|
753
|
-
if (arg == "-if" || arg == "--interactive-first") {
|
|
754
|
-
params.interactive_first = true;
|
|
755
|
-
return true;
|
|
756
|
-
}
|
|
757
|
-
if (arg == "-cnv" || arg == "--conversation") {
|
|
758
|
-
params.conversation = true;
|
|
759
|
-
return true;
|
|
760
|
-
}
|
|
761
|
-
if (arg == "--infill") {
|
|
762
|
-
params.infill = true;
|
|
763
|
-
return true;
|
|
764
|
-
}
|
|
765
|
-
if (arg == "-dkvc" || arg == "--dump-kv-cache") {
|
|
766
|
-
params.dump_kv_cache = true;
|
|
767
|
-
return true;
|
|
768
|
-
}
|
|
769
|
-
if (arg == "-nkvo" || arg == "--no-kv-offload") {
|
|
770
|
-
params.no_kv_offload = true;
|
|
771
|
-
return true;
|
|
772
|
-
}
|
|
773
|
-
if (arg == "-ctk" || arg == "--cache-type-k") {
|
|
774
|
-
params.cache_type_k = argv[++i];
|
|
775
|
-
return true;
|
|
776
|
-
}
|
|
777
|
-
if (arg == "-ctv" || arg == "--cache-type-v") {
|
|
778
|
-
params.cache_type_v = argv[++i];
|
|
779
|
-
return true;
|
|
780
|
-
}
|
|
781
|
-
if (arg == "-mli" || arg == "--multiline-input") {
|
|
782
|
-
params.multiline_input = true;
|
|
783
|
-
return true;
|
|
784
|
-
}
|
|
785
|
-
if (arg == "--simple-io") {
|
|
786
|
-
params.simple_io = true;
|
|
787
|
-
return true;
|
|
788
|
-
}
|
|
789
|
-
if (arg == "-cb" || arg == "--cont-batching") {
|
|
790
|
-
params.cont_batching = true;
|
|
791
|
-
return true;
|
|
792
|
-
}
|
|
793
|
-
if (arg == "-nocb" || arg == "--no-cont-batching") {
|
|
794
|
-
params.cont_batching = false;
|
|
795
|
-
return true;
|
|
796
|
-
}
|
|
797
|
-
if (arg == "-fa" || arg == "--flash-attn") {
|
|
798
|
-
params.flash_attn = true;
|
|
799
|
-
return true;
|
|
800
|
-
}
|
|
801
|
-
if (arg == "-co" || arg == "--color") {
|
|
802
|
-
params.use_color = true;
|
|
803
|
-
return true;
|
|
804
|
-
}
|
|
805
|
-
if (arg == "--mlock") {
|
|
806
|
-
params.use_mlock = true;
|
|
807
|
-
return true;
|
|
808
|
-
}
|
|
809
|
-
if (arg == "-ngl" || arg == "--gpu-layers" || arg == "--n-gpu-layers") {
|
|
810
|
-
CHECK_ARG
|
|
811
|
-
params.n_gpu_layers = std::stoi(argv[i]);
|
|
812
|
-
if (!llama_supports_gpu_offload()) {
|
|
813
|
-
fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers option will be ignored\n");
|
|
814
|
-
fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
|
|
815
|
-
}
|
|
816
|
-
return true;
|
|
817
|
-
}
|
|
818
|
-
if (arg == "-ngld" || arg == "--gpu-layers-draft" || arg == "--gpu-layers-draft") {
|
|
819
|
-
CHECK_ARG
|
|
820
|
-
params.n_gpu_layers_draft = std::stoi(argv[i]);
|
|
821
|
-
if (!llama_supports_gpu_offload()) {
|
|
822
|
-
fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers-draft option will be ignored\n");
|
|
823
|
-
fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
|
|
824
|
-
}
|
|
825
|
-
return true;
|
|
826
|
-
}
|
|
827
|
-
if (arg == "--main-gpu" || arg == "-mg") {
|
|
828
|
-
CHECK_ARG
|
|
829
|
-
params.main_gpu = std::stoi(argv[i]);
|
|
830
|
-
#ifndef GGML_USE_CUDA_SYCL_VULKAN
|
|
831
|
-
fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL/Vulkan. Setting the main GPU has no effect.\n");
|
|
832
|
-
#endif // GGML_USE_CUDA_SYCL_VULKAN
|
|
833
|
-
return true;
|
|
834
|
-
}
|
|
835
|
-
if (arg == "--split-mode" || arg == "-sm") {
|
|
836
|
-
CHECK_ARG
|
|
837
|
-
std::string arg_next = argv[i];
|
|
838
|
-
if (arg_next == "none") {
|
|
839
|
-
params.split_mode = LLAMA_SPLIT_MODE_NONE;
|
|
840
|
-
}
|
|
841
|
-
else if (arg_next == "layer") {
|
|
842
|
-
params.split_mode = LLAMA_SPLIT_MODE_LAYER;
|
|
843
|
-
}
|
|
844
|
-
else if (arg_next == "row") {
|
|
845
|
-
#ifdef GGML_USE_SYCL
|
|
846
|
-
fprintf(stderr, "warning: The split mode value:[row] is not supported by llama.cpp with SYCL. It's developing.\nExit!\n");
|
|
847
|
-
exit(1);
|
|
848
|
-
#endif // GGML_USE_SYCL
|
|
849
|
-
params.split_mode = LLAMA_SPLIT_MODE_ROW;
|
|
850
|
-
}
|
|
851
|
-
else {
|
|
852
|
-
invalid_param = true;
|
|
853
|
-
return true;
|
|
854
|
-
}
|
|
855
|
-
#ifndef GGML_USE_CUDA_SYCL_VULKAN
|
|
856
|
-
fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL/Vulkan. Setting the split mode has no effect.\n");
|
|
857
|
-
#endif // GGML_USE_CUDA_SYCL_VULKAN
|
|
858
|
-
return true;
|
|
859
|
-
}
|
|
860
|
-
if (arg == "--tensor-split" || arg == "-ts") {
|
|
861
|
-
CHECK_ARG
|
|
862
|
-
std::string arg_next = argv[i];
|
|
863
|
-
|
|
864
|
-
// split string by , and /
|
|
865
|
-
const std::regex regex{ R"([,/]+)" };
|
|
866
|
-
std::sregex_token_iterator it{ arg_next.begin(), arg_next.end(), regex, -1 };
|
|
867
|
-
std::vector<std::string> split_arg{ it, {} };
|
|
868
|
-
if (split_arg.size() >= llama_max_devices()) {
|
|
869
|
-
invalid_param = true;
|
|
870
|
-
return true;
|
|
871
|
-
}
|
|
872
|
-
for (size_t i = 0; i < llama_max_devices(); ++i) {
|
|
873
|
-
if (i < split_arg.size()) {
|
|
874
|
-
params.tensor_split[i] = std::stof(split_arg[i]);
|
|
875
|
-
}
|
|
876
|
-
else {
|
|
877
|
-
params.tensor_split[i] = 0.0f;
|
|
878
|
-
}
|
|
879
|
-
}
|
|
880
|
-
#ifndef GGML_USE_CUDA_SYCL_VULKAN
|
|
881
|
-
fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL/Vulkan. Setting a tensor split has no effect.\n");
|
|
882
|
-
#endif // GGML_USE_CUDA_SYCL_VULKAN
|
|
883
|
-
return true;
|
|
884
|
-
}
|
|
885
|
-
if (arg == "--rpc") {
|
|
886
|
-
CHECK_ARG
|
|
887
|
-
params.rpc_servers = argv[i];
|
|
888
|
-
return true;
|
|
889
|
-
}
|
|
890
|
-
if (arg == "--no-mmap") {
|
|
891
|
-
params.use_mmap = false;
|
|
892
|
-
return true;
|
|
893
|
-
}
|
|
894
|
-
if (arg == "--numa") {
|
|
895
|
-
CHECK_ARG
|
|
896
|
-
std::string value(argv[i]);
|
|
897
|
-
/**/ if (value == "distribute" || value == "") { params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; }
|
|
898
|
-
else if (value == "isolate") { params.numa = GGML_NUMA_STRATEGY_ISOLATE; }
|
|
899
|
-
else if (value == "numactl") { params.numa = GGML_NUMA_STRATEGY_NUMACTL; }
|
|
900
|
-
else { invalid_param = true; }
|
|
901
|
-
return true;
|
|
902
|
-
}
|
|
903
|
-
if (arg == "-v" || arg == "--verbose") {
|
|
904
|
-
params.verbosity = 1;
|
|
905
|
-
return true;
|
|
906
|
-
}
|
|
907
|
-
if (arg == "--verbosity") {
|
|
908
|
-
CHECK_ARG
|
|
909
|
-
params.verbosity = std::stoi(argv[i]);
|
|
910
|
-
return true;
|
|
911
|
-
}
|
|
912
|
-
if (arg == "--verbose-prompt") {
|
|
913
|
-
params.verbose_prompt = true;
|
|
914
|
-
return true;
|
|
915
|
-
}
|
|
916
|
-
if (arg == "--no-display-prompt") {
|
|
917
|
-
params.display_prompt = false;
|
|
918
|
-
return true;
|
|
919
|
-
}
|
|
920
|
-
if (arg == "-r" || arg == "--reverse-prompt") {
|
|
921
|
-
CHECK_ARG
|
|
922
|
-
params.antiprompt.emplace_back(argv[i]);
|
|
923
|
-
return true;
|
|
924
|
-
}
|
|
925
|
-
if (arg == "-ld" || arg == "--logdir") {
|
|
926
|
-
CHECK_ARG
|
|
927
|
-
params.logdir = argv[i];
|
|
928
|
-
|
|
929
|
-
if (params.logdir.back() != DIRECTORY_SEPARATOR) {
|
|
930
|
-
params.logdir += DIRECTORY_SEPARATOR;
|
|
931
|
-
}
|
|
932
|
-
return true;
|
|
933
|
-
}
|
|
934
|
-
if (arg == "-lcs" || arg == "--lookup-cache-static") {
|
|
935
|
-
CHECK_ARG
|
|
936
|
-
params.lookup_cache_static = argv[i];
|
|
937
|
-
return true;
|
|
938
|
-
}
|
|
939
|
-
if (arg == "-lcd" || arg == "--lookup-cache-dynamic") {
|
|
940
|
-
CHECK_ARG
|
|
941
|
-
params.lookup_cache_dynamic = argv[i];
|
|
942
|
-
return true;
|
|
943
|
-
}
|
|
944
|
-
if (arg == "--save-all-logits" || arg == "--kl-divergence-base") {
|
|
945
|
-
CHECK_ARG
|
|
946
|
-
params.logits_file = argv[i];
|
|
947
|
-
return true;
|
|
948
|
-
}
|
|
949
|
-
if (arg == "--perplexity" || arg == "--all-logits") {
|
|
950
|
-
params.logits_all = true;
|
|
951
|
-
return true;
|
|
952
|
-
}
|
|
953
|
-
if (arg == "--ppl-stride") {
|
|
954
|
-
CHECK_ARG
|
|
955
|
-
params.ppl_stride = std::stoi(argv[i]);
|
|
956
|
-
return true;
|
|
957
|
-
}
|
|
958
|
-
if (arg == "--ppl-output-type") {
|
|
959
|
-
CHECK_ARG
|
|
960
|
-
params.ppl_output_type = std::stoi(argv[i]);
|
|
961
|
-
return true;
|
|
962
|
-
}
|
|
963
|
-
if (arg == "-ptc" || arg == "--print-token-count") {
|
|
964
|
-
CHECK_ARG
|
|
965
|
-
params.n_print = std::stoi(argv[i]);
|
|
966
|
-
return true;
|
|
967
|
-
}
|
|
968
|
-
if (arg == "--check-tensors") {
|
|
969
|
-
params.check_tensors = true;
|
|
970
|
-
return true;
|
|
971
|
-
}
|
|
972
|
-
if (arg == "--hellaswag") {
|
|
973
|
-
params.hellaswag = true;
|
|
974
|
-
return true;
|
|
975
|
-
}
|
|
976
|
-
if (arg == "--hellaswag-tasks") {
|
|
977
|
-
CHECK_ARG
|
|
978
|
-
params.hellaswag_tasks = std::stoi(argv[i]);
|
|
979
|
-
return true;
|
|
980
|
-
}
|
|
981
|
-
if (arg == "--winogrande") {
|
|
982
|
-
params.winogrande = true;
|
|
983
|
-
return true;
|
|
984
|
-
}
|
|
985
|
-
if (arg == "--winogrande-tasks") {
|
|
986
|
-
CHECK_ARG
|
|
987
|
-
params.winogrande_tasks = std::stoi(argv[i]);
|
|
988
|
-
return true;
|
|
989
|
-
}
|
|
990
|
-
if (arg == "--multiple-choice") {
|
|
991
|
-
params.multiple_choice = true;
|
|
992
|
-
return true;
|
|
993
|
-
}
|
|
994
|
-
if (arg == "--multiple-choice-tasks") {
|
|
995
|
-
CHECK_ARG
|
|
996
|
-
params.multiple_choice_tasks = std::stoi(argv[i]);
|
|
997
|
-
return true;
|
|
998
|
-
}
|
|
999
|
-
if (arg == "--kl-divergence") {
|
|
1000
|
-
params.kl_divergence = true;
|
|
1001
|
-
return true;
|
|
1002
|
-
}
|
|
1003
|
-
if (arg == "--ignore-eos") {
|
|
1004
|
-
params.ignore_eos = true;
|
|
1005
|
-
return true;
|
|
1006
|
-
}
|
|
1007
|
-
if (arg == "--penalize-nl") {
|
|
1008
|
-
sparams.penalize_nl = true;
|
|
1009
|
-
return true;
|
|
1010
|
-
}
|
|
1011
|
-
if (arg == "-l" || arg == "--logit-bias") {
|
|
1012
|
-
CHECK_ARG
|
|
1013
|
-
std::stringstream ss(argv[i]);
|
|
1014
|
-
llama_token key;
|
|
1015
|
-
char sign;
|
|
1016
|
-
std::string value_str;
|
|
1017
|
-
try {
|
|
1018
|
-
if (ss >> key && ss >> sign && std::getline(ss, value_str) && (sign == '+' || sign == '-')) {
|
|
1019
|
-
sparams.logit_bias[key] = std::stof(value_str) * ((sign == '-') ? -1.0f : 1.0f);
|
|
1020
|
-
}
|
|
1021
|
-
else {
|
|
1022
|
-
throw std::exception();
|
|
1023
|
-
}
|
|
1024
|
-
}
|
|
1025
|
-
catch (const std::exception&) {
|
|
1026
|
-
invalid_param = true;
|
|
1027
|
-
return true;
|
|
1028
|
-
}
|
|
1029
|
-
return true;
|
|
1030
|
-
}
|
|
1031
|
-
if (arg == "-h" || arg == "--help" || arg == "--usage" ) {
|
|
1032
|
-
params.usage = true;
|
|
1033
|
-
return true;
|
|
1034
|
-
}
|
|
1035
|
-
if (arg == "--version") {
|
|
1036
|
-
fprintf(stderr, "version: %d (%s)\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT);
|
|
1037
|
-
fprintf(stderr, "built with %s for %s\n", LLAMA_COMPILER, LLAMA_BUILD_TARGET);
|
|
1038
|
-
exit(0);
|
|
1039
|
-
}
|
|
1040
|
-
if (arg == "--in-prefix-bos") {
|
|
1041
|
-
params.input_prefix_bos = true;
|
|
1042
|
-
params.enable_chat_template = false;
|
|
1043
|
-
return true;
|
|
1044
|
-
}
|
|
1045
|
-
if (arg == "--in-prefix") {
|
|
1046
|
-
CHECK_ARG
|
|
1047
|
-
params.input_prefix = argv[i];
|
|
1048
|
-
params.enable_chat_template = false;
|
|
1049
|
-
return true;
|
|
1050
|
-
}
|
|
1051
|
-
if (arg == "--in-suffix") {
|
|
1052
|
-
CHECK_ARG
|
|
1053
|
-
params.input_suffix = argv[i];
|
|
1054
|
-
params.enable_chat_template = false;
|
|
1055
|
-
return true;
|
|
1056
|
-
}
|
|
1057
|
-
if (arg == "--spm-infill") {
|
|
1058
|
-
params.spm_infill = true;
|
|
1059
|
-
return true;
|
|
1060
|
-
}
|
|
1061
|
-
if (arg == "--grammar") {
|
|
1062
|
-
CHECK_ARG
|
|
1063
|
-
sparams.grammar = argv[i];
|
|
1064
|
-
return true;
|
|
1065
|
-
}
|
|
1066
|
-
if (arg == "--grammar-file") {
|
|
1067
|
-
CHECK_ARG
|
|
1068
|
-
std::ifstream file(argv[i]);
|
|
1069
|
-
if (!file) {
|
|
1070
|
-
fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
|
|
1071
|
-
invalid_param = true;
|
|
1072
|
-
return true;
|
|
1073
|
-
}
|
|
1074
|
-
std::copy(
|
|
1075
|
-
std::istreambuf_iterator<char>(file),
|
|
1076
|
-
std::istreambuf_iterator<char>(),
|
|
1077
|
-
std::back_inserter(sparams.grammar)
|
|
1078
|
-
);
|
|
1079
|
-
return true;
|
|
1080
|
-
}
|
|
1081
|
-
if (arg == "-j" || arg == "--json-schema") {
|
|
1082
|
-
CHECK_ARG
|
|
1083
|
-
sparams.grammar = json_schema_to_grammar(json::parse(argv[i]));
|
|
1084
|
-
return true;
|
|
1085
|
-
}
|
|
1086
|
-
if (arg == "--override-kv") {
|
|
1087
|
-
CHECK_ARG
|
|
1088
|
-
if (!string_parse_kv_override(argv[i], params.kv_overrides)) {
|
|
1089
|
-
fprintf(stderr, "error: Invalid type for KV override: %s\n", argv[i]);
|
|
1090
|
-
invalid_param = true;
|
|
1091
|
-
return true;
|
|
1092
|
-
}
|
|
1093
|
-
return true;
|
|
1094
|
-
}
|
|
1095
|
-
if (arg == "--host") {
|
|
1096
|
-
CHECK_ARG
|
|
1097
|
-
params.hostname = argv[i];
|
|
1098
|
-
return true;
|
|
1099
|
-
}
|
|
1100
|
-
if (arg == "--port") {
|
|
1101
|
-
CHECK_ARG
|
|
1102
|
-
params.port = std::stoi(argv[i]);
|
|
1103
|
-
return true;
|
|
1104
|
-
}
|
|
1105
|
-
if (arg == "--path") {
|
|
1106
|
-
CHECK_ARG
|
|
1107
|
-
params.public_path = argv[i];
|
|
1108
|
-
return true;
|
|
1109
|
-
}
|
|
1110
|
-
if (arg == "--api-key") {
|
|
1111
|
-
CHECK_ARG
|
|
1112
|
-
params.api_keys.push_back(argv[i]);
|
|
1113
|
-
return true;
|
|
1114
|
-
}
|
|
1115
|
-
if (arg == "--api-key-file") {
|
|
1116
|
-
CHECK_ARG
|
|
1117
|
-
std::ifstream key_file(argv[i]);
|
|
1118
|
-
if (!key_file) {
|
|
1119
|
-
fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
|
|
1120
|
-
invalid_param = true;
|
|
1121
|
-
return true;
|
|
1122
|
-
}
|
|
1123
|
-
std::string key;
|
|
1124
|
-
while (std::getline(key_file, key)) {
|
|
1125
|
-
if (!key.empty()) {
|
|
1126
|
-
params.api_keys.push_back(key);
|
|
1127
|
-
}
|
|
1128
|
-
}
|
|
1129
|
-
key_file.close();
|
|
1130
|
-
return true;
|
|
1131
|
-
}
|
|
1132
|
-
if (arg == "--ssl-key-file") {
|
|
1133
|
-
CHECK_ARG
|
|
1134
|
-
params.ssl_file_key = argv[i];
|
|
1135
|
-
return true;
|
|
1136
|
-
}
|
|
1137
|
-
if (arg == "--ssl-cert-file") {
|
|
1138
|
-
CHECK_ARG
|
|
1139
|
-
params.ssl_file_cert = argv[i];
|
|
1140
|
-
return true;
|
|
1141
|
-
}
|
|
1142
|
-
if (arg == "--timeout" || arg == "-to") {
|
|
1143
|
-
CHECK_ARG
|
|
1144
|
-
params.timeout_read = std::stoi(argv[i]);
|
|
1145
|
-
params.timeout_write = std::stoi(argv[i]);
|
|
209
|
+
#endif
|
|
210
|
+
return cpu_get_num_physical_cores();
|
|
211
|
+
}
|
|
212
|
+
|
|
213
|
+
// Helper for setting process priority
|
|
214
|
+
|
|
215
|
+
#if defined(_WIN32)
|
|
216
|
+
|
|
217
|
+
bool set_process_priority(enum ggml_sched_priority prio) {
|
|
218
|
+
if (prio == GGML_SCHED_PRIO_NORMAL) {
|
|
1146
219
|
return true;
|
|
1147
220
|
}
|
|
1148
|
-
|
|
1149
|
-
|
|
1150
|
-
|
|
1151
|
-
|
|
221
|
+
|
|
222
|
+
DWORD p = NORMAL_PRIORITY_CLASS;
|
|
223
|
+
switch (prio) {
|
|
224
|
+
case GGML_SCHED_PRIO_NORMAL: p = NORMAL_PRIORITY_CLASS; break;
|
|
225
|
+
case GGML_SCHED_PRIO_MEDIUM: p = ABOVE_NORMAL_PRIORITY_CLASS; break;
|
|
226
|
+
case GGML_SCHED_PRIO_HIGH: p = HIGH_PRIORITY_CLASS; break;
|
|
227
|
+
case GGML_SCHED_PRIO_REALTIME: p = REALTIME_PRIORITY_CLASS; break;
|
|
1152
228
|
}
|
|
1153
|
-
|
|
1154
|
-
|
|
1155
|
-
|
|
1156
|
-
|
|
1157
|
-
fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
|
|
1158
|
-
invalid_param = true;
|
|
1159
|
-
return true;
|
|
1160
|
-
}
|
|
1161
|
-
std::string system_prompt;
|
|
1162
|
-
std::copy(
|
|
1163
|
-
std::istreambuf_iterator<char>(file),
|
|
1164
|
-
std::istreambuf_iterator<char>(),
|
|
1165
|
-
std::back_inserter(system_prompt)
|
|
1166
|
-
);
|
|
1167
|
-
params.system_prompt = system_prompt;
|
|
1168
|
-
return true;
|
|
229
|
+
|
|
230
|
+
if (!SetPriorityClass(GetCurrentProcess(), p)) {
|
|
231
|
+
LOG_WRN("failed to set process priority class %d : (%d)\n", prio, (int) GetLastError());
|
|
232
|
+
return false;
|
|
1169
233
|
}
|
|
1170
|
-
|
|
1171
|
-
|
|
1172
|
-
|
|
1173
|
-
|
|
1174
|
-
|
|
1175
|
-
|
|
1176
|
-
|
|
1177
|
-
|
|
1178
|
-
|
|
1179
|
-
|
|
234
|
+
|
|
235
|
+
return true;
|
|
236
|
+
}
|
|
237
|
+
|
|
238
|
+
#else // MacOS and POSIX
|
|
239
|
+
#include <sys/types.h>
|
|
240
|
+
#include <sys/resource.h>
|
|
241
|
+
|
|
242
|
+
bool set_process_priority(enum ggml_sched_priority prio) {
|
|
243
|
+
if (prio == GGML_SCHED_PRIO_NORMAL) {
|
|
1180
244
|
return true;
|
|
1181
245
|
}
|
|
1182
|
-
|
|
1183
|
-
|
|
1184
|
-
|
|
246
|
+
|
|
247
|
+
int p = 0;
|
|
248
|
+
switch (prio) {
|
|
249
|
+
case GGML_SCHED_PRIO_NORMAL: p = 0; break;
|
|
250
|
+
case GGML_SCHED_PRIO_MEDIUM: p = -5; break;
|
|
251
|
+
case GGML_SCHED_PRIO_HIGH: p = -10; break;
|
|
252
|
+
case GGML_SCHED_PRIO_REALTIME: p = -20; break;
|
|
1185
253
|
}
|
|
1186
|
-
|
|
1187
|
-
|
|
1188
|
-
|
|
254
|
+
|
|
255
|
+
if (!setpriority(PRIO_PROCESS, 0, p)) {
|
|
256
|
+
LOG_WRN("failed to set process priority %d : %s (%d)\n", prio, strerror(errno), errno);
|
|
257
|
+
return false;
|
|
1189
258
|
}
|
|
1190
|
-
|
|
1191
|
-
|
|
1192
|
-
|
|
1193
|
-
|
|
1194
|
-
|
|
1195
|
-
|
|
259
|
+
return true;
|
|
260
|
+
}
|
|
261
|
+
|
|
262
|
+
#endif
|
|
263
|
+
|
|
264
|
+
//
|
|
265
|
+
// CLI argument parsing
|
|
266
|
+
//
|
|
267
|
+
|
|
268
|
+
|
|
269
|
+
// Fill in defaults for a cpu_params struct and sanity-check its CPU mask.
//
// cpuparams  - parameters to fix up in place.
// role_model - optional template; when cpuparams.n_threads is negative the
//              whole struct is overwritten with *role_model (if non-null),
//              otherwise only n_threads is set from cpu_get_num_math().
//
// After defaults are applied, a warning is logged when the mask has at least
// one entry set but fewer set entries than the requested thread count.
void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role_model) {
    // A negative thread count marks the whole struct as unset.
    if (cpuparams.n_threads < 0) {
        if (role_model == nullptr) {
            cpuparams.n_threads = cpu_get_num_math();
        } else {
            cpuparams = *role_model;
        }
    }

    // Count how many CPUs are selected in the mask.
    int32_t mask_bits = 0;
    for (int32_t cpu = 0; cpu < GGML_MAX_N_THREADS; ++cpu) {
        mask_bits += cpuparams.cpumask[cpu] ? 1 : 0;
    }

    // An all-clear mask is not checked; only a partially filled mask that is
    // too small for the thread count is reported.
    const bool mask_in_use = (mask_bits != 0);
    if (mask_in_use && mask_bits < cpuparams.n_threads) {
        LOG_WRN("Not enough set bits in CPU mask (%d) to satisfy requested thread count: %d\n", mask_bits, cpuparams.n_threads);
    }
}
|
|
292
|
+
|
|
293
|
+
bool parse_cpu_range(const std::string & range, bool (&boolmask)[GGML_MAX_N_THREADS]) {
|
|
294
|
+
size_t dash_loc = range.find('-');
|
|
295
|
+
if (dash_loc == std::string::npos) {
|
|
296
|
+
LOG_ERR("Format of CPU range is invalid! Expected [<start>]-[<end>].\n");
|
|
297
|
+
return false;
|
|
1236
298
|
}
|
|
1237
|
-
|
|
1238
|
-
|
|
1239
|
-
|
|
1240
|
-
|
|
1241
|
-
|
|
1242
|
-
|
|
1243
|
-
|
|
299
|
+
|
|
300
|
+
size_t start_i;
|
|
301
|
+
size_t end_i;
|
|
302
|
+
|
|
303
|
+
if (dash_loc == 0) {
|
|
304
|
+
start_i = 0;
|
|
305
|
+
} else {
|
|
306
|
+
start_i = std::stoull(range.substr(0, dash_loc));
|
|
307
|
+
if (start_i >= GGML_MAX_N_THREADS) {
|
|
308
|
+
LOG_ERR("Start index out of bounds!\n");
|
|
309
|
+
return false;
|
|
1244
310
|
}
|
|
1245
|
-
params.context_files.push_back(argv[i]);
|
|
1246
|
-
return true;
|
|
1247
|
-
}
|
|
1248
|
-
if (arg == "--chunk-size") {
|
|
1249
|
-
CHECK_ARG
|
|
1250
|
-
params.chunk_size = std::stoi(argv[i]);
|
|
1251
|
-
return true;
|
|
1252
|
-
}
|
|
1253
|
-
if (arg == "--chunk-separator") {
|
|
1254
|
-
CHECK_ARG
|
|
1255
|
-
params.chunk_separator = argv[i];
|
|
1256
|
-
return true;
|
|
1257
|
-
}
|
|
1258
|
-
if (arg == "--junk") {
|
|
1259
|
-
CHECK_ARG
|
|
1260
|
-
params.n_junk = std::stoi(argv[i]);
|
|
1261
|
-
return true;
|
|
1262
|
-
}
|
|
1263
|
-
if (arg == "--pos") {
|
|
1264
|
-
CHECK_ARG
|
|
1265
|
-
params.i_pos = std::stoi(argv[i]);
|
|
1266
|
-
return true;
|
|
1267
|
-
}
|
|
1268
|
-
if (arg == "-o" || arg == "--output" || arg == "--output-file") {
|
|
1269
|
-
CHECK_ARG
|
|
1270
|
-
params.out_file = argv[i];
|
|
1271
|
-
params.cvector_outfile = argv[i];
|
|
1272
|
-
params.lora_outfile = argv[i];
|
|
1273
|
-
return true;
|
|
1274
|
-
}
|
|
1275
|
-
if (arg == "-ofreq" || arg == "--output-frequency") {
|
|
1276
|
-
CHECK_ARG
|
|
1277
|
-
params.n_out_freq = std::stoi(argv[i]);
|
|
1278
|
-
return true;
|
|
1279
|
-
}
|
|
1280
|
-
if (arg == "--save-frequency") {
|
|
1281
|
-
CHECK_ARG
|
|
1282
|
-
params.n_save_freq = std::stoi(argv[i]);
|
|
1283
|
-
return true;
|
|
1284
|
-
}
|
|
1285
|
-
if (arg == "--process-output") {
|
|
1286
|
-
params.process_output = true;
|
|
1287
|
-
return true;
|
|
1288
|
-
}
|
|
1289
|
-
if (arg == "--no-ppl") {
|
|
1290
|
-
params.compute_ppl = false;
|
|
1291
|
-
return true;
|
|
1292
|
-
}
|
|
1293
|
-
if (arg == "--chunk" || arg == "--from-chunk") {
|
|
1294
|
-
CHECK_ARG
|
|
1295
|
-
params.i_chunk = std::stoi(argv[i]);
|
|
1296
|
-
return true;
|
|
1297
|
-
}
|
|
1298
|
-
// cvector params
|
|
1299
|
-
if (arg == "--positive-file") {
|
|
1300
|
-
CHECK_ARG
|
|
1301
|
-
params.cvector_positive_file = argv[i];
|
|
1302
|
-
return true;
|
|
1303
|
-
}
|
|
1304
|
-
if (arg == "--negative-file") {
|
|
1305
|
-
CHECK_ARG
|
|
1306
|
-
params.cvector_negative_file = argv[i];
|
|
1307
|
-
return true;
|
|
1308
|
-
}
|
|
1309
|
-
if (arg == "--pca-batch") {
|
|
1310
|
-
CHECK_ARG
|
|
1311
|
-
params.n_pca_batch = std::stoi(argv[i]);
|
|
1312
|
-
return true;
|
|
1313
|
-
}
|
|
1314
|
-
if (arg == "--pca-iter") {
|
|
1315
|
-
CHECK_ARG
|
|
1316
|
-
params.n_pca_iterations = std::stoi(argv[i]);
|
|
1317
|
-
return true;
|
|
1318
|
-
}
|
|
1319
|
-
if (arg == "--method") {
|
|
1320
|
-
CHECK_ARG
|
|
1321
|
-
std::string value(argv[i]);
|
|
1322
|
-
/**/ if (value == "pca") { params.cvector_dimre_method = DIMRE_METHOD_PCA; }
|
|
1323
|
-
else if (value == "mean") { params.cvector_dimre_method = DIMRE_METHOD_MEAN; }
|
|
1324
|
-
else { invalid_param = true; }
|
|
1325
|
-
return true;
|
|
1326
|
-
}
|
|
1327
|
-
if (arg == "--no-warmup") {
|
|
1328
|
-
params.warmup = false;
|
|
1329
|
-
return true;
|
|
1330
|
-
}
|
|
1331
|
-
#ifndef LOG_DISABLE_LOGS
|
|
1332
|
-
// Parse args for logging parameters
|
|
1333
|
-
if (log_param_single_parse(argv[i])) {
|
|
1334
|
-
// Do nothing, log_param_single_parse automatically does it's thing
|
|
1335
|
-
// and returns if a match was found and parsed.
|
|
1336
|
-
return true;
|
|
1337
311
|
}
|
|
1338
|
-
|
|
1339
|
-
|
|
1340
|
-
|
|
1341
|
-
|
|
1342
|
-
|
|
1343
|
-
if (
|
|
1344
|
-
|
|
1345
|
-
return
|
|
312
|
+
|
|
313
|
+
if (dash_loc == range.length() - 1) {
|
|
314
|
+
end_i = GGML_MAX_N_THREADS - 1;
|
|
315
|
+
} else {
|
|
316
|
+
end_i = std::stoull(range.substr(dash_loc + 1));
|
|
317
|
+
if (end_i >= GGML_MAX_N_THREADS) {
|
|
318
|
+
LOG_ERR("End index out of bounds!\n");
|
|
319
|
+
return false;
|
|
1346
320
|
}
|
|
1347
|
-
return true;
|
|
1348
321
|
}
|
|
1349
|
-
// End of Parse args for logging parameters
|
|
1350
|
-
#endif // LOG_DISABLE_LOGS
|
|
1351
322
|
|
|
1352
|
-
|
|
323
|
+
for (size_t i = start_i; i <= end_i; i++) {
|
|
324
|
+
boolmask[i] = true;
|
|
325
|
+
}
|
|
326
|
+
|
|
327
|
+
return true;
|
|
1353
328
|
}
|
|
1354
329
|
|
|
1355
|
-
|
|
1356
|
-
|
|
1357
|
-
|
|
1358
|
-
|
|
1359
|
-
|
|
1360
|
-
|
|
1361
|
-
#else
|
|
1362
|
-
#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...)
|
|
1363
|
-
#endif
|
|
330
|
+
bool parse_cpu_mask(const std::string & mask, bool (&boolmask)[GGML_MAX_N_THREADS]) {
|
|
331
|
+
// Discard potential 0x prefix
|
|
332
|
+
size_t start_i = 0;
|
|
333
|
+
if (mask.length() >= 2 && mask.substr(0, 2) == "0x") {
|
|
334
|
+
start_i = 2;
|
|
335
|
+
}
|
|
1364
336
|
|
|
1365
|
-
|
|
1366
|
-
|
|
337
|
+
size_t num_digits = mask.length() - start_i;
|
|
338
|
+
if (num_digits > 128) num_digits = 128;
|
|
1367
339
|
|
|
1368
|
-
|
|
1369
|
-
std::string sampler_type_names;
|
|
1370
|
-
for (const auto sampler_type : sparams.samplers_sequence) {
|
|
1371
|
-
sampler_type_chars += static_cast<char>(sampler_type);
|
|
1372
|
-
sampler_type_names += llama_sampling_type_to_str(sampler_type) + ";";
|
|
1373
|
-
}
|
|
1374
|
-
sampler_type_names.pop_back();
|
|
340
|
+
size_t end_i = num_digits + start_i;
|
|
1375
341
|
|
|
1376
|
-
|
|
1377
|
-
|
|
1378
|
-
|
|
1379
|
-
|
|
1380
|
-
|
|
1381
|
-
|
|
1382
|
-
|
|
1383
|
-
|
|
1384
|
-
|
|
342
|
+
for (size_t i = start_i, n = (num_digits*4 - 1); i < end_i; i++, n-=4) {
|
|
343
|
+
char c = mask.at(i);
|
|
344
|
+
int8_t id = c;
|
|
345
|
+
|
|
346
|
+
if ((c >= '0' && c <= '9')) {
|
|
347
|
+
id -= '0';
|
|
348
|
+
} else if (c >= 'a' && c <= 'f') {
|
|
349
|
+
id -= 'a' - 10;
|
|
350
|
+
} else if (c >= 'A' && c <= 'F') {
|
|
351
|
+
id -= 'A' - 10;
|
|
352
|
+
} else {
|
|
353
|
+
LOG_ERR("Invalid hex character '%c' at position %d\n", c, int32_t(i));
|
|
354
|
+
return false;
|
|
1385
355
|
}
|
|
1386
356
|
|
|
1387
|
-
|
|
357
|
+
boolmask[ n ] = boolmask[ n ] || ((id & 8) != 0);
|
|
358
|
+
boolmask[n - 1] = boolmask[n - 1] || ((id & 4) != 0);
|
|
359
|
+
boolmask[n - 2] = boolmask[n - 2] || ((id & 2) != 0);
|
|
360
|
+
boolmask[n - 3] = boolmask[n - 3] || ((id & 1) != 0);
|
|
361
|
+
}
|
|
362
|
+
|
|
363
|
+
return true;
|
|
364
|
+
}
|
|
1388
365
|
|
|
1389
|
-
|
|
1390
|
-
|
|
1391
|
-
|
|
1392
|
-
|
|
1393
|
-
|
|
366
|
+
void common_init() {
|
|
367
|
+
llama_log_set([](ggml_log_level level, const char * text, void * /*user_data*/) {
|
|
368
|
+
if (LOG_DEFAULT_LLAMA <= common_log_verbosity_thold) {
|
|
369
|
+
common_log_add(common_log_main(), level, "%s", text);
|
|
370
|
+
}
|
|
371
|
+
}, NULL);
|
|
372
|
+
|
|
373
|
+
#ifdef NDEBUG
|
|
374
|
+
const char * build_type = "";
|
|
375
|
+
#else
|
|
376
|
+
const char * build_type = " (debug)";
|
|
377
|
+
#endif
|
|
1394
378
|
|
|
1395
|
-
|
|
1396
|
-
|
|
1397
|
-
// TODO: filter by tags
|
|
1398
|
-
|
|
1399
|
-
options.push_back({ "general" });
|
|
1400
|
-
options.push_back({ "*", "-h, --help, --usage", "print usage and exit" });
|
|
1401
|
-
options.push_back({ "*", " --version", "show version and build info" });
|
|
1402
|
-
options.push_back({ "*", "-v, --verbose", "print verbose information" });
|
|
1403
|
-
options.push_back({ "*", " --verbosity N", "set specific verbosity level (default: %d)", params.verbosity });
|
|
1404
|
-
options.push_back({ "*", " --verbose-prompt", "print a verbose prompt before generation (default: %s)", params.verbose_prompt ? "true" : "false" });
|
|
1405
|
-
options.push_back({ "*", " --no-display-prompt", "don't print prompt at generation (default: %s)", !params.display_prompt ? "true" : "false" });
|
|
1406
|
-
options.push_back({ "*", "-co, --color", "colorise output to distinguish prompt and user input from generations (default: %s)", params.use_color ? "true" : "false" });
|
|
1407
|
-
options.push_back({ "*", "-s, --seed SEED", "RNG seed (default: %d, use random seed for < 0)", params.seed });
|
|
1408
|
-
options.push_back({ "*", "-t, --threads N", "number of threads to use during generation (default: %d)", params.n_threads });
|
|
1409
|
-
options.push_back({ "*", "-tb, --threads-batch N", "number of threads to use during batch and prompt processing (default: same as --threads)" });
|
|
1410
|
-
options.push_back({ "speculative", "-td, --threads-draft N", "number of threads to use during generation (default: same as --threads)" });
|
|
1411
|
-
options.push_back({ "speculative", "-tbd, --threads-batch-draft N",
|
|
1412
|
-
"number of threads to use during batch and prompt processing (default: same as --threads-draft)" });
|
|
1413
|
-
options.push_back({ "speculative", " --draft N", "number of tokens to draft for speculative decoding (default: %d)", params.n_draft });
|
|
1414
|
-
options.push_back({ "speculative", "-ps, --p-split N", "speculative decoding split probability (default: %.1f)", (double)params.p_split });
|
|
1415
|
-
options.push_back({ "*", "-lcs, --lookup-cache-static FNAME",
|
|
1416
|
-
"path to static lookup cache to use for lookup decoding (not updated by generation)" });
|
|
1417
|
-
options.push_back({ "*", "-lcd, --lookup-cache-dynamic FNAME",
|
|
1418
|
-
"path to dynamic lookup cache to use for lookup decoding (updated by generation)" });
|
|
1419
|
-
|
|
1420
|
-
options.push_back({ "*", "-c, --ctx-size N", "size of the prompt context (default: %d, 0 = loaded from model)", params.n_ctx });
|
|
1421
|
-
options.push_back({ "*", "-n, --predict N", "number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)", params.n_predict });
|
|
1422
|
-
options.push_back({ "*", "-b, --batch-size N", "logical maximum batch size (default: %d)", params.n_batch });
|
|
1423
|
-
options.push_back({ "*", "-ub, --ubatch-size N", "physical maximum batch size (default: %d)", params.n_ubatch });
|
|
1424
|
-
options.push_back({ "*", " --keep N", "number of tokens to keep from the initial prompt (default: %d, -1 = all)", params.n_keep });
|
|
1425
|
-
options.push_back({ "*", " --chunks N", "max number of chunks to process (default: %d, -1 = all)", params.n_chunks });
|
|
1426
|
-
options.push_back({ "*", "-fa, --flash-attn", "enable Flash Attention (default: %s)", params.flash_attn ? "enabled" : "disabled" });
|
|
1427
|
-
options.push_back({ "*", "-p, --prompt PROMPT", "prompt to start generation with\n"
|
|
1428
|
-
"in conversation mode, this will be used as system prompt\n"
|
|
1429
|
-
"(default: '%s')", params.prompt.c_str() });
|
|
1430
|
-
options.push_back({ "*", "-f, --file FNAME", "a file containing the prompt (default: none)" });
|
|
1431
|
-
options.push_back({ "*", " --in-file FNAME", "an input file (repeat to specify multiple files)" });
|
|
1432
|
-
options.push_back({ "*", "-bf, --binary-file FNAME", "binary file containing the prompt (default: none)" });
|
|
1433
|
-
options.push_back({ "*", "-e, --escape", "process escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\) (default: %s)", params.escape ? "true" : "false" });
|
|
1434
|
-
options.push_back({ "*", " --no-escape", "do not process escape sequences" });
|
|
1435
|
-
options.push_back({ "main", "-ptc, --print-token-count N", "print token count every N tokens (default: %d)", params.n_print });
|
|
1436
|
-
options.push_back({ "main", " --prompt-cache FNAME", "file to cache prompt state for faster startup (default: none)" });
|
|
1437
|
-
options.push_back({ "main", " --prompt-cache-all", "if specified, saves user input and generations to cache as well\n"
|
|
1438
|
-
"not supported with --interactive or other interactive options" });
|
|
1439
|
-
options.push_back({ "main", " --prompt-cache-ro", "if specified, uses the prompt cache but does not update it" });
|
|
1440
|
-
options.push_back({ "main", "-r, --reverse-prompt PROMPT",
|
|
1441
|
-
"halt generation at PROMPT, return control in interactive mode\n"
|
|
1442
|
-
"can be specified more than once for multiple prompts" });
|
|
1443
|
-
options.push_back({ "main", "-sp, --special", "special tokens output enabled (default: %s)", params.special ? "true" : "false" });
|
|
1444
|
-
options.push_back({ "main", "-cnv, --conversation", "run in conversation mode, does not print special tokens and suffix/prefix\n"
|
|
1445
|
-
"if suffix/prefix are not specified, default chat template will be used\n"
|
|
1446
|
-
"(default: %s)", params.conversation ? "true" : "false" });
|
|
1447
|
-
options.push_back({ "main infill", "-i, --interactive", "run in interactive mode (default: %s)", params.interactive ? "true" : "false" });
|
|
1448
|
-
options.push_back({ "main infill", "-if, --interactive-first", "run in interactive mode and wait for input right away (default: %s)", params.interactive_first ? "true" : "false" });
|
|
1449
|
-
options.push_back({ "main infill", "-mli, --multiline-input", "allows you to write or paste multiple lines without ending each in '\\'" });
|
|
1450
|
-
options.push_back({ "main infill", " --in-prefix-bos", "prefix BOS to user inputs, preceding the `--in-prefix` string" });
|
|
1451
|
-
options.push_back({ "main infill", " --in-prefix STRING", "string to prefix user inputs with (default: empty)" });
|
|
1452
|
-
options.push_back({ "main infill", " --in-suffix STRING", "string to suffix after user inputs with (default: empty)" });
|
|
1453
|
-
options.push_back({ "main", " --no-warmup", "skip warming up the model with an empty run" });
|
|
1454
|
-
options.push_back({ "server infill",
|
|
1455
|
-
" --spm-infill", "use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: %s)", params.spm_infill ? "enabled" : "disabled" });
|
|
1456
|
-
|
|
1457
|
-
options.push_back({ "sampling" });
|
|
1458
|
-
options.push_back({ "*", " --samplers SAMPLERS", "samplers that will be used for generation in the order, separated by \';\'\n"
|
|
1459
|
-
"(default: %s)", sampler_type_names.c_str() });
|
|
1460
|
-
options.push_back({ "*", " --sampling-seq SEQUENCE",
|
|
1461
|
-
"simplified sequence for samplers that will be used (default: %s)", sampler_type_chars.c_str() });
|
|
1462
|
-
options.push_back({ "*", " --ignore-eos", "ignore end of stream token and continue generating (implies --logit-bias EOS-inf)" });
|
|
1463
|
-
options.push_back({ "*", " --penalize-nl", "penalize newline tokens (default: %s)", sparams.penalize_nl ? "true" : "false" });
|
|
1464
|
-
options.push_back({ "*", " --temp N", "temperature (default: %.1f)", (double)sparams.temp });
|
|
1465
|
-
options.push_back({ "*", " --top-k N", "top-k sampling (default: %d, 0 = disabled)", sparams.top_k });
|
|
1466
|
-
options.push_back({ "*", " --top-p N", "top-p sampling (default: %.1f, 1.0 = disabled)", (double)sparams.top_p });
|
|
1467
|
-
options.push_back({ "*", " --min-p N", "min-p sampling (default: %.1f, 0.0 = disabled)", (double)sparams.min_p });
|
|
1468
|
-
options.push_back({ "*", " --tfs N", "tail free sampling, parameter z (default: %.1f, 1.0 = disabled)", (double)sparams.tfs_z });
|
|
1469
|
-
options.push_back({ "*", " --typical N", "locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)", (double)sparams.typical_p });
|
|
1470
|
-
options.push_back({ "*", " --repeat-last-n N", "last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)", sparams.penalty_last_n });
|
|
1471
|
-
options.push_back({ "*", " --repeat-penalty N", "penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)", (double)sparams.penalty_repeat });
|
|
1472
|
-
options.push_back({ "*", " --presence-penalty N", "repeat alpha presence penalty (default: %.1f, 0.0 = disabled)", (double)sparams.penalty_present });
|
|
1473
|
-
options.push_back({ "*", " --frequency-penalty N", "repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)", (double)sparams.penalty_freq });
|
|
1474
|
-
options.push_back({ "*", " --dynatemp-range N", "dynamic temperature range (default: %.1f, 0.0 = disabled)", (double)sparams.dynatemp_range });
|
|
1475
|
-
options.push_back({ "*", " --dynatemp-exp N", "dynamic temperature exponent (default: %.1f)", (double)sparams.dynatemp_exponent });
|
|
1476
|
-
options.push_back({ "*", " --mirostat N", "use Mirostat sampling.\n"
|
|
1477
|
-
"Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n"
|
|
1478
|
-
"(default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)", sparams.mirostat });
|
|
1479
|
-
options.push_back({ "*", " --mirostat-lr N", "Mirostat learning rate, parameter eta (default: %.1f)", (double)sparams.mirostat_eta });
|
|
1480
|
-
options.push_back({ "*", " --mirostat-ent N", "Mirostat target entropy, parameter tau (default: %.1f)", (double)sparams.mirostat_tau });
|
|
1481
|
-
options.push_back({ "*", " -l TOKEN_ID(+/-)BIAS", "modifies the likelihood of token appearing in the completion,\n"
|
|
1482
|
-
"i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',\n"
|
|
1483
|
-
"or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'" });
|
|
1484
|
-
options.push_back({ "main", " --cfg-negative-prompt PROMPT",
|
|
1485
|
-
"negative prompt to use for guidance (default: '%s')", sparams.cfg_negative_prompt.c_str() });
|
|
1486
|
-
options.push_back({ "main", " --cfg-negative-prompt-file FNAME",
|
|
1487
|
-
"negative prompt file to use for guidance" });
|
|
1488
|
-
options.push_back({ "main", " --cfg-scale N", "strength of guidance (default: %.1f, 1.0 = disable)", (double)sparams.cfg_scale });
|
|
1489
|
-
options.push_back({ "main", " --chat-template JINJA_TEMPLATE",
|
|
1490
|
-
"set custom jinja chat template (default: template taken from model's metadata)\n"
|
|
1491
|
-
"if suffix/prefix are specified, template will be disabled\n"
|
|
1492
|
-
"only commonly used templates are accepted:\n"
|
|
1493
|
-
"https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template" });
|
|
1494
|
-
options.push_back({ "grammar" });
|
|
1495
|
-
options.push_back({ "*", " --grammar GRAMMAR", "BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '%s')", sparams.grammar.c_str() });
|
|
1496
|
-
options.push_back({ "*", " --grammar-file FNAME", "file to read grammar from" });
|
|
1497
|
-
options.push_back({ "*", "-j, --json-schema SCHEMA",
|
|
1498
|
-
"JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object\n"
|
|
1499
|
-
"For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead" });
|
|
1500
|
-
|
|
1501
|
-
options.push_back({ "embedding" });
|
|
1502
|
-
options.push_back({ "embedding", " --pooling {none,mean,cls,last}",
|
|
1503
|
-
"pooling type for embeddings, use model default if unspecified" });
|
|
1504
|
-
options.push_back({ "embedding", " --attention {causal,non-causal}",
|
|
1505
|
-
"attention type for embeddings, use model default if unspecified" });
|
|
1506
|
-
|
|
1507
|
-
options.push_back({ "context hacking" });
|
|
1508
|
-
options.push_back({ "*", " --rope-scaling {none,linear,yarn}",
|
|
1509
|
-
"RoPE frequency scaling method, defaults to linear unless specified by the model" });
|
|
1510
|
-
options.push_back({ "*", " --rope-scale N", "RoPE context scaling factor, expands context by a factor of N" });
|
|
1511
|
-
options.push_back({ "*", " --rope-freq-base N", "RoPE base frequency, used by NTK-aware scaling (default: loaded from model)" });
|
|
1512
|
-
options.push_back({ "*", " --rope-freq-scale N", "RoPE frequency scaling factor, expands context by a factor of 1/N" });
|
|
1513
|
-
options.push_back({ "*", " --yarn-orig-ctx N", "YaRN: original context size of model (default: %d = model training context size)", params.yarn_orig_ctx });
|
|
1514
|
-
options.push_back({ "*", " --yarn-ext-factor N", "YaRN: extrapolation mix factor (default: %.1f, 0.0 = full interpolation)", (double)params.yarn_ext_factor });
|
|
1515
|
-
options.push_back({ "*", " --yarn-attn-factor N", "YaRN: scale sqrt(t) or attention magnitude (default: %.1f)", (double)params.yarn_attn_factor });
|
|
1516
|
-
options.push_back({ "*", " --yarn-beta-slow N", "YaRN: high correction dim or alpha (default: %.1f)", (double)params.yarn_beta_slow });
|
|
1517
|
-
options.push_back({ "*", " --yarn-beta-fast N", "YaRN: low correction dim or beta (default: %.1f)", (double)params.yarn_beta_fast });
|
|
1518
|
-
options.push_back({ "*", "-gan, --grp-attn-n N", "group-attention factor (default: %d)", params.grp_attn_n });
|
|
1519
|
-
options.push_back({ "*", "-gaw, --grp-attn-w N", "group-attention width (default: %.1f)", (double)params.grp_attn_w });
|
|
1520
|
-
options.push_back({ "*", "-dkvc, --dump-kv-cache", "verbose print of the KV cache" });
|
|
1521
|
-
options.push_back({ "*", "-nkvo, --no-kv-offload", "disable KV offload" });
|
|
1522
|
-
options.push_back({ "*", "-ctk, --cache-type-k TYPE", "KV cache data type for K (default: %s)", params.cache_type_k.c_str() });
|
|
1523
|
-
options.push_back({ "*", "-ctv, --cache-type-v TYPE", "KV cache data type for V (default: %s)", params.cache_type_v.c_str() });
|
|
1524
|
-
|
|
1525
|
-
options.push_back({ "perplexity" });
|
|
1526
|
-
options.push_back({ "perplexity", " --all-logits", "return logits for all tokens in the batch (default: %s)", params.logits_all ? "true" : "false" });
|
|
1527
|
-
options.push_back({ "perplexity", " --hellaswag", "compute HellaSwag score over random tasks from datafile supplied with -f" });
|
|
1528
|
-
options.push_back({ "perplexity", " --hellaswag-tasks N", "number of tasks to use when computing the HellaSwag score (default: %zu)", params.hellaswag_tasks });
|
|
1529
|
-
options.push_back({ "perplexity", " --winogrande", "compute Winogrande score over random tasks from datafile supplied with -f" });
|
|
1530
|
-
options.push_back({ "perplexity", " --winogrande-tasks N", "number of tasks to use when computing the Winogrande score (default: %zu)", params.winogrande_tasks });
|
|
1531
|
-
options.push_back({ "perplexity", " --multiple-choice", "compute multiple choice score over random tasks from datafile supplied with -f" });
|
|
1532
|
-
options.push_back({ "perplexity", " --multiple-choice-tasks N",
|
|
1533
|
-
"number of tasks to use when computing the multiple choice score (default: %zu)", params.multiple_choice_tasks });
|
|
1534
|
-
options.push_back({ "perplexity", " --kl-divergence", "computes KL-divergence to logits provided via --kl-divergence-base" });
|
|
1535
|
-
options.push_back({ "perplexity", " --ppl-stride N", "stride for perplexity calculation (default: %d)", params.ppl_stride });
|
|
1536
|
-
options.push_back({ "perplexity", " --ppl-output-type {0,1}",
|
|
1537
|
-
"output type for perplexity calculation (default: %d)", params.ppl_output_type });
|
|
1538
|
-
|
|
1539
|
-
options.push_back({ "parallel" });
|
|
1540
|
-
options.push_back({ "*", "-dt, --defrag-thold N", "KV cache defragmentation threshold (default: %.1f, < 0 - disabled)", (double)params.defrag_thold });
|
|
1541
|
-
options.push_back({ "*", "-np, --parallel N", "number of parallel sequences to decode (default: %d)", params.n_parallel });
|
|
1542
|
-
options.push_back({ "*", "-ns, --sequences N", "number of sequences to decode (default: %d)", params.n_sequences });
|
|
1543
|
-
options.push_back({ "*", "-cb, --cont-batching", "enable continuous batching (a.k.a dynamic batching) (default: %s)", params.cont_batching ? "enabled" : "disabled" });
|
|
1544
|
-
options.push_back({ "*", "-nocb, --no-cont-batching", "disable continuous batching" });
|
|
1545
|
-
|
|
1546
|
-
options.push_back({ "multi-modality" });
|
|
1547
|
-
options.push_back({ "*", " --mmproj FILE", "path to a multimodal projector file for LLaVA. see examples/llava/README.md" });
|
|
1548
|
-
options.push_back({ "*", " --image FILE", "path to an image file. use with multimodal models. Specify multiple times for batching" });
|
|
1549
|
-
|
|
1550
|
-
options.push_back({ "backend" });
|
|
1551
|
-
options.push_back({ "*", " --rpc SERVERS", "comma separated list of RPC servers" });
|
|
1552
|
-
|
|
1553
|
-
if (llama_supports_mlock()) {
|
|
1554
|
-
options.push_back({ "*", " --mlock", "force system to keep model in RAM rather than swapping or compressing" });
|
|
1555
|
-
}
|
|
1556
|
-
if (llama_supports_mmap()) {
|
|
1557
|
-
options.push_back({ "*", " --no-mmap", "do not memory-map model (slower load but may reduce pageouts if not using mlock)" });
|
|
1558
|
-
}
|
|
1559
|
-
options.push_back({ "*", " --numa TYPE", "attempt optimizations that help on some NUMA systems\n"
|
|
1560
|
-
" - distribute: spread execution evenly over all nodes\n"
|
|
1561
|
-
" - isolate: only spawn threads on CPUs on the node that execution started on\n"
|
|
1562
|
-
" - numactl: use the CPU map provided by numactl\n"
|
|
1563
|
-
"if run without this previously, it is recommended to drop the system page cache before using this\n"
|
|
1564
|
-
"see https://github.com/ggerganov/llama.cpp/issues/1437" });
|
|
1565
|
-
|
|
1566
|
-
if (llama_supports_gpu_offload()) {
|
|
1567
|
-
options.push_back({ "*", "-ngl, --gpu-layers N",
|
|
1568
|
-
"number of layers to store in VRAM" });
|
|
1569
|
-
options.push_back({ "*", "-ngld, --gpu-layers-draft N",
|
|
1570
|
-
"number of layers to store in VRAM for the draft model" });
|
|
1571
|
-
options.push_back({ "*", "-sm, --split-mode SPLIT_MODE",
|
|
1572
|
-
"how to split the model across multiple GPUs, one of:\n"
|
|
1573
|
-
" - none: use one GPU only\n"
|
|
1574
|
-
" - layer (default): split layers and KV across GPUs\n"
|
|
1575
|
-
" - row: split rows across GPUs" });
|
|
1576
|
-
options.push_back({ "*", "-ts, --tensor-split SPLIT",
|
|
1577
|
-
"fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1" });
|
|
1578
|
-
options.push_back({ "*", "-mg, --main-gpu i", "the GPU to use for the model (with split-mode = none),\n"
|
|
1579
|
-
"or for intermediate results and KV (with split-mode = row) (default: %d)", params.main_gpu });
|
|
1580
|
-
}
|
|
1581
|
-
|
|
1582
|
-
options.push_back({ "model" });
|
|
1583
|
-
options.push_back({ "*", " --check-tensors", "check model tensor data for invalid values (default: %s)", params.check_tensors ? "true" : "false" });
|
|
1584
|
-
options.push_back({ "*", " --override-kv KEY=TYPE:VALUE",
|
|
1585
|
-
"advanced option to override model metadata by key. may be specified multiple times.\n"
|
|
1586
|
-
"types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false" });
|
|
1587
|
-
options.push_back({ "*", " --lora FNAME", "apply LoRA adapter (can be repeated to use multiple adapters)" });
|
|
1588
|
-
options.push_back({ "*", " --lora-scaled FNAME S", "apply LoRA adapter with user defined scaling S (can be repeated to use multiple adapters)" });
|
|
1589
|
-
options.push_back({ "*", " --control-vector FNAME", "add a control vector\n"
|
|
1590
|
-
"note: this argument can be repeated to add multiple control vectors" });
|
|
1591
|
-
options.push_back({ "*", " --control-vector-scaled FNAME SCALE",
|
|
1592
|
-
"add a control vector with user defined scaling SCALE\n"
|
|
1593
|
-
"note: this argument can be repeated to add multiple scaled control vectors" });
|
|
1594
|
-
options.push_back({ "*", " --control-vector-layer-range START END",
|
|
1595
|
-
"layer range to apply the control vector(s) to, start and end inclusive" });
|
|
1596
|
-
options.push_back({ "*", "-m, --model FNAME", "model path (default: models/$filename with filename from --hf-file\n"
|
|
1597
|
-
"or --model-url if set, otherwise %s)", DEFAULT_MODEL_PATH });
|
|
1598
|
-
options.push_back({ "*", "-md, --model-draft FNAME", "draft model for speculative decoding (default: unused)" });
|
|
1599
|
-
options.push_back({ "*", "-mu, --model-url MODEL_URL", "model download url (default: unused)" });
|
|
1600
|
-
options.push_back({ "*", "-hfr, --hf-repo REPO", "Hugging Face model repository (default: unused)" });
|
|
1601
|
-
options.push_back({ "*", "-hff, --hf-file FILE", "Hugging Face model file (default: unused)" });
|
|
1602
|
-
options.push_back({ "*", "-hft, --hf-token TOKEN", "Hugging Face access token (default: value from HF_TOKEN environment variable)" });
|
|
1603
|
-
|
|
1604
|
-
options.push_back({ "retrieval" });
|
|
1605
|
-
options.push_back({ "retrieval", " --context-file FNAME", "file to load context from (repeat to specify multiple files)" });
|
|
1606
|
-
options.push_back({ "retrieval", " --chunk-size N", "minimum length of embedded text chunks (default: %d)", params.chunk_size });
|
|
1607
|
-
options.push_back({ "retrieval", " --chunk-separator STRING",
|
|
1608
|
-
"separator between chunks (default: '%s')", params.chunk_separator.c_str() });
|
|
1609
|
-
|
|
1610
|
-
options.push_back({ "passkey" });
|
|
1611
|
-
options.push_back({ "passkey", " --junk N", "number of times to repeat the junk text (default: %d)", params.n_junk });
|
|
1612
|
-
options.push_back({ "passkey", " --pos N", "position of the passkey in the junk text (default: %d)", params.i_pos });
|
|
1613
|
-
|
|
1614
|
-
options.push_back({ "imatrix" });
|
|
1615
|
-
options.push_back({ "imatrix", "-o, --output FNAME", "output file (default: '%s')", params.out_file.c_str() });
|
|
1616
|
-
options.push_back({ "imatrix", " --output-frequency N", "output the imatrix every N iterations (default: %d)", params.n_out_freq });
|
|
1617
|
-
options.push_back({ "imatrix", " --save-frequency N", "save an imatrix copy every N iterations (default: %d)", params.n_save_freq });
|
|
1618
|
-
options.push_back({ "imatrix", " --process-output", "collect data for the output tensor (default: %s)", params.process_output ? "true" : "false" });
|
|
1619
|
-
options.push_back({ "imatrix", " --no-ppl", "do not compute perplexity (default: %s)", params.compute_ppl ? "true" : "false" });
|
|
1620
|
-
options.push_back({ "imatrix", " --chunk N", "start processing the input from chunk N (default: %d)", params.i_chunk });
|
|
1621
|
-
|
|
1622
|
-
options.push_back({ "bench" });
|
|
1623
|
-
options.push_back({ "bench", "-pps", "is the prompt shared across parallel sequences (default: %s)", params.is_pp_shared ? "true" : "false" });
|
|
1624
|
-
options.push_back({ "bench", "-npp n0,n1,...", "number of prompt tokens" });
|
|
1625
|
-
options.push_back({ "bench", "-ntg n0,n1,...", "number of text generation tokens" });
|
|
1626
|
-
options.push_back({ "bench", "-npl n0,n1,...", "number of parallel prompts" });
|
|
1627
|
-
|
|
1628
|
-
options.push_back({ "embedding" });
|
|
1629
|
-
options.push_back({ "embedding", " --embd-normalize", "normalisation for embendings (default: %d) (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)", params.embd_normalize });
|
|
1630
|
-
options.push_back({ "embedding", " --embd-output-format", "empty = default, \"array\" = [[],[]...], \"json\" = openai style, \"json+\" = same \"json\" + cosine similarity matrix" });
|
|
1631
|
-
options.push_back({ "embedding", " --embd-separator", "separator of embendings (default \\n) for example \"<#sep#>\"" });
|
|
1632
|
-
|
|
1633
|
-
options.push_back({ "server" });
|
|
1634
|
-
options.push_back({ "server", " --host HOST", "ip address to listen (default: %s)", params.hostname.c_str() });
|
|
1635
|
-
options.push_back({ "server", " --port PORT", "port to listen (default: %d)", params.port });
|
|
1636
|
-
options.push_back({ "server", " --path PATH", "path to serve static files from (default: %s)", params.public_path.c_str() });
|
|
1637
|
-
options.push_back({ "server", " --embedding(s)", "enable embedding endpoint (default: %s)", params.embedding ? "enabled" : "disabled" });
|
|
1638
|
-
options.push_back({ "server", " --api-key KEY", "API key to use for authentication (default: none)" });
|
|
1639
|
-
options.push_back({ "server", " --api-key-file FNAME", "path to file containing API keys (default: none)" });
|
|
1640
|
-
options.push_back({ "server", " --ssl-key-file FNAME", "path to file a PEM-encoded SSL private key" });
|
|
1641
|
-
options.push_back({ "server", " --ssl-cert-file FNAME", "path to file a PEM-encoded SSL certificate" });
|
|
1642
|
-
options.push_back({ "server", " --timeout N", "server read/write timeout in seconds (default: %d)", params.timeout_read });
|
|
1643
|
-
options.push_back({ "server", " --threads-http N", "number of threads used to process HTTP requests (default: %d)", params.n_threads_http });
|
|
1644
|
-
options.push_back({ "server", " --system-prompt-file FNAME",
|
|
1645
|
-
"set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications" });
|
|
1646
|
-
options.push_back({ "server", " --log-format {text,json}",
|
|
1647
|
-
"log output format: json or text (default: json)" });
|
|
1648
|
-
options.push_back({ "server", " --metrics", "enable prometheus compatible metrics endpoint (default: %s)", params.endpoint_metrics ? "enabled" : "disabled" });
|
|
1649
|
-
options.push_back({ "server", " --no-slots", "disables slots monitoring endpoint (default: %s)", params.endpoint_slots ? "enabled" : "disabled" });
|
|
1650
|
-
options.push_back({ "server", " --slot-save-path PATH", "path to save slot kv cache (default: disabled)" });
|
|
1651
|
-
options.push_back({ "server", " --chat-template JINJA_TEMPLATE",
|
|
1652
|
-
"set custom jinja chat template (default: template taken from model's metadata)\n"
|
|
1653
|
-
"only commonly used templates are accepted:\n"
|
|
1654
|
-
"https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template" });
|
|
1655
|
-
options.push_back({ "server", "-sps, --slot-prompt-similarity SIMILARITY",
|
|
1656
|
-
"how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity });
|
|
1657
|
-
|
|
1658
|
-
#ifndef LOG_DISABLE_LOGS
|
|
1659
|
-
options.push_back({ "logging" });
|
|
1660
|
-
options.push_back({ "*", " --simple-io", "use basic IO for better compatibility in subprocesses and limited consoles" });
|
|
1661
|
-
options.push_back({ "*", "-ld, --logdir LOGDIR", "path under which to save YAML logs (no logging if unset)" });
|
|
1662
|
-
options.push_back({ "logging", " --log-test", "Run simple logging test" });
|
|
1663
|
-
options.push_back({ "logging", " --log-disable", "Disable trace logs" });
|
|
1664
|
-
options.push_back({ "logging", " --log-enable", "Enable trace logs" });
|
|
1665
|
-
options.push_back({ "logging", " --log-file FNAME", "Specify a log filename (without extension)" });
|
|
1666
|
-
options.push_back({ "logging", " --log-new", "Create a separate new log file on start. "
|
|
1667
|
-
"Each log file will have unique name: \"<name>.<ID>.log\"" });
|
|
1668
|
-
options.push_back({ "logging", " --log-append", "Don't truncate the old log file." });
|
|
1669
|
-
#endif // LOG_DISABLE_LOGS
|
|
1670
|
-
|
|
1671
|
-
options.push_back({ "cvector" });
|
|
1672
|
-
options.push_back({ "cvector", "-o, --output FNAME", "output file (default: '%s')", params.cvector_outfile.c_str() });
|
|
1673
|
-
options.push_back({ "cvector", " --positive-file FNAME", "positive prompts file, one prompt per line (default: '%s')", params.cvector_positive_file.c_str() });
|
|
1674
|
-
options.push_back({ "cvector", " --negative-file FNAME", "negative prompts file, one prompt per line (default: '%s')", params.cvector_negative_file.c_str() });
|
|
1675
|
-
options.push_back({ "cvector", " --pca-batch N", "batch size used for PCA. Larger batch runs faster, but uses more memory (default: %d)", params.n_pca_batch });
|
|
1676
|
-
options.push_back({ "cvector", " --pca-iter N", "number of iterations used for PCA (default: %d)", params.n_pca_iterations });
|
|
1677
|
-
options.push_back({ "cvector", " --method {pca,mean}", "dimensionality reduction method to be used (default: pca)" });
|
|
1678
|
-
|
|
1679
|
-
options.push_back({ "export-lora" });
|
|
1680
|
-
options.push_back({ "export-lora", "-m, --model", "model path from which to load base model (default '%s')", params.model.c_str() });
|
|
1681
|
-
options.push_back({ "export-lora", " --lora FNAME", "path to LoRA adapter (can be repeated to use multiple adapters)" });
|
|
1682
|
-
options.push_back({ "export-lora", " --lora-scaled FNAME S", "path to LoRA adapter with user defined scaling S (can be repeated to use multiple adapters)" });
|
|
1683
|
-
options.push_back({ "*", "-t, --threads N", "number of threads to use during computation (default: %d)", params.n_threads });
|
|
1684
|
-
options.push_back({ "export-lora", "-o, --output FNAME", "output file (default: '%s')", params.lora_outfile.c_str() });
|
|
1685
|
-
|
|
1686
|
-
printf("usage: %s [options]\n", argv[0]);
|
|
1687
|
-
|
|
1688
|
-
for (const auto & o : options) {
|
|
1689
|
-
if (!o.grp.empty()) {
|
|
1690
|
-
printf("\n%s:\n\n", o.grp.c_str());
|
|
1691
|
-
continue;
|
|
1692
|
-
}
|
|
1693
|
-
printf(" %-32s", o.args.c_str());
|
|
1694
|
-
if (o.args.length() > 30) {
|
|
1695
|
-
printf("\n%34s", "");
|
|
1696
|
-
}
|
|
1697
|
-
|
|
1698
|
-
const auto desc = o.desc;
|
|
1699
|
-
size_t start = 0;
|
|
1700
|
-
size_t end = desc.find('\n');
|
|
1701
|
-
while (end != std::string::npos) {
|
|
1702
|
-
printf("%s\n%34s", desc.substr(start, end - start).c_str(), "");
|
|
1703
|
-
start = end + 1;
|
|
1704
|
-
end = desc.find('\n', start);
|
|
1705
|
-
}
|
|
1706
|
-
|
|
1707
|
-
printf("%s\n", desc.substr(start).c_str());
|
|
1708
|
-
}
|
|
1709
|
-
printf("\n");
|
|
379
|
+
LOG_INF("build: %d (%s) with %s for %s%s\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT, LLAMA_COMPILER, LLAMA_BUILD_TARGET, build_type);
|
|
1710
380
|
}
|
|
1711
381
|
|
|
1712
|
-
std::string
|
|
382
|
+
std::string common_params_get_system_info(const common_params & params) {
|
|
1713
383
|
std::ostringstream os;
|
|
1714
384
|
|
|
1715
|
-
os << "system_info: n_threads = " << params.n_threads;
|
|
1716
|
-
if (params.
|
|
1717
|
-
os << " (n_threads_batch = " << params.
|
|
385
|
+
os << "system_info: n_threads = " << params.cpuparams.n_threads;
|
|
386
|
+
if (params.cpuparams_batch.n_threads != -1) {
|
|
387
|
+
os << " (n_threads_batch = " << params.cpuparams_batch.n_threads << ")";
|
|
1718
388
|
}
|
|
389
|
+
#if defined(_WIN32) && (_WIN32_WINNT >= 0x0601) && !defined(__MINGW64__) // windows 7 and later
|
|
390
|
+
// TODO: windows + arm64 + mingw64
|
|
391
|
+
DWORD logicalProcessorCount = GetActiveProcessorCount(ALL_PROCESSOR_GROUPS);
|
|
392
|
+
os << " / " << logicalProcessorCount << " | " << llama_print_system_info();
|
|
393
|
+
#else
|
|
1719
394
|
os << " / " << std::thread::hardware_concurrency() << " | " << llama_print_system_info();
|
|
395
|
+
#endif
|
|
1720
396
|
|
|
1721
397
|
return os.str();
|
|
1722
398
|
}
|
|
@@ -1725,17 +401,19 @@ std::string gpt_params_get_system_info(const gpt_params & params) {
|
|
|
1725
401
|
// String utils
|
|
1726
402
|
//
|
|
1727
403
|
|
|
1728
|
-
std::
|
|
1729
|
-
|
|
1730
|
-
|
|
1731
|
-
|
|
1732
|
-
|
|
1733
|
-
|
|
1734
|
-
|
|
1735
|
-
|
|
1736
|
-
|
|
1737
|
-
|
|
1738
|
-
|
|
404
|
+
// printf-style formatting into a std::string.
// The argument list is consumed twice: once to measure the required length,
// once (through a copy) to render into an exactly-sized buffer.
std::string string_format(const char * fmt, ...) {
    va_list args_measure;
    va_list args_render;
    va_start(args_measure, fmt);
    va_copy(args_render, args_measure);

    // pass 1: a NULL/0 destination only reports how many characters are needed
    const int n_needed = vsnprintf(NULL, 0, fmt, args_measure);
    GGML_ASSERT(n_needed >= 0 && n_needed < INT_MAX); // NOLINT

    // pass 2: render into a buffer with room for the terminating NUL
    std::vector<char> storage(n_needed + 1);
    const int n_written = vsnprintf(storage.data(), n_needed + 1, fmt, args_render);
    GGML_ASSERT(n_written == n_needed);

    va_end(args_render);
    va_end(args_measure);

    // the terminating NUL is not part of the returned string
    return std::string(storage.data(), n_needed);
}
|
|
1740
418
|
|
|
1741
419
|
std::string string_strip(const std::string & str) {
|
|
@@ -1766,6 +444,111 @@ std::string string_get_sortable_timestamp() {
|
|
|
1766
444
|
return std::string(timestamp_no_ns) + "." + std::string(timestamp_ns);
|
|
1767
445
|
}
|
|
1768
446
|
|
|
447
|
+
// Replaces every non-overlapping occurrence of `search` in `s` with `replace`, in place.
// An empty `search` leaves `s` untouched. Text inserted by a replacement is never
// re-scanned, so a `replace` that contains `search` cannot cause runaway growth.
void string_replace_all(std::string & s, const std::string & search, const std::string & replace) {
    if (search.empty()) {
        return;
    }

    const size_t needle_len = search.length();

    std::string result;
    result.reserve(s.length());

    size_t cursor = 0; // first character of `s` not yet copied into `result`
    for (size_t hit = s.find(search, cursor); hit != std::string::npos; hit = s.find(search, cursor)) {
        result.append(s, cursor, hit - cursor); // untouched text before the match
        result += replace;
        cursor = hit + needle_len;
    }

    // whatever follows the last match
    result.append(s, cursor, std::string::npos);

    s = std::move(result);
}
|
|
463
|
+
|
|
464
|
+
// Textual form of a boolean: "true" or "false".
std::string string_from(bool value) {
    if (value) {
        return "true";
    }
    return "false";
}
|
|
467
|
+
|
|
468
|
+
// Formats a list of integers as "[ a, b, c ]" (an empty list yields "[  ]").
std::string string_from(const std::vector<int> & values) {
    std::stringstream out;

    out << "[ ";
    for (size_t idx = 0; idx < values.size(); ++idx) {
        // separator goes before every element except the first
        if (idx > 0) {
            out << ", ";
        }
        out << std::to_string(values[idx]);
    }
    out << " ]";

    return out.str();
}
|
|
485
|
+
|
|
486
|
+
std::string string_from(const struct llama_context * ctx, const std::vector<llama_token> & tokens) {
|
|
487
|
+
std::stringstream buf;
|
|
488
|
+
|
|
489
|
+
buf << "[ ";
|
|
490
|
+
|
|
491
|
+
bool first = true;
|
|
492
|
+
for (const auto & token : tokens) {
|
|
493
|
+
if (!first) {
|
|
494
|
+
buf << ", ";
|
|
495
|
+
} else {
|
|
496
|
+
first = false;
|
|
497
|
+
}
|
|
498
|
+
|
|
499
|
+
auto detokenized = common_token_to_piece(ctx, token);
|
|
500
|
+
|
|
501
|
+
detokenized.erase(
|
|
502
|
+
std::remove_if(
|
|
503
|
+
detokenized.begin(),
|
|
504
|
+
detokenized.end(),
|
|
505
|
+
[](const unsigned char c) { return !std::isprint(c); }),
|
|
506
|
+
detokenized.end());
|
|
507
|
+
|
|
508
|
+
buf << "'" << detokenized << "'"
|
|
509
|
+
<< ":" << std::to_string(token);
|
|
510
|
+
}
|
|
511
|
+
|
|
512
|
+
buf << " ]";
|
|
513
|
+
|
|
514
|
+
return buf.str();
|
|
515
|
+
}
|
|
516
|
+
|
|
517
|
+
std::string string_from(const struct llama_context * ctx, const struct llama_batch & batch) {
|
|
518
|
+
std::stringstream buf;
|
|
519
|
+
|
|
520
|
+
buf << "[ ";
|
|
521
|
+
|
|
522
|
+
bool first = true;
|
|
523
|
+
for (int i = 0; i < batch.n_tokens; ++i) {
|
|
524
|
+
if (!first) {
|
|
525
|
+
buf << ", ";
|
|
526
|
+
} else {
|
|
527
|
+
first = false;
|
|
528
|
+
}
|
|
529
|
+
|
|
530
|
+
auto detokenized = common_token_to_piece(ctx, batch.token[i]);
|
|
531
|
+
|
|
532
|
+
detokenized.erase(
|
|
533
|
+
std::remove_if(
|
|
534
|
+
detokenized.begin(),
|
|
535
|
+
detokenized.end(),
|
|
536
|
+
[](const unsigned char c) { return !std::isprint(c); }),
|
|
537
|
+
detokenized.end());
|
|
538
|
+
|
|
539
|
+
buf << "\n" << std::to_string(i)
|
|
540
|
+
<< ":token '" << detokenized << "'"
|
|
541
|
+
<< ":pos " << std::to_string(batch.pos[i])
|
|
542
|
+
<< ":n_seq_id " << std::to_string(batch.n_seq_id[i])
|
|
543
|
+
<< ":seq_id " << std::to_string(batch.seq_id[i][0])
|
|
544
|
+
<< ":logits " << std::to_string(batch.logits[i]);
|
|
545
|
+
}
|
|
546
|
+
|
|
547
|
+
buf << " ]";
|
|
548
|
+
|
|
549
|
+
return buf.str();
|
|
550
|
+
}
|
|
551
|
+
|
|
1769
552
|
void string_process_escapes(std::string & input) {
|
|
1770
553
|
std::size_t input_len = input.length();
|
|
1771
554
|
std::size_t output_idx = 0;
|
|
@@ -1806,7 +589,7 @@ void string_process_escapes(std::string & input) {
|
|
|
1806
589
|
bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides) {
|
|
1807
590
|
const char * sep = strchr(data, '=');
|
|
1808
591
|
if (sep == nullptr || sep - data >= 128) {
|
|
1809
|
-
|
|
592
|
+
LOG_ERR("%s: malformed KV override '%s'\n", __func__, data);
|
|
1810
593
|
return false;
|
|
1811
594
|
}
|
|
1812
595
|
llama_model_kv_override kvo;
|
|
@@ -1829,20 +612,20 @@ bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_over
|
|
|
1829
612
|
} else if (std::strcmp(sep, "false") == 0) {
|
|
1830
613
|
kvo.val_bool = false;
|
|
1831
614
|
} else {
|
|
1832
|
-
|
|
615
|
+
LOG_ERR("%s: invalid boolean value for KV override '%s'\n", __func__, data);
|
|
1833
616
|
return false;
|
|
1834
617
|
}
|
|
1835
618
|
} else if (strncmp(sep, "str:", 4) == 0) {
|
|
1836
619
|
sep += 4;
|
|
1837
620
|
kvo.tag = LLAMA_KV_OVERRIDE_TYPE_STR;
|
|
1838
621
|
if (strlen(sep) > 127) {
|
|
1839
|
-
|
|
622
|
+
LOG_ERR("%s: malformed KV override '%s', value cannot exceed 127 chars\n", __func__, data);
|
|
1840
623
|
return false;
|
|
1841
624
|
}
|
|
1842
625
|
strncpy(kvo.val_str, sep, 127);
|
|
1843
626
|
kvo.val_str[127] = '\0';
|
|
1844
627
|
} else {
|
|
1845
|
-
|
|
628
|
+
LOG_ERR("%s: invalid type for KV override '%s'\n", __func__, data);
|
|
1846
629
|
return false;
|
|
1847
630
|
}
|
|
1848
631
|
overrides.emplace_back(std::move(kvo));
|
|
@@ -2039,43 +822,69 @@ std::string fs_get_cache_file(const std::string & filename) {
|
|
|
2039
822
|
//
|
|
2040
823
|
// Model utils
|
|
2041
824
|
//
|
|
2042
|
-
|
|
2043
|
-
|
|
2044
|
-
auto mparams =
|
|
825
|
+
struct common_init_result common_init_from_params(common_params & params) {
|
|
826
|
+
common_init_result iparams;
|
|
827
|
+
auto mparams = common_model_params_to_llama(params);
|
|
2045
828
|
|
|
2046
829
|
llama_model * model = nullptr;
|
|
2047
830
|
|
|
2048
831
|
if (!params.hf_repo.empty() && !params.hf_file.empty()) {
|
|
2049
|
-
model =
|
|
832
|
+
model = common_load_model_from_hf(params.hf_repo.c_str(), params.hf_file.c_str(), params.model.c_str(), params.hf_token.c_str(), mparams);
|
|
2050
833
|
} else if (!params.model_url.empty()) {
|
|
2051
|
-
model =
|
|
834
|
+
model = common_load_model_from_url(params.model_url.c_str(), params.model.c_str(), params.hf_token.c_str(), mparams);
|
|
2052
835
|
} else {
|
|
2053
836
|
model = llama_load_model_from_file(params.model.c_str(), mparams);
|
|
2054
837
|
}
|
|
2055
838
|
|
|
2056
839
|
if (model == NULL) {
|
|
2057
|
-
|
|
2058
|
-
return
|
|
840
|
+
LOG_ERR("%s: failed to load model '%s'\n", __func__, params.model.c_str());
|
|
841
|
+
return iparams;
|
|
842
|
+
}
|
|
843
|
+
|
|
844
|
+
if (params.reranking) {
|
|
845
|
+
bool ok = true;
|
|
846
|
+
|
|
847
|
+
if (llama_token_bos(model) == LLAMA_TOKEN_NULL) {
|
|
848
|
+
LOG_WRN("%s: warning: model does not have a BOS token, reranking will not work\n", __func__);
|
|
849
|
+
ok = false;
|
|
850
|
+
}
|
|
851
|
+
|
|
852
|
+
if (llama_token_eos(model) == LLAMA_TOKEN_NULL) {
|
|
853
|
+
LOG_WRN("%s: warning: model does not have an EOS token, reranking will not work\n", __func__);
|
|
854
|
+
ok = false;
|
|
855
|
+
}
|
|
856
|
+
|
|
857
|
+
if (llama_token_sep(model) == LLAMA_TOKEN_NULL) {
|
|
858
|
+
LOG_WRN("%s: warning: model does not have a SEP token, reranking will not work\n", __func__);
|
|
859
|
+
ok = false;
|
|
860
|
+
}
|
|
861
|
+
|
|
862
|
+
if (!ok) {
|
|
863
|
+
llama_free_model(model);
|
|
864
|
+
|
|
865
|
+
return iparams;
|
|
866
|
+
}
|
|
2059
867
|
}
|
|
2060
868
|
|
|
2061
|
-
auto cparams =
|
|
869
|
+
auto cparams = common_context_params_to_llama(params);
|
|
2062
870
|
|
|
2063
871
|
llama_context * lctx = llama_new_context_with_model(model, cparams);
|
|
2064
872
|
if (lctx == NULL) {
|
|
2065
|
-
|
|
873
|
+
LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.c_str());
|
|
2066
874
|
llama_free_model(model);
|
|
2067
|
-
return
|
|
875
|
+
return iparams;
|
|
2068
876
|
}
|
|
2069
877
|
|
|
2070
878
|
if (!params.control_vectors.empty()) {
|
|
2071
879
|
if (params.control_vector_layer_start <= 0) params.control_vector_layer_start = 1;
|
|
2072
880
|
if (params.control_vector_layer_end <= 0) params.control_vector_layer_end = llama_n_layer(model);
|
|
2073
881
|
|
|
2074
|
-
const auto cvec =
|
|
882
|
+
const auto cvec = common_control_vector_load(params.control_vectors);
|
|
2075
883
|
if (cvec.n_embd == -1) {
|
|
2076
884
|
llama_free(lctx);
|
|
2077
885
|
llama_free_model(model);
|
|
2078
|
-
|
|
886
|
+
|
|
887
|
+
return iparams;
|
|
2079
888
|
}
|
|
2080
889
|
|
|
2081
890
|
int err = llama_control_vector_apply(lctx,
|
|
@@ -2087,41 +896,53 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
|
|
|
2087
896
|
if (err) {
|
|
2088
897
|
llama_free(lctx);
|
|
2089
898
|
llama_free_model(model);
|
|
2090
|
-
|
|
899
|
+
|
|
900
|
+
return iparams;
|
|
2091
901
|
}
|
|
2092
902
|
}
|
|
2093
903
|
|
|
2094
|
-
|
|
2095
|
-
|
|
2096
|
-
|
|
2097
|
-
|
|
2098
|
-
|
|
2099
|
-
|
|
904
|
+
// load and optionally apply lora adapters
|
|
905
|
+
for (auto & la : params.lora_adapters) {
|
|
906
|
+
common_lora_adapter_container loaded_la;
|
|
907
|
+
loaded_la.path = la.path;
|
|
908
|
+
loaded_la.scale = la.scale;
|
|
909
|
+
loaded_la.adapter = llama_lora_adapter_init(model, la.path.c_str());
|
|
910
|
+
if (loaded_la.adapter == nullptr) {
|
|
911
|
+
LOG_ERR("%s: failed to apply lora adapter '%s'\n", __func__, la.path.c_str());
|
|
2100
912
|
llama_free(lctx);
|
|
2101
913
|
llama_free_model(model);
|
|
2102
|
-
return
|
|
914
|
+
return iparams;
|
|
2103
915
|
}
|
|
2104
|
-
|
|
916
|
+
iparams.lora_adapters.push_back(loaded_la); // copy to list of loaded adapters
|
|
917
|
+
}
|
|
918
|
+
if (!params.lora_init_without_apply) {
|
|
919
|
+
common_lora_adapters_apply(lctx, iparams.lora_adapters);
|
|
2105
920
|
}
|
|
2106
921
|
|
|
2107
|
-
if (params.ignore_eos) {
|
|
2108
|
-
|
|
922
|
+
if (params.sparams.ignore_eos && llama_token_eos(model) == LLAMA_TOKEN_NULL) {
|
|
923
|
+
LOG_WRN("%s: warning: model does not have an EOS token, ignoring --ignore-eos\n", __func__);
|
|
924
|
+
params.sparams.ignore_eos = false;
|
|
2109
925
|
}
|
|
2110
926
|
|
|
2111
927
|
if (params.warmup) {
|
|
2112
|
-
|
|
928
|
+
LOG_WRN("%s: warming up the model with an empty run - please wait ... (--no-warmup to disable)\n", __func__);
|
|
2113
929
|
|
|
2114
930
|
std::vector<llama_token> tmp;
|
|
2115
931
|
llama_token bos = llama_token_bos(model);
|
|
2116
932
|
llama_token eos = llama_token_eos(model);
|
|
2117
933
|
// some models (e.g. T5) don't have a BOS token
|
|
2118
|
-
if (bos !=
|
|
934
|
+
if (bos != LLAMA_TOKEN_NULL) {
|
|
2119
935
|
tmp.push_back(bos);
|
|
2120
936
|
}
|
|
2121
|
-
|
|
937
|
+
if (eos != LLAMA_TOKEN_NULL) {
|
|
938
|
+
tmp.push_back(eos);
|
|
939
|
+
}
|
|
940
|
+
if (tmp.empty()) {
|
|
941
|
+
tmp.push_back(0);
|
|
942
|
+
}
|
|
2122
943
|
|
|
2123
944
|
if (llama_model_has_encoder(model)) {
|
|
2124
|
-
llama_encode(lctx, llama_batch_get_one(tmp.data(), tmp.size()
|
|
945
|
+
llama_encode(lctx, llama_batch_get_one(tmp.data(), tmp.size()));
|
|
2125
946
|
llama_token decoder_start_token_id = llama_model_decoder_start_token(model);
|
|
2126
947
|
if (decoder_start_token_id == -1) {
|
|
2127
948
|
decoder_start_token_id = bos;
|
|
@@ -2129,16 +950,30 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
|
|
|
2129
950
|
tmp.clear();
|
|
2130
951
|
tmp.push_back(decoder_start_token_id);
|
|
2131
952
|
}
|
|
2132
|
-
|
|
953
|
+
if (llama_model_has_decoder(model)) {
|
|
954
|
+
llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch)));
|
|
955
|
+
}
|
|
2133
956
|
llama_kv_cache_clear(lctx);
|
|
2134
957
|
llama_synchronize(lctx);
|
|
2135
|
-
|
|
958
|
+
llama_perf_context_reset(lctx);
|
|
2136
959
|
}
|
|
2137
960
|
|
|
2138
|
-
|
|
961
|
+
iparams.model = model;
|
|
962
|
+
iparams.context = lctx;
|
|
963
|
+
|
|
964
|
+
return iparams;
|
|
965
|
+
}
|
|
966
|
+
|
|
967
|
+
void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_container> & lora_adapters) {
|
|
968
|
+
llama_lora_adapter_clear(ctx);
|
|
969
|
+
for (auto & la : lora_adapters) {
|
|
970
|
+
if (la.scale != 0.0f) {
|
|
971
|
+
llama_lora_adapter_set(ctx, la.adapter, la.scale);
|
|
972
|
+
}
|
|
973
|
+
}
|
|
2139
974
|
}
|
|
2140
975
|
|
|
2141
|
-
struct llama_model_params
|
|
976
|
+
struct llama_model_params common_model_params_to_llama(const common_params & params) {
|
|
2142
977
|
auto mparams = llama_model_default_params();
|
|
2143
978
|
|
|
2144
979
|
if (params.n_gpu_layers != -1) {
|
|
@@ -2168,6 +1003,9 @@ static ggml_type kv_cache_type_from_str(const std::string & s) {
|
|
|
2168
1003
|
if (s == "f16") {
|
|
2169
1004
|
return GGML_TYPE_F16;
|
|
2170
1005
|
}
|
|
1006
|
+
if (s == "bf16") {
|
|
1007
|
+
return GGML_TYPE_BF16;
|
|
1008
|
+
}
|
|
2171
1009
|
if (s == "q8_0") {
|
|
2172
1010
|
return GGML_TYPE_Q8_0;
|
|
2173
1011
|
}
|
|
@@ -2187,19 +1025,19 @@ static ggml_type kv_cache_type_from_str(const std::string & s) {
|
|
|
2187
1025
|
return GGML_TYPE_Q5_1;
|
|
2188
1026
|
}
|
|
2189
1027
|
|
|
2190
|
-
throw std::runtime_error("
|
|
1028
|
+
throw std::runtime_error("Unsupported cache type: " + s);
|
|
2191
1029
|
}
|
|
2192
1030
|
|
|
2193
|
-
struct llama_context_params
|
|
1031
|
+
struct llama_context_params common_context_params_to_llama(const common_params & params) {
|
|
2194
1032
|
auto cparams = llama_context_default_params();
|
|
2195
1033
|
|
|
2196
1034
|
cparams.n_ctx = params.n_ctx;
|
|
2197
1035
|
cparams.n_seq_max = params.n_parallel;
|
|
2198
1036
|
cparams.n_batch = params.n_batch;
|
|
2199
1037
|
cparams.n_ubatch = params.n_ubatch;
|
|
2200
|
-
cparams.n_threads = params.n_threads;
|
|
2201
|
-
cparams.n_threads_batch = params.
|
|
2202
|
-
|
|
1038
|
+
cparams.n_threads = params.cpuparams.n_threads;
|
|
1039
|
+
cparams.n_threads_batch = params.cpuparams_batch.n_threads == -1 ?
|
|
1040
|
+
params.cpuparams.n_threads : params.cpuparams_batch.n_threads;
|
|
2203
1041
|
cparams.logits_all = params.logits_all;
|
|
2204
1042
|
cparams.embeddings = params.embedding;
|
|
2205
1043
|
cparams.rope_scaling_type = params.rope_scaling_type;
|
|
@@ -2217,6 +1055,12 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
|
|
|
2217
1055
|
cparams.cb_eval_user_data = params.cb_eval_user_data;
|
|
2218
1056
|
cparams.offload_kqv = !params.no_kv_offload;
|
|
2219
1057
|
cparams.flash_attn = params.flash_attn;
|
|
1058
|
+
cparams.no_perf = params.no_perf;
|
|
1059
|
+
|
|
1060
|
+
if (params.reranking) {
|
|
1061
|
+
cparams.embeddings = true;
|
|
1062
|
+
cparams.pooling_type = LLAMA_POOLING_TYPE_RANK;
|
|
1063
|
+
}
|
|
2220
1064
|
|
|
2221
1065
|
cparams.type_k = kv_cache_type_from_str(params.cache_type_k);
|
|
2222
1066
|
cparams.type_v = kv_cache_type_from_str(params.cache_type_v);
|
|
@@ -2224,19 +1068,62 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
|
|
|
2224
1068
|
return cparams;
|
|
2225
1069
|
}
|
|
2226
1070
|
|
|
1071
|
+
struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params) {
|
|
1072
|
+
struct ggml_threadpool_params tpp;
|
|
1073
|
+
|
|
1074
|
+
ggml_threadpool_params_init(&tpp, params.n_threads); // setup the defaults
|
|
1075
|
+
|
|
1076
|
+
if (params.mask_valid) {
|
|
1077
|
+
std::memcpy(&tpp.cpumask, ¶ms.cpumask, GGML_MAX_N_THREADS);
|
|
1078
|
+
}
|
|
1079
|
+
|
|
1080
|
+
tpp.prio = params.priority;
|
|
1081
|
+
tpp.poll = params.poll;
|
|
1082
|
+
tpp.strict_cpu = params.strict_cpu;
|
|
1083
|
+
|
|
1084
|
+
return tpp;
|
|
1085
|
+
}
|
|
1086
|
+
|
|
2227
1087
|
#ifdef LLAMA_USE_CURL
|
|
2228
1088
|
|
|
1089
|
+
#define CURL_MAX_RETRY 3
|
|
1090
|
+
#define CURL_RETRY_DELAY_SECONDS 2
|
|
1091
|
+
|
|
1092
|
+
|
|
2229
1093
|
// Returns true when `str` begins with `prefix` (an empty prefix always matches).
static bool starts_with(const std::string & str, const std::string & prefix) {
    // While we wait for C++20's std::string::starts_with...
    // compare() clamps the inspected range to str's length, so a `str` shorter
    // than `prefix` simply compares unequal.
    const bool has_prefix = str.compare(0, prefix.size(), prefix) == 0;
    return has_prefix;
}
|
|
2233
1097
|
|
|
2234
|
-
static bool
|
|
1098
|
+
static bool curl_perform_with_retry(const std::string& url, CURL* curl, int max_attempts, int retry_delay_seconds) {
|
|
1099
|
+
int remaining_attempts = max_attempts;
|
|
1100
|
+
|
|
1101
|
+
while (remaining_attempts > 0) {
|
|
1102
|
+
LOG_INF("%s: Trying to download from %s (attempt %d of %d)...\n", __func__ , url.c_str(), max_attempts - remaining_attempts + 1, max_attempts);
|
|
1103
|
+
|
|
1104
|
+
CURLcode res = curl_easy_perform(curl);
|
|
1105
|
+
if (res == CURLE_OK) {
|
|
1106
|
+
return true;
|
|
1107
|
+
}
|
|
1108
|
+
|
|
1109
|
+
int exponential_backoff_delay = std::pow(retry_delay_seconds, max_attempts - remaining_attempts) * 1000;
|
|
1110
|
+
LOG_WRN("%s: curl_easy_perform() failed: %s, retrying after %d milliseconds...\n", __func__, curl_easy_strerror(res), exponential_backoff_delay);
|
|
1111
|
+
|
|
1112
|
+
remaining_attempts--;
|
|
1113
|
+
std::this_thread::sleep_for(std::chrono::milliseconds(exponential_backoff_delay));
|
|
1114
|
+
}
|
|
1115
|
+
|
|
1116
|
+
LOG_ERR("%s: curl_easy_perform() failed after %d attempts\n", __func__, max_attempts);
|
|
1117
|
+
|
|
1118
|
+
return false;
|
|
1119
|
+
}
|
|
1120
|
+
|
|
1121
|
+
static bool common_download_file(const std::string & url, const std::string & path, const std::string & hf_token) {
|
|
2235
1122
|
|
|
2236
1123
|
// Initialize libcurl
|
|
2237
1124
|
std::unique_ptr<CURL, decltype(&curl_easy_cleanup)> curl(curl_easy_init(), &curl_easy_cleanup);
|
|
2238
1125
|
if (!curl) {
|
|
2239
|
-
|
|
1126
|
+
LOG_ERR("%s: error initializing libcurl\n", __func__);
|
|
2240
1127
|
return false;
|
|
2241
1128
|
}
|
|
2242
1129
|
|
|
@@ -2277,11 +1164,11 @@ static bool llama_download_file(const std::string & url, const std::string & pat
|
|
|
2277
1164
|
if (metadata_in.good()) {
|
|
2278
1165
|
try {
|
|
2279
1166
|
metadata_in >> metadata;
|
|
2280
|
-
|
|
1167
|
+
LOG_INF("%s: previous metadata file found %s: %s\n", __func__, metadata_path.c_str(), metadata.dump().c_str());
|
|
2281
1168
|
if (metadata.contains("url") && metadata.at("url").is_string()) {
|
|
2282
1169
|
auto previous_url = metadata.at("url").get<std::string>();
|
|
2283
1170
|
if (previous_url != url) {
|
|
2284
|
-
|
|
1171
|
+
LOG_ERR("%s: Model URL mismatch: %s != %s\n", __func__, url.c_str(), previous_url.c_str());
|
|
2285
1172
|
return false;
|
|
2286
1173
|
}
|
|
2287
1174
|
}
|
|
@@ -2292,24 +1179,24 @@ static bool llama_download_file(const std::string & url, const std::string & pat
|
|
|
2292
1179
|
last_modified = metadata.at("lastModified");
|
|
2293
1180
|
}
|
|
2294
1181
|
} catch (const nlohmann::json::exception & e) {
|
|
2295
|
-
|
|
1182
|
+
LOG_ERR("%s: error reading metadata file %s: %s\n", __func__, metadata_path.c_str(), e.what());
|
|
2296
1183
|
return false;
|
|
2297
1184
|
}
|
|
2298
1185
|
}
|
|
2299
1186
|
} else {
|
|
2300
|
-
|
|
1187
|
+
LOG_INF("%s: no previous model file found %s\n", __func__, path.c_str());
|
|
2301
1188
|
}
|
|
2302
1189
|
|
|
2303
1190
|
// Send a HEAD request to retrieve the etag and last-modified headers
|
|
2304
|
-
struct
|
|
1191
|
+
struct common_load_model_from_url_headers {
|
|
2305
1192
|
std::string etag;
|
|
2306
1193
|
std::string last_modified;
|
|
2307
1194
|
};
|
|
2308
|
-
|
|
1195
|
+
common_load_model_from_url_headers headers;
|
|
2309
1196
|
{
|
|
2310
1197
|
typedef size_t(*CURLOPT_HEADERFUNCTION_PTR)(char *, size_t, size_t, void *);
|
|
2311
1198
|
auto header_callback = [](char * buffer, size_t /*size*/, size_t n_items, void * userdata) -> size_t {
|
|
2312
|
-
|
|
1199
|
+
common_load_model_from_url_headers *headers = (common_load_model_from_url_headers *) userdata;
|
|
2313
1200
|
|
|
2314
1201
|
static std::regex header_regex("([^:]+): (.*)\r\n");
|
|
2315
1202
|
static std::regex etag_regex("ETag", std::regex_constants::icase);
|
|
@@ -2334,9 +1221,8 @@ static bool llama_download_file(const std::string & url, const std::string & pat
|
|
|
2334
1221
|
curl_easy_setopt(curl.get(), CURLOPT_HEADERFUNCTION, static_cast<CURLOPT_HEADERFUNCTION_PTR>(header_callback));
|
|
2335
1222
|
curl_easy_setopt(curl.get(), CURLOPT_HEADERDATA, &headers);
|
|
2336
1223
|
|
|
2337
|
-
|
|
2338
|
-
if (
|
|
2339
|
-
fprintf(stderr, "%s: curl_easy_perform() failed: %s\n", __func__, curl_easy_strerror(res));
|
|
1224
|
+
bool was_perform_successful = curl_perform_with_retry(url, curl.get(), CURL_MAX_RETRY, CURL_RETRY_DELAY_SECONDS);
|
|
1225
|
+
if (!was_perform_successful) {
|
|
2340
1226
|
return false;
|
|
2341
1227
|
}
|
|
2342
1228
|
|
|
@@ -2346,26 +1232,26 @@ static bool llama_download_file(const std::string & url, const std::string & pat
|
|
|
2346
1232
|
// HEAD not supported, we don't know if the file has changed
|
|
2347
1233
|
// force trigger downloading
|
|
2348
1234
|
force_download = true;
|
|
2349
|
-
|
|
1235
|
+
LOG_ERR("%s: HEAD invalid http status code received: %ld\n", __func__, http_code);
|
|
2350
1236
|
}
|
|
2351
1237
|
}
|
|
2352
1238
|
|
|
2353
1239
|
bool should_download = !file_exists || force_download;
|
|
2354
1240
|
if (!should_download) {
|
|
2355
1241
|
if (!etag.empty() && etag != headers.etag) {
|
|
2356
|
-
|
|
1242
|
+
LOG_WRN("%s: ETag header is different (%s != %s): triggering a new download\n", __func__, etag.c_str(), headers.etag.c_str());
|
|
2357
1243
|
should_download = true;
|
|
2358
1244
|
} else if (!last_modified.empty() && last_modified != headers.last_modified) {
|
|
2359
|
-
|
|
1245
|
+
LOG_WRN("%s: Last-Modified header is different (%s != %s): triggering a new download\n", __func__, last_modified.c_str(), headers.last_modified.c_str());
|
|
2360
1246
|
should_download = true;
|
|
2361
1247
|
}
|
|
2362
1248
|
}
|
|
2363
1249
|
if (should_download) {
|
|
2364
1250
|
std::string path_temporary = path + ".downloadInProgress";
|
|
2365
1251
|
if (file_exists) {
|
|
2366
|
-
|
|
1252
|
+
LOG_WRN("%s: deleting previous downloaded file: %s\n", __func__, path.c_str());
|
|
2367
1253
|
if (remove(path.c_str()) != 0) {
|
|
2368
|
-
|
|
1254
|
+
LOG_ERR("%s: unable to delete file: %s\n", __func__, path.c_str());
|
|
2369
1255
|
return false;
|
|
2370
1256
|
}
|
|
2371
1257
|
}
|
|
@@ -2380,7 +1266,7 @@ static bool llama_download_file(const std::string & url, const std::string & pat
|
|
|
2380
1266
|
|
|
2381
1267
|
std::unique_ptr<FILE, FILE_deleter> outfile(fopen(path_temporary.c_str(), "wb"));
|
|
2382
1268
|
if (!outfile) {
|
|
2383
|
-
|
|
1269
|
+
LOG_ERR("%s: error opening local file for writing: %s\n", __func__, path.c_str());
|
|
2384
1270
|
return false;
|
|
2385
1271
|
}
|
|
2386
1272
|
|
|
@@ -2411,18 +1297,17 @@ static bool llama_download_file(const std::string & url, const std::string & pat
|
|
|
2411
1297
|
};
|
|
2412
1298
|
|
|
2413
1299
|
// start the download
|
|
2414
|
-
|
|
2415
|
-
|
|
2416
|
-
|
|
2417
|
-
if (
|
|
2418
|
-
fprintf(stderr, "%s: curl_easy_perform() failed: %s\n", __func__, curl_easy_strerror(res));
|
|
1300
|
+
LOG_INF("%s: trying to download model from %s to %s (server_etag:%s, server_last_modified:%s)...\n", __func__,
|
|
1301
|
+
llama_download_hide_password_in_url(url).c_str(), path.c_str(), headers.etag.c_str(), headers.last_modified.c_str());
|
|
1302
|
+
bool was_perform_successful = curl_perform_with_retry(url, curl.get(), CURL_MAX_RETRY, CURL_RETRY_DELAY_SECONDS);
|
|
1303
|
+
if (!was_perform_successful) {
|
|
2419
1304
|
return false;
|
|
2420
1305
|
}
|
|
2421
1306
|
|
|
2422
1307
|
long http_code = 0;
|
|
2423
1308
|
curl_easy_getinfo (curl.get(), CURLINFO_RESPONSE_CODE, &http_code);
|
|
2424
1309
|
if (http_code < 200 || http_code >= 400) {
|
|
2425
|
-
|
|
1310
|
+
LOG_ERR("%s: invalid http status code received: %ld\n", __func__, http_code);
|
|
2426
1311
|
return false;
|
|
2427
1312
|
}
|
|
2428
1313
|
|
|
@@ -2436,10 +1321,10 @@ static bool llama_download_file(const std::string & url, const std::string & pat
|
|
|
2436
1321
|
{"lastModified", headers.last_modified}
|
|
2437
1322
|
});
|
|
2438
1323
|
std::ofstream(metadata_path) << metadata.dump(4);
|
|
2439
|
-
|
|
1324
|
+
LOG_INF("%s: file metadata saved: %s\n", __func__, metadata_path.c_str());
|
|
2440
1325
|
|
|
2441
1326
|
if (rename(path_temporary.c_str(), path.c_str()) != 0) {
|
|
2442
|
-
|
|
1327
|
+
LOG_ERR("%s: unable to rename file: %s to %s\n", __func__, path_temporary.c_str(), path.c_str());
|
|
2443
1328
|
return false;
|
|
2444
1329
|
}
|
|
2445
1330
|
}
|
|
@@ -2447,18 +1332,18 @@ static bool llama_download_file(const std::string & url, const std::string & pat
|
|
|
2447
1332
|
return true;
|
|
2448
1333
|
}
|
|
2449
1334
|
|
|
2450
|
-
struct llama_model *
|
|
1335
|
+
struct llama_model * common_load_model_from_url(
|
|
2451
1336
|
const char * model_url,
|
|
2452
1337
|
const char * path_model,
|
|
2453
1338
|
const char * hf_token,
|
|
2454
1339
|
const struct llama_model_params & params) {
|
|
2455
1340
|
// Basic validation of the model_url
|
|
2456
1341
|
if (!model_url || strlen(model_url) == 0) {
|
|
2457
|
-
|
|
1342
|
+
LOG_ERR("%s: invalid model_url\n", __func__);
|
|
2458
1343
|
return NULL;
|
|
2459
1344
|
}
|
|
2460
1345
|
|
|
2461
|
-
if (!
|
|
1346
|
+
if (!common_download_file(model_url, path_model, hf_token)) {
|
|
2462
1347
|
return NULL;
|
|
2463
1348
|
}
|
|
2464
1349
|
|
|
@@ -2471,7 +1356,7 @@ struct llama_model * llama_load_model_from_url(
|
|
|
2471
1356
|
};
|
|
2472
1357
|
auto * ctx_gguf = gguf_init_from_file(path_model, gguf_params);
|
|
2473
1358
|
if (!ctx_gguf) {
|
|
2474
|
-
|
|
1359
|
+
LOG_ERR("\n%s: failed to load input GGUF from %s\n", __func__, path_model);
|
|
2475
1360
|
return NULL;
|
|
2476
1361
|
}
|
|
2477
1362
|
|
|
@@ -2491,14 +1376,12 @@ struct llama_model * llama_load_model_from_url(
|
|
|
2491
1376
|
// and extract split URL and PATH prefixes
|
|
2492
1377
|
{
|
|
2493
1378
|
if (!llama_split_prefix(split_prefix, sizeof(split_prefix), path_model, 0, n_split)) {
|
|
2494
|
-
|
|
2495
|
-
" n_split=%d\n", __func__, path_model, n_split);
|
|
1379
|
+
LOG_ERR("\n%s: unexpected model file name: %s n_split=%d\n", __func__, path_model, n_split);
|
|
2496
1380
|
return NULL;
|
|
2497
1381
|
}
|
|
2498
1382
|
|
|
2499
1383
|
if (!llama_split_prefix(split_url_prefix, sizeof(split_url_prefix), model_url, 0, n_split)) {
|
|
2500
|
-
|
|
2501
|
-
" n_split=%d\n", __func__, model_url, n_split);
|
|
1384
|
+
LOG_ERR("\n%s: unexpected model url: %s n_split=%d\n", __func__, model_url, n_split);
|
|
2502
1385
|
return NULL;
|
|
2503
1386
|
}
|
|
2504
1387
|
}
|
|
@@ -2513,7 +1396,7 @@ struct llama_model * llama_load_model_from_url(
|
|
|
2513
1396
|
char split_url[LLAMA_CURL_MAX_URL_LENGTH] = {0};
|
|
2514
1397
|
llama_split_path(split_url, sizeof(split_url), split_url_prefix, download_idx, n_split);
|
|
2515
1398
|
|
|
2516
|
-
return
|
|
1399
|
+
return common_download_file(split_url, split_path, hf_token);
|
|
2517
1400
|
}, idx));
|
|
2518
1401
|
}
|
|
2519
1402
|
|
|
@@ -2528,7 +1411,7 @@ struct llama_model * llama_load_model_from_url(
|
|
|
2528
1411
|
return llama_load_model_from_file(path_model, params);
|
|
2529
1412
|
}
|
|
2530
1413
|
|
|
2531
|
-
struct llama_model *
|
|
1414
|
+
struct llama_model * common_load_model_from_hf(
|
|
2532
1415
|
const char * repo,
|
|
2533
1416
|
const char * model,
|
|
2534
1417
|
const char * path_model,
|
|
@@ -2548,27 +1431,27 @@ struct llama_model * llama_load_model_from_hf(
|
|
|
2548
1431
|
model_url += "/resolve/main/";
|
|
2549
1432
|
model_url += model;
|
|
2550
1433
|
|
|
2551
|
-
return
|
|
1434
|
+
return common_load_model_from_url(model_url.c_str(), path_model, hf_token, params);
|
|
2552
1435
|
}
|
|
2553
1436
|
|
|
2554
1437
|
#else
|
|
2555
1438
|
|
|
2556
|
-
struct llama_model *
|
|
1439
|
+
struct llama_model * common_load_model_from_url(
|
|
2557
1440
|
const char * /*model_url*/,
|
|
2558
1441
|
const char * /*path_model*/,
|
|
2559
1442
|
const char * /*hf_token*/,
|
|
2560
1443
|
const struct llama_model_params & /*params*/) {
|
|
2561
|
-
|
|
1444
|
+
LOG_WRN("%s: llama.cpp built without libcurl, downloading from an url not supported.\n", __func__);
|
|
2562
1445
|
return nullptr;
|
|
2563
1446
|
}
|
|
2564
1447
|
|
|
2565
|
-
struct llama_model *
|
|
1448
|
+
struct llama_model * common_load_model_from_hf(
|
|
2566
1449
|
const char * /*repo*/,
|
|
2567
1450
|
const char * /*model*/,
|
|
2568
1451
|
const char * /*path_model*/,
|
|
2569
1452
|
const char * /*hf_token*/,
|
|
2570
1453
|
const struct llama_model_params & /*params*/) {
|
|
2571
|
-
|
|
1454
|
+
LOG_WRN("%s: llama.cpp built without libcurl, downloading from Hugging Face not supported.\n", __func__);
|
|
2572
1455
|
return nullptr;
|
|
2573
1456
|
}
|
|
2574
1457
|
|
|
@@ -2578,16 +1461,18 @@ struct llama_model * llama_load_model_from_hf(
|
|
|
2578
1461
|
// Batch utils
|
|
2579
1462
|
//
|
|
2580
1463
|
|
|
2581
|
-
void
|
|
1464
|
+
void common_batch_clear(struct llama_batch & batch) {
|
|
2582
1465
|
batch.n_tokens = 0;
|
|
2583
1466
|
}
|
|
2584
1467
|
|
|
2585
|
-
void
|
|
1468
|
+
void common_batch_add(
|
|
2586
1469
|
struct llama_batch & batch,
|
|
2587
1470
|
llama_token id,
|
|
2588
1471
|
llama_pos pos,
|
|
2589
1472
|
const std::vector<llama_seq_id> & seq_ids,
|
|
2590
1473
|
bool logits) {
|
|
1474
|
+
GGML_ASSERT(batch.seq_id[batch.n_tokens] && "llama_batch size exceeded");
|
|
1475
|
+
|
|
2591
1476
|
batch.token [batch.n_tokens] = id;
|
|
2592
1477
|
batch.pos [batch.n_tokens] = pos;
|
|
2593
1478
|
batch.n_seq_id[batch.n_tokens] = seq_ids.size();
|
|
@@ -2603,15 +1488,15 @@ void llama_batch_add(
|
|
|
2603
1488
|
// Vocab utils
|
|
2604
1489
|
//
|
|
2605
1490
|
|
|
2606
|
-
std::vector<llama_token>
|
|
1491
|
+
std::vector<llama_token> common_tokenize(
|
|
2607
1492
|
const struct llama_context * ctx,
|
|
2608
1493
|
const std::string & text,
|
|
2609
1494
|
bool add_special,
|
|
2610
1495
|
bool parse_special) {
|
|
2611
|
-
return
|
|
1496
|
+
return common_tokenize(llama_get_model(ctx), text, add_special, parse_special);
|
|
2612
1497
|
}
|
|
2613
1498
|
|
|
2614
|
-
std::vector<llama_token>
|
|
1499
|
+
std::vector<llama_token> common_tokenize(
|
|
2615
1500
|
const struct llama_model * model,
|
|
2616
1501
|
const std::string & text,
|
|
2617
1502
|
bool add_special,
|
|
@@ -2630,7 +1515,7 @@ std::vector<llama_token> llama_tokenize(
|
|
|
2630
1515
|
return result;
|
|
2631
1516
|
}
|
|
2632
1517
|
|
|
2633
|
-
std::string
|
|
1518
|
+
std::string common_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) {
|
|
2634
1519
|
std::string piece;
|
|
2635
1520
|
piece.resize(piece.capacity()); // using string internal cache, 15 bytes + '\n'
|
|
2636
1521
|
const int n_chars = llama_token_to_piece(llama_get_model(ctx), token, &piece[0], piece.size(), 0, special);
|
|
@@ -2646,7 +1531,7 @@ std::string llama_token_to_piece(const struct llama_context * ctx, llama_token t
|
|
|
2646
1531
|
return piece;
|
|
2647
1532
|
}
|
|
2648
1533
|
|
|
2649
|
-
std::string
|
|
1534
|
+
std::string common_detokenize(llama_context * ctx, const std::vector<llama_token> & tokens, bool special) {
|
|
2650
1535
|
std::string text;
|
|
2651
1536
|
text.resize(std::max(text.capacity(), tokens.size()));
|
|
2652
1537
|
int32_t n_chars = llama_detokenize(llama_get_model(ctx), tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
|
|
@@ -2662,25 +1547,19 @@ std::string llama_detokenize(llama_context * ctx, const std::vector<llama_token>
|
|
|
2662
1547
|
return text;
|
|
2663
1548
|
}
|
|
2664
1549
|
|
|
2665
|
-
bool llama_should_add_bos_token(const llama_model * model) {
|
|
2666
|
-
const int add_bos = llama_add_bos_token(model);
|
|
2667
|
-
|
|
2668
|
-
return add_bos != -1 ? bool(add_bos) : (llama_vocab_type(model) == LLAMA_VOCAB_TYPE_SPM);
|
|
2669
|
-
}
|
|
2670
|
-
|
|
2671
1550
|
//
|
|
2672
1551
|
// Chat template utils
|
|
2673
1552
|
//
|
|
2674
1553
|
|
|
2675
|
-
bool
|
|
1554
|
+
bool common_chat_verify_template(const std::string & tmpl) {
|
|
2676
1555
|
llama_chat_message chat[] = {{"user", "test"}};
|
|
2677
1556
|
int res = llama_chat_apply_template(nullptr, tmpl.c_str(), chat, 1, true, nullptr, 0);
|
|
2678
1557
|
return res >= 0;
|
|
2679
1558
|
}
|
|
2680
1559
|
|
|
2681
|
-
std::string
|
|
1560
|
+
std::string common_chat_apply_template(const struct llama_model * model,
|
|
2682
1561
|
const std::string & tmpl,
|
|
2683
|
-
const std::vector<
|
|
1562
|
+
const std::vector<common_chat_msg> & msgs,
|
|
2684
1563
|
bool add_ass) {
|
|
2685
1564
|
int alloc_size = 0;
|
|
2686
1565
|
bool fallback = false; // indicate if we must fallback to default chatml
|
|
@@ -2722,42 +1601,42 @@ std::string llama_chat_apply_template(const struct llama_model * model,
|
|
|
2722
1601
|
return formatted_chat;
|
|
2723
1602
|
}
|
|
2724
1603
|
|
|
2725
|
-
std::string
|
|
1604
|
+
std::string common_chat_format_single(const struct llama_model * model,
|
|
2726
1605
|
const std::string & tmpl,
|
|
2727
|
-
const std::vector<
|
|
2728
|
-
const
|
|
1606
|
+
const std::vector<common_chat_msg> & past_msg,
|
|
1607
|
+
const common_chat_msg & new_msg,
|
|
2729
1608
|
bool add_ass) {
|
|
2730
1609
|
std::ostringstream ss;
|
|
2731
|
-
auto fmt_past_msg = past_msg.empty() ? "" :
|
|
2732
|
-
std::vector<
|
|
1610
|
+
auto fmt_past_msg = past_msg.empty() ? "" : common_chat_apply_template(model, tmpl, past_msg, false);
|
|
1611
|
+
std::vector<common_chat_msg> chat_new(past_msg);
|
|
2733
1612
|
// if the past_msg ends with a newline, we must preserve it in the formatted version
|
|
2734
1613
|
if (add_ass && !fmt_past_msg.empty() && fmt_past_msg.back() == '\n') {
|
|
2735
1614
|
ss << "\n";
|
|
2736
1615
|
};
|
|
2737
1616
|
// format chat with new_msg
|
|
2738
1617
|
chat_new.push_back(new_msg);
|
|
2739
|
-
auto fmt_new_msg =
|
|
1618
|
+
auto fmt_new_msg = common_chat_apply_template(model, tmpl, chat_new, add_ass);
|
|
2740
1619
|
// get the diff part
|
|
2741
1620
|
ss << fmt_new_msg.substr(fmt_past_msg.size(), fmt_new_msg.size() - fmt_past_msg.size());
|
|
2742
1621
|
return ss.str();
|
|
2743
1622
|
}
|
|
2744
1623
|
|
|
2745
|
-
std::string
|
|
1624
|
+
std::string common_chat_format_example(const struct llama_model * model,
|
|
2746
1625
|
const std::string & tmpl) {
|
|
2747
|
-
std::vector<
|
|
1626
|
+
std::vector<common_chat_msg> msgs = {
|
|
2748
1627
|
{"system", "You are a helpful assistant"},
|
|
2749
1628
|
{"user", "Hello"},
|
|
2750
1629
|
{"assistant", "Hi there"},
|
|
2751
1630
|
{"user", "How are you?"},
|
|
2752
1631
|
};
|
|
2753
|
-
return
|
|
1632
|
+
return common_chat_apply_template(model, tmpl, msgs, true);
|
|
2754
1633
|
}
|
|
2755
1634
|
|
|
2756
1635
|
//
|
|
2757
1636
|
// KV cache utils
|
|
2758
1637
|
//
|
|
2759
1638
|
|
|
2760
|
-
void
|
|
1639
|
+
void common_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size) {
|
|
2761
1640
|
static const char slot_chars[] = ".123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz+";
|
|
2762
1641
|
|
|
2763
1642
|
printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d",
|
|
@@ -2780,7 +1659,7 @@ void llama_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size) {
|
|
|
2780
1659
|
printf("\n=== Done dumping\n");
|
|
2781
1660
|
}
|
|
2782
1661
|
|
|
2783
|
-
void
|
|
1662
|
+
void common_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size) {
|
|
2784
1663
|
static const char slot_chars[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
|
|
2785
1664
|
|
|
2786
1665
|
printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d\n",
|
|
@@ -2832,7 +1711,7 @@ void llama_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_siz
|
|
|
2832
1711
|
// Embedding utils
|
|
2833
1712
|
//
|
|
2834
1713
|
|
|
2835
|
-
void
|
|
1714
|
+
void common_embd_normalize(const float * inp, float * out, int n, int embd_norm) {
|
|
2836
1715
|
double sum = 0.0;
|
|
2837
1716
|
|
|
2838
1717
|
switch (embd_norm) {
|
|
@@ -2866,7 +1745,7 @@ void llama_embd_normalize(const float * inp, float * out, int n, int embd_norm)
|
|
|
2866
1745
|
}
|
|
2867
1746
|
}
|
|
2868
1747
|
|
|
2869
|
-
float
|
|
1748
|
+
float common_embd_similarity_cos(const float * embd1, const float * embd2, int n){
|
|
2870
1749
|
double sum = 0.0;
|
|
2871
1750
|
double sum1 = 0.0;
|
|
2872
1751
|
double sum2 = 0.0;
|
|
@@ -2892,8 +1771,8 @@ float llama_embd_similarity_cos(const float * embd1, const float * embd2, int n)
|
|
|
2892
1771
|
// Control vector utils
|
|
2893
1772
|
//
|
|
2894
1773
|
|
|
2895
|
-
static
|
|
2896
|
-
|
|
1774
|
+
static common_control_vector_data common_control_vector_load_one(const common_control_vector_load_info & load_info) {
|
|
1775
|
+
common_control_vector_data result = { -1, {} };
|
|
2897
1776
|
|
|
2898
1777
|
ggml_context * ctx = nullptr;
|
|
2899
1778
|
struct gguf_init_params meta_gguf_params = {
|
|
@@ -2902,13 +1781,13 @@ static llama_control_vector_data llama_control_vector_load_one(const llama_contr
|
|
|
2902
1781
|
};
|
|
2903
1782
|
struct gguf_context * ctx_gguf = gguf_init_from_file(load_info.fname.c_str(), meta_gguf_params);
|
|
2904
1783
|
if (!ctx_gguf) {
|
|
2905
|
-
|
|
1784
|
+
LOG_ERR("%s: failed to load control vector file from %s\n", __func__, load_info.fname.c_str());
|
|
2906
1785
|
return result;
|
|
2907
1786
|
}
|
|
2908
1787
|
|
|
2909
1788
|
int32_t n_tensors = gguf_get_n_tensors(ctx_gguf);
|
|
2910
1789
|
if (n_tensors == 0) {
|
|
2911
|
-
|
|
1790
|
+
LOG_WRN("%s: no direction tensors found in %s\n", __func__, load_info.fname.c_str());
|
|
2912
1791
|
}
|
|
2913
1792
|
|
|
2914
1793
|
for (int i = 0; i < n_tensors; i++) {
|
|
@@ -2926,23 +1805,23 @@ static llama_control_vector_data llama_control_vector_load_one(const llama_contr
|
|
|
2926
1805
|
}
|
|
2927
1806
|
}
|
|
2928
1807
|
if (layer_idx < 0) {
|
|
2929
|
-
|
|
1808
|
+
LOG_ERR("%s: invalid/unparsable direction tensor layer index in %s\n", __func__, load_info.fname.c_str());
|
|
2930
1809
|
result.n_embd = -1;
|
|
2931
1810
|
break;
|
|
2932
1811
|
} else if (layer_idx == 0) {
|
|
2933
|
-
|
|
1812
|
+
LOG_ERR("%s: invalid (zero) direction tensor layer index in %s\n", __func__, load_info.fname.c_str());
|
|
2934
1813
|
result.n_embd = -1;
|
|
2935
1814
|
break;
|
|
2936
1815
|
}
|
|
2937
1816
|
|
|
2938
1817
|
struct ggml_tensor * tensor = ggml_get_tensor(ctx, name.c_str());
|
|
2939
1818
|
if (tensor->type != GGML_TYPE_F32) {
|
|
2940
|
-
|
|
1819
|
+
LOG_ERR("%s: invalid (non-F32) direction tensor type in %s\n", __func__, load_info.fname.c_str());
|
|
2941
1820
|
result.n_embd = -1;
|
|
2942
1821
|
break;
|
|
2943
1822
|
}
|
|
2944
1823
|
if (ggml_n_dims(tensor) != 1) {
|
|
2945
|
-
|
|
1824
|
+
LOG_ERR("%s: invalid (non-1D) direction tensor shape in %s\n", __func__, load_info.fname.c_str());
|
|
2946
1825
|
result.n_embd = -1;
|
|
2947
1826
|
break;
|
|
2948
1827
|
}
|
|
@@ -2950,7 +1829,7 @@ static llama_control_vector_data llama_control_vector_load_one(const llama_contr
|
|
|
2950
1829
|
if (result.n_embd == -1) {
|
|
2951
1830
|
result.n_embd = ggml_nelements(tensor);
|
|
2952
1831
|
} else if (ggml_nelements(tensor) != result.n_embd) {
|
|
2953
|
-
|
|
1832
|
+
LOG_ERR("%s: direction tensor in %s does not match previous dimensions\n", __func__, load_info.fname.c_str());
|
|
2954
1833
|
result.n_embd = -1;
|
|
2955
1834
|
break;
|
|
2956
1835
|
}
|
|
@@ -2967,7 +1846,7 @@ static llama_control_vector_data llama_control_vector_load_one(const llama_contr
|
|
|
2967
1846
|
}
|
|
2968
1847
|
|
|
2969
1848
|
if (result.n_embd == -1) {
|
|
2970
|
-
|
|
1849
|
+
LOG_WRN("%s: skipping %s due to invalid direction tensors\n", __func__, load_info.fname.c_str());
|
|
2971
1850
|
result.data.clear();
|
|
2972
1851
|
}
|
|
2973
1852
|
|
|
@@ -2977,18 +1856,18 @@ static llama_control_vector_data llama_control_vector_load_one(const llama_contr
|
|
|
2977
1856
|
return result;
|
|
2978
1857
|
}
|
|
2979
1858
|
|
|
2980
|
-
|
|
2981
|
-
|
|
1859
|
+
common_control_vector_data common_control_vector_load(const std::vector<common_control_vector_load_info> & load_infos) {
|
|
1860
|
+
common_control_vector_data result = { -1, {} };
|
|
2982
1861
|
|
|
2983
1862
|
for (const auto & info : load_infos) {
|
|
2984
|
-
auto cur =
|
|
1863
|
+
auto cur = common_control_vector_load_one(info);
|
|
2985
1864
|
|
|
2986
1865
|
if (cur.n_embd == -1) {
|
|
2987
1866
|
result.n_embd = -1;
|
|
2988
1867
|
break;
|
|
2989
1868
|
}
|
|
2990
1869
|
if (result.n_embd != -1 && result.n_embd != cur.n_embd) {
|
|
2991
|
-
|
|
1870
|
+
LOG_ERR("%s: control vectors in %s does not match previous dimensions\n", __func__, info.fname.c_str());
|
|
2992
1871
|
result.n_embd = -1;
|
|
2993
1872
|
break;
|
|
2994
1873
|
}
|
|
@@ -3004,227 +1883,10 @@ llama_control_vector_data llama_control_vector_load(const std::vector<llama_cont
|
|
|
3004
1883
|
}
|
|
3005
1884
|
|
|
3006
1885
|
if (result.n_embd == -1) {
|
|
3007
|
-
|
|
1886
|
+
LOG_ERR("%s: no valid control vector files passed\n", __func__);
|
|
3008
1887
|
result.data.clear();
|
|
3009
1888
|
}
|
|
3010
1889
|
|
|
3011
1890
|
return result;
|
|
3012
1891
|
}
|
|
3013
1892
|
|
|
3014
|
-
//
|
|
3015
|
-
// YAML utils
|
|
3016
|
-
//
|
|
3017
|
-
|
|
3018
|
-
void yaml_dump_vector_float(FILE * stream, const char * prop_name, const std::vector<float> & data) {
|
|
3019
|
-
if (data.empty()) {
|
|
3020
|
-
fprintf(stream, "%s:\n", prop_name);
|
|
3021
|
-
return;
|
|
3022
|
-
}
|
|
3023
|
-
|
|
3024
|
-
fprintf(stream, "%s: [", prop_name);
|
|
3025
|
-
for (size_t i = 0; i < data.size() - 1; ++i) {
|
|
3026
|
-
fprintf(stream, "%e, ", data[i]);
|
|
3027
|
-
}
|
|
3028
|
-
fprintf(stream, "%e]\n", data.back());
|
|
3029
|
-
}
|
|
3030
|
-
|
|
3031
|
-
void yaml_dump_vector_int(FILE * stream, const char * prop_name, const std::vector<int> & data) {
|
|
3032
|
-
if (data.empty()) {
|
|
3033
|
-
fprintf(stream, "%s:\n", prop_name);
|
|
3034
|
-
return;
|
|
3035
|
-
}
|
|
3036
|
-
|
|
3037
|
-
fprintf(stream, "%s: [", prop_name);
|
|
3038
|
-
for (size_t i = 0; i < data.size() - 1; ++i) {
|
|
3039
|
-
fprintf(stream, "%d, ", data[i]);
|
|
3040
|
-
}
|
|
3041
|
-
fprintf(stream, "%d]\n", data.back());
|
|
3042
|
-
}
|
|
3043
|
-
|
|
3044
|
-
void yaml_dump_string_multiline(FILE * stream, const char * prop_name, const char * data) {
|
|
3045
|
-
std::string data_str(data == NULL ? "" : data);
|
|
3046
|
-
|
|
3047
|
-
if (data_str.empty()) {
|
|
3048
|
-
fprintf(stream, "%s:\n", prop_name);
|
|
3049
|
-
return;
|
|
3050
|
-
}
|
|
3051
|
-
|
|
3052
|
-
size_t pos_start = 0;
|
|
3053
|
-
size_t pos_found = 0;
|
|
3054
|
-
|
|
3055
|
-
if (std::isspace(data_str[0]) || std::isspace(data_str.back())) {
|
|
3056
|
-
data_str = std::regex_replace(data_str, std::regex("\n"), "\\n");
|
|
3057
|
-
data_str = std::regex_replace(data_str, std::regex("\""), "\\\"");
|
|
3058
|
-
data_str = std::regex_replace(data_str, std::regex(R"(\\[^n"])"), R"(\$&)");
|
|
3059
|
-
data_str = "\"" + data_str + "\"";
|
|
3060
|
-
fprintf(stream, "%s: %s\n", prop_name, data_str.c_str());
|
|
3061
|
-
return;
|
|
3062
|
-
}
|
|
3063
|
-
|
|
3064
|
-
if (data_str.find('\n') == std::string::npos) {
|
|
3065
|
-
fprintf(stream, "%s: %s\n", prop_name, data_str.c_str());
|
|
3066
|
-
return;
|
|
3067
|
-
}
|
|
3068
|
-
|
|
3069
|
-
fprintf(stream, "%s: |\n", prop_name);
|
|
3070
|
-
while ((pos_found = data_str.find('\n', pos_start)) != std::string::npos) {
|
|
3071
|
-
fprintf(stream, " %s\n", data_str.substr(pos_start, pos_found-pos_start).c_str());
|
|
3072
|
-
pos_start = pos_found + 1;
|
|
3073
|
-
}
|
|
3074
|
-
}
|
|
3075
|
-
|
|
3076
|
-
void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const llama_context * lctx,
|
|
3077
|
-
const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc) {
|
|
3078
|
-
const llama_sampling_params & sparams = params.sparams;
|
|
3079
|
-
|
|
3080
|
-
fprintf(stream, "build_commit: %s\n", LLAMA_COMMIT);
|
|
3081
|
-
fprintf(stream, "build_number: %d\n", LLAMA_BUILD_NUMBER);
|
|
3082
|
-
fprintf(stream, "cpu_has_arm_fma: %s\n", ggml_cpu_has_arm_fma() ? "true" : "false");
|
|
3083
|
-
fprintf(stream, "cpu_has_avx: %s\n", ggml_cpu_has_avx() ? "true" : "false");
|
|
3084
|
-
fprintf(stream, "cpu_has_avx_vnni: %s\n", ggml_cpu_has_avx_vnni() ? "true" : "false");
|
|
3085
|
-
fprintf(stream, "cpu_has_avx2: %s\n", ggml_cpu_has_avx2() ? "true" : "false");
|
|
3086
|
-
fprintf(stream, "cpu_has_avx512: %s\n", ggml_cpu_has_avx512() ? "true" : "false");
|
|
3087
|
-
fprintf(stream, "cpu_has_avx512_vbmi: %s\n", ggml_cpu_has_avx512_vbmi() ? "true" : "false");
|
|
3088
|
-
fprintf(stream, "cpu_has_avx512_vnni: %s\n", ggml_cpu_has_avx512_vnni() ? "true" : "false");
|
|
3089
|
-
fprintf(stream, "cpu_has_cuda: %s\n", ggml_cpu_has_cuda() ? "true" : "false");
|
|
3090
|
-
fprintf(stream, "cpu_has_vulkan: %s\n", ggml_cpu_has_vulkan() ? "true" : "false");
|
|
3091
|
-
fprintf(stream, "cpu_has_kompute: %s\n", ggml_cpu_has_kompute() ? "true" : "false");
|
|
3092
|
-
fprintf(stream, "cpu_has_fma: %s\n", ggml_cpu_has_fma() ? "true" : "false");
|
|
3093
|
-
fprintf(stream, "cpu_has_gpublas: %s\n", ggml_cpu_has_gpublas() ? "true" : "false");
|
|
3094
|
-
fprintf(stream, "cpu_has_neon: %s\n", ggml_cpu_has_neon() ? "true" : "false");
|
|
3095
|
-
fprintf(stream, "cpu_has_sve: %s\n", ggml_cpu_has_sve() ? "true" : "false");
|
|
3096
|
-
fprintf(stream, "cpu_has_f16c: %s\n", ggml_cpu_has_f16c() ? "true" : "false");
|
|
3097
|
-
fprintf(stream, "cpu_has_fp16_va: %s\n", ggml_cpu_has_fp16_va() ? "true" : "false");
|
|
3098
|
-
fprintf(stream, "cpu_has_wasm_simd: %s\n", ggml_cpu_has_wasm_simd() ? "true" : "false");
|
|
3099
|
-
fprintf(stream, "cpu_has_blas: %s\n", ggml_cpu_has_blas() ? "true" : "false");
|
|
3100
|
-
fprintf(stream, "cpu_has_sse3: %s\n", ggml_cpu_has_sse3() ? "true" : "false");
|
|
3101
|
-
fprintf(stream, "cpu_has_vsx: %s\n", ggml_cpu_has_vsx() ? "true" : "false");
|
|
3102
|
-
fprintf(stream, "cpu_has_matmul_int8: %s\n", ggml_cpu_has_matmul_int8() ? "true" : "false");
|
|
3103
|
-
|
|
3104
|
-
#ifdef NDEBUG
|
|
3105
|
-
fprintf(stream, "debug: false\n");
|
|
3106
|
-
#else
|
|
3107
|
-
fprintf(stream, "debug: true\n");
|
|
3108
|
-
#endif // NDEBUG
|
|
3109
|
-
|
|
3110
|
-
fprintf(stream, "model_desc: %s\n", model_desc);
|
|
3111
|
-
fprintf(stream, "n_vocab: %d # output size of the final layer, 32001 for some models\n", llama_n_vocab(llama_get_model(lctx)));
|
|
3112
|
-
|
|
3113
|
-
#ifdef __OPTIMIZE__
|
|
3114
|
-
fprintf(stream, "optimize: true\n");
|
|
3115
|
-
#else
|
|
3116
|
-
fprintf(stream, "optimize: false\n");
|
|
3117
|
-
#endif // __OPTIMIZE__
|
|
3118
|
-
|
|
3119
|
-
fprintf(stream, "time: %s\n", timestamp.c_str());
|
|
3120
|
-
|
|
3121
|
-
fprintf(stream, "\n");
|
|
3122
|
-
fprintf(stream, "###############\n");
|
|
3123
|
-
fprintf(stream, "# User Inputs #\n");
|
|
3124
|
-
fprintf(stream, "###############\n");
|
|
3125
|
-
fprintf(stream, "\n");
|
|
3126
|
-
|
|
3127
|
-
fprintf(stream, "alias: %s # default: unknown\n", params.model_alias.c_str());
|
|
3128
|
-
fprintf(stream, "batch_size: %d # default: 512\n", params.n_batch);
|
|
3129
|
-
yaml_dump_string_multiline(stream, "cfg_negative_prompt", sparams.cfg_negative_prompt.c_str());
|
|
3130
|
-
fprintf(stream, "cfg_scale: %f # default: 1.0\n", sparams.cfg_scale);
|
|
3131
|
-
fprintf(stream, "chunks: %d # default: -1 (unlimited)\n", params.n_chunks);
|
|
3132
|
-
fprintf(stream, "color: %s # default: false\n", params.use_color ? "true" : "false");
|
|
3133
|
-
fprintf(stream, "ctx_size: %d # default: 512\n", params.n_ctx);
|
|
3134
|
-
fprintf(stream, "escape: %s # default: false\n", params.escape ? "true" : "false");
|
|
3135
|
-
fprintf(stream, "file: # never logged, see prompt instead. Can still be specified for input.\n");
|
|
3136
|
-
fprintf(stream, "frequency_penalty: %f # default: 0.0 \n", sparams.penalty_freq);
|
|
3137
|
-
yaml_dump_string_multiline(stream, "grammar", sparams.grammar.c_str());
|
|
3138
|
-
fprintf(stream, "grammar-file: # never logged, see grammar instead. Can still be specified for input.\n");
|
|
3139
|
-
fprintf(stream, "hellaswag: %s # default: false\n", params.hellaswag ? "true" : "false");
|
|
3140
|
-
fprintf(stream, "hellaswag_tasks: %zu # default: 400\n", params.hellaswag_tasks);
|
|
3141
|
-
|
|
3142
|
-
const auto logit_bias_eos = sparams.logit_bias.find(llama_token_eos(llama_get_model(lctx)));
|
|
3143
|
-
const bool ignore_eos = logit_bias_eos != sparams.logit_bias.end() && logit_bias_eos->second == -INFINITY;
|
|
3144
|
-
fprintf(stream, "ignore_eos: %s # default: false\n", ignore_eos ? "true" : "false");
|
|
3145
|
-
|
|
3146
|
-
yaml_dump_string_multiline(stream, "in_prefix", params.input_prefix.c_str());
|
|
3147
|
-
fprintf(stream, "in_prefix_bos: %s # default: false\n", params.input_prefix_bos ? "true" : "false");
|
|
3148
|
-
yaml_dump_string_multiline(stream, "in_suffix", params.input_prefix.c_str());
|
|
3149
|
-
fprintf(stream, "interactive: %s # default: false\n", params.interactive ? "true" : "false");
|
|
3150
|
-
fprintf(stream, "interactive_first: %s # default: false\n", params.interactive_first ? "true" : "false");
|
|
3151
|
-
fprintf(stream, "keep: %d # default: 0\n", params.n_keep);
|
|
3152
|
-
fprintf(stream, "logdir: %s # default: unset (no logging)\n", params.logdir.c_str());
|
|
3153
|
-
|
|
3154
|
-
fprintf(stream, "logit_bias:\n");
|
|
3155
|
-
for (std::pair<llama_token, float> lb : sparams.logit_bias) {
|
|
3156
|
-
if (ignore_eos && lb.first == logit_bias_eos->first) {
|
|
3157
|
-
continue;
|
|
3158
|
-
}
|
|
3159
|
-
fprintf(stream, " %d: %f", lb.first, lb.second);
|
|
3160
|
-
}
|
|
3161
|
-
|
|
3162
|
-
fprintf(stream, "lora:\n");
|
|
3163
|
-
for (std::tuple<std::string, float> la : params.lora_adapter) {
|
|
3164
|
-
if (std::get<1>(la) != 1.0f) {
|
|
3165
|
-
continue;
|
|
3166
|
-
}
|
|
3167
|
-
fprintf(stream, " - %s\n", std::get<0>(la).c_str());
|
|
3168
|
-
}
|
|
3169
|
-
fprintf(stream, "lora_scaled:\n");
|
|
3170
|
-
for (std::tuple<std::string, float> la : params.lora_adapter) {
|
|
3171
|
-
if (std::get<1>(la) == 1.0f) {
|
|
3172
|
-
continue;
|
|
3173
|
-
}
|
|
3174
|
-
fprintf(stream, " - %s: %f\n", std::get<0>(la).c_str(), std::get<1>(la));
|
|
3175
|
-
}
|
|
3176
|
-
fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu);
|
|
3177
|
-
fprintf(stream, "min_keep: %d # default: 0 (disabled)\n", sparams.min_keep);
|
|
3178
|
-
fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", sparams.mirostat);
|
|
3179
|
-
fprintf(stream, "mirostat_ent: %f # default: 5.0\n", sparams.mirostat_tau);
|
|
3180
|
-
fprintf(stream, "mirostat_lr: %f # default: 0.1\n", sparams.mirostat_eta);
|
|
3181
|
-
fprintf(stream, "mlock: %s # default: false\n", params.use_mlock ? "true" : "false");
|
|
3182
|
-
fprintf(stream, "model: %s # default: %s\n", params.model.c_str(), DEFAULT_MODEL_PATH);
|
|
3183
|
-
fprintf(stream, "model_draft: %s # default:\n", params.model_draft.c_str());
|
|
3184
|
-
fprintf(stream, "multiline_input: %s # default: false\n", params.multiline_input ? "true" : "false");
|
|
3185
|
-
fprintf(stream, "n_gpu_layers: %d # default: -1\n", params.n_gpu_layers);
|
|
3186
|
-
fprintf(stream, "n_predict: %d # default: -1 (unlimited)\n", params.n_predict);
|
|
3187
|
-
fprintf(stream, "n_probs: %d # only used by server binary, default: 0\n", sparams.n_probs);
|
|
3188
|
-
fprintf(stream, "no_mmap: %s # default: false\n", !params.use_mmap ? "true" : "false");
|
|
3189
|
-
fprintf(stream, "penalize_nl: %s # default: false\n", sparams.penalize_nl ? "true" : "false");
|
|
3190
|
-
fprintf(stream, "ppl_output_type: %d # default: 0\n", params.ppl_output_type);
|
|
3191
|
-
fprintf(stream, "ppl_stride: %d # default: 0\n", params.ppl_stride);
|
|
3192
|
-
fprintf(stream, "presence_penalty: %f # default: 0.0\n", sparams.penalty_present);
|
|
3193
|
-
yaml_dump_string_multiline(stream, "prompt", params.prompt.c_str());
|
|
3194
|
-
fprintf(stream, "prompt_cache: %s\n", params.path_prompt_cache.c_str());
|
|
3195
|
-
fprintf(stream, "prompt_cache_all: %s # default: false\n", params.prompt_cache_all ? "true" : "false");
|
|
3196
|
-
fprintf(stream, "prompt_cache_ro: %s # default: false\n", params.prompt_cache_ro ? "true" : "false");
|
|
3197
|
-
yaml_dump_vector_int(stream, "prompt_tokens", prompt_tokens);
|
|
3198
|
-
fprintf(stream, "repeat_penalty: %f # default: 1.1\n", sparams.penalty_repeat);
|
|
3199
|
-
|
|
3200
|
-
fprintf(stream, "reverse_prompt:\n");
|
|
3201
|
-
for (std::string ap : params.antiprompt) {
|
|
3202
|
-
size_t pos = 0;
|
|
3203
|
-
while ((pos = ap.find('\n', pos)) != std::string::npos) {
|
|
3204
|
-
ap.replace(pos, 1, "\\n");
|
|
3205
|
-
pos += 1;
|
|
3206
|
-
}
|
|
3207
|
-
|
|
3208
|
-
fprintf(stream, " - %s\n", ap.c_str());
|
|
3209
|
-
}
|
|
3210
|
-
|
|
3211
|
-
fprintf(stream, "rope_freq_base: %f # default: 10000.0\n", params.rope_freq_base);
|
|
3212
|
-
fprintf(stream, "rope_freq_scale: %f # default: 1.0\n", params.rope_freq_scale);
|
|
3213
|
-
fprintf(stream, "seed: %u # default: -1 (random seed)\n", params.seed);
|
|
3214
|
-
fprintf(stream, "simple_io: %s # default: false\n", params.simple_io ? "true" : "false");
|
|
3215
|
-
fprintf(stream, "cont_batching: %s # default: false\n", params.cont_batching ? "true" : "false");
|
|
3216
|
-
fprintf(stream, "flash_attn: %s # default: false\n", params.flash_attn ? "true" : "false");
|
|
3217
|
-
fprintf(stream, "temp: %f # default: 0.8\n", sparams.temp);
|
|
3218
|
-
|
|
3219
|
-
const std::vector<float> tensor_split_vector(params.tensor_split, params.tensor_split + llama_max_devices());
|
|
3220
|
-
yaml_dump_vector_float(stream, "tensor_split", tensor_split_vector);
|
|
3221
|
-
|
|
3222
|
-
fprintf(stream, "tfs: %f # default: 1.0\n", sparams.tfs_z);
|
|
3223
|
-
fprintf(stream, "threads: %d # default: %u\n", params.n_threads, std::thread::hardware_concurrency());
|
|
3224
|
-
fprintf(stream, "top_k: %d # default: 40\n", sparams.top_k);
|
|
3225
|
-
fprintf(stream, "top_p: %f # default: 0.95\n", sparams.top_p);
|
|
3226
|
-
fprintf(stream, "min_p: %f # default: 0.0\n", sparams.min_p);
|
|
3227
|
-
fprintf(stream, "typical_p: %f # default: 1.0\n", sparams.typical_p);
|
|
3228
|
-
fprintf(stream, "verbose_prompt: %s # default: false\n", params.verbose_prompt ? "true" : "false");
|
|
3229
|
-
fprintf(stream, "display_prompt: %s # default: true\n", params.display_prompt ? "true" : "false");
|
|
3230
|
-
}
|