@fugood/llama.node 0.3.1 → 0.3.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +1 -8
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/package.json +4 -2
- package/src/DetokenizeWorker.cpp +1 -1
- package/src/EmbeddingWorker.cpp +2 -2
- package/src/LlamaCompletionWorker.cpp +10 -10
- package/src/LlamaCompletionWorker.h +2 -2
- package/src/LlamaContext.cpp +14 -17
- package/src/TokenizeWorker.cpp +1 -1
- package/src/common.hpp +5 -4
- package/src/llama.cpp/.github/workflows/build.yml +137 -29
- package/src/llama.cpp/.github/workflows/close-issue.yml +5 -0
- package/src/llama.cpp/.github/workflows/docker.yml +46 -34
- package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +7 -0
- package/src/llama.cpp/.github/workflows/nix-ci.yml +7 -0
- package/src/llama.cpp/.github/workflows/python-check-requirements.yml +2 -4
- package/src/llama.cpp/.github/workflows/python-type-check.yml +3 -1
- package/src/llama.cpp/.github/workflows/server.yml +7 -0
- package/src/llama.cpp/CMakeLists.txt +26 -11
- package/src/llama.cpp/cmake/arm64-apple-clang.cmake +16 -0
- package/src/llama.cpp/common/CMakeLists.txt +10 -10
- package/src/llama.cpp/common/arg.cpp +2041 -0
- package/src/llama.cpp/common/arg.h +77 -0
- package/src/llama.cpp/common/common.cpp +523 -1861
- package/src/llama.cpp/common/common.h +234 -106
- package/src/llama.cpp/common/console.cpp +3 -0
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +1 -1
- package/src/llama.cpp/common/log.cpp +401 -0
- package/src/llama.cpp/common/log.h +66 -698
- package/src/llama.cpp/common/ngram-cache.cpp +39 -36
- package/src/llama.cpp/common/ngram-cache.h +19 -19
- package/src/llama.cpp/common/sampling.cpp +356 -350
- package/src/llama.cpp/common/sampling.h +62 -139
- package/src/llama.cpp/common/stb_image.h +5990 -6398
- package/src/llama.cpp/docs/build.md +72 -17
- package/src/llama.cpp/examples/CMakeLists.txt +1 -2
- package/src/llama.cpp/examples/batched/batched.cpp +49 -65
- package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +42 -53
- package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +55 -52
- package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +22 -22
- package/src/llama.cpp/examples/cvector-generator/pca.hpp +3 -13
- package/src/llama.cpp/examples/embedding/embedding.cpp +147 -91
- package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +37 -37
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +39 -38
- package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +14 -39
- package/src/llama.cpp/examples/{baby-llama → gen-docs}/CMakeLists.txt +2 -2
- package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +83 -0
- package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +58 -39
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +46 -39
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +75 -69
- package/src/llama.cpp/examples/infill/infill.cpp +131 -192
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +276 -178
- package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +40 -36
- package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
- package/src/llama.cpp/examples/llava/clip.cpp +686 -150
- package/src/llama.cpp/examples/llava/clip.h +11 -2
- package/src/llama.cpp/examples/llava/llava-cli.cpp +60 -71
- package/src/llama.cpp/examples/llava/llava.cpp +146 -26
- package/src/llama.cpp/examples/llava/llava.h +2 -3
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +323 -0
- package/src/llama.cpp/examples/llava/requirements.txt +1 -0
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +55 -56
- package/src/llama.cpp/examples/lookup/lookup-create.cpp +15 -13
- package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
- package/src/llama.cpp/examples/lookup/lookup-stats.cpp +34 -33
- package/src/llama.cpp/examples/lookup/lookup.cpp +60 -63
- package/src/llama.cpp/examples/main/main.cpp +216 -313
- package/src/llama.cpp/examples/parallel/parallel.cpp +58 -59
- package/src/llama.cpp/examples/passkey/passkey.cpp +53 -61
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +277 -311
- package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/quantize/quantize.cpp +27 -9
- package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +12 -12
- package/src/llama.cpp/examples/retrieval/retrieval.cpp +57 -52
- package/src/llama.cpp/examples/rpc/rpc-server.cpp +27 -2
- package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +60 -46
- package/src/llama.cpp/examples/server/CMakeLists.txt +7 -18
- package/src/llama.cpp/examples/server/server.cpp +1347 -1531
- package/src/llama.cpp/examples/server/tests/requirements.txt +2 -1
- package/src/llama.cpp/examples/server/utils.hpp +396 -107
- package/src/llama.cpp/examples/simple/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/simple/simple.cpp +132 -106
- package/src/llama.cpp/examples/simple-chat/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +197 -0
- package/src/llama.cpp/examples/speculative/speculative.cpp +153 -124
- package/src/llama.cpp/examples/sycl/run-llama2.sh +10 -19
- package/src/llama.cpp/examples/sycl/win-run-llama2.bat +1 -1
- package/src/llama.cpp/examples/tokenize/tokenize.cpp +27 -29
- package/src/llama.cpp/ggml/CMakeLists.txt +29 -12
- package/src/llama.cpp/ggml/include/ggml-alloc.h +3 -3
- package/src/llama.cpp/ggml/include/ggml-amx.h +25 -0
- package/src/llama.cpp/ggml/include/ggml-backend.h +166 -68
- package/src/llama.cpp/ggml/include/ggml-blas.h +5 -3
- package/src/llama.cpp/ggml/include/ggml-cann.h +17 -19
- package/src/llama.cpp/ggml/include/ggml-cpp.h +38 -0
- package/src/llama.cpp/ggml/include/ggml-cpu.h +177 -0
- package/src/llama.cpp/ggml/include/ggml-cuda.h +17 -17
- package/src/llama.cpp/ggml/include/ggml-kompute.h +7 -3
- package/src/llama.cpp/ggml/include/ggml-metal.h +13 -12
- package/src/llama.cpp/ggml/include/ggml-opt.h +216 -0
- package/src/llama.cpp/ggml/include/ggml-rpc.h +9 -5
- package/src/llama.cpp/ggml/include/ggml-sycl.h +18 -11
- package/src/llama.cpp/ggml/include/ggml-vulkan.h +10 -8
- package/src/llama.cpp/ggml/include/ggml.h +272 -505
- package/src/llama.cpp/ggml/src/CMakeLists.txt +69 -1110
- package/src/llama.cpp/ggml/src/ggml-aarch64.c +52 -2116
- package/src/llama.cpp/ggml/src/ggml-aarch64.h +0 -20
- package/src/llama.cpp/ggml/src/ggml-alloc.c +29 -27
- package/src/llama.cpp/ggml/src/ggml-amx/CMakeLists.txt +107 -0
- package/src/llama.cpp/ggml/src/ggml-amx/common.h +94 -0
- package/src/llama.cpp/ggml/src/ggml-amx/ggml-amx.cpp +446 -0
- package/src/llama.cpp/ggml/src/ggml-amx/mmq.cpp +2510 -0
- package/src/llama.cpp/ggml/src/ggml-amx/mmq.h +17 -0
- package/src/llama.cpp/ggml/src/ggml-backend-impl.h +144 -81
- package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +195 -0
- package/src/llama.cpp/ggml/src/{ggml-backend.c → ggml-backend.cpp} +394 -635
- package/src/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +91 -0
- package/src/llama.cpp/ggml/src/{ggml-blas.cpp → ggml-blas/ggml-blas.cpp} +217 -70
- package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +46 -0
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +4 -27
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +32 -4
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +179 -41
- package/src/llama.cpp/ggml/src/ggml-cann/common.h +1 -0
- package/src/llama.cpp/ggml/src/{ggml-cann.cpp → ggml-cann/ggml-cann.cpp} +458 -353
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -1
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +2 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +278 -0
- package/src/llama.cpp/ggml/src/ggml-common.h +20 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +261 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.c +3560 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +30 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +371 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +10822 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.h +63 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +13970 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +663 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +1885 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +155 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +14 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +178 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +134 -0
- package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +106 -0
- package/src/llama.cpp/ggml/src/ggml-impl.h +380 -584
- package/src/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +162 -0
- package/src/llama.cpp/ggml/src/{ggml-kompute.cpp → ggml-kompute/ggml-kompute.cpp} +233 -87
- package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +108 -0
- package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +249 -0
- package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +100 -0
- package/src/llama.cpp/ggml/src/ggml-opt.cpp +867 -0
- package/src/llama.cpp/ggml/src/ggml-quants.c +369 -9994
- package/src/llama.cpp/ggml/src/ggml-quants.h +78 -110
- package/src/llama.cpp/ggml/src/ggml-rpc/CMakeLists.txt +11 -0
- package/src/llama.cpp/ggml/src/{ggml-rpc.cpp → ggml-rpc/ggml-rpc.cpp} +560 -335
- package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +81 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +6 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +51 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +310 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +1 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/conv.cpp +99 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/conv.hpp +21 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +57 -57
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +106 -106
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +4 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +18 -25
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +1011 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +76 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +101 -0
- package/src/llama.cpp/ggml/src/{ggml-sycl.cpp → ggml-sycl/ggml-sycl.cpp} +3350 -3980
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +125 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +23 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +70 -68
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +9 -6
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +56 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.hpp +11 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +8 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +71 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.hpp +21 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +4 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +138 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +10 -0
- package/src/llama.cpp/ggml/src/ggml-threading.cpp +12 -0
- package/src/llama.cpp/ggml/src/ggml-threading.h +12 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +78 -0
- package/src/llama.cpp/ggml/src/{ggml-vulkan.cpp → ggml-vulkan/ggml-vulkan.cpp} +2034 -1718
- package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/CMakeLists.txt +2 -0
- package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/vulkan-shaders-gen.cpp +152 -185
- package/src/llama.cpp/ggml/src/ggml.c +2075 -16579
- package/src/llama.cpp/include/llama.h +296 -285
- package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.out +46 -0
- package/src/llama.cpp/pocs/vdot/q8dot.cpp +4 -3
- package/src/llama.cpp/pocs/vdot/vdot.cpp +8 -7
- package/src/llama.cpp/requirements/requirements-convert_legacy_llama.txt +1 -1
- package/src/llama.cpp/src/CMakeLists.txt +2 -1
- package/src/llama.cpp/src/llama-grammar.cpp +721 -122
- package/src/llama.cpp/src/llama-grammar.h +120 -15
- package/src/llama.cpp/src/llama-impl.h +156 -1
- package/src/llama.cpp/src/llama-sampling.cpp +2058 -346
- package/src/llama.cpp/src/llama-sampling.h +39 -47
- package/src/llama.cpp/src/llama-vocab.cpp +390 -127
- package/src/llama.cpp/src/llama-vocab.h +60 -20
- package/src/llama.cpp/src/llama.cpp +6215 -3263
- package/src/llama.cpp/src/unicode-data.cpp +6 -4
- package/src/llama.cpp/src/unicode-data.h +4 -4
- package/src/llama.cpp/src/unicode.cpp +15 -7
- package/src/llama.cpp/tests/CMakeLists.txt +4 -2
- package/src/llama.cpp/tests/test-arg-parser.cpp +131 -0
- package/src/llama.cpp/tests/test-backend-ops.cpp +1725 -297
- package/src/llama.cpp/tests/test-barrier.cpp +94 -0
- package/src/llama.cpp/tests/test-chat-template.cpp +9 -5
- package/src/llama.cpp/tests/test-grammar-integration.cpp +23 -38
- package/src/llama.cpp/tests/test-grammar-parser.cpp +6 -4
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +23 -8
- package/src/llama.cpp/tests/test-llama-grammar.cpp +9 -8
- package/src/llama.cpp/tests/test-log.cpp +39 -0
- package/src/llama.cpp/tests/test-opt.cpp +853 -142
- package/src/llama.cpp/tests/test-quantize-fns.cpp +28 -19
- package/src/llama.cpp/tests/test-quantize-perf.cpp +16 -14
- package/src/llama.cpp/tests/test-rope.cpp +2 -1
- package/src/llama.cpp/tests/test-sampling.cpp +226 -142
- package/src/llama.cpp/tests/test-tokenizer-0.cpp +56 -36
- package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +5 -5
- package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +5 -5
- package/patches/llama.patch +0 -22
- package/src/llama.cpp/.github/workflows/bench.yml +0 -310
- package/src/llama.cpp/common/grammar-parser.cpp +0 -536
- package/src/llama.cpp/common/grammar-parser.h +0 -29
- package/src/llama.cpp/common/train.cpp +0 -1513
- package/src/llama.cpp/common/train.h +0 -233
- package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +0 -1640
- package/src/llama.cpp/examples/benchmark/CMakeLists.txt +0 -6
- package/src/llama.cpp/examples/benchmark/benchmark-matmult.cpp +0 -275
- package/src/llama.cpp/ggml/src/llamafile/sgemm.cpp +0 -1027
- package/src/llama.cpp/tests/test-grad0.cpp +0 -1566
- /package/src/llama.cpp/ggml/{cmake → src/ggml-cpu/cmake}/FindSIMD.cmake +0 -0
- /package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.h +0 -0
Diff of package/src/llama.cpp/common/common.h (+234 -106); the old side of several changed lines is truncated in the source diff view:

@@ -4,18 +4,9 @@
 
 #include "llama.h"
 
-#include "sampling.h"
-
-#define LOG_NO_FILE_LINE_FUNCTION
-#include "log.h"
-
-#include <cmath>
 #include <string>
 #include <vector>
-#include <
-#include <thread>
-#include <unordered_map>
-#include <tuple>
+#include <sstream>
 
 #ifdef _WIN32
 #define DIRECTORY_SEPARATOR '\\'
@@ -33,40 +24,138 @@
 
 #define DEFAULT_MODEL_PATH "models/7B/ggml-model-f16.gguf"
 
+struct common_lora_adapter_info {
+    std::string path;
+    float scale;
+};
+
+struct common_lora_adapter_container : common_lora_adapter_info {
+    struct llama_lora_adapter * adapter;
+};
+
 // build info
 extern int LLAMA_BUILD_NUMBER;
 extern char const * LLAMA_COMMIT;
 extern char const * LLAMA_COMPILER;
 extern char const * LLAMA_BUILD_TARGET;
 
-struct
+struct common_control_vector_load_info;
 
 //
 // CPU utils
 //
 
+struct cpu_params {
+    int n_threads = -1;
+    bool cpumask[GGML_MAX_N_THREADS] = {false}; // CPU affinity mask.
+    bool mask_valid = false; // Default: any CPU
+    enum ggml_sched_priority priority = GGML_SCHED_PRIO_NORMAL; // Scheduling prio : (0 - normal, 1 - medium, 2 - high, 3 - realtime)
+    bool strict_cpu = false; // Use strict CPU placement
+    uint32_t poll = 50; // Polling (busywait) level (0 - no polling, 100 - mostly polling)
+};
+
 int32_t cpu_get_num_physical_cores();
 int32_t cpu_get_num_math();
 
 //
-//
+// Common params
 //
 
+enum llama_example {
+    LLAMA_EXAMPLE_COMMON,
+    LLAMA_EXAMPLE_SPECULATIVE,
+    LLAMA_EXAMPLE_MAIN,
+    LLAMA_EXAMPLE_INFILL,
+    LLAMA_EXAMPLE_EMBEDDING,
+    LLAMA_EXAMPLE_PERPLEXITY,
+    LLAMA_EXAMPLE_RETRIEVAL,
+    LLAMA_EXAMPLE_PASSKEY,
+    LLAMA_EXAMPLE_IMATRIX,
+    LLAMA_EXAMPLE_BENCH,
+    LLAMA_EXAMPLE_SERVER,
+    LLAMA_EXAMPLE_CVECTOR_GENERATOR,
+    LLAMA_EXAMPLE_EXPORT_LORA,
+    LLAMA_EXAMPLE_LLAVA,
+    LLAMA_EXAMPLE_LOOKUP,
+    LLAMA_EXAMPLE_PARALLEL,
+
+    LLAMA_EXAMPLE_COUNT,
+};
+
+enum common_sampler_type {
+    COMMON_SAMPLER_TYPE_NONE = 0,
+    COMMON_SAMPLER_TYPE_DRY = 1,
+    COMMON_SAMPLER_TYPE_TOP_K = 2,
+    COMMON_SAMPLER_TYPE_TOP_P = 3,
+    COMMON_SAMPLER_TYPE_MIN_P = 4,
+    //COMMON_SAMPLER_TYPE_TFS_Z = 5,
+    COMMON_SAMPLER_TYPE_TYPICAL_P = 6,
+    COMMON_SAMPLER_TYPE_TEMPERATURE = 7,
+    COMMON_SAMPLER_TYPE_XTC = 8,
+    COMMON_SAMPLER_TYPE_INFILL = 9,
+};
+
 // dimensionality reduction methods, used by cvector-generator
 enum dimre_method {
     DIMRE_METHOD_PCA,
     DIMRE_METHOD_MEAN,
 };
 
-
-
+// sampler parameters
+struct common_sampler_params {
+    uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler
+
+    int32_t n_prev = 64; // number of previous tokens to remember
+    int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
+    int32_t min_keep = 0; // 0 = disabled, otherwise samplers should return at least min_keep tokens
+    int32_t top_k = 40; // <= 0 to use vocab size
+    float top_p = 0.95f; // 1.0 = disabled
+    float min_p = 0.05f; // 0.0 = disabled
+    float xtc_probability = 0.00f; // 0.0 = disabled
+    float xtc_threshold = 0.10f; // > 0.5 disables XTC
+    float typ_p = 1.00f; // typical_p, 1.0 = disabled
+    float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
+    float dynatemp_range = 0.00f; // 0.0 = disabled
+    float dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler
+    int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
+    float penalty_repeat = 1.00f; // 1.0 = disabled
+    float penalty_freq = 0.00f; // 0.0 = disabled
+    float penalty_present = 0.00f; // 0.0 = disabled
+    float dry_multiplier = 0.0f; // 0.0 = disabled; DRY repetition penalty for tokens extending repetition:
+    float dry_base = 1.75f; // 0.0 = disabled; multiplier * base ^ (length of sequence before token - allowed length)
+    int32_t dry_allowed_length = 2; // tokens extending repetitions beyond this receive penalty
+    int32_t dry_penalty_last_n = -1; // how many tokens to scan for repetitions (0 = disable penalty, -1 = context size)
+    int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
+    float mirostat_tau = 5.00f; // target entropy
+    float mirostat_eta = 0.10f; // learning rate
+    bool penalize_nl = false; // consider newlines as a repeatable token
+    bool ignore_eos = false;
+    bool no_perf = false; // disable performance metrics
+
+    std::vector<std::string> dry_sequence_breakers = {"\n", ":", "\"", "*"}; // default sequence breakers for DRY
+
+
+    std::vector<enum common_sampler_type> samplers = {
+        COMMON_SAMPLER_TYPE_DRY,
+        COMMON_SAMPLER_TYPE_TOP_K,
+        COMMON_SAMPLER_TYPE_TYPICAL_P,
+        COMMON_SAMPLER_TYPE_TOP_P,
+        COMMON_SAMPLER_TYPE_MIN_P,
+        COMMON_SAMPLER_TYPE_XTC,
+        COMMON_SAMPLER_TYPE_TEMPERATURE,
+    };
+
+    std::string grammar; // optional BNF-like grammar to constrain sampling
+
+    std::vector<llama_logit_bias> logit_bias; // logit biases to apply
+
+    // print the parameters into a string
+    std::string print() const;
+};
 
-
-    int32_t n_threads_draft = -1;
-    int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads)
-    int32_t n_threads_batch_draft = -1;
+struct common_params {
     int32_t n_predict = -1; // new tokens to predict
-    int32_t n_ctx =
+    int32_t n_ctx = 4096; // context size
     int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
     int32_t n_ubatch = 512; // physical batch size for prompt processing (must be >=32 to use BLAS)
     int32_t n_keep = 0; // number of tokens to keep from initial prompt
@@ -89,7 +178,12 @@ struct gpt_params {
     float yarn_beta_fast = 32.0f; // YaRN low correction dim
     float yarn_beta_slow = 1.0f; // YaRN high correction dim
     int32_t yarn_orig_ctx = 0; // YaRN original context length
-    float defrag_thold =
+    float defrag_thold = 0.1f; // KV cache defragmentation threshold
+
+    struct cpu_params cpuparams;
+    struct cpu_params cpuparams_batch;
+    struct cpu_params draft_cpuparams;
+    struct cpu_params draft_cpuparams_batch;
 
     ggml_backend_sched_eval_callback cb_eval = nullptr;
     void * cb_eval_user_data = nullptr;
@@ -101,35 +195,33 @@ struct gpt_params {
     enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
     enum llama_attention_type attention_type = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings
 
-
-
-
-    std::string
-    std::string
-    std::string
-    std::string
-    std::string
-    std::string
-    std::string
-    std::string
-    std::string
-    std::string
-    std::string
-    std::string
-    std::string
-    std::string
-    std::string
-    std::string logits_file = ""; // file for saving *all* logits
-    std::string rpc_servers = ""; // comma separated list of RPC servers
+    struct common_sampler_params sparams;
+
+    std::string model = ""; // model path // NOLINT
+    std::string model_draft = ""; // draft model for speculative decoding // NOLINT
+    std::string model_alias = "unknown"; // model alias // NOLINT
+    std::string model_url = ""; // model url to download // NOLINT
+    std::string hf_token = ""; // HF token // NOLINT
+    std::string hf_repo = ""; // HF repo // NOLINT
+    std::string hf_file = ""; // HF file // NOLINT
+    std::string prompt = ""; // NOLINT
+    std::string prompt_file = ""; // store the external prompt file name // NOLINT
+    std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state // NOLINT
+    std::string input_prefix = ""; // string to prefix user inputs with // NOLINT
+    std::string input_suffix = ""; // string to suffix user inputs with // NOLINT
+    std::string lookup_cache_static = ""; // path of static ngram cache file for lookup decoding // NOLINT
+    std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding // NOLINT
+    std::string logits_file = ""; // file for saving *all* logits // NOLINT
+    std::string rpc_servers = ""; // comma separated list of RPC servers // NOLINT
 
     std::vector<std::string> in_files; // all input files
     std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
     std::vector<llama_model_kv_override> kv_overrides;
 
-    //
-    std::vector<
+    bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_lora_adapter_apply)
+    std::vector<common_lora_adapter_info> lora_adapters; // lora adapter path with user defined scale
 
-    std::vector<
+    std::vector<common_control_vector_load_info> control_vectors; // control vector with user defined scale
 
     int32_t verbosity = 0;
     int32_t control_vector_layer_start = -1; // layer range for control vector
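The sampling fields that used to sit directly on gpt_params are now grouped into common_sampler_params and embedded as common_params::sparams (see the hunks above). A minimal sketch of how a caller might tune them, assuming only the field names and defaults shown in this diff; the specific values are illustrative:

    #include "common.h"

    static common_params make_params() {
        common_params params;
        params.n_ctx                   = 4096;   // new default shown above
        params.sparams.temp            = 0.7f;   // <= 0.0 samples greedily
        params.sparams.top_k           = 50;
        params.sparams.xtc_probability = 0.5f;   // 0.0 = disabled
        // sampler chain order is now an explicit list
        params.sparams.samplers = {
            COMMON_SAMPLER_TYPE_TOP_K,
            COMMON_SAMPLER_TYPE_XTC,
            COMMON_SAMPLER_TYPE_TEMPERATURE,
        };
        return params;
    }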
@@ -164,15 +256,15 @@ struct gpt_params {
     bool simple_io = false; // improves compatibility with subprocesses and limited consoles
     bool cont_batching = true; // insert new sequences for decoding on-the-fly
     bool flash_attn = false; // flash attention
+    bool no_perf = false; // disable performance metrics
+    bool ctx_shift = true; // context shift on inifinite text generation
 
     bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
-    bool ignore_eos = false; // ignore generated EOS tokens
     bool logits_all = false; // return logits for all tokens in the batch
     bool use_mmap = true; // use mmap for faster loads
     bool use_mlock = false; // use mlock to keep model in memory
     bool verbose_prompt = false; // print prompt tokens before generation
     bool display_prompt = true; // print prompt before generation
-    bool infill = false; // use infill mode
     bool dump_kv_cache = false; // dump the KV cache contents for debugging purposes
     bool no_kv_offload = false; // disable KV offloading
     bool warmup = true; // warmup run
@@ -182,33 +274,37 @@ struct gpt_params {
     std::string cache_type_v = "f16"; // KV cache data type for the V
 
     // multimodal models (see examples/llava)
-    std::string mmproj = ""; // path to multimodal projector
+    std::string mmproj = ""; // path to multimodal projector // NOLINT
     std::vector<std::string> image; // path to image file(s)
 
     // embedding
     bool embedding = false; // get only sentence embedding
-    int32_t embd_normalize = 2; // normalisation for
+    int32_t embd_normalize = 2; // normalisation for embeddings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
     std::string embd_out = ""; // empty = default, "array" = [[],[]...], "json" = openai style, "json+" = same "json" + cosine similarity matrix
-    std::string embd_sep = "\n"; // separator of
+    std::string embd_sep = "\n"; // separator of embeddings
+    bool reranking = false; // enable reranking support on server
 
     // server params
     int32_t port = 8080; // server listens on this network port
     int32_t timeout_read = 600; // http read timeout in seconds
     int32_t timeout_write = timeout_read; // http write timeout in seconds
-    int32_t n_threads_http = -1; // number of threads to process HTTP requests
+    int32_t n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool)
+    int32_t n_cache_reuse = 0; // min chunk size to reuse from the cache via KV shifting
 
     std::string hostname = "127.0.0.1";
-    std::string public_path = "";
-    std::string chat_template = "";
-    std::string system_prompt = "";
+    std::string public_path = ""; // NOLINT
+    std::string chat_template = ""; // NOLINT
     bool enable_chat_template = true;
 
     std::vector<std::string> api_keys;
 
-    std::string ssl_file_key = "";
-    std::string ssl_file_cert = "";
+    std::string ssl_file_key = ""; // NOLINT
+    std::string ssl_file_cert = ""; // NOLINT
 
-
+    // "advanced" endpoints are disabled by default for better security
+    bool webui = true;
+    bool endpoint_slots = false;
+    bool endpoint_props = false; // only control POST requests, not GET
     bool endpoint_metrics = false;
 
     bool log_json = false;
@@ -256,29 +352,47 @@ struct gpt_params {
     bool spm_infill = false; // suffix/prefix/middle pattern for infill
 
     std::string lora_outfile = "ggml-lora-merged-f16.gguf";
+
+    // batched-bench params
+    bool batched_bench_output_jsonl = false;
 };
 
-
-
+// call once at the start of a program if it uses libcommon
+// initializes the logging system and prints info about the build
+void common_init();
 
-
-bool gpt_params_parse (int argc, char ** argv, gpt_params & params);
-bool gpt_params_find_arg (int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param);
-void gpt_params_print_usage(int argc, char ** argv, const gpt_params & params);
+std::string common_params_get_system_info(const common_params & params);
 
-std::string
+bool parse_cpu_range(const std::string & range, bool(&boolmask)[GGML_MAX_N_THREADS]);
+bool parse_cpu_mask(const std::string & mask, bool(&boolmask)[GGML_MAX_N_THREADS]);
+void postprocess_cpu_params(cpu_params & cpuparams, const cpu_params * role_model = nullptr);
+bool set_process_priority(enum ggml_sched_priority prio);
 
 //
 // String utils
 //
 
-
+#ifdef __GNUC__
+#ifdef __MINGW32__
+#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
+#else
+#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
+#endif
+#else
+#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...)
+#endif
+
+LLAMA_COMMON_ATTRIBUTE_FORMAT(1, 2)
+std::string string_format(const char * fmt, ...);
 
 std::string string_strip(const std::string & str);
 std::string string_get_sortable_timestamp();
 
+void string_replace_all(std::string & s, const std::string & search, const std::string & replace);
+
 template<class T>
 static std::vector<T> string_split(const std::string & str, char delim) {
+    static_assert(!std::is_same<T, std::string>::value, "Please use the specialized version for std::string");
     std::vector<T> values;
     std::istringstream str_stream(str);
     std::string token;
@@ -291,9 +405,30 @@ static std::vector<T> string_split(const std::string & str, char delim) {
     return values;
 }
 
+template<>
+std::vector<std::string> string_split<std::string>(const std::string & input, char separator)
+{
+    std::vector<std::string> parts;
+    size_t begin_pos = 0;
+    size_t separator_pos = input.find(separator);
+    while (separator_pos != std::string::npos) {
+        std::string part = input.substr(begin_pos, separator_pos - begin_pos);
+        parts.emplace_back(part);
+        begin_pos = separator_pos + 1;
+        separator_pos = input.find(separator, begin_pos);
+    }
+    parts.emplace_back(input.substr(begin_pos, separator_pos - begin_pos));
+    return parts;
+}
+
 bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
 void string_process_escapes(std::string & input);
 
+std::string string_from(bool value);
+std::string string_from(const std::vector<int> & values);
+std::string string_from(const struct llama_context * ctx, const std::vector<llama_token> & tokens);
+std::string string_from(const struct llama_context * ctx, const struct llama_batch & batch);
+
 //
 // Filesystem utils
 //
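The new string_split<std::string> specialization above keeps empty fields rather than dropping them, and the static_assert added to the generic template steers std::string callers toward it. A standalone illustration of that splitting behaviour (not part of the package; the helper below simply mirrors the logic of the specialization shown above):

    #include <iostream>
    #include <string>
    #include <vector>

    // mirrors string_split<std::string>: empty fields between separators are preserved
    static std::vector<std::string> split_like_common(const std::string & input, char separator) {
        std::vector<std::string> parts;
        size_t begin_pos     = 0;
        size_t separator_pos = input.find(separator);
        while (separator_pos != std::string::npos) {
            parts.emplace_back(input.substr(begin_pos, separator_pos - begin_pos));
            begin_pos     = separator_pos + 1;
            separator_pos = input.find(separator, begin_pos);
        }
        parts.emplace_back(input.substr(begin_pos));
        return parts;
    }

    int main() {
        for (const auto & p : split_like_common("a,,b,", ',')) {
            std::cout << '[' << p << "] ";
        }
        std::cout << '\n'; // prints: [a] [] [b] []
    }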
@@ -308,20 +443,29 @@ std::string fs_get_cache_file(const std::string & filename);
 // Model utils
 //
 
-
-
+struct common_init_result {
+    struct llama_model * model = nullptr;
+    struct llama_context * context = nullptr;
+    std::vector<common_lora_adapter_container> lora_adapters;
+};
+
+struct common_init_result common_init_from_params(common_params & params);
+
+struct llama_model_params common_model_params_to_llama (const common_params & params);
+struct llama_context_params common_context_params_to_llama(const common_params & params);
+struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params);
 
-struct
-struct
+struct llama_model * common_load_model_from_url(const char * model_url, const char * path_model, const char * hf_token, const struct llama_model_params & params);
+struct llama_model * common_load_model_from_hf(const char * repo, const char * file, const char * path_model, const char * hf_token, const struct llama_model_params & params);
 
-
-
+// clear LoRA adapters from context, then apply new list of adapters
+void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_container> & lora_adapters);
 
 // Batch utils
 
-void
+void common_batch_clear(struct llama_batch & batch);
 
-void
+void common_batch_add(
     struct llama_batch & batch,
     llama_token id,
     llama_pos pos,
@@ -334,13 +478,13 @@ void llama_batch_add(
 
 // tokenizes a string into a vector of tokens
 // should work similar to Python's `tokenizer.encode`
-std::vector<llama_token>
+std::vector<llama_token> common_tokenize(
     const struct llama_context * ctx,
     const std::string & text,
     bool add_special,
     bool parse_special = false);
 
-std::vector<llama_token>
+std::vector<llama_token> common_tokenize(
     const struct llama_model * model,
     const std::string & text,
     bool add_special,
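The hunks above rename the old gpt_*/llama_* helpers to common_* entry points (common_init, common_init_from_params, common_tokenize, ...). A hedged sketch of the renamed flow, using only the signatures shown in this diff; the model path is a placeholder and error handling is minimal:

    #include "common.h"
    #include "llama.h"

    int main() {
        common_init(); // once per program: logging + build info, per the comment above

        common_params params;
        params.model = "models/7B/ggml-model-f16.gguf"; // DEFAULT_MODEL_PATH from this header

        common_init_result llama_init = common_init_from_params(params);
        if (llama_init.model == nullptr || llama_init.context == nullptr) {
            return 1;
        }

        // was llama_tokenize(...) before this update
        std::vector<llama_token> tokens =
            common_tokenize(llama_init.context, "Hello, world", /*add_special=*/true);

        llama_free(llama_init.context);
        llama_free_model(llama_init.model);
        return 0;
    }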
@@ -348,7 +492,7 @@ std::vector<llama_token> llama_tokenize(
 
 // tokenizes a token into a piece, optionally renders special/control tokens
 // should work similar to Python's `tokenizer.id_to_piece`
-std::string
+std::string common_token_to_piece(
     const struct llama_context * ctx,
     llama_token token,
     bool special = true);
@@ -356,45 +500,41 @@ std::string llama_token_to_piece(
 // detokenizes a vector of tokens into a string
 // should work similar to Python's `tokenizer.decode`
 // optionally renders special/control tokens
-std::string
+std::string common_detokenize(
     llama_context * ctx,
     const std::vector<llama_token> & tokens,
     bool special = true);
 
-// Uses the value from the model metadata if possible, otherwise
-// defaults to true when model type is SPM, otherwise false.
-bool llama_should_add_bos_token(const llama_model * model);
-
 //
 // Chat template utils
 //
 
 // same with llama_chat_message, but uses std::string
-struct
+struct common_chat_msg {
     std::string role;
     std::string content;
 };
 
 // Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
-bool
+bool common_chat_verify_template(const std::string & tmpl);
 
 // CPP wrapper for llama_chat_apply_template
 // If the built-in template is not supported, we default to chatml
 // If the custom "tmpl" is not supported, we throw an error
-std::string
+std::string common_chat_apply_template(const struct llama_model * model,
     const std::string & tmpl,
-    const std::vector<
+    const std::vector<common_chat_msg> & chat,
     bool add_ass);
 
 // Format single message, while taking into account the position of that message in chat history
-std::string
+std::string common_chat_format_single(const struct llama_model * model,
     const std::string & tmpl,
-    const std::vector<
-    const
+    const std::vector<common_chat_msg> & past_msg,
+    const common_chat_msg & new_msg,
     bool add_ass);
 
 // Returns an example of formatted chat
-std::string
+std::string common_chat_format_example(const struct llama_model * model,
     const std::string & tmpl);
 
 //
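The chat-template helpers follow the same rename (common_chat_msg, common_chat_apply_template, ...). A small hedged example of building a prompt with them; passing an empty tmpl is assumed here to select the model's built-in template, with chatml as the documented fallback:

    #include "common.h"

    // model is assumed to come from common_init_from_params(), as in the sketch above
    static std::string build_chat_prompt(const llama_model * model) {
        std::vector<common_chat_msg> chat = {
            { "system", "You are a helpful assistant." },
            { "user",   "Hello!"                       },
        };
        // add_ass = true appends the assistant prefix so generation can start there
        return common_chat_apply_template(model, /*tmpl=*/"", chat, /*add_ass=*/true);
    }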
@@ -402,31 +542,31 @@ std::string llama_chat_format_example(const struct llama_model * model,
 //
 
 // Dump the KV cache view with the number of sequences per cell.
-void
+void common_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size = 80);
 
 // Dump the KV cache view showing individual sequences in each cell (long output).
-void
+void common_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size = 40);
 
 //
 // Embedding utils
 //
 
-void
+void common_embd_normalize(const float * inp, float * out, int n, int embd_norm = 2);
 
-float
+float common_embd_similarity_cos(const float * embd1, const float * embd2, int n);
 
 //
 // Control vector utils
 //
 
-struct
+struct common_control_vector_data {
     int n_embd;
 
     // stores data for layers [1, n_layer] where n_layer = data.size() / n_embd
     std::vector<float> data;
 };
 
-struct
+struct common_control_vector_load_info {
     float strength;
 
     std::string fname;
@@ -434,7 +574,7 @@ struct llama_control_vector_load_info {
 
 // Load control vectors, scale each by strength, and add them together.
 // On error, returns {-1, empty}
-
+common_control_vector_data common_control_vector_load(const std::vector<common_control_vector_load_info> & load_infos);
 
 //
 // Split utils
@@ -443,15 +583,3 @@ llama_control_vector_data llama_control_vector_load(const std::vector<llama_cont
 static const char * const LLM_KV_SPLIT_NO = "split.no";
 static const char * const LLM_KV_SPLIT_COUNT = "split.count";
 static const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
-
-//
-// YAML utils
-//
-
-void yaml_dump_vector_float (FILE * stream, const char * prop_name, const std::vector<float> & data);
-void yaml_dump_vector_int (FILE * stream, const char * prop_name, const std::vector<int> & data);
-void yaml_dump_string_multiline(FILE * stream, const char * prop_name, const char * data);
-
-void yaml_dump_non_result_info(
-    FILE * stream, const gpt_params & params, const llama_context * lctx,
-    const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc);