@fugood/llama.node 0.3.0 → 0.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +1 -10
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/package.json +6 -4
- package/src/LlamaCompletionWorker.cpp +6 -6
- package/src/LlamaContext.cpp +7 -9
- package/src/common.hpp +2 -1
- package/src/llama.cpp/.github/workflows/build.yml +98 -24
- package/src/llama.cpp/.github/workflows/close-issue.yml +5 -0
- package/src/llama.cpp/.github/workflows/docker.yml +43 -34
- package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +7 -0
- package/src/llama.cpp/.github/workflows/nix-ci.yml +7 -0
- package/src/llama.cpp/.github/workflows/python-check-requirements.yml +2 -4
- package/src/llama.cpp/.github/workflows/python-type-check.yml +3 -1
- package/src/llama.cpp/.github/workflows/server.yml +7 -0
- package/src/llama.cpp/CMakeLists.txt +20 -8
- package/src/llama.cpp/common/CMakeLists.txt +12 -10
- package/src/llama.cpp/common/arg.cpp +2006 -0
- package/src/llama.cpp/common/arg.h +77 -0
- package/src/llama.cpp/common/common.cpp +496 -1632
- package/src/llama.cpp/common/common.h +161 -63
- package/src/llama.cpp/common/console.cpp +3 -0
- package/src/llama.cpp/common/log.cpp +401 -0
- package/src/llama.cpp/common/log.h +66 -698
- package/src/llama.cpp/common/ngram-cache.cpp +3 -0
- package/src/llama.cpp/common/sampling.cpp +348 -350
- package/src/llama.cpp/common/sampling.h +62 -139
- package/src/llama.cpp/common/stb_image.h +5990 -6398
- package/src/llama.cpp/common/train.cpp +2 -0
- package/src/llama.cpp/docs/build.md +36 -1
- package/src/llama.cpp/examples/CMakeLists.txt +0 -1
- package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +1 -2
- package/src/llama.cpp/examples/batched/batched.cpp +39 -55
- package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +34 -44
- package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +55 -52
- package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +15 -15
- package/src/llama.cpp/examples/cvector-generator/pca.hpp +3 -13
- package/src/llama.cpp/examples/embedding/embedding.cpp +143 -87
- package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +33 -33
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +36 -35
- package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +14 -39
- package/src/llama.cpp/examples/gen-docs/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +83 -0
- package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +58 -39
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +34 -27
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +59 -62
- package/src/llama.cpp/examples/infill/infill.cpp +117 -132
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +265 -58
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +29 -22
- package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
- package/src/llama.cpp/examples/llava/clip.cpp +685 -150
- package/src/llama.cpp/examples/llava/clip.h +11 -2
- package/src/llama.cpp/examples/llava/llava-cli.cpp +47 -58
- package/src/llama.cpp/examples/llava/llava.cpp +110 -24
- package/src/llama.cpp/examples/llava/llava.h +2 -3
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +323 -0
- package/src/llama.cpp/examples/llava/requirements.txt +1 -0
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +42 -43
- package/src/llama.cpp/examples/lookup/lookup-create.cpp +10 -8
- package/src/llama.cpp/examples/lookup/lookup-stats.cpp +23 -22
- package/src/llama.cpp/examples/lookup/lookup.cpp +40 -43
- package/src/llama.cpp/examples/main/main.cpp +210 -262
- package/src/llama.cpp/examples/parallel/parallel.cpp +49 -49
- package/src/llama.cpp/examples/passkey/passkey.cpp +42 -50
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +187 -200
- package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/quantize/quantize.cpp +27 -9
- package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +2 -3
- package/src/llama.cpp/examples/retrieval/retrieval.cpp +49 -44
- package/src/llama.cpp/examples/rpc/rpc-server.cpp +24 -1
- package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +32 -35
- package/src/llama.cpp/examples/server/CMakeLists.txt +3 -5
- package/src/llama.cpp/examples/server/server.cpp +1027 -1073
- package/src/llama.cpp/examples/server/tests/requirements.txt +2 -1
- package/src/llama.cpp/examples/server/utils.hpp +107 -105
- package/src/llama.cpp/examples/simple/simple.cpp +35 -41
- package/src/llama.cpp/examples/speculative/speculative.cpp +129 -103
- package/src/llama.cpp/examples/sycl/run-llama2.sh +10 -19
- package/src/llama.cpp/examples/sycl/win-run-llama2.bat +1 -1
- package/src/llama.cpp/examples/tokenize/tokenize.cpp +25 -27
- package/src/llama.cpp/ggml/CMakeLists.txt +14 -3
- package/src/llama.cpp/ggml/include/ggml-alloc.h +3 -3
- package/src/llama.cpp/ggml/include/ggml-backend.h +145 -60
- package/src/llama.cpp/ggml/include/ggml-blas.h +3 -3
- package/src/llama.cpp/ggml/include/ggml-cann.h +15 -19
- package/src/llama.cpp/ggml/include/ggml-cuda.h +16 -16
- package/src/llama.cpp/ggml/include/ggml-metal.h +5 -8
- package/src/llama.cpp/ggml/include/ggml-rpc.h +5 -5
- package/src/llama.cpp/ggml/include/ggml-sycl.h +8 -8
- package/src/llama.cpp/ggml/include/ggml-vulkan.h +7 -7
- package/src/llama.cpp/ggml/include/ggml.h +293 -186
- package/src/llama.cpp/ggml/src/CMakeLists.txt +86 -44
- package/src/llama.cpp/ggml/src/ggml-aarch64.c +2135 -1119
- package/src/llama.cpp/ggml/src/ggml-alloc.c +6 -0
- package/src/llama.cpp/ggml/src/ggml-backend-impl.h +152 -70
- package/src/llama.cpp/ggml/src/{ggml-backend.c → ggml-backend.cpp} +606 -286
- package/src/llama.cpp/ggml/src/ggml-blas.cpp +9 -10
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +4 -27
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +32 -4
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +179 -41
- package/src/llama.cpp/ggml/src/ggml-cann/common.h +1 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -1
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +2 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +278 -0
- package/src/llama.cpp/ggml/src/ggml-cann.cpp +215 -216
- package/src/llama.cpp/ggml/src/ggml-common.h +20 -0
- package/src/llama.cpp/ggml/src/ggml-cpu-impl.h +614 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +14 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +178 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +134 -0
- package/src/llama.cpp/ggml/src/ggml-impl.h +49 -603
- package/src/llama.cpp/ggml/src/ggml-kompute.cpp +4 -24
- package/src/llama.cpp/ggml/src/ggml-quants.c +972 -92
- package/src/llama.cpp/ggml/src/ggml-quants.h +15 -0
- package/src/llama.cpp/ggml/src/ggml-rpc.cpp +116 -66
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +3 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +11 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +52 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/conv.cpp +99 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/conv.hpp +21 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +57 -57
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +106 -106
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +4 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +16 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +101 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +125 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +23 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +6 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +2 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +71 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.hpp +21 -0
- package/src/llama.cpp/ggml/src/ggml-sycl.cpp +97 -169
- package/src/llama.cpp/ggml/src/ggml-vulkan.cpp +1508 -1124
- package/src/llama.cpp/ggml/src/ggml.c +3001 -1647
- package/src/llama.cpp/ggml/src/llamafile/sgemm.cpp +192 -0
- package/src/llama.cpp/ggml/src/vulkan-shaders/CMakeLists.txt +2 -0
- package/src/llama.cpp/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp +88 -40
- package/src/llama.cpp/include/llama.h +241 -264
- package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.out +46 -0
- package/src/llama.cpp/requirements/requirements-convert_legacy_llama.txt +1 -1
- package/src/llama.cpp/src/llama-grammar.cpp +721 -122
- package/src/llama.cpp/src/llama-grammar.h +120 -15
- package/src/llama.cpp/src/llama-impl.h +156 -1
- package/src/llama.cpp/src/llama-sampling.cpp +1375 -303
- package/src/llama.cpp/src/llama-sampling.h +20 -47
- package/src/llama.cpp/src/llama-vocab.cpp +343 -120
- package/src/llama.cpp/src/llama-vocab.h +33 -17
- package/src/llama.cpp/src/llama.cpp +4247 -1525
- package/src/llama.cpp/src/unicode-data.cpp +6 -4
- package/src/llama.cpp/src/unicode-data.h +4 -4
- package/src/llama.cpp/src/unicode.cpp +15 -7
- package/src/llama.cpp/tests/CMakeLists.txt +3 -0
- package/src/llama.cpp/tests/test-arg-parser.cpp +131 -0
- package/src/llama.cpp/tests/test-backend-ops.cpp +1592 -289
- package/src/llama.cpp/tests/test-barrier.cpp +93 -0
- package/src/llama.cpp/tests/test-grad0.cpp +187 -70
- package/src/llama.cpp/tests/test-grammar-integration.cpp +23 -38
- package/src/llama.cpp/tests/test-grammar-parser.cpp +6 -4
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +6 -4
- package/src/llama.cpp/tests/test-llama-grammar.cpp +9 -8
- package/src/llama.cpp/tests/test-log.cpp +39 -0
- package/src/llama.cpp/tests/test-quantize-fns.cpp +6 -0
- package/src/llama.cpp/tests/test-rope.cpp +1 -1
- package/src/llama.cpp/tests/test-sampling.cpp +157 -98
- package/src/llama.cpp/tests/test-tokenizer-0.cpp +55 -35
- package/patches/llama.patch +0 -22
- package/src/llama.cpp/.github/workflows/bench.yml +0 -310
- package/src/llama.cpp/common/grammar-parser.cpp +0 -536
- package/src/llama.cpp/common/grammar-parser.h +0 -29
- package/src/llama.cpp/examples/benchmark/CMakeLists.txt +0 -6
- package/src/llama.cpp/examples/benchmark/benchmark-matmult.cpp +0 -275
package/src/llama.cpp/common/common.h

@@ -4,18 +4,9 @@
 
 #include "llama.h"
 
-#include "sampling.h"
-
-#define LOG_NO_FILE_LINE_FUNCTION
-#include "log.h"
-
-#include <cmath>
 #include <string>
 #include <vector>
-#include <
-#include <thread>
-#include <unordered_map>
-#include <tuple>
+#include <sstream>
 
 #ifdef _WIN32
 #define DIRECTORY_SEPARATOR '\\'
@@ -33,6 +24,15 @@
 
 #define DEFAULT_MODEL_PATH "models/7B/ggml-model-f16.gguf"
 
+struct llama_lora_adapter_info {
+    std::string path;
+    float scale;
+};
+
+struct llama_lora_adapter_container : llama_lora_adapter_info {
+    struct llama_lora_adapter * adapter;
+};
+
 // build info
 extern int LLAMA_BUILD_NUMBER;
 extern char const * LLAMA_COMMIT;
@@ -45,26 +45,103 @@ struct llama_control_vector_load_info;
 // CPU utils
 //
 
+struct cpu_params {
+    int n_threads = -1;
+    bool cpumask[GGML_MAX_N_THREADS] = {false}; // CPU affinity mask.
+    bool mask_valid = false; // Default: any CPU
+    enum ggml_sched_priority priority = GGML_SCHED_PRIO_NORMAL; // Scheduling prio : (0 - normal, 1 - medium, 2 - high, 3 - realtime)
+    bool strict_cpu = false; // Use strict CPU placement
+    uint32_t poll = 50; // Polling (busywait) level (0 - no polling, 100 - mostly polling)
+};
+
 int32_t cpu_get_num_physical_cores();
 int32_t cpu_get_num_math();
 
 //
-//
+// Common params
 //
 
+enum llama_example {
+    LLAMA_EXAMPLE_COMMON,
+    LLAMA_EXAMPLE_SPECULATIVE,
+    LLAMA_EXAMPLE_MAIN,
+    LLAMA_EXAMPLE_INFILL,
+    LLAMA_EXAMPLE_EMBEDDING,
+    LLAMA_EXAMPLE_PERPLEXITY,
+    LLAMA_EXAMPLE_RETRIEVAL,
+    LLAMA_EXAMPLE_PASSKEY,
+    LLAMA_EXAMPLE_IMATRIX,
+    LLAMA_EXAMPLE_BENCH,
+    LLAMA_EXAMPLE_SERVER,
+    LLAMA_EXAMPLE_CVECTOR_GENERATOR,
+    LLAMA_EXAMPLE_EXPORT_LORA,
+    LLAMA_EXAMPLE_LLAVA,
+    LLAMA_EXAMPLE_LOOKUP,
+    LLAMA_EXAMPLE_PARALLEL,
+
+    LLAMA_EXAMPLE_COUNT,
+};
+
+enum gpt_sampler_type {
+    GPT_SAMPLER_TYPE_NONE = 0,
+    GPT_SAMPLER_TYPE_TOP_K = 1,
+    GPT_SAMPLER_TYPE_TOP_P = 2,
+    GPT_SAMPLER_TYPE_MIN_P = 3,
+    GPT_SAMPLER_TYPE_TFS_Z = 4,
+    GPT_SAMPLER_TYPE_TYPICAL_P = 5,
+    GPT_SAMPLER_TYPE_TEMPERATURE = 6,
+};
+
 // dimensionality reduction methods, used by cvector-generator
 enum dimre_method {
     DIMRE_METHOD_PCA,
     DIMRE_METHOD_MEAN,
 };
 
-
-
+// sampler parameters
+struct gpt_sampler_params {
+    uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler
+
+    int32_t n_prev = 64; // number of previous tokens to remember
+    int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
+    int32_t min_keep = 0; // 0 = disabled, otherwise samplers should return at least min_keep tokens
+    int32_t top_k = 40; // <= 0 to use vocab size
+    float top_p = 0.95f; // 1.0 = disabled
+    float min_p = 0.05f; // 0.0 = disabled
+    float tfs_z = 1.00f; // 1.0 = disabled
+    float typ_p = 1.00f; // typical_p, 1.0 = disabled
+    float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
+    float dynatemp_range = 0.00f; // 0.0 = disabled
+    float dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler
+    int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
+    float penalty_repeat = 1.00f; // 1.0 = disabled
+    float penalty_freq = 0.00f; // 0.0 = disabled
+    float penalty_present = 0.00f; // 0.0 = disabled
+    int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
+    float mirostat_tau = 5.00f; // target entropy
+    float mirostat_eta = 0.10f; // learning rate
+    bool penalize_nl = false; // consider newlines as a repeatable token
+    bool ignore_eos = false;
+    bool no_perf = false; // disable performance metrics
+
+    std::vector<enum gpt_sampler_type> samplers = {
+        GPT_SAMPLER_TYPE_TOP_K,
+        GPT_SAMPLER_TYPE_TFS_Z,
+        GPT_SAMPLER_TYPE_TYPICAL_P,
+        GPT_SAMPLER_TYPE_TOP_P,
+        GPT_SAMPLER_TYPE_MIN_P,
+        GPT_SAMPLER_TYPE_TEMPERATURE
+    };
+
+    std::string grammar; // optional BNF-like grammar to constrain sampling
+
+    std::vector<llama_logit_bias> logit_bias; // logit biases to apply
+
+    // print the parameters into a string
+    std::string print() const;
+};
 
-
-    int32_t n_threads_draft = -1;
-    int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads)
-    int32_t n_threads_batch_draft = -1;
+struct gpt_params {
     int32_t n_predict = -1; // new tokens to predict
     int32_t n_ctx = 0; // context size
     int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
@@ -91,6 +168,11 @@ struct gpt_params {
     int32_t yarn_orig_ctx = 0; // YaRN original context length
     float defrag_thold = -1.0f; // KV cache defragmentation threshold
 
+    struct cpu_params cpuparams;
+    struct cpu_params cpuparams_batch;
+    struct cpu_params draft_cpuparams;
+    struct cpu_params draft_cpuparams_batch;
+
     ggml_backend_sched_eval_callback cb_eval = nullptr;
     void * cb_eval_user_data = nullptr;
 
@@ -101,33 +183,32 @@ struct gpt_params {
     enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
     enum llama_attention_type attention_type = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings
 
-
-
-
-    std::string
-    std::string
-    std::string
-    std::string
-    std::string
-    std::string
-    std::string
-    std::string
-    std::string
-    std::string
-    std::string
-    std::string
-    std::string
-    std::string
-    std::string
-    std::string
-    std::string rpc_servers = ""; // comma separated list of RPC servers
+    struct gpt_sampler_params sparams;
+
+    std::string model = ""; // model path // NOLINT
+    std::string model_draft = ""; // draft model for speculative decoding // NOLINT
+    std::string model_alias = "unknown"; // model alias // NOLINT
+    std::string model_url = ""; // model url to download // NOLINT
+    std::string hf_token = ""; // HF token // NOLINT
+    std::string hf_repo = ""; // HF repo // NOLINT
+    std::string hf_file = ""; // HF file // NOLINT
+    std::string prompt = ""; // NOLINT
+    std::string prompt_file = ""; // store the external prompt file name // NOLINT
+    std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state // NOLINT
+    std::string input_prefix = ""; // string to prefix user inputs with // NOLINT
+    std::string input_suffix = ""; // string to suffix user inputs with // NOLINT
+    std::string logdir = ""; // directory in which to save YAML log files // NOLINT
+    std::string lookup_cache_static = ""; // path of static ngram cache file for lookup decoding // NOLINT
+    std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding // NOLINT
+    std::string logits_file = ""; // file for saving *all* logits // NOLINT
+    std::string rpc_servers = ""; // comma separated list of RPC servers // NOLINT
 
     std::vector<std::string> in_files; // all input files
     std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
     std::vector<llama_model_kv_override> kv_overrides;
 
-    //
-    std::vector<
+    bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_lora_adapter_apply)
+    std::vector<llama_lora_adapter_info> lora_adapters; // lora adapter path with user defined scale
 
     std::vector<llama_control_vector_load_info> control_vectors; // control vector with user defined scale
 
@@ -164,15 +245,15 @@ struct gpt_params {
     bool simple_io = false; // improves compatibility with subprocesses and limited consoles
     bool cont_batching = true; // insert new sequences for decoding on-the-fly
     bool flash_attn = false; // flash attention
+    bool no_perf = false; // disable performance metrics
+    bool ctx_shift = true; // context shift on inifinite text generation
 
     bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
-    bool ignore_eos = false; // ignore generated EOS tokens
     bool logits_all = false; // return logits for all tokens in the batch
     bool use_mmap = true; // use mmap for faster loads
     bool use_mlock = false; // use mlock to keep model in memory
     bool verbose_prompt = false; // print prompt tokens before generation
     bool display_prompt = true; // print prompt before generation
-    bool infill = false; // use infill mode
     bool dump_kv_cache = false; // dump the KV cache contents for debugging purposes
     bool no_kv_offload = false; // disable KV offloading
     bool warmup = true; // warmup run
@@ -182,7 +263,7 @@ struct gpt_params {
     std::string cache_type_v = "f16"; // KV cache data type for the V
 
     // multimodal models (see examples/llava)
-    std::string mmproj = ""; // path to multimodal projector
+    std::string mmproj = ""; // path to multimodal projector // NOLINT
     std::vector<std::string> image; // path to image file(s)
 
     // embedding
@@ -190,23 +271,24 @@ struct gpt_params {
     int32_t embd_normalize = 2; // normalisation for embendings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
     std::string embd_out = ""; // empty = default, "array" = [[],[]...], "json" = openai style, "json+" = same "json" + cosine similarity matrix
     std::string embd_sep = "\n"; // separator of embendings
+    bool reranking = false; // enable reranking support on server
 
     // server params
     int32_t port = 8080; // server listens on this network port
     int32_t timeout_read = 600; // http read timeout in seconds
     int32_t timeout_write = timeout_read; // http write timeout in seconds
-
+    int n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool)
 
     std::string hostname = "127.0.0.1";
-    std::string public_path = "";
-    std::string chat_template = "";
-    std::string system_prompt = "";
+    std::string public_path = ""; // NOLINT
+    std::string chat_template = ""; // NOLINT
+    std::string system_prompt = ""; // NOLINT
     bool enable_chat_template = true;
 
     std::vector<std::string> api_keys;
 
-    std::string ssl_file_key = "";
-    std::string ssl_file_cert = "";
+    std::string ssl_file_key = ""; // NOLINT
+    std::string ssl_file_cert = ""; // NOLINT
 
     bool endpoint_slots = true;
     bool endpoint_metrics = false;
@@ -256,18 +338,22 @@ struct gpt_params {
     bool spm_infill = false; // suffix/prefix/middle pattern for infill
 
     std::string lora_outfile = "ggml-lora-merged-f16.gguf";
-};
 
-
-
+    // batched-bench params
+    bool batched_bench_output_jsonl = false;
+};
 
-
-
-
-void gpt_params_print_usage(int argc, char ** argv, const gpt_params & params);
+// call once at the start of a program if it uses libcommon
+// initializes the logging system and prints info about the build
+void gpt_init();
 
 std::string gpt_params_get_system_info(const gpt_params & params);
 
+bool parse_cpu_range(const std::string& range, bool(&boolmask)[GGML_MAX_N_THREADS]);
+bool parse_cpu_mask(const std::string& mask, bool(&boolmask)[GGML_MAX_N_THREADS]);
+void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role_model = nullptr);
+bool set_process_priority(enum ggml_sched_priority prio);
+
 //
 // String utils
 //
@@ -277,6 +363,8 @@ std::vector<std::string> string_split(std::string input, char separator);
 std::string string_strip(const std::string & str);
 std::string string_get_sortable_timestamp();
 
+void string_replace_all(std::string & s, const std::string & search, const std::string & replace);
+
 template<class T>
 static std::vector<T> string_split(const std::string & str, char delim) {
     std::vector<T> values;
@@ -294,6 +382,11 @@ static std::vector<T> string_split(const std::string & str, char delim) {
 bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
 void string_process_escapes(std::string & input);
 
+std::string string_from(bool value);
+std::string string_from(const std::vector<int> & values);
+std::string string_from(const struct llama_context * ctx, const std::vector<llama_token> & tokens);
+std::string string_from(const struct llama_context * ctx, const struct llama_batch & batch);
+
 //
 // Filesystem utils
 //
@@ -308,15 +401,24 @@ std::string fs_get_cache_file(const std::string & filename);
 // Model utils
 //
 
-
-
+struct llama_init_result {
+    struct llama_model * model = nullptr;
+    struct llama_context * context = nullptr;
+    std::vector<llama_lora_adapter_container> lora_adapters;
+};
+
+struct llama_init_result llama_init_from_gpt_params(gpt_params & params);
 
-struct llama_model_params
-struct llama_context_params
+struct llama_model_params llama_model_params_from_gpt_params (const gpt_params & params);
+struct llama_context_params llama_context_params_from_gpt_params (const gpt_params & params);
+struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params);
 
 struct llama_model * llama_load_model_from_url(const char * model_url, const char * path_model, const char * hf_token, const struct llama_model_params & params);
 struct llama_model * llama_load_model_from_hf(const char * repo, const char * file, const char * path_model, const char * hf_token, const struct llama_model_params & params);
 
+// clear LoRA adapters from context, then apply new list of adapters
+void llama_lora_adapters_apply(struct llama_context * ctx, std::vector<llama_lora_adapter_container> & lora_adapters);
+
 // Batch utils
 
 void llama_batch_clear(struct llama_batch & batch);
@@ -361,10 +463,6 @@ std::string llama_detokenize(
         const std::vector<llama_token> & tokens,
         bool special = true);
 
-// Uses the value from the model metadata if possible, otherwise
-// defaults to true when model type is SPM, otherwise false.
-bool llama_should_add_bos_token(const llama_model * model);
-
 //
 // Chat template utils
 //