@fugood/llama.node 0.3.2 → 0.3.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +2 -0
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/package.json +1 -1
- package/src/DetokenizeWorker.cpp +1 -1
- package/src/EmbeddingWorker.cpp +2 -2
- package/src/LlamaCompletionWorker.cpp +8 -8
- package/src/LlamaCompletionWorker.h +2 -2
- package/src/LlamaContext.cpp +8 -9
- package/src/TokenizeWorker.cpp +1 -1
- package/src/common.hpp +4 -4
- package/src/llama.cpp/.github/workflows/build.yml +43 -9
- package/src/llama.cpp/.github/workflows/docker.yml +3 -0
- package/src/llama.cpp/CMakeLists.txt +7 -4
- package/src/llama.cpp/cmake/arm64-apple-clang.cmake +16 -0
- package/src/llama.cpp/common/CMakeLists.txt +0 -2
- package/src/llama.cpp/common/arg.cpp +642 -607
- package/src/llama.cpp/common/arg.h +22 -22
- package/src/llama.cpp/common/common.cpp +79 -281
- package/src/llama.cpp/common/common.h +130 -100
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +1 -1
- package/src/llama.cpp/common/log.cpp +50 -50
- package/src/llama.cpp/common/log.h +18 -18
- package/src/llama.cpp/common/ngram-cache.cpp +36 -36
- package/src/llama.cpp/common/ngram-cache.h +19 -19
- package/src/llama.cpp/common/sampling.cpp +116 -108
- package/src/llama.cpp/common/sampling.h +20 -20
- package/src/llama.cpp/docs/build.md +37 -17
- package/src/llama.cpp/examples/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/batched/batched.cpp +14 -14
- package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +10 -11
- package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +1 -1
- package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +9 -9
- package/src/llama.cpp/examples/embedding/embedding.cpp +12 -12
- package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +8 -8
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +5 -5
- package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +7 -7
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +18 -18
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +20 -11
- package/src/llama.cpp/examples/infill/infill.cpp +40 -86
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +42 -151
- package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +11 -14
- package/src/llama.cpp/examples/llava/clip.cpp +1 -0
- package/src/llama.cpp/examples/llava/llava-cli.cpp +23 -23
- package/src/llama.cpp/examples/llava/llava.cpp +37 -3
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +21 -21
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +26 -26
- package/src/llama.cpp/examples/lookup/lookup-create.cpp +7 -7
- package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
- package/src/llama.cpp/examples/lookup/lookup-stats.cpp +14 -14
- package/src/llama.cpp/examples/lookup/lookup.cpp +29 -29
- package/src/llama.cpp/examples/main/main.cpp +64 -109
- package/src/llama.cpp/examples/parallel/parallel.cpp +18 -19
- package/src/llama.cpp/examples/passkey/passkey.cpp +14 -14
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +99 -120
- package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +10 -9
- package/src/llama.cpp/examples/retrieval/retrieval.cpp +13 -13
- package/src/llama.cpp/examples/rpc/rpc-server.cpp +3 -1
- package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +34 -17
- package/src/llama.cpp/examples/server/CMakeLists.txt +4 -13
- package/src/llama.cpp/examples/server/server.cpp +553 -691
- package/src/llama.cpp/examples/server/utils.hpp +312 -25
- package/src/llama.cpp/examples/simple/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/simple/simple.cpp +128 -96
- package/src/llama.cpp/examples/simple-chat/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +197 -0
- package/src/llama.cpp/examples/speculative/speculative.cpp +54 -51
- package/src/llama.cpp/examples/tokenize/tokenize.cpp +2 -2
- package/src/llama.cpp/ggml/CMakeLists.txt +15 -9
- package/src/llama.cpp/ggml/include/ggml-amx.h +25 -0
- package/src/llama.cpp/ggml/include/ggml-backend.h +46 -33
- package/src/llama.cpp/ggml/include/ggml-blas.h +5 -3
- package/src/llama.cpp/ggml/include/ggml-cann.h +9 -7
- package/src/llama.cpp/ggml/include/ggml-cpp.h +38 -0
- package/src/llama.cpp/ggml/include/ggml-cpu.h +177 -0
- package/src/llama.cpp/ggml/include/ggml-cuda.h +12 -12
- package/src/llama.cpp/ggml/include/ggml-kompute.h +7 -3
- package/src/llama.cpp/ggml/include/ggml-metal.h +11 -7
- package/src/llama.cpp/ggml/include/ggml-opt.h +216 -0
- package/src/llama.cpp/ggml/include/ggml-rpc.h +9 -5
- package/src/llama.cpp/ggml/include/ggml-sycl.h +18 -11
- package/src/llama.cpp/ggml/include/ggml-vulkan.h +10 -8
- package/src/llama.cpp/ggml/include/ggml.h +53 -393
- package/src/llama.cpp/ggml/src/CMakeLists.txt +66 -1149
- package/src/llama.cpp/ggml/src/ggml-aarch64.c +46 -3126
- package/src/llama.cpp/ggml/src/ggml-aarch64.h +0 -20
- package/src/llama.cpp/ggml/src/ggml-alloc.c +23 -27
- package/src/llama.cpp/ggml/src/ggml-amx/CMakeLists.txt +107 -0
- package/src/llama.cpp/ggml/src/ggml-amx/common.h +94 -0
- package/src/llama.cpp/ggml/src/ggml-amx/ggml-amx.cpp +446 -0
- package/src/llama.cpp/ggml/src/ggml-amx/mmq.cpp +2510 -0
- package/src/llama.cpp/ggml/src/ggml-amx/mmq.h +17 -0
- package/src/llama.cpp/ggml/src/ggml-backend-impl.h +6 -25
- package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +195 -0
- package/src/llama.cpp/ggml/src/ggml-backend.cpp +303 -864
- package/src/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +91 -0
- package/src/llama.cpp/ggml/src/{ggml-blas.cpp → ggml-blas/ggml-blas.cpp} +213 -65
- package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +46 -0
- package/src/llama.cpp/ggml/src/{ggml-cann.cpp → ggml-cann/ggml-cann.cpp} +255 -149
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +261 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.c +3560 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +30 -0
- package/src/llama.cpp/ggml/src/{ggml-cpu-impl.h → ggml-cpu/ggml-cpu-impl.h} +0 -243
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +10822 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.h +63 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +13970 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +663 -0
- package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.cpp +667 -1
- package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +155 -0
- package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +106 -0
- package/src/llama.cpp/ggml/src/ggml-impl.h +366 -16
- package/src/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +162 -0
- package/src/llama.cpp/ggml/src/{ggml-kompute.cpp → ggml-kompute/ggml-kompute.cpp} +238 -72
- package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +108 -0
- package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +249 -0
- package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +100 -0
- package/src/llama.cpp/ggml/src/ggml-opt.cpp +867 -0
- package/src/llama.cpp/ggml/src/ggml-quants.c +187 -10692
- package/src/llama.cpp/ggml/src/ggml-quants.h +78 -125
- package/src/llama.cpp/ggml/src/ggml-rpc/CMakeLists.txt +11 -0
- package/src/llama.cpp/ggml/src/{ggml-rpc.cpp → ggml-rpc/ggml-rpc.cpp} +475 -300
- package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +81 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +3 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +40 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +258 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +1 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +2 -22
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +1011 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +76 -0
- package/src/llama.cpp/ggml/src/{ggml-sycl.cpp → ggml-sycl/ggml-sycl.cpp} +3584 -4142
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +69 -67
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +3 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +56 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.hpp +11 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +6 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +4 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +138 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +10 -0
- package/src/llama.cpp/ggml/src/ggml-threading.cpp +12 -0
- package/src/llama.cpp/ggml/src/ggml-threading.h +12 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +78 -0
- package/src/llama.cpp/ggml/src/{ggml-vulkan.cpp → ggml-vulkan/ggml-vulkan.cpp} +555 -623
- package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/vulkan-shaders-gen.cpp +125 -206
- package/src/llama.cpp/ggml/src/ggml.c +4032 -19890
- package/src/llama.cpp/include/llama.h +67 -33
- package/src/llama.cpp/pocs/vdot/q8dot.cpp +4 -3
- package/src/llama.cpp/pocs/vdot/vdot.cpp +8 -7
- package/src/llama.cpp/src/CMakeLists.txt +2 -1
- package/src/llama.cpp/src/llama-sampling.cpp +745 -105
- package/src/llama.cpp/src/llama-sampling.h +21 -2
- package/src/llama.cpp/src/llama-vocab.cpp +49 -9
- package/src/llama.cpp/src/llama-vocab.h +35 -11
- package/src/llama.cpp/src/llama.cpp +2636 -2406
- package/src/llama.cpp/src/unicode-data.cpp +2 -2
- package/src/llama.cpp/tests/CMakeLists.txt +1 -2
- package/src/llama.cpp/tests/test-arg-parser.cpp +14 -14
- package/src/llama.cpp/tests/test-backend-ops.cpp +185 -60
- package/src/llama.cpp/tests/test-barrier.cpp +1 -0
- package/src/llama.cpp/tests/test-chat-template.cpp +9 -5
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +17 -4
- package/src/llama.cpp/tests/test-log.cpp +2 -2
- package/src/llama.cpp/tests/test-opt.cpp +853 -142
- package/src/llama.cpp/tests/test-quantize-fns.cpp +22 -19
- package/src/llama.cpp/tests/test-quantize-perf.cpp +16 -14
- package/src/llama.cpp/tests/test-rope.cpp +1 -0
- package/src/llama.cpp/tests/test-sampling.cpp +162 -137
- package/src/llama.cpp/tests/test-tokenizer-0.cpp +7 -7
- package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +5 -5
- package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +5 -5
- package/src/llama.cpp/common/train.cpp +0 -1515
- package/src/llama.cpp/common/train.h +0 -233
- package/src/llama.cpp/examples/baby-llama/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +0 -1639
- package/src/llama.cpp/tests/test-grad0.cpp +0 -1683
- /package/src/llama.cpp/ggml/{cmake → src/ggml-cpu/cmake}/FindSIMD.cmake +0 -0
- /package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.h +0 -0
- /package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/CMakeLists.txt +0 -0
|
@@ -17,27 +17,27 @@
|
|
|
17
17
|
|
|
18
18
|
using json = nlohmann::ordered_json;
|
|
19
19
|
|
|
20
|
-
|
|
20
|
+
common_arg & common_arg::set_examples(std::initializer_list<enum llama_example> examples) {
|
|
21
21
|
this->examples = std::move(examples);
|
|
22
22
|
return *this;
|
|
23
23
|
}
|
|
24
24
|
|
|
25
|
-
|
|
25
|
+
common_arg & common_arg::set_env(const char * env) {
|
|
26
26
|
help = help + "\n(env: " + env + ")";
|
|
27
27
|
this->env = env;
|
|
28
28
|
return *this;
|
|
29
29
|
}
|
|
30
30
|
|
|
31
|
-
|
|
31
|
+
common_arg & common_arg::set_sparam() {
|
|
32
32
|
is_sparam = true;
|
|
33
33
|
return *this;
|
|
34
34
|
}
|
|
35
35
|
|
|
36
|
-
bool
|
|
36
|
+
bool common_arg::in_example(enum llama_example ex) {
|
|
37
37
|
return examples.find(ex) != examples.end();
|
|
38
38
|
}
|
|
39
39
|
|
|
40
|
-
bool
|
|
40
|
+
bool common_arg::get_value_from_env(std::string & output) {
|
|
41
41
|
if (env == nullptr) return false;
|
|
42
42
|
char * value = std::getenv(env);
|
|
43
43
|
if (value) {
|
|
@@ -47,7 +47,7 @@ bool llama_arg::get_value_from_env(std::string & output) {
|
|
|
47
47
|
return false;
|
|
48
48
|
}
|
|
49
49
|
|
|
50
|
-
bool
|
|
50
|
+
bool common_arg::has_value_from_env() {
|
|
51
51
|
return env != nullptr && std::getenv(env);
|
|
52
52
|
}
|
|
53
53
|
|
|
@@ -78,7 +78,7 @@ static std::vector<std::string> break_str_into_lines(std::string input, size_t m
|
|
|
78
78
|
return result;
|
|
79
79
|
}
|
|
80
80
|
|
|
81
|
-
std::string
|
|
81
|
+
std::string common_arg::to_string() {
|
|
82
82
|
// params for printing to console
|
|
83
83
|
const static int n_leading_spaces = 40;
|
|
84
84
|
const static int n_char_per_line_help = 70; // TODO: detect this based on current console
|
|
@@ -119,33 +119,7 @@ std::string llama_arg::to_string() {
|
|
|
119
119
|
// utils
|
|
120
120
|
//
|
|
121
121
|
|
|
122
|
-
|
|
123
|
-
#ifdef __MINGW32__
|
|
124
|
-
#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
|
|
125
|
-
#else
|
|
126
|
-
#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
|
|
127
|
-
#endif
|
|
128
|
-
#else
|
|
129
|
-
#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...)
|
|
130
|
-
#endif
|
|
131
|
-
|
|
132
|
-
LLAMA_COMMON_ATTRIBUTE_FORMAT(1, 2)
|
|
133
|
-
static std::string format(const char * fmt, ...) {
|
|
134
|
-
va_list ap;
|
|
135
|
-
va_list ap2;
|
|
136
|
-
va_start(ap, fmt);
|
|
137
|
-
va_copy(ap2, ap);
|
|
138
|
-
int size = vsnprintf(NULL, 0, fmt, ap);
|
|
139
|
-
GGML_ASSERT(size >= 0 && size < INT_MAX); // NOLINT
|
|
140
|
-
std::vector<char> buf(size + 1);
|
|
141
|
-
int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
|
|
142
|
-
GGML_ASSERT(size2 == size);
|
|
143
|
-
va_end(ap2);
|
|
144
|
-
va_end(ap);
|
|
145
|
-
return std::string(buf.data(), size);
|
|
146
|
-
}
|
|
147
|
-
|
|
148
|
-
static void gpt_params_handle_model_default(gpt_params & params) {
|
|
122
|
+
static void common_params_handle_model_default(common_params & params) {
|
|
149
123
|
if (!params.hf_repo.empty()) {
|
|
150
124
|
// short-hand to avoid specifying --hf-file -> default it to --model
|
|
151
125
|
if (params.hf_file.empty()) {
|
|
@@ -154,13 +128,13 @@ static void gpt_params_handle_model_default(gpt_params & params) {
|
|
|
154
128
|
}
|
|
155
129
|
params.hf_file = params.model;
|
|
156
130
|
} else if (params.model.empty()) {
|
|
157
|
-
params.model = fs_get_cache_file(string_split(params.hf_file, '/').back());
|
|
131
|
+
params.model = fs_get_cache_file(string_split<std::string>(params.hf_file, '/').back());
|
|
158
132
|
}
|
|
159
133
|
} else if (!params.model_url.empty()) {
|
|
160
134
|
if (params.model.empty()) {
|
|
161
|
-
auto f = string_split(params.model_url, '#').front();
|
|
162
|
-
f = string_split(f, '?').front();
|
|
163
|
-
params.model = fs_get_cache_file(string_split(f, '/').back());
|
|
135
|
+
auto f = string_split<std::string>(params.model_url, '#').front();
|
|
136
|
+
f = string_split<std::string>(f, '?').front();
|
|
137
|
+
params.model = fs_get_cache_file(string_split<std::string>(f, '/').back());
|
|
164
138
|
}
|
|
165
139
|
} else if (params.model.empty()) {
|
|
166
140
|
params.model = DEFAULT_MODEL_PATH;
|
|
@@ -171,12 +145,12 @@ static void gpt_params_handle_model_default(gpt_params & params) {
|
|
|
171
145
|
// CLI argument parsing functions
|
|
172
146
|
//
|
|
173
147
|
|
|
174
|
-
static bool
|
|
148
|
+
static bool common_params_parse_ex(int argc, char ** argv, common_params_context & ctx_arg) {
|
|
175
149
|
std::string arg;
|
|
176
150
|
const std::string arg_prefix = "--";
|
|
177
|
-
|
|
151
|
+
common_params & params = ctx_arg.params;
|
|
178
152
|
|
|
179
|
-
std::unordered_map<std::string,
|
|
153
|
+
std::unordered_map<std::string, common_arg *> arg_to_options;
|
|
180
154
|
for (auto & opt : ctx_arg.options) {
|
|
181
155
|
for (const auto & arg : opt.args) {
|
|
182
156
|
arg_to_options[arg] = &opt;
|
|
@@ -199,7 +173,7 @@ static bool gpt_params_parse_ex(int argc, char ** argv, gpt_params_context & ctx
|
|
|
199
173
|
continue;
|
|
200
174
|
}
|
|
201
175
|
} catch (std::exception & e) {
|
|
202
|
-
throw std::invalid_argument(
|
|
176
|
+
throw std::invalid_argument(string_format(
|
|
203
177
|
"error while handling environment variable \"%s\": %s\n\n", opt.env, e.what()));
|
|
204
178
|
}
|
|
205
179
|
}
|
|
@@ -220,7 +194,7 @@ static bool gpt_params_parse_ex(int argc, char ** argv, gpt_params_context & ctx
|
|
|
220
194
|
std::replace(arg.begin(), arg.end(), '_', '-');
|
|
221
195
|
}
|
|
222
196
|
if (arg_to_options.find(arg) == arg_to_options.end()) {
|
|
223
|
-
throw std::invalid_argument(
|
|
197
|
+
throw std::invalid_argument(string_format("error: invalid argument: %s", arg.c_str()));
|
|
224
198
|
}
|
|
225
199
|
auto opt = *arg_to_options[arg];
|
|
226
200
|
if (opt.has_value_from_env()) {
|
|
@@ -252,7 +226,7 @@ static bool gpt_params_parse_ex(int argc, char ** argv, gpt_params_context & ctx
|
|
|
252
226
|
continue;
|
|
253
227
|
}
|
|
254
228
|
} catch (std::exception & e) {
|
|
255
|
-
throw std::invalid_argument(
|
|
229
|
+
throw std::invalid_argument(string_format(
|
|
256
230
|
"error while handling argument \"%s\": %s\n\n"
|
|
257
231
|
"usage:\n%s\n\nto show complete usage, run with -h",
|
|
258
232
|
arg.c_str(), e.what(), arg_to_options[arg]->to_string().c_str()));
|
|
@@ -268,7 +242,7 @@ static bool gpt_params_parse_ex(int argc, char ** argv, gpt_params_context & ctx
|
|
|
268
242
|
throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n");
|
|
269
243
|
}
|
|
270
244
|
|
|
271
|
-
|
|
245
|
+
common_params_handle_model_default(params);
|
|
272
246
|
|
|
273
247
|
if (params.escape) {
|
|
274
248
|
string_process_escapes(params.prompt);
|
|
@@ -277,6 +251,9 @@ static bool gpt_params_parse_ex(int argc, char ** argv, gpt_params_context & ctx
|
|
|
277
251
|
for (auto & antiprompt : params.antiprompt) {
|
|
278
252
|
string_process_escapes(antiprompt);
|
|
279
253
|
}
|
|
254
|
+
for (auto & seq_breaker : params.sparams.dry_sequence_breakers) {
|
|
255
|
+
string_process_escapes(seq_breaker);
|
|
256
|
+
}
|
|
280
257
|
}
|
|
281
258
|
|
|
282
259
|
if (!params.kv_overrides.empty()) {
|
|
@@ -291,16 +268,16 @@ static bool gpt_params_parse_ex(int argc, char ** argv, gpt_params_context & ctx
|
|
|
291
268
|
return true;
|
|
292
269
|
}
|
|
293
270
|
|
|
294
|
-
static void
|
|
295
|
-
auto print_options = [](std::vector<
|
|
296
|
-
for (
|
|
271
|
+
static void common_params_print_usage(common_params_context & ctx_arg) {
|
|
272
|
+
auto print_options = [](std::vector<common_arg *> & options) {
|
|
273
|
+
for (common_arg * opt : options) {
|
|
297
274
|
printf("%s", opt->to_string().c_str());
|
|
298
275
|
}
|
|
299
276
|
};
|
|
300
277
|
|
|
301
|
-
std::vector<
|
|
302
|
-
std::vector<
|
|
303
|
-
std::vector<
|
|
278
|
+
std::vector<common_arg *> common_options;
|
|
279
|
+
std::vector<common_arg *> sparam_options;
|
|
280
|
+
std::vector<common_arg *> specific_options;
|
|
304
281
|
for (auto & opt : ctx_arg.options) {
|
|
305
282
|
// in case multiple LLAMA_EXAMPLE_* are set, we prioritize the LLAMA_EXAMPLE_* matching current example
|
|
306
283
|
if (opt.is_sparam) {
|
|
@@ -320,17 +297,17 @@ static void gpt_params_print_usage(gpt_params_context & ctx_arg) {
|
|
|
320
297
|
print_options(specific_options);
|
|
321
298
|
}
|
|
322
299
|
|
|
323
|
-
bool
|
|
324
|
-
auto ctx_arg =
|
|
325
|
-
const
|
|
300
|
+
bool common_params_parse(int argc, char ** argv, common_params & params, llama_example ex, void(*print_usage)(int, char **)) {
|
|
301
|
+
auto ctx_arg = common_params_parser_init(params, ex, print_usage);
|
|
302
|
+
const common_params params_org = ctx_arg.params; // the example can modify the default params
|
|
326
303
|
|
|
327
304
|
try {
|
|
328
|
-
if (!
|
|
305
|
+
if (!common_params_parse_ex(argc, argv, ctx_arg)) {
|
|
329
306
|
ctx_arg.params = params_org;
|
|
330
307
|
return false;
|
|
331
308
|
}
|
|
332
309
|
if (ctx_arg.params.usage) {
|
|
333
|
-
|
|
310
|
+
common_params_print_usage(ctx_arg);
|
|
334
311
|
if (ctx_arg.print_usage) {
|
|
335
312
|
ctx_arg.print_usage(argc, argv);
|
|
336
313
|
}
|
|
@@ -345,16 +322,16 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params, llama_example
|
|
|
345
322
|
return true;
|
|
346
323
|
}
|
|
347
324
|
|
|
348
|
-
|
|
349
|
-
|
|
325
|
+
common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **)) {
|
|
326
|
+
common_params_context ctx_arg(params);
|
|
350
327
|
ctx_arg.print_usage = print_usage;
|
|
351
328
|
ctx_arg.ex = ex;
|
|
352
329
|
|
|
353
330
|
std::string sampler_type_chars;
|
|
354
331
|
std::string sampler_type_names;
|
|
355
332
|
for (const auto & sampler : params.sparams.samplers) {
|
|
356
|
-
sampler_type_chars +=
|
|
357
|
-
sampler_type_names +=
|
|
333
|
+
sampler_type_chars += common_sampler_type_to_chr(sampler);
|
|
334
|
+
sampler_type_names += common_sampler_type_to_str(sampler) + ";";
|
|
358
335
|
}
|
|
359
336
|
sampler_type_names.pop_back();
|
|
360
337
|
|
|
@@ -366,374 +343,374 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
|
|
|
366
343
|
* - if LLAMA_EXAMPLE_* is set (other than COMMON), we only show the option in the corresponding example
|
|
367
344
|
* - if both {LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_*,} are set, we will prioritize the LLAMA_EXAMPLE_* matching current example
|
|
368
345
|
*/
|
|
369
|
-
auto add_opt = [&](
|
|
346
|
+
auto add_opt = [&](common_arg arg) {
|
|
370
347
|
if (arg.in_example(ex) || arg.in_example(LLAMA_EXAMPLE_COMMON)) {
|
|
371
348
|
ctx_arg.options.push_back(std::move(arg));
|
|
372
349
|
}
|
|
373
350
|
};
|
|
374
351
|
|
|
375
352
|
|
|
376
|
-
add_opt(
|
|
353
|
+
add_opt(common_arg(
|
|
377
354
|
{"-h", "--help", "--usage"},
|
|
378
355
|
"print usage and exit",
|
|
379
|
-
[](
|
|
356
|
+
[](common_params & params) {
|
|
380
357
|
params.usage = true;
|
|
381
358
|
}
|
|
382
359
|
));
|
|
383
|
-
add_opt(
|
|
360
|
+
add_opt(common_arg(
|
|
384
361
|
{"--version"},
|
|
385
362
|
"show version and build info",
|
|
386
|
-
[](
|
|
363
|
+
[](common_params &) {
|
|
387
364
|
fprintf(stderr, "version: %d (%s)\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT);
|
|
388
365
|
fprintf(stderr, "built with %s for %s\n", LLAMA_COMPILER, LLAMA_BUILD_TARGET);
|
|
389
366
|
exit(0);
|
|
390
367
|
}
|
|
391
368
|
));
|
|
392
|
-
add_opt(
|
|
369
|
+
add_opt(common_arg(
|
|
393
370
|
{"--verbose-prompt"},
|
|
394
|
-
|
|
395
|
-
[](
|
|
371
|
+
string_format("print a verbose prompt before generation (default: %s)", params.verbose_prompt ? "true" : "false"),
|
|
372
|
+
[](common_params & params) {
|
|
396
373
|
params.verbose_prompt = true;
|
|
397
374
|
}
|
|
398
375
|
));
|
|
399
|
-
add_opt(
|
|
376
|
+
add_opt(common_arg(
|
|
400
377
|
{"--no-display-prompt"},
|
|
401
|
-
|
|
402
|
-
[](
|
|
378
|
+
string_format("don't print prompt at generation (default: %s)", !params.display_prompt ? "true" : "false"),
|
|
379
|
+
[](common_params & params) {
|
|
403
380
|
params.display_prompt = false;
|
|
404
381
|
}
|
|
405
382
|
).set_examples({LLAMA_EXAMPLE_MAIN}));
|
|
406
|
-
add_opt(
|
|
383
|
+
add_opt(common_arg(
|
|
407
384
|
{"-co", "--color"},
|
|
408
|
-
|
|
409
|
-
[](
|
|
385
|
+
string_format("colorise output to distinguish prompt and user input from generations (default: %s)", params.use_color ? "true" : "false"),
|
|
386
|
+
[](common_params & params) {
|
|
410
387
|
params.use_color = true;
|
|
411
388
|
}
|
|
412
389
|
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL, LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP}));
|
|
413
|
-
add_opt(
|
|
390
|
+
add_opt(common_arg(
|
|
414
391
|
{"-t", "--threads"}, "N",
|
|
415
|
-
|
|
416
|
-
[](
|
|
392
|
+
string_format("number of threads to use during generation (default: %d)", params.cpuparams.n_threads),
|
|
393
|
+
[](common_params & params, int value) {
|
|
417
394
|
params.cpuparams.n_threads = value;
|
|
418
395
|
if (params.cpuparams.n_threads <= 0) {
|
|
419
396
|
params.cpuparams.n_threads = std::thread::hardware_concurrency();
|
|
420
397
|
}
|
|
421
398
|
}
|
|
422
399
|
).set_env("LLAMA_ARG_THREADS"));
|
|
423
|
-
add_opt(
|
|
400
|
+
add_opt(common_arg(
|
|
424
401
|
{"-tb", "--threads-batch"}, "N",
|
|
425
402
|
"number of threads to use during batch and prompt processing (default: same as --threads)",
|
|
426
|
-
[](
|
|
403
|
+
[](common_params & params, int value) {
|
|
427
404
|
params.cpuparams_batch.n_threads = value;
|
|
428
405
|
if (params.cpuparams_batch.n_threads <= 0) {
|
|
429
406
|
params.cpuparams_batch.n_threads = std::thread::hardware_concurrency();
|
|
430
407
|
}
|
|
431
408
|
}
|
|
432
409
|
));
|
|
433
|
-
add_opt(
|
|
410
|
+
add_opt(common_arg(
|
|
434
411
|
{"-td", "--threads-draft"}, "N",
|
|
435
412
|
"number of threads to use during generation (default: same as --threads)",
|
|
436
|
-
[](
|
|
413
|
+
[](common_params & params, int value) {
|
|
437
414
|
params.draft_cpuparams.n_threads = value;
|
|
438
415
|
if (params.draft_cpuparams.n_threads <= 0) {
|
|
439
416
|
params.draft_cpuparams.n_threads = std::thread::hardware_concurrency();
|
|
440
417
|
}
|
|
441
418
|
}
|
|
442
419
|
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
|
443
|
-
add_opt(
|
|
420
|
+
add_opt(common_arg(
|
|
444
421
|
{"-tbd", "--threads-batch-draft"}, "N",
|
|
445
422
|
"number of threads to use during batch and prompt processing (default: same as --threads-draft)",
|
|
446
|
-
[](
|
|
423
|
+
[](common_params & params, int value) {
|
|
447
424
|
params.draft_cpuparams_batch.n_threads = value;
|
|
448
425
|
if (params.draft_cpuparams_batch.n_threads <= 0) {
|
|
449
426
|
params.draft_cpuparams_batch.n_threads = std::thread::hardware_concurrency();
|
|
450
427
|
}
|
|
451
428
|
}
|
|
452
429
|
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
|
453
|
-
add_opt(
|
|
430
|
+
add_opt(common_arg(
|
|
454
431
|
{"-C", "--cpu-mask"}, "M",
|
|
455
432
|
"CPU affinity mask: arbitrarily long hex. Complements cpu-range (default: \"\")",
|
|
456
|
-
[](
|
|
433
|
+
[](common_params & params, const std::string & mask) {
|
|
457
434
|
params.cpuparams.mask_valid = true;
|
|
458
435
|
if (!parse_cpu_mask(mask, params.cpuparams.cpumask)) {
|
|
459
436
|
throw std::invalid_argument("invalid cpumask");
|
|
460
437
|
}
|
|
461
438
|
}
|
|
462
439
|
));
|
|
463
|
-
add_opt(
|
|
440
|
+
add_opt(common_arg(
|
|
464
441
|
{"-Cr", "--cpu-range"}, "lo-hi",
|
|
465
442
|
"range of CPUs for affinity. Complements --cpu-mask",
|
|
466
|
-
[](
|
|
443
|
+
[](common_params & params, const std::string & range) {
|
|
467
444
|
params.cpuparams.mask_valid = true;
|
|
468
445
|
if (!parse_cpu_range(range, params.cpuparams.cpumask)) {
|
|
469
446
|
throw std::invalid_argument("invalid range");
|
|
470
447
|
}
|
|
471
448
|
}
|
|
472
449
|
));
|
|
473
|
-
add_opt(
|
|
450
|
+
add_opt(common_arg(
|
|
474
451
|
{"--cpu-strict"}, "<0|1>",
|
|
475
|
-
|
|
476
|
-
[](
|
|
452
|
+
string_format("use strict CPU placement (default: %u)\n", (unsigned) params.cpuparams.strict_cpu),
|
|
453
|
+
[](common_params & params, const std::string & value) {
|
|
477
454
|
params.cpuparams.strict_cpu = std::stoul(value);
|
|
478
455
|
}
|
|
479
456
|
));
|
|
480
|
-
add_opt(
|
|
457
|
+
add_opt(common_arg(
|
|
481
458
|
{"--prio"}, "N",
|
|
482
|
-
|
|
483
|
-
[](
|
|
459
|
+
string_format("set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.cpuparams.priority),
|
|
460
|
+
[](common_params & params, int prio) {
|
|
484
461
|
if (prio < 0 || prio > 3) {
|
|
485
462
|
throw std::invalid_argument("invalid value");
|
|
486
463
|
}
|
|
487
464
|
params.cpuparams.priority = (enum ggml_sched_priority) prio;
|
|
488
465
|
}
|
|
489
466
|
));
|
|
490
|
-
add_opt(
|
|
467
|
+
add_opt(common_arg(
|
|
491
468
|
{"--poll"}, "<0...100>",
|
|
492
|
-
|
|
493
|
-
[](
|
|
469
|
+
string_format("use polling level to wait for work (0 - no polling, default: %u)\n", (unsigned) params.cpuparams.poll),
|
|
470
|
+
[](common_params & params, const std::string & value) {
|
|
494
471
|
params.cpuparams.poll = std::stoul(value);
|
|
495
472
|
}
|
|
496
473
|
));
|
|
497
|
-
add_opt(
|
|
474
|
+
add_opt(common_arg(
|
|
498
475
|
{"-Cb", "--cpu-mask-batch"}, "M",
|
|
499
476
|
"CPU affinity mask: arbitrarily long hex. Complements cpu-range-batch (default: same as --cpu-mask)",
|
|
500
|
-
[](
|
|
477
|
+
[](common_params & params, const std::string & mask) {
|
|
501
478
|
params.cpuparams_batch.mask_valid = true;
|
|
502
479
|
if (!parse_cpu_mask(mask, params.cpuparams_batch.cpumask)) {
|
|
503
480
|
throw std::invalid_argument("invalid cpumask");
|
|
504
481
|
}
|
|
505
482
|
}
|
|
506
483
|
));
|
|
507
|
-
add_opt(
|
|
484
|
+
add_opt(common_arg(
|
|
508
485
|
{"-Crb", "--cpu-range-batch"}, "lo-hi",
|
|
509
486
|
"ranges of CPUs for affinity. Complements --cpu-mask-batch",
|
|
510
|
-
[](
|
|
487
|
+
[](common_params & params, const std::string & range) {
|
|
511
488
|
params.cpuparams_batch.mask_valid = true;
|
|
512
489
|
if (!parse_cpu_range(range, params.cpuparams_batch.cpumask)) {
|
|
513
490
|
throw std::invalid_argument("invalid range");
|
|
514
491
|
}
|
|
515
492
|
}
|
|
516
493
|
));
|
|
517
|
-
add_opt(
|
|
494
|
+
add_opt(common_arg(
|
|
518
495
|
{"--cpu-strict-batch"}, "<0|1>",
|
|
519
496
|
"use strict CPU placement (default: same as --cpu-strict)",
|
|
520
|
-
[](
|
|
497
|
+
[](common_params & params, int value) {
|
|
521
498
|
params.cpuparams_batch.strict_cpu = value;
|
|
522
499
|
}
|
|
523
500
|
));
|
|
524
|
-
add_opt(
|
|
501
|
+
add_opt(common_arg(
|
|
525
502
|
{"--prio-batch"}, "N",
|
|
526
|
-
|
|
527
|
-
[](
|
|
503
|
+
string_format("set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.cpuparams_batch.priority),
|
|
504
|
+
[](common_params & params, int prio) {
|
|
528
505
|
if (prio < 0 || prio > 3) {
|
|
529
506
|
throw std::invalid_argument("invalid value");
|
|
530
507
|
}
|
|
531
508
|
params.cpuparams_batch.priority = (enum ggml_sched_priority) prio;
|
|
532
509
|
}
|
|
533
510
|
));
|
|
534
|
-
add_opt(
|
|
511
|
+
add_opt(common_arg(
|
|
535
512
|
{"--poll-batch"}, "<0|1>",
|
|
536
513
|
"use polling to wait for work (default: same as --poll)",
|
|
537
|
-
[](
|
|
514
|
+
[](common_params & params, int value) {
|
|
538
515
|
params.cpuparams_batch.poll = value;
|
|
539
516
|
}
|
|
540
517
|
));
|
|
541
|
-
add_opt(
|
|
518
|
+
add_opt(common_arg(
|
|
542
519
|
{"-Cd", "--cpu-mask-draft"}, "M",
|
|
543
520
|
"Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask)",
|
|
544
|
-
[](
|
|
521
|
+
[](common_params & params, const std::string & mask) {
|
|
545
522
|
params.draft_cpuparams.mask_valid = true;
|
|
546
523
|
if (!parse_cpu_mask(mask, params.draft_cpuparams.cpumask)) {
|
|
547
524
|
throw std::invalid_argument("invalid cpumask");
|
|
548
525
|
}
|
|
549
526
|
}
|
|
550
527
|
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
|
551
|
-
add_opt(
|
|
528
|
+
add_opt(common_arg(
|
|
552
529
|
{"-Crd", "--cpu-range-draft"}, "lo-hi",
|
|
553
530
|
"Ranges of CPUs for affinity. Complements --cpu-mask-draft",
|
|
554
|
-
[](
|
|
531
|
+
[](common_params & params, const std::string & range) {
|
|
555
532
|
params.draft_cpuparams.mask_valid = true;
|
|
556
533
|
if (!parse_cpu_range(range, params.draft_cpuparams.cpumask)) {
|
|
557
534
|
throw std::invalid_argument("invalid range");
|
|
558
535
|
}
|
|
559
536
|
}
|
|
560
537
|
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
|
561
|
-
add_opt(
|
|
538
|
+
add_opt(common_arg(
|
|
562
539
|
{"--cpu-strict-draft"}, "<0|1>",
|
|
563
540
|
"Use strict CPU placement for draft model (default: same as --cpu-strict)",
|
|
564
|
-
[](
|
|
541
|
+
[](common_params & params, int value) {
|
|
565
542
|
params.draft_cpuparams.strict_cpu = value;
|
|
566
543
|
}
|
|
567
544
|
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
|
568
|
-
add_opt(
|
|
545
|
+
add_opt(common_arg(
|
|
569
546
|
{"--prio-draft"}, "N",
|
|
570
|
-
|
|
571
|
-
[](
|
|
547
|
+
string_format("set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.draft_cpuparams.priority),
|
|
548
|
+
[](common_params & params, int prio) {
|
|
572
549
|
if (prio < 0 || prio > 3) {
|
|
573
550
|
throw std::invalid_argument("invalid value");
|
|
574
551
|
}
|
|
575
552
|
params.draft_cpuparams.priority = (enum ggml_sched_priority) prio;
|
|
576
553
|
}
|
|
577
554
|
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
|
578
|
-
add_opt(
|
|
555
|
+
add_opt(common_arg(
|
|
579
556
|
{"--poll-draft"}, "<0|1>",
|
|
580
557
|
"Use polling to wait for draft model work (default: same as --poll])",
|
|
581
|
-
[](
|
|
558
|
+
[](common_params & params, int value) {
|
|
582
559
|
params.draft_cpuparams.poll = value;
|
|
583
560
|
}
|
|
584
561
|
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
|
585
|
-
add_opt(
|
|
562
|
+
add_opt(common_arg(
|
|
586
563
|
{"-Cbd", "--cpu-mask-batch-draft"}, "M",
|
|
587
564
|
"Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask)",
|
|
588
|
-
[](
|
|
565
|
+
[](common_params & params, const std::string & mask) {
|
|
589
566
|
params.draft_cpuparams_batch.mask_valid = true;
|
|
590
567
|
if (!parse_cpu_mask(mask, params.draft_cpuparams_batch.cpumask)) {
|
|
591
568
|
throw std::invalid_argument("invalid cpumask");
|
|
592
569
|
}
|
|
593
570
|
}
|
|
594
571
|
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
|
595
|
-
add_opt(
|
|
572
|
+
add_opt(common_arg(
|
|
596
573
|
{"-Crbd", "--cpu-range-batch-draft"}, "lo-hi",
|
|
597
574
|
"Ranges of CPUs for affinity. Complements --cpu-mask-draft-batch)",
|
|
598
|
-
[](
|
|
575
|
+
[](common_params & params, const std::string & range) {
|
|
599
576
|
params.draft_cpuparams_batch.mask_valid = true;
|
|
600
577
|
if (!parse_cpu_range(range, params.draft_cpuparams_batch.cpumask)) {
|
|
601
578
|
throw std::invalid_argument("invalid cpumask");
|
|
602
579
|
}
|
|
603
580
|
}
|
|
604
581
|
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
|
605
|
-
add_opt(
|
|
582
|
+
add_opt(common_arg(
|
|
606
583
|
{"--cpu-strict-batch-draft"}, "<0|1>",
|
|
607
584
|
"Use strict CPU placement for draft model (default: --cpu-strict-draft)",
|
|
608
|
-
[](
|
|
585
|
+
[](common_params & params, int value) {
|
|
609
586
|
params.draft_cpuparams_batch.strict_cpu = value;
|
|
610
587
|
}
|
|
611
588
|
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
|
612
|
-
add_opt(
|
|
589
|
+
add_opt(common_arg(
|
|
613
590
|
{"--prio-batch-draft"}, "N",
|
|
614
|
-
|
|
615
|
-
[](
|
|
591
|
+
string_format("set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.draft_cpuparams_batch.priority),
|
|
592
|
+
[](common_params & params, int prio) {
|
|
616
593
|
if (prio < 0 || prio > 3) {
|
|
617
594
|
throw std::invalid_argument("invalid value");
|
|
618
595
|
}
|
|
619
596
|
params.draft_cpuparams_batch.priority = (enum ggml_sched_priority) prio;
|
|
620
597
|
}
|
|
621
598
|
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
|
622
|
-
add_opt(
|
|
599
|
+
add_opt(common_arg(
|
|
623
600
|
{"--poll-batch-draft"}, "<0|1>",
|
|
624
601
|
"Use polling to wait for draft model work (default: --poll-draft)",
|
|
625
|
-
[](
|
|
602
|
+
[](common_params & params, int value) {
|
|
626
603
|
params.draft_cpuparams_batch.poll = value;
|
|
627
604
|
}
|
|
628
605
|
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
|
629
|
-
add_opt(
|
|
606
|
+
add_opt(common_arg(
|
|
630
607
|
{"--draft"}, "N",
|
|
631
|
-
|
|
632
|
-
[](
|
|
608
|
+
string_format("number of tokens to draft for speculative decoding (default: %d)", params.n_draft),
|
|
609
|
+
[](common_params & params, int value) {
|
|
633
610
|
params.n_draft = value;
|
|
634
611
|
}
|
|
635
612
|
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP}));
|
|
636
|
-
add_opt(
|
|
613
|
+
add_opt(common_arg(
|
|
637
614
|
{"-ps", "--p-split"}, "N",
|
|
638
|
-
|
|
639
|
-
[](
|
|
615
|
+
string_format("speculative decoding split probability (default: %.1f)", (double)params.p_split),
|
|
616
|
+
[](common_params & params, const std::string & value) {
|
|
640
617
|
params.p_split = std::stof(value);
|
|
641
618
|
}
|
|
642
619
|
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
|
643
|
-
add_opt(
|
|
620
|
+
add_opt(common_arg(
|
|
644
621
|
{"-lcs", "--lookup-cache-static"}, "FNAME",
|
|
645
622
|
"path to static lookup cache to use for lookup decoding (not updated by generation)",
|
|
646
|
-
[](
|
|
623
|
+
[](common_params & params, const std::string & value) {
|
|
647
624
|
params.lookup_cache_static = value;
|
|
648
625
|
}
|
|
649
626
|
).set_examples({LLAMA_EXAMPLE_LOOKUP}));
|
|
650
|
-
add_opt(
|
|
627
|
+
add_opt(common_arg(
|
|
651
628
|
{"-lcd", "--lookup-cache-dynamic"}, "FNAME",
|
|
652
629
|
"path to dynamic lookup cache to use for lookup decoding (updated by generation)",
|
|
653
|
-
[](
|
|
630
|
+
[](common_params & params, const std::string & value) {
|
|
654
631
|
params.lookup_cache_dynamic = value;
|
|
655
632
|
}
|
|
656
633
|
).set_examples({LLAMA_EXAMPLE_LOOKUP}));
|
|
657
|
-
add_opt(
|
|
634
|
+
add_opt(common_arg(
|
|
658
635
|
{"-c", "--ctx-size"}, "N",
|
|
659
|
-
|
|
660
|
-
[](
|
|
636
|
+
string_format("size of the prompt context (default: %d, 0 = loaded from model)", params.n_ctx),
|
|
637
|
+
[](common_params & params, int value) {
|
|
661
638
|
params.n_ctx = value;
|
|
662
639
|
}
|
|
663
640
|
).set_env("LLAMA_ARG_CTX_SIZE"));
|
|
664
|
-
add_opt(
|
|
641
|
+
add_opt(common_arg(
|
|
665
642
|
{"-n", "--predict", "--n-predict"}, "N",
|
|
666
|
-
|
|
667
|
-
[](
|
|
643
|
+
string_format("number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)", params.n_predict),
|
|
644
|
+
[](common_params & params, int value) {
|
|
668
645
|
params.n_predict = value;
|
|
669
646
|
}
|
|
670
647
|
).set_env("LLAMA_ARG_N_PREDICT"));
|
|
671
|
-
add_opt(
|
|
648
|
+
add_opt(common_arg(
|
|
672
649
|
{"-b", "--batch-size"}, "N",
|
|
673
|
-
|
|
674
|
-
[](
|
|
650
|
+
string_format("logical maximum batch size (default: %d)", params.n_batch),
|
|
651
|
+
[](common_params & params, int value) {
|
|
675
652
|
params.n_batch = value;
|
|
676
653
|
}
|
|
677
654
|
).set_env("LLAMA_ARG_BATCH"));
|
|
678
|
-
add_opt(
|
|
655
|
+
add_opt(common_arg(
|
|
679
656
|
{"-ub", "--ubatch-size"}, "N",
|
|
680
|
-
|
|
681
|
-
[](
|
|
657
|
+
string_format("physical maximum batch size (default: %d)", params.n_ubatch),
|
|
658
|
+
[](common_params & params, int value) {
|
|
682
659
|
params.n_ubatch = value;
|
|
683
660
|
}
|
|
684
661
|
).set_env("LLAMA_ARG_UBATCH"));
|
|
685
|
-
add_opt(
|
|
662
|
+
add_opt(common_arg(
|
|
686
663
|
{"--keep"}, "N",
|
|
687
|
-
|
|
688
|
-
[](
|
|
664
|
+
string_format("number of tokens to keep from the initial prompt (default: %d, -1 = all)", params.n_keep),
|
|
665
|
+
[](common_params & params, int value) {
|
|
689
666
|
params.n_keep = value;
|
|
690
667
|
}
|
|
691
668
|
));
|
|
692
|
-
add_opt(
|
|
669
|
+
add_opt(common_arg(
|
|
693
670
|
{"--no-context-shift"},
|
|
694
|
-
|
|
695
|
-
[](
|
|
671
|
+
string_format("disables context shift on inifinite text generation (default: %s)", params.ctx_shift ? "disabled" : "enabled"),
|
|
672
|
+
[](common_params & params) {
|
|
696
673
|
params.ctx_shift = false;
|
|
697
674
|
}
|
|
698
675
|
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_CONTEXT_SHIFT"));
|
|
699
|
-
add_opt(
|
|
676
|
+
add_opt(common_arg(
|
|
700
677
|
{"--chunks"}, "N",
|
|
701
|
-
|
|
702
|
-
[](
|
|
678
|
+
string_format("max number of chunks to process (default: %d, -1 = all)", params.n_chunks),
|
|
679
|
+
[](common_params & params, int value) {
|
|
703
680
|
params.n_chunks = value;
|
|
704
681
|
}
|
|
705
682
|
).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_RETRIEVAL}));
|
|
706
|
-
add_opt(
|
|
683
|
+
add_opt(common_arg(
|
|
707
684
|
{"-fa", "--flash-attn"},
|
|
708
|
-
|
|
709
|
-
[](
|
|
685
|
+
string_format("enable Flash Attention (default: %s)", params.flash_attn ? "enabled" : "disabled"),
|
|
686
|
+
[](common_params & params) {
|
|
710
687
|
params.flash_attn = true;
|
|
711
688
|
}
|
|
712
689
|
).set_env("LLAMA_ARG_FLASH_ATTN"));
|
|
713
|
-
add_opt(
|
|
690
|
+
add_opt(common_arg(
|
|
714
691
|
{"-p", "--prompt"}, "PROMPT",
|
|
715
692
|
ex == LLAMA_EXAMPLE_MAIN
|
|
716
693
|
? "prompt to start generation with\nif -cnv is set, this will be used as system prompt"
|
|
717
694
|
: "prompt to start generation with",
|
|
718
|
-
[](
|
|
695
|
+
[](common_params & params, const std::string & value) {
|
|
719
696
|
params.prompt = value;
|
|
720
697
|
}
|
|
721
698
|
));
|
|
722
|
-
add_opt(
|
|
699
|
+
add_opt(common_arg(
|
|
723
700
|
{"--no-perf"},
|
|
724
|
-
|
|
725
|
-
[](
|
|
701
|
+
string_format("disable internal libllama performance timings (default: %s)", params.no_perf ? "true" : "false"),
|
|
702
|
+
[](common_params & params) {
|
|
726
703
|
params.no_perf = true;
|
|
727
704
|
params.sparams.no_perf = true;
|
|
728
705
|
}
|
|
729
706
|
).set_env("LLAMA_ARG_NO_PERF"));
|
|
730
|
-
add_opt(
|
|
707
|
+
add_opt(common_arg(
|
|
731
708
|
{"-f", "--file"}, "FNAME",
|
|
732
709
|
"a file containing the prompt (default: none)",
|
|
733
|
-
[](
|
|
710
|
+
[](common_params & params, const std::string & value) {
|
|
734
711
|
std::ifstream file(value);
|
|
735
712
|
if (!file) {
|
|
736
|
-
throw std::runtime_error(
|
|
713
|
+
throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str()));
|
|
737
714
|
}
|
|
738
715
|
// store the external file name in params
|
|
739
716
|
params.prompt_file = value;
|
|
@@ -743,24 +720,24 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
|
|
|
743
720
|
}
|
|
744
721
|
}
|
|
745
722
|
));
|
|
746
|
-
add_opt(
|
|
723
|
+
add_opt(common_arg(
|
|
747
724
|
{"--in-file"}, "FNAME",
|
|
748
725
|
"an input file (repeat to specify multiple files)",
|
|
749
|
-
[](
|
|
726
|
+
[](common_params & params, const std::string & value) {
|
|
750
727
|
std::ifstream file(value);
|
|
751
728
|
if (!file) {
|
|
752
|
-
throw std::runtime_error(
|
|
729
|
+
throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str()));
|
|
753
730
|
}
|
|
754
731
|
params.in_files.push_back(value);
|
|
755
732
|
}
|
|
756
733
|
).set_examples({LLAMA_EXAMPLE_IMATRIX}));
|
|
757
|
-
add_opt(
|
|
734
|
+
add_opt(common_arg(
|
|
758
735
|
{"-bf", "--binary-file"}, "FNAME",
|
|
759
736
|
"binary file containing the prompt (default: none)",
|
|
760
|
-
[](
|
|
737
|
+
[](common_params & params, const std::string & value) {
|
|
761
738
|
std::ifstream file(value, std::ios::binary);
|
|
762
739
|
if (!file) {
|
|
763
|
-
throw std::runtime_error(
|
|
740
|
+
throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str()));
|
|
764
741
|
}
|
|
765
742
|
// store the external file name in params
|
|
766
743
|
params.prompt_file = value;
|
|
@@ -770,287 +747,352 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
|
|
|
770
747
|
fprintf(stderr, "Read %zu bytes from binary file %s\n", params.prompt.size(), value.c_str());
|
|
771
748
|
}
|
|
772
749
|
));
|
|
773
|
-
add_opt(
|
|
750
|
+
add_opt(common_arg(
|
|
774
751
|
{"-e", "--escape"},
|
|
775
|
-
|
|
776
|
-
[](
|
|
752
|
+
string_format("process escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\) (default: %s)", params.escape ? "true" : "false"),
|
|
753
|
+
[](common_params & params) {
|
|
777
754
|
params.escape = true;
|
|
778
755
|
}
|
|
779
756
|
));
|
|
780
|
-
add_opt(
|
|
757
|
+
add_opt(common_arg(
|
|
781
758
|
{"--no-escape"},
|
|
782
759
|
"do not process escape sequences",
|
|
783
|
-
[](
|
|
760
|
+
[](common_params & params) {
|
|
784
761
|
params.escape = false;
|
|
785
762
|
}
|
|
786
763
|
));
|
|
787
|
-
add_opt(
|
|
764
|
+
add_opt(common_arg(
|
|
788
765
|
{"-ptc", "--print-token-count"}, "N",
|
|
789
|
-
|
|
790
|
-
[](
|
|
766
|
+
string_format("print token count every N tokens (default: %d)", params.n_print),
|
|
767
|
+
[](common_params & params, int value) {
|
|
791
768
|
params.n_print = value;
|
|
792
769
|
}
|
|
793
770
|
).set_examples({LLAMA_EXAMPLE_MAIN}));
|
|
794
|
-
add_opt(
|
|
771
|
+
add_opt(common_arg(
|
|
795
772
|
{"--prompt-cache"}, "FNAME",
|
|
796
773
|
"file to cache prompt state for faster startup (default: none)",
|
|
797
|
-
[](
|
|
774
|
+
[](common_params & params, const std::string & value) {
|
|
798
775
|
params.path_prompt_cache = value;
|
|
799
776
|
}
|
|
800
777
|
).set_examples({LLAMA_EXAMPLE_MAIN}));
|
|
801
|
-
add_opt(
|
|
778
|
+
add_opt(common_arg(
|
|
802
779
|
{"--prompt-cache-all"},
|
|
803
780
|
"if specified, saves user input and generations to cache as well\n",
|
|
804
|
-
[](
|
|
781
|
+
[](common_params & params) {
|
|
805
782
|
params.prompt_cache_all = true;
|
|
806
783
|
}
|
|
807
784
|
).set_examples({LLAMA_EXAMPLE_MAIN}));
|
|
808
|
-
add_opt(
|
|
785
|
+
add_opt(common_arg(
|
|
809
786
|
{"--prompt-cache-ro"},
|
|
810
787
|
"if specified, uses the prompt cache but does not update it",
|
|
811
|
-
[](
|
|
788
|
+
[](common_params & params) {
|
|
812
789
|
params.prompt_cache_ro = true;
|
|
813
790
|
}
|
|
814
791
|
).set_examples({LLAMA_EXAMPLE_MAIN}));
|
|
815
|
-
add_opt(
|
|
792
|
+
add_opt(common_arg(
|
|
816
793
|
{"-r", "--reverse-prompt"}, "PROMPT",
|
|
817
794
|
"halt generation at PROMPT, return control in interactive mode\n",
|
|
818
|
-
[](
|
|
795
|
+
[](common_params & params, const std::string & value) {
|
|
819
796
|
params.antiprompt.emplace_back(value);
|
|
820
797
|
}
|
|
821
798
|
).set_examples({LLAMA_EXAMPLE_MAIN}));
|
|
822
|
-
add_opt(
|
|
799
|
+
add_opt(common_arg(
|
|
823
800
|
{"-sp", "--special"},
|
|
824
|
-
|
|
825
|
-
[](
|
|
801
|
+
string_format("special tokens output enabled (default: %s)", params.special ? "true" : "false"),
|
|
802
|
+
[](common_params & params) {
|
|
826
803
|
params.special = true;
|
|
827
804
|
}
|
|
828
805
|
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}));
|
|
829
|
-
add_opt(
|
|
806
|
+
add_opt(common_arg(
|
|
830
807
|
{"-cnv", "--conversation"},
|
|
831
|
-
|
|
808
|
+
string_format(
|
|
832
809
|
"run in conversation mode:\n"
|
|
833
810
|
"- does not print special tokens and suffix/prefix\n"
|
|
834
811
|
"- interactive mode is also enabled\n"
|
|
835
812
|
"(default: %s)",
|
|
836
813
|
params.conversation ? "true" : "false"
|
|
837
814
|
),
|
|
838
|
-
[](
|
|
815
|
+
[](common_params & params) {
|
|
839
816
|
params.conversation = true;
|
|
840
817
|
}
|
|
841
818
|
).set_examples({LLAMA_EXAMPLE_MAIN}));
|
|
842
|
-
add_opt(
|
|
819
|
+
add_opt(common_arg(
|
|
843
820
|
{"-i", "--interactive"},
|
|
844
|
-
|
|
845
|
-
[](
|
|
821
|
+
string_format("run in interactive mode (default: %s)", params.interactive ? "true" : "false"),
|
|
822
|
+
[](common_params & params) {
|
|
846
823
|
params.interactive = true;
|
|
847
824
|
}
|
|
848
825
|
).set_examples({LLAMA_EXAMPLE_MAIN}));
|
|
849
|
-
add_opt(
|
|
826
|
+
add_opt(common_arg(
|
|
850
827
|
{"-if", "--interactive-first"},
|
|
851
|
-
|
|
852
|
-
[](
|
|
828
|
+
string_format("run in interactive mode and wait for input right away (default: %s)", params.interactive_first ? "true" : "false"),
|
|
829
|
+
[](common_params & params) {
|
|
853
830
|
params.interactive_first = true;
|
|
854
831
|
}
|
|
855
832
|
).set_examples({LLAMA_EXAMPLE_MAIN}));
|
|
856
|
-
add_opt(
|
|
833
|
+
add_opt(common_arg(
|
|
857
834
|
{"-mli", "--multiline-input"},
|
|
858
835
|
"allows you to write or paste multiple lines without ending each in '\\'",
|
|
859
|
-
[](
|
|
836
|
+
[](common_params & params) {
|
|
860
837
|
params.multiline_input = true;
|
|
861
838
|
}
|
|
862
839
|
).set_examples({LLAMA_EXAMPLE_MAIN}));
|
|
863
|
-
add_opt(
|
|
840
|
+
add_opt(common_arg(
|
|
864
841
|
{"--in-prefix-bos"},
|
|
865
842
|
"prefix BOS to user inputs, preceding the `--in-prefix` string",
|
|
866
|
-
[](
|
|
843
|
+
[](common_params & params) {
|
|
867
844
|
params.input_prefix_bos = true;
|
|
868
845
|
params.enable_chat_template = false;
|
|
869
846
|
}
|
|
870
847
|
).set_examples({LLAMA_EXAMPLE_MAIN}));
|
|
871
|
-
add_opt(
|
|
848
|
+
add_opt(common_arg(
|
|
872
849
|
{"--in-prefix"}, "STRING",
|
|
873
850
|
"string to prefix user inputs with (default: empty)",
|
|
874
|
-
[](
|
|
851
|
+
[](common_params & params, const std::string & value) {
|
|
875
852
|
params.input_prefix = value;
|
|
876
853
|
params.enable_chat_template = false;
|
|
877
854
|
}
|
|
878
855
|
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL}));
|
|
879
|
-
add_opt(
|
|
856
|
+
add_opt(common_arg(
|
|
880
857
|
{"--in-suffix"}, "STRING",
|
|
881
858
|
"string to suffix after user inputs with (default: empty)",
|
|
882
|
-
[](
|
|
859
|
+
[](common_params & params, const std::string & value) {
|
|
883
860
|
params.input_suffix = value;
|
|
884
861
|
params.enable_chat_template = false;
|
|
885
862
|
}
|
|
886
863
|
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL}));
|
|
887
|
-
add_opt(
|
|
864
|
+
add_opt(common_arg(
|
|
888
865
|
{"--no-warmup"},
|
|
889
866
|
"skip warming up the model with an empty run",
|
|
890
|
-
[](
|
|
867
|
+
[](common_params & params) {
|
|
891
868
|
params.warmup = false;
|
|
892
869
|
}
|
|
893
870
|
).set_examples({LLAMA_EXAMPLE_MAIN}));
|
|
894
|
-
add_opt(
|
|
871
|
+
add_opt(common_arg(
|
|
895
872
|
{"--spm-infill"},
|
|
896
|
-
|
|
873
|
+
string_format(
|
|
897
874
|
"use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: %s)",
|
|
898
875
|
params.spm_infill ? "enabled" : "disabled"
|
|
899
876
|
),
|
|
900
|
-
[](
|
|
877
|
+
[](common_params & params) {
|
|
901
878
|
params.spm_infill = true;
|
|
902
879
|
}
|
|
903
880
|
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_INFILL}));
|
|
904
|
-
add_opt(
|
|
881
|
+
add_opt(common_arg(
|
|
905
882
|
{"--samplers"}, "SAMPLERS",
|
|
906
|
-
|
|
907
|
-
[](
|
|
908
|
-
const auto sampler_names = string_split(value, ';');
|
|
909
|
-
params.sparams.samplers =
|
|
883
|
+
string_format("samplers that will be used for generation in the order, separated by \';\'\n(default: %s)", sampler_type_names.c_str()),
|
|
884
|
+
[](common_params & params, const std::string & value) {
|
|
885
|
+
const auto sampler_names = string_split<std::string>(value, ';');
|
|
886
|
+
params.sparams.samplers = common_sampler_types_from_names(sampler_names, true);
|
|
910
887
|
}
|
|
911
888
|
).set_sparam());
|
|
912
|
-
add_opt(
|
|
889
|
+
add_opt(common_arg(
|
|
913
890
|
{"-s", "--seed"}, "SEED",
|
|
914
|
-
|
|
915
|
-
[](
|
|
891
|
+
string_format("RNG seed (default: %d, use random seed for %d)", params.sparams.seed, LLAMA_DEFAULT_SEED),
|
|
892
|
+
[](common_params & params, const std::string & value) {
|
|
916
893
|
params.sparams.seed = std::stoul(value);
|
|
917
894
|
}
|
|
918
895
|
).set_sparam());
|
|
919
|
-
add_opt(
|
|
896
|
+
add_opt(common_arg(
|
|
920
897
|
{"--sampling-seq"}, "SEQUENCE",
|
|
921
|
-
|
|
922
|
-
[](
|
|
923
|
-
params.sparams.samplers =
|
|
898
|
+
string_format("simplified sequence for samplers that will be used (default: %s)", sampler_type_chars.c_str()),
|
|
899
|
+
[](common_params & params, const std::string & value) {
|
|
900
|
+
params.sparams.samplers = common_sampler_types_from_chars(value);
|
|
924
901
|
}
|
|
925
902
|
).set_sparam());
|
|
926
|
-
add_opt(
|
|
903
|
+
add_opt(common_arg(
|
|
927
904
|
{"--ignore-eos"},
|
|
928
905
|
"ignore end of stream token and continue generating (implies --logit-bias EOS-inf)",
|
|
929
|
-
[](
|
|
906
|
+
[](common_params & params) {
|
|
930
907
|
params.sparams.ignore_eos = true;
|
|
931
908
|
}
|
|
932
909
|
).set_sparam());
|
|
933
|
-
add_opt(
|
|
910
|
+
add_opt(common_arg(
|
|
934
911
|
{"--penalize-nl"},
|
|
935
|
-
|
|
936
|
-
[](
|
|
912
|
+
string_format("penalize newline tokens (default: %s)", params.sparams.penalize_nl ? "true" : "false"),
|
|
913
|
+
[](common_params & params) {
|
|
937
914
|
params.sparams.penalize_nl = true;
|
|
938
915
|
}
|
|
939
916
|
).set_sparam());
|
|
940
|
-
add_opt(
|
|
917
|
+
add_opt(common_arg(
|
|
941
918
|
{"--temp"}, "N",
|
|
942
|
-
|
|
943
|
-
[](
|
|
919
|
+
string_format("temperature (default: %.1f)", (double)params.sparams.temp),
|
|
920
|
+
[](common_params & params, const std::string & value) {
|
|
944
921
|
params.sparams.temp = std::stof(value);
|
|
945
922
|
params.sparams.temp = std::max(params.sparams.temp, 0.0f);
|
|
946
923
|
}
|
|
947
924
|
).set_sparam());
|
|
948
|
-
add_opt(
|
|
925
|
+
add_opt(common_arg(
|
|
949
926
|
{"--top-k"}, "N",
|
|
950
|
-
|
|
951
|
-
[](
|
|
927
|
+
string_format("top-k sampling (default: %d, 0 = disabled)", params.sparams.top_k),
|
|
928
|
+
[](common_params & params, int value) {
|
|
952
929
|
params.sparams.top_k = value;
|
|
953
930
|
}
|
|
954
931
|
).set_sparam());
|
|
955
|
-
add_opt(
|
|
932
|
+
add_opt(common_arg(
|
|
956
933
|
{"--top-p"}, "N",
|
|
957
|
-
|
|
958
|
-
[](
|
|
934
|
+
string_format("top-p sampling (default: %.1f, 1.0 = disabled)", (double)params.sparams.top_p),
|
|
935
|
+
[](common_params & params, const std::string & value) {
|
|
959
936
|
params.sparams.top_p = std::stof(value);
|
|
960
937
|
}
|
|
961
938
|
).set_sparam());
|
|
962
|
-
add_opt(
|
|
939
|
+
add_opt(common_arg(
|
|
963
940
|
{"--min-p"}, "N",
|
|
964
|
-
|
|
965
|
-
[](
|
|
941
|
+
string_format("min-p sampling (default: %.1f, 0.0 = disabled)", (double)params.sparams.min_p),
|
|
942
|
+
[](common_params & params, const std::string & value) {
|
|
966
943
|
params.sparams.min_p = std::stof(value);
|
|
967
944
|
}
|
|
968
945
|
).set_sparam());
|
|
969
|
-
add_opt(
|
|
970
|
-
{"--
|
|
971
|
-
|
|
972
|
-
[](
|
|
973
|
-
params.sparams.
|
|
946
|
+
add_opt(common_arg(
|
|
947
|
+
{"--xtc-probability"}, "N",
|
|
948
|
+
string_format("xtc probability (default: %.1f, 0.0 = disabled)", (double)params.sparams.xtc_probability),
|
|
949
|
+
[](common_params & params, const std::string & value) {
|
|
950
|
+
params.sparams.xtc_probability = std::stof(value);
|
|
974
951
|
}
|
|
975
952
|
).set_sparam());
|
|
976
|
-
add_opt(
|
|
953
|
+
add_opt(common_arg(
|
|
954
|
+
{"--xtc-threshold"}, "N",
|
|
955
|
+
string_format("xtc threshold (default: %.1f, 1.0 = disabled)", (double)params.sparams.xtc_threshold),
|
|
956
|
+
[](common_params & params, const std::string & value) {
|
|
957
|
+
params.sparams.xtc_threshold = std::stof(value);
|
|
958
|
+
}
|
|
959
|
+
).set_sparam());
|
|
960
|
+
add_opt(common_arg(
|
|
977
961
|
{"--typical"}, "N",
|
|
978
|
-
|
|
979
|
-
[](
|
|
962
|
+
string_format("locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)", (double)params.sparams.typ_p),
|
|
963
|
+
[](common_params & params, const std::string & value) {
|
|
980
964
|
params.sparams.typ_p = std::stof(value);
|
|
981
965
|
}
|
|
982
966
|
).set_sparam());
|
|
983
|
-
add_opt(
|
|
967
|
+
add_opt(common_arg(
|
|
984
968
|
{"--repeat-last-n"}, "N",
|
|
985
|
-
|
|
986
|
-
[](
|
|
969
|
+
string_format("last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)", params.sparams.penalty_last_n),
|
|
970
|
+
[](common_params & params, int value) {
|
|
987
971
|
params.sparams.penalty_last_n = value;
|
|
988
972
|
params.sparams.n_prev = std::max(params.sparams.n_prev, params.sparams.penalty_last_n);
|
|
989
973
|
}
|
|
990
974
|
).set_sparam());
-    add_opt(
+    add_opt(common_arg(
         {"--repeat-penalty"}, "N",
-
-        [](
+        string_format("penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)", (double)params.sparams.penalty_repeat),
+        [](common_params & params, const std::string & value) {
             params.sparams.penalty_repeat = std::stof(value);
         }
     ).set_sparam());
-    add_opt(
+    add_opt(common_arg(
         {"--presence-penalty"}, "N",
-
-        [](
+        string_format("repeat alpha presence penalty (default: %.1f, 0.0 = disabled)", (double)params.sparams.penalty_present),
+        [](common_params & params, const std::string & value) {
             params.sparams.penalty_present = std::stof(value);
         }
     ).set_sparam());
-    add_opt(
+    add_opt(common_arg(
         {"--frequency-penalty"}, "N",
-
-        [](
+        string_format("repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)", (double)params.sparams.penalty_freq),
+        [](common_params & params, const std::string & value) {
             params.sparams.penalty_freq = std::stof(value);
         }
     ).set_sparam());
-    add_opt(
+    add_opt(common_arg(
+        {"--dry-multiplier"}, "N",
+        string_format("set DRY sampling multiplier (default: %.1f, 0.0 = disabled)", (double)params.sparams.dry_multiplier),
+        [](common_params & params, const std::string & value) {
+            params.sparams.dry_multiplier = std::stof(value);
+        }
+    ).set_sparam());
+    add_opt(common_arg(
+        {"--dry-base"}, "N",
+        string_format("set DRY sampling base value (default: %.2f)", (double)params.sparams.dry_base),
+        [](common_params & params, const std::string & value) {
+            float potential_base = std::stof(value);
+            if (potential_base >= 1.0f)
+            {
+                params.sparams.dry_base = potential_base;
+            }
+        }
+    ).set_sparam());
+    add_opt(common_arg(
+        {"--dry-allowed-length"}, "N",
+        string_format("set allowed length for DRY sampling (default: %d)", params.sparams.dry_allowed_length),
+        [](common_params & params, int value) {
+            params.sparams.dry_allowed_length = value;
+        }
+    ).set_sparam());
+    add_opt(common_arg(
+        {"--dry-penalty-last-n"}, "N",
+        string_format("set DRY penalty for the last n tokens (default: %d, 0 = disable, -1 = context size)", params.sparams.dry_penalty_last_n),
+        [](common_params & params, int value) {
+            params.sparams.dry_penalty_last_n = value;
+        }
+    ).set_sparam());
+    add_opt(common_arg(
+        {"--dry-sequence-breaker"}, "STRING",
+        string_format("add sequence breaker for DRY sampling, clearing out default breakers (%s) in the process; use \"none\" to not use any sequence breakers\n",
+            params.sparams.dry_sequence_breakers.empty() ? "none" :
+            std::accumulate(std::next(params.sparams.dry_sequence_breakers.begin()),
+                params.sparams.dry_sequence_breakers.end(),
+                std::string("'") + (params.sparams.dry_sequence_breakers[0] == "\n" ? "\\n" : params.sparams.dry_sequence_breakers[0]) + "'",
+                [](const std::string& a, const std::string& b) {
+                    std::string formatted_b = (b == "\n") ? "\\n" : b;
+                    return a + ", '" + formatted_b + "'";
+                }).c_str()),
+        [](common_params & params, const std::string & value) {
+            static bool defaults_cleared = false;
+
+            if (!defaults_cleared) {
+                params.sparams.dry_sequence_breakers.clear();
+                defaults_cleared = true;
+            }
+
+            if (value == "none") {
+                params.sparams.dry_sequence_breakers.clear();
+            } else {
+                params.sparams.dry_sequence_breakers.emplace_back(value);
+            }
+        }
+    ).set_sparam());
+    add_opt(common_arg(
         {"--dynatemp-range"}, "N",
-
-        [](
+        string_format("dynamic temperature range (default: %.1f, 0.0 = disabled)", (double)params.sparams.dynatemp_range),
+        [](common_params & params, const std::string & value) {
             params.sparams.dynatemp_range = std::stof(value);
         }
     ).set_sparam());
-    add_opt(
+    add_opt(common_arg(
         {"--dynatemp-exp"}, "N",
-
-        [](
+        string_format("dynamic temperature exponent (default: %.1f)", (double)params.sparams.dynatemp_exponent),
+        [](common_params & params, const std::string & value) {
             params.sparams.dynatemp_exponent = std::stof(value);
         }
     ).set_sparam());
-    add_opt(
+    add_opt(common_arg(
         {"--mirostat"}, "N",
-
+        string_format("use Mirostat sampling.\nTop K, Nucleus and Locally Typical samplers are ignored if used.\n"
             "(default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)", params.sparams.mirostat),
-        [](
+        [](common_params & params, int value) {
             params.sparams.mirostat = value;
         }
     ).set_sparam());
-    add_opt(
+    add_opt(common_arg(
         {"--mirostat-lr"}, "N",
-
-        [](
+        string_format("Mirostat learning rate, parameter eta (default: %.1f)", (double)params.sparams.mirostat_eta),
+        [](common_params & params, const std::string & value) {
             params.sparams.mirostat_eta = std::stof(value);
         }
     ).set_sparam());
-    add_opt(
+    add_opt(common_arg(
         {"--mirostat-ent"}, "N",
-
-        [](
+        string_format("Mirostat target entropy, parameter tau (default: %.1f)", (double)params.sparams.mirostat_tau),
+        [](common_params & params, const std::string & value) {
             params.sparams.mirostat_tau = std::stof(value);
         }
     ).set_sparam());
-    add_opt(
+    add_opt(common_arg(
         {"-l", "--logit-bias"}, "TOKEN_ID(+/-)BIAS",
         "modifies the likelihood of token appearing in the completion,\n"
         "i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',\n"
         "or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'",
-        [](
+        [](common_params & params, const std::string & value) {
             std::stringstream ss(value);
             llama_token key;
             char sign;
@@ -1067,20 +1109,20 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
             }
         }
     ).set_sparam());
-    add_opt(
+    add_opt(common_arg(
         {"--grammar"}, "GRAMMAR",
-
-        [](
+        string_format("BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '%s')", params.sparams.grammar.c_str()),
+        [](common_params & params, const std::string & value) {
             params.sparams.grammar = value;
         }
     ).set_sparam());
-    add_opt(
+    add_opt(common_arg(
         {"--grammar-file"}, "FNAME",
         "file to read grammar from",
-        [](
+        [](common_params & params, const std::string & value) {
             std::ifstream file(value);
             if (!file) {
-                throw std::runtime_error(
+                throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str()));
             }
             std::copy(
                 std::istreambuf_iterator<char>(file),
@@ -1089,17 +1131,17 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
             );
         }
     ).set_sparam());
-    add_opt(
+    add_opt(common_arg(
         {"-j", "--json-schema"}, "SCHEMA",
         "JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object\nFor schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead",
-        [](
+        [](common_params & params, const std::string & value) {
             params.sparams.grammar = json_schema_to_grammar(json::parse(value));
         }
     ).set_sparam());
-    add_opt(
+    add_opt(common_arg(
         {"--pooling"}, "{none,mean,cls,last,rank}",
         "pooling type for embeddings, use model default if unspecified",
-        [](
+        [](common_params & params, const std::string & value) {
             /**/ if (value == "none") { params.pooling_type = LLAMA_POOLING_TYPE_NONE; }
             else if (value == "mean") { params.pooling_type = LLAMA_POOLING_TYPE_MEAN; }
             else if (value == "cls") { params.pooling_type = LLAMA_POOLING_TYPE_CLS; }
@@ -1108,275 +1150,275 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
             else { throw std::invalid_argument("invalid value"); }
         }
     ).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_POOLING"));
-    add_opt(
-        {"--attention"}, "{causal,non
+    add_opt(common_arg(
+        {"--attention"}, "{causal,non-causal}",
         "attention type for embeddings, use model default if unspecified",
-        [](
+        [](common_params & params, const std::string & value) {
             /**/ if (value == "causal") { params.attention_type = LLAMA_ATTENTION_TYPE_CAUSAL; }
             else if (value == "non-causal") { params.attention_type = LLAMA_ATTENTION_TYPE_NON_CAUSAL; }
             else { throw std::invalid_argument("invalid value"); }
         }
     ).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
-    add_opt(
+    add_opt(common_arg(
         {"--rope-scaling"}, "{none,linear,yarn}",
         "RoPE frequency scaling method, defaults to linear unless specified by the model",
-        [](
+        [](common_params & params, const std::string & value) {
             /**/ if (value == "none") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_NONE; }
             else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_LINEAR; }
             else if (value == "yarn") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_YARN; }
             else { throw std::invalid_argument("invalid value"); }
         }
     ).set_env("LLAMA_ARG_ROPE_SCALING_TYPE"));
-    add_opt(
+    add_opt(common_arg(
         {"--rope-scale"}, "N",
         "RoPE context scaling factor, expands context by a factor of N",
-        [](
+        [](common_params & params, const std::string & value) {
             params.rope_freq_scale = 1.0f / std::stof(value);
         }
     ).set_env("LLAMA_ARG_ROPE_SCALE"));
-    add_opt(
+    add_opt(common_arg(
         {"--rope-freq-base"}, "N",
         "RoPE base frequency, used by NTK-aware scaling (default: loaded from model)",
-        [](
+        [](common_params & params, const std::string & value) {
             params.rope_freq_base = std::stof(value);
         }
     ).set_env("LLAMA_ARG_ROPE_FREQ_BASE"));
-    add_opt(
+    add_opt(common_arg(
         {"--rope-freq-scale"}, "N",
         "RoPE frequency scaling factor, expands context by a factor of 1/N",
-        [](
+        [](common_params & params, const std::string & value) {
             params.rope_freq_scale = std::stof(value);
         }
     ).set_env("LLAMA_ARG_ROPE_FREQ_SCALE"));
-    add_opt(
+    add_opt(common_arg(
         {"--yarn-orig-ctx"}, "N",
-
-        [](
+        string_format("YaRN: original context size of model (default: %d = model training context size)", params.yarn_orig_ctx),
+        [](common_params & params, int value) {
             params.yarn_orig_ctx = value;
         }
     ).set_env("LLAMA_ARG_YARN_ORIG_CTX"));
-    add_opt(
+    add_opt(common_arg(
         {"--yarn-ext-factor"}, "N",
-
-        [](
+        string_format("YaRN: extrapolation mix factor (default: %.1f, 0.0 = full interpolation)", (double)params.yarn_ext_factor),
+        [](common_params & params, const std::string & value) {
             params.yarn_ext_factor = std::stof(value);
         }
     ).set_env("LLAMA_ARG_YARN_EXT_FACTOR"));
-    add_opt(
+    add_opt(common_arg(
         {"--yarn-attn-factor"}, "N",
-
-        [](
+        string_format("YaRN: scale sqrt(t) or attention magnitude (default: %.1f)", (double)params.yarn_attn_factor),
+        [](common_params & params, const std::string & value) {
             params.yarn_attn_factor = std::stof(value);
         }
     ).set_env("LLAMA_ARG_YARN_ATTN_FACTOR"));
-    add_opt(
+    add_opt(common_arg(
         {"--yarn-beta-slow"}, "N",
-
-        [](
+        string_format("YaRN: high correction dim or alpha (default: %.1f)", (double)params.yarn_beta_slow),
+        [](common_params & params, const std::string & value) {
             params.yarn_beta_slow = std::stof(value);
         }
     ).set_env("LLAMA_ARG_YARN_BETA_SLOW"));
-    add_opt(
+    add_opt(common_arg(
         {"--yarn-beta-fast"}, "N",
-
-        [](
+        string_format("YaRN: low correction dim or beta (default: %.1f)", (double)params.yarn_beta_fast),
+        [](common_params & params, const std::string & value) {
             params.yarn_beta_fast = std::stof(value);
         }
     ).set_env("LLAMA_ARG_YARN_BETA_FAST"));
-    add_opt(
+    add_opt(common_arg(
         {"-gan", "--grp-attn-n"}, "N",
-
-        [](
+        string_format("group-attention factor (default: %d)", params.grp_attn_n),
+        [](common_params & params, int value) {
             params.grp_attn_n = value;
         }
-    ).set_env("LLAMA_ARG_GRP_ATTN_N"));
-    add_opt(
+    ).set_env("LLAMA_ARG_GRP_ATTN_N").set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_PASSKEY}));
+    add_opt(common_arg(
         {"-gaw", "--grp-attn-w"}, "N",
-
-        [](
+        string_format("group-attention width (default: %d)", params.grp_attn_w),
+        [](common_params & params, int value) {
             params.grp_attn_w = value;
         }
-    ).set_env("LLAMA_ARG_GRP_ATTN_W"));
-    add_opt(
+    ).set_env("LLAMA_ARG_GRP_ATTN_W").set_examples({LLAMA_EXAMPLE_MAIN}));
+    add_opt(common_arg(
         {"-dkvc", "--dump-kv-cache"},
         "verbose print of the KV cache",
-        [](
+        [](common_params & params) {
             params.dump_kv_cache = true;
         }
     ));
-    add_opt(
+    add_opt(common_arg(
         {"-nkvo", "--no-kv-offload"},
         "disable KV offload",
-        [](
+        [](common_params & params) {
             params.no_kv_offload = true;
         }
     ).set_env("LLAMA_ARG_NO_KV_OFFLOAD"));
|
|
1214
|
-
add_opt(
|
|
1256
|
+
add_opt(common_arg(
|
|
1215
1257
|
{"-ctk", "--cache-type-k"}, "TYPE",
|
|
1216
|
-
|
|
1217
|
-
[](
|
|
1258
|
+
string_format("KV cache data type for K (default: %s)", params.cache_type_k.c_str()),
|
|
1259
|
+
[](common_params & params, const std::string & value) {
|
|
1218
1260
|
// TODO: get the type right here
|
|
1219
1261
|
params.cache_type_k = value;
|
|
1220
1262
|
}
|
|
1221
1263
|
).set_env("LLAMA_ARG_CACHE_TYPE_K"));
|
|
1222
|
-
add_opt(
|
|
1264
|
+
add_opt(common_arg(
|
|
1223
1265
|
{"-ctv", "--cache-type-v"}, "TYPE",
|
|
1224
|
-
|
|
1225
|
-
[](
|
|
1266
|
+
string_format("KV cache data type for V (default: %s)", params.cache_type_v.c_str()),
|
|
1267
|
+
[](common_params & params, const std::string & value) {
|
|
1226
1268
|
// TODO: get the type right here
|
|
1227
1269
|
params.cache_type_v = value;
|
|
1228
1270
|
}
|
|
1229
1271
|
).set_env("LLAMA_ARG_CACHE_TYPE_V"));
|
|
1230
|
-
add_opt(
|
|
1272
|
+
add_opt(common_arg(
|
|
1231
1273
|
{"--perplexity", "--all-logits"},
|
|
1232
|
-
|
|
1233
|
-
[](
|
|
1274
|
+
string_format("return logits for all tokens in the batch (default: %s)", params.logits_all ? "true" : "false"),
|
|
1275
|
+
[](common_params & params) {
|
|
1234
1276
|
params.logits_all = true;
|
|
1235
1277
|
}
|
|
1236
1278
|
).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
|
|
1237
|
-
add_opt(
|
|
1279
|
+
add_opt(common_arg(
|
|
1238
1280
|
{"--hellaswag"},
|
|
1239
1281
|
"compute HellaSwag score over random tasks from datafile supplied with -f",
|
|
1240
|
-
[](
|
|
1282
|
+
[](common_params & params) {
|
|
1241
1283
|
params.hellaswag = true;
|
|
1242
1284
|
}
|
|
1243
1285
|
).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
|
|
1244
|
-
add_opt(
|
|
1286
|
+
add_opt(common_arg(
|
|
1245
1287
|
{"--hellaswag-tasks"}, "N",
|
|
1246
|
-
|
|
1247
|
-
[](
|
|
1288
|
+
string_format("number of tasks to use when computing the HellaSwag score (default: %zu)", params.hellaswag_tasks),
|
|
1289
|
+
[](common_params & params, int value) {
|
|
1248
1290
|
params.hellaswag_tasks = value;
|
|
1249
1291
|
}
|
|
1250
1292
|
).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
|
|
1251
|
-
add_opt(
|
|
1293
|
+
add_opt(common_arg(
|
|
1252
1294
|
{"--winogrande"},
|
|
1253
1295
|
"compute Winogrande score over random tasks from datafile supplied with -f",
|
|
1254
|
-
[](
|
|
1296
|
+
[](common_params & params) {
|
|
1255
1297
|
params.winogrande = true;
|
|
1256
1298
|
}
|
|
1257
1299
|
).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
|
|
1258
|
-
add_opt(
|
|
1300
|
+
add_opt(common_arg(
|
|
1259
1301
|
{"--winogrande-tasks"}, "N",
|
|
1260
|
-
|
|
1261
|
-
[](
|
|
1302
|
+
string_format("number of tasks to use when computing the Winogrande score (default: %zu)", params.winogrande_tasks),
|
|
1303
|
+
[](common_params & params, int value) {
|
|
1262
1304
|
params.winogrande_tasks = value;
|
|
1263
1305
|
}
|
|
1264
1306
|
).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
|
|
1265
|
-
add_opt(
|
|
1307
|
+
add_opt(common_arg(
|
|
1266
1308
|
{"--multiple-choice"},
|
|
1267
1309
|
"compute multiple choice score over random tasks from datafile supplied with -f",
|
|
1268
|
-
[](
|
|
1310
|
+
[](common_params & params) {
|
|
1269
1311
|
params.multiple_choice = true;
|
|
1270
1312
|
}
|
|
1271
1313
|
).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
|
|
1272
|
-
add_opt(
|
|
1314
|
+
add_opt(common_arg(
|
|
1273
1315
|
{"--multiple-choice-tasks"}, "N",
|
|
1274
|
-
|
|
1275
|
-
[](
|
|
1316
|
+
string_format("number of tasks to use when computing the multiple choice score (default: %zu)", params.multiple_choice_tasks),
|
|
1317
|
+
[](common_params & params, int value) {
|
|
1276
1318
|
params.multiple_choice_tasks = value;
|
|
1277
1319
|
}
|
|
1278
1320
|
).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
|
|
1279
|
-
add_opt(
|
|
1321
|
+
add_opt(common_arg(
|
|
1280
1322
|
{"--kl-divergence"},
|
|
1281
1323
|
"computes KL-divergence to logits provided via --kl-divergence-base",
|
|
1282
|
-
[](
|
|
1324
|
+
[](common_params & params) {
|
|
1283
1325
|
params.kl_divergence = true;
|
|
1284
1326
|
}
|
|
1285
1327
|
).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
|
|
1286
|
-
add_opt(
|
|
1328
|
+
add_opt(common_arg(
|
|
1287
1329
|
{"--save-all-logits", "--kl-divergence-base"}, "FNAME",
|
|
1288
1330
|
"set logits file",
|
|
1289
|
-
[](
|
|
1331
|
+
[](common_params & params, const std::string & value) {
|
|
1290
1332
|
params.logits_file = value;
|
|
1291
1333
|
}
|
|
1292
1334
|
).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
|
|
1293
|
-
add_opt(
|
|
1335
|
+
add_opt(common_arg(
|
|
1294
1336
|
{"--ppl-stride"}, "N",
|
|
1295
|
-
|
|
1296
|
-
[](
|
|
1337
|
+
string_format("stride for perplexity calculation (default: %d)", params.ppl_stride),
|
|
1338
|
+
[](common_params & params, int value) {
|
|
1297
1339
|
params.ppl_stride = value;
|
|
1298
1340
|
}
|
|
1299
1341
|
).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
|
|
1300
|
-
add_opt(
|
|
1342
|
+
add_opt(common_arg(
|
|
1301
1343
|
{"--ppl-output-type"}, "<0|1>",
|
|
1302
|
-
|
|
1303
|
-
[](
|
|
1344
|
+
string_format("output type for perplexity calculation (default: %d)", params.ppl_output_type),
|
|
1345
|
+
[](common_params & params, int value) {
|
|
1304
1346
|
params.ppl_output_type = value;
|
|
1305
1347
|
}
|
|
1306
1348
|
).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
|
|
1307
|
-
add_opt(
|
|
1349
|
+
add_opt(common_arg(
|
|
1308
1350
|
{"-dt", "--defrag-thold"}, "N",
|
|
1309
|
-
|
|
1310
|
-
[](
|
|
1351
|
+
string_format("KV cache defragmentation threshold (default: %.1f, < 0 - disabled)", (double)params.defrag_thold),
|
|
1352
|
+
[](common_params & params, const std::string & value) {
|
|
1311
1353
|
params.defrag_thold = std::stof(value);
|
|
1312
1354
|
}
|
|
1313
1355
|
).set_env("LLAMA_ARG_DEFRAG_THOLD"));
|
|
1314
|
-
add_opt(
|
|
1356
|
+
add_opt(common_arg(
|
|
1315
1357
|
{"-np", "--parallel"}, "N",
|
|
1316
|
-
|
|
1317
|
-
[](
|
|
1358
|
+
string_format("number of parallel sequences to decode (default: %d)", params.n_parallel),
|
|
1359
|
+
[](common_params & params, int value) {
|
|
1318
1360
|
params.n_parallel = value;
|
|
1319
1361
|
}
|
|
1320
1362
|
).set_env("LLAMA_ARG_N_PARALLEL"));
|
|
1321
|
-
add_opt(
|
|
1363
|
+
add_opt(common_arg(
|
|
1322
1364
|
{"-ns", "--sequences"}, "N",
|
|
1323
|
-
|
|
1324
|
-
[](
|
|
1365
|
+
string_format("number of sequences to decode (default: %d)", params.n_sequences),
|
|
1366
|
+
[](common_params & params, int value) {
|
|
1325
1367
|
params.n_sequences = value;
|
|
1326
1368
|
}
|
|
1327
1369
|
).set_examples({LLAMA_EXAMPLE_PARALLEL}));
|
|
1328
|
-
add_opt(
|
|
1370
|
+
add_opt(common_arg(
|
|
1329
1371
|
{"-cb", "--cont-batching"},
|
|
1330
|
-
|
|
1331
|
-
[](
|
|
1372
|
+
string_format("enable continuous batching (a.k.a dynamic batching) (default: %s)", params.cont_batching ? "enabled" : "disabled"),
|
|
1373
|
+
[](common_params & params) {
|
|
1332
1374
|
params.cont_batching = true;
|
|
1333
1375
|
}
|
|
1334
1376
|
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CONT_BATCHING"));
|
|
1335
|
-
add_opt(
|
|
1377
|
+
add_opt(common_arg(
|
|
1336
1378
|
{"-nocb", "--no-cont-batching"},
|
|
1337
1379
|
"disable continuous batching",
|
|
1338
|
-
[](
|
|
1380
|
+
[](common_params & params) {
|
|
1339
1381
|
params.cont_batching = false;
|
|
1340
1382
|
}
|
|
1341
1383
|
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_CONT_BATCHING"));
|
|
1342
|
-
add_opt(
|
|
1384
|
+
add_opt(common_arg(
|
|
1343
1385
|
{"--mmproj"}, "FILE",
|
|
1344
1386
|
"path to a multimodal projector file for LLaVA. see examples/llava/README.md",
|
|
1345
|
-
[](
|
|
1387
|
+
[](common_params & params, const std::string & value) {
|
|
1346
1388
|
params.mmproj = value;
|
|
1347
1389
|
}
|
|
1348
1390
|
).set_examples({LLAMA_EXAMPLE_LLAVA}));
|
|
1349
|
-
add_opt(
|
|
1391
|
+
add_opt(common_arg(
|
|
1350
1392
|
{"--image"}, "FILE",
|
|
1351
1393
|
"path to an image file. use with multimodal models. Specify multiple times for batching",
|
|
1352
|
-
[](
|
|
1394
|
+
[](common_params & params, const std::string & value) {
|
|
1353
1395
|
params.image.emplace_back(value);
|
|
1354
1396
|
}
|
|
1355
1397
|
).set_examples({LLAMA_EXAMPLE_LLAVA}));
|
|
1356
|
-
|
|
1357
|
-
|
|
1358
|
-
|
|
1359
|
-
|
|
1360
|
-
|
|
1361
|
-
|
|
1362
|
-
|
|
1363
|
-
|
|
1364
|
-
|
|
1365
|
-
add_opt(
|
|
1398
|
+
if (llama_supports_rpc()) {
|
|
1399
|
+
add_opt(common_arg(
|
|
1400
|
+
{"--rpc"}, "SERVERS",
|
|
1401
|
+
"comma separated list of RPC servers",
|
|
1402
|
+
[](common_params & params, const std::string & value) {
|
|
1403
|
+
params.rpc_servers = value;
|
|
1404
|
+
}
|
|
1405
|
+
).set_env("LLAMA_ARG_RPC"));
|
|
1406
|
+
}
|
|
1407
|
+
add_opt(common_arg(
|
|
1366
1408
|
{"--mlock"},
|
|
1367
1409
|
"force system to keep model in RAM rather than swapping or compressing",
|
|
1368
|
-
[](
|
|
1410
|
+
[](common_params & params) {
|
|
1369
1411
|
params.use_mlock = true;
|
|
1370
1412
|
}
|
|
1371
1413
|
).set_env("LLAMA_ARG_MLOCK"));
|
|
1372
|
-
add_opt(
|
|
1414
|
+
add_opt(common_arg(
|
|
1373
1415
|
{"--no-mmap"},
|
|
1374
1416
|
"do not memory-map model (slower load but may reduce pageouts if not using mlock)",
|
|
1375
|
-
[](
|
|
1417
|
+
[](common_params & params) {
|
|
1376
1418
|
params.use_mmap = false;
|
|
1377
1419
|
}
|
|
1378
1420
|
).set_env("LLAMA_ARG_NO_MMAP"));
|
|
1379
|
-
add_opt(
|
|
1421
|
+
add_opt(common_arg(
|
|
1380
1422
|
{"--numa"}, "TYPE",
|
|
1381
1423
|
"attempt optimizations that help on some NUMA systems\n"
|
|
1382
1424
|
"- distribute: spread execution evenly over all nodes\n"
|
|
@@ -1384,17 +1426,17 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
|
|
|
1384
1426
|
"- numactl: use the CPU map provided by numactl\n"
|
|
1385
1427
|
"if run without this previously, it is recommended to drop the system page cache before using this\n"
|
|
1386
1428
|
"see https://github.com/ggerganov/llama.cpp/issues/1437",
|
|
1387
|
-
[](
|
|
1429
|
+
[](common_params & params, const std::string & value) {
|
|
1388
1430
|
/**/ if (value == "distribute" || value == "") { params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; }
|
|
1389
1431
|
else if (value == "isolate") { params.numa = GGML_NUMA_STRATEGY_ISOLATE; }
|
|
1390
1432
|
else if (value == "numactl") { params.numa = GGML_NUMA_STRATEGY_NUMACTL; }
|
|
1391
1433
|
else { throw std::invalid_argument("invalid value"); }
|
|
1392
1434
|
}
|
|
1393
1435
|
).set_env("LLAMA_ARG_NUMA"));
|
|
1394
|
-
add_opt(
|
|
1436
|
+
add_opt(common_arg(
|
|
1395
1437
|
{"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N",
|
|
1396
1438
|
"number of layers to store in VRAM",
|
|
1397
|
-
[](
|
|
1439
|
+
[](common_params & params, int value) {
|
|
1398
1440
|
params.n_gpu_layers = value;
|
|
1399
1441
|
if (!llama_supports_gpu_offload()) {
|
|
1400
1442
|
fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers option will be ignored\n");
|
|
@@ -1402,10 +1444,10 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
|
|
|
1402
1444
|
}
|
|
1403
1445
|
}
|
|
1404
1446
|
).set_env("LLAMA_ARG_N_GPU_LAYERS"));
|
|
1405
|
-
add_opt(
|
|
1447
|
+
add_opt(common_arg(
|
|
1406
1448
|
{"-ngld", "--gpu-layers-draft", "--n-gpu-layers-draft"}, "N",
|
|
1407
1449
|
"number of layers to store in VRAM for the draft model",
|
|
1408
|
-
[](
|
|
1450
|
+
[](common_params & params, int value) {
|
|
1409
1451
|
params.n_gpu_layers_draft = value;
|
|
1410
1452
|
if (!llama_supports_gpu_offload()) {
|
|
1411
1453
|
fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers-draft option will be ignored\n");
|
|
@@ -1413,13 +1455,13 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
|
|
|
1413
1455
|
}
|
|
1414
1456
|
}
|
|
1415
1457
|
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
|
1416
|
-
add_opt(
|
|
1458
|
+
add_opt(common_arg(
|
|
1417
1459
|
{"-sm", "--split-mode"}, "{none,layer,row}",
|
|
1418
1460
|
"how to split the model across multiple GPUs, one of:\n"
|
|
1419
1461
|
"- none: use one GPU only\n"
|
|
1420
1462
|
"- layer (default): split layers and KV across GPUs\n"
|
|
1421
1463
|
"- row: split rows across GPUs",
|
|
1422
|
-
[](
|
|
1464
|
+
[](common_params & params, const std::string & value) {
|
|
1423
1465
|
std::string arg_next = value;
|
|
1424
1466
|
if (arg_next == "none") {
|
|
1425
1467
|
params.split_mode = LLAMA_SPLIT_MODE_NONE;
|
|
@@ -1439,10 +1481,10 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
|
|
|
1439
1481
|
}
|
|
1440
1482
|
}
|
|
1441
1483
|
).set_env("LLAMA_ARG_SPLIT_MODE"));
|
|
1442
|
-
add_opt(
|
|
1484
|
+
add_opt(common_arg(
|
|
1443
1485
|
{"-ts", "--tensor-split"}, "N0,N1,N2,...",
|
|
1444
1486
|
"fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1",
|
|
1445
|
-
[](
|
|
1487
|
+
[](common_params & params, const std::string & value) {
|
|
1446
1488
|
std::string arg_next = value;
|
|
1447
1489
|
|
|
1448
1490
|
// split string by , and /
|
|
@@ -1451,7 +1493,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
|
|
|
1451
1493
|
std::vector<std::string> split_arg{ it, {} };
|
|
1452
1494
|
if (split_arg.size() >= llama_max_devices()) {
|
|
1453
1495
|
throw std::invalid_argument(
|
|
1454
|
-
|
|
1496
|
+
string_format("got %d input configs, but system only has %d devices", (int)split_arg.size(), (int)llama_max_devices())
|
|
1455
1497
|
);
|
|
1456
1498
|
}
|
|
1457
1499
|
for (size_t i = 0; i < llama_max_devices(); ++i) {
|
|
@@ -1466,315 +1508,315 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
|
|
|
1466
1508
|
}
|
|
1467
1509
|
}
|
|
1468
1510
|
).set_env("LLAMA_ARG_TENSOR_SPLIT"));
|
|
1469
|
-
add_opt(
|
|
1511
|
+
add_opt(common_arg(
|
|
1470
1512
|
{"-mg", "--main-gpu"}, "INDEX",
|
|
1471
|
-
|
|
1472
|
-
[](
|
|
1513
|
+
string_format("the GPU to use for the model (with split-mode = none), or for intermediate results and KV (with split-mode = row) (default: %d)", params.main_gpu),
|
|
1514
|
+
[](common_params & params, int value) {
|
|
1473
1515
|
params.main_gpu = value;
|
|
1474
1516
|
if (!llama_supports_gpu_offload()) {
|
|
1475
1517
|
fprintf(stderr, "warning: llama.cpp was compiled without support for GPU offload. Setting the main GPU has no effect.\n");
|
|
1476
1518
|
}
|
|
1477
1519
|
}
|
|
1478
1520
|
).set_env("LLAMA_ARG_MAIN_GPU"));
|
|
1479
|
-
add_opt(
|
|
1521
|
+
add_opt(common_arg(
|
|
1480
1522
|
{"--check-tensors"},
|
|
1481
|
-
|
|
1482
|
-
[](
|
|
1523
|
+
string_format("check model tensor data for invalid values (default: %s)", params.check_tensors ? "true" : "false"),
|
|
1524
|
+
[](common_params & params) {
|
|
1483
1525
|
params.check_tensors = true;
|
|
1484
1526
|
}
|
|
1485
1527
|
));
|
|
1486
|
-
add_opt(
|
|
1528
|
+
add_opt(common_arg(
|
|
1487
1529
|
{"--override-kv"}, "KEY=TYPE:VALUE",
|
|
1488
1530
|
"advanced option to override model metadata by key. may be specified multiple times.\n"
|
|
1489
1531
|
"types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false",
|
|
1490
|
-
[](
|
|
1532
|
+
[](common_params & params, const std::string & value) {
|
|
1491
1533
|
if (!string_parse_kv_override(value.c_str(), params.kv_overrides)) {
|
|
1492
|
-
throw std::runtime_error(
|
|
1534
|
+
throw std::runtime_error(string_format("error: Invalid type for KV override: %s\n", value.c_str()));
|
|
1493
1535
|
}
|
|
1494
1536
|
}
|
|
1495
1537
|
));
|
|
1496
|
-
add_opt(
|
|
1538
|
+
add_opt(common_arg(
|
|
1497
1539
|
{"--lora"}, "FNAME",
|
|
1498
1540
|
"path to LoRA adapter (can be repeated to use multiple adapters)",
|
|
1499
|
-
[](
|
|
1541
|
+
[](common_params & params, const std::string & value) {
|
|
1500
1542
|
params.lora_adapters.push_back({ std::string(value), 1.0 });
|
|
1501
1543
|
}
|
|
1502
1544
|
// we define this arg on both COMMON and EXPORT_LORA, so when showing help message of export-lora, it will be categorized as "example-specific" arg
|
|
1503
1545
|
).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}));
|
|
1504
|
-
add_opt(
|
|
1546
|
+
add_opt(common_arg(
|
|
1505
1547
|
{"--lora-scaled"}, "FNAME", "SCALE",
|
|
1506
1548
|
"path to LoRA adapter with user defined scaling (can be repeated to use multiple adapters)",
|
|
1507
|
-
[](
|
|
1549
|
+
[](common_params & params, const std::string & fname, const std::string & scale) {
|
|
1508
1550
|
params.lora_adapters.push_back({ fname, std::stof(scale) });
|
|
1509
1551
|
}
|
|
1510
1552
|
// we define this arg on both COMMON and EXPORT_LORA, so when showing help message of export-lora, it will be categorized as "example-specific" arg
|
|
1511
1553
|
).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}));
|
|
1512
|
-
add_opt(
|
|
1554
|
+
add_opt(common_arg(
|
|
1513
1555
|
{"--control-vector"}, "FNAME",
|
|
1514
1556
|
"add a control vector\nnote: this argument can be repeated to add multiple control vectors",
|
|
1515
|
-
[](
|
|
1557
|
+
[](common_params & params, const std::string & value) {
|
|
1516
1558
|
params.control_vectors.push_back({ 1.0f, value, });
|
|
1517
1559
|
}
|
|
1518
1560
|
));
|
|
1519
|
-
add_opt(
|
|
1561
|
+
add_opt(common_arg(
|
|
1520
1562
|
{"--control-vector-scaled"}, "FNAME", "SCALE",
|
|
1521
1563
|
"add a control vector with user defined scaling SCALE\n"
|
|
1522
1564
|
"note: this argument can be repeated to add multiple scaled control vectors",
|
|
1523
|
-
[](
|
|
1565
|
+
[](common_params & params, const std::string & fname, const std::string & scale) {
|
|
1524
1566
|
params.control_vectors.push_back({ std::stof(scale), fname });
|
|
1525
1567
|
}
|
|
1526
1568
|
));
|
|
1527
|
-
add_opt(
|
|
1569
|
+
add_opt(common_arg(
|
|
1528
1570
|
{"--control-vector-layer-range"}, "START", "END",
|
|
1529
1571
|
"layer range to apply the control vector(s) to, start and end inclusive",
|
|
1530
|
-
[](
|
|
1572
|
+
[](common_params & params, const std::string & start, const std::string & end) {
|
|
1531
1573
|
params.control_vector_layer_start = std::stoi(start);
|
|
1532
1574
|
params.control_vector_layer_end = std::stoi(end);
|
|
1533
1575
|
}
|
|
1534
1576
|
));
|
|
1535
|
-
add_opt(
|
|
1577
|
+
add_opt(common_arg(
|
|
1536
1578
|
{"-a", "--alias"}, "STRING",
|
|
1537
1579
|
"set alias for model name (to be used by REST API)",
|
|
1538
|
-
[](
|
|
1580
|
+
[](common_params & params, const std::string & value) {
|
|
1539
1581
|
params.model_alias = value;
|
|
1540
1582
|
}
|
|
1541
1583
|
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ALIAS"));
|
|
1542
|
-
add_opt(
|
|
1584
|
+
add_opt(common_arg(
|
|
1543
1585
|
{"-m", "--model"}, "FNAME",
|
|
1544
1586
|
ex == LLAMA_EXAMPLE_EXPORT_LORA
|
|
1545
1587
|
? std::string("model path from which to load base model")
|
|
1546
|
-
:
|
|
1588
|
+
: string_format(
|
|
1547
1589
|
"model path (default: `models/$filename` with filename from `--hf-file` "
|
|
1548
1590
|
"or `--model-url` if set, otherwise %s)", DEFAULT_MODEL_PATH
|
|
1549
1591
|
),
|
|
1550
|
-
[](
|
|
1592
|
+
[](common_params & params, const std::string & value) {
|
|
1551
1593
|
params.model = value;
|
|
1552
1594
|
}
|
|
1553
1595
|
).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}).set_env("LLAMA_ARG_MODEL"));
|
|
1554
|
-
add_opt(
|
|
1596
|
+
add_opt(common_arg(
|
|
1555
1597
|
{"-md", "--model-draft"}, "FNAME",
|
|
1556
1598
|
"draft model for speculative decoding (default: unused)",
|
|
1557
|
-
[](
|
|
1599
|
+
[](common_params & params, const std::string & value) {
|
|
1558
1600
|
params.model_draft = value;
|
|
1559
1601
|
}
|
|
1560
1602
|
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
|
1561
|
-
add_opt(
|
|
1603
|
+
add_opt(common_arg(
|
|
1562
1604
|
{"-mu", "--model-url"}, "MODEL_URL",
|
|
1563
1605
|
"model download url (default: unused)",
|
|
1564
|
-
[](
|
|
1606
|
+
[](common_params & params, const std::string & value) {
|
|
1565
1607
|
params.model_url = value;
|
|
1566
1608
|
}
|
|
1567
1609
|
).set_env("LLAMA_ARG_MODEL_URL"));
|
|
1568
|
-
add_opt(
|
|
1610
|
+
add_opt(common_arg(
|
|
1569
1611
|
{"-hfr", "--hf-repo"}, "REPO",
|
|
1570
1612
|
"Hugging Face model repository (default: unused)",
|
|
1571
|
-
[](
|
|
1613
|
+
[](common_params & params, const std::string & value) {
|
|
1572
1614
|
params.hf_repo = value;
|
|
1573
1615
|
}
|
|
1574
1616
|
).set_env("LLAMA_ARG_HF_REPO"));
|
|
1575
|
-
add_opt(
|
|
1617
|
+
add_opt(common_arg(
|
|
1576
1618
|
{"-hff", "--hf-file"}, "FILE",
|
|
1577
1619
|
"Hugging Face model file (default: unused)",
|
|
1578
|
-
[](
|
|
1620
|
+
[](common_params & params, const std::string & value) {
|
|
1579
1621
|
params.hf_file = value;
|
|
1580
1622
|
}
|
|
1581
1623
|
).set_env("LLAMA_ARG_HF_FILE"));
|
|
1582
|
-
add_opt(
|
|
1624
|
+
add_opt(common_arg(
|
|
1583
1625
|
{"-hft", "--hf-token"}, "TOKEN",
|
|
1584
1626
|
"Hugging Face access token (default: value from HF_TOKEN environment variable)",
|
|
1585
|
-
[](
|
|
1627
|
+
[](common_params & params, const std::string & value) {
|
|
1586
1628
|
params.hf_token = value;
|
|
1587
1629
|
}
|
|
1588
1630
|
).set_env("HF_TOKEN"));
|
|
1589
|
-
add_opt(
|
|
1631
|
+
add_opt(common_arg(
|
|
1590
1632
|
{"--context-file"}, "FNAME",
|
|
1591
1633
|
"file to load context from (repeat to specify multiple files)",
|
|
1592
|
-
[](
|
|
1634
|
+
[](common_params & params, const std::string & value) {
|
|
1593
1635
|
std::ifstream file(value, std::ios::binary);
|
|
1594
1636
|
if (!file) {
|
|
1595
|
-
throw std::runtime_error(
|
|
1637
|
+
throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str()));
|
|
1596
1638
|
}
|
|
1597
1639
|
params.context_files.push_back(value);
|
|
1598
1640
|
}
|
|
1599
1641
|
).set_examples({LLAMA_EXAMPLE_RETRIEVAL}));
|
|
1600
|
-
add_opt(
|
|
1642
|
+
add_opt(common_arg(
|
|
1601
1643
|
{"--chunk-size"}, "N",
|
|
1602
|
-
|
|
1603
|
-
[](
|
|
1644
|
+
string_format("minimum length of embedded text chunks (default: %d)", params.chunk_size),
|
|
1645
|
+
[](common_params & params, int value) {
|
|
1604
1646
|
params.chunk_size = value;
|
|
1605
1647
|
}
|
|
1606
1648
|
).set_examples({LLAMA_EXAMPLE_RETRIEVAL}));
|
|
1607
|
-
add_opt(
|
|
1649
|
+
add_opt(common_arg(
|
|
1608
1650
|
{"--chunk-separator"}, "STRING",
|
|
1609
|
-
|
|
1610
|
-
[](
|
|
1651
|
+
string_format("separator between chunks (default: '%s')", params.chunk_separator.c_str()),
|
|
1652
|
+
[](common_params & params, const std::string & value) {
|
|
1611
1653
|
params.chunk_separator = value;
|
|
1612
1654
|
}
|
|
1613
1655
|
).set_examples({LLAMA_EXAMPLE_RETRIEVAL}));
|
|
1614
|
-
add_opt(
|
|
1656
|
+
add_opt(common_arg(
|
|
1615
1657
|
{"--junk"}, "N",
|
|
1616
|
-
|
|
1617
|
-
[](
|
|
1658
|
+
string_format("number of times to repeat the junk text (default: %d)", params.n_junk),
|
|
1659
|
+
[](common_params & params, int value) {
|
|
1618
1660
|
params.n_junk = value;
|
|
1619
1661
|
}
|
|
1620
1662
|
).set_examples({LLAMA_EXAMPLE_PASSKEY}));
|
|
1621
|
-
add_opt(
|
|
1663
|
+
add_opt(common_arg(
|
|
1622
1664
|
{"--pos"}, "N",
|
|
1623
|
-
|
|
1624
|
-
[](
|
|
1665
|
+
string_format("position of the passkey in the junk text (default: %d)", params.i_pos),
|
|
1666
|
+
[](common_params & params, int value) {
|
|
1625
1667
|
params.i_pos = value;
|
|
1626
1668
|
}
|
|
1627
1669
|
).set_examples({LLAMA_EXAMPLE_PASSKEY}));
|
|
1628
|
-
add_opt(
|
|
1670
|
+
add_opt(common_arg(
|
|
1629
1671
|
{"-o", "--output", "--output-file"}, "FNAME",
|
|
1630
|
-
|
|
1672
|
+
string_format("output file (default: '%s')",
|
|
1631
1673
|
ex == LLAMA_EXAMPLE_EXPORT_LORA
|
|
1632
1674
|
? params.lora_outfile.c_str()
|
|
1633
1675
|
: ex == LLAMA_EXAMPLE_CVECTOR_GENERATOR
|
|
1634
1676
|
? params.cvector_outfile.c_str()
|
|
1635
1677
|
: params.out_file.c_str()),
|
|
1636
|
-
[](
|
|
1678
|
+
[](common_params & params, const std::string & value) {
|
|
1637
1679
|
params.out_file = value;
|
|
1638
1680
|
params.cvector_outfile = value;
|
|
1639
1681
|
params.lora_outfile = value;
|
|
1640
1682
|
}
|
|
1641
1683
|
).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA}));
|
|
1642
|
-
add_opt(
|
|
1684
|
+
add_opt(common_arg(
|
|
1643
1685
|
{"-ofreq", "--output-frequency"}, "N",
|
|
1644
|
-
|
|
1645
|
-
[](
|
|
1686
|
+
string_format("output the imatrix every N iterations (default: %d)", params.n_out_freq),
|
|
1687
|
+
[](common_params & params, int value) {
|
|
1646
1688
|
params.n_out_freq = value;
|
|
1647
1689
|
}
|
|
1648
1690
|
).set_examples({LLAMA_EXAMPLE_IMATRIX}));
|
|
1649
|
-
add_opt(
|
|
1691
|
+
add_opt(common_arg(
|
|
1650
1692
|
{"--save-frequency"}, "N",
|
|
1651
|
-
|
|
1652
|
-
[](
|
|
1693
|
+
string_format("save an imatrix copy every N iterations (default: %d)", params.n_save_freq),
|
|
1694
|
+
[](common_params & params, int value) {
|
|
1653
1695
|
params.n_save_freq = value;
|
|
1654
1696
|
}
|
|
1655
1697
|
).set_examples({LLAMA_EXAMPLE_IMATRIX}));
|
|
1656
|
-
add_opt(
|
|
1698
|
+
add_opt(common_arg(
|
|
1657
1699
|
{"--process-output"},
|
|
1658
|
-
|
|
1659
|
-
[](
|
|
1700
|
+
string_format("collect data for the output tensor (default: %s)", params.process_output ? "true" : "false"),
|
|
1701
|
+
[](common_params & params) {
|
|
1660
1702
|
params.process_output = true;
|
|
1661
1703
|
}
|
|
1662
1704
|
).set_examples({LLAMA_EXAMPLE_IMATRIX}));
|
|
1663
|
-
add_opt(
|
|
1705
|
+
add_opt(common_arg(
|
|
1664
1706
|
{"--no-ppl"},
|
|
1665
|
-
|
|
1666
|
-
[](
|
|
1707
|
+
string_format("do not compute perplexity (default: %s)", params.compute_ppl ? "true" : "false"),
|
|
1708
|
+
[](common_params & params) {
|
|
1667
1709
|
params.compute_ppl = false;
|
|
1668
1710
|
}
|
|
1669
1711
|
).set_examples({LLAMA_EXAMPLE_IMATRIX}));
|
|
1670
|
-
add_opt(
|
|
1712
|
+
add_opt(common_arg(
|
|
1671
1713
|
{"--chunk", "--from-chunk"}, "N",
|
|
1672
|
-
|
|
1673
|
-
[](
|
|
1714
|
+
string_format("start processing the input from chunk N (default: %d)", params.i_chunk),
|
|
1715
|
+
[](common_params & params, int value) {
|
|
1674
1716
|
params.i_chunk = value;
|
|
1675
1717
|
}
|
|
1676
1718
|
).set_examples({LLAMA_EXAMPLE_IMATRIX}));
|
|
1677
|
-
add_opt(
|
|
1719
|
+
add_opt(common_arg(
|
|
1678
1720
|
{"-pps"},
|
|
1679
|
-
|
|
1680
|
-
[](
|
|
1721
|
+
string_format("is the prompt shared across parallel sequences (default: %s)", params.is_pp_shared ? "true" : "false"),
|
|
1722
|
+
[](common_params & params) {
|
|
1681
1723
|
params.is_pp_shared = true;
|
|
1682
1724
|
}
|
|
1683
1725
|
).set_examples({LLAMA_EXAMPLE_BENCH}));
|
|
1684
|
-
add_opt(
|
|
1726
|
+
add_opt(common_arg(
|
|
1685
1727
|
{"-npp"}, "n0,n1,...",
|
|
1686
1728
|
"number of prompt tokens",
|
|
1687
|
-
[](
|
|
1729
|
+
[](common_params & params, const std::string & value) {
|
|
1688
1730
|
auto p = string_split<int>(value, ',');
|
|
1689
1731
|
params.n_pp.insert(params.n_pp.end(), p.begin(), p.end());
|
|
1690
1732
|
}
|
|
1691
1733
|
).set_examples({LLAMA_EXAMPLE_BENCH}));
|
|
1692
|
-
add_opt(
|
|
1734
|
+
add_opt(common_arg(
|
|
1693
1735
|
{"-ntg"}, "n0,n1,...",
|
|
1694
1736
|
"number of text generation tokens",
|
|
1695
|
-
[](
|
|
1737
|
+
[](common_params & params, const std::string & value) {
|
|
1696
1738
|
auto p = string_split<int>(value, ',');
|
|
1697
1739
|
params.n_tg.insert(params.n_tg.end(), p.begin(), p.end());
|
|
1698
1740
|
}
|
|
1699
1741
|
).set_examples({LLAMA_EXAMPLE_BENCH}));
|
|
1700
|
-
add_opt(
|
|
1742
|
+
add_opt(common_arg(
|
|
1701
1743
|
{"-npl"}, "n0,n1,...",
|
|
1702
1744
|
"number of parallel prompts",
|
|
1703
|
-
[](
|
|
1745
|
+
[](common_params & params, const std::string & value) {
|
|
1704
1746
|
auto p = string_split<int>(value, ',');
|
|
1705
1747
|
params.n_pl.insert(params.n_pl.end(), p.begin(), p.end());
|
|
1706
1748
|
}
|
|
1707
1749
|
).set_examples({LLAMA_EXAMPLE_BENCH}));
|
|
1708
|
-
add_opt(
|
|
1750
|
+
add_opt(common_arg(
|
|
1709
1751
|
{"--embd-normalize"}, "N",
|
|
1710
|
-
|
|
1711
|
-
[](
|
|
1752
|
+
string_format("normalisation for embeddings (default: %d) (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)", params.embd_normalize),
|
|
1753
|
+
[](common_params & params, int value) {
|
|
1712
1754
|
params.embd_normalize = value;
|
|
1713
1755
|
}
|
|
1714
1756
|
).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
|
|
1715
|
-
add_opt(
|
|
1757
|
+
add_opt(common_arg(
|
|
1716
1758
|
{"--embd-output-format"}, "FORMAT",
|
|
1717
1759
|
"empty = default, \"array\" = [[],[]...], \"json\" = openai style, \"json+\" = same \"json\" + cosine similarity matrix",
|
|
1718
|
-
[](
|
|
1760
|
+
[](common_params & params, const std::string & value) {
|
|
1719
1761
|
params.embd_out = value;
|
|
1720
1762
|
}
|
|
1721
1763
|
).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
|
|
1722
|
-
add_opt(
|
|
1764
|
+
add_opt(common_arg(
|
|
1723
1765
|
{"--embd-separator"}, "STRING",
|
|
1724
|
-
"separator of
|
|
1725
|
-
[](
|
|
1766
|
+
"separator of embeddings (default \\n) for example \"<#sep#>\"",
|
|
1767
|
+
[](common_params & params, const std::string & value) {
|
|
1726
1768
|
params.embd_sep = value;
|
|
1727
1769
|
}
|
|
1728
1770
|
).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
|
|
1729
|
-
add_opt(
|
|
1771
|
+
add_opt(common_arg(
|
|
1730
1772
|
{"--host"}, "HOST",
|
|
1731
|
-
|
|
1732
|
-
[](
|
|
1773
|
+
string_format("ip address to listen (default: %s)", params.hostname.c_str()),
|
|
1774
|
+
[](common_params & params, const std::string & value) {
|
|
1733
1775
|
params.hostname = value;
|
|
1734
1776
|
}
|
|
1735
1777
|
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_HOST"));
|
|
1736
|
-
add_opt(
|
|
1778
|
+
add_opt(common_arg(
|
|
1737
1779
|
{"--port"}, "PORT",
|
|
1738
|
-
|
|
1739
|
-
[](
|
|
1780
|
+
string_format("port to listen (default: %d)", params.port),
|
|
1781
|
+
[](common_params & params, int value) {
|
|
1740
1782
|
params.port = value;
|
|
1741
1783
|
}
|
|
1742
1784
|
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_PORT"));
|
|
1743
|
-
add_opt(
|
|
1785
|
+
add_opt(common_arg(
|
|
1744
1786
|
{"--path"}, "PATH",
|
|
1745
|
-
|
|
1746
|
-
[](
|
|
1787
|
+
string_format("path to serve static files from (default: %s)", params.public_path.c_str()),
|
|
1788
|
+
[](common_params & params, const std::string & value) {
|
|
1747
1789
|
params.public_path = value;
|
|
1748
1790
|
}
|
|
1749
1791
|
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_STATIC_PATH"));
|
|
1750
|
-
add_opt(
|
|
1792
|
+
add_opt(common_arg(
|
|
1751
1793
|
{"--embedding", "--embeddings"},
|
|
1752
|
-
|
|
1753
|
-
[](
|
|
1794
|
+
string_format("restrict to only support embedding use case; use only with dedicated embedding models (default: %s)", params.embedding ? "enabled" : "disabled"),
|
|
1795
|
+
[](common_params & params) {
|
|
1754
1796
|
params.embedding = true;
|
|
1755
1797
|
}
|
|
1756
1798
|
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_EMBEDDINGS"));
|
|
1757
|
-
add_opt(
|
|
1799
|
+
add_opt(common_arg(
|
|
1758
1800
|
{"--reranking", "--rerank"},
|
|
1759
|
-
|
|
1760
|
-
[](
|
|
1801
|
+
string_format("enable reranking endpoint on server (default: %s)", params.reranking ? "enabled" : "disabled"),
|
|
1802
|
+
[](common_params & params) {
|
|
1761
1803
|
params.reranking = true;
|
|
1762
1804
|
}
|
|
1763
1805
|
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_RERANKING"));
|
|
1764
|
-
add_opt(
|
|
1806
|
+
add_opt(common_arg(
|
|
1765
1807
|
{"--api-key"}, "KEY",
|
|
1766
1808
|
"API key to use for authentication (default: none)",
|
|
1767
|
-
[](
|
|
1809
|
+
[](common_params & params, const std::string & value) {
|
|
1768
1810
|
params.api_keys.push_back(value);
|
|
1769
1811
|
}
|
|
1770
1812
|
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_API_KEY"));
|
|
1771
|
-
add_opt(
|
|
1813
|
+
add_opt(common_arg(
|
|
1772
1814
|
{"--api-key-file"}, "FNAME",
|
|
1773
1815
|
"path to file containing API keys (default: none)",
|
|
1774
|
-
[](
|
|
1816
|
+
[](common_params & params, const std::string & value) {
|
|
1775
1817
|
std::ifstream key_file(value);
|
|
1776
1818
|
if (!key_file) {
|
|
1777
|
-
throw std::runtime_error(
|
|
1819
|
+
throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str()));
|
|
1778
1820
|
}
|
|
1779
1821
|
std::string key;
|
|
1780
1822
|
while (std::getline(key_file, key)) {
|
|
@@ -1785,70 +1827,74 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
|
|
|
1785
1827
|
key_file.close();
|
|
1786
1828
|
}
|
|
1787
1829
|
).set_examples({LLAMA_EXAMPLE_SERVER}));
|
|
1788
|
-
add_opt(
|
|
1830
|
+
add_opt(common_arg(
|
|
1789
1831
|
{"--ssl-key-file"}, "FNAME",
|
|
1790
1832
|
"path to file a PEM-encoded SSL private key",
|
|
1791
|
-
[](
|
|
1833
|
+
[](common_params & params, const std::string & value) {
|
|
1792
1834
|
params.ssl_file_key = value;
|
|
1793
1835
|
}
|
|
1794
1836
|
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_SSL_KEY_FILE"));
|
|
1795
|
-
add_opt(
|
|
1837
|
+
add_opt(common_arg(
|
|
1796
1838
|
{"--ssl-cert-file"}, "FNAME",
|
|
1797
1839
|
"path to file a PEM-encoded SSL certificate",
|
|
1798
|
-
[](
|
|
1840
|
+
[](common_params & params, const std::string & value) {
|
|
1799
1841
|
params.ssl_file_cert = value;
|
|
1800
1842
|
}
|
|
1801
1843
|
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_SSL_CERT_FILE"));
|
|
1802
|
-
add_opt(
|
|
1844
|
+
add_opt(common_arg(
|
|
1803
1845
|
{"-to", "--timeout"}, "N",
|
|
1804
|
-
|
|
1805
|
-
[](
|
|
1846
|
+
string_format("server read/write timeout in seconds (default: %d)", params.timeout_read),
|
|
1847
|
+
[](common_params & params, int value) {
|
|
1806
1848
|
params.timeout_read = value;
|
|
1807
1849
|
params.timeout_write = value;
|
|
1808
1850
|
}
|
|
1809
1851
|
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_TIMEOUT"));
|
|
1810
|
-
add_opt(
|
|
1852
|
+
add_opt(common_arg(
|
|
1811
1853
|
{"--threads-http"}, "N",
|
|
1812
|
-
|
|
1813
|
-
[](
|
|
1854
|
+
string_format("number of threads used to process HTTP requests (default: %d)", params.n_threads_http),
|
|
1855
|
+
[](common_params & params, int value) {
|
|
1814
1856
|
params.n_threads_http = value;
|
|
1815
1857
|
}
|
|
1816
1858
|
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_THREADS_HTTP"));
|
|
1817
|
-
add_opt(
|
|
1818
|
-
{"
|
|
1819
|
-
"
|
|
1820
|
-
[](
|
|
1821
|
-
|
|
1822
|
-
|
|
1823
|
-
|
|
1824
|
-
|
|
1825
|
-
std::string system_prompt;
|
|
1826
|
-
std::copy(
|
|
1827
|
-
std::istreambuf_iterator<char>(file),
|
|
1828
|
-
std::istreambuf_iterator<char>(),
|
|
1829
|
-
std::back_inserter(system_prompt)
|
|
1830
|
-
);
|
|
1831
|
-
params.system_prompt = system_prompt;
|
|
1832
|
-
}
|
|
1833
|
-
).set_examples({LLAMA_EXAMPLE_SERVER}));
|
|
1834
|
-
add_opt(llama_arg(
|
|
1859
|
+
add_opt(common_arg(
|
|
1860
|
+
{"--cache-reuse"}, "N",
|
|
1861
|
+
string_format("min chunk size to attempt reusing from the cache via KV shifting (default: %d)", params.n_cache_reuse),
|
|
1862
|
+
[](common_params & params, int value) {
|
|
1863
|
+
params.n_cache_reuse = value;
|
|
1864
|
+
}
|
|
1865
|
+
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CACHE_REUSE"));
|
|
1866
|
+
add_opt(common_arg(
|
|
1835
1867
|
{"--metrics"},
|
|
1836
|
-
|
|
1837
|
-
[](
|
|
1868
|
+
string_format("enable prometheus compatible metrics endpoint (default: %s)", params.endpoint_metrics ? "enabled" : "disabled"),
|
|
1869
|
+
[](common_params & params) {
|
|
1838
1870
|
params.endpoint_metrics = true;
|
|
1839
1871
|
}
|
|
1840
1872
|
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_METRICS"));
|
|
1841
|
-
add_opt(
|
|
1873
|
+
add_opt(common_arg(
|
|
1874
|
+
{"--slots"},
|
|
1875
|
+
string_format("enable slots monitoring endpoint (default: %s)", params.endpoint_slots ? "enabled" : "disabled"),
|
|
1876
|
+
[](common_params & params) {
|
|
1877
|
+
params.endpoint_slots = true;
|
|
1878
|
+
}
|
|
1879
|
+
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_SLOTS"));
|
|
1880
|
+
add_opt(common_arg(
|
|
1881
|
+
{"--props"},
|
|
1882
|
+
string_format("enable changing global properties via POST /props (default: %s)", params.endpoint_props ? "enabled" : "disabled"),
|
|
1883
|
+
[](common_params & params) {
|
|
1884
|
+
params.endpoint_props = true;
|
|
1885
|
+
}
|
|
1886
|
+
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_PROPS"));
|
|
1887
|
+
add_opt(common_arg(
|
|
1842
1888
|
{"--no-slots"},
|
|
1843
|
-
|
|
1844
|
-
[](
|
|
1889
|
+
"disables slots monitoring endpoint",
|
|
1890
|
+
[](common_params & params) {
|
|
1845
1891
|
params.endpoint_slots = false;
|
|
1846
1892
|
}
|
|
1847
1893
|
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_ENDPOINT_SLOTS"));
|
|
1848
|
-
add_opt(
|
|
1894
|
+
add_opt(common_arg(
|
|
1849
1895
|
{"--slot-save-path"}, "PATH",
|
|
1850
1896
|
"path to save slot kv cache (default: disabled)",
|
|
1851
|
-
[](
|
|
1897
|
+
[](common_params & params, const std::string & value) {
|
|
1852
1898
|
params.slot_save_path = value;
|
|
1853
1899
|
// if doesn't end with DIRECTORY_SEPARATOR, add it
|
|
1854
1900
|
if (!params.slot_save_path.empty() && params.slot_save_path[params.slot_save_path.size() - 1] != DIRECTORY_SEPARATOR) {
|
|
@@ -1856,14 +1902,14 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
|
|
|
1856
1902
|
}
|
|
1857
1903
|
}
|
|
1858
1904
|
).set_examples({LLAMA_EXAMPLE_SERVER}));
|
|
1859
|
-
add_opt(
|
|
1905
|
+
add_opt(common_arg(
|
|
1860
1906
|
{"--chat-template"}, "JINJA_TEMPLATE",
|
|
1861
1907
|
"set custom jinja chat template (default: template taken from model's metadata)\n"
|
|
1862
1908
|
"if suffix/prefix are specified, template will be disabled\n"
|
|
1863
1909
|
"only commonly used templates are accepted:\nhttps://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template",
|
|
1864
|
-
[](
|
|
1865
|
-
if (!
|
|
1866
|
-
throw std::runtime_error(
|
|
1910
|
+
[](common_params & params, const std::string & value) {
|
|
1911
|
+
if (!common_chat_verify_template(value)) {
|
|
1912
|
+
throw std::runtime_error(string_format(
|
|
1867
1913
|
"error: the supplied chat template is not supported: %s\n"
|
|
1868
1914
|
"note: llama.cpp does not use jinja parser, we only support commonly used templates\n",
|
|
1869
1915
|
value.c_str()
|
|
@@ -1872,133 +1918,122 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
|
|
|
1872
1918
|
params.chat_template = value;
|
|
1873
1919
|
}
|
|
1874
1920
|
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CHAT_TEMPLATE"));
|
|
1875
|
-
add_opt(
|
|
1921
|
+
add_opt(common_arg(
|
|
1876
1922
|
{"-sps", "--slot-prompt-similarity"}, "SIMILARITY",
|
|
1877
|
-
|
|
1878
|
-
[](
|
|
1923
|
+
string_format("how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity),
|
|
1924
|
+
[](common_params & params, const std::string & value) {
|
|
1879
1925
|
params.slot_prompt_similarity = std::stof(value);
|
|
1880
1926
|
}
|
|
1881
1927
|
).set_examples({LLAMA_EXAMPLE_SERVER}));
|
|
1882
|
-
add_opt(
|
|
1928
|
+
add_opt(common_arg(
|
|
1883
1929
|
{"--lora-init-without-apply"},
|
|
1884
|
-
|
|
1885
|
-
[](
|
|
1930
|
+
string_format("load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: %s)", params.lora_init_without_apply ? "enabled" : "disabled"),
|
|
1931
|
+
[](common_params & params) {
|
|
1886
1932
|
params.lora_init_without_apply = true;
|
|
1887
1933
|
}
|
|
1888
1934
|
).set_examples({LLAMA_EXAMPLE_SERVER}));
|
|
1889
|
-
add_opt(
|
|
1935
|
+
add_opt(common_arg(
|
|
1890
1936
|
{"--simple-io"},
|
|
1891
1937
|
"use basic IO for better compatibility in subprocesses and limited consoles",
|
|
1892
|
-
[](
|
|
1938
|
+
[](common_params & params) {
|
|
1893
1939
|
params.simple_io = true;
|
|
1894
1940
|
}
|
|
1895
1941
|
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL}));
|
|
-    add_opt(
-        {"-ld", "--logdir"}, "LOGDIR",
-        "path under which to save YAML logs (no logging if unset)",
-        [](gpt_params & params, const std::string & value) {
-            params.logdir = value;
-
-            if (params.logdir.back() != DIRECTORY_SEPARATOR) {
-                params.logdir += DIRECTORY_SEPARATOR;
-            }
-        }
-    ));
-    add_opt(llama_arg(
+    add_opt(common_arg(
|
|
1908
1943
|
{"--positive-file"}, "FNAME",
|
|
1909
|
-
|
|
1910
|
-
[](
|
|
1944
|
+
string_format("positive prompts file, one prompt per line (default: '%s')", params.cvector_positive_file.c_str()),
|
|
1945
|
+
[](common_params & params, const std::string & value) {
|
|
1911
1946
|
params.cvector_positive_file = value;
|
|
1912
1947
|
}
|
|
1913
1948
|
).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR}));
|
|
1914
|
-
add_opt(
|
|
1949
|
+
add_opt(common_arg(
|
|
1915
1950
|
{"--negative-file"}, "FNAME",
|
|
1916
|
-
|
|
1917
|
-
[](
|
|
1951
|
+
string_format("negative prompts file, one prompt per line (default: '%s')", params.cvector_negative_file.c_str()),
|
|
1952
|
+
[](common_params & params, const std::string & value) {
|
|
1918
1953
|
params.cvector_negative_file = value;
|
|
1919
1954
|
}
|
|
1920
1955
|
).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR}));
|
|
1921
|
-
add_opt(
|
|
1956
|
+
add_opt(common_arg(
|
|
1922
1957
|
{"--pca-batch"}, "N",
|
|
1923
|
-
|
|
1924
|
-
[](
|
|
1958
|
+
string_format("batch size used for PCA. Larger batch runs faster, but uses more memory (default: %d)", params.n_pca_batch),
|
|
1959
|
+
[](common_params & params, int value) {
|
|
1925
1960
|
params.n_pca_batch = value;
|
|
1926
1961
|
}
|
|
1927
1962
|
).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR}));
|
|
1928
|
-
add_opt(
|
|
1963
|
+
add_opt(common_arg(
|
|
1929
1964
|
{"--pca-iter"}, "N",
|
|
1930
|
-
|
|
1931
|
-
[](
|
|
1965
|
+
string_format("number of iterations used for PCA (default: %d)", params.n_pca_iterations),
|
|
1966
|
+
[](common_params & params, int value) {
|
|
1932
1967
|
params.n_pca_iterations = value;
|
|
1933
1968
|
}
|
|
1934
1969
|
).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR}));
|
|
1935
|
-
add_opt(
|
|
1970
|
+
add_opt(common_arg(
|
|
1936
1971
|
{"--method"}, "{pca, mean}",
|
|
1937
1972
|
"dimensionality reduction method to be used (default: pca)",
|
|
1938
|
-
[](
|
|
1973
|
+
[](common_params & params, const std::string & value) {
|
|
1939
1974
|
/**/ if (value == "pca") { params.cvector_dimre_method = DIMRE_METHOD_PCA; }
|
|
1940
1975
|
else if (value == "mean") { params.cvector_dimre_method = DIMRE_METHOD_MEAN; }
|
|
1941
1976
|
else { throw std::invalid_argument("invalid value"); }
|
|
1942
1977
|
}
|
|
1943
1978
|
).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR}));
|
|
1944
|
-
add_opt(
|
|
1979
|
+
add_opt(common_arg(
|
|
1945
1980
|
{"--output-format"}, "{md,jsonl}",
|
|
1946
1981
|
"output format for batched-bench results (default: md)",
|
|
1947
|
-
[](
|
|
1982
|
+
[](common_params & params, const std::string & value) {
|
|
1948
1983
|
/**/ if (value == "jsonl") { params.batched_bench_output_jsonl = true; }
|
|
1949
1984
|
else if (value == "md") { params.batched_bench_output_jsonl = false; }
|
|
1950
1985
|
else { std::invalid_argument("invalid value"); }
|
|
1951
1986
|
}
|
|
1952
1987
|
).set_examples({LLAMA_EXAMPLE_BENCH}));
|
|
1953
|
-
add_opt(
|
|
1988
|
+
add_opt(common_arg(
|
|
1954
1989
|
{"--log-disable"},
|
|
1955
1990
|
"Log disable",
|
|
1956
|
-
[](
|
|
1957
|
-
|
|
1991
|
+
[](common_params &) {
|
|
1992
|
+
common_log_pause(common_log_main());
|
|
1958
1993
|
}
|
|
1959
1994
|
));
|
|
1960
|
-
add_opt(
|
|
1995
|
+
add_opt(common_arg(
|
|
1961
1996
|
{"--log-file"}, "FNAME",
|
|
1962
1997
|
"Log to file",
|
|
1963
|
-
[](
|
|
1964
|
-
|
|
1998
|
+
[](common_params &, const std::string & value) {
|
|
1999
|
+
common_log_set_file(common_log_main(), value.c_str());
|
|
1965
2000
|
}
|
|
1966
2001
|
));
|
|
1967
|
-
add_opt(
|
|
2002
|
+
add_opt(common_arg(
|
|
1968
2003
|
{"--log-colors"},
|
|
1969
2004
|
"Enable colored logging",
|
|
1970
|
-
[](
|
|
1971
|
-
|
|
2005
|
+
[](common_params &) {
|
|
2006
|
+
common_log_set_colors(common_log_main(), true);
|
|
1972
2007
|
}
|
|
1973
2008
|
).set_env("LLAMA_LOG_COLORS"));
|
|
1974
|
-
add_opt(
|
|
2009
|
+
add_opt(common_arg(
|
|
1975
2010
|
{"-v", "--verbose", "--log-verbose"},
|
|
1976
2011
|
"Set verbosity level to infinity (i.e. log all messages, useful for debugging)",
|
|
1977
|
-
[](
|
|
2012
|
+
[](common_params & params) {
|
|
1978
2013
|
params.verbosity = INT_MAX;
|
|
1979
|
-
|
|
2014
|
+
common_log_set_verbosity_thold(INT_MAX);
|
|
1980
2015
|
}
|
|
1981
2016
|
));
|
|
1982
|
-
add_opt(
|
|
2017
|
+
add_opt(common_arg(
|
|
1983
2018
|
{"-lv", "--verbosity", "--log-verbosity"}, "N",
|
|
1984
2019
|
"Set the verbosity threshold. Messages with a higher verbosity will be ignored.",
|
|
1985
|
-
[](
|
|
2020
|
+
[](common_params & params, int value) {
|
|
1986
2021
|
params.verbosity = value;
|
|
1987
|
-
|
|
2022
|
+
common_log_set_verbosity_thold(value);
|
|
1988
2023
|
}
|
|
1989
2024
|
).set_env("LLAMA_LOG_VERBOSITY"));
|
|
1990
|
-
add_opt(
|
|
2025
|
+
add_opt(common_arg(
|
|
1991
2026
|
{"--log-prefix"},
|
|
1992
2027
|
"Enable prefx in log messages",
|
|
1993
|
-
[](
|
|
1994
|
-
|
|
2028
|
+
[](common_params &) {
|
|
2029
|
+
common_log_set_prefix(common_log_main(), true);
|
|
1995
2030
|
}
|
|
1996
2031
|
).set_env("LLAMA_LOG_PREFIX"));
|
|
1997
|
-
add_opt(
|
|
2032
|
+
add_opt(common_arg(
|
|
1998
2033
|
{"--log-timestamps"},
|
|
1999
2034
|
"Enable timestamps in log messages",
|
|
2000
|
-
[](
|
|
2001
|
-
|
|
2035
|
+
[](common_params &) {
|
|
2036
|
+
common_log_set_timestamps(common_log_main(), true);
|
|
2002
2037
|
}
|
|
2003
2038
|
).set_env("LLAMA_LOG_TIMESTAMPS"));
|
|
2004
2039
|
|
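Beyond the mechanical rename, this hunk drops the YAML -ld/--logdir option outright and keeps the three handler shapes used by the remaining registrations: (common_params &) for plain flags, (common_params &, const std::string &) for string values, and (common_params &, int) for numeric values. The logging flags now route through the common_log_* helpers acting on the process-wide logger returned by common_log_main(). As a minimal sketch of how an embedder could reproduce the effect of those flags programmatically, the snippet below uses only helpers visible in the hunk; the "log.h" include path and the configure_logging wrapper are assumptions made for illustration.

#include <string>

#include "log.h"  // assumed header exposing common_log_main() and the common_log_set_* helpers used above

// Minimal sketch (not part of the diff): replicate the CLI logging flags from embedding code.
static void configure_logging(const std::string & log_file, int verbosity, bool disable) {
    auto * log = common_log_main();                     // shared logger targeted by the flags above

    common_log_set_file      (log, log_file.c_str());   // --log-file FNAME
    common_log_set_colors    (log, true);               // --log-colors
    common_log_set_prefix    (log, true);               // --log-prefix
    common_log_set_timestamps(log, true);               // --log-timestamps

    common_log_set_verbosity_thold(verbosity);          // -lv / --log-verbosity N
    if (disable) {
        common_log_pause(log);                          // --log-disable
    }
}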