@fugood/llama.node 0.3.3 → 0.3.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +5 -0
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +18 -1
- package/package.json +1 -1
- package/src/EmbeddingWorker.cpp +15 -5
- package/src/EmbeddingWorker.h +2 -1
- package/src/LlamaCompletionWorker.cpp +1 -1
- package/src/LlamaContext.cpp +81 -18
- package/src/LlamaContext.h +2 -0
- package/src/llama.cpp/.github/workflows/build.yml +197 -159
- package/src/llama.cpp/.github/workflows/docker.yml +5 -8
- package/src/llama.cpp/.github/workflows/python-lint.yml +8 -1
- package/src/llama.cpp/.github/workflows/server.yml +21 -14
- package/src/llama.cpp/CMakeLists.txt +11 -6
- package/src/llama.cpp/Sources/llama/llama.h +4 -0
- package/src/llama.cpp/cmake/common.cmake +33 -0
- package/src/llama.cpp/cmake/x64-windows-llvm.cmake +11 -0
- package/src/llama.cpp/common/CMakeLists.txt +6 -2
- package/src/llama.cpp/common/arg.cpp +426 -245
- package/src/llama.cpp/common/common.cpp +143 -80
- package/src/llama.cpp/common/common.h +81 -24
- package/src/llama.cpp/common/sampling.cpp +53 -19
- package/src/llama.cpp/common/sampling.h +22 -1
- package/src/llama.cpp/common/speculative.cpp +274 -0
- package/src/llama.cpp/common/speculative.h +28 -0
- package/src/llama.cpp/docs/build.md +101 -148
- package/src/llama.cpp/examples/CMakeLists.txt +32 -13
- package/src/llama.cpp/examples/batched/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/batched/batched.cpp +5 -4
- package/src/llama.cpp/examples/batched-bench/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/convert-llama2c-to-ggml/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/cvector-generator/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/deprecation-warning/deprecation-warning.cpp +1 -1
- package/src/llama.cpp/examples/embedding/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/eval-callback/CMakeLists.txt +3 -2
- package/src/llama.cpp/examples/export-lora/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +4 -7
- package/src/llama.cpp/examples/gen-docs/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gguf/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gguf-hash/CMakeLists.txt +8 -1
- package/src/llama.cpp/examples/gguf-split/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +2 -2
- package/src/llama.cpp/examples/gritlm/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +1 -1
- package/src/llama.cpp/examples/imatrix/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +11 -2
- package/src/llama.cpp/examples/infill/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/infill/infill.cpp +1 -1
- package/src/llama.cpp/examples/llama-bench/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +405 -316
- package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
- package/src/llama.cpp/examples/llava/CMakeLists.txt +10 -3
- package/src/llama.cpp/examples/llava/clip.cpp +262 -66
- package/src/llama.cpp/examples/llava/clip.h +8 -2
- package/src/llama.cpp/examples/llava/llava-cli.cpp +1 -1
- package/src/llama.cpp/examples/llava/llava.cpp +46 -19
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +1 -1
- package/src/llama.cpp/examples/llava/qwen2vl-cli.cpp +581 -0
- package/src/llama.cpp/examples/lookahead/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +1 -1
- package/src/llama.cpp/examples/lookup/CMakeLists.txt +4 -4
- package/src/llama.cpp/examples/lookup/lookup-stats.cpp +2 -1
- package/src/llama.cpp/examples/lookup/lookup.cpp +2 -2
- package/src/llama.cpp/examples/main/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/main/main.cpp +9 -5
- package/src/llama.cpp/examples/main-cmake-pkg/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/parallel/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/parallel/parallel.cpp +1 -1
- package/src/llama.cpp/examples/passkey/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/perplexity/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/quantize/quantize.cpp +0 -3
- package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/retrieval/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/retrieval/retrieval.cpp +4 -4
- package/src/llama.cpp/examples/run/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/run/run.cpp +911 -0
- package/src/llama.cpp/examples/save-load-state/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +4 -4
- package/src/llama.cpp/examples/server/CMakeLists.txt +3 -7
- package/src/llama.cpp/examples/server/server.cpp +1758 -886
- package/src/llama.cpp/examples/server/tests/requirements.txt +2 -2
- package/src/llama.cpp/examples/server/utils.hpp +94 -304
- package/src/llama.cpp/examples/simple/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/simple/simple.cpp +4 -0
- package/src/llama.cpp/examples/simple-chat/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +3 -0
- package/src/llama.cpp/examples/speculative/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/speculative/speculative.cpp +16 -15
- package/src/llama.cpp/examples/speculative-simple/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +265 -0
- package/src/llama.cpp/examples/tokenize/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/tokenize/tokenize.cpp +1 -1
- package/src/llama.cpp/examples/tts/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/tts/tts.cpp +932 -0
- package/src/llama.cpp/ggml/CMakeLists.txt +46 -34
- package/src/llama.cpp/ggml/include/ggml-backend.h +16 -0
- package/src/llama.cpp/ggml/include/ggml-cpu.h +7 -49
- package/src/llama.cpp/ggml/include/ggml-opencl.h +26 -0
- package/src/llama.cpp/ggml/include/ggml.h +106 -24
- package/src/llama.cpp/ggml/src/CMakeLists.txt +73 -24
- package/src/llama.cpp/ggml/src/ggml-alloc.c +0 -1
- package/src/llama.cpp/ggml/src/ggml-backend-impl.h +51 -11
- package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +379 -22
- package/src/llama.cpp/ggml/src/ggml-backend.cpp +4 -4
- package/src/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +3 -7
- package/src/llama.cpp/ggml/src/ggml-blas/ggml-blas.cpp +5 -2
- package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +33 -3
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +456 -111
- package/src/llama.cpp/ggml/src/ggml-cann/common.h +6 -3
- package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +95 -35
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -5
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +22 -9
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +24 -13
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +23 -13
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +11 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +10 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +10 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +17 -0
- package/src/llama.cpp/ggml/src/ggml-common.h +42 -42
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +288 -213
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +220 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.h +8 -0
- package/src/llama.cpp/ggml/src/{ggml-amx → ggml-cpu/amx}/common.h +19 -22
- package/src/llama.cpp/ggml/src/{ggml-amx → ggml-cpu/amx}/mmq.cpp +93 -92
- package/src/llama.cpp/ggml/src/{ggml-amx → ggml-cpu/amx}/mmq.h +2 -9
- package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +323 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-aarch64.c → ggml-cpu-aarch64.cpp} +892 -190
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +2 -24
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-hbm.cpp +55 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-hbm.h +8 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +15 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +38 -25
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-traits.cpp +36 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-traits.h +38 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +552 -399
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +101 -136
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +2 -2
- package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +7 -10
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +8 -0
- package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +4 -6
- package/src/llama.cpp/ggml/src/ggml-impl.h +32 -11
- package/src/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +13 -9
- package/src/llama.cpp/ggml/src/ggml-kompute/ggml-kompute.cpp +131 -64
- package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +3 -6
- package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +39 -0
- package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +14 -7
- package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +147 -0
- package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +4004 -0
- package/src/llama.cpp/ggml/src/ggml-opt.cpp +67 -80
- package/src/llama.cpp/ggml/src/ggml-quants.c +0 -9
- package/src/llama.cpp/ggml/src/ggml-rpc/CMakeLists.txt +3 -5
- package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +5 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +13 -10
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +2 -11
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +1 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +2 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +5 -5
- package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +32 -13
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +80 -61
- package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +4 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +159 -114
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +3 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +6 -6
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +6 -20
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +4 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +8 -8
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +4 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +7 -7
- package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +1 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +4 -1
- package/src/llama.cpp/ggml/src/ggml-threading.h +4 -2
- package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +21 -7
- package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +1718 -399
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +3 -1
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +105 -31
- package/src/llama.cpp/ggml/src/ggml.c +367 -207
- package/src/llama.cpp/include/llama-cpp.h +25 -0
- package/src/llama.cpp/include/llama.h +26 -19
- package/src/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.out +46 -0
- package/src/llama.cpp/pocs/CMakeLists.txt +3 -1
- package/src/llama.cpp/pocs/vdot/CMakeLists.txt +2 -2
- package/src/llama.cpp/src/CMakeLists.txt +2 -7
- package/src/llama.cpp/src/llama-grammar.cpp +15 -15
- package/src/llama.cpp/src/llama-grammar.h +2 -5
- package/src/llama.cpp/src/llama-sampling.cpp +35 -90
- package/src/llama.cpp/src/llama-vocab.cpp +6 -1
- package/src/llama.cpp/src/llama.cpp +1748 -640
- package/src/llama.cpp/src/unicode.cpp +62 -51
- package/src/llama.cpp/src/unicode.h +9 -10
- package/src/llama.cpp/tests/CMakeLists.txt +48 -37
- package/src/llama.cpp/tests/test-arg-parser.cpp +2 -2
- package/src/llama.cpp/tests/test-backend-ops.cpp +140 -21
- package/src/llama.cpp/tests/test-chat-template.cpp +50 -4
- package/src/llama.cpp/tests/test-gguf.cpp +1303 -0
- package/src/llama.cpp/tests/test-grammar-integration.cpp +3 -6
- package/src/llama.cpp/tests/test-llama-grammar.cpp +2 -4
- package/src/llama.cpp/tests/test-quantize-fns.cpp +3 -3
- package/src/llama.cpp/tests/test-rope.cpp +61 -20
- package/src/llama.cpp/tests/test-sampling.cpp +2 -2
- package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +0 -72
- package/src/llama.cpp/.github/workflows/nix-ci.yml +0 -79
- package/src/llama.cpp/.github/workflows/nix-flake-update.yml +0 -22
- package/src/llama.cpp/.github/workflows/nix-publish-flake.yml +0 -36
- package/src/llama.cpp/ggml/include/ggml-amx.h +0 -25
- package/src/llama.cpp/ggml/src/ggml-aarch64.c +0 -129
- package/src/llama.cpp/ggml/src/ggml-aarch64.h +0 -19
- package/src/llama.cpp/ggml/src/ggml-amx/CMakeLists.txt +0 -107
- package/src/llama.cpp/ggml/src/ggml-amx/ggml-amx.cpp +0 -446
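
What follows is the file-level diff for `package/src/llama.cpp/common/arg.cpp` (+426 -245), the largest change under `common/`. Three themes run through it: sampling options move from `params.sparams` to `params.sampling`, the draft-model (speculative decoding) flags are regrouped under a new `params.speculative` struct at the end of the parser, and the KV-cache-type and offload-device arguments gain typed, validated parsing. Removed lines that the diff viewer truncated are marked with `…`.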
```diff
@@ -119,26 +119,63 @@ std::string common_arg::to_string() {
 // utils
 //
 
-static void common_params_handle_model_default(…
-…
+static void common_params_handle_model_default(
+        std::string & model,
+        std::string & model_url,
+        std::string & hf_repo,
+        std::string & hf_file) {
+    if (!hf_repo.empty()) {
         // short-hand to avoid specifying --hf-file -> default it to --model
-        if (…
-        if (…
+        if (hf_file.empty()) {
+            if (model.empty()) {
                 throw std::invalid_argument("error: --hf-repo requires either --hf-file or --model\n");
             }
-            …
-        } else if (…
-            …
-            …
-            …
-            …
-            …
+            hf_file = model;
+        } else if (model.empty()) {
+            // this is to avoid different repo having same file name, or same file name in different subdirs
+            std::string filename = hf_repo + "_" + hf_file;
+            // to make sure we don't have any slashes in the filename
+            string_replace_all(filename, "/", "_");
+            model = fs_get_cache_file(filename);
+        }
+    } else if (!model_url.empty()) {
+        if (model.empty()) {
+            auto f = string_split<std::string>(model_url, '#').front();
             f = string_split<std::string>(f, '?').front();
-            …
+            model = fs_get_cache_file(string_split<std::string>(f, '/').back());
         }
-    } else if (…
-        …
+    } else if (model.empty()) {
+        model = DEFAULT_MODEL_PATH;
+    }
+}
+
+const std::vector<ggml_type> kv_cache_types = {
+    GGML_TYPE_F32,
+    GGML_TYPE_F16,
+    GGML_TYPE_BF16,
+    GGML_TYPE_Q8_0,
+    GGML_TYPE_Q4_0,
+    GGML_TYPE_Q4_1,
+    GGML_TYPE_IQ4_NL,
+    GGML_TYPE_Q5_0,
+    GGML_TYPE_Q5_1,
+};
+
+static ggml_type kv_cache_type_from_str(const std::string & s) {
+    for (const auto & type : kv_cache_types) {
+        if (ggml_type_name(type) == s) {
+            return type;
+        }
+    }
+    throw std::runtime_error("Unsupported cache type: " + s);
+}
+
+static std::string get_all_kv_cache_types() {
+    std::ostringstream msg;
+    for (const auto & type : kv_cache_types) {
+        msg << ggml_type_name(type) << (&type == &kv_cache_types.back() ? "" : ", ");
     }
+    return msg.str();
 }
 
 //
```
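
The new KV-cache helpers above replace free-form strings for `-ctk`/`-ctv` with a validated `ggml_type`. Below is a minimal, hedged sketch of how that lookup behaves, not the authoritative implementation; the `main` harness and the reduced type list are illustrative, while `ggml_type_name` is the real ggml API that renders e.g. `GGML_TYPE_Q8_0` as `"q8_0"`:

```cpp
// Sketch: resolving a user-supplied cache-type string the way
// kv_cache_type_from_str does. Assumes ggml.h is on the include path.
#include <cstdio>
#include <stdexcept>
#include <string>
#include "ggml.h"

int main() {
    const std::string s = "q8_0"; // e.g. from `-ctk q8_0`
    for (ggml_type t : {GGML_TYPE_F16, GGML_TYPE_BF16, GGML_TYPE_Q8_0, GGML_TYPE_Q4_0}) {
        if (s == ggml_type_name(t)) {
            printf("resolved '%s' -> ggml_type %d\n", s.c_str(), (int) t);
            return 0;
        }
    }
    // unknown strings now fail loudly instead of being stored verbatim
    throw std::runtime_error("Unsupported cache type: " + s);
}
```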
```diff
@@ -233,16 +270,19 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
         }
     }
 
-    postprocess_cpu_params(params.cpuparams, …
+    postprocess_cpu_params(params.cpuparams, nullptr);
     postprocess_cpu_params(params.cpuparams_batch, &params.cpuparams);
-    …
-    postprocess_cpu_params(params.…
+
+    postprocess_cpu_params(params.speculative.cpuparams, &params.cpuparams);
+    postprocess_cpu_params(params.speculative.cpuparams_batch, &params.cpuparams_batch);
 
     if (params.prompt_cache_all && (params.interactive || params.interactive_first)) {
         throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n");
     }
 
-    …
+    // TODO: refactor model params in a common struct
+    common_params_handle_model_default(params.model, params.model_url, params.hf_repo, params.hf_file);
+    common_params_handle_model_default(params.vocoder.model, params.vocoder.model_url, params.vocoder.hf_repo, params.vocoder.hf_file);
 
     if (params.escape) {
         string_process_escapes(params.prompt);
```
```diff
@@ -251,7 +291,7 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
         for (auto & antiprompt : params.antiprompt) {
             string_process_escapes(antiprompt);
         }
-        for (auto & seq_breaker : params.…
+        for (auto & seq_breaker : params.sampling.dry_sequence_breakers) {
             string_process_escapes(seq_breaker);
         }
     }
```
```diff
@@ -297,6 +337,27 @@ static void common_params_print_usage(common_params_context & ctx_arg) {
     print_options(specific_options);
 }
 
+static std::vector<ggml_backend_dev_t> parse_device_list(const std::string & value) {
+    std::vector<ggml_backend_dev_t> devices;
+    auto dev_names = string_split<std::string>(value, ',');
+    if (dev_names.empty()) {
+        throw std::invalid_argument("no devices specified");
+    }
+    if (dev_names.size() == 1 && dev_names[0] == "none") {
+        devices.push_back(nullptr);
+    } else {
+        for (const auto & device : dev_names) {
+            auto * dev = ggml_backend_dev_by_name(device.c_str());
+            if (!dev || ggml_backend_dev_type(dev) != GGML_BACKEND_DEVICE_TYPE_GPU) {
+                throw std::invalid_argument(string_format("invalid device: %s", device.c_str()));
+            }
+            devices.push_back(dev);
+        }
+        devices.push_back(nullptr);
+    }
+    return devices;
+}
+
 bool common_params_parse(int argc, char ** argv, common_params & params, llama_example ex, void(*print_usage)(int, char **)) {
     auto ctx_arg = common_params_parser_init(params, ex, print_usage);
     const common_params params_org = ctx_arg.params; // the example can modify the default params
```
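
`parse_device_list` encodes a convention worth noting: the returned vector is always terminated with a `nullptr` sentinel, and the literal value `none` yields a list containing only the sentinel (offload disabled). A self-contained, hedged sketch of that convention, simplified to a single device name (the real code splits on commas and rejects non-GPU devices); `device_list_sketch` is a hypothetical name, `ggml_backend_dev_by_name` is the real ggml-backend API:

```cpp
// Sketch of the sentinel convention used by the new parse_device_list.
#include <string>
#include <vector>
#include "ggml-backend.h"

static std::vector<ggml_backend_dev_t> device_list_sketch(const std::string & value) {
    std::vector<ggml_backend_dev_t> devices;
    if (value != "none") {
        // real code resolves each comma-separated name via ggml_backend_dev_by_name()
        if (ggml_backend_dev_t dev = ggml_backend_dev_by_name(value.c_str())) {
            devices.push_back(dev);
        }
    }
    devices.push_back(nullptr); // nullptr sentinel, as in parse_device_list above
    return devices;
}
```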
```diff
@@ -322,14 +383,29 @@ bool common_params_parse(int argc, char ** argv, common_params & params, llama_e
     return true;
 }
 
+static std::string list_builtin_chat_templates() {
+    std::vector<const char *> supported_tmpl;
+    int32_t res = llama_chat_builtin_templates(nullptr, 0);
+    supported_tmpl.resize(res);
+    res = llama_chat_builtin_templates(supported_tmpl.data(), supported_tmpl.size());
+    std::ostringstream msg;
+    for (auto & tmpl : supported_tmpl) {
+        msg << tmpl << (&tmpl == &supported_tmpl.back() ? "" : ", ");
+    }
+    return msg.str();
+}
+
 common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **)) {
+    // load dynamic backends
+    ggml_backend_load_all();
+
     common_params_context ctx_arg(params);
     ctx_arg.print_usage = print_usage;
     ctx_arg.ex = ex;
 
     std::string sampler_type_chars;
     std::string sampler_type_names;
-    for (const auto & sampler : params.…
+    for (const auto & sampler : params.sampling.samplers) {
         sampler_type_chars += common_sampler_type_to_chr(sampler);
         sampler_type_names += common_sampler_type_to_str(sampler) + ";";
     }
```
```diff
@@ -407,26 +483,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             }
         }
     ));
-    add_opt(common_arg(
-        {"-td", "--threads-draft"}, "N",
-        "number of threads to use during generation (default: same as --threads)",
-        [](common_params & params, int value) {
-            params.draft_cpuparams.n_threads = value;
-            if (params.draft_cpuparams.n_threads <= 0) {
-                params.draft_cpuparams.n_threads = std::thread::hardware_concurrency();
-            }
-        }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
-    add_opt(common_arg(
-        {"-tbd", "--threads-batch-draft"}, "N",
-        "number of threads to use during batch and prompt processing (default: same as --threads-draft)",
-        [](common_params & params, int value) {
-            params.draft_cpuparams_batch.n_threads = value;
-            if (params.draft_cpuparams_batch.n_threads <= 0) {
-                params.draft_cpuparams_batch.n_threads = std::thread::hardware_concurrency();
-            }
-        }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
     add_opt(common_arg(
         {"-C", "--cpu-mask"}, "M",
         "CPU affinity mask: arbitrarily long hex. Complements cpu-range (default: \"\")",
```
```diff
@@ -515,108 +571,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.cpuparams_batch.poll = value;
         }
     ));
-    add_opt(common_arg(
-        {"-Cd", "--cpu-mask-draft"}, "M",
-        "Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask)",
-        [](common_params & params, const std::string & mask) {
-            params.draft_cpuparams.mask_valid = true;
-            if (!parse_cpu_mask(mask, params.draft_cpuparams.cpumask)) {
-                throw std::invalid_argument("invalid cpumask");
-            }
-        }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
-    add_opt(common_arg(
-        {"-Crd", "--cpu-range-draft"}, "lo-hi",
-        "Ranges of CPUs for affinity. Complements --cpu-mask-draft",
-        [](common_params & params, const std::string & range) {
-            params.draft_cpuparams.mask_valid = true;
-            if (!parse_cpu_range(range, params.draft_cpuparams.cpumask)) {
-                throw std::invalid_argument("invalid range");
-            }
-        }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
-    add_opt(common_arg(
-        {"--cpu-strict-draft"}, "<0|1>",
-        "Use strict CPU placement for draft model (default: same as --cpu-strict)",
-        [](common_params & params, int value) {
-            params.draft_cpuparams.strict_cpu = value;
-        }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
-    add_opt(common_arg(
-        {"--prio-draft"}, "N",
-        string_format("set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.draft_cpuparams.priority),
-        [](common_params & params, int prio) {
-            if (prio < 0 || prio > 3) {
-                throw std::invalid_argument("invalid value");
-            }
-            params.draft_cpuparams.priority = (enum ggml_sched_priority) prio;
-        }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
-    add_opt(common_arg(
-        {"--poll-draft"}, "<0|1>",
-        "Use polling to wait for draft model work (default: same as --poll])",
-        [](common_params & params, int value) {
-            params.draft_cpuparams.poll = value;
-        }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
-    add_opt(common_arg(
-        {"-Cbd", "--cpu-mask-batch-draft"}, "M",
-        "Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask)",
-        [](common_params & params, const std::string & mask) {
-            params.draft_cpuparams_batch.mask_valid = true;
-            if (!parse_cpu_mask(mask, params.draft_cpuparams_batch.cpumask)) {
-                throw std::invalid_argument("invalid cpumask");
-            }
-        }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
-    add_opt(common_arg(
-        {"-Crbd", "--cpu-range-batch-draft"}, "lo-hi",
-        "Ranges of CPUs for affinity. Complements --cpu-mask-draft-batch)",
-        [](common_params & params, const std::string & range) {
-            params.draft_cpuparams_batch.mask_valid = true;
-            if (!parse_cpu_range(range, params.draft_cpuparams_batch.cpumask)) {
-                throw std::invalid_argument("invalid cpumask");
-            }
-        }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
-    add_opt(common_arg(
-        {"--cpu-strict-batch-draft"}, "<0|1>",
-        "Use strict CPU placement for draft model (default: --cpu-strict-draft)",
-        [](common_params & params, int value) {
-            params.draft_cpuparams_batch.strict_cpu = value;
-        }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
-    add_opt(common_arg(
-        {"--prio-batch-draft"}, "N",
-        string_format("set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.draft_cpuparams_batch.priority),
-        [](common_params & params, int prio) {
-            if (prio < 0 || prio > 3) {
-                throw std::invalid_argument("invalid value");
-            }
-            params.draft_cpuparams_batch.priority = (enum ggml_sched_priority) prio;
-        }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
-    add_opt(common_arg(
-        {"--poll-batch-draft"}, "<0|1>",
-        "Use polling to wait for draft model work (default: --poll-draft)",
-        [](common_params & params, int value) {
-            params.draft_cpuparams_batch.poll = value;
-        }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
-    add_opt(common_arg(
-        {"--draft"}, "N",
-        string_format("number of tokens to draft for speculative decoding (default: %d)", params.n_draft),
-        [](common_params & params, int value) {
-            params.n_draft = value;
-        }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP}));
-    add_opt(common_arg(
-        {"-ps", "--p-split"}, "N",
-        string_format("speculative decoding split probability (default: %.1f)", (double)params.p_split),
-        [](common_params & params, const std::string & value) {
-            params.p_split = std::stof(value);
-        }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
     add_opt(common_arg(
         {"-lcs", "--lookup-cache-static"}, "FNAME",
         "path to static lookup cache to use for lookup decoding (not updated by generation)",
```
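
The draft-model CPU affinity options deleted above (and `-td`/`-tbd` from the previous hunk) are moved rather than dropped: the final hunk of this file re-adds them rewritten against `params.speculative.cpuparams` and `params.speculative.cpuparams_batch`. `--draft` and `-ps`/`--p-split` are also removed from this spot as part of the same regrouping.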
```diff
@@ -672,7 +626,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params) {
             params.ctx_shift = false;
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_CONTEXT_SHIFT"));
+    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY}).set_env("LLAMA_ARG_NO_CONTEXT_SHIFT"));
     add_opt(common_arg(
         {"--chunks"}, "N",
         string_format("max number of chunks to process (default: %d, -1 = all)", params.n_chunks),
```
```diff
@@ -701,7 +655,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         string_format("disable internal libllama performance timings (default: %s)", params.no_perf ? "true" : "false"),
         [](common_params & params) {
             params.no_perf = true;
-            params.…
+            params.sampling.no_perf = true;
         }
     ).set_env("LLAMA_ARG_NO_PERF"));
     add_opt(common_arg(
```
```diff
@@ -867,7 +821,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params) {
             params.warmup = false;
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN}));
+    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}));
     add_opt(common_arg(
         {"--spm-infill"},
         string_format(
```
```diff
@@ -883,155 +837,154 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         string_format("samplers that will be used for generation in the order, separated by \';\'\n(default: %s)", sampler_type_names.c_str()),
         [](common_params & params, const std::string & value) {
             const auto sampler_names = string_split<std::string>(value, ';');
-            params.…
+            params.sampling.samplers = common_sampler_types_from_names(sampler_names, true);
         }
     ).set_sparam());
     add_opt(common_arg(
         {"-s", "--seed"}, "SEED",
-        string_format("RNG seed (default: %d, use random seed for %d)", params.…
+        string_format("RNG seed (default: %d, use random seed for %d)", params.sampling.seed, LLAMA_DEFAULT_SEED),
         [](common_params & params, const std::string & value) {
-            params.…
+            params.sampling.seed = std::stoul(value);
         }
     ).set_sparam());
     add_opt(common_arg(
-        {"--sampling-seq"}, "SEQUENCE",
+        {"--sampling-seq", "--sampler-seq"}, "SEQUENCE",
         string_format("simplified sequence for samplers that will be used (default: %s)", sampler_type_chars.c_str()),
         [](common_params & params, const std::string & value) {
-            params.…
+            params.sampling.samplers = common_sampler_types_from_chars(value);
         }
     ).set_sparam());
     add_opt(common_arg(
         {"--ignore-eos"},
         "ignore end of stream token and continue generating (implies --logit-bias EOS-inf)",
         [](common_params & params) {
-            params.…
-        }
-    ).set_sparam());
-    add_opt(common_arg(
-        {"--penalize-nl"},
-        string_format("penalize newline tokens (default: %s)", params.sparams.penalize_nl ? "true" : "false"),
-        [](common_params & params) {
-            params.sparams.penalize_nl = true;
+            params.sampling.ignore_eos = true;
         }
     ).set_sparam());
     add_opt(common_arg(
         {"--temp"}, "N",
-        string_format("temperature (default: %.1f)", (double)params.…
+        string_format("temperature (default: %.1f)", (double)params.sampling.temp),
         [](common_params & params, const std::string & value) {
-            params.…
-            params.…
+            params.sampling.temp = std::stof(value);
+            params.sampling.temp = std::max(params.sampling.temp, 0.0f);
         }
     ).set_sparam());
     add_opt(common_arg(
         {"--top-k"}, "N",
-        string_format("top-k sampling (default: %d, 0 = disabled)", params.…
+        string_format("top-k sampling (default: %d, 0 = disabled)", params.sampling.top_k),
         [](common_params & params, int value) {
-            params.…
+            params.sampling.top_k = value;
         }
     ).set_sparam());
     add_opt(common_arg(
         {"--top-p"}, "N",
-        string_format("top-p sampling (default: %.1f, 1.0 = disabled)", (double)params.…
+        string_format("top-p sampling (default: %.1f, 1.0 = disabled)", (double)params.sampling.top_p),
         [](common_params & params, const std::string & value) {
-            params.…
+            params.sampling.top_p = std::stof(value);
         }
     ).set_sparam());
     add_opt(common_arg(
         {"--min-p"}, "N",
-        string_format("min-p sampling (default: %.1f, 0.0 = disabled)", (double)params.…
+        string_format("min-p sampling (default: %.1f, 0.0 = disabled)", (double)params.sampling.min_p),
         [](common_params & params, const std::string & value) {
-            params.…
+            params.sampling.min_p = std::stof(value);
         }
     ).set_sparam());
     add_opt(common_arg(
         {"--xtc-probability"}, "N",
-        string_format("xtc probability (default: %.1f, 0.0 = disabled)", (double)params.…
+        string_format("xtc probability (default: %.1f, 0.0 = disabled)", (double)params.sampling.xtc_probability),
         [](common_params & params, const std::string & value) {
-            params.…
+            params.sampling.xtc_probability = std::stof(value);
         }
     ).set_sparam());
     add_opt(common_arg(
         {"--xtc-threshold"}, "N",
-        string_format("xtc threshold (default: %.1f, 1.0 = disabled)", (double)params.…
+        string_format("xtc threshold (default: %.1f, 1.0 = disabled)", (double)params.sampling.xtc_threshold),
         [](common_params & params, const std::string & value) {
-            params.…
+            params.sampling.xtc_threshold = std::stof(value);
         }
     ).set_sparam());
     add_opt(common_arg(
         {"--typical"}, "N",
-        string_format("locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)", (double)params.…
+        string_format("locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)", (double)params.sampling.typ_p),
         [](common_params & params, const std::string & value) {
-            params.…
+            params.sampling.typ_p = std::stof(value);
         }
     ).set_sparam());
     add_opt(common_arg(
         {"--repeat-last-n"}, "N",
-        string_format("last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)", params.…
+        string_format("last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)", params.sampling.penalty_last_n),
         [](common_params & params, int value) {
-            …
-            …
+            if (value < -1) {
+                throw std::runtime_error(string_format("error: invalid repeat-last-n = %d\n", value));
+            }
+            params.sampling.penalty_last_n = value;
+            params.sampling.n_prev = std::max(params.sampling.n_prev, params.sampling.penalty_last_n);
         }
     ).set_sparam());
     add_opt(common_arg(
         {"--repeat-penalty"}, "N",
-        string_format("penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)", (double)params.…
+        string_format("penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)", (double)params.sampling.penalty_repeat),
         [](common_params & params, const std::string & value) {
-            params.…
+            params.sampling.penalty_repeat = std::stof(value);
         }
     ).set_sparam());
     add_opt(common_arg(
         {"--presence-penalty"}, "N",
-        string_format("repeat alpha presence penalty (default: %.1f, 0.0 = disabled)", (double)params.…
+        string_format("repeat alpha presence penalty (default: %.1f, 0.0 = disabled)", (double)params.sampling.penalty_present),
         [](common_params & params, const std::string & value) {
-            params.…
+            params.sampling.penalty_present = std::stof(value);
         }
     ).set_sparam());
     add_opt(common_arg(
         {"--frequency-penalty"}, "N",
-        string_format("repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)", (double)params.…
+        string_format("repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)", (double)params.sampling.penalty_freq),
         [](common_params & params, const std::string & value) {
-            params.…
+            params.sampling.penalty_freq = std::stof(value);
         }
     ).set_sparam());
     add_opt(common_arg(
         {"--dry-multiplier"}, "N",
-        string_format("set DRY sampling multiplier (default: %.1f, 0.0 = disabled)", (double)params.…
+        string_format("set DRY sampling multiplier (default: %.1f, 0.0 = disabled)", (double)params.sampling.dry_multiplier),
         [](common_params & params, const std::string & value) {
-            params.…
+            params.sampling.dry_multiplier = std::stof(value);
         }
     ).set_sparam());
     add_opt(common_arg(
         {"--dry-base"}, "N",
-        string_format("set DRY sampling base value (default: %.2f)", (double)params.…
+        string_format("set DRY sampling base value (default: %.2f)", (double)params.sampling.dry_base),
         [](common_params & params, const std::string & value) {
             float potential_base = std::stof(value);
             if (potential_base >= 1.0f)
             {
-                params.…
+                params.sampling.dry_base = potential_base;
             }
         }
     ).set_sparam());
     add_opt(common_arg(
         {"--dry-allowed-length"}, "N",
-        string_format("set allowed length for DRY sampling (default: %d)", params.…
+        string_format("set allowed length for DRY sampling (default: %d)", params.sampling.dry_allowed_length),
         [](common_params & params, int value) {
-            params.…
+            params.sampling.dry_allowed_length = value;
         }
     ).set_sparam());
     add_opt(common_arg(
         {"--dry-penalty-last-n"}, "N",
-        string_format("set DRY penalty for the last n tokens (default: %d, 0 = disable, -1 = context size)", params.…
+        string_format("set DRY penalty for the last n tokens (default: %d, 0 = disable, -1 = context size)", params.sampling.dry_penalty_last_n),
         [](common_params & params, int value) {
-            …
+            if (value < -1) {
+                throw std::runtime_error(string_format("error: invalid dry-penalty-last-n = %d\n", value));
+            }
+            params.sampling.dry_penalty_last_n = value;
         }
     ).set_sparam());
     add_opt(common_arg(
         {"--dry-sequence-breaker"}, "STRING",
         string_format("add sequence breaker for DRY sampling, clearing out default breakers (%s) in the process; use \"none\" to not use any sequence breakers\n",
-            params.…
-            std::accumulate(std::next(params.…
-                params.…
-                std::string("'") + (params.…
+            params.sampling.dry_sequence_breakers.empty() ? "none" :
+            std::accumulate(std::next(params.sampling.dry_sequence_breakers.begin()),
+                params.sampling.dry_sequence_breakers.end(),
+                std::string("'") + (params.sampling.dry_sequence_breakers[0] == "\n" ? "\\n" : params.sampling.dry_sequence_breakers[0]) + "'",
                 [](const std::string& a, const std::string& b) {
                     std::string formatted_b = (b == "\n") ? "\\n" : b;
                     return a + ", '" + formatted_b + "'";
```
```diff
@@ -1040,51 +993,51 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             static bool defaults_cleared = false;
 
             if (!defaults_cleared) {
-                params.…
+                params.sampling.dry_sequence_breakers.clear();
                 defaults_cleared = true;
             }
 
             if (value == "none") {
-                params.…
+                params.sampling.dry_sequence_breakers.clear();
             } else {
-                params.…
+                params.sampling.dry_sequence_breakers.emplace_back(value);
             }
         }
     ).set_sparam());
     add_opt(common_arg(
         {"--dynatemp-range"}, "N",
-        string_format("dynamic temperature range (default: %.1f, 0.0 = disabled)", (double)params.…
+        string_format("dynamic temperature range (default: %.1f, 0.0 = disabled)", (double)params.sampling.dynatemp_range),
         [](common_params & params, const std::string & value) {
-            params.…
+            params.sampling.dynatemp_range = std::stof(value);
         }
     ).set_sparam());
     add_opt(common_arg(
         {"--dynatemp-exp"}, "N",
-        string_format("dynamic temperature exponent (default: %.1f)", (double)params.…
+        string_format("dynamic temperature exponent (default: %.1f)", (double)params.sampling.dynatemp_exponent),
         [](common_params & params, const std::string & value) {
-            params.…
+            params.sampling.dynatemp_exponent = std::stof(value);
         }
     ).set_sparam());
     add_opt(common_arg(
         {"--mirostat"}, "N",
         string_format("use Mirostat sampling.\nTop K, Nucleus and Locally Typical samplers are ignored if used.\n"
-            "(default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)", params.…
+            "(default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)", params.sampling.mirostat),
         [](common_params & params, int value) {
-            params.…
+            params.sampling.mirostat = value;
         }
     ).set_sparam());
     add_opt(common_arg(
         {"--mirostat-lr"}, "N",
-        string_format("Mirostat learning rate, parameter eta (default: %.1f)", (double)params.…
+        string_format("Mirostat learning rate, parameter eta (default: %.1f)", (double)params.sampling.mirostat_eta),
         [](common_params & params, const std::string & value) {
-            params.…
+            params.sampling.mirostat_eta = std::stof(value);
         }
     ).set_sparam());
     add_opt(common_arg(
         {"--mirostat-ent"}, "N",
-        string_format("Mirostat target entropy, parameter tau (default: %.1f)", (double)params.…
+        string_format("Mirostat target entropy, parameter tau (default: %.1f)", (double)params.sampling.mirostat_tau),
         [](common_params & params, const std::string & value) {
-            params.…
+            params.sampling.mirostat_tau = std::stof(value);
         }
     ).set_sparam());
     add_opt(common_arg(
```
```diff
@@ -1100,7 +1053,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             try {
                 if (ss >> key && ss >> sign && std::getline(ss, value_str) && (sign == '+' || sign == '-')) {
                     const float bias = std::stof(value_str) * ((sign == '-') ? -1.0f : 1.0f);
-                    params.…
+                    params.sampling.logit_bias.push_back({key, bias});
                 } else {
                     throw std::invalid_argument("invalid input format");
                 }
```
```diff
@@ -1111,9 +1064,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_sparam());
     add_opt(common_arg(
         {"--grammar"}, "GRAMMAR",
-        string_format("BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '%s')", params.…
+        string_format("BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '%s')", params.sampling.grammar.c_str()),
         [](common_params & params, const std::string & value) {
-            params.…
+            params.sampling.grammar = value;
         }
     ).set_sparam());
     add_opt(common_arg(
```
```diff
@@ -1127,7 +1080,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             std::copy(
                 std::istreambuf_iterator<char>(file),
                 std::istreambuf_iterator<char>(),
-                std::back_inserter(params.…
+                std::back_inserter(params.sampling.grammar)
             );
         }
     ).set_sparam());
```
```diff
@@ -1135,7 +1088,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         {"-j", "--json-schema"}, "SCHEMA",
         "JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object\nFor schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead",
         [](common_params & params, const std::string & value) {
-            params.…
+            params.sampling.grammar = json_schema_to_grammar(json::parse(value));
         }
     ).set_sparam());
     add_opt(common_arg(
```
```diff
@@ -1255,18 +1208,28 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_env("LLAMA_ARG_NO_KV_OFFLOAD"));
     add_opt(common_arg(
         {"-ctk", "--cache-type-k"}, "TYPE",
-        string_format(…
+        string_format(
+            "KV cache data type for K\n"
+            "allowed values: %s\n"
+            "(default: %s)",
+            get_all_kv_cache_types().c_str(),
+            ggml_type_name(params.cache_type_k)
+        ),
         [](common_params & params, const std::string & value) {
-            …
-            params.cache_type_k = value;
+            params.cache_type_k = kv_cache_type_from_str(value);
         }
     ).set_env("LLAMA_ARG_CACHE_TYPE_K"));
     add_opt(common_arg(
         {"-ctv", "--cache-type-v"}, "TYPE",
-        string_format(…
+        string_format(
+            "KV cache data type for V\n"
+            "allowed values: %s\n"
+            "(default: %s)",
+            get_all_kv_cache_types().c_str(),
+            ggml_type_name(params.cache_type_v)
+        ),
         [](common_params & params, const std::string & value) {
-            …
-            params.cache_type_v = value;
+            params.cache_type_v = kv_cache_type_from_str(value);
         }
     ).set_env("LLAMA_ARG_CACHE_TYPE_V"));
     add_opt(common_arg(
```
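
With `kv_cache_type_from_str` wired in, `-ctk`/`-ctv` now reject unknown values at parse time instead of storing a raw string in `params.cache_type_k`/`cache_type_v`, and the help text enumerates the accepted set via `get_all_kv_cache_types()` — the `kv_cache_types` vector from the first hunk (f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1, as named by `ggml_type_name`).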
```diff
@@ -1433,28 +1396,42 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             else { throw std::invalid_argument("invalid value"); }
         }
     ).set_env("LLAMA_ARG_NUMA"));
+    add_opt(common_arg(
+        {"-dev", "--device"}, "<dev1,dev2,..>",
+        "comma-separated list of devices to use for offloading (none = don't offload)\n"
+        "use --list-devices to see a list of available devices",
+        [](common_params & params, const std::string & value) {
+            params.devices = parse_device_list(value);
+        }
+    ).set_env("LLAMA_ARG_DEVICE"));
+    add_opt(common_arg(
+        {"--list-devices"},
+        "print list of available devices and exit",
+        [](common_params &) {
+            printf("Available devices:\n");
+            for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
+                auto * dev = ggml_backend_dev_get(i);
+                if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_GPU) {
+                    size_t free, total;
+                    ggml_backend_dev_memory(dev, &free, &total);
+                    printf("  %s: %s (%zu MiB, %zu MiB free)\n", ggml_backend_dev_name(dev), ggml_backend_dev_description(dev), total / 1024 / 1024, free / 1024 / 1024);
+                }
+            }
+            exit(0);
+        }
+    ));
     add_opt(common_arg(
         {"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N",
         "number of layers to store in VRAM",
         [](common_params & params, int value) {
             params.n_gpu_layers = value;
             if (!llama_supports_gpu_offload()) {
-                fprintf(stderr, "warning: …
-                fprintf(stderr, "warning: …
+                fprintf(stderr, "warning: no usable GPU found, --gpu-layers option will be ignored\n");
+                fprintf(stderr, "warning: one possible reason is that llama.cpp was compiled without GPU support\n");
+                fprintf(stderr, "warning: consult docs/build.md for compilation instructions\n");
             }
         }
     ).set_env("LLAMA_ARG_N_GPU_LAYERS"));
-    add_opt(common_arg(
-        {"-ngld", "--gpu-layers-draft", "--n-gpu-layers-draft"}, "N",
-        "number of layers to store in VRAM for the draft model",
-        [](common_params & params, int value) {
-            params.n_gpu_layers_draft = value;
-            if (!llama_supports_gpu_offload()) {
-                fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers-draft option will be ignored\n");
-                fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
-            }
-        }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
     add_opt(common_arg(
         {"-sm", "--split-mode"}, "{none,layer,row}",
         "how to split the model across multiple GPUs, one of:\n"
```
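
`--device` feeds `parse_device_list` from the earlier hunk, so `--device none` explicitly disables offloading and invalid names fail at argument-parse time; `--list-devices` prints only GPU-type backend devices, with total and free memory, then exits. The draft-model counterpart `-ngld`/`--gpu-layers-draft` is removed in the same pass.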
```diff
@@ -1468,10 +1445,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             } else if (arg_next == "layer") {
                 params.split_mode = LLAMA_SPLIT_MODE_LAYER;
             } else if (arg_next == "row") {
-#ifdef GGML_USE_SYCL
-                fprintf(stderr, "warning: The split mode value:[row] is not supported by llama.cpp with SYCL. It's developing.\nExit!\n");
-                exit(1);
-#endif // GGML_USE_SYCL
                 params.split_mode = LLAMA_SPLIT_MODE_ROW;
             } else {
                 throw std::invalid_argument("invalid value");
```
```diff
@@ -1593,13 +1566,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.model = value;
         }
     ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}).set_env("LLAMA_ARG_MODEL"));
-    add_opt(common_arg(
-        {"-md", "--model-draft"}, "FNAME",
-        "draft model for speculative decoding (default: unused)",
-        [](common_params & params, const std::string & value) {
-            params.model_draft = value;
-        }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
     add_opt(common_arg(
         {"-mu", "--model-url"}, "MODEL_URL",
         "model download url (default: unused)",
```
```diff
@@ -1621,6 +1587,20 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.hf_file = value;
         }
     ).set_env("LLAMA_ARG_HF_FILE"));
+    add_opt(common_arg(
+        {"-hfrv", "--hf-repo-v"}, "REPO",
+        "Hugging Face model repository for the vocoder model (default: unused)",
+        [](common_params & params, const std::string & value) {
+            params.vocoder.hf_repo = value;
+        }
+    ).set_env("LLAMA_ARG_HF_REPO_V"));
+    add_opt(common_arg(
+        {"-hffv", "--hf-file-v"}, "FILE",
+        "Hugging Face model file for the vocoder model (default: unused)",
+        [](common_params & params, const std::string & value) {
+            params.vocoder.hf_file = value;
+        }
+    ).set_env("LLAMA_ARG_HF_FILE_V"));
     add_opt(common_arg(
         {"-hft", "--hf-token"}, "TOKEN",
         "Hugging Face access token (default: value from HF_TOKEN environment variable)",
```
```diff
@@ -1789,6 +1769,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.public_path = value;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_STATIC_PATH"));
+    add_opt(common_arg(
+        {"--no-webui"},
+        string_format("Disable the Web UI (default: %s)", params.webui ? "enabled" : "disabled"),
+        [](common_params & params) {
+            params.webui = false;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_WEBUI"));
     add_opt(common_arg(
         {"--embedding", "--embeddings"},
         string_format("restrict to only support embedding use case; use only with dedicated embedding models (default: %s)", params.embedding ? "enabled" : "disabled"),
```
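
The next hunk's `--chat-template` help text is built by `list_builtin_chat_templates` (added in an earlier hunk), which uses the size-then-fill calling convention of `llama_chat_builtin_templates` — the same two-call sequence the diff itself shows. A minimal sketch of that pattern, assuming only llama.h; `builtin_templates` is a hypothetical wrapper name:

```cpp
// Sketch: querying the built-in chat template names from libllama.
#include <vector>
#include "llama.h"

std::vector<const char *> builtin_templates() {
    int32_t n = llama_chat_builtin_templates(nullptr, 0);      // 1st call: count only
    std::vector<const char *> names(n);
    llama_chat_builtin_templates(names.data(), names.size());  // 2nd call: fill names
    return names;
}
```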
```diff
@@ -1904,9 +1891,11 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_examples({LLAMA_EXAMPLE_SERVER}));
     add_opt(common_arg(
         {"--chat-template"}, "JINJA_TEMPLATE",
-        …
-        …
-        …
+        string_format(
+            "set custom jinja chat template (default: template taken from model's metadata)\n"
+            "if suffix/prefix are specified, template will be disabled\n"
+            "list of built-in templates:\n%s", list_builtin_chat_templates().c_str()
+        ),
         [](common_params & params, const std::string & value) {
             if (!common_chat_verify_template(value)) {
                 throw std::runtime_error(string_format(
```
```diff
@@ -2037,5 +2026,197 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_env("LLAMA_LOG_TIMESTAMPS"));
 
+    // speculative parameters
+    add_opt(common_arg(
+        {"-td", "--threads-draft"}, "N",
+        "number of threads to use during generation (default: same as --threads)",
+        [](common_params & params, int value) {
+            params.speculative.cpuparams.n_threads = value;
+            if (params.speculative.cpuparams.n_threads <= 0) {
+                params.speculative.cpuparams.n_threads = std::thread::hardware_concurrency();
+            }
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+    add_opt(common_arg(
+        {"-tbd", "--threads-batch-draft"}, "N",
+        "number of threads to use during batch and prompt processing (default: same as --threads-draft)",
+        [](common_params & params, int value) {
+            params.speculative.cpuparams_batch.n_threads = value;
+            if (params.speculative.cpuparams_batch.n_threads <= 0) {
+                params.speculative.cpuparams_batch.n_threads = std::thread::hardware_concurrency();
+            }
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+    add_opt(common_arg(
+        {"-Cd", "--cpu-mask-draft"}, "M",
+        "Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask)",
+        [](common_params & params, const std::string & mask) {
+            params.speculative.cpuparams.mask_valid = true;
+            if (!parse_cpu_mask(mask, params.speculative.cpuparams.cpumask)) {
+                throw std::invalid_argument("invalid cpumask");
+            }
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+    add_opt(common_arg(
+        {"-Crd", "--cpu-range-draft"}, "lo-hi",
+        "Ranges of CPUs for affinity. Complements --cpu-mask-draft",
+        [](common_params & params, const std::string & range) {
+            params.speculative.cpuparams.mask_valid = true;
+            if (!parse_cpu_range(range, params.speculative.cpuparams.cpumask)) {
+                throw std::invalid_argument("invalid range");
+            }
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+    add_opt(common_arg(
+        {"--cpu-strict-draft"}, "<0|1>",
+        "Use strict CPU placement for draft model (default: same as --cpu-strict)",
+        [](common_params & params, int value) {
+            params.speculative.cpuparams.strict_cpu = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+    add_opt(common_arg(
+        {"--prio-draft"}, "N",
+        string_format("set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.speculative.cpuparams.priority),
+        [](common_params & params, int prio) {
+            if (prio < 0 || prio > 3) {
+                throw std::invalid_argument("invalid value");
+            }
+            params.speculative.cpuparams.priority = (enum ggml_sched_priority) prio;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+    add_opt(common_arg(
+        {"--poll-draft"}, "<0|1>",
+        "Use polling to wait for draft model work (default: same as --poll])",
+        [](common_params & params, int value) {
+            params.speculative.cpuparams.poll = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+    add_opt(common_arg(
+        {"-Cbd", "--cpu-mask-batch-draft"}, "M",
+        "Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask)",
+        [](common_params & params, const std::string & mask) {
+            params.speculative.cpuparams_batch.mask_valid = true;
+            if (!parse_cpu_mask(mask, params.speculative.cpuparams_batch.cpumask)) {
+                throw std::invalid_argument("invalid cpumask");
+            }
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+    add_opt(common_arg(
+        {"-Crbd", "--cpu-range-batch-draft"}, "lo-hi",
+        "Ranges of CPUs for affinity. Complements --cpu-mask-draft-batch)",
+        [](common_params & params, const std::string & range) {
+            params.speculative.cpuparams_batch.mask_valid = true;
+            if (!parse_cpu_range(range, params.speculative.cpuparams_batch.cpumask)) {
+                throw std::invalid_argument("invalid cpumask");
+            }
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+    add_opt(common_arg(
+        {"--cpu-strict-batch-draft"}, "<0|1>",
+        "Use strict CPU placement for draft model (default: --cpu-strict-draft)",
+        [](common_params & params, int value) {
+            params.speculative.cpuparams_batch.strict_cpu = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+    add_opt(common_arg(
+        {"--prio-batch-draft"}, "N",
+        string_format("set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.speculative.cpuparams_batch.priority),
+        [](common_params & params, int prio) {
+            if (prio < 0 || prio > 3) {
```
|
2126
|
+
throw std::invalid_argument("invalid value");
|
|
2127
|
+
}
|
|
2128
|
+
params.speculative.cpuparams_batch.priority = (enum ggml_sched_priority) prio;
|
|
2129
|
+
}
|
|
2130
|
+
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
|
2131
|
+
add_opt(common_arg(
|
|
2132
|
+
{"--poll-batch-draft"}, "<0|1>",
|
|
2133
|
+
"Use polling to wait for draft model work (default: --poll-draft)",
|
|
2134
|
+
[](common_params & params, int value) {
|
|
2135
|
+
params.speculative.cpuparams_batch.poll = value;
|
|
2136
|
+
}
|
|
2137
|
+
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
|
2138
|
+
add_opt(common_arg(
|
|
2139
|
+
{"--draft-max", "--draft", "--draft-n"}, "N",
|
|
2140
|
+
string_format("number of tokens to draft for speculative decoding (default: %d)", params.speculative.n_max),
|
|
2141
|
+
[](common_params & params, int value) {
|
|
2142
|
+
params.speculative.n_max = value;
|
|
2143
|
+
}
|
|
2144
|
+
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_MAX"));
|
|
2145
|
+
add_opt(common_arg(
|
|
2146
|
+
{"--draft-min", "--draft-n-min"}, "N",
|
|
2147
|
+
string_format("minimum number of draft tokens to use for speculative decoding (default: %d)", params.speculative.n_min),
|
|
2148
|
+
[](common_params & params, int value) {
|
|
2149
|
+
params.speculative.n_min = value;
|
|
2150
|
+
}
|
|
2151
|
+
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_MIN"));
|
|
2152
|
+
add_opt(common_arg(
|
|
2153
|
+
{"--draft-p-split"}, "P",
|
|
2154
|
+
string_format("speculative decoding split probability (default: %.1f)", (double)params.speculative.p_split),
|
|
2155
|
+
[](common_params & params, const std::string & value) {
|
|
2156
|
+
params.speculative.p_split = std::stof(value);
|
|
2157
|
+
}
|
|
2158
|
+
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}).set_env("LLAMA_ARG_DRAFT_P_SPLIT"));
|
|
2159
|
+
add_opt(common_arg(
|
|
2160
|
+
{"--draft-p-min"}, "P",
|
|
2161
|
+
string_format("minimum speculative decoding probability (greedy) (default: %.1f)", (double)params.speculative.p_min),
|
|
2162
|
+
[](common_params & params, const std::string & value) {
|
|
2163
|
+
params.speculative.p_min = std::stof(value);
|
|
2164
|
+
}
|
|
2165
|
+
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_P_MIN"));
|
|
2166
|
+
add_opt(common_arg(
|
|
2167
|
+
{"-cd", "--ctx-size-draft"}, "N",
|
|
2168
|
+
string_format("size of the prompt context for the draft model (default: %d, 0 = loaded from model)", params.speculative.n_ctx),
|
|
2169
|
+
[](common_params & params, int value) {
|
|
2170
|
+
params.speculative.n_ctx = value;
|
|
2171
|
+
}
|
|
2172
|
+
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CTX_SIZE_DRAFT"));
|
|
2173
|
+
add_opt(common_arg(
|
|
2174
|
+
{"-devd", "--device-draft"}, "<dev1,dev2,..>",
|
|
2175
|
+
"comma-separated list of devices to use for offloading the draft model (none = don't offload)\n"
|
|
2176
|
+
"use --list-devices to see a list of available devices",
|
|
2177
|
+
[](common_params & params, const std::string & value) {
|
|
2178
|
+
params.speculative.devices = parse_device_list(value);
|
|
2179
|
+
}
|
|
2180
|
+
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
|
|
2181
|
+
add_opt(common_arg(
|
|
2182
|
+
{"-ngld", "--gpu-layers-draft", "--n-gpu-layers-draft"}, "N",
|
|
2183
|
+
"number of layers to store in VRAM for the draft model",
|
|
2184
|
+
[](common_params & params, int value) {
|
|
2185
|
+
params.speculative.n_gpu_layers = value;
|
|
2186
|
+
if (!llama_supports_gpu_offload()) {
|
|
2187
|
+
fprintf(stderr, "warning: no usable GPU found, --gpu-layers-draft option will be ignored\n");
|
|
2188
|
+
fprintf(stderr, "warning: one possible reason is that llama.cpp was compiled without GPU support\n");
|
|
2189
|
+
fprintf(stderr, "warning: consult docs/build.md for compilation instructions\n");
|
|
2190
|
+
}
|
|
2191
|
+
}
|
|
2192
|
+
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_N_GPU_LAYERS_DRAFT"));
|
|
2193
|
+
add_opt(common_arg(
|
|
2194
|
+
{"-md", "--model-draft"}, "FNAME",
|
|
2195
|
+
"draft model for speculative decoding (default: unused)",
|
|
2196
|
+
[](common_params & params, const std::string & value) {
|
|
2197
|
+
params.speculative.model = value;
|
|
2198
|
+
}
|
|
2199
|
+
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODEL_DRAFT"));
|
|
2200
|
+
|
|
2201
|
+
add_opt(common_arg(
|
|
2202
|
+
{"-mv", "--model-vocoder"}, "FNAME",
|
|
2203
|
+
"vocoder model for audio generation (default: unused)",
|
|
2204
|
+
[](common_params & params, const std::string & value) {
|
|
2205
|
+
params.vocoder.model = value;
|
|
2206
|
+
}
|
|
2207
|
+
).set_examples({LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_SERVER}));
|
|
2208
|
+
|
|
2209
|
+
// model-specific
|
|
2210
|
+
add_opt(common_arg(
|
|
2211
|
+
{"--tts-oute-default"},
|
|
2212
|
+
string_format("use default OuteTTS models (note: can download weights from the internet)"),
|
|
2213
|
+
[](common_params & params) {
|
|
2214
|
+
params.hf_repo = "OuteAI/OuteTTS-0.2-500M-GGUF";
|
|
2215
|
+
params.hf_file = "OuteTTS-0.2-500M-Q8_0.gguf";
|
|
2216
|
+
params.vocoder.hf_repo = "ggml-org/WavTokenizer";
|
|
2217
|
+
params.vocoder.hf_file = "WavTokenizer-Large-75-F16.gguf";
|
|
2218
|
+
}
|
|
2219
|
+
).set_examples({LLAMA_EXAMPLE_TTS}));
|
|
2220
|
+
|
|
2040
2221
|
return ctx_arg;
|
|
2041
2222
|
}
|
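The `-Crd`/`-Crbd` options take a "lo-hi" CPU range that parse_cpu_range() (in common/common.cpp) expands into the boolean affinity mask stored in cpuparams.cpumask. The sketch below illustrates what such a range means, assuming the mask is a fixed-size bool array; it is an illustration, not the actual implementation:

    #include <string>

    constexpr int MAX_CPUS = 512; // stand-in for the real mask size

    // Mark CPUs lo..hi in the mask for a range string such as "0-7".
    // Illustrative only; the real parser is parse_cpu_range() in common/common.cpp.
    static bool parse_range_sketch(const std::string & range, bool (&mask)[MAX_CPUS]) {
        const size_t dash = range.find('-');
        if (dash == std::string::npos) {
            return false; // expected the form "lo-hi"
        }
        const int lo = std::stoi(range.substr(0, dash));  // std::stoi may throw on
        const int hi = std::stoi(range.substr(dash + 1)); // non-numeric input
        if (lo < 0 || hi < lo || hi >= MAX_CPUS) {
            return false;
        }
        for (int i = lo; i <= hi; ++i) {
            mask[i] = true;
        }
        return true;
    }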
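The draft-size options land in params.speculative (declared in common/common.h, which this diff also extends). A typical consumer clamps each speculative round to n_max tokens and skips drafting when fewer than n_min would fit; the sketch below illustrates that policy with an invented struct and helper, where only the field names echo this diff:

    #include <algorithm>

    // Invented stand-in for params.speculative; field names echo the options above.
    struct spec_opts {
        int   n_min = 0;    // --draft-min: minimum number of draft tokens
        int   n_max = 0;    // --draft-max: number of tokens to draft
        float p_min = 0.0f; // --draft-p-min: minimum (greedy) acceptance probability
    };

    // Choose a draft size for one speculative round, or 0 to skip drafting.
    static int pick_n_draft(const spec_opts & opts, int n_budget) {
        const int n = std::min(opts.n_max, n_budget); // never draft past the remaining budget
        return n < opts.n_min ? 0 : n;                // below n_min, drafting isn't worth the overhead
    }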