@fugood/llama.node 0.3.0 → 0.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +1 -10
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/package.json +6 -4
- package/src/LlamaCompletionWorker.cpp +6 -6
- package/src/LlamaContext.cpp +7 -9
- package/src/common.hpp +2 -1
- package/src/llama.cpp/.github/workflows/build.yml +98 -24
- package/src/llama.cpp/.github/workflows/close-issue.yml +5 -0
- package/src/llama.cpp/.github/workflows/docker.yml +43 -34
- package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +7 -0
- package/src/llama.cpp/.github/workflows/nix-ci.yml +7 -0
- package/src/llama.cpp/.github/workflows/python-check-requirements.yml +2 -4
- package/src/llama.cpp/.github/workflows/python-type-check.yml +3 -1
- package/src/llama.cpp/.github/workflows/server.yml +7 -0
- package/src/llama.cpp/CMakeLists.txt +20 -8
- package/src/llama.cpp/common/CMakeLists.txt +12 -10
- package/src/llama.cpp/common/arg.cpp +2006 -0
- package/src/llama.cpp/common/arg.h +77 -0
- package/src/llama.cpp/common/common.cpp +496 -1632
- package/src/llama.cpp/common/common.h +161 -63
- package/src/llama.cpp/common/console.cpp +3 -0
- package/src/llama.cpp/common/log.cpp +401 -0
- package/src/llama.cpp/common/log.h +66 -698
- package/src/llama.cpp/common/ngram-cache.cpp +3 -0
- package/src/llama.cpp/common/sampling.cpp +348 -350
- package/src/llama.cpp/common/sampling.h +62 -139
- package/src/llama.cpp/common/stb_image.h +5990 -6398
- package/src/llama.cpp/common/train.cpp +2 -0
- package/src/llama.cpp/docs/build.md +36 -1
- package/src/llama.cpp/examples/CMakeLists.txt +0 -1
- package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +1 -2
- package/src/llama.cpp/examples/batched/batched.cpp +39 -55
- package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +34 -44
- package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +55 -52
- package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +15 -15
- package/src/llama.cpp/examples/cvector-generator/pca.hpp +3 -13
- package/src/llama.cpp/examples/embedding/embedding.cpp +143 -87
- package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +33 -33
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +36 -35
- package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +14 -39
- package/src/llama.cpp/examples/gen-docs/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +83 -0
- package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +58 -39
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +34 -27
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +59 -62
- package/src/llama.cpp/examples/infill/infill.cpp +117 -132
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +265 -58
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +29 -22
- package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
- package/src/llama.cpp/examples/llava/clip.cpp +685 -150
- package/src/llama.cpp/examples/llava/clip.h +11 -2
- package/src/llama.cpp/examples/llava/llava-cli.cpp +47 -58
- package/src/llama.cpp/examples/llava/llava.cpp +110 -24
- package/src/llama.cpp/examples/llava/llava.h +2 -3
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +323 -0
- package/src/llama.cpp/examples/llava/requirements.txt +1 -0
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +42 -43
- package/src/llama.cpp/examples/lookup/lookup-create.cpp +10 -8
- package/src/llama.cpp/examples/lookup/lookup-stats.cpp +23 -22
- package/src/llama.cpp/examples/lookup/lookup.cpp +40 -43
- package/src/llama.cpp/examples/main/main.cpp +210 -262
- package/src/llama.cpp/examples/parallel/parallel.cpp +49 -49
- package/src/llama.cpp/examples/passkey/passkey.cpp +42 -50
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +187 -200
- package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/quantize/quantize.cpp +27 -9
- package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +2 -3
- package/src/llama.cpp/examples/retrieval/retrieval.cpp +49 -44
- package/src/llama.cpp/examples/rpc/rpc-server.cpp +24 -1
- package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +32 -35
- package/src/llama.cpp/examples/server/CMakeLists.txt +3 -5
- package/src/llama.cpp/examples/server/server.cpp +1027 -1073
- package/src/llama.cpp/examples/server/tests/requirements.txt +2 -1
- package/src/llama.cpp/examples/server/utils.hpp +107 -105
- package/src/llama.cpp/examples/simple/simple.cpp +35 -41
- package/src/llama.cpp/examples/speculative/speculative.cpp +129 -103
- package/src/llama.cpp/examples/sycl/run-llama2.sh +10 -19
- package/src/llama.cpp/examples/sycl/win-run-llama2.bat +1 -1
- package/src/llama.cpp/examples/tokenize/tokenize.cpp +25 -27
- package/src/llama.cpp/ggml/CMakeLists.txt +14 -3
- package/src/llama.cpp/ggml/include/ggml-alloc.h +3 -3
- package/src/llama.cpp/ggml/include/ggml-backend.h +145 -60
- package/src/llama.cpp/ggml/include/ggml-blas.h +3 -3
- package/src/llama.cpp/ggml/include/ggml-cann.h +15 -19
- package/src/llama.cpp/ggml/include/ggml-cuda.h +16 -16
- package/src/llama.cpp/ggml/include/ggml-metal.h +5 -8
- package/src/llama.cpp/ggml/include/ggml-rpc.h +5 -5
- package/src/llama.cpp/ggml/include/ggml-sycl.h +8 -8
- package/src/llama.cpp/ggml/include/ggml-vulkan.h +7 -7
- package/src/llama.cpp/ggml/include/ggml.h +293 -186
- package/src/llama.cpp/ggml/src/CMakeLists.txt +86 -44
- package/src/llama.cpp/ggml/src/ggml-aarch64.c +2135 -1119
- package/src/llama.cpp/ggml/src/ggml-alloc.c +6 -0
- package/src/llama.cpp/ggml/src/ggml-backend-impl.h +152 -70
- package/src/llama.cpp/ggml/src/{ggml-backend.c → ggml-backend.cpp} +606 -286
- package/src/llama.cpp/ggml/src/ggml-blas.cpp +9 -10
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +4 -27
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +32 -4
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +179 -41
- package/src/llama.cpp/ggml/src/ggml-cann/common.h +1 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -1
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +2 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +278 -0
- package/src/llama.cpp/ggml/src/ggml-cann.cpp +215 -216
- package/src/llama.cpp/ggml/src/ggml-common.h +20 -0
- package/src/llama.cpp/ggml/src/ggml-cpu-impl.h +614 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +14 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +178 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +134 -0
- package/src/llama.cpp/ggml/src/ggml-impl.h +49 -603
- package/src/llama.cpp/ggml/src/ggml-kompute.cpp +4 -24
- package/src/llama.cpp/ggml/src/ggml-quants.c +972 -92
- package/src/llama.cpp/ggml/src/ggml-quants.h +15 -0
- package/src/llama.cpp/ggml/src/ggml-rpc.cpp +116 -66
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +3 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +11 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +52 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/conv.cpp +99 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/conv.hpp +21 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +57 -57
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +106 -106
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +4 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +16 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +101 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +125 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +23 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +6 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +2 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +71 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.hpp +21 -0
- package/src/llama.cpp/ggml/src/ggml-sycl.cpp +97 -169
- package/src/llama.cpp/ggml/src/ggml-vulkan.cpp +1508 -1124
- package/src/llama.cpp/ggml/src/ggml.c +3001 -1647
- package/src/llama.cpp/ggml/src/llamafile/sgemm.cpp +192 -0
- package/src/llama.cpp/ggml/src/vulkan-shaders/CMakeLists.txt +2 -0
- package/src/llama.cpp/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp +88 -40
- package/src/llama.cpp/include/llama.h +241 -264
- package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.out +46 -0
- package/src/llama.cpp/requirements/requirements-convert_legacy_llama.txt +1 -1
- package/src/llama.cpp/src/llama-grammar.cpp +721 -122
- package/src/llama.cpp/src/llama-grammar.h +120 -15
- package/src/llama.cpp/src/llama-impl.h +156 -1
- package/src/llama.cpp/src/llama-sampling.cpp +1375 -303
- package/src/llama.cpp/src/llama-sampling.h +20 -47
- package/src/llama.cpp/src/llama-vocab.cpp +343 -120
- package/src/llama.cpp/src/llama-vocab.h +33 -17
- package/src/llama.cpp/src/llama.cpp +4247 -1525
- package/src/llama.cpp/src/unicode-data.cpp +6 -4
- package/src/llama.cpp/src/unicode-data.h +4 -4
- package/src/llama.cpp/src/unicode.cpp +15 -7
- package/src/llama.cpp/tests/CMakeLists.txt +3 -0
- package/src/llama.cpp/tests/test-arg-parser.cpp +131 -0
- package/src/llama.cpp/tests/test-backend-ops.cpp +1592 -289
- package/src/llama.cpp/tests/test-barrier.cpp +93 -0
- package/src/llama.cpp/tests/test-grad0.cpp +187 -70
- package/src/llama.cpp/tests/test-grammar-integration.cpp +23 -38
- package/src/llama.cpp/tests/test-grammar-parser.cpp +6 -4
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +6 -4
- package/src/llama.cpp/tests/test-llama-grammar.cpp +9 -8
- package/src/llama.cpp/tests/test-log.cpp +39 -0
- package/src/llama.cpp/tests/test-quantize-fns.cpp +6 -0
- package/src/llama.cpp/tests/test-rope.cpp +1 -1
- package/src/llama.cpp/tests/test-sampling.cpp +157 -98
- package/src/llama.cpp/tests/test-tokenizer-0.cpp +55 -35
- package/patches/llama.patch +0 -22
- package/src/llama.cpp/.github/workflows/bench.yml +0 -310
- package/src/llama.cpp/common/grammar-parser.cpp +0 -536
- package/src/llama.cpp/common/grammar-parser.h +0 -29
- package/src/llama.cpp/examples/benchmark/CMakeLists.txt +0 -6
- package/src/llama.cpp/examples/benchmark/benchmark-matmult.cpp +0 -275
|
@@ -3,6 +3,7 @@
|
|
|
3
3
|
#endif
|
|
4
4
|
|
|
5
5
|
#include "common.h"
|
|
6
|
+
#include "log.h"
|
|
6
7
|
// Change JSON_ASSERT from assert() to GGML_ASSERT:
|
|
7
8
|
#define JSON_ASSERT GGML_ASSERT
|
|
8
9
|
#include "json.hpp"
|
|
@@ -25,6 +26,7 @@
|
|
|
25
26
|
#include <unordered_map>
|
|
26
27
|
#include <unordered_set>
|
|
27
28
|
#include <vector>
|
|
29
|
+
#include <thread>
|
|
28
30
|
|
|
29
31
|
#if defined(__APPLE__) && defined(__MACH__)
|
|
30
32
|
#include <sys/types.h>
|
|
@@ -48,7 +50,6 @@
|
|
|
48
50
|
#if defined(LLAMA_USE_CURL)
|
|
49
51
|
#include <curl/curl.h>
|
|
50
52
|
#include <curl/easy.h>
|
|
51
|
-
#include <thread>
|
|
52
53
|
#include <future>
|
|
53
54
|
#endif
|
|
54
55
|
|
|
@@ -56,14 +57,6 @@
|
|
|
56
57
|
#pragma warning(disable: 4244 4267) // possible loss of data
|
|
57
58
|
#endif
|
|
58
59
|
|
|
59
|
-
#if (defined(GGML_USE_CUDA) || defined(GGML_USE_SYCL))
|
|
60
|
-
#define GGML_USE_CUDA_SYCL
|
|
61
|
-
#endif
|
|
62
|
-
|
|
63
|
-
#if (defined(GGML_USE_CUDA) || defined(GGML_USE_SYCL)) || defined(GGML_USE_VULKAN)
|
|
64
|
-
#define GGML_USE_CUDA_SYCL_VULKAN
|
|
65
|
-
#endif
|
|
66
|
-
|
|
67
60
|
#if defined(LLAMA_USE_CURL)
|
|
68
61
|
#ifdef __linux__
|
|
69
62
|
#include <linux/limits.h>
|
|
@@ -110,8 +103,34 @@ int32_t cpu_get_num_physical_cores() {
|
|
|
110
103
|
if (result == 0) {
|
|
111
104
|
return num_physical_cores;
|
|
112
105
|
}
|
|
113
|
-
#elif defined(_WIN32)
|
|
114
|
-
//TODO:
|
|
106
|
+
#elif defined(_WIN32) && (_WIN32_WINNT >= 0x0601) && !defined(__MINGW64__) // windows 7 and later
|
|
107
|
+
// TODO: windows + arm64 + mingw64
|
|
108
|
+
unsigned int n_threads_win = std::thread::hardware_concurrency();
|
|
109
|
+
unsigned int default_threads = n_threads_win > 0 ? (n_threads_win <= 4 ? n_threads_win : n_threads_win / 2) : 4;
|
|
110
|
+
|
|
111
|
+
DWORD buffer_size = 0;
|
|
112
|
+
if (!GetLogicalProcessorInformationEx(RelationProcessorCore, nullptr, &buffer_size)) {
|
|
113
|
+
if (GetLastError() != ERROR_INSUFFICIENT_BUFFER) {
|
|
114
|
+
return default_threads;
|
|
115
|
+
}
|
|
116
|
+
}
|
|
117
|
+
|
|
118
|
+
std::vector<char> buffer(buffer_size);
|
|
119
|
+
if (!GetLogicalProcessorInformationEx(RelationProcessorCore, reinterpret_cast<PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX>(buffer.data()), &buffer_size)) {
|
|
120
|
+
return default_threads;
|
|
121
|
+
}
|
|
122
|
+
|
|
123
|
+
int32_t num_physical_cores = 0;
|
|
124
|
+
PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX info = reinterpret_cast<PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX>(buffer.data());
|
|
125
|
+
while (buffer_size > 0) {
|
|
126
|
+
if (info->Relationship == RelationProcessorCore) {
|
|
127
|
+
num_physical_cores += info->Processor.GroupCount;
|
|
128
|
+
}
|
|
129
|
+
buffer_size -= info->Size;
|
|
130
|
+
info = reinterpret_cast<PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX>(reinterpret_cast<char*>(info) + info->Size);
|
|
131
|
+
}
|
|
132
|
+
|
|
133
|
+
return num_physical_cores > 0 ? num_physical_cores : default_threads;
|
|
115
134
|
#endif
|
|
116
135
|
unsigned int n_threads = std::thread::hardware_concurrency();
|
|
117
136
|
return n_threads > 0 ? (n_threads <= 4 ? n_threads : n_threads / 2) : 4;
|
|
@@ -156,1567 +175,223 @@ static int cpu_count_math_cpus(int n_cpu) {
|
|
|
156
175
|
if (pin_cpu(cpu)) {
|
|
157
176
|
return -1;
|
|
158
177
|
}
|
|
159
|
-
if (is_running_on_efficiency_core()) {
|
|
160
|
-
continue; // efficiency cores harm lockstep threading
|
|
161
|
-
}
|
|
162
|
-
++cpu; // hyperthreading isn't useful for linear algebra
|
|
163
|
-
++result;
|
|
164
|
-
}
|
|
165
|
-
return result;
|
|
166
|
-
}
|
|
167
|
-
|
|
168
|
-
#endif // __x86_64__ && __linux__
|
|
169
|
-
|
|
170
|
-
/**
|
|
171
|
-
* Returns number of CPUs on system that are useful for math.
|
|
172
|
-
*/
|
|
173
|
-
int32_t cpu_get_num_math() {
|
|
174
|
-
#if defined(__x86_64__) && defined(__linux__) && !defined(__ANDROID__)
|
|
175
|
-
int n_cpu = sysconf(_SC_NPROCESSORS_ONLN);
|
|
176
|
-
if (n_cpu < 1) {
|
|
177
|
-
return cpu_get_num_physical_cores();
|
|
178
|
-
}
|
|
179
|
-
if (is_hybrid_cpu()) {
|
|
180
|
-
cpu_set_t affinity;
|
|
181
|
-
if (!pthread_getaffinity_np(pthread_self(), sizeof(affinity), &affinity)) {
|
|
182
|
-
int result = cpu_count_math_cpus(n_cpu);
|
|
183
|
-
pthread_setaffinity_np(pthread_self(), sizeof(affinity), &affinity);
|
|
184
|
-
if (result > 0) {
|
|
185
|
-
return result;
|
|
186
|
-
}
|
|
187
|
-
}
|
|
188
|
-
}
|
|
189
|
-
#endif
|
|
190
|
-
return cpu_get_num_physical_cores();
|
|
191
|
-
}
|
|
192
|
-
|
|
193
|
-
//
|
|
194
|
-
// CLI argument parsing
|
|
195
|
-
//
|
|
196
|
-
|
|
197
|
-
void gpt_params_handle_hf_token(gpt_params & params) {
|
|
198
|
-
if (params.hf_token.empty() && std::getenv("HF_TOKEN")) {
|
|
199
|
-
params.hf_token = std::getenv("HF_TOKEN");
|
|
200
|
-
}
|
|
201
|
-
}
|
|
202
|
-
|
|
203
|
-
void gpt_params_handle_model_default(gpt_params & params) {
|
|
204
|
-
if (!params.hf_repo.empty()) {
|
|
205
|
-
// short-hand to avoid specifying --hf-file -> default it to --model
|
|
206
|
-
if (params.hf_file.empty()) {
|
|
207
|
-
if (params.model.empty()) {
|
|
208
|
-
throw std::invalid_argument("error: --hf-repo requires either --hf-file or --model\n");
|
|
209
|
-
}
|
|
210
|
-
params.hf_file = params.model;
|
|
211
|
-
} else if (params.model.empty()) {
|
|
212
|
-
params.model = fs_get_cache_file(string_split(params.hf_file, '/').back());
|
|
213
|
-
}
|
|
214
|
-
} else if (!params.model_url.empty()) {
|
|
215
|
-
if (params.model.empty()) {
|
|
216
|
-
auto f = string_split(params.model_url, '#').front();
|
|
217
|
-
f = string_split(f, '?').front();
|
|
218
|
-
params.model = fs_get_cache_file(string_split(f, '/').back());
|
|
219
|
-
}
|
|
220
|
-
} else if (params.model.empty()) {
|
|
221
|
-
params.model = DEFAULT_MODEL_PATH;
|
|
222
|
-
}
|
|
223
|
-
}
|
|
224
|
-
|
|
225
|
-
bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
|
|
226
|
-
bool invalid_param = false;
|
|
227
|
-
std::string arg;
|
|
228
|
-
const std::string arg_prefix = "--";
|
|
229
|
-
llama_sampling_params & sparams = params.sparams;
|
|
230
|
-
|
|
231
|
-
for (int i = 1; i < argc; i++) {
|
|
232
|
-
arg = argv[i];
|
|
233
|
-
if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
|
|
234
|
-
std::replace(arg.begin(), arg.end(), '_', '-');
|
|
235
|
-
}
|
|
236
|
-
if (!gpt_params_find_arg(argc, argv, arg, params, i, invalid_param)) {
|
|
237
|
-
throw std::invalid_argument("error: unknown argument: " + arg);
|
|
238
|
-
}
|
|
239
|
-
if (invalid_param) {
|
|
240
|
-
throw std::invalid_argument("error: invalid parameter for argument: " + arg);
|
|
241
|
-
}
|
|
242
|
-
}
|
|
243
|
-
|
|
244
|
-
if (params.prompt_cache_all && (params.interactive || params.interactive_first)) {
|
|
245
|
-
throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n");
|
|
246
|
-
}
|
|
247
|
-
|
|
248
|
-
gpt_params_handle_model_default(params);
|
|
249
|
-
|
|
250
|
-
gpt_params_handle_hf_token(params);
|
|
251
|
-
|
|
252
|
-
if (params.escape) {
|
|
253
|
-
string_process_escapes(params.prompt);
|
|
254
|
-
string_process_escapes(params.input_prefix);
|
|
255
|
-
string_process_escapes(params.input_suffix);
|
|
256
|
-
string_process_escapes(sparams.cfg_negative_prompt);
|
|
257
|
-
for (auto & antiprompt : params.antiprompt) {
|
|
258
|
-
string_process_escapes(antiprompt);
|
|
259
|
-
}
|
|
260
|
-
}
|
|
261
|
-
|
|
262
|
-
if (!params.kv_overrides.empty()) {
|
|
263
|
-
params.kv_overrides.emplace_back();
|
|
264
|
-
params.kv_overrides.back().key[0] = 0;
|
|
265
|
-
}
|
|
266
|
-
|
|
267
|
-
return true;
|
|
268
|
-
}
|
|
269
|
-
|
|
270
|
-
bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
|
|
271
|
-
const auto params_org = params; // the example can modify the default params
|
|
272
|
-
|
|
273
|
-
try {
|
|
274
|
-
if (!gpt_params_parse_ex(argc, argv, params) || params.usage) {
|
|
275
|
-
params = params_org;
|
|
276
|
-
params.usage = true;
|
|
277
|
-
return false;
|
|
278
|
-
}
|
|
279
|
-
} catch (const std::invalid_argument & ex) {
|
|
280
|
-
fprintf(stderr, "%s\n", ex.what());
|
|
281
|
-
params = params_org;
|
|
282
|
-
return false;
|
|
283
|
-
}
|
|
284
|
-
|
|
285
|
-
return true;
|
|
286
|
-
}
|
|
287
|
-
|
|
288
|
-
#define CHECK_ARG if (++i >= argc) { invalid_param = true; return true; }
|
|
289
|
-
|
|
290
|
-
bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param) {
|
|
291
|
-
const char split_delim = ',';
|
|
292
|
-
|
|
293
|
-
llama_sampling_params & sparams = params.sparams;
|
|
294
|
-
|
|
295
|
-
if (arg == "-s" || arg == "--seed") {
|
|
296
|
-
CHECK_ARG
|
|
297
|
-
// TODO: this is temporary, in the future the sampling state will be moved fully to llama_sampling_context.
|
|
298
|
-
params.seed = std::stoul(argv[i]);
|
|
299
|
-
sparams.seed = std::stoul(argv[i]);
|
|
300
|
-
return true;
|
|
301
|
-
}
|
|
302
|
-
if (arg == "-t" || arg == "--threads") {
|
|
303
|
-
CHECK_ARG
|
|
304
|
-
params.n_threads = std::stoi(argv[i]);
|
|
305
|
-
if (params.n_threads <= 0) {
|
|
306
|
-
params.n_threads = std::thread::hardware_concurrency();
|
|
307
|
-
}
|
|
308
|
-
return true;
|
|
309
|
-
}
|
|
310
|
-
if (arg == "-tb" || arg == "--threads-batch") {
|
|
311
|
-
CHECK_ARG
|
|
312
|
-
params.n_threads_batch = std::stoi(argv[i]);
|
|
313
|
-
if (params.n_threads_batch <= 0) {
|
|
314
|
-
params.n_threads_batch = std::thread::hardware_concurrency();
|
|
315
|
-
}
|
|
316
|
-
return true;
|
|
317
|
-
}
|
|
318
|
-
if (arg == "-td" || arg == "--threads-draft") {
|
|
319
|
-
CHECK_ARG
|
|
320
|
-
params.n_threads_draft = std::stoi(argv[i]);
|
|
321
|
-
if (params.n_threads_draft <= 0) {
|
|
322
|
-
params.n_threads_draft = std::thread::hardware_concurrency();
|
|
323
|
-
}
|
|
324
|
-
return true;
|
|
325
|
-
}
|
|
326
|
-
if (arg == "-tbd" || arg == "--threads-batch-draft") {
|
|
327
|
-
CHECK_ARG
|
|
328
|
-
params.n_threads_batch_draft = std::stoi(argv[i]);
|
|
329
|
-
if (params.n_threads_batch_draft <= 0) {
|
|
330
|
-
params.n_threads_batch_draft = std::thread::hardware_concurrency();
|
|
331
|
-
}
|
|
332
|
-
return true;
|
|
333
|
-
}
|
|
334
|
-
if (arg == "-p" || arg == "--prompt") {
|
|
335
|
-
CHECK_ARG
|
|
336
|
-
params.prompt = argv[i];
|
|
337
|
-
return true;
|
|
338
|
-
}
|
|
339
|
-
if (arg == "-e" || arg == "--escape") {
|
|
340
|
-
params.escape = true;
|
|
341
|
-
return true;
|
|
342
|
-
}
|
|
343
|
-
if (arg == "--no-escape") {
|
|
344
|
-
params.escape = false;
|
|
345
|
-
return true;
|
|
346
|
-
}
|
|
347
|
-
if (arg == "--prompt-cache") {
|
|
348
|
-
CHECK_ARG
|
|
349
|
-
params.path_prompt_cache = argv[i];
|
|
350
|
-
return true;
|
|
351
|
-
}
|
|
352
|
-
if (arg == "--prompt-cache-all") {
|
|
353
|
-
params.prompt_cache_all = true;
|
|
354
|
-
return true;
|
|
355
|
-
}
|
|
356
|
-
if (arg == "--prompt-cache-ro") {
|
|
357
|
-
params.prompt_cache_ro = true;
|
|
358
|
-
return true;
|
|
359
|
-
}
|
|
360
|
-
if (arg == "-bf" || arg == "--binary-file") {
|
|
361
|
-
CHECK_ARG
|
|
362
|
-
std::ifstream file(argv[i], std::ios::binary);
|
|
363
|
-
if (!file) {
|
|
364
|
-
fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
|
|
365
|
-
invalid_param = true;
|
|
366
|
-
return true;
|
|
367
|
-
}
|
|
368
|
-
// store the external file name in params
|
|
369
|
-
params.prompt_file = argv[i];
|
|
370
|
-
std::ostringstream ss;
|
|
371
|
-
ss << file.rdbuf();
|
|
372
|
-
params.prompt = ss.str();
|
|
373
|
-
fprintf(stderr, "Read %zu bytes from binary file %s\n", params.prompt.size(), argv[i]);
|
|
374
|
-
return true;
|
|
375
|
-
}
|
|
376
|
-
if (arg == "-f" || arg == "--file") {
|
|
377
|
-
CHECK_ARG
|
|
378
|
-
std::ifstream file(argv[i]);
|
|
379
|
-
if (!file) {
|
|
380
|
-
fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
|
|
381
|
-
invalid_param = true;
|
|
382
|
-
return true;
|
|
383
|
-
}
|
|
384
|
-
// store the external file name in params
|
|
385
|
-
params.prompt_file = argv[i];
|
|
386
|
-
std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.prompt));
|
|
387
|
-
if (!params.prompt.empty() && params.prompt.back() == '\n') {
|
|
388
|
-
params.prompt.pop_back();
|
|
389
|
-
}
|
|
390
|
-
return true;
|
|
391
|
-
}
|
|
392
|
-
if (arg == "--in-file") {
|
|
393
|
-
CHECK_ARG
|
|
394
|
-
std::ifstream file(argv[i]);
|
|
395
|
-
if (!file) {
|
|
396
|
-
fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
|
|
397
|
-
invalid_param = true;
|
|
398
|
-
return true;
|
|
399
|
-
}
|
|
400
|
-
params.in_files.push_back(argv[i]);
|
|
401
|
-
return true;
|
|
402
|
-
}
|
|
403
|
-
if (arg == "-n" || arg == "--predict" || arg == "--n-predict") {
|
|
404
|
-
CHECK_ARG
|
|
405
|
-
params.n_predict = std::stoi(argv[i]);
|
|
406
|
-
return true;
|
|
407
|
-
}
|
|
408
|
-
if (arg == "--top-k") {
|
|
409
|
-
CHECK_ARG
|
|
410
|
-
sparams.top_k = std::stoi(argv[i]);
|
|
411
|
-
return true;
|
|
412
|
-
}
|
|
413
|
-
if (arg == "-c" || arg == "--ctx-size") {
|
|
414
|
-
CHECK_ARG
|
|
415
|
-
params.n_ctx = std::stoi(argv[i]);
|
|
416
|
-
return true;
|
|
417
|
-
}
|
|
418
|
-
if (arg == "--grp-attn-n" || arg == "-gan") {
|
|
419
|
-
CHECK_ARG
|
|
420
|
-
params.grp_attn_n = std::stoi(argv[i]);
|
|
421
|
-
return true;
|
|
422
|
-
}
|
|
423
|
-
if (arg == "--grp-attn-w" || arg == "-gaw") {
|
|
424
|
-
CHECK_ARG
|
|
425
|
-
params.grp_attn_w = std::stoi(argv[i]);
|
|
426
|
-
return true;
|
|
427
|
-
}
|
|
428
|
-
if (arg == "--rope-freq-base") {
|
|
429
|
-
CHECK_ARG
|
|
430
|
-
params.rope_freq_base = std::stof(argv[i]);
|
|
431
|
-
return true;
|
|
432
|
-
}
|
|
433
|
-
if (arg == "--rope-freq-scale") {
|
|
434
|
-
CHECK_ARG
|
|
435
|
-
params.rope_freq_scale = std::stof(argv[i]);
|
|
436
|
-
return true;
|
|
437
|
-
}
|
|
438
|
-
if (arg == "--rope-scaling") {
|
|
439
|
-
CHECK_ARG
|
|
440
|
-
std::string value(argv[i]);
|
|
441
|
-
/**/ if (value == "none") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_NONE; }
|
|
442
|
-
else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_LINEAR; }
|
|
443
|
-
else if (value == "yarn") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_YARN; }
|
|
444
|
-
else { invalid_param = true; }
|
|
445
|
-
return true;
|
|
446
|
-
}
|
|
447
|
-
if (arg == "--rope-scale") {
|
|
448
|
-
CHECK_ARG
|
|
449
|
-
params.rope_freq_scale = 1.0f / std::stof(argv[i]);
|
|
450
|
-
return true;
|
|
451
|
-
}
|
|
452
|
-
if (arg == "--yarn-orig-ctx") {
|
|
453
|
-
CHECK_ARG
|
|
454
|
-
params.yarn_orig_ctx = std::stoi(argv[i]);
|
|
455
|
-
return true;
|
|
456
|
-
}
|
|
457
|
-
if (arg == "--yarn-ext-factor") {
|
|
458
|
-
CHECK_ARG
|
|
459
|
-
params.yarn_ext_factor = std::stof(argv[i]);
|
|
460
|
-
return true;
|
|
461
|
-
}
|
|
462
|
-
if (arg == "--yarn-attn-factor") {
|
|
463
|
-
CHECK_ARG
|
|
464
|
-
params.yarn_attn_factor = std::stof(argv[i]);
|
|
465
|
-
return true;
|
|
466
|
-
}
|
|
467
|
-
if (arg == "--yarn-beta-fast") {
|
|
468
|
-
CHECK_ARG
|
|
469
|
-
params.yarn_beta_fast = std::stof(argv[i]);
|
|
470
|
-
return true;
|
|
471
|
-
}
|
|
472
|
-
if (arg == "--yarn-beta-slow") {
|
|
473
|
-
CHECK_ARG
|
|
474
|
-
params.yarn_beta_slow = std::stof(argv[i]);
|
|
475
|
-
return true;
|
|
476
|
-
}
|
|
477
|
-
if (arg == "--pooling") {
|
|
478
|
-
CHECK_ARG
|
|
479
|
-
std::string value(argv[i]);
|
|
480
|
-
/**/ if (value == "none") { params.pooling_type = LLAMA_POOLING_TYPE_NONE; }
|
|
481
|
-
else if (value == "mean") { params.pooling_type = LLAMA_POOLING_TYPE_MEAN; }
|
|
482
|
-
else if (value == "cls") { params.pooling_type = LLAMA_POOLING_TYPE_CLS; }
|
|
483
|
-
else if (value == "last") { params.pooling_type = LLAMA_POOLING_TYPE_LAST; }
|
|
484
|
-
else { invalid_param = true; }
|
|
485
|
-
return true;
|
|
486
|
-
}
|
|
487
|
-
if (arg == "--attention") {
|
|
488
|
-
CHECK_ARG
|
|
489
|
-
std::string value(argv[i]);
|
|
490
|
-
/**/ if (value == "causal") { params.attention_type = LLAMA_ATTENTION_TYPE_CAUSAL; }
|
|
491
|
-
else if (value == "non-causal") { params.attention_type = LLAMA_ATTENTION_TYPE_NON_CAUSAL; }
|
|
492
|
-
else { invalid_param = true; }
|
|
493
|
-
return true;
|
|
494
|
-
}
|
|
495
|
-
if (arg == "--defrag-thold" || arg == "-dt") {
|
|
496
|
-
CHECK_ARG
|
|
497
|
-
params.defrag_thold = std::stof(argv[i]);
|
|
498
|
-
return true;
|
|
499
|
-
}
|
|
500
|
-
if (arg == "--samplers") {
|
|
501
|
-
CHECK_ARG
|
|
502
|
-
const auto sampler_names = string_split(argv[i], ';');
|
|
503
|
-
sparams.samplers_sequence = llama_sampling_types_from_names(sampler_names, true);
|
|
504
|
-
return true;
|
|
505
|
-
}
|
|
506
|
-
if (arg == "--sampling-seq") {
|
|
507
|
-
CHECK_ARG
|
|
508
|
-
sparams.samplers_sequence = llama_sampling_types_from_chars(argv[i]);
|
|
509
|
-
return true;
|
|
510
|
-
}
|
|
511
|
-
if (arg == "--top-p") {
|
|
512
|
-
CHECK_ARG
|
|
513
|
-
sparams.top_p = std::stof(argv[i]);
|
|
514
|
-
return true;
|
|
515
|
-
}
|
|
516
|
-
if (arg == "--min-p") {
|
|
517
|
-
CHECK_ARG
|
|
518
|
-
sparams.min_p = std::stof(argv[i]);
|
|
519
|
-
return true;
|
|
520
|
-
}
|
|
521
|
-
if (arg == "--temp") {
|
|
522
|
-
CHECK_ARG
|
|
523
|
-
sparams.temp = std::stof(argv[i]);
|
|
524
|
-
sparams.temp = std::max(sparams.temp, 0.0f);
|
|
525
|
-
return true;
|
|
526
|
-
}
|
|
527
|
-
if (arg == "--tfs") {
|
|
528
|
-
CHECK_ARG
|
|
529
|
-
sparams.tfs_z = std::stof(argv[i]);
|
|
530
|
-
return true;
|
|
531
|
-
}
|
|
532
|
-
if (arg == "--typical") {
|
|
533
|
-
CHECK_ARG
|
|
534
|
-
sparams.typical_p = std::stof(argv[i]);
|
|
535
|
-
return true;
|
|
536
|
-
}
|
|
537
|
-
if (arg == "--repeat-last-n") {
|
|
538
|
-
CHECK_ARG
|
|
539
|
-
sparams.penalty_last_n = std::stoi(argv[i]);
|
|
540
|
-
sparams.n_prev = std::max(sparams.n_prev, sparams.penalty_last_n);
|
|
541
|
-
return true;
|
|
542
|
-
}
|
|
543
|
-
if (arg == "--repeat-penalty") {
|
|
544
|
-
CHECK_ARG
|
|
545
|
-
sparams.penalty_repeat = std::stof(argv[i]);
|
|
546
|
-
return true;
|
|
547
|
-
}
|
|
548
|
-
if (arg == "--frequency-penalty") {
|
|
549
|
-
CHECK_ARG
|
|
550
|
-
sparams.penalty_freq = std::stof(argv[i]);
|
|
551
|
-
return true;
|
|
552
|
-
}
|
|
553
|
-
if (arg == "--presence-penalty") {
|
|
554
|
-
CHECK_ARG
|
|
555
|
-
sparams.penalty_present = std::stof(argv[i]);
|
|
556
|
-
return true;
|
|
557
|
-
}
|
|
558
|
-
if (arg == "--dynatemp-range") {
|
|
559
|
-
CHECK_ARG
|
|
560
|
-
sparams.dynatemp_range = std::stof(argv[i]);
|
|
561
|
-
return true;
|
|
562
|
-
}
|
|
563
|
-
if (arg == "--dynatemp-exp") {
|
|
564
|
-
CHECK_ARG
|
|
565
|
-
sparams.dynatemp_exponent = std::stof(argv[i]);
|
|
566
|
-
return true;
|
|
567
|
-
}
|
|
568
|
-
if (arg == "--mirostat") {
|
|
569
|
-
CHECK_ARG
|
|
570
|
-
sparams.mirostat = std::stoi(argv[i]);
|
|
571
|
-
return true;
|
|
572
|
-
}
|
|
573
|
-
if (arg == "--mirostat-lr") {
|
|
574
|
-
CHECK_ARG
|
|
575
|
-
sparams.mirostat_eta = std::stof(argv[i]);
|
|
576
|
-
return true;
|
|
577
|
-
}
|
|
578
|
-
if (arg == "--mirostat-ent") {
|
|
579
|
-
CHECK_ARG
|
|
580
|
-
sparams.mirostat_tau = std::stof(argv[i]);
|
|
581
|
-
return true;
|
|
582
|
-
}
|
|
583
|
-
if (arg == "--cfg-negative-prompt") {
|
|
584
|
-
CHECK_ARG
|
|
585
|
-
sparams.cfg_negative_prompt = argv[i];
|
|
586
|
-
return true;
|
|
587
|
-
}
|
|
588
|
-
if (arg == "--cfg-negative-prompt-file") {
|
|
589
|
-
CHECK_ARG
|
|
590
|
-
std::ifstream file(argv[i]);
|
|
591
|
-
if (!file) {
|
|
592
|
-
fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
|
|
593
|
-
invalid_param = true;
|
|
594
|
-
return true;
|
|
595
|
-
}
|
|
596
|
-
std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(sparams.cfg_negative_prompt));
|
|
597
|
-
if (!sparams.cfg_negative_prompt.empty() && sparams.cfg_negative_prompt.back() == '\n') {
|
|
598
|
-
sparams.cfg_negative_prompt.pop_back();
|
|
599
|
-
}
|
|
600
|
-
return true;
|
|
601
|
-
}
|
|
602
|
-
if (arg == "--cfg-scale") {
|
|
603
|
-
CHECK_ARG
|
|
604
|
-
sparams.cfg_scale = std::stof(argv[i]);
|
|
605
|
-
return true;
|
|
606
|
-
}
|
|
607
|
-
if (arg == "-b" || arg == "--batch-size") {
|
|
608
|
-
CHECK_ARG
|
|
609
|
-
params.n_batch = std::stoi(argv[i]);
|
|
610
|
-
return true;
|
|
611
|
-
}
|
|
612
|
-
if (arg == "-ub" || arg == "--ubatch-size") {
|
|
613
|
-
CHECK_ARG
|
|
614
|
-
params.n_ubatch = std::stoi(argv[i]);
|
|
615
|
-
return true;
|
|
616
|
-
}
|
|
617
|
-
if (arg == "--keep") {
|
|
618
|
-
CHECK_ARG
|
|
619
|
-
params.n_keep = std::stoi(argv[i]);
|
|
620
|
-
return true;
|
|
621
|
-
}
|
|
622
|
-
if (arg == "--draft") {
|
|
623
|
-
CHECK_ARG
|
|
624
|
-
params.n_draft = std::stoi(argv[i]);
|
|
625
|
-
return true;
|
|
626
|
-
}
|
|
627
|
-
if (arg == "--chunks") {
|
|
628
|
-
CHECK_ARG
|
|
629
|
-
params.n_chunks = std::stoi(argv[i]);
|
|
630
|
-
return true;
|
|
631
|
-
}
|
|
632
|
-
if (arg == "-np" || arg == "--parallel") {
|
|
633
|
-
CHECK_ARG
|
|
634
|
-
params.n_parallel = std::stoi(argv[i]);
|
|
635
|
-
return true;
|
|
636
|
-
}
|
|
637
|
-
if (arg == "-ns" || arg == "--sequences") {
|
|
638
|
-
CHECK_ARG
|
|
639
|
-
params.n_sequences = std::stoi(argv[i]);
|
|
640
|
-
return true;
|
|
641
|
-
}
|
|
642
|
-
if (arg == "--p-split" || arg == "-ps") {
|
|
643
|
-
CHECK_ARG
|
|
644
|
-
params.p_split = std::stof(argv[i]);
|
|
645
|
-
return true;
|
|
646
|
-
}
|
|
647
|
-
if (arg == "-m" || arg == "--model") {
|
|
648
|
-
CHECK_ARG
|
|
649
|
-
params.model = argv[i];
|
|
650
|
-
return true;
|
|
651
|
-
}
|
|
652
|
-
if (arg == "-md" || arg == "--model-draft") {
|
|
653
|
-
CHECK_ARG
|
|
654
|
-
params.model_draft = argv[i];
|
|
655
|
-
return true;
|
|
656
|
-
}
|
|
657
|
-
if (arg == "-a" || arg == "--alias") {
|
|
658
|
-
CHECK_ARG
|
|
659
|
-
params.model_alias = argv[i];
|
|
660
|
-
return true;
|
|
661
|
-
}
|
|
662
|
-
if (arg == "-mu" || arg == "--model-url") {
|
|
663
|
-
CHECK_ARG
|
|
664
|
-
params.model_url = argv[i];
|
|
665
|
-
return true;
|
|
666
|
-
}
|
|
667
|
-
if (arg == "-hft" || arg == "--hf-token") {
|
|
668
|
-
if (++i >= argc) {
|
|
669
|
-
invalid_param = true;
|
|
670
|
-
return true;
|
|
671
|
-
}
|
|
672
|
-
params.hf_token = argv[i];
|
|
673
|
-
return true;
|
|
674
|
-
}
|
|
675
|
-
if (arg == "-hfr" || arg == "--hf-repo") {
|
|
676
|
-
CHECK_ARG
|
|
677
|
-
params.hf_repo = argv[i];
|
|
678
|
-
return true;
|
|
679
|
-
}
|
|
680
|
-
if (arg == "-hff" || arg == "--hf-file") {
|
|
681
|
-
CHECK_ARG
|
|
682
|
-
params.hf_file = argv[i];
|
|
683
|
-
return true;
|
|
684
|
-
}
|
|
685
|
-
if (arg == "--lora") {
|
|
686
|
-
CHECK_ARG
|
|
687
|
-
params.lora_adapter.emplace_back(argv[i], 1.0f);
|
|
688
|
-
return true;
|
|
689
|
-
}
|
|
690
|
-
if (arg == "--lora-scaled") {
|
|
691
|
-
CHECK_ARG
|
|
692
|
-
const char* lora_adapter = argv[i];
|
|
693
|
-
CHECK_ARG
|
|
694
|
-
params.lora_adapter.emplace_back(lora_adapter, std::stof(argv[i]));
|
|
695
|
-
return true;
|
|
696
|
-
}
|
|
697
|
-
if (arg == "--control-vector") {
|
|
698
|
-
CHECK_ARG
|
|
699
|
-
params.control_vectors.push_back({ 1.0f, argv[i], });
|
|
700
|
-
return true;
|
|
701
|
-
}
|
|
702
|
-
if (arg == "--control-vector-scaled") {
|
|
703
|
-
CHECK_ARG
|
|
704
|
-
const char* fname = argv[i];
|
|
705
|
-
CHECK_ARG
|
|
706
|
-
params.control_vectors.push_back({ std::stof(argv[i]), fname, });
|
|
707
|
-
return true;
|
|
708
|
-
}
|
|
709
|
-
if (arg == "--control-vector-layer-range") {
|
|
710
|
-
CHECK_ARG
|
|
711
|
-
params.control_vector_layer_start = std::stoi(argv[i]);
|
|
712
|
-
CHECK_ARG
|
|
713
|
-
params.control_vector_layer_end = std::stoi(argv[i]);
|
|
714
|
-
return true;
|
|
715
|
-
}
|
|
716
|
-
if (arg == "--mmproj") {
|
|
717
|
-
CHECK_ARG
|
|
718
|
-
params.mmproj = argv[i];
|
|
719
|
-
return true;
|
|
720
|
-
}
|
|
721
|
-
if (arg == "--image") {
|
|
722
|
-
CHECK_ARG
|
|
723
|
-
params.image.emplace_back(argv[i]);
|
|
724
|
-
return true;
|
|
725
|
-
}
|
|
726
|
-
if (arg == "-i" || arg == "--interactive") {
|
|
727
|
-
params.interactive = true;
|
|
728
|
-
return true;
|
|
729
|
-
}
|
|
730
|
-
if (arg == "-sp" || arg == "--special") {
|
|
731
|
-
params.special = true;
|
|
732
|
-
return true;
|
|
733
|
-
}
|
|
734
|
-
if (arg == "--embedding" || arg == "--embeddings") {
|
|
735
|
-
params.embedding = true;
|
|
736
|
-
return true;
|
|
737
|
-
}
|
|
738
|
-
if (arg == "--embd-normalize") {
|
|
739
|
-
CHECK_ARG
|
|
740
|
-
params.embd_normalize = std::stoi(argv[i]);
|
|
741
|
-
return true;
|
|
742
|
-
}
|
|
743
|
-
if (arg == "--embd-output-format") {
|
|
744
|
-
CHECK_ARG
|
|
745
|
-
params.embd_out = argv[i];
|
|
746
|
-
return true;
|
|
747
|
-
}
|
|
748
|
-
if (arg == "--embd-separator") {
|
|
749
|
-
CHECK_ARG
|
|
750
|
-
params.embd_sep = argv[i];
|
|
751
|
-
return true;
|
|
752
|
-
}
|
|
753
|
-
if (arg == "-if" || arg == "--interactive-first") {
|
|
754
|
-
params.interactive_first = true;
|
|
755
|
-
return true;
|
|
756
|
-
}
|
|
757
|
-
if (arg == "-cnv" || arg == "--conversation") {
|
|
758
|
-
params.conversation = true;
|
|
759
|
-
return true;
|
|
760
|
-
}
|
|
761
|
-
if (arg == "--infill") {
|
|
762
|
-
params.infill = true;
|
|
763
|
-
return true;
|
|
764
|
-
}
|
|
765
|
-
if (arg == "-dkvc" || arg == "--dump-kv-cache") {
|
|
766
|
-
params.dump_kv_cache = true;
|
|
767
|
-
return true;
|
|
768
|
-
}
|
|
769
|
-
if (arg == "-nkvo" || arg == "--no-kv-offload") {
|
|
770
|
-
params.no_kv_offload = true;
|
|
771
|
-
return true;
|
|
772
|
-
}
|
|
773
|
-
if (arg == "-ctk" || arg == "--cache-type-k") {
|
|
774
|
-
params.cache_type_k = argv[++i];
|
|
775
|
-
return true;
|
|
776
|
-
}
|
|
777
|
-
if (arg == "-ctv" || arg == "--cache-type-v") {
|
|
778
|
-
params.cache_type_v = argv[++i];
|
|
779
|
-
return true;
|
|
780
|
-
}
|
|
781
|
-
if (arg == "-mli" || arg == "--multiline-input") {
|
|
782
|
-
params.multiline_input = true;
|
|
783
|
-
return true;
|
|
784
|
-
}
|
|
785
|
-
if (arg == "--simple-io") {
|
|
786
|
-
params.simple_io = true;
|
|
787
|
-
return true;
|
|
788
|
-
}
|
|
789
|
-
if (arg == "-cb" || arg == "--cont-batching") {
|
|
790
|
-
params.cont_batching = true;
|
|
791
|
-
return true;
|
|
792
|
-
}
|
|
793
|
-
if (arg == "-nocb" || arg == "--no-cont-batching") {
|
|
794
|
-
params.cont_batching = false;
|
|
795
|
-
return true;
|
|
796
|
-
}
|
|
797
|
-
if (arg == "-fa" || arg == "--flash-attn") {
|
|
798
|
-
params.flash_attn = true;
|
|
799
|
-
return true;
|
|
800
|
-
}
|
|
801
|
-
if (arg == "-co" || arg == "--color") {
|
|
802
|
-
params.use_color = true;
|
|
803
|
-
return true;
|
|
804
|
-
}
|
|
805
|
-
if (arg == "--mlock") {
|
|
806
|
-
params.use_mlock = true;
|
|
807
|
-
return true;
|
|
808
|
-
}
|
|
809
|
-
if (arg == "-ngl" || arg == "--gpu-layers" || arg == "--n-gpu-layers") {
|
|
810
|
-
CHECK_ARG
|
|
811
|
-
params.n_gpu_layers = std::stoi(argv[i]);
|
|
812
|
-
if (!llama_supports_gpu_offload()) {
|
|
813
|
-
fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers option will be ignored\n");
|
|
814
|
-
fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
|
|
815
|
-
}
|
|
816
|
-
return true;
|
|
817
|
-
}
|
|
818
|
-
if (arg == "-ngld" || arg == "--gpu-layers-draft" || arg == "--gpu-layers-draft") {
|
|
819
|
-
CHECK_ARG
|
|
820
|
-
params.n_gpu_layers_draft = std::stoi(argv[i]);
|
|
821
|
-
if (!llama_supports_gpu_offload()) {
|
|
822
|
-
fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers-draft option will be ignored\n");
|
|
823
|
-
fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
|
|
824
|
-
}
|
|
825
|
-
return true;
|
|
826
|
-
}
|
|
827
|
-
if (arg == "--main-gpu" || arg == "-mg") {
|
|
828
|
-
CHECK_ARG
|
|
829
|
-
params.main_gpu = std::stoi(argv[i]);
|
|
830
|
-
#ifndef GGML_USE_CUDA_SYCL_VULKAN
|
|
831
|
-
fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL/Vulkan. Setting the main GPU has no effect.\n");
|
|
832
|
-
#endif // GGML_USE_CUDA_SYCL_VULKAN
|
|
833
|
-
return true;
|
|
834
|
-
}
|
|
835
|
-
if (arg == "--split-mode" || arg == "-sm") {
|
|
836
|
-
CHECK_ARG
|
|
837
|
-
std::string arg_next = argv[i];
|
|
838
|
-
if (arg_next == "none") {
|
|
839
|
-
params.split_mode = LLAMA_SPLIT_MODE_NONE;
|
|
840
|
-
}
|
|
841
|
-
else if (arg_next == "layer") {
|
|
842
|
-
params.split_mode = LLAMA_SPLIT_MODE_LAYER;
|
|
843
|
-
}
|
|
844
|
-
else if (arg_next == "row") {
|
|
845
|
-
#ifdef GGML_USE_SYCL
|
|
846
|
-
fprintf(stderr, "warning: The split mode value:[row] is not supported by llama.cpp with SYCL. It's developing.\nExit!\n");
|
|
847
|
-
exit(1);
|
|
848
|
-
#endif // GGML_USE_SYCL
|
|
849
|
-
params.split_mode = LLAMA_SPLIT_MODE_ROW;
|
|
850
|
-
}
|
|
851
|
-
else {
|
|
852
|
-
invalid_param = true;
|
|
853
|
-
return true;
|
|
854
|
-
}
|
|
855
|
-
#ifndef GGML_USE_CUDA_SYCL_VULKAN
|
|
856
|
-
fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL/Vulkan. Setting the split mode has no effect.\n");
|
|
857
|
-
#endif // GGML_USE_CUDA_SYCL_VULKAN
|
|
858
|
-
return true;
|
|
859
|
-
}
|
|
860
|
-
if (arg == "--tensor-split" || arg == "-ts") {
|
|
861
|
-
CHECK_ARG
|
|
862
|
-
std::string arg_next = argv[i];
|
|
863
|
-
|
|
864
|
-
// split string by , and /
|
|
865
|
-
const std::regex regex{ R"([,/]+)" };
|
|
866
|
-
std::sregex_token_iterator it{ arg_next.begin(), arg_next.end(), regex, -1 };
|
|
867
|
-
std::vector<std::string> split_arg{ it, {} };
|
|
868
|
-
if (split_arg.size() >= llama_max_devices()) {
|
|
869
|
-
invalid_param = true;
|
|
870
|
-
return true;
|
|
871
|
-
}
|
|
872
|
-
for (size_t i = 0; i < llama_max_devices(); ++i) {
|
|
873
|
-
if (i < split_arg.size()) {
|
|
874
|
-
params.tensor_split[i] = std::stof(split_arg[i]);
|
|
875
|
-
}
|
|
876
|
-
else {
|
|
877
|
-
params.tensor_split[i] = 0.0f;
|
|
878
|
-
}
|
|
879
|
-
}
|
|
880
|
-
#ifndef GGML_USE_CUDA_SYCL_VULKAN
|
|
881
|
-
fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL/Vulkan. Setting a tensor split has no effect.\n");
|
|
882
|
-
#endif // GGML_USE_CUDA_SYCL_VULKAN
|
|
883
|
-
return true;
|
|
884
|
-
}
|
|
885
|
-
if (arg == "--rpc") {
|
|
886
|
-
CHECK_ARG
|
|
887
|
-
params.rpc_servers = argv[i];
|
|
888
|
-
return true;
|
|
889
|
-
}
|
|
890
|
-
if (arg == "--no-mmap") {
|
|
891
|
-
params.use_mmap = false;
|
|
892
|
-
return true;
|
|
893
|
-
}
|
|
894
|
-
if (arg == "--numa") {
|
|
895
|
-
CHECK_ARG
|
|
896
|
-
std::string value(argv[i]);
|
|
897
|
-
/**/ if (value == "distribute" || value == "") { params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; }
|
|
898
|
-
else if (value == "isolate") { params.numa = GGML_NUMA_STRATEGY_ISOLATE; }
|
|
899
|
-
else if (value == "numactl") { params.numa = GGML_NUMA_STRATEGY_NUMACTL; }
|
|
900
|
-
else { invalid_param = true; }
|
|
901
|
-
return true;
|
|
902
|
-
}
|
|
903
|
-
if (arg == "-v" || arg == "--verbose") {
|
|
904
|
-
params.verbosity = 1;
|
|
905
|
-
return true;
|
|
906
|
-
}
|
|
907
|
-
if (arg == "--verbosity") {
|
|
908
|
-
CHECK_ARG
|
|
909
|
-
params.verbosity = std::stoi(argv[i]);
|
|
910
|
-
return true;
|
|
911
|
-
}
|
|
912
|
-
if (arg == "--verbose-prompt") {
|
|
913
|
-
params.verbose_prompt = true;
|
|
914
|
-
return true;
|
|
915
|
-
}
|
|
916
|
-
if (arg == "--no-display-prompt") {
|
|
917
|
-
params.display_prompt = false;
|
|
918
|
-
return true;
|
|
919
|
-
}
|
|
920
|
-
if (arg == "-r" || arg == "--reverse-prompt") {
|
|
921
|
-
CHECK_ARG
|
|
922
|
-
params.antiprompt.emplace_back(argv[i]);
|
|
923
|
-
return true;
|
|
924
|
-
}
|
|
925
|
-
if (arg == "-ld" || arg == "--logdir") {
|
|
926
|
-
CHECK_ARG
|
|
927
|
-
params.logdir = argv[i];
|
|
928
|
-
|
|
929
|
-
if (params.logdir.back() != DIRECTORY_SEPARATOR) {
|
|
930
|
-
params.logdir += DIRECTORY_SEPARATOR;
|
|
931
|
-
}
|
|
932
|
-
return true;
|
|
933
|
-
}
|
|
934
|
-
if (arg == "-lcs" || arg == "--lookup-cache-static") {
|
|
935
|
-
CHECK_ARG
|
|
936
|
-
params.lookup_cache_static = argv[i];
|
|
937
|
-
return true;
|
|
938
|
-
}
|
|
939
|
-
if (arg == "-lcd" || arg == "--lookup-cache-dynamic") {
|
|
940
|
-
CHECK_ARG
|
|
941
|
-
params.lookup_cache_dynamic = argv[i];
|
|
942
|
-
return true;
|
|
943
|
-
}
|
|
944
|
-
if (arg == "--save-all-logits" || arg == "--kl-divergence-base") {
|
|
945
|
-
CHECK_ARG
|
|
946
|
-
params.logits_file = argv[i];
|
|
947
|
-
return true;
|
|
948
|
-
}
|
|
949
|
-
if (arg == "--perplexity" || arg == "--all-logits") {
|
|
950
|
-
params.logits_all = true;
|
|
951
|
-
return true;
|
|
952
|
-
}
|
|
953
|
-
if (arg == "--ppl-stride") {
|
|
954
|
-
CHECK_ARG
|
|
955
|
-
params.ppl_stride = std::stoi(argv[i]);
|
|
956
|
-
return true;
|
|
957
|
-
}
|
|
958
|
-
if (arg == "--ppl-output-type") {
|
|
959
|
-
CHECK_ARG
|
|
960
|
-
params.ppl_output_type = std::stoi(argv[i]);
|
|
961
|
-
return true;
|
|
962
|
-
}
|
|
963
|
-
if (arg == "-ptc" || arg == "--print-token-count") {
|
|
964
|
-
CHECK_ARG
|
|
965
|
-
params.n_print = std::stoi(argv[i]);
|
|
966
|
-
return true;
|
|
967
|
-
}
|
|
968
|
-
if (arg == "--check-tensors") {
|
|
969
|
-
params.check_tensors = true;
|
|
970
|
-
return true;
|
|
971
|
-
}
|
|
972
|
-
if (arg == "--hellaswag") {
|
|
973
|
-
params.hellaswag = true;
|
|
974
|
-
return true;
|
|
975
|
-
}
|
|
976
|
-
if (arg == "--hellaswag-tasks") {
|
|
977
|
-
CHECK_ARG
|
|
978
|
-
params.hellaswag_tasks = std::stoi(argv[i]);
|
|
979
|
-
return true;
|
|
980
|
-
}
|
|
981
|
-
if (arg == "--winogrande") {
|
|
982
|
-
params.winogrande = true;
|
|
983
|
-
return true;
|
|
984
|
-
}
|
|
985
|
-
if (arg == "--winogrande-tasks") {
|
|
986
|
-
CHECK_ARG
|
|
987
|
-
params.winogrande_tasks = std::stoi(argv[i]);
|
|
988
|
-
return true;
|
|
989
|
-
}
|
|
990
|
-
if (arg == "--multiple-choice") {
|
|
991
|
-
params.multiple_choice = true;
|
|
992
|
-
return true;
|
|
993
|
-
}
|
|
994
|
-
if (arg == "--multiple-choice-tasks") {
|
|
995
|
-
CHECK_ARG
|
|
996
|
-
params.multiple_choice_tasks = std::stoi(argv[i]);
|
|
997
|
-
return true;
|
|
998
|
-
}
|
|
999
|
-
if (arg == "--kl-divergence") {
|
|
1000
|
-
params.kl_divergence = true;
|
|
1001
|
-
return true;
|
|
1002
|
-
}
|
|
1003
|
-
if (arg == "--ignore-eos") {
|
|
1004
|
-
params.ignore_eos = true;
|
|
1005
|
-
return true;
|
|
1006
|
-
}
|
|
1007
|
-
if (arg == "--penalize-nl") {
|
|
1008
|
-
sparams.penalize_nl = true;
|
|
1009
|
-
return true;
|
|
1010
|
-
}
|
|
1011
|
-
if (arg == "-l" || arg == "--logit-bias") {
|
|
1012
|
-
CHECK_ARG
|
|
1013
|
-
std::stringstream ss(argv[i]);
|
|
1014
|
-
llama_token key;
|
|
1015
|
-
char sign;
|
|
1016
|
-
std::string value_str;
|
|
1017
|
-
try {
|
|
1018
|
-
if (ss >> key && ss >> sign && std::getline(ss, value_str) && (sign == '+' || sign == '-')) {
|
|
1019
|
-
sparams.logit_bias[key] = std::stof(value_str) * ((sign == '-') ? -1.0f : 1.0f);
|
|
1020
|
-
}
|
|
1021
|
-
else {
|
|
1022
|
-
throw std::exception();
|
|
1023
|
-
}
|
|
1024
|
-
}
|
|
1025
|
-
catch (const std::exception&) {
|
|
1026
|
-
invalid_param = true;
|
|
1027
|
-
return true;
|
|
1028
|
-
}
|
|
1029
|
-
return true;
|
|
1030
|
-
}
|
|
1031
|
-
if (arg == "-h" || arg == "--help" || arg == "--usage" ) {
|
|
1032
|
-
params.usage = true;
|
|
1033
|
-
return true;
|
|
1034
|
-
}
|
|
1035
|
-
if (arg == "--version") {
|
|
1036
|
-
fprintf(stderr, "version: %d (%s)\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT);
|
|
1037
|
-
fprintf(stderr, "built with %s for %s\n", LLAMA_COMPILER, LLAMA_BUILD_TARGET);
|
|
1038
|
-
exit(0);
|
|
1039
|
-
}
|
|
1040
|
-
if (arg == "--in-prefix-bos") {
|
|
1041
|
-
params.input_prefix_bos = true;
|
|
1042
|
-
params.enable_chat_template = false;
|
|
1043
|
-
return true;
|
|
1044
|
-
}
|
|
1045
|
-
if (arg == "--in-prefix") {
|
|
1046
|
-
CHECK_ARG
|
|
1047
|
-
params.input_prefix = argv[i];
|
|
1048
|
-
params.enable_chat_template = false;
|
|
1049
|
-
return true;
|
|
1050
|
-
}
|
|
1051
|
-
if (arg == "--in-suffix") {
|
|
1052
|
-
CHECK_ARG
|
|
1053
|
-
params.input_suffix = argv[i];
|
|
1054
|
-
params.enable_chat_template = false;
|
|
1055
|
-
return true;
|
|
1056
|
-
}
|
|
1057
|
-
if (arg == "--spm-infill") {
|
|
1058
|
-
params.spm_infill = true;
|
|
1059
|
-
return true;
|
|
1060
|
-
}
|
|
1061
|
-
if (arg == "--grammar") {
|
|
1062
|
-
CHECK_ARG
|
|
1063
|
-
sparams.grammar = argv[i];
|
|
1064
|
-
return true;
|
|
1065
|
-
}
|
|
1066
|
-
if (arg == "--grammar-file") {
|
|
1067
|
-
CHECK_ARG
|
|
1068
|
-
std::ifstream file(argv[i]);
|
|
1069
|
-
if (!file) {
|
|
1070
|
-
fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
|
|
1071
|
-
invalid_param = true;
|
|
1072
|
-
return true;
|
|
1073
|
-
}
|
|
1074
|
-
std::copy(
|
|
1075
|
-
std::istreambuf_iterator<char>(file),
|
|
1076
|
-
std::istreambuf_iterator<char>(),
|
|
1077
|
-
std::back_inserter(sparams.grammar)
|
|
1078
|
-
);
|
|
1079
|
-
return true;
|
|
1080
|
-
}
|
|
1081
|
-
if (arg == "-j" || arg == "--json-schema") {
|
|
1082
|
-
CHECK_ARG
|
|
1083
|
-
sparams.grammar = json_schema_to_grammar(json::parse(argv[i]));
|
|
1084
|
-
return true;
|
|
1085
|
-
}
|
|
1086
|
-
if (arg == "--override-kv") {
|
|
1087
|
-
CHECK_ARG
|
|
1088
|
-
if (!string_parse_kv_override(argv[i], params.kv_overrides)) {
|
|
1089
|
-
fprintf(stderr, "error: Invalid type for KV override: %s\n", argv[i]);
|
|
1090
|
-
invalid_param = true;
|
|
1091
|
-
return true;
|
|
1092
|
-
}
|
|
1093
|
-
return true;
|
|
1094
|
-
}
|
|
1095
|
-
if (arg == "--host") {
|
|
1096
|
-
CHECK_ARG
|
|
1097
|
-
params.hostname = argv[i];
|
|
1098
|
-
return true;
|
|
1099
|
-
}
|
|
1100
|
-
if (arg == "--port") {
|
|
1101
|
-
CHECK_ARG
|
|
1102
|
-
params.port = std::stoi(argv[i]);
|
|
1103
|
-
return true;
|
|
1104
|
-
}
|
|
1105
|
-
if (arg == "--path") {
|
|
1106
|
-
CHECK_ARG
|
|
1107
|
-
params.public_path = argv[i];
|
|
1108
|
-
return true;
|
|
178
|
+
if (is_running_on_efficiency_core()) {
|
|
179
|
+
continue; // efficiency cores harm lockstep threading
|
|
180
|
+
}
|
|
181
|
+
++cpu; // hyperthreading isn't useful for linear algebra
|
|
182
|
+
++result;
|
|
1109
183
|
}
|
|
1110
|
-
|
|
1111
|
-
|
|
1112
|
-
|
|
1113
|
-
|
|
184
|
+
return result;
|
|
185
|
+
}
|
|
186
|
+
|
|
187
|
+
#endif // __x86_64__ && __linux__
|
|
188
|
+
|
|
189
|
+
/**
|
|
190
|
+
* Returns number of CPUs on system that are useful for math.
|
|
191
|
+
*/
|
|
192
|
+
int32_t cpu_get_num_math() {
|
|
193
|
+
#if defined(__x86_64__) && defined(__linux__) && !defined(__ANDROID__)
|
|
194
|
+
int n_cpu = sysconf(_SC_NPROCESSORS_ONLN);
|
|
195
|
+
if (n_cpu < 1) {
|
|
196
|
+
return cpu_get_num_physical_cores();
|
|
1114
197
|
}
|
|
1115
|
-
if (
|
|
1116
|
-
|
|
1117
|
-
|
|
1118
|
-
|
|
1119
|
-
|
|
1120
|
-
|
|
1121
|
-
|
|
1122
|
-
}
|
|
1123
|
-
std::string key;
|
|
1124
|
-
while (std::getline(key_file, key)) {
|
|
1125
|
-
if (!key.empty()) {
|
|
1126
|
-
params.api_keys.push_back(key);
|
|
198
|
+
if (is_hybrid_cpu()) {
|
|
199
|
+
cpu_set_t affinity;
|
|
200
|
+
if (!pthread_getaffinity_np(pthread_self(), sizeof(affinity), &affinity)) {
|
|
201
|
+
int result = cpu_count_math_cpus(n_cpu);
|
|
202
|
+
pthread_setaffinity_np(pthread_self(), sizeof(affinity), &affinity);
|
|
203
|
+
if (result > 0) {
|
|
204
|
+
return result;
|
|
1127
205
|
}
|
|
1128
206
|
}
|
|
1129
|
-
key_file.close();
|
|
1130
|
-
return true;
|
|
1131
|
-
}
|
|
1132
|
-
if (arg == "--ssl-key-file") {
|
|
1133
|
-
CHECK_ARG
|
|
1134
|
-
params.ssl_file_key = argv[i];
|
|
1135
|
-
return true;
|
|
1136
|
-
}
|
|
1137
|
-
if (arg == "--ssl-cert-file") {
|
|
1138
|
-
CHECK_ARG
|
|
1139
|
-
params.ssl_file_cert = argv[i];
|
|
1140
|
-
return true;
|
|
1141
207
|
}
|
|
1142
|
-
|
|
1143
|
-
|
|
1144
|
-
|
|
1145
|
-
|
|
208
|
+
#endif
|
|
209
|
+
return cpu_get_num_physical_cores();
|
|
210
|
+
}
|
|
211
|
+
|
|
212
|
+
// Helper for setting process priority
|
|
213
|
+
|
|
214
|
+
#if defined(_WIN32)
|
|
215
|
+
|
|
216
|
+
bool set_process_priority(enum ggml_sched_priority prio) {
|
|
217
|
+
if (prio == GGML_SCHED_PRIO_NORMAL) {
|
|
1146
218
|
return true;
|
|
1147
219
|
}
|
|
1148
|
-
|
|
1149
|
-
|
|
1150
|
-
|
|
1151
|
-
|
|
220
|
+
|
|
221
|
+
DWORD p = NORMAL_PRIORITY_CLASS;
|
|
222
|
+
switch (prio) {
|
|
223
|
+
case GGML_SCHED_PRIO_NORMAL: p = NORMAL_PRIORITY_CLASS; break;
|
|
224
|
+
case GGML_SCHED_PRIO_MEDIUM: p = ABOVE_NORMAL_PRIORITY_CLASS; break;
|
|
225
|
+
case GGML_SCHED_PRIO_HIGH: p = HIGH_PRIORITY_CLASS; break;
|
|
226
|
+
case GGML_SCHED_PRIO_REALTIME: p = REALTIME_PRIORITY_CLASS; break;
|
|
1152
227
|
}
|
|
1153
|
-
|
|
1154
|
-
|
|
1155
|
-
|
|
1156
|
-
|
|
1157
|
-
fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
|
|
1158
|
-
invalid_param = true;
|
|
1159
|
-
return true;
|
|
1160
|
-
}
|
|
1161
|
-
std::string system_prompt;
|
|
1162
|
-
std::copy(
|
|
1163
|
-
std::istreambuf_iterator<char>(file),
|
|
1164
|
-
std::istreambuf_iterator<char>(),
|
|
1165
|
-
std::back_inserter(system_prompt)
|
|
1166
|
-
);
|
|
1167
|
-
params.system_prompt = system_prompt;
|
|
1168
|
-
return true;
|
|
228
|
+
|
|
229
|
+
if (!SetPriorityClass(GetCurrentProcess(), p)) {
|
|
230
|
+
LOG_WRN("failed to set process priority class %d : (%d)\n", prio, (int) GetLastError());
|
|
231
|
+
return false;
|
|
1169
232
|
}
|
|
1170
|
-
|
|
1171
|
-
|
|
1172
|
-
|
|
1173
|
-
|
|
1174
|
-
|
|
1175
|
-
|
|
1176
|
-
|
|
1177
|
-
|
|
1178
|
-
|
|
1179
|
-
|
|
233
|
+
|
|
234
|
+
return true;
|
|
235
|
+
}
|
|
236
|
+
|
|
237
|
+
#else // MacOS and POSIX
|
|
238
|
+
#include <sys/types.h>
|
|
239
|
+
#include <sys/resource.h>
|
|
240
|
+
|
|
241
|
+
bool set_process_priority(enum ggml_sched_priority prio) {
|
|
242
|
+
if (prio == GGML_SCHED_PRIO_NORMAL) {
|
|
1180
243
|
return true;
|
|
1181
244
|
}
|
|
1182
|
-
|
|
1183
|
-
|
|
1184
|
-
|
|
245
|
+
|
|
246
|
+
int p = 0;
|
|
247
|
+
switch (prio) {
|
|
248
|
+
case GGML_SCHED_PRIO_NORMAL: p = 0; break;
|
|
249
|
+
case GGML_SCHED_PRIO_MEDIUM: p = -5; break;
|
|
250
|
+
case GGML_SCHED_PRIO_HIGH: p = -10; break;
|
|
251
|
+
case GGML_SCHED_PRIO_REALTIME: p = -20; break;
|
|
1185
252
|
}
|
|
1186
|
-
|
|
1187
|
-
|
|
1188
|
-
|
|
253
|
+
|
|
254
|
+
if (!setpriority(PRIO_PROCESS, 0, p)) {
|
|
255
|
+
LOG_WRN("failed to set process priority %d : %s (%d)\n", prio, strerror(errno), errno);
|
|
256
|
+
return false;
|
|
1189
257
|
}
|
|
1190
|
-
|
|
1191
|
-
|
|
1192
|
-
|
|
1193
|
-
|
|
1194
|
-
|
|
1195
|
-
|
|
258
|
+
return true;
|
|
259
|
+
}
|
|
260
|
+
|
|
261
|
+
#endif
|
|
262
|
+
|
|
263
|
+
//
|
|
264
|
+
// CLI argument parsing
|
|
265
|
+
//
|
|
266
|
+
|
|
267
|
+
|
|
268
|
+
void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role_model) {
|
|
269
|
+
int32_t n_set = 0;
|
|
270
|
+
|
|
271
|
+
if (cpuparams.n_threads < 0) {
|
|
272
|
+
// Assuming everything about cpuparams is invalid
|
|
273
|
+
if (role_model != nullptr) {
|
|
274
|
+
cpuparams = *role_model;
|
|
275
|
+
} else {
|
|
276
|
+
cpuparams.n_threads = cpu_get_num_math();
|
|
1196
277
|
}
|
|
1197
|
-
return true;
|
|
1198
278
|
}
|
|
1199
|
-
|
|
1200
|
-
|
|
1201
|
-
if (
|
|
1202
|
-
|
|
1203
|
-
fprintf(stderr, "note: llama.cpp does not use jinja parser, we only support commonly used templates\n");
|
|
1204
|
-
invalid_param = true;
|
|
1205
|
-
return true;
|
|
279
|
+
|
|
280
|
+
for (int32_t i = 0; i < GGML_MAX_N_THREADS; i++) {
|
|
281
|
+
if (cpuparams.cpumask[i]) {
|
|
282
|
+
n_set++;
|
|
1206
283
|
}
|
|
1207
|
-
params.chat_template = argv[i];
|
|
1208
|
-
return true;
|
|
1209
|
-
}
|
|
1210
|
-
if (arg == "--slot-prompt-similarity" || arg == "-sps") {
|
|
1211
|
-
CHECK_ARG
|
|
1212
|
-
params.slot_prompt_similarity = std::stof(argv[i]);
|
|
1213
|
-
return true;
|
|
1214
|
-
}
|
|
1215
|
-
if (arg == "-pps") {
|
|
1216
|
-
params.is_pp_shared = true;
|
|
1217
|
-
return true;
|
|
1218
284
|
}
|
|
1219
|
-
|
|
1220
|
-
|
|
1221
|
-
|
|
1222
|
-
|
|
1223
|
-
return true;
|
|
1224
|
-
}
|
|
1225
|
-
if (arg == "-ntg") {
|
|
1226
|
-
CHECK_ARG
|
|
1227
|
-
auto p = string_split<int>(argv[i], split_delim);
|
|
1228
|
-
params.n_tg.insert(params.n_tg.end(), p.begin(), p.end());
|
|
1229
|
-
return true;
|
|
285
|
+
|
|
286
|
+
if (n_set && n_set < cpuparams.n_threads) {
|
|
287
|
+
// Not enough set bits, may experience performance issues.
|
|
288
|
+
LOG_WRN("Not enough set bits in CPU mask (%d) to satisfy requested thread count: %d\n", n_set, cpuparams.n_threads);
|
|
1230
289
|
}
|
|
1231
|
-
|
|
1232
|
-
|
|
1233
|
-
|
|
1234
|
-
|
|
1235
|
-
|
|
290
|
+
}
|
|
291
|
+
|
|
292
|
+
bool parse_cpu_range(const std::string & range, bool (&boolmask)[GGML_MAX_N_THREADS]) {
|
|
293
|
+
size_t dash_loc = range.find('-');
|
|
294
|
+
if (dash_loc == std::string::npos) {
|
|
295
|
+
LOG_ERR("Format of CPU range is invalid! Expected [<start>]-[<end>].\n");
|
|
296
|
+
return false;
|
|
1236
297
|
}
|
|
1237
|
-
|
|
1238
|
-
|
|
1239
|
-
|
|
1240
|
-
|
|
1241
|
-
|
|
1242
|
-
|
|
1243
|
-
|
|
298
|
+
|
|
299
|
+
size_t start_i;
|
|
300
|
+
size_t end_i;
|
|
301
|
+
|
|
302
|
+
if (dash_loc == 0) {
|
|
303
|
+
start_i = 0;
|
|
304
|
+
} else {
|
|
305
|
+
start_i = std::stoull(range.substr(0, dash_loc));
|
|
306
|
+
if (start_i >= GGML_MAX_N_THREADS) {
|
|
307
|
+
LOG_ERR("Start index out of bounds!\n");
|
|
308
|
+
return false;
|
|
1244
309
|
}
|
|
1245
|
-
params.context_files.push_back(argv[i]);
|
|
1246
|
-
return true;
|
|
1247
|
-
}
|
|
1248
|
-
if (arg == "--chunk-size") {
|
|
1249
|
-
CHECK_ARG
|
|
1250
|
-
params.chunk_size = std::stoi(argv[i]);
|
|
1251
|
-
return true;
|
|
1252
|
-
}
|
|
1253
|
-
if (arg == "--chunk-separator") {
|
|
1254
|
-
CHECK_ARG
|
|
1255
|
-
params.chunk_separator = argv[i];
|
|
1256
|
-
return true;
|
|
1257
|
-
}
|
|
1258
|
-
if (arg == "--junk") {
|
|
1259
|
-
CHECK_ARG
|
|
1260
|
-
params.n_junk = std::stoi(argv[i]);
|
|
1261
|
-
return true;
|
|
1262
|
-
}
|
|
1263
|
-
if (arg == "--pos") {
|
|
1264
|
-
CHECK_ARG
|
|
1265
|
-
params.i_pos = std::stoi(argv[i]);
|
|
1266
|
-
return true;
|
|
1267
|
-
}
|
|
1268
|
-
if (arg == "-o" || arg == "--output" || arg == "--output-file") {
|
|
1269
|
-
CHECK_ARG
|
|
1270
|
-
params.out_file = argv[i];
|
|
1271
|
-
params.cvector_outfile = argv[i];
|
|
1272
|
-
params.lora_outfile = argv[i];
|
|
1273
|
-
return true;
|
|
1274
|
-
}
|
|
1275
|
-
if (arg == "-ofreq" || arg == "--output-frequency") {
|
|
1276
|
-
CHECK_ARG
|
|
1277
|
-
params.n_out_freq = std::stoi(argv[i]);
|
|
1278
|
-
return true;
|
|
1279
|
-
}
|
|
1280
|
-
if (arg == "--save-frequency") {
|
|
1281
|
-
CHECK_ARG
|
|
1282
|
-
params.n_save_freq = std::stoi(argv[i]);
|
|
1283
|
-
return true;
|
|
1284
|
-
}
|
|
1285
|
-
if (arg == "--process-output") {
|
|
1286
|
-
params.process_output = true;
|
|
1287
|
-
return true;
|
|
1288
|
-
}
|
|
1289
|
-
if (arg == "--no-ppl") {
|
|
1290
|
-
params.compute_ppl = false;
|
|
1291
|
-
return true;
|
|
1292
|
-
}
|
|
1293
|
-
if (arg == "--chunk" || arg == "--from-chunk") {
|
|
1294
|
-
CHECK_ARG
|
|
1295
|
-
params.i_chunk = std::stoi(argv[i]);
|
|
1296
|
-
return true;
|
|
1297
|
-
}
|
|
1298
|
-
// cvector params
|
|
1299
|
-
if (arg == "--positive-file") {
|
|
1300
|
-
CHECK_ARG
|
|
1301
|
-
params.cvector_positive_file = argv[i];
|
|
1302
|
-
return true;
|
|
1303
|
-
}
|
|
1304
|
-
if (arg == "--negative-file") {
|
|
1305
|
-
CHECK_ARG
|
|
1306
|
-
params.cvector_negative_file = argv[i];
|
|
1307
|
-
return true;
|
|
1308
|
-
}
|
|
1309
|
-
if (arg == "--pca-batch") {
|
|
1310
|
-
CHECK_ARG
|
|
1311
|
-
params.n_pca_batch = std::stoi(argv[i]);
|
|
1312
|
-
return true;
|
|
1313
310
|
}
|
|
1314
|
-
|
|
1315
|
-
|
|
1316
|
-
|
|
1317
|
-
|
|
1318
|
-
|
|
1319
|
-
|
|
1320
|
-
|
|
1321
|
-
|
|
1322
|
-
/**/ if (value == "pca") { params.cvector_dimre_method = DIMRE_METHOD_PCA; }
|
|
1323
|
-
else if (value == "mean") { params.cvector_dimre_method = DIMRE_METHOD_MEAN; }
|
|
1324
|
-
else { invalid_param = true; }
|
|
1325
|
-
return true;
|
|
1326
|
-
}
|
|
1327
|
-
if (arg == "--no-warmup") {
|
|
1328
|
-
params.warmup = false;
|
|
1329
|
-
return true;
|
|
1330
|
-
}
|
|
1331
|
-
#ifndef LOG_DISABLE_LOGS
|
|
1332
|
-
// Parse args for logging parameters
|
|
1333
|
-
if (log_param_single_parse(argv[i])) {
|
|
1334
|
-
// Do nothing, log_param_single_parse automatically does it's thing
|
|
1335
|
-
// and returns if a match was found and parsed.
|
|
1336
|
-
return true;
|
|
1337
|
-
}
|
|
1338
|
-
if (log_param_pair_parse( /*check_but_dont_parse*/ true, argv[i])) {
|
|
1339
|
-
// We have a matching known parameter requiring an argument,
|
|
1340
|
-
// now we need to check if there is anything after this argv
|
|
1341
|
-
// and flag invalid_param or parse it.
|
|
1342
|
-
CHECK_ARG
|
|
1343
|
-
if (!log_param_pair_parse( /*check_but_dont_parse*/ false, argv[i - 1], argv[i])) {
|
|
1344
|
-
invalid_param = true;
|
|
1345
|
-
return true;
|
|
311
|
+
|
|
312
|
+
if (dash_loc == range.length() - 1) {
|
|
313
|
+
end_i = GGML_MAX_N_THREADS - 1;
|
|
314
|
+
} else {
|
|
315
|
+
end_i = std::stoull(range.substr(dash_loc + 1));
|
|
316
|
+
if (end_i >= GGML_MAX_N_THREADS) {
|
|
317
|
+
LOG_ERR("End index out of bounds!\n");
|
|
318
|
+
return false;
|
|
1346
319
|
}
|
|
1347
|
-
return true;
|
|
1348
320
|
}
|
|
1349
|
-
// End of Parse args for logging parameters
|
|
1350
|
-
#endif // LOG_DISABLE_LOGS
|
|
1351
321
|
|
|
1352
|
-
|
|
322
|
+
for (size_t i = start_i; i <= end_i; i++) {
|
|
323
|
+
boolmask[i] = true;
|
|
324
|
+
}
|
|
325
|
+
|
|
326
|
+
return true;
|
|
1353
327
|
}
|
|
1354
328
|
|
|
1355
|
-
|
|
1356
|
-
|
|
1357
|
-
|
|
1358
|
-
|
|
1359
|
-
|
|
1360
|
-
|
|
1361
|
-
#else
|
|
1362
|
-
#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...)
|
|
1363
|
-
#endif
|
|
329
|
+
bool parse_cpu_mask(const std::string & mask, bool (&boolmask)[GGML_MAX_N_THREADS]) {
|
|
330
|
+
// Discard potential 0x prefix
|
|
331
|
+
size_t start_i = 0;
|
|
332
|
+
if (mask.length() >= 2 && mask.substr(0, 2) == "0x") {
|
|
333
|
+
start_i = 2;
|
|
334
|
+
}
|
|
1364
335
|
|
|
1365
|
-
|
|
1366
|
-
|
|
336
|
+
size_t num_digits = mask.length() - start_i;
|
|
337
|
+
if (num_digits > 128) num_digits = 128;
|
|
1367
338
|
|
|
1368
|
-
|
|
1369
|
-
|
|
1370
|
-
for (
|
|
1371
|
-
|
|
1372
|
-
|
|
1373
|
-
}
|
|
1374
|
-
sampler_type_names.pop_back();
|
|
339
|
+
size_t end_i = num_digits + start_i;
|
|
340
|
+
|
|
341
|
+
for (size_t i = start_i, n = (num_digits*4 - 1); i < end_i; i++, n-=4) {
|
|
342
|
+
char c = mask.at(i);
|
|
343
|
+
int8_t id = c;
|
|
1375
344
|
|
|
1376
|
-
|
|
1377
|
-
|
|
1378
|
-
|
|
1379
|
-
|
|
1380
|
-
|
|
1381
|
-
|
|
1382
|
-
|
|
1383
|
-
|
|
1384
|
-
|
|
345
|
+
if ((c >= '0' && c <= '9')) {
|
|
346
|
+
id -= '0';
|
|
347
|
+
} else if (c >= 'a' && c <= 'f') {
|
|
348
|
+
id -= 'a' - 10;
|
|
349
|
+
} else if (c >= 'A' && c <= 'F') {
|
|
350
|
+
id -= 'A' - 10;
|
|
351
|
+
} else {
|
|
352
|
+
LOG_ERR("Invalid hex character '%c' at position %d\n", c, int32_t(i));
|
|
353
|
+
return false;
|
|
1385
354
|
}
|
|
1386
355
|
|
|
1387
|
-
|
|
356
|
+
boolmask[ n ] = boolmask[ n ] || ((id & 8) != 0);
|
|
357
|
+
boolmask[n - 1] = boolmask[n - 1] || ((id & 4) != 0);
|
|
358
|
+
boolmask[n - 2] = boolmask[n - 2] || ((id & 2) != 0);
|
|
359
|
+
boolmask[n - 3] = boolmask[n - 3] || ((id & 1) != 0);
|
|
360
|
+
}
|
|
1388
361
|
|
|
1389
|
-
|
|
1390
|
-
|
|
1391
|
-
|
|
1392
|
-
|
|
1393
|
-
|
|
362
|
+
return true;
|
|
363
|
+
}
|
|
364
|
+
|
|
365
|
+
void gpt_init() {
|
|
366
|
+
llama_log_set([](ggml_log_level level, const char * text, void * /*user_data*/) {
|
|
367
|
+
if (LOG_DEFAULT_LLAMA <= gpt_log_verbosity_thold) {
|
|
368
|
+
gpt_log_add(gpt_log_main(), level, "%s", text);
|
|
369
|
+
}
|
|
370
|
+
}, NULL);
|
|
1394
371
|
|
|
1395
|
-
|
|
1396
|
-
|
|
1397
|
-
|
|
1398
|
-
|
|
1399
|
-
|
|
1400
|
-
|
|
1401
|
-
|
|
1402
|
-
options.push_back({ "*", "-v, --verbose", "print verbose information" });
|
|
1403
|
-
options.push_back({ "*", " --verbosity N", "set specific verbosity level (default: %d)", params.verbosity });
|
|
1404
|
-
options.push_back({ "*", " --verbose-prompt", "print a verbose prompt before generation (default: %s)", params.verbose_prompt ? "true" : "false" });
|
|
1405
|
-
options.push_back({ "*", " --no-display-prompt", "don't print prompt at generation (default: %s)", !params.display_prompt ? "true" : "false" });
|
|
1406
|
-
options.push_back({ "*", "-co, --color", "colorise output to distinguish prompt and user input from generations (default: %s)", params.use_color ? "true" : "false" });
|
|
1407
|
-
options.push_back({ "*", "-s, --seed SEED", "RNG seed (default: %d, use random seed for < 0)", params.seed });
|
|
1408
|
-
options.push_back({ "*", "-t, --threads N", "number of threads to use during generation (default: %d)", params.n_threads });
|
|
1409
|
-
options.push_back({ "*", "-tb, --threads-batch N", "number of threads to use during batch and prompt processing (default: same as --threads)" });
|
|
1410
|
-
options.push_back({ "speculative", "-td, --threads-draft N", "number of threads to use during generation (default: same as --threads)" });
|
|
1411
|
-
options.push_back({ "speculative", "-tbd, --threads-batch-draft N",
|
|
1412
|
-
"number of threads to use during batch and prompt processing (default: same as --threads-draft)" });
|
|
1413
|
-
options.push_back({ "speculative", " --draft N", "number of tokens to draft for speculative decoding (default: %d)", params.n_draft });
|
|
1414
|
-
options.push_back({ "speculative", "-ps, --p-split N", "speculative decoding split probability (default: %.1f)", (double)params.p_split });
|
|
1415
|
-
options.push_back({ "*", "-lcs, --lookup-cache-static FNAME",
|
|
1416
|
-
"path to static lookup cache to use for lookup decoding (not updated by generation)" });
|
|
1417
|
-
options.push_back({ "*", "-lcd, --lookup-cache-dynamic FNAME",
|
|
1418
|
-
"path to dynamic lookup cache to use for lookup decoding (updated by generation)" });
|
|
1419
|
-
|
|
1420
|
-
options.push_back({ "*", "-c, --ctx-size N", "size of the prompt context (default: %d, 0 = loaded from model)", params.n_ctx });
|
|
1421
|
-
options.push_back({ "*", "-n, --predict N", "number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)", params.n_predict });
|
|
1422
|
-
options.push_back({ "*", "-b, --batch-size N", "logical maximum batch size (default: %d)", params.n_batch });
|
|
1423
|
-
options.push_back({ "*", "-ub, --ubatch-size N", "physical maximum batch size (default: %d)", params.n_ubatch });
|
|
1424
|
-
options.push_back({ "*", " --keep N", "number of tokens to keep from the initial prompt (default: %d, -1 = all)", params.n_keep });
|
|
1425
|
-
options.push_back({ "*", " --chunks N", "max number of chunks to process (default: %d, -1 = all)", params.n_chunks });
|
|
1426
|
-
options.push_back({ "*", "-fa, --flash-attn", "enable Flash Attention (default: %s)", params.flash_attn ? "enabled" : "disabled" });
|
|
1427
|
-
options.push_back({ "*", "-p, --prompt PROMPT", "prompt to start generation with\n"
|
|
1428
|
-
"in conversation mode, this will be used as system prompt\n"
|
|
1429
|
-
"(default: '%s')", params.prompt.c_str() });
|
|
1430
|
-
options.push_back({ "*", "-f, --file FNAME", "a file containing the prompt (default: none)" });
|
|
1431
|
-
options.push_back({ "*", " --in-file FNAME", "an input file (repeat to specify multiple files)" });
|
|
1432
|
-
options.push_back({ "*", "-bf, --binary-file FNAME", "binary file containing the prompt (default: none)" });
|
|
1433
|
-
options.push_back({ "*", "-e, --escape", "process escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\) (default: %s)", params.escape ? "true" : "false" });
|
|
1434
|
-
options.push_back({ "*", " --no-escape", "do not process escape sequences" });
|
|
1435
|
-
options.push_back({ "main", "-ptc, --print-token-count N", "print token count every N tokens (default: %d)", params.n_print });
|
|
1436
|
-
options.push_back({ "main", " --prompt-cache FNAME", "file to cache prompt state for faster startup (default: none)" });
|
|
1437
|
-
options.push_back({ "main", " --prompt-cache-all", "if specified, saves user input and generations to cache as well\n"
|
|
1438
|
-
"not supported with --interactive or other interactive options" });
|
|
1439
|
-
options.push_back({ "main", " --prompt-cache-ro", "if specified, uses the prompt cache but does not update it" });
|
|
1440
|
-
options.push_back({ "main", "-r, --reverse-prompt PROMPT",
|
|
1441
|
-
"halt generation at PROMPT, return control in interactive mode\n"
|
|
1442
|
-
"can be specified more than once for multiple prompts" });
|
|
1443
|
-
options.push_back({ "main", "-sp, --special", "special tokens output enabled (default: %s)", params.special ? "true" : "false" });
|
|
1444
|
-
options.push_back({ "main", "-cnv, --conversation", "run in conversation mode, does not print special tokens and suffix/prefix\n"
|
|
1445
|
-
"if suffix/prefix are not specified, default chat template will be used\n"
|
|
1446
|
-
"(default: %s)", params.conversation ? "true" : "false" });
|
|
1447
|
-
options.push_back({ "main infill", "-i, --interactive", "run in interactive mode (default: %s)", params.interactive ? "true" : "false" });
|
|
1448
|
-
options.push_back({ "main infill", "-if, --interactive-first", "run in interactive mode and wait for input right away (default: %s)", params.interactive_first ? "true" : "false" });
|
|
1449
|
-
options.push_back({ "main infill", "-mli, --multiline-input", "allows you to write or paste multiple lines without ending each in '\\'" });
|
|
1450
|
-
options.push_back({ "main infill", " --in-prefix-bos", "prefix BOS to user inputs, preceding the `--in-prefix` string" });
|
|
1451
|
-
options.push_back({ "main infill", " --in-prefix STRING", "string to prefix user inputs with (default: empty)" });
|
|
1452
|
-
options.push_back({ "main infill", " --in-suffix STRING", "string to suffix after user inputs with (default: empty)" });
|
|
1453
|
-
options.push_back({ "main", " --no-warmup", "skip warming up the model with an empty run" });
|
|
1454
|
-
options.push_back({ "server infill",
|
|
1455
|
-
" --spm-infill", "use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: %s)", params.spm_infill ? "enabled" : "disabled" });
|
|
1456
|
-
|
|
1457
|
-
options.push_back({ "sampling" });
|
|
1458
|
-
options.push_back({ "*", " --samplers SAMPLERS", "samplers that will be used for generation in the order, separated by \';\'\n"
|
|
1459
|
-
"(default: %s)", sampler_type_names.c_str() });
|
|
1460
|
-
options.push_back({ "*", " --sampling-seq SEQUENCE",
|
|
1461
|
-
"simplified sequence for samplers that will be used (default: %s)", sampler_type_chars.c_str() });
|
|
1462
|
-
options.push_back({ "*", " --ignore-eos", "ignore end of stream token and continue generating (implies --logit-bias EOS-inf)" });
|
|
1463
|
-
options.push_back({ "*", " --penalize-nl", "penalize newline tokens (default: %s)", sparams.penalize_nl ? "true" : "false" });
|
|
1464
|
-
options.push_back({ "*", " --temp N", "temperature (default: %.1f)", (double)sparams.temp });
|
|
1465
|
-
options.push_back({ "*", " --top-k N", "top-k sampling (default: %d, 0 = disabled)", sparams.top_k });
|
|
1466
|
-
options.push_back({ "*", " --top-p N", "top-p sampling (default: %.1f, 1.0 = disabled)", (double)sparams.top_p });
|
|
1467
|
-
options.push_back({ "*", " --min-p N", "min-p sampling (default: %.1f, 0.0 = disabled)", (double)sparams.min_p });
|
|
1468
|
-
options.push_back({ "*", " --tfs N", "tail free sampling, parameter z (default: %.1f, 1.0 = disabled)", (double)sparams.tfs_z });
|
|
1469
|
-
options.push_back({ "*", " --typical N", "locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)", (double)sparams.typical_p });
|
|
1470
|
-
options.push_back({ "*", " --repeat-last-n N", "last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)", sparams.penalty_last_n });
|
|
1471
|
-
options.push_back({ "*", " --repeat-penalty N", "penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)", (double)sparams.penalty_repeat });
|
|
1472
|
-
options.push_back({ "*", " --presence-penalty N", "repeat alpha presence penalty (default: %.1f, 0.0 = disabled)", (double)sparams.penalty_present });
|
|
1473
|
-
options.push_back({ "*", " --frequency-penalty N", "repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)", (double)sparams.penalty_freq });
|
|
1474
|
-
options.push_back({ "*", " --dynatemp-range N", "dynamic temperature range (default: %.1f, 0.0 = disabled)", (double)sparams.dynatemp_range });
|
|
1475
|
-
options.push_back({ "*", " --dynatemp-exp N", "dynamic temperature exponent (default: %.1f)", (double)sparams.dynatemp_exponent });
|
|
1476
|
-
options.push_back({ "*", " --mirostat N", "use Mirostat sampling.\n"
|
|
1477
|
-
"Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n"
|
|
1478
|
-
"(default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)", sparams.mirostat });
|
|
1479
|
-
options.push_back({ "*", " --mirostat-lr N", "Mirostat learning rate, parameter eta (default: %.1f)", (double)sparams.mirostat_eta });
|
|
1480
|
-
options.push_back({ "*", " --mirostat-ent N", "Mirostat target entropy, parameter tau (default: %.1f)", (double)sparams.mirostat_tau });
|
|
1481
|
-
options.push_back({ "*", " -l TOKEN_ID(+/-)BIAS", "modifies the likelihood of token appearing in the completion,\n"
|
|
1482
|
-
"i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',\n"
|
|
1483
|
-
"or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'" });
|
|
1484
|
-
options.push_back({ "main", " --cfg-negative-prompt PROMPT",
|
|
1485
|
-
"negative prompt to use for guidance (default: '%s')", sparams.cfg_negative_prompt.c_str() });
|
|
1486
|
-
options.push_back({ "main", " --cfg-negative-prompt-file FNAME",
|
|
1487
|
-
"negative prompt file to use for guidance" });
|
|
1488
|
-
options.push_back({ "main", " --cfg-scale N", "strength of guidance (default: %.1f, 1.0 = disable)", (double)sparams.cfg_scale });
|
|
1489
|
-
options.push_back({ "main", " --chat-template JINJA_TEMPLATE",
|
|
1490
|
-
"set custom jinja chat template (default: template taken from model's metadata)\n"
|
|
1491
|
-
"if suffix/prefix are specified, template will be disabled\n"
|
|
1492
|
-
"only commonly used templates are accepted:\n"
|
|
1493
|
-
"https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template" });
|
|
1494
|
-
options.push_back({ "grammar" });
|
|
1495
|
-
options.push_back({ "*", " --grammar GRAMMAR", "BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '%s')", sparams.grammar.c_str() });
|
|
1496
|
-
options.push_back({ "*", " --grammar-file FNAME", "file to read grammar from" });
|
|
1497
|
-
options.push_back({ "*", "-j, --json-schema SCHEMA",
|
|
1498
|
-
"JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object\n"
|
|
1499
|
-
"For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead" });
|
|
1500
|
-
|
|
1501
|
-
options.push_back({ "embedding" });
|
|
1502
|
-
options.push_back({ "embedding", " --pooling {none,mean,cls,last}",
|
|
1503
|
-
"pooling type for embeddings, use model default if unspecified" });
|
|
1504
|
-
options.push_back({ "embedding", " --attention {causal,non-causal}",
|
|
1505
|
-
"attention type for embeddings, use model default if unspecified" });
|
|
1506
|
-
|
|
1507
|
-
options.push_back({ "context hacking" });
|
|
1508
|
-
options.push_back({ "*", " --rope-scaling {none,linear,yarn}",
|
|
1509
|
-
"RoPE frequency scaling method, defaults to linear unless specified by the model" });
|
|
1510
|
-
options.push_back({ "*", " --rope-scale N", "RoPE context scaling factor, expands context by a factor of N" });
|
|
1511
|
-
options.push_back({ "*", " --rope-freq-base N", "RoPE base frequency, used by NTK-aware scaling (default: loaded from model)" });
|
|
1512
|
-
options.push_back({ "*", " --rope-freq-scale N", "RoPE frequency scaling factor, expands context by a factor of 1/N" });
|
|
1513
|
-
options.push_back({ "*", " --yarn-orig-ctx N", "YaRN: original context size of model (default: %d = model training context size)", params.yarn_orig_ctx });
|
|
1514
|
-
options.push_back({ "*", " --yarn-ext-factor N", "YaRN: extrapolation mix factor (default: %.1f, 0.0 = full interpolation)", (double)params.yarn_ext_factor });
|
|
1515
|
-
options.push_back({ "*", " --yarn-attn-factor N", "YaRN: scale sqrt(t) or attention magnitude (default: %.1f)", (double)params.yarn_attn_factor });
|
|
1516
|
-
options.push_back({ "*", " --yarn-beta-slow N", "YaRN: high correction dim or alpha (default: %.1f)", (double)params.yarn_beta_slow });
|
|
1517
|
-
options.push_back({ "*", " --yarn-beta-fast N", "YaRN: low correction dim or beta (default: %.1f)", (double)params.yarn_beta_fast });
|
|
1518
|
-
options.push_back({ "*", "-gan, --grp-attn-n N", "group-attention factor (default: %d)", params.grp_attn_n });
|
|
1519
|
-
options.push_back({ "*", "-gaw, --grp-attn-w N", "group-attention width (default: %.1f)", (double)params.grp_attn_w });
|
|
1520
|
-
options.push_back({ "*", "-dkvc, --dump-kv-cache", "verbose print of the KV cache" });
|
|
1521
|
-
options.push_back({ "*", "-nkvo, --no-kv-offload", "disable KV offload" });
|
|
1522
|
-
options.push_back({ "*", "-ctk, --cache-type-k TYPE", "KV cache data type for K (default: %s)", params.cache_type_k.c_str() });
|
|
1523
|
-
options.push_back({ "*", "-ctv, --cache-type-v TYPE", "KV cache data type for V (default: %s)", params.cache_type_v.c_str() });
|
|
1524
|
-
|
|
1525
|
-
options.push_back({ "perplexity" });
|
|
1526
|
-
options.push_back({ "perplexity", " --all-logits", "return logits for all tokens in the batch (default: %s)", params.logits_all ? "true" : "false" });
|
|
1527
|
-
options.push_back({ "perplexity", " --hellaswag", "compute HellaSwag score over random tasks from datafile supplied with -f" });
|
|
1528
|
-
options.push_back({ "perplexity", " --hellaswag-tasks N", "number of tasks to use when computing the HellaSwag score (default: %zu)", params.hellaswag_tasks });
|
|
1529
|
-
options.push_back({ "perplexity", " --winogrande", "compute Winogrande score over random tasks from datafile supplied with -f" });
|
|
1530
|
-
options.push_back({ "perplexity", " --winogrande-tasks N", "number of tasks to use when computing the Winogrande score (default: %zu)", params.winogrande_tasks });
|
|
1531
|
-
options.push_back({ "perplexity", " --multiple-choice", "compute multiple choice score over random tasks from datafile supplied with -f" });
|
|
1532
|
-
options.push_back({ "perplexity", " --multiple-choice-tasks N",
|
|
1533
|
-
"number of tasks to use when computing the multiple choice score (default: %zu)", params.multiple_choice_tasks });
|
|
1534
|
-
options.push_back({ "perplexity", " --kl-divergence", "computes KL-divergence to logits provided via --kl-divergence-base" });
|
|
1535
|
-
options.push_back({ "perplexity", " --ppl-stride N", "stride for perplexity calculation (default: %d)", params.ppl_stride });
|
|
1536
|
-
options.push_back({ "perplexity", " --ppl-output-type {0,1}",
|
|
1537
|
-
"output type for perplexity calculation (default: %d)", params.ppl_output_type });
|
|
1538
|
-
|
|
1539
|
-
options.push_back({ "parallel" });
|
|
1540
|
-
options.push_back({ "*", "-dt, --defrag-thold N", "KV cache defragmentation threshold (default: %.1f, < 0 - disabled)", (double)params.defrag_thold });
|
|
1541
|
-
options.push_back({ "*", "-np, --parallel N", "number of parallel sequences to decode (default: %d)", params.n_parallel });
|
|
1542
|
-
options.push_back({ "*", "-ns, --sequences N", "number of sequences to decode (default: %d)", params.n_sequences });
|
|
1543
|
-
options.push_back({ "*", "-cb, --cont-batching", "enable continuous batching (a.k.a dynamic batching) (default: %s)", params.cont_batching ? "enabled" : "disabled" });
|
|
1544
|
-
options.push_back({ "*", "-nocb, --no-cont-batching", "disable continuous batching" });
|
|
1545
|
-
|
|
1546
|
-
options.push_back({ "multi-modality" });
|
|
1547
|
-
options.push_back({ "*", " --mmproj FILE", "path to a multimodal projector file for LLaVA. see examples/llava/README.md" });
|
|
1548
|
-
options.push_back({ "*", " --image FILE", "path to an image file. use with multimodal models. Specify multiple times for batching" });
|
|
1549
|
-
|
|
1550
|
-
options.push_back({ "backend" });
|
|
1551
|
-
options.push_back({ "*", " --rpc SERVERS", "comma separated list of RPC servers" });
|
|
1552
|
-
|
|
1553
|
-
if (llama_supports_mlock()) {
|
|
1554
|
-
options.push_back({ "*", " --mlock", "force system to keep model in RAM rather than swapping or compressing" });
|
|
1555
|
-
}
|
|
1556
|
-
if (llama_supports_mmap()) {
|
|
1557
|
-
options.push_back({ "*", " --no-mmap", "do not memory-map model (slower load but may reduce pageouts if not using mlock)" });
|
|
1558
|
-
}
|
|
1559
|
-
options.push_back({ "*", " --numa TYPE", "attempt optimizations that help on some NUMA systems\n"
|
|
1560
|
-
" - distribute: spread execution evenly over all nodes\n"
|
|
1561
|
-
" - isolate: only spawn threads on CPUs on the node that execution started on\n"
|
|
1562
|
-
" - numactl: use the CPU map provided by numactl\n"
|
|
1563
|
-
"if run without this previously, it is recommended to drop the system page cache before using this\n"
|
|
1564
|
-
"see https://github.com/ggerganov/llama.cpp/issues/1437" });
|
|
1565
|
-
|
|
1566
|
-
if (llama_supports_gpu_offload()) {
|
|
1567
|
-
options.push_back({ "*", "-ngl, --gpu-layers N",
|
|
1568
|
-
"number of layers to store in VRAM" });
|
|
1569
|
-
options.push_back({ "*", "-ngld, --gpu-layers-draft N",
|
|
1570
|
-
"number of layers to store in VRAM for the draft model" });
|
|
1571
|
-
options.push_back({ "*", "-sm, --split-mode SPLIT_MODE",
|
|
1572
|
-
"how to split the model across multiple GPUs, one of:\n"
|
|
1573
|
-
" - none: use one GPU only\n"
|
|
1574
|
-
" - layer (default): split layers and KV across GPUs\n"
|
|
1575
|
-
" - row: split rows across GPUs" });
|
|
1576
|
-
options.push_back({ "*", "-ts, --tensor-split SPLIT",
|
|
1577
|
-
"fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1" });
|
|
1578
|
-
options.push_back({ "*", "-mg, --main-gpu i", "the GPU to use for the model (with split-mode = none),\n"
|
|
1579
|
-
"or for intermediate results and KV (with split-mode = row) (default: %d)", params.main_gpu });
|
|
1580
|
-
}
|
|
1581
|
-
|
|
1582
|
-
options.push_back({ "model" });
|
|
1583
|
-
options.push_back({ "*", " --check-tensors", "check model tensor data for invalid values (default: %s)", params.check_tensors ? "true" : "false" });
|
|
1584
|
-
options.push_back({ "*", " --override-kv KEY=TYPE:VALUE",
|
|
1585
|
-
"advanced option to override model metadata by key. may be specified multiple times.\n"
|
|
1586
|
-
"types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false" });
|
|
1587
|
-
options.push_back({ "*", " --lora FNAME", "apply LoRA adapter (can be repeated to use multiple adapters)" });
|
|
1588
|
-
options.push_back({ "*", " --lora-scaled FNAME S", "apply LoRA adapter with user defined scaling S (can be repeated to use multiple adapters)" });
|
|
1589
|
-
options.push_back({ "*", " --control-vector FNAME", "add a control vector\n"
|
|
1590
|
-
"note: this argument can be repeated to add multiple control vectors" });
|
|
1591
|
-
options.push_back({ "*", " --control-vector-scaled FNAME SCALE",
|
|
1592
|
-
"add a control vector with user defined scaling SCALE\n"
|
|
1593
|
-
"note: this argument can be repeated to add multiple scaled control vectors" });
|
|
1594
|
-
options.push_back({ "*", " --control-vector-layer-range START END",
|
|
1595
|
-
"layer range to apply the control vector(s) to, start and end inclusive" });
|
|
1596
|
-
options.push_back({ "*", "-m, --model FNAME", "model path (default: models/$filename with filename from --hf-file\n"
|
|
1597
|
-
"or --model-url if set, otherwise %s)", DEFAULT_MODEL_PATH });
|
|
1598
|
-
options.push_back({ "*", "-md, --model-draft FNAME", "draft model for speculative decoding (default: unused)" });
|
|
1599
|
-
options.push_back({ "*", "-mu, --model-url MODEL_URL", "model download url (default: unused)" });
|
|
1600
|
-
options.push_back({ "*", "-hfr, --hf-repo REPO", "Hugging Face model repository (default: unused)" });
|
|
1601
|
-
options.push_back({ "*", "-hff, --hf-file FILE", "Hugging Face model file (default: unused)" });
|
|
1602
|
-
options.push_back({ "*", "-hft, --hf-token TOKEN", "Hugging Face access token (default: value from HF_TOKEN environment variable)" });
|
|
1603
|
-
|
|
1604
|
-
options.push_back({ "retrieval" });
|
|
1605
|
-
options.push_back({ "retrieval", " --context-file FNAME", "file to load context from (repeat to specify multiple files)" });
|
|
1606
|
-
options.push_back({ "retrieval", " --chunk-size N", "minimum length of embedded text chunks (default: %d)", params.chunk_size });
|
|
1607
|
-
options.push_back({ "retrieval", " --chunk-separator STRING",
|
|
1608
|
-
"separator between chunks (default: '%s')", params.chunk_separator.c_str() });
|
|
1609
|
-
|
|
1610
|
-
options.push_back({ "passkey" });
|
|
1611
|
-
options.push_back({ "passkey", " --junk N", "number of times to repeat the junk text (default: %d)", params.n_junk });
|
|
1612
|
-
options.push_back({ "passkey", " --pos N", "position of the passkey in the junk text (default: %d)", params.i_pos });
|
|
1613
|
-
|
|
1614
|
-
options.push_back({ "imatrix" });
|
|
1615
|
-
options.push_back({ "imatrix", "-o, --output FNAME", "output file (default: '%s')", params.out_file.c_str() });
|
|
1616
|
-
options.push_back({ "imatrix", " --output-frequency N", "output the imatrix every N iterations (default: %d)", params.n_out_freq });
|
|
1617
|
-
options.push_back({ "imatrix", " --save-frequency N", "save an imatrix copy every N iterations (default: %d)", params.n_save_freq });
|
|
1618
|
-
options.push_back({ "imatrix", " --process-output", "collect data for the output tensor (default: %s)", params.process_output ? "true" : "false" });
|
|
1619
|
-
options.push_back({ "imatrix", " --no-ppl", "do not compute perplexity (default: %s)", params.compute_ppl ? "true" : "false" });
|
|
1620
|
-
options.push_back({ "imatrix", " --chunk N", "start processing the input from chunk N (default: %d)", params.i_chunk });
|
|
1621
|
-
|
|
1622
|
-
options.push_back({ "bench" });
|
|
1623
|
-
options.push_back({ "bench", "-pps", "is the prompt shared across parallel sequences (default: %s)", params.is_pp_shared ? "true" : "false" });
|
|
1624
|
-
options.push_back({ "bench", "-npp n0,n1,...", "number of prompt tokens" });
|
|
1625
|
-
options.push_back({ "bench", "-ntg n0,n1,...", "number of text generation tokens" });
|
|
1626
|
-
options.push_back({ "bench", "-npl n0,n1,...", "number of parallel prompts" });
|
|
1627
|
-
|
|
1628
|
-
options.push_back({ "embedding" });
|
|
1629
|
-
options.push_back({ "embedding", " --embd-normalize", "normalisation for embendings (default: %d) (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)", params.embd_normalize });
|
|
1630
|
-
options.push_back({ "embedding", " --embd-output-format", "empty = default, \"array\" = [[],[]...], \"json\" = openai style, \"json+\" = same \"json\" + cosine similarity matrix" });
|
|
1631
|
-
options.push_back({ "embedding", " --embd-separator", "separator of embendings (default \\n) for example \"<#sep#>\"" });
|
|
1632
|
-
|
|
1633
|
-
options.push_back({ "server" });
|
|
1634
|
-
options.push_back({ "server", " --host HOST", "ip address to listen (default: %s)", params.hostname.c_str() });
|
|
1635
|
-
options.push_back({ "server", " --port PORT", "port to listen (default: %d)", params.port });
|
|
1636
|
-
options.push_back({ "server", " --path PATH", "path to serve static files from (default: %s)", params.public_path.c_str() });
|
|
1637
|
-
options.push_back({ "server", " --embedding(s)", "enable embedding endpoint (default: %s)", params.embedding ? "enabled" : "disabled" });
|
|
1638
|
-
options.push_back({ "server", " --api-key KEY", "API key to use for authentication (default: none)" });
|
|
1639
|
-
options.push_back({ "server", " --api-key-file FNAME", "path to file containing API keys (default: none)" });
|
|
1640
|
-
options.push_back({ "server", " --ssl-key-file FNAME", "path to file a PEM-encoded SSL private key" });
|
|
1641
|
-
options.push_back({ "server", " --ssl-cert-file FNAME", "path to file a PEM-encoded SSL certificate" });
|
|
1642
|
-
options.push_back({ "server", " --timeout N", "server read/write timeout in seconds (default: %d)", params.timeout_read });
|
|
1643
|
-
options.push_back({ "server", " --threads-http N", "number of threads used to process HTTP requests (default: %d)", params.n_threads_http });
|
|
1644
|
-
options.push_back({ "server", " --system-prompt-file FNAME",
|
|
1645
|
-
"set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications" });
|
|
1646
|
-
options.push_back({ "server", " --log-format {text,json}",
|
|
1647
|
-
"log output format: json or text (default: json)" });
|
|
1648
|
-
options.push_back({ "server", " --metrics", "enable prometheus compatible metrics endpoint (default: %s)", params.endpoint_metrics ? "enabled" : "disabled" });
|
|
1649
|
-
options.push_back({ "server", " --no-slots", "disables slots monitoring endpoint (default: %s)", params.endpoint_slots ? "enabled" : "disabled" });
|
|
1650
|
-
options.push_back({ "server", " --slot-save-path PATH", "path to save slot kv cache (default: disabled)" });
|
|
1651
|
-
options.push_back({ "server", " --chat-template JINJA_TEMPLATE",
|
|
1652
|
-
"set custom jinja chat template (default: template taken from model's metadata)\n"
|
|
1653
|
-
"only commonly used templates are accepted:\n"
|
|
1654
|
-
"https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template" });
|
|
1655
|
-
options.push_back({ "server", "-sps, --slot-prompt-similarity SIMILARITY",
|
|
1656
|
-
"how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity });
|
|
1657
|
-
|
|
1658
|
-
#ifndef LOG_DISABLE_LOGS
|
|
1659
|
-
options.push_back({ "logging" });
|
|
1660
|
-
options.push_back({ "*", " --simple-io", "use basic IO for better compatibility in subprocesses and limited consoles" });
|
|
1661
|
-
options.push_back({ "*", "-ld, --logdir LOGDIR", "path under which to save YAML logs (no logging if unset)" });
|
|
1662
|
-
options.push_back({ "logging", " --log-test", "Run simple logging test" });
|
|
1663
|
-
options.push_back({ "logging", " --log-disable", "Disable trace logs" });
|
|
1664
|
-
options.push_back({ "logging", " --log-enable", "Enable trace logs" });
|
|
1665
|
-
options.push_back({ "logging", " --log-file FNAME", "Specify a log filename (without extension)" });
|
|
1666
|
-
options.push_back({ "logging", " --log-new", "Create a separate new log file on start. "
|
|
1667
|
-
"Each log file will have unique name: \"<name>.<ID>.log\"" });
|
|
1668
|
-
options.push_back({ "logging", " --log-append", "Don't truncate the old log file." });
|
|
1669
|
-
#endif // LOG_DISABLE_LOGS
|
|
1670
|
-
|
|
1671
|
-
options.push_back({ "cvector" });
|
|
1672
|
-
options.push_back({ "cvector", "-o, --output FNAME", "output file (default: '%s')", params.cvector_outfile.c_str() });
|
|
1673
|
-
options.push_back({ "cvector", " --positive-file FNAME", "positive prompts file, one prompt per line (default: '%s')", params.cvector_positive_file.c_str() });
|
|
1674
|
-
options.push_back({ "cvector", " --negative-file FNAME", "negative prompts file, one prompt per line (default: '%s')", params.cvector_negative_file.c_str() });
|
|
1675
|
-
options.push_back({ "cvector", " --pca-batch N", "batch size used for PCA. Larger batch runs faster, but uses more memory (default: %d)", params.n_pca_batch });
|
|
1676
|
-
options.push_back({ "cvector", " --pca-iter N", "number of iterations used for PCA (default: %d)", params.n_pca_iterations });
|
|
1677
|
-
options.push_back({ "cvector", " --method {pca,mean}", "dimensionality reduction method to be used (default: pca)" });
|
|
1678
|
-
|
|
1679
|
-
options.push_back({ "export-lora" });
|
|
1680
|
-
options.push_back({ "export-lora", "-m, --model", "model path from which to load base model (default '%s')", params.model.c_str() });
|
|
1681
|
-
options.push_back({ "export-lora", " --lora FNAME", "path to LoRA adapter (can be repeated to use multiple adapters)" });
|
|
1682
|
-
options.push_back({ "export-lora", " --lora-scaled FNAME S", "path to LoRA adapter with user defined scaling S (can be repeated to use multiple adapters)" });
|
|
1683
|
-
options.push_back({ "*", "-t, --threads N", "number of threads to use during computation (default: %d)", params.n_threads });
|
|
1684
|
-
options.push_back({ "export-lora", "-o, --output FNAME", "output file (default: '%s')", params.lora_outfile.c_str() });
|
|
1685
|
-
|
|
1686
|
-
printf("usage: %s [options]\n", argv[0]);
|
|
1687
|
-
|
|
1688
|
-
for (const auto & o : options) {
|
|
1689
|
-
if (!o.grp.empty()) {
|
|
1690
|
-
printf("\n%s:\n\n", o.grp.c_str());
|
|
1691
|
-
continue;
|
|
1692
|
-
}
|
|
1693
|
-
printf(" %-32s", o.args.c_str());
|
|
1694
|
-
if (o.args.length() > 30) {
|
|
1695
|
-
printf("\n%34s", "");
|
|
1696
|
-
}
|
|
1697
|
-
|
|
1698
|
-
const auto desc = o.desc;
|
|
1699
|
-
size_t start = 0;
|
|
1700
|
-
size_t end = desc.find('\n');
|
|
1701
|
-
while (end != std::string::npos) {
|
|
1702
|
-
printf("%s\n%34s", desc.substr(start, end - start).c_str(), "");
|
|
1703
|
-
start = end + 1;
|
|
1704
|
-
end = desc.find('\n', start);
|
|
1705
|
-
}
|
|
1706
|
-
|
|
1707
|
-
printf("%s\n", desc.substr(start).c_str());
|
|
1708
|
-
}
|
|
1709
|
-
printf("\n");
|
|
372
|
+
#ifdef NDEBUG
|
|
373
|
+
const char * build_type = "";
|
|
374
|
+
#else
|
|
375
|
+
const char * build_type = " (debug)";
|
|
376
|
+
#endif
|
|
377
|
+
|
|
378
|
+
LOG_INF("build: %d (%s) with %s for %s%s\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT, LLAMA_COMPILER, LLAMA_BUILD_TARGET, build_type);
|
|
1710
379
|
}
|
|
1711
380
|
|
|
1712
381
|
std::string gpt_params_get_system_info(const gpt_params & params) {
|
|
1713
382
|
std::ostringstream os;
|
|
1714
383
|
|
|
1715
|
-
os << "system_info: n_threads = " << params.n_threads;
|
|
1716
|
-
if (params.
|
|
1717
|
-
os << " (n_threads_batch = " << params.
|
|
384
|
+
os << "system_info: n_threads = " << params.cpuparams.n_threads;
|
|
385
|
+
if (params.cpuparams_batch.n_threads != -1) {
|
|
386
|
+
os << " (n_threads_batch = " << params.cpuparams_batch.n_threads << ")";
|
|
1718
387
|
}
|
|
388
|
+
#if defined(_WIN32) && (_WIN32_WINNT >= 0x0601) && !defined(__MINGW64__) // windows 7 and later
|
|
389
|
+
// TODO: windows + arm64 + mingw64
|
|
390
|
+
DWORD logicalProcessorCount = GetActiveProcessorCount(ALL_PROCESSOR_GROUPS);
|
|
391
|
+
os << " / " << logicalProcessorCount << " | " << llama_print_system_info();
|
|
392
|
+
#else
|
|
1719
393
|
os << " / " << std::thread::hardware_concurrency() << " | " << llama_print_system_info();
|
|
394
|
+
#endif
|
|
1720
395
|
|
|
1721
396
|
return os.str();
|
|
1722
397
|
}
|
|
@@ -1766,6 +441,111 @@ std::string string_get_sortable_timestamp() {
|
|
|
1766
441
|
return std::string(timestamp_no_ns) + "." + std::string(timestamp_ns);
|
|
1767
442
|
}
|
|
1768
443
|
|
|
444
|
+
// Replace every non-overlapping occurrence of `search` inside `s` with `replace`.
// The scan runs left to right and resumes right after each match, so text inserted
// by a replacement is never re-scanned. An empty `search` leaves `s` untouched.
void string_replace_all(std::string & s, const std::string & search, const std::string & replace) {
    if (search.empty()) {
        return;
    }

    const size_t needle_len = search.length();

    // the result is assembled on the side and swapped in at the very end
    std::string result;
    result.reserve(s.length());

    size_t cursor = 0;
    for (size_t hit = s.find(search, cursor); hit != std::string::npos; hit = s.find(search, cursor)) {
        result.append(s, cursor, hit - cursor); // untouched text in front of the match
        result.append(replace);
        cursor = hit + needle_len;
    }

    // whatever follows the last match
    result.append(s, cursor, std::string::npos);

    s = std::move(result);
}
|
|
460
|
+
|
|
461
|
+
// Textual form of a boolean: "true" or "false".
std::string string_from(bool value) {
    if (value) {
        return "true";
    }
    return "false";
}
|
|
464
|
+
|
|
465
|
+
// Render a list of integers as "[ a, b, c ]".
// An empty list yields "[  ]" (opening "[ " directly followed by closing " ]").
std::string string_from(const std::vector<int> & values) {
    std::stringstream out;

    out << "[ ";
    for (size_t idx = 0; idx < values.size(); ++idx) {
        // separator goes in front of every element except the first one
        if (idx != 0) {
            out << ", ";
        }
        out << std::to_string(values[idx]);
    }
    out << " ]";

    return out.str();
}
|
|
482
|
+
|
|
483
|
+
std::string string_from(const struct llama_context * ctx, const std::vector<llama_token> & tokens) {
|
|
484
|
+
std::stringstream buf;
|
|
485
|
+
|
|
486
|
+
buf << "[ ";
|
|
487
|
+
|
|
488
|
+
bool first = true;
|
|
489
|
+
for (const auto & token : tokens) {
|
|
490
|
+
if (!first) {
|
|
491
|
+
buf << ", ";
|
|
492
|
+
} else {
|
|
493
|
+
first = false;
|
|
494
|
+
}
|
|
495
|
+
|
|
496
|
+
auto detokenized = llama_token_to_piece(ctx, token);
|
|
497
|
+
|
|
498
|
+
detokenized.erase(
|
|
499
|
+
std::remove_if(
|
|
500
|
+
detokenized.begin(),
|
|
501
|
+
detokenized.end(),
|
|
502
|
+
[](const unsigned char c) { return !std::isprint(c); }),
|
|
503
|
+
detokenized.end());
|
|
504
|
+
|
|
505
|
+
buf << "'" << detokenized << "'"
|
|
506
|
+
<< ":" << std::to_string(token);
|
|
507
|
+
}
|
|
508
|
+
|
|
509
|
+
buf << " ]";
|
|
510
|
+
|
|
511
|
+
return buf.str();
|
|
512
|
+
}
|
|
513
|
+
|
|
514
|
+
std::string string_from(const struct llama_context * ctx, const struct llama_batch & batch) {
|
|
515
|
+
std::stringstream buf;
|
|
516
|
+
|
|
517
|
+
buf << "[ ";
|
|
518
|
+
|
|
519
|
+
bool first = true;
|
|
520
|
+
for (int i = 0; i < batch.n_tokens; ++i) {
|
|
521
|
+
if (!first) {
|
|
522
|
+
buf << ", ";
|
|
523
|
+
} else {
|
|
524
|
+
first = false;
|
|
525
|
+
}
|
|
526
|
+
|
|
527
|
+
auto detokenized = llama_token_to_piece(ctx, batch.token[i]);
|
|
528
|
+
|
|
529
|
+
detokenized.erase(
|
|
530
|
+
std::remove_if(
|
|
531
|
+
detokenized.begin(),
|
|
532
|
+
detokenized.end(),
|
|
533
|
+
[](const unsigned char c) { return !std::isprint(c); }),
|
|
534
|
+
detokenized.end());
|
|
535
|
+
|
|
536
|
+
buf << "\n" << std::to_string(i)
|
|
537
|
+
<< ":token '" << detokenized << "'"
|
|
538
|
+
<< ":pos " << std::to_string(batch.pos[i])
|
|
539
|
+
<< ":n_seq_id " << std::to_string(batch.n_seq_id[i])
|
|
540
|
+
<< ":seq_id " << std::to_string(batch.seq_id[i][0])
|
|
541
|
+
<< ":logits " << std::to_string(batch.logits[i]);
|
|
542
|
+
}
|
|
543
|
+
|
|
544
|
+
buf << " ]";
|
|
545
|
+
|
|
546
|
+
return buf.str();
|
|
547
|
+
}
|
|
548
|
+
|
|
1769
549
|
void string_process_escapes(std::string & input) {
|
|
1770
550
|
std::size_t input_len = input.length();
|
|
1771
551
|
std::size_t output_idx = 0;
|
|
@@ -1806,7 +586,7 @@ void string_process_escapes(std::string & input) {
|
|
|
1806
586
|
bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides) {
|
|
1807
587
|
const char * sep = strchr(data, '=');
|
|
1808
588
|
if (sep == nullptr || sep - data >= 128) {
|
|
1809
|
-
|
|
589
|
+
LOG_ERR("%s: malformed KV override '%s'\n", __func__, data);
|
|
1810
590
|
return false;
|
|
1811
591
|
}
|
|
1812
592
|
llama_model_kv_override kvo;
|
|
@@ -1829,20 +609,20 @@ bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_over
|
|
|
1829
609
|
} else if (std::strcmp(sep, "false") == 0) {
|
|
1830
610
|
kvo.val_bool = false;
|
|
1831
611
|
} else {
|
|
1832
|
-
|
|
612
|
+
LOG_ERR("%s: invalid boolean value for KV override '%s'\n", __func__, data);
|
|
1833
613
|
return false;
|
|
1834
614
|
}
|
|
1835
615
|
} else if (strncmp(sep, "str:", 4) == 0) {
|
|
1836
616
|
sep += 4;
|
|
1837
617
|
kvo.tag = LLAMA_KV_OVERRIDE_TYPE_STR;
|
|
1838
618
|
if (strlen(sep) > 127) {
|
|
1839
|
-
|
|
619
|
+
LOG_ERR("%s: malformed KV override '%s', value cannot exceed 127 chars\n", __func__, data);
|
|
1840
620
|
return false;
|
|
1841
621
|
}
|
|
1842
622
|
strncpy(kvo.val_str, sep, 127);
|
|
1843
623
|
kvo.val_str[127] = '\0';
|
|
1844
624
|
} else {
|
|
1845
|
-
|
|
625
|
+
LOG_ERR("%s: invalid type for KV override '%s'\n", __func__, data);
|
|
1846
626
|
return false;
|
|
1847
627
|
}
|
|
1848
628
|
overrides.emplace_back(std::move(kvo));
|
|
@@ -2039,8 +819,8 @@ std::string fs_get_cache_file(const std::string & filename) {
|
|
|
2039
819
|
//
|
|
2040
820
|
// Model utils
|
|
2041
821
|
//
|
|
2042
|
-
|
|
2043
|
-
|
|
822
|
+
struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
|
|
823
|
+
llama_init_result iparams;
|
|
2044
824
|
auto mparams = llama_model_params_from_gpt_params(params);
|
|
2045
825
|
|
|
2046
826
|
llama_model * model = nullptr;
|
|
@@ -2054,17 +834,42 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
|
|
|
2054
834
|
}
|
|
2055
835
|
|
|
2056
836
|
if (model == NULL) {
|
|
2057
|
-
|
|
2058
|
-
return
|
|
837
|
+
LOG_ERR("%s: failed to load model '%s'\n", __func__, params.model.c_str());
|
|
838
|
+
return iparams;
|
|
839
|
+
}
|
|
840
|
+
|
|
841
|
+
if (params.reranking) {
|
|
842
|
+
bool ok = true;
|
|
843
|
+
|
|
844
|
+
if (llama_token_bos(model) == LLAMA_TOKEN_NULL) {
|
|
845
|
+
LOG_WRN("%s: warning: model does not have a BOS token, reranking will not work\n", __func__);
|
|
846
|
+
ok = false;
|
|
847
|
+
}
|
|
848
|
+
|
|
849
|
+
if (llama_token_eos(model) == LLAMA_TOKEN_NULL) {
|
|
850
|
+
LOG_WRN("%s: warning: model does not have an EOS token, reranking will not work\n", __func__);
|
|
851
|
+
ok = false;
|
|
852
|
+
}
|
|
853
|
+
|
|
854
|
+
if (llama_token_sep(model) == LLAMA_TOKEN_NULL) {
|
|
855
|
+
LOG_WRN("%s: warning: model does not have a SEP token, reranking will not work\n", __func__);
|
|
856
|
+
ok = false;
|
|
857
|
+
}
|
|
858
|
+
|
|
859
|
+
if (!ok) {
|
|
860
|
+
llama_free_model(model);
|
|
861
|
+
|
|
862
|
+
return iparams;
|
|
863
|
+
}
|
|
2059
864
|
}
|
|
2060
865
|
|
|
2061
866
|
auto cparams = llama_context_params_from_gpt_params(params);
|
|
2062
867
|
|
|
2063
868
|
llama_context * lctx = llama_new_context_with_model(model, cparams);
|
|
2064
869
|
if (lctx == NULL) {
|
|
2065
|
-
|
|
870
|
+
LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.c_str());
|
|
2066
871
|
llama_free_model(model);
|
|
2067
|
-
return
|
|
872
|
+
return iparams;
|
|
2068
873
|
}
|
|
2069
874
|
|
|
2070
875
|
if (!params.control_vectors.empty()) {
|
|
@@ -2075,7 +880,8 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
|
|
|
2075
880
|
if (cvec.n_embd == -1) {
|
|
2076
881
|
llama_free(lctx);
|
|
2077
882
|
llama_free_model(model);
|
|
2078
|
-
|
|
883
|
+
|
|
884
|
+
return iparams;
|
|
2079
885
|
}
|
|
2080
886
|
|
|
2081
887
|
int err = llama_control_vector_apply(lctx,
|
|
@@ -2087,38 +893,50 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
|
|
|
2087
893
|
if (err) {
|
|
2088
894
|
llama_free(lctx);
|
|
2089
895
|
llama_free_model(model);
|
|
2090
|
-
|
|
896
|
+
|
|
897
|
+
return iparams;
|
|
2091
898
|
}
|
|
2092
899
|
}
|
|
2093
900
|
|
|
2094
|
-
|
|
2095
|
-
|
|
2096
|
-
|
|
2097
|
-
|
|
2098
|
-
|
|
2099
|
-
|
|
901
|
+
// load and optionally apply lora adapters
|
|
902
|
+
for (auto & la : params.lora_adapters) {
|
|
903
|
+
llama_lora_adapter_container loaded_la;
|
|
904
|
+
loaded_la.path = la.path;
|
|
905
|
+
loaded_la.scale = la.scale;
|
|
906
|
+
loaded_la.adapter = llama_lora_adapter_init(model, la.path.c_str());
|
|
907
|
+
if (loaded_la.adapter == nullptr) {
|
|
908
|
+
LOG_ERR("%s: failed to apply lora adapter '%s'\n", __func__, la.path.c_str());
|
|
2100
909
|
llama_free(lctx);
|
|
2101
910
|
llama_free_model(model);
|
|
2102
|
-
return
|
|
911
|
+
return iparams;
|
|
2103
912
|
}
|
|
2104
|
-
|
|
913
|
+
iparams.lora_adapters.push_back(loaded_la); // copy to list of loaded adapters
|
|
914
|
+
}
|
|
915
|
+
if (!params.lora_init_without_apply) {
|
|
916
|
+
llama_lora_adapters_apply(lctx, iparams.lora_adapters);
|
|
2105
917
|
}
|
|
2106
918
|
|
|
2107
|
-
if (params.ignore_eos) {
|
|
2108
|
-
|
|
919
|
+
if (params.sparams.ignore_eos && llama_token_eos(model) == LLAMA_TOKEN_NULL) {
|
|
920
|
+
LOG_WRN("%s: warning: model does not have an EOS token, ignoring --ignore-eos\n", __func__);
|
|
921
|
+
params.sparams.ignore_eos = false;
|
|
2109
922
|
}
|
|
2110
923
|
|
|
2111
924
|
if (params.warmup) {
|
|
2112
|
-
|
|
925
|
+
LOG_WRN("%s: warming up the model with an empty run - please wait ... (--no-warmup to disable)\n", __func__);
|
|
2113
926
|
|
|
2114
927
|
std::vector<llama_token> tmp;
|
|
2115
928
|
llama_token bos = llama_token_bos(model);
|
|
2116
929
|
llama_token eos = llama_token_eos(model);
|
|
2117
930
|
// some models (e.g. T5) don't have a BOS token
|
|
2118
|
-
if (bos !=
|
|
931
|
+
if (bos != LLAMA_TOKEN_NULL) {
|
|
2119
932
|
tmp.push_back(bos);
|
|
2120
933
|
}
|
|
2121
|
-
|
|
934
|
+
if (eos != LLAMA_TOKEN_NULL) {
|
|
935
|
+
tmp.push_back(eos);
|
|
936
|
+
}
|
|
937
|
+
if (tmp.empty()) {
|
|
938
|
+
tmp.push_back(0);
|
|
939
|
+
}
|
|
2122
940
|
|
|
2123
941
|
if (llama_model_has_encoder(model)) {
|
|
2124
942
|
llama_encode(lctx, llama_batch_get_one(tmp.data(), tmp.size(), 0, 0));
|
|
@@ -2129,13 +947,27 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
|
|
|
2129
947
|
tmp.clear();
|
|
2130
948
|
tmp.push_back(decoder_start_token_id);
|
|
2131
949
|
}
|
|
2132
|
-
|
|
950
|
+
if (llama_model_has_decoder(model)) {
|
|
951
|
+
llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, 0));
|
|
952
|
+
}
|
|
2133
953
|
llama_kv_cache_clear(lctx);
|
|
2134
954
|
llama_synchronize(lctx);
|
|
2135
|
-
|
|
955
|
+
llama_perf_context_reset(lctx);
|
|
2136
956
|
}
|
|
2137
957
|
|
|
2138
|
-
|
|
958
|
+
iparams.model = model;
|
|
959
|
+
iparams.context = lctx;
|
|
960
|
+
|
|
961
|
+
return iparams;
|
|
962
|
+
}
|
|
963
|
+
|
|
964
|
+
void llama_lora_adapters_apply(struct llama_context * ctx, std::vector<llama_lora_adapter_container> & lora_adapters) {
|
|
965
|
+
llama_lora_adapter_clear(ctx);
|
|
966
|
+
for (auto & la : lora_adapters) {
|
|
967
|
+
if (la.scale != 0.0f) {
|
|
968
|
+
llama_lora_adapter_set(ctx, la.adapter, la.scale);
|
|
969
|
+
}
|
|
970
|
+
}
|
|
2139
971
|
}
|
|
2140
972
|
|
|
2141
973
|
struct llama_model_params llama_model_params_from_gpt_params(const gpt_params & params) {
|
|
@@ -2197,9 +1029,9 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
|
|
|
2197
1029
|
cparams.n_seq_max = params.n_parallel;
|
|
2198
1030
|
cparams.n_batch = params.n_batch;
|
|
2199
1031
|
cparams.n_ubatch = params.n_ubatch;
|
|
2200
|
-
cparams.n_threads = params.n_threads;
|
|
2201
|
-
cparams.n_threads_batch = params.
|
|
2202
|
-
|
|
1032
|
+
cparams.n_threads = params.cpuparams.n_threads;
|
|
1033
|
+
cparams.n_threads_batch = params.cpuparams_batch.n_threads == -1 ?
|
|
1034
|
+
params.cpuparams.n_threads : params.cpuparams_batch.n_threads;
|
|
2203
1035
|
cparams.logits_all = params.logits_all;
|
|
2204
1036
|
cparams.embeddings = params.embedding;
|
|
2205
1037
|
cparams.rope_scaling_type = params.rope_scaling_type;
|
|
@@ -2217,6 +1049,12 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
|
|
|
2217
1049
|
cparams.cb_eval_user_data = params.cb_eval_user_data;
|
|
2218
1050
|
cparams.offload_kqv = !params.no_kv_offload;
|
|
2219
1051
|
cparams.flash_attn = params.flash_attn;
|
|
1052
|
+
cparams.no_perf = params.no_perf;
|
|
1053
|
+
|
|
1054
|
+
if (params.reranking) {
|
|
1055
|
+
cparams.embeddings = true;
|
|
1056
|
+
cparams.pooling_type = LLAMA_POOLING_TYPE_RANK;
|
|
1057
|
+
}
|
|
2220
1058
|
|
|
2221
1059
|
cparams.type_k = kv_cache_type_from_str(params.cache_type_k);
|
|
2222
1060
|
cparams.type_v = kv_cache_type_from_str(params.cache_type_v);
|
|
@@ -2224,19 +1062,62 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
|
|
|
2224
1062
|
return cparams;
|
|
2225
1063
|
}
|
|
2226
1064
|
|
|
1065
|
+
struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params) {
|
|
1066
|
+
struct ggml_threadpool_params tpp;
|
|
1067
|
+
|
|
1068
|
+
ggml_threadpool_params_init(&tpp, params.n_threads); // setup the defaults
|
|
1069
|
+
|
|
1070
|
+
if (params.mask_valid) {
|
|
1071
|
+
std::memcpy(&tpp.cpumask, ¶ms.cpumask, GGML_MAX_N_THREADS);
|
|
1072
|
+
}
|
|
1073
|
+
|
|
1074
|
+
tpp.prio = params.priority;
|
|
1075
|
+
tpp.poll = params.poll;
|
|
1076
|
+
tpp.strict_cpu = params.strict_cpu;
|
|
1077
|
+
|
|
1078
|
+
return tpp;
|
|
1079
|
+
}
|
|
1080
|
+
|
|
2227
1081
|
#ifdef LLAMA_USE_CURL
|
|
2228
1082
|
|
|
1083
|
+
#define CURL_MAX_RETRY 3
|
|
1084
|
+
#define CURL_RETRY_DELAY_SECONDS 2
|
|
1085
|
+
|
|
1086
|
+
|
|
2229
1087
|
// True when `str` begins with `prefix` (an empty prefix always matches).
static bool starts_with(const std::string & str, const std::string & prefix) {
    // Stand-in until C++20's std::string::starts_with can be used:
    // compare the leading prefix.size() characters of str against prefix.
    const bool has_prefix = str.compare(0, prefix.size(), prefix) == 0;
    return has_prefix;
}
|
|
2233
1091
|
|
|
1092
|
+
static bool curl_perform_with_retry(const std::string& url, CURL* curl, int max_attempts, int retry_delay_seconds) {
|
|
1093
|
+
int remaining_attempts = max_attempts;
|
|
1094
|
+
|
|
1095
|
+
while (remaining_attempts > 0) {
|
|
1096
|
+
LOG_INF("%s: Trying to download from %s (attempt %d of %d)...\n", __func__ , url.c_str(), max_attempts - remaining_attempts + 1, max_attempts);
|
|
1097
|
+
|
|
1098
|
+
CURLcode res = curl_easy_perform(curl);
|
|
1099
|
+
if (res == CURLE_OK) {
|
|
1100
|
+
return true;
|
|
1101
|
+
}
|
|
1102
|
+
|
|
1103
|
+
int exponential_backoff_delay = std::pow(retry_delay_seconds, max_attempts - remaining_attempts) * 1000;
|
|
1104
|
+
LOG_WRN("%s: curl_easy_perform() failed: %s, retrying after %d milliseconds...\n", __func__, curl_easy_strerror(res), exponential_backoff_delay);
|
|
1105
|
+
|
|
1106
|
+
remaining_attempts--;
|
|
1107
|
+
std::this_thread::sleep_for(std::chrono::milliseconds(exponential_backoff_delay));
|
|
1108
|
+
}
|
|
1109
|
+
|
|
1110
|
+
LOG_ERR("%s: curl_easy_perform() failed after %d attempts\n", __func__, max_attempts);
|
|
1111
|
+
|
|
1112
|
+
return false;
|
|
1113
|
+
}
|
|
1114
|
+
|
|
2234
1115
|
static bool llama_download_file(const std::string & url, const std::string & path, const std::string & hf_token) {
|
|
2235
1116
|
|
|
2236
1117
|
// Initialize libcurl
|
|
2237
1118
|
std::unique_ptr<CURL, decltype(&curl_easy_cleanup)> curl(curl_easy_init(), &curl_easy_cleanup);
|
|
2238
1119
|
if (!curl) {
|
|
2239
|
-
|
|
1120
|
+
LOG_ERR("%s: error initializing libcurl\n", __func__);
|
|
2240
1121
|
return false;
|
|
2241
1122
|
}
|
|
2242
1123
|
|
|
@@ -2277,11 +1158,11 @@ static bool llama_download_file(const std::string & url, const std::string & pat
|
|
|
2277
1158
|
if (metadata_in.good()) {
|
|
2278
1159
|
try {
|
|
2279
1160
|
metadata_in >> metadata;
|
|
2280
|
-
|
|
1161
|
+
LOG_INF("%s: previous metadata file found %s: %s\n", __func__, metadata_path.c_str(), metadata.dump().c_str());
|
|
2281
1162
|
if (metadata.contains("url") && metadata.at("url").is_string()) {
|
|
2282
1163
|
auto previous_url = metadata.at("url").get<std::string>();
|
|
2283
1164
|
if (previous_url != url) {
|
|
2284
|
-
|
|
1165
|
+
LOG_ERR("%s: Model URL mismatch: %s != %s\n", __func__, url.c_str(), previous_url.c_str());
|
|
2285
1166
|
return false;
|
|
2286
1167
|
}
|
|
2287
1168
|
}
|
|
@@ -2292,12 +1173,12 @@ static bool llama_download_file(const std::string & url, const std::string & pat
|
|
|
2292
1173
|
last_modified = metadata.at("lastModified");
|
|
2293
1174
|
}
|
|
2294
1175
|
} catch (const nlohmann::json::exception & e) {
|
|
2295
|
-
|
|
1176
|
+
LOG_ERR("%s: error reading metadata file %s: %s\n", __func__, metadata_path.c_str(), e.what());
|
|
2296
1177
|
return false;
|
|
2297
1178
|
}
|
|
2298
1179
|
}
|
|
2299
1180
|
} else {
|
|
2300
|
-
|
|
1181
|
+
LOG_INF("%s: no previous model file found %s\n", __func__, path.c_str());
|
|
2301
1182
|
}
|
|
2302
1183
|
|
|
2303
1184
|
// Send a HEAD request to retrieve the etag and last-modified headers
|
|
@@ -2334,9 +1215,8 @@ static bool llama_download_file(const std::string & url, const std::string & pat
|
|
|
2334
1215
|
curl_easy_setopt(curl.get(), CURLOPT_HEADERFUNCTION, static_cast<CURLOPT_HEADERFUNCTION_PTR>(header_callback));
|
|
2335
1216
|
curl_easy_setopt(curl.get(), CURLOPT_HEADERDATA, &headers);
|
|
2336
1217
|
|
|
2337
|
-
|
|
2338
|
-
if (
|
|
2339
|
-
fprintf(stderr, "%s: curl_easy_perform() failed: %s\n", __func__, curl_easy_strerror(res));
|
|
1218
|
+
bool was_perform_successful = curl_perform_with_retry(url, curl.get(), CURL_MAX_RETRY, CURL_RETRY_DELAY_SECONDS);
|
|
1219
|
+
if (!was_perform_successful) {
|
|
2340
1220
|
return false;
|
|
2341
1221
|
}
|
|
2342
1222
|
|
|
@@ -2346,26 +1226,26 @@ static bool llama_download_file(const std::string & url, const std::string & pat
|
|
|
2346
1226
|
// HEAD not supported, we don't know if the file has changed
|
|
2347
1227
|
// force trigger downloading
|
|
2348
1228
|
force_download = true;
|
|
2349
|
-
|
|
1229
|
+
LOG_ERR("%s: HEAD invalid http status code received: %ld\n", __func__, http_code);
|
|
2350
1230
|
}
|
|
2351
1231
|
}
|
|
2352
1232
|
|
|
2353
1233
|
bool should_download = !file_exists || force_download;
|
|
2354
1234
|
if (!should_download) {
|
|
2355
1235
|
if (!etag.empty() && etag != headers.etag) {
|
|
2356
|
-
|
|
1236
|
+
LOG_WRN("%s: ETag header is different (%s != %s): triggering a new download\n", __func__, etag.c_str(), headers.etag.c_str());
|
|
2357
1237
|
should_download = true;
|
|
2358
1238
|
} else if (!last_modified.empty() && last_modified != headers.last_modified) {
|
|
2359
|
-
|
|
1239
|
+
LOG_WRN("%s: Last-Modified header is different (%s != %s): triggering a new download\n", __func__, last_modified.c_str(), headers.last_modified.c_str());
|
|
2360
1240
|
should_download = true;
|
|
2361
1241
|
}
|
|
2362
1242
|
}
|
|
2363
1243
|
if (should_download) {
|
|
2364
1244
|
std::string path_temporary = path + ".downloadInProgress";
|
|
2365
1245
|
if (file_exists) {
|
|
2366
|
-
|
|
1246
|
+
LOG_WRN("%s: deleting previous downloaded file: %s\n", __func__, path.c_str());
|
|
2367
1247
|
if (remove(path.c_str()) != 0) {
|
|
2368
|
-
|
|
1248
|
+
LOG_ERR("%s: unable to delete file: %s\n", __func__, path.c_str());
|
|
2369
1249
|
return false;
|
|
2370
1250
|
}
|
|
2371
1251
|
}
|
|
@@ -2380,7 +1260,7 @@ static bool llama_download_file(const std::string & url, const std::string & pat
|
|
|
2380
1260
|
|
|
2381
1261
|
std::unique_ptr<FILE, FILE_deleter> outfile(fopen(path_temporary.c_str(), "wb"));
|
|
2382
1262
|
if (!outfile) {
|
|
2383
|
-
|
|
1263
|
+
LOG_ERR("%s: error opening local file for writing: %s\n", __func__, path.c_str());
|
|
2384
1264
|
return false;
|
|
2385
1265
|
}
|
|
2386
1266
|
|
|
@@ -2411,18 +1291,17 @@ static bool llama_download_file(const std::string & url, const std::string & pat
|
|
|
2411
1291
|
};
|
|
2412
1292
|
|
|
2413
1293
|
// start the download
|
|
2414
|
-
|
|
2415
|
-
|
|
2416
|
-
|
|
2417
|
-
if (
|
|
2418
|
-
fprintf(stderr, "%s: curl_easy_perform() failed: %s\n", __func__, curl_easy_strerror(res));
|
|
1294
|
+
LOG_INF("%s: trying to download model from %s to %s (server_etag:%s, server_last_modified:%s)...\n", __func__,
|
|
1295
|
+
llama_download_hide_password_in_url(url).c_str(), path.c_str(), headers.etag.c_str(), headers.last_modified.c_str());
|
|
1296
|
+
bool was_perform_successful = curl_perform_with_retry(url, curl.get(), CURL_MAX_RETRY, CURL_RETRY_DELAY_SECONDS);
|
|
1297
|
+
if (!was_perform_successful) {
|
|
2419
1298
|
return false;
|
|
2420
1299
|
}
|
|
2421
1300
|
|
|
2422
1301
|
long http_code = 0;
|
|
2423
1302
|
curl_easy_getinfo (curl.get(), CURLINFO_RESPONSE_CODE, &http_code);
|
|
2424
1303
|
if (http_code < 200 || http_code >= 400) {
|
|
2425
|
-
|
|
1304
|
+
LOG_ERR("%s: invalid http status code received: %ld\n", __func__, http_code);
|
|
2426
1305
|
return false;
|
|
2427
1306
|
}
|
|
2428
1307
|
|
|
@@ -2436,10 +1315,10 @@ static bool llama_download_file(const std::string & url, const std::string & pat
|
|
|
2436
1315
|
{"lastModified", headers.last_modified}
|
|
2437
1316
|
});
|
|
2438
1317
|
std::ofstream(metadata_path) << metadata.dump(4);
|
|
2439
|
-
|
|
1318
|
+
LOG_INF("%s: file metadata saved: %s\n", __func__, metadata_path.c_str());
|
|
2440
1319
|
|
|
2441
1320
|
if (rename(path_temporary.c_str(), path.c_str()) != 0) {
|
|
2442
|
-
|
|
1321
|
+
LOG_ERR("%s: unable to rename file: %s to %s\n", __func__, path_temporary.c_str(), path.c_str());
|
|
2443
1322
|
return false;
|
|
2444
1323
|
}
|
|
2445
1324
|
}
|
|
@@ -2454,7 +1333,7 @@ struct llama_model * llama_load_model_from_url(
|
|
|
2454
1333
|
const struct llama_model_params & params) {
|
|
2455
1334
|
// Basic validation of the model_url
|
|
2456
1335
|
if (!model_url || strlen(model_url) == 0) {
|
|
2457
|
-
|
|
1336
|
+
LOG_ERR("%s: invalid model_url\n", __func__);
|
|
2458
1337
|
return NULL;
|
|
2459
1338
|
}
|
|
2460
1339
|
|
|
@@ -2471,7 +1350,7 @@ struct llama_model * llama_load_model_from_url(
|
|
|
2471
1350
|
};
|
|
2472
1351
|
auto * ctx_gguf = gguf_init_from_file(path_model, gguf_params);
|
|
2473
1352
|
if (!ctx_gguf) {
|
|
2474
|
-
|
|
1353
|
+
LOG_ERR("\n%s: failed to load input GGUF from %s\n", __func__, path_model);
|
|
2475
1354
|
return NULL;
|
|
2476
1355
|
}
|
|
2477
1356
|
|
|
@@ -2491,14 +1370,12 @@ struct llama_model * llama_load_model_from_url(
|
|
|
2491
1370
|
// and extract split URL and PATH prefixes
|
|
2492
1371
|
{
|
|
2493
1372
|
if (!llama_split_prefix(split_prefix, sizeof(split_prefix), path_model, 0, n_split)) {
|
|
2494
|
-
|
|
2495
|
-
" n_split=%d\n", __func__, path_model, n_split);
|
|
1373
|
+
LOG_ERR("\n%s: unexpected model file name: %s n_split=%d\n", __func__, path_model, n_split);
|
|
2496
1374
|
return NULL;
|
|
2497
1375
|
}
|
|
2498
1376
|
|
|
2499
1377
|
if (!llama_split_prefix(split_url_prefix, sizeof(split_url_prefix), model_url, 0, n_split)) {
|
|
2500
|
-
|
|
2501
|
-
" n_split=%d\n", __func__, model_url, n_split);
|
|
1378
|
+
LOG_ERR("\n%s: unexpected model url: %s n_split=%d\n", __func__, model_url, n_split);
|
|
2502
1379
|
return NULL;
|
|
2503
1380
|
}
|
|
2504
1381
|
}
|
|
@@ -2558,7 +1435,7 @@ struct llama_model * llama_load_model_from_url(
|
|
|
2558
1435
|
const char * /*path_model*/,
|
|
2559
1436
|
const char * /*hf_token*/,
|
|
2560
1437
|
const struct llama_model_params & /*params*/) {
|
|
2561
|
-
|
|
1438
|
+
LOG_WRN("%s: llama.cpp built without libcurl, downloading from an url not supported.\n", __func__);
|
|
2562
1439
|
return nullptr;
|
|
2563
1440
|
}
|
|
2564
1441
|
|
|
@@ -2568,7 +1445,7 @@ struct llama_model * llama_load_model_from_hf(
|
|
|
2568
1445
|
const char * /*path_model*/,
|
|
2569
1446
|
const char * /*hf_token*/,
|
|
2570
1447
|
const struct llama_model_params & /*params*/) {
|
|
2571
|
-
|
|
1448
|
+
LOG_WRN("%s: llama.cpp built without libcurl, downloading from Hugging Face not supported.\n", __func__);
|
|
2572
1449
|
return nullptr;
|
|
2573
1450
|
}
|
|
2574
1451
|
|
|
@@ -2588,6 +1465,8 @@ void llama_batch_add(
|
|
|
2588
1465
|
llama_pos pos,
|
|
2589
1466
|
const std::vector<llama_seq_id> & seq_ids,
|
|
2590
1467
|
bool logits) {
|
|
1468
|
+
GGML_ASSERT(batch.seq_id[batch.n_tokens] && "llama_batch size exceeded");
|
|
1469
|
+
|
|
2591
1470
|
batch.token [batch.n_tokens] = id;
|
|
2592
1471
|
batch.pos [batch.n_tokens] = pos;
|
|
2593
1472
|
batch.n_seq_id[batch.n_tokens] = seq_ids.size();
|
|
@@ -2662,12 +1541,6 @@ std::string llama_detokenize(llama_context * ctx, const std::vector<llama_token>
|
|
|
2662
1541
|
return text;
|
|
2663
1542
|
}
|
|
2664
1543
|
|
|
2665
|
-
bool llama_should_add_bos_token(const llama_model * model) {
|
|
2666
|
-
const int add_bos = llama_add_bos_token(model);
|
|
2667
|
-
|
|
2668
|
-
return add_bos != -1 ? bool(add_bos) : (llama_vocab_type(model) == LLAMA_VOCAB_TYPE_SPM);
|
|
2669
|
-
}
|
|
2670
|
-
|
|
2671
1544
|
//
|
|
2672
1545
|
// Chat template utils
|
|
2673
1546
|
//
|
|
@@ -2902,13 +1775,13 @@ static llama_control_vector_data llama_control_vector_load_one(const llama_contr
|
|
|
2902
1775
|
};
|
|
2903
1776
|
struct gguf_context * ctx_gguf = gguf_init_from_file(load_info.fname.c_str(), meta_gguf_params);
|
|
2904
1777
|
if (!ctx_gguf) {
|
|
2905
|
-
|
|
1778
|
+
LOG_ERR("%s: failed to load control vector file from %s\n", __func__, load_info.fname.c_str());
|
|
2906
1779
|
return result;
|
|
2907
1780
|
}
|
|
2908
1781
|
|
|
2909
1782
|
int32_t n_tensors = gguf_get_n_tensors(ctx_gguf);
|
|
2910
1783
|
if (n_tensors == 0) {
|
|
2911
|
-
|
|
1784
|
+
LOG_WRN("%s: no direction tensors found in %s\n", __func__, load_info.fname.c_str());
|
|
2912
1785
|
}
|
|
2913
1786
|
|
|
2914
1787
|
for (int i = 0; i < n_tensors; i++) {
|
|
@@ -2926,23 +1799,23 @@ static llama_control_vector_data llama_control_vector_load_one(const llama_contr
|
|
|
2926
1799
|
}
|
|
2927
1800
|
}
|
|
2928
1801
|
if (layer_idx < 0) {
|
|
2929
|
-
|
|
1802
|
+
LOG_ERR("%s: invalid/unparsable direction tensor layer index in %s\n", __func__, load_info.fname.c_str());
|
|
2930
1803
|
result.n_embd = -1;
|
|
2931
1804
|
break;
|
|
2932
1805
|
} else if (layer_idx == 0) {
|
|
2933
|
-
|
|
1806
|
+
LOG_ERR("%s: invalid (zero) direction tensor layer index in %s\n", __func__, load_info.fname.c_str());
|
|
2934
1807
|
result.n_embd = -1;
|
|
2935
1808
|
break;
|
|
2936
1809
|
}
|
|
2937
1810
|
|
|
2938
1811
|
struct ggml_tensor * tensor = ggml_get_tensor(ctx, name.c_str());
|
|
2939
1812
|
if (tensor->type != GGML_TYPE_F32) {
|
|
2940
|
-
|
|
1813
|
+
LOG_ERR("%s: invalid (non-F32) direction tensor type in %s\n", __func__, load_info.fname.c_str());
|
|
2941
1814
|
result.n_embd = -1;
|
|
2942
1815
|
break;
|
|
2943
1816
|
}
|
|
2944
1817
|
if (ggml_n_dims(tensor) != 1) {
|
|
2945
|
-
|
|
1818
|
+
LOG_ERR("%s: invalid (non-1D) direction tensor shape in %s\n", __func__, load_info.fname.c_str());
|
|
2946
1819
|
result.n_embd = -1;
|
|
2947
1820
|
break;
|
|
2948
1821
|
}
|
|
@@ -2950,7 +1823,7 @@ static llama_control_vector_data llama_control_vector_load_one(const llama_contr
|
|
|
2950
1823
|
if (result.n_embd == -1) {
|
|
2951
1824
|
result.n_embd = ggml_nelements(tensor);
|
|
2952
1825
|
} else if (ggml_nelements(tensor) != result.n_embd) {
|
|
2953
|
-
|
|
1826
|
+
LOG_ERR("%s: direction tensor in %s does not match previous dimensions\n", __func__, load_info.fname.c_str());
|
|
2954
1827
|
result.n_embd = -1;
|
|
2955
1828
|
break;
|
|
2956
1829
|
}
|
|
@@ -2967,7 +1840,7 @@ static llama_control_vector_data llama_control_vector_load_one(const llama_contr
|
|
|
2967
1840
|
}
|
|
2968
1841
|
|
|
2969
1842
|
if (result.n_embd == -1) {
|
|
2970
|
-
|
|
1843
|
+
LOG_WRN("%s: skipping %s due to invalid direction tensors\n", __func__, load_info.fname.c_str());
|
|
2971
1844
|
result.data.clear();
|
|
2972
1845
|
}
|
|
2973
1846
|
|
|
@@ -2988,7 +1861,7 @@ llama_control_vector_data llama_control_vector_load(const std::vector<llama_cont
|
|
|
2988
1861
|
break;
|
|
2989
1862
|
}
|
|
2990
1863
|
if (result.n_embd != -1 && result.n_embd != cur.n_embd) {
|
|
2991
|
-
|
|
1864
|
+
LOG_ERR("%s: control vectors in %s does not match previous dimensions\n", __func__, info.fname.c_str());
|
|
2992
1865
|
result.n_embd = -1;
|
|
2993
1866
|
break;
|
|
2994
1867
|
}
|
|
@@ -3004,7 +1877,7 @@ llama_control_vector_data llama_control_vector_load(const std::vector<llama_cont
|
|
|
3004
1877
|
}
|
|
3005
1878
|
|
|
3006
1879
|
if (result.n_embd == -1) {
|
|
3007
|
-
|
|
1880
|
+
LOG_ERR("%s: no valid control vector files passed\n", __func__);
|
|
3008
1881
|
result.data.clear();
|
|
3009
1882
|
}
|
|
3010
1883
|
|
|
@@ -3075,7 +1948,7 @@ void yaml_dump_string_multiline(FILE * stream, const char * prop_name, const cha
|
|
|
3075
1948
|
|
|
3076
1949
|
void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const llama_context * lctx,
|
|
3077
1950
|
const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc) {
|
|
3078
|
-
const
|
|
1951
|
+
const auto & sparams = params.sparams;
|
|
3079
1952
|
|
|
3080
1953
|
fprintf(stream, "build_commit: %s\n", LLAMA_COMMIT);
|
|
3081
1954
|
fprintf(stream, "build_number: %d\n", LLAMA_BUILD_NUMBER);
|
|
@@ -3095,6 +1968,7 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l
|
|
|
3095
1968
|
fprintf(stream, "cpu_has_sve: %s\n", ggml_cpu_has_sve() ? "true" : "false");
|
|
3096
1969
|
fprintf(stream, "cpu_has_f16c: %s\n", ggml_cpu_has_f16c() ? "true" : "false");
|
|
3097
1970
|
fprintf(stream, "cpu_has_fp16_va: %s\n", ggml_cpu_has_fp16_va() ? "true" : "false");
|
|
1971
|
+
fprintf(stream, "cpu_has_riscv_v: %s\n", ggml_cpu_has_riscv_v() ? "true" : "false");
|
|
3098
1972
|
fprintf(stream, "cpu_has_wasm_simd: %s\n", ggml_cpu_has_wasm_simd() ? "true" : "false");
|
|
3099
1973
|
fprintf(stream, "cpu_has_blas: %s\n", ggml_cpu_has_blas() ? "true" : "false");
|
|
3100
1974
|
fprintf(stream, "cpu_has_sse3: %s\n", ggml_cpu_has_sse3() ? "true" : "false");
|
|
@@ -3126,8 +2000,6 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l
|
|
|
3126
2000
|
|
|
3127
2001
|
fprintf(stream, "alias: %s # default: unknown\n", params.model_alias.c_str());
|
|
3128
2002
|
fprintf(stream, "batch_size: %d # default: 512\n", params.n_batch);
|
|
3129
|
-
yaml_dump_string_multiline(stream, "cfg_negative_prompt", sparams.cfg_negative_prompt.c_str());
|
|
3130
|
-
fprintf(stream, "cfg_scale: %f # default: 1.0\n", sparams.cfg_scale);
|
|
3131
2003
|
fprintf(stream, "chunks: %d # default: -1 (unlimited)\n", params.n_chunks);
|
|
3132
2004
|
fprintf(stream, "color: %s # default: false\n", params.use_color ? "true" : "false");
|
|
3133
2005
|
fprintf(stream, "ctx_size: %d # default: 512\n", params.n_ctx);
|
|
@@ -3138,10 +2010,7 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l
|
|
|
3138
2010
|
fprintf(stream, "grammar-file: # never logged, see grammar instead. Can still be specified for input.\n");
|
|
3139
2011
|
fprintf(stream, "hellaswag: %s # default: false\n", params.hellaswag ? "true" : "false");
|
|
3140
2012
|
fprintf(stream, "hellaswag_tasks: %zu # default: 400\n", params.hellaswag_tasks);
|
|
3141
|
-
|
|
3142
|
-
const auto logit_bias_eos = sparams.logit_bias.find(llama_token_eos(llama_get_model(lctx)));
|
|
3143
|
-
const bool ignore_eos = logit_bias_eos != sparams.logit_bias.end() && logit_bias_eos->second == -INFINITY;
|
|
3144
|
-
fprintf(stream, "ignore_eos: %s # default: false\n", ignore_eos ? "true" : "false");
|
|
2013
|
+
fprintf(stream, "ignore_eos: %s # default: false\n", sparams.ignore_eos ? "true" : "false");
|
|
3145
2014
|
|
|
3146
2015
|
yaml_dump_string_multiline(stream, "in_prefix", params.input_prefix.c_str());
|
|
3147
2016
|
fprintf(stream, "in_prefix_bos: %s # default: false\n", params.input_prefix_bos ? "true" : "false");
|
|
@@ -3152,27 +2021,23 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l
|
|
|
3152
2021
|
fprintf(stream, "logdir: %s # default: unset (no logging)\n", params.logdir.c_str());
|
|
3153
2022
|
|
|
3154
2023
|
fprintf(stream, "logit_bias:\n");
|
|
3155
|
-
for (
|
|
3156
|
-
|
|
3157
|
-
continue;
|
|
3158
|
-
}
|
|
3159
|
-
fprintf(stream, " %d: %f", lb.first, lb.second);
|
|
2024
|
+
for (const auto & logit_bias : sparams.logit_bias) {
|
|
2025
|
+
fprintf(stream, " %d: %f", logit_bias.token, logit_bias.bias);
|
|
3160
2026
|
}
|
|
3161
2027
|
|
|
3162
2028
|
fprintf(stream, "lora:\n");
|
|
3163
|
-
for (
|
|
3164
|
-
if (
|
|
3165
|
-
|
|
2029
|
+
for (auto & la : params.lora_adapters) {
|
|
2030
|
+
if (la.scale == 1.0f) {
|
|
2031
|
+
fprintf(stream, " - %s\n", la.path.c_str());
|
|
3166
2032
|
}
|
|
3167
|
-
fprintf(stream, " - %s\n", std::get<0>(la).c_str());
|
|
3168
2033
|
}
|
|
3169
2034
|
fprintf(stream, "lora_scaled:\n");
|
|
3170
|
-
for (
|
|
3171
|
-
if (
|
|
3172
|
-
|
|
2035
|
+
for (auto & la : params.lora_adapters) {
|
|
2036
|
+
if (la.scale != 1.0f) {
|
|
2037
|
+
fprintf(stream, " - %s: %f\n", la.path.c_str(), la.scale);
|
|
3173
2038
|
}
|
|
3174
|
-
fprintf(stream, " - %s: %f\n", std::get<0>(la).c_str(), std::get<1>(la));
|
|
3175
2039
|
}
|
|
2040
|
+
fprintf(stream, "lora_init_without_apply: %s # default: false\n", params.lora_init_without_apply ? "true" : "false");
|
|
3176
2041
|
fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu);
|
|
3177
2042
|
fprintf(stream, "min_keep: %d # default: 0 (disabled)\n", sparams.min_keep);
|
|
3178
2043
|
fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", sparams.mirostat);
|
|
@@ -3210,7 +2075,6 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l
|
|
|
3210
2075
|
|
|
3211
2076
|
fprintf(stream, "rope_freq_base: %f # default: 10000.0\n", params.rope_freq_base);
|
|
3212
2077
|
fprintf(stream, "rope_freq_scale: %f # default: 1.0\n", params.rope_freq_scale);
|
|
3213
|
-
fprintf(stream, "seed: %u # default: -1 (random seed)\n", params.seed);
|
|
3214
2078
|
fprintf(stream, "simple_io: %s # default: false\n", params.simple_io ? "true" : "false");
|
|
3215
2079
|
fprintf(stream, "cont_batching: %s # default: false\n", params.cont_batching ? "true" : "false");
|
|
3216
2080
|
fprintf(stream, "flash_attn: %s # default: false\n", params.flash_attn ? "true" : "false");
|
|
@@ -3220,11 +2084,11 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l
|
|
|
3220
2084
|
yaml_dump_vector_float(stream, "tensor_split", tensor_split_vector);
|
|
3221
2085
|
|
|
3222
2086
|
fprintf(stream, "tfs: %f # default: 1.0\n", sparams.tfs_z);
|
|
3223
|
-
fprintf(stream, "threads: %d # default: %u\n", params.n_threads, std::thread::hardware_concurrency());
|
|
2087
|
+
fprintf(stream, "threads: %d # default: %u\n", params.cpuparams.n_threads, std::thread::hardware_concurrency());
|
|
3224
2088
|
fprintf(stream, "top_k: %d # default: 40\n", sparams.top_k);
|
|
3225
2089
|
fprintf(stream, "top_p: %f # default: 0.95\n", sparams.top_p);
|
|
3226
2090
|
fprintf(stream, "min_p: %f # default: 0.0\n", sparams.min_p);
|
|
3227
|
-
fprintf(stream, "
|
|
2091
|
+
fprintf(stream, "typ_p: %f # default: 1.0\n", sparams.typ_p);
|
|
3228
2092
|
fprintf(stream, "verbose_prompt: %s # default: false\n", params.verbose_prompt ? "true" : "false");
|
|
3229
2093
|
fprintf(stream, "display_prompt: %s # default: true\n", params.display_prompt ? "true" : "false");
|
|
3230
2094
|
}
|