@fugood/llama.node 0.2.0 → 0.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +9 -0
- package/README.md +1 -1
- package/bin/darwin/arm64/default.metallib +0 -0
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/default.metallib +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +1 -1
- package/package.json +2 -1
- package/patches/llama.patch +22 -0
- package/src/LlamaContext.cpp +2 -2
- package/src/TokenizeWorker.cpp +1 -1
- package/src/llama.cpp/CMakeLists.txt +82 -54
- package/src/llama.cpp/cmake/arm64-windows-llvm.cmake +16 -0
- package/src/llama.cpp/cmake/arm64-windows-msvc.cmake +6 -0
- package/src/llama.cpp/common/common.cpp +748 -754
- package/src/llama.cpp/common/common.h +49 -41
- package/src/llama.cpp/common/grammar-parser.cpp +10 -1
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +6 -6
- package/src/llama.cpp/common/log.h +5 -5
- package/src/llama.cpp/common/sampling.cpp +92 -10
- package/src/llama.cpp/common/sampling.h +6 -1
- package/src/llama.cpp/common/train.cpp +2 -2
- package/src/llama.cpp/examples/CMakeLists.txt +3 -0
- package/src/llama.cpp/examples/batched/batched.cpp +1 -1
- package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +1 -1
- package/src/llama.cpp/examples/embedding/embedding.cpp +13 -4
- package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +2 -2
- package/src/llama.cpp/examples/finetune/finetune.cpp +4 -3
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +2 -2
- package/src/llama.cpp/examples/infill/infill.cpp +8 -8
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +57 -8
- package/src/llama.cpp/examples/llama.android/llama/CMakeLists.txt +55 -0
- package/src/llama.cpp/examples/llama.android/{app → llama}/src/main/cpp/CMakeLists.txt +7 -8
- package/src/llama.cpp/examples/llama.android/{app → llama}/src/main/cpp/llama-android.cpp +14 -14
- package/src/llama.cpp/examples/llava/clip.h +1 -1
- package/src/llama.cpp/examples/llava/llava-cli.cpp +27 -7
- package/src/llama.cpp/examples/llava/llava.cpp +0 -15
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +1 -1
- package/src/llama.cpp/examples/lookup/lookup.cpp +1 -1
- package/src/llama.cpp/examples/main/main.cpp +29 -17
- package/src/llama.cpp/examples/parallel/parallel.cpp +1 -1
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +9 -9
- package/src/llama.cpp/examples/quantize/quantize.cpp +2 -2
- package/src/llama.cpp/examples/retrieval/retrieval.cpp +2 -2
- package/src/llama.cpp/examples/rpc/CMakeLists.txt +2 -0
- package/src/llama.cpp/examples/rpc/rpc-server.cpp +134 -0
- package/src/llama.cpp/examples/server/server.cpp +33 -25
- package/src/llama.cpp/examples/server/utils.hpp +1 -1
- package/src/llama.cpp/examples/tokenize/tokenize.cpp +359 -9
- package/src/llama.cpp/examples/train-text-from-scratch/train-text-from-scratch.cpp +4 -3
- package/src/llama.cpp/ggml-backend.c +2 -3
- package/src/llama.cpp/ggml-common.h +0 -54
- package/src/llama.cpp/ggml-cuda.h +1 -0
- package/src/llama.cpp/ggml-impl.h +51 -0
- package/src/llama.cpp/ggml-kompute.cpp +13 -3
- package/src/llama.cpp/ggml-opencl.cpp +4 -1
- package/src/llama.cpp/ggml-quants.c +3715 -2050
- package/src/llama.cpp/ggml-rpc.cpp +1155 -0
- package/src/llama.cpp/ggml-rpc.h +24 -0
- package/src/llama.cpp/ggml-sycl.cpp +119 -673
- package/src/llama.cpp/ggml-vulkan-shaders.hpp +9351 -5627
- package/src/llama.cpp/ggml-vulkan.cpp +203 -224
- package/src/llama.cpp/ggml.c +1208 -1483
- package/src/llama.cpp/ggml.h +71 -46
- package/src/llama.cpp/llama.cpp +1374 -938
- package/src/llama.cpp/llama.h +22 -6
- package/src/llama.cpp/requirements.txt +0 -2
- package/src/llama.cpp/tests/CMakeLists.txt +1 -1
- package/src/llama.cpp/tests/test-backend-ops.cpp +120 -57
- package/src/llama.cpp/tests/test-chat-template.cpp +16 -4
- package/src/llama.cpp/tests/test-grad0.cpp +43 -83
- package/src/llama.cpp/tests/test-grammar-integration.cpp +46 -0
- package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +27 -3
- package/src/llama.cpp/unicode-data.cpp +6969 -2169
- package/src/llama.cpp/unicode-data.h +15 -12
- package/src/llama.cpp/unicode.cpp +89 -111
- package/src/llama.cpp/unicode.h +44 -12
- package/src/llama.cpp/build.zig +0 -172
- package/src/llama.cpp/ggml-mpi.c +0 -216
- package/src/llama.cpp/ggml-mpi.h +0 -39
- package/src/llama.cpp/requirements/requirements-convert-lora-to-ggml.txt +0 -2
- package/src/llama.cpp/requirements/requirements-convert-persimmon-to-gguf.txt +0 -2
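
Most of the churn in this bump is the bundled llama.cpp update: a new RPC backend (ggml-rpc.cpp/ggml-rpc.h plus examples/rpc) and a large refactor of common/common.cpp that gives the shared helpers consistent prefixes (cpu_*, string_*, fs_*, gpt_params_*), shown in the diff below. As orientation, here is a minimal sketch of a native caller using the renamed entry points; the program itself is hypothetical, and only the functions it calls (gpt_params_parse, gpt_params_get_system_info, llama_init_from_gpt_params, llama_free, llama_free_model) are taken from the diff:

// Sketch only: assumes the post-refactor common.h from the bundled llama.cpp.
#include <cstdio>
#include <tuple>
#include "common.h"
#include "llama.h"

int main(int argc, char ** argv) {
    gpt_params params;
    // throws std::invalid_argument internally and prints usage on bad args
    if (!gpt_params_parse(argc, argv, params)) {
        return 1;
    }

    // renamed from get_system_info() in this release
    fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());

    // still returns a (model, context) tuple; the model params built from
    // gpt_params now also carry --rpc via mparams.rpc_servers (see the diff)
    llama_model *   model = nullptr;
    llama_context * ctx   = nullptr;
    std::tie(model, ctx) = llama_init_from_gpt_params(params);
    if (model == nullptr || ctx == nullptr) {
        return 1;
    }

    llama_free(ctx);
    llama_free_model(model);
    return 0;
}
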
diff --git a/package/src/llama.cpp/common/common.cpp b/package/src/llama.cpp/common/common.cpp
--- a/package/src/llama.cpp/common/common.cpp
+++ b/package/src/llama.cpp/common/common.cpp
@@ -73,7 +73,11 @@
 
 using json = nlohmann::ordered_json;
 
-int32_t get_num_physical_cores() {
+//
+// CPU utils
+//
+
+int32_t cpu_get_num_physical_cores() {
 #ifdef __linux__
     // enumerate the set of thread siblings, num entries is num cores
     std::unordered_set<std::string> siblings;
@@ -142,9 +146,9 @@ static bool is_running_on_efficiency_core(void) {
     return core_type == intel_atom;
 }
 
-static int count_math_cpus(int cpu_count) {
+static int cpu_count_math_cpus(int n_cpu) {
     int result = 0;
-    for (int cpu = 0; cpu < cpu_count; ++cpu) {
+    for (int cpu = 0; cpu < n_cpu; ++cpu) {
         if (pin_cpu(cpu)) {
             return -1;
         }
@@ -162,16 +166,16 @@ static int count_math_cpus(int cpu_count) {
 /**
  * Returns number of CPUs on system that are useful for math.
  */
-int get_math_cpu_count() {
+int32_t cpu_get_num_math() {
 #if defined(__x86_64__) && defined(__linux__) && !defined(__ANDROID__)
-    int cpu_count = sysconf(_SC_NPROCESSORS_ONLN);
-    if (cpu_count < 1) {
-        return get_num_physical_cores();
+    int n_cpu = sysconf(_SC_NPROCESSORS_ONLN);
+    if (n_cpu < 1) {
+        return cpu_get_num_physical_cores();
     }
     if (is_hybrid_cpu()) {
         cpu_set_t affinity;
         if (!pthread_getaffinity_np(pthread_self(), sizeof(affinity), &affinity)) {
-            int result = count_math_cpus(cpu_count);
+            int result = cpu_count_math_cpus(n_cpu);
             pthread_setaffinity_np(pthread_self(), sizeof(affinity), &affinity);
             if (result > 0) {
                 return result;
@@ -179,108 +183,103 @@ int get_math_cpu_count() {
         }
     }
 #endif
-    return get_num_physical_cores();
+    return cpu_get_num_physical_cores();
 }
 
-void process_escapes(std::string & input) {
-    std::size_t input_len = input.length();
-    std::size_t output_idx = 0;
+//
+// CLI argument parsing
+//
 
-    for (std::size_t input_idx = 0; input_idx < input_len; ++input_idx) {
-        if (input[input_idx] == '\\' && input_idx + 1 < input_len) {
-            switch (input[++input_idx]) {
-                case 'n':  input[output_idx++] = '\n'; break;
-                case 'r':  input[output_idx++] = '\r'; break;
-                case 't':  input[output_idx++] = '\t'; break;
-                case '\'': input[output_idx++] = '\''; break;
-                case '\"': input[output_idx++] = '\"'; break;
-                case '\\': input[output_idx++] = '\\'; break;
-                case 'x':
-                    // Handle \x12, etc
-                    if (input_idx + 2 < input_len) {
-                        const char x[3] = { input[input_idx + 1], input[input_idx + 2], 0 };
-                        char *err_p = nullptr;
-                        const long val = std::strtol(x, &err_p, 16);
-                        if (err_p == x + 2) {
-                            input_idx += 2;
-                            input[output_idx++] = char(val);
-                            break;
-                        }
-                    }
-                    // fall through
-                default:   input[output_idx++] = '\\';
-                           input[output_idx++] = input[input_idx]; break;
+void gpt_params_handle_model_default(gpt_params & params) {
+    if (!params.hf_repo.empty()) {
+        // short-hand to avoid specifying --hf-file -> default it to --model
+        if (params.hf_file.empty()) {
+            if (params.model.empty()) {
+                throw std::invalid_argument("error: --hf-repo requires either --hf-file or --model\n");
             }
-        } else {
-            input[output_idx++] = input[input_idx];
+            params.hf_file = params.model;
+        } else if (params.model.empty()) {
+            std::string cache_directory = fs_get_cache_directory();
+            const bool success = fs_create_directory_with_parents(cache_directory);
+            if (!success) {
+                throw std::runtime_error("failed to create cache directory: " + cache_directory);
+            }
+            params.model = cache_directory + string_split(params.hf_file, '/').back();
+        }
+    } else if (!params.model_url.empty()) {
+        if (params.model.empty()) {
+            auto f = string_split(params.model_url, '#').front();
+            f = string_split(f, '?').front();
+            f = string_split(f, '/').back();
+            params.model = "models/" + f;
         }
+    } else if (params.model.empty()) {
+        params.model = DEFAULT_MODEL_PATH;
     }
+}
 
-    input.resize(output_idx);
+bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
+    bool invalid_param = false;
+    std::string arg;
+    const std::string arg_prefix = "--";
+    llama_sampling_params & sparams = params.sparams;
+
+    for (int i = 1; i < argc; i++) {
+        arg = argv[i];
+        if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
+            std::replace(arg.begin(), arg.end(), '_', '-');
+        }
+        if (!gpt_params_find_arg(argc, argv, arg, params, i, invalid_param)) {
+            throw std::invalid_argument("error: unknown argument: " + arg);
+        }
+        if (invalid_param) {
+            throw std::invalid_argument("error: invalid parameter for argument: " + arg);
+        }
+    }
+
+    if (params.prompt_cache_all &&
+            (params.interactive || params.interactive_first ||
+             params.instruct)) {
+
+        throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n");
+    }
+
+    gpt_params_handle_model_default(params);
+
+    if (params.escape) {
+        string_process_escapes(params.prompt);
+        string_process_escapes(params.input_prefix);
+        string_process_escapes(params.input_suffix);
+        string_process_escapes(sparams.cfg_negative_prompt);
+        for (auto & antiprompt : params.antiprompt) {
+            string_process_escapes(antiprompt);
+        }
+    }
+
+    if (!params.kv_overrides.empty()) {
+        params.kv_overrides.emplace_back();
+        params.kv_overrides.back().key[0] = 0;
+    }
+
+    return true;
 }
 
 bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
     bool result = true;
     try {
         if (!gpt_params_parse_ex(argc, argv, params)) {
-            gpt_print_usage(argc, argv, gpt_params());
+            gpt_params_print_usage(argc, argv, gpt_params());
             exit(0);
         }
     }
     catch (const std::invalid_argument & ex) {
         fprintf(stderr, "%s\n", ex.what());
-        gpt_print_usage(argc, argv, gpt_params());
+        gpt_params_print_usage(argc, argv, gpt_params());
         exit(1);
     }
     return result;
 }
 
-bool parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides) {
-    const char * sep = strchr(data, '=');
-    if (sep == nullptr || sep - data >= 128) {
-        fprintf(stderr, "%s: malformed KV override '%s'\n", __func__, data);
-        return false;
-    }
-    llama_model_kv_override kvo;
-    std::strncpy(kvo.key, data, sep - data);
-    kvo.key[sep - data] = 0;
-    sep++;
-    if (strncmp(sep, "int:", 4) == 0) {
-        sep += 4;
-        kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT;
-        kvo.val_i64 = std::atol(sep);
-    } else if (strncmp(sep, "float:", 6) == 0) {
-        sep += 6;
-        kvo.tag = LLAMA_KV_OVERRIDE_TYPE_FLOAT;
-        kvo.val_f64 = std::atof(sep);
-    } else if (strncmp(sep, "bool:", 5) == 0) {
-        sep += 5;
-        kvo.tag = LLAMA_KV_OVERRIDE_TYPE_BOOL;
-        if (std::strcmp(sep, "true") == 0) {
-            kvo.val_bool = true;
-        } else if (std::strcmp(sep, "false") == 0) {
-            kvo.val_bool = false;
-        } else {
-            fprintf(stderr, "%s: invalid boolean value for KV override '%s'\n", __func__, data);
-            return false;
-        }
-    } else if (strncmp(sep, "str:", 4) == 0) {
-        sep += 4;
-        kvo.tag = LLAMA_KV_OVERRIDE_TYPE_STR;
-        if (strlen(sep) > 127) {
-            fprintf(stderr, "%s: malformed KV override '%s', value cannot exceed 127 chars\n", __func__, data);
-            return false;
-        }
-        strncpy(kvo.val_str, sep, 127);
-        kvo.val_str[127] = '\0';
-    } else {
-        fprintf(stderr, "%s: invalid type for KV override '%s'\n", __func__, data);
-        return false;
-    }
-    overrides.emplace_back(std::move(kvo));
-    return true;
-}
-
 bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param) {
     llama_sampling_params & sparams = params.sparams;
 
@@ -546,7 +545,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param) {
             return true;
         }
         const auto sampler_names = string_split(argv[i], ';');
-        sparams.samplers_sequence = sampler_types_from_names(sampler_names, true);
+        sparams.samplers_sequence = llama_sampling_types_from_names(sampler_names, true);
         return true;
     }
     if (arg == "--sampling-seq") {
@@ -554,7 +553,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param) {
             invalid_param = true;
             return true;
         }
-        sparams.samplers_sequence = sampler_types_from_chars(argv[i]);
+        sparams.samplers_sequence = llama_sampling_types_from_chars(argv[i]);
         return true;
     }
     if (arg == "--top-p") {
@@ -901,6 +900,14 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param) {
         params.interactive = true;
         return true;
     }
+    if (arg == "--interactive-specials") {
+        params.interactive_specials = true;
+        return true;
+    }
+    if (arg == "--special") {
+        params.special = true;
+        return true;
+    }
     if (arg == "--embedding") {
         params.embedding = true;
         return true;
@@ -1056,6 +1063,14 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param) {
 #endif // GGML_USE_CUDA_SYCL_VULKAN
         return true;
     }
+    if (arg == "--rpc") {
+        if (++i >= argc) {
+            invalid_param = true;
+            return true;
+        }
+        params.rpc_servers = argv[i];
+        return true;
+    }
     if (arg == "--no-mmap") {
         params.use_mmap = false;
         return true;
@@ -1228,7 +1243,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param) {
         return true;
     }
     if (arg == "-h" || arg == "--help") {
-        gpt_print_usage(argc, argv, gpt_params());
+        gpt_params_print_usage(argc, argv, gpt_params());
         exit(0);
     }
     if (arg == "--version") {
@@ -1299,7 +1314,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param) {
             invalid_param = true;
             return true;
         }
-        if (!parse_kv_override(argv[i], params.kv_overrides)) {
+        if (!string_parse_kv_override(argv[i], params.kv_overrides)) {
             fprintf(stderr, "error: Invalid type for KV override: %s\n", argv[i]);
             invalid_param = true;
             return true;
@@ -1333,85 +1348,14 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param) {
     return false;
 }
 
-void gpt_params_handle_model_default(gpt_params & params) {
-    if (!params.hf_repo.empty()) {
-        // short-hand to avoid specifying --hf-file -> default it to --model
-        if (params.hf_file.empty()) {
-            if (params.model.empty()) {
-                throw std::invalid_argument("error: --hf-repo requires either --hf-file or --model\n");
-            }
-            params.hf_file = params.model;
-        } else if (params.model.empty()) {
-            params.model = "models/" + string_split(params.hf_file, '/').back();
-        }
-    } else if (!params.model_url.empty()) {
-        if (params.model.empty()) {
-            auto f = string_split(params.model_url, '#').front();
-            f = string_split(f, '?').front();
-            f = string_split(f, '/').back();
-            params.model = "models/" + f;
-        }
-    } else if (params.model.empty()) {
-        params.model = DEFAULT_MODEL_PATH;
-    }
-}
-
-bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
-    bool invalid_param = false;
-    std::string arg;
-    const std::string arg_prefix = "--";
-    llama_sampling_params & sparams = params.sparams;
-
-    for (int i = 1; i < argc; i++) {
-        arg = argv[i];
-        if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
-            std::replace(arg.begin(), arg.end(), '_', '-');
-        }
-
-        if (!gpt_params_find_arg(argc, argv, arg, params, i, invalid_param)) {
-            throw std::invalid_argument("error: unknown argument: " + arg);
-        }
-    }
-
-    if (invalid_param) {
-        throw std::invalid_argument("error: invalid parameter for argument: " + arg);
-    }
-
-    if (params.prompt_cache_all &&
-            (params.interactive || params.interactive_first ||
-             params.instruct)) {
-
-        throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n");
-    }
-
-    gpt_params_handle_model_default(params);
-
-    if (params.escape) {
-        process_escapes(params.prompt);
-        process_escapes(params.input_prefix);
-        process_escapes(params.input_suffix);
-        process_escapes(sparams.cfg_negative_prompt);
-        for (auto & antiprompt : params.antiprompt) {
-            process_escapes(antiprompt);
-        }
-    }
-
-    if (!params.kv_overrides.empty()) {
-        params.kv_overrides.emplace_back();
-        params.kv_overrides.back().key[0] = 0;
-    }
-
-    return true;
-}
-
-void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
+void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     const llama_sampling_params & sparams = params.sparams;
 
     std::string sampler_type_chars;
     std::string sampler_type_names;
     for (const auto sampler_type : sparams.samplers_sequence) {
         sampler_type_chars += static_cast<char>(sampler_type);
-        sampler_type_names += sampler_type_to_name_string(sampler_type) + ";";
+        sampler_type_names += llama_sampling_type_to_str(sampler_type) + ";";
     }
     sampler_type_names.pop_back();
 
@@ -1422,6 +1366,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf("  -h, --help            show this help message and exit\n");
     printf("  --version             show version and build info\n");
     printf("  -i, --interactive     run in interactive mode\n");
+    printf("  --special             special tokens output enabled\n");
+    printf("  --interactive-specials allow special tokens in user text, in interactive mode\n");
     printf("  --interactive-first   run in interactive mode and wait for input right away\n");
     printf("  -cnv, --conversation  run in conversation mode (does not print special tokens and suffix/prefix)\n");
     printf("  -ins, --instruct      run in instruction mode (use with Alpaca models)\n");
@@ -1554,6 +1500,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf("  -mg i, --main-gpu i   the GPU to use for the model (with split-mode = none),\n");
     printf("                        or for intermediate results and KV (with split-mode = row) (default: %d)\n", params.main_gpu);
     }
+    printf("  --rpc SERVERS         comma separated list of RPC servers\n");
     printf("  --verbose-prompt      print a verbose prompt before generation (default: %s)\n", params.verbose_prompt ? "true" : "false");
     printf("  --no-display-prompt   don't print prompt at generation (default: %s)\n", !params.display_prompt ? "true" : "false");
     printf("  -gan N, --grp-attn-n N\n");
@@ -1606,7 +1553,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
 #endif // LOG_DISABLE_LOGS
 }
 
-std::string get_system_info(const gpt_params & params) {
+std::string gpt_params_get_system_info(const gpt_params & params) {
     std::ostringstream os;
 
     os << "system_info: n_threads = " << params.n_threads;
@@ -1618,7 +1565,52 @@ std::string get_system_info(const gpt_params & params) {
     return os.str();
 }
 
-std::string gpt_random_prompt(std::mt19937 & rng) {
+//
+// String utils
+//
+
+std::vector<std::string> string_split(std::string input, char separator) {
+    std::vector<std::string> parts;
+    size_t separator_pos = input.find(separator);
+    while (separator_pos != std::string::npos) {
+        std::string part = input.substr(0, separator_pos);
+        parts.emplace_back(part);
+        input = input.substr(separator_pos + 1);
+        separator_pos = input.find(separator);
+    }
+    parts.emplace_back(input);
+    return parts;
+}
+
+std::string string_strip(const std::string & str) {
+    size_t start = 0;
+    size_t end = str.size();
+    while (start < end && std::isspace(str[start])) {
+        start++;
+    }
+    while (end > start && std::isspace(str[end - 1])) {
+        end--;
+    }
+    return str.substr(start, end - start);
+}
+
+std::string string_get_sortable_timestamp() {
+    using clock = std::chrono::system_clock;
+
+    const clock::time_point current_time = clock::now();
+    const time_t as_time_t = clock::to_time_t(current_time);
+    char timestamp_no_ns[100];
+    std::strftime(timestamp_no_ns, 100, "%Y_%m_%d-%H_%M_%S", std::localtime(&as_time_t));
+
+    const int64_t ns = std::chrono::duration_cast<std::chrono::nanoseconds>(
+        current_time.time_since_epoch() % 1000000000).count();
+    char timestamp_ns[11];
+    snprintf(timestamp_ns, 11, "%09" PRId64, ns);
+
+    return std::string(timestamp_no_ns) + "." + std::string(timestamp_ns);
+}
+
+std::string string_random_prompt(std::mt19937 & rng) {
     const int r = rng() % 10;
     switch (r) {
         case 0: return "So";
@@ -1636,11 +1628,98 @@ std::string gpt_random_prompt(std::mt19937 & rng) {
     GGML_UNREACHABLE();
 }
 
-// Validate if a filename is safe to use
-// To validate a full path, split the path by the OS-specific path separator, and validate each part with this function
-bool validate_file_name(const std::string & filename) {
-    if (!filename.length()) {
-        // Empty filename invalid
+void string_process_escapes(std::string & input) {
+    std::size_t input_len = input.length();
+    std::size_t output_idx = 0;
+
+    for (std::size_t input_idx = 0; input_idx < input_len; ++input_idx) {
+        if (input[input_idx] == '\\' && input_idx + 1 < input_len) {
+            switch (input[++input_idx]) {
+                case 'n':  input[output_idx++] = '\n'; break;
+                case 'r':  input[output_idx++] = '\r'; break;
+                case 't':  input[output_idx++] = '\t'; break;
+                case '\'': input[output_idx++] = '\''; break;
+                case '\"': input[output_idx++] = '\"'; break;
+                case '\\': input[output_idx++] = '\\'; break;
+                case 'x':
+                    // Handle \x12, etc
+                    if (input_idx + 2 < input_len) {
+                        const char x[3] = { input[input_idx + 1], input[input_idx + 2], 0 };
+                        char *err_p = nullptr;
+                        const long val = std::strtol(x, &err_p, 16);
+                        if (err_p == x + 2) {
+                            input_idx += 2;
+                            input[output_idx++] = char(val);
+                            break;
+                        }
+                    }
+                    // fall through
+                default:   input[output_idx++] = '\\';
+                           input[output_idx++] = input[input_idx]; break;
+            }
+        } else {
+            input[output_idx++] = input[input_idx];
+        }
+    }
+
+    input.resize(output_idx);
+}
+
+bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides) {
+    const char * sep = strchr(data, '=');
+    if (sep == nullptr || sep - data >= 128) {
+        fprintf(stderr, "%s: malformed KV override '%s'\n", __func__, data);
+        return false;
+    }
+    llama_model_kv_override kvo;
+    std::strncpy(kvo.key, data, sep - data);
+    kvo.key[sep - data] = 0;
+    sep++;
+    if (strncmp(sep, "int:", 4) == 0) {
+        sep += 4;
+        kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT;
+        kvo.val_i64 = std::atol(sep);
+    } else if (strncmp(sep, "float:", 6) == 0) {
+        sep += 6;
+        kvo.tag = LLAMA_KV_OVERRIDE_TYPE_FLOAT;
+        kvo.val_f64 = std::atof(sep);
+    } else if (strncmp(sep, "bool:", 5) == 0) {
+        sep += 5;
+        kvo.tag = LLAMA_KV_OVERRIDE_TYPE_BOOL;
+        if (std::strcmp(sep, "true") == 0) {
+            kvo.val_bool = true;
+        } else if (std::strcmp(sep, "false") == 0) {
+            kvo.val_bool = false;
+        } else {
+            fprintf(stderr, "%s: invalid boolean value for KV override '%s'\n", __func__, data);
+            return false;
+        }
+    } else if (strncmp(sep, "str:", 4) == 0) {
+        sep += 4;
+        kvo.tag = LLAMA_KV_OVERRIDE_TYPE_STR;
+        if (strlen(sep) > 127) {
+            fprintf(stderr, "%s: malformed KV override '%s', value cannot exceed 127 chars\n", __func__, data);
+            return false;
+        }
+        strncpy(kvo.val_str, sep, 127);
+        kvo.val_str[127] = '\0';
+    } else {
+        fprintf(stderr, "%s: invalid type for KV override '%s'\n", __func__, data);
+        return false;
+    }
+    overrides.emplace_back(std::move(kvo));
+    return true;
+}
+
+//
+// Filesystem utils
+//
+
+// Validate if a filename is safe to use
+// To validate a full path, split the path by the OS-specific path separator, and validate each part with this function
+bool fs_validate_filename(const std::string & filename) {
+    if (!filename.length()) {
+        // Empty filename invalid
         return false;
     }
     if (filename.length() > 255) {
@@ -1707,181 +1786,260 @@ bool validate_file_name(const std::string & filename) {
     return true;
 }
 
-//
-// String utils
-//
+// returns true if successful, false otherwise
+bool fs_create_directory_with_parents(const std::string & path) {
+#ifdef _WIN32
+    std::wstring_convert<std::codecvt_utf8<wchar_t>> converter;
+    std::wstring wpath = converter.from_bytes(path);
 
-std::vector<std::string> string_split(std::string input, char separator) {
-    std::vector<std::string> parts;
-    size_t separator_pos = input.find(separator);
-    while (separator_pos != std::string::npos) {
-        std::string part = input.substr(0, separator_pos);
-        parts.emplace_back(part);
-        input = input.substr(separator_pos + 1);
-        separator_pos = input.find(separator);
+    // if the path already exists, check whether it's a directory
+    const DWORD attributes = GetFileAttributesW(wpath.c_str());
+    if ((attributes != INVALID_FILE_ATTRIBUTES) && (attributes & FILE_ATTRIBUTE_DIRECTORY)) {
+        return true;
     }
-    parts.emplace_back(input);
-    return parts;
-}
 
-std::string string_strip(const std::string & str) {
-    size_t start = 0;
-    size_t end = str.size();
-    while (start < end && std::isspace(str[start])) {
-        start++;
-    }
-    while (end > start && std::isspace(str[end - 1])) {
-        end--;
-    }
-    return str.substr(start, end - start);
-}
+    size_t pos_slash = 0;
 
-std::vector<llama_sampler_type> sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names) {
-    std::unordered_map<std::string, llama_sampler_type> sampler_canonical_name_map {
-        {"top_k",       llama_sampler_type::TOP_K},
-        {"top_p",       llama_sampler_type::TOP_P},
-        {"typical_p",   llama_sampler_type::TYPICAL_P},
-        {"min_p",       llama_sampler_type::MIN_P},
-        {"tfs_z",       llama_sampler_type::TFS_Z},
-        {"temperature", llama_sampler_type::TEMPERATURE}
-    };
+    // process path from front to back, procedurally creating directories
+    while ((pos_slash = path.find('\\', pos_slash)) != std::string::npos) {
+        const std::wstring subpath = wpath.substr(0, pos_slash);
+        const wchar_t * test = subpath.c_str();
 
-    // since samplers names are written multiple ways
-    // make it ready for both system names and input names
-    std::unordered_map<std::string, llama_sampler_type> sampler_alt_name_map {
-        {"top-k",       llama_sampler_type::TOP_K},
-        {"top-p",       llama_sampler_type::TOP_P},
-        {"nucleus",     llama_sampler_type::TOP_P},
-        {"typical-p",   llama_sampler_type::TYPICAL_P},
-        {"typical",     llama_sampler_type::TYPICAL_P},
-        {"min-p",       llama_sampler_type::MIN_P},
-        {"tfs-z",       llama_sampler_type::TFS_Z},
-        {"tfs",         llama_sampler_type::TFS_Z},
-        {"temp",        llama_sampler_type::TEMPERATURE}
-    };
+        const bool success = CreateDirectoryW(test, NULL);
+        if (!success) {
+            const DWORD error = GetLastError();
 
-    std::vector<llama_sampler_type> sampler_types;
-    sampler_types.reserve(names.size());
-    for (const auto & name : names)
-    {
-        auto sampler_item = sampler_canonical_name_map.find(name);
-        if (sampler_item != sampler_canonical_name_map.end())
-        {
-            sampler_types.push_back(sampler_item->second);
-        }
-        else
-        {
-            if (allow_alt_names)
-            {
-                sampler_item = sampler_alt_name_map.find(name);
-                if (sampler_item != sampler_alt_name_map.end())
-                {
-                    sampler_types.push_back(sampler_item->second);
+            // if the path already exists, ensure that it's a directory
+            if (error == ERROR_ALREADY_EXISTS) {
+                const DWORD attributes = GetFileAttributesW(subpath.c_str());
+                if (attributes == INVALID_FILE_ATTRIBUTES || !(attributes & FILE_ATTRIBUTE_DIRECTORY)) {
+                    return false;
                 }
+            } else {
+                return false;
             }
         }
+
+        pos_slash += 1;
     }
-    return sampler_types;
-}
 
-std::vector<llama_sampler_type> sampler_types_from_chars(const std::string & names_string) {
-    std::unordered_map<char, llama_sampler_type> sampler_name_map {
-        {'k', llama_sampler_type::TOP_K},
-        {'p', llama_sampler_type::TOP_P},
-        {'y', llama_sampler_type::TYPICAL_P},
-        {'m', llama_sampler_type::MIN_P},
-        {'f', llama_sampler_type::TFS_Z},
-        {'t', llama_sampler_type::TEMPERATURE}
-    };
+    return true;
+#else
+    // if the path already exists, check whether it's a directory
+    struct stat info;
+    if (stat(path.c_str(), &info) == 0) {
+        return S_ISDIR(info.st_mode);
+    }
+
+    size_t pos_slash = 1; // skip leading slashes for directory creation
+
+    // process path from front to back, procedurally creating directories
+    while ((pos_slash = path.find('/', pos_slash)) != std::string::npos) {
+        const std::string subpath = path.substr(0, pos_slash);
+        struct stat info;
 
-    std::vector<llama_sampler_type> sampler_types;
-    sampler_types.reserve(names_string.size());
-    for (const auto & c : names_string) {
-        const auto sampler_item = sampler_name_map.find(c);
-        if (sampler_item != sampler_name_map.end()) {
-            sampler_types.push_back(sampler_item->second);
+        // if the path already exists, ensure that it's a directory
+        if (stat(subpath.c_str(), &info) == 0) {
+            if (!S_ISDIR(info.st_mode)) {
+                return false;
+            }
+        } else {
+            // create parent directories
+            const int ret = mkdir(subpath.c_str(), 0755);
+            if (ret != 0) {
+                return false;
+            }
         }
+
+        pos_slash += 1;
     }
-    return sampler_types;
+
+    return true;
+#endif // _WIN32
 }
 
-std::string sampler_type_to_name_string(llama_sampler_type sampler_type) {
-    switch (sampler_type) {
-        case llama_sampler_type::TOP_K:       return "top_k";
-        case llama_sampler_type::TFS_Z:       return "tfs_z";
-        case llama_sampler_type::TYPICAL_P:   return "typical_p";
-        case llama_sampler_type::TOP_P:       return "top_p";
-        case llama_sampler_type::MIN_P:       return "min_p";
-        case llama_sampler_type::TEMPERATURE: return "temp";
-        default : return "";
+std::string fs_get_cache_directory() {
+    std::string cache_directory = "";
+    auto ensure_trailing_slash = [](std::string p) {
+        // Make sure to add trailing slash
+        if (p.back() != DIRECTORY_SEPARATOR) {
+            p += DIRECTORY_SEPARATOR;
+        }
+        return p;
+    };
+    if (getenv("LLAMA_CACHE")) {
+        cache_directory = std::getenv("LLAMA_CACHE");
+    } else {
+#ifdef __linux__
+        if (std::getenv("XDG_CACHE_HOME")) {
+            cache_directory = std::getenv("XDG_CACHE_HOME");
+        } else {
+            cache_directory = std::getenv("HOME") + std::string("/.cache/");
+        }
+#elif defined(__APPLE__)
+        cache_directory = std::getenv("HOME") + std::string("/Library/Caches/");
+#elif defined(_WIN32)
+        cache_directory = std::getenv("LOCALAPPDATA");
+#endif // __linux__
+        cache_directory = ensure_trailing_slash(cache_directory);
+        cache_directory += "llama.cpp";
     }
+    return ensure_trailing_slash(cache_directory);
 }
 
+
 //
 // Model utils
 //
 
-struct llama_model_params llama_model_params_from_gpt_params(const gpt_params & params) {
-    auto mparams = llama_model_default_params();
+std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(gpt_params & params) {
+    auto mparams = llama_model_params_from_gpt_params(params);
 
-    if (params.n_gpu_layers != -1) {
-        mparams.n_gpu_layers = params.n_gpu_layers;
-    }
-    mparams.main_gpu      = params.main_gpu;
-    mparams.split_mode    = params.split_mode;
-    mparams.tensor_split  = params.tensor_split;
-    mparams.use_mmap      = params.use_mmap;
-    mparams.use_mlock     = params.use_mlock;
-    mparams.check_tensors = params.check_tensors;
-    if (params.kv_overrides.empty()) {
-        mparams.kv_overrides = NULL;
+    llama_model * model = nullptr;
+
+    if (!params.hf_repo.empty() && !params.hf_file.empty()) {
+        model = llama_load_model_from_hf(params.hf_repo.c_str(), params.hf_file.c_str(), params.model.c_str(), mparams);
+    } else if (!params.model_url.empty()) {
+        model = llama_load_model_from_url(params.model_url.c_str(), params.model.c_str(), mparams);
     } else {
-        GGML_ASSERT(params.kv_overrides.back().key[0] == 0 && "KV overrides not terminated with empty key");
-        mparams.kv_overrides = params.kv_overrides.data();
+        model = llama_load_model_from_file(params.model.c_str(), mparams);
     }
 
-    return mparams;
-}
-
-static ggml_type kv_cache_type_from_str(const std::string & s) {
-    if (s == "f32") {
-        return GGML_TYPE_F32;
-    }
-    if (s == "f16") {
-        return GGML_TYPE_F16;
-    }
-    if (s == "q8_0") {
-        return GGML_TYPE_Q8_0;
-    }
-    if (s == "q4_0") {
-        return GGML_TYPE_Q4_0;
-    }
-    if (s == "q4_1") {
-        return GGML_TYPE_Q4_1;
-    }
-    if (s == "iq4_nl") {
-        return GGML_TYPE_IQ4_NL;
-    }
-    if (s == "q5_0") {
-        return GGML_TYPE_Q5_0;
-    }
-    if (s == "q5_1") {
-        return GGML_TYPE_Q5_1;
+    if (model == NULL) {
+        fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
+        return std::make_tuple(nullptr, nullptr);
     }
 
-    throw std::runtime_error("Invalid cache type: " + s);
-}
+    auto cparams = llama_context_params_from_gpt_params(params);
 
-struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params) {
-    auto cparams = llama_context_default_params();
+    llama_context * lctx = llama_new_context_with_model(model, cparams);
+    if (lctx == NULL) {
+        fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, params.model.c_str());
+        llama_free_model(model);
+        return std::make_tuple(nullptr, nullptr);
+    }
 
-    cparams.n_ctx           = params.n_ctx;
-    cparams.n_seq_max       = params.n_parallel;
-    cparams.n_batch         = params.n_batch;
-    cparams.n_ubatch        = params.n_ubatch;
-    cparams.n_threads       = params.n_threads;
-    cparams.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
-    cparams.seed            = params.seed;
+    if (!params.control_vectors.empty()) {
+        if (params.control_vector_layer_start <= 0) params.control_vector_layer_start = 1;
+        if (params.control_vector_layer_end   <= 0) params.control_vector_layer_end   = llama_n_layer(model);
+
+        const auto cvec = llama_control_vector_load(params.control_vectors);
+        if (cvec.n_embd == -1) {
+            llama_free(lctx);
+            llama_free_model(model);
+            return std::make_tuple(nullptr, nullptr);
+        }
+
+        int err = llama_control_vector_apply(lctx,
+                                             cvec.data.data(),
+                                             cvec.data.size(),
+                                             cvec.n_embd,
+                                             params.control_vector_layer_start,
+                                             params.control_vector_layer_end);
+        if (err) {
+            llama_free(lctx);
+            llama_free_model(model);
+            return std::make_tuple(nullptr, nullptr);
+        }
+    }
+
+    for (unsigned int i = 0; i < params.lora_adapter.size(); ++i) {
+        const std::string & lora_adapter = std::get<0>(params.lora_adapter[i]);
+        float lora_scale = std::get<1>(params.lora_adapter[i]);
+        int err = llama_model_apply_lora_from_file(model,
+                                             lora_adapter.c_str(),
+                                             lora_scale,
+                                             ((i > 0) || params.lora_base.empty())
+                                                ? NULL
+                                                : params.lora_base.c_str(),
+                                             params.n_threads);
+        if (err != 0) {
+            fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
+            llama_free(lctx);
+            llama_free_model(model);
+            return std::make_tuple(nullptr, nullptr);
+        }
+    }
+
+    if (params.ignore_eos) {
+        params.sparams.logit_bias[llama_token_eos(model)] = -INFINITY;
+    }
+
+    if (params.warmup) {
+        LOG("warming up the model with an empty run\n");
+
+        std::vector<llama_token> tmp = { llama_token_bos(model), llama_token_eos(model), };
+        llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, 0));
+        llama_kv_cache_clear(lctx);
+        llama_synchronize(lctx);
+        llama_reset_timings(lctx);
+    }
+
+    return std::make_tuple(model, lctx);
+}
+
+struct llama_model_params llama_model_params_from_gpt_params(const gpt_params & params) {
+    auto mparams = llama_model_default_params();
+
+    if (params.n_gpu_layers != -1) {
+        mparams.n_gpu_layers = params.n_gpu_layers;
+    }
+    mparams.rpc_servers   = params.rpc_servers.c_str();
+    mparams.main_gpu      = params.main_gpu;
+    mparams.split_mode    = params.split_mode;
+    mparams.tensor_split  = params.tensor_split;
+    mparams.use_mmap      = params.use_mmap;
+    mparams.use_mlock     = params.use_mlock;
+    mparams.check_tensors = params.check_tensors;
+    if (params.kv_overrides.empty()) {
+        mparams.kv_overrides = NULL;
+    } else {
+        GGML_ASSERT(params.kv_overrides.back().key[0] == 0 && "KV overrides not terminated with empty key");
+        mparams.kv_overrides = params.kv_overrides.data();
+    }
+
+    return mparams;
+}
+
+static ggml_type kv_cache_type_from_str(const std::string & s) {
+    if (s == "f32") {
+        return GGML_TYPE_F32;
+    }
+    if (s == "f16") {
+        return GGML_TYPE_F16;
+    }
+    if (s == "q8_0") {
+        return GGML_TYPE_Q8_0;
+    }
+    if (s == "q4_0") {
+        return GGML_TYPE_Q4_0;
+    }
+    if (s == "q4_1") {
+        return GGML_TYPE_Q4_1;
+    }
+    if (s == "iq4_nl") {
+        return GGML_TYPE_IQ4_NL;
+    }
+    if (s == "q5_0") {
+        return GGML_TYPE_Q5_0;
+    }
+    if (s == "q5_1") {
+        return GGML_TYPE_Q5_1;
+    }
+
+    throw std::runtime_error("Invalid cache type: " + s);
+}
+
+struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params) {
+    auto cparams = llama_context_default_params();
+
+    cparams.n_ctx             = params.n_ctx;
+    cparams.n_seq_max         = params.n_parallel;
+    cparams.n_batch           = params.n_batch;
+    cparams.n_ubatch          = params.n_ubatch;
+    cparams.n_threads         = params.n_threads;
+    cparams.n_threads_batch   = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
+    cparams.seed              = params.seed;
     cparams.logits_all        = params.logits_all;
     cparams.embeddings        = params.embedding;
     cparams.rope_scaling_type = params.rope_scaling_type;
@@ -1905,27 +2063,6 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params) {
     return cparams;
 }
 
-void llama_batch_clear(struct llama_batch & batch) {
-    batch.n_tokens = 0;
-}
-
-void llama_batch_add(
-                 struct llama_batch & batch,
-                        llama_token   id,
-                          llama_pos   pos,
-    const std::vector<llama_seq_id> & seq_ids,
-                               bool   logits) {
-    batch.token   [batch.n_tokens] = id;
-    batch.pos     [batch.n_tokens] = pos;
-    batch.n_seq_id[batch.n_tokens] = seq_ids.size();
-    for (size_t i = 0; i < seq_ids.size(); ++i) {
-        batch.seq_id[batch.n_tokens][i] = seq_ids[i];
-    }
-    batch.logits  [batch.n_tokens] = logits;
-
-    batch.n_tokens++;
-}
-
 #ifdef LLAMA_USE_CURL
 
 static bool starts_with(const std::string & str, const std::string & prefix) {
@@ -2256,90 +2393,29 @@ struct llama_model * llama_load_model_from_hf(
 
 #endif // LLAMA_USE_CURL
 
-std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(gpt_params & params) {
-    auto mparams = llama_model_params_from_gpt_params(params);
-
-    llama_model * model = nullptr;
-
-    if (!params.hf_repo.empty() && !params.hf_file.empty()) {
-        model = llama_load_model_from_hf(params.hf_repo.c_str(), params.hf_file.c_str(), params.model.c_str(), mparams);
-    } else if (!params.model_url.empty()) {
-        model = llama_load_model_from_url(params.model_url.c_str(), params.model.c_str(), mparams);
-    } else {
-        model = llama_load_model_from_file(params.model.c_str(), mparams);
-    }
-
-    if (model == NULL) {
-        fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
-        return std::make_tuple(nullptr, nullptr);
-    }
-
-    auto cparams = llama_context_params_from_gpt_params(params);
-
-    llama_context * lctx = llama_new_context_with_model(model, cparams);
-    if (lctx == NULL) {
-        fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, params.model.c_str());
-        llama_free_model(model);
-        return std::make_tuple(nullptr, nullptr);
-    }
-
-    if (!params.control_vectors.empty()) {
-        if (params.control_vector_layer_start <= 0) params.control_vector_layer_start = 1;
-        if (params.control_vector_layer_end   <= 0) params.control_vector_layer_end   = llama_n_layer(model);
-
-        const auto cvec = llama_control_vector_load(params.control_vectors);
-        if (cvec.n_embd == -1) {
-            llama_free(lctx);
-            llama_free_model(model);
-            return std::make_tuple(nullptr, nullptr);
-        }
-
-        int err = llama_control_vector_apply(lctx,
-                                             cvec.data.data(),
-                                             cvec.data.size(),
-                                             cvec.n_embd,
-                                             params.control_vector_layer_start,
-                                             params.control_vector_layer_end);
-        if (err) {
-            llama_free(lctx);
-            llama_free_model(model);
-            return std::make_tuple(nullptr, nullptr);
-        }
-    }
-
-    for (unsigned int i = 0; i < params.lora_adapter.size(); ++i) {
-        const std::string & lora_adapter = std::get<0>(params.lora_adapter[i]);
-        float lora_scale = std::get<1>(params.lora_adapter[i]);
-        int err = llama_model_apply_lora_from_file(model,
-                                             lora_adapter.c_str(),
-                                             lora_scale,
-                                             ((i > 0) || params.lora_base.empty())
-                                                ? NULL
-                                                : params.lora_base.c_str(),
-                                             params.n_threads);
-        if (err != 0) {
-            fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
-            llama_free(lctx);
-            llama_free_model(model);
-            return std::make_tuple(nullptr, nullptr);
-        }
-    }
-
-    if (params.ignore_eos) {
-        params.sparams.logit_bias[llama_token_eos(model)] = -INFINITY;
-    }
+//
+// Batch utils
+//
 
-    if (params.warmup) {
-        LOG("warming up the model with an empty run\n");
+void llama_batch_clear(struct llama_batch & batch) {
+    batch.n_tokens = 0;
+}
 
-        std::vector<llama_token> tmp = { llama_token_bos(model), llama_token_eos(model), };
-        llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, 0));
-        llama_kv_cache_clear(lctx);
-        llama_synchronize(lctx);
-        llama_reset_timings(lctx);
+void llama_batch_add(
+                 struct llama_batch & batch,
+                        llama_token   id,
+                          llama_pos   pos,
+    const std::vector<llama_seq_id> & seq_ids,
+                               bool   logits) {
+    batch.token   [batch.n_tokens] = id;
+    batch.pos     [batch.n_tokens] = pos;
+    batch.n_seq_id[batch.n_tokens] = seq_ids.size();
+    for (size_t i = 0; i < seq_ids.size(); ++i) {
+        batch.seq_id[batch.n_tokens][i] = seq_ids[i];
     }
+    batch.logits  [batch.n_tokens] = logits;
 
-    return std::make_tuple(model, lctx);
+    batch.n_tokens++;
 }
 
 //
@@ -2392,355 +2468,46 @@ std::string llama_detokenize_spm(llama_context * ctx, const std::vector<llama_token> & tokens) {
 
     std::string piece;
     std::string result;
-
-    for (size_t i = 0; i < tokens.size(); ++i) {
-        piece = llama_token_to_piece(ctx, tokens[i]);
-
-        // remove the leading space of the first non-BOS token
-        if (((tokens[0] == bos_id && i == 1) || (tokens[0] != bos_id && i == 0)) && piece[0] == ' ') {
-            piece = piece.substr(1);
-        }
-
-        result += piece;
-    }
-
-    return result;
-}
-
-std::string llama_detokenize_bpe(llama_context * ctx, const std::vector<llama_token> & tokens) {
-    std::string piece;
-    std::string result;
-
-    for (size_t i = 0; i < tokens.size(); ++i) {
-        piece = llama_token_to_piece(ctx, tokens[i]);
-
-        result += piece;
-    }
-
-    // NOTE: the original tokenizer decodes bytes after collecting the pieces.
-    return result;
-}
-
-bool llama_should_add_bos_token(const llama_model * model) {
-    const int add_bos = llama_add_bos_token(model);
-
-    return add_bos != -1 ? bool(add_bos) : (llama_vocab_type(model) == LLAMA_VOCAB_TYPE_SPM);
-}
-
-//
-// YAML utils
-//
-
-// returns true if successful, false otherwise
-bool create_directory_with_parents(const std::string & path) {
-#ifdef _WIN32
-    std::wstring_convert<std::codecvt_utf8<wchar_t>> converter;
-    std::wstring wpath = converter.from_bytes(path);
-
-    // if the path already exists, check whether it's a directory
-    const DWORD attributes = GetFileAttributesW(wpath.c_str());
-    if ((attributes != INVALID_FILE_ATTRIBUTES) && (attributes & FILE_ATTRIBUTE_DIRECTORY)) {
-        return true;
-    }
-
-    size_t pos_slash = 0;
-
-    // process path from front to back, procedurally creating directories
-    while ((pos_slash = path.find('\\', pos_slash)) != std::string::npos) {
-        const std::wstring subpath = wpath.substr(0, pos_slash);
-        const wchar_t * test = subpath.c_str();
-
-        const bool success = CreateDirectoryW(test, NULL);
-        if (!success) {
-            const DWORD error = GetLastError();
-
-            // if the path already exists, ensure that it's a directory
-            if (error == ERROR_ALREADY_EXISTS) {
-                const DWORD attributes = GetFileAttributesW(subpath.c_str());
-                if (attributes == INVALID_FILE_ATTRIBUTES || !(attributes & FILE_ATTRIBUTE_DIRECTORY)) {
-                    return false;
-                }
-            } else {
-                return false;
-            }
-        }
-
-        pos_slash += 1;
-    }
-
-    return true;
-#else
-    // if the path already exists, check whether it's a directory
-    struct stat info;
-    if (stat(path.c_str(), &info) == 0) {
-        return S_ISDIR(info.st_mode);
-    }
-
-    size_t pos_slash = 1; // skip leading slashes for directory creation
-
-    // process path from front to back, procedurally creating directories
-    while ((pos_slash = path.find('/', pos_slash)) != std::string::npos) {
-        const std::string subpath = path.substr(0, pos_slash);
-        struct stat info;
-
-        // if the path already exists, ensure that it's a directory
-        if (stat(subpath.c_str(), &info) == 0) {
-            if (!S_ISDIR(info.st_mode)) {
-                return false;
-            }
-        } else {
-            // create parent directories
-            const int ret = mkdir(subpath.c_str(), 0755);
-            if (ret != 0) {
-                return false;
-            }
-        }
-
-        pos_slash += 1;
-    }
-
-    return true;
-#endif // _WIN32
-}
-
-void dump_vector_float_yaml(FILE * stream, const char * prop_name, const std::vector<float> & data) {
-    if (data.empty()) {
-        fprintf(stream, "%s:\n", prop_name);
-        return;
-    }
-
-    fprintf(stream, "%s: [", prop_name);
-    for (size_t i = 0; i < data.size() - 1; ++i) {
-        fprintf(stream, "%e, ", data[i]);
-    }
-    fprintf(stream, "%e]\n", data.back());
-}
-
-void dump_vector_int_yaml(FILE * stream, const char * prop_name, const std::vector<int> & data) {
-    if (data.empty()) {
-        fprintf(stream, "%s:\n", prop_name);
-        return;
-    }
-
-    fprintf(stream, "%s: [", prop_name);
-    for (size_t i = 0; i < data.size() - 1; ++i) {
-        fprintf(stream, "%d, ", data[i]);
-    }
-    fprintf(stream, "%d]\n", data.back());
-}
-
-void dump_string_yaml_multiline(FILE * stream, const char * prop_name, const char * data) {
-    std::string data_str(data == NULL ? "" : data);
-
-    if (data_str.empty()) {
-        fprintf(stream, "%s:\n", prop_name);
-        return;
-    }
-
-    size_t pos_start = 0;
-    size_t pos_found = 0;
-
-    if (!data_str.empty() && (std::isspace(data_str[0]) || std::isspace(data_str.back()))) {
-        data_str = std::regex_replace(data_str, std::regex("\n"), "\\n");
-        data_str = std::regex_replace(data_str, std::regex("\""), "\\\"");
-        data_str = std::regex_replace(data_str, std::regex(R"(\\[^n"])"), R"(\$&)");
-        data_str = "\"" + data_str + "\"";
-        fprintf(stream, "%s: %s\n", prop_name, data_str.c_str());
-        return;
-    }
-
-    if (data_str.find('\n') == std::string::npos) {
-        fprintf(stream, "%s: %s\n", prop_name, data_str.c_str());
-        return;
-    }
-
-    fprintf(stream, "%s: |\n", prop_name);
-    while ((pos_found = data_str.find('\n', pos_start)) != std::string::npos) {
-        fprintf(stream, "  %s\n", data_str.substr(pos_start, pos_found-pos_start).c_str());
-        pos_start = pos_found + 1;
-    }
-}
-
-std::string get_sortable_timestamp() {
-    using clock = std::chrono::system_clock;
-
-    const clock::time_point current_time = clock::now();
-    const time_t as_time_t = clock::to_time_t(current_time);
-    char timestamp_no_ns[100];
-    std::strftime(timestamp_no_ns, 100, "%Y_%m_%d-%H_%M_%S", std::localtime(&as_time_t));
-
-    const int64_t ns = std::chrono::duration_cast<std::chrono::nanoseconds>(
-        current_time.time_since_epoch() % 1000000000).count();
-    char timestamp_ns[11];
-    snprintf(timestamp_ns, 11, "%09" PRId64, ns);
-
-    return std::string(timestamp_no_ns) + "." + std::string(timestamp_ns);
-}
-
-void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const llama_context * lctx,
-        const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc) {
-    const llama_sampling_params & sparams = params.sparams;
-
-    fprintf(stream, "build_commit: %s\n", LLAMA_COMMIT);
-    fprintf(stream, "build_number: %d\n", LLAMA_BUILD_NUMBER);
-    fprintf(stream, "cpu_has_arm_fma: %s\n", ggml_cpu_has_arm_fma() ? "true" : "false");
-    fprintf(stream, "cpu_has_avx: %s\n", ggml_cpu_has_avx() ? "true" : "false");
-    fprintf(stream, "cpu_has_avx_vnni: %s\n", ggml_cpu_has_avx_vnni() ? "true" : "false");
-    fprintf(stream, "cpu_has_avx2: %s\n", ggml_cpu_has_avx2() ? "true" : "false");
-    fprintf(stream, "cpu_has_avx512: %s\n", ggml_cpu_has_avx512() ? "true" : "false");
-    fprintf(stream, "cpu_has_avx512_vbmi: %s\n", ggml_cpu_has_avx512_vbmi() ? "true" : "false");
-    fprintf(stream, "cpu_has_avx512_vnni: %s\n", ggml_cpu_has_avx512_vnni() ? "true" : "false");
-    fprintf(stream, "cpu_has_cuda: %s\n", ggml_cpu_has_cuda() ? "true" : "false");
-    fprintf(stream, "cpu_has_vulkan: %s\n", ggml_cpu_has_vulkan() ? "true" : "false");
-    fprintf(stream, "cpu_has_clblast: %s\n", ggml_cpu_has_clblast() ? "true" : "false");
-    fprintf(stream, "cpu_has_kompute: %s\n", ggml_cpu_has_kompute() ? "true" : "false");
-    fprintf(stream, "cpu_has_fma: %s\n", ggml_cpu_has_fma() ? "true" : "false");
-    fprintf(stream, "cpu_has_gpublas: %s\n", ggml_cpu_has_gpublas() ? "true" : "false");
-    fprintf(stream, "cpu_has_neon: %s\n", ggml_cpu_has_neon() ? "true" : "false");
-    fprintf(stream, "cpu_has_f16c: %s\n", ggml_cpu_has_f16c() ? "true" : "false");
-    fprintf(stream, "cpu_has_fp16_va: %s\n", ggml_cpu_has_fp16_va() ? "true" : "false");
-    fprintf(stream, "cpu_has_wasm_simd: %s\n", ggml_cpu_has_wasm_simd() ? "true" : "false");
-    fprintf(stream, "cpu_has_blas: %s\n", ggml_cpu_has_blas() ? "true" : "false");
-    fprintf(stream, "cpu_has_sse3: %s\n", ggml_cpu_has_sse3() ? "true" : "false");
-    fprintf(stream, "cpu_has_vsx: %s\n", ggml_cpu_has_vsx() ? "true" : "false");
-    fprintf(stream, "cpu_has_matmul_int8: %s\n", ggml_cpu_has_matmul_int8() ? "true" : "false");
-
-#ifdef NDEBUG
-    fprintf(stream, "debug: false\n");
-#else
-    fprintf(stream, "debug: true\n");
-#endif // NDEBUG
-
-    fprintf(stream, "model_desc: %s\n", model_desc);
-    fprintf(stream, "n_vocab: %d # output size of the final layer, 32001 for some models\n", llama_n_vocab(llama_get_model(lctx)));
-
-#ifdef __OPTIMIZE__
-    fprintf(stream, "optimize: true\n");
-#else
-    fprintf(stream, "optimize: false\n");
-#endif // __OPTIMIZE__
-
-    fprintf(stream, "time: %s\n", timestamp.c_str());
-
-    fprintf(stream, "\n");
-    fprintf(stream, "###############\n");
-    fprintf(stream, "# User Inputs #\n");
-    fprintf(stream, "###############\n");
-    fprintf(stream, "\n");
-
-    fprintf(stream, "alias: %s # default: unknown\n", params.model_alias.c_str());
-    fprintf(stream, "batch_size: %d # default: 512\n", params.n_batch);
-    dump_string_yaml_multiline(stream, "cfg_negative_prompt", sparams.cfg_negative_prompt.c_str());
-    fprintf(stream, "cfg_scale: %f # default: 1.0\n", sparams.cfg_scale);
-    fprintf(stream, "chunks: %d # default: -1 (unlimited)\n", params.n_chunks);
|
|
2636
|
-
fprintf(stream, "color: %s # default: false\n", params.use_color ? "true" : "false");
|
|
2637
|
-
fprintf(stream, "ctx_size: %d # default: 512\n", params.n_ctx);
|
|
2638
|
-
fprintf(stream, "escape: %s # default: false\n", params.escape ? "true" : "false");
|
|
2639
|
-
fprintf(stream, "file: # never logged, see prompt instead. Can still be specified for input.\n");
|
|
2640
|
-
fprintf(stream, "frequency_penalty: %f # default: 0.0 \n", sparams.penalty_freq);
|
|
2641
|
-
dump_string_yaml_multiline(stream, "grammar", sparams.grammar.c_str());
|
|
2642
|
-
fprintf(stream, "grammar-file: # never logged, see grammar instead. Can still be specified for input.\n");
|
|
2643
|
-
fprintf(stream, "hellaswag: %s # default: false\n", params.hellaswag ? "true" : "false");
|
|
2644
|
-
fprintf(stream, "hellaswag_tasks: %zu # default: 400\n", params.hellaswag_tasks);
|
|
2645
|
-
|
|
2646
|
-
const auto logit_bias_eos = sparams.logit_bias.find(llama_token_eos(llama_get_model(lctx)));
|
|
2647
|
-
const bool ignore_eos = logit_bias_eos != sparams.logit_bias.end() && logit_bias_eos->second == -INFINITY;
|
|
2648
|
-
fprintf(stream, "ignore_eos: %s # default: false\n", ignore_eos ? "true" : "false");
|
|
2649
|
-
|
|
2650
|
-
dump_string_yaml_multiline(stream, "in_prefix", params.input_prefix.c_str());
|
|
2651
|
-
fprintf(stream, "in_prefix_bos: %s # default: false\n", params.input_prefix_bos ? "true" : "false");
|
|
2652
|
-
dump_string_yaml_multiline(stream, "in_suffix", params.input_prefix.c_str());
|
|
2653
|
-
fprintf(stream, "instruct: %s # default: false\n", params.instruct ? "true" : "false");
|
|
2654
|
-
fprintf(stream, "interactive: %s # default: false\n", params.interactive ? "true" : "false");
|
|
2655
|
-
fprintf(stream, "interactive_first: %s # default: false\n", params.interactive_first ? "true" : "false");
|
|
2656
|
-
fprintf(stream, "keep: %d # default: 0\n", params.n_keep);
|
|
2657
|
-
fprintf(stream, "logdir: %s # default: unset (no logging)\n", params.logdir.c_str());
|
|
2658
|
-
|
|
2659
|
-
fprintf(stream, "logit_bias:\n");
|
|
2660
|
-
for (std::pair<llama_token, float> lb : sparams.logit_bias) {
|
|
2661
|
-
if (ignore_eos && lb.first == logit_bias_eos->first) {
|
|
2662
|
-
continue;
|
|
2663
|
-
}
|
|
2664
|
-
fprintf(stream, " %d: %f", lb.first, lb.second);
|
|
2665
|
-
}
|
|
2666
|
-
|
|
2667
|
-
fprintf(stream, "lora:\n");
|
|
2668
|
-
for (std::tuple<std::string, float> la : params.lora_adapter) {
|
|
2669
|
-
if (std::get<1>(la) != 1.0f) {
|
|
2670
|
-
continue;
|
|
2671
|
-
}
|
|
2672
|
-
fprintf(stream, " - %s\n", std::get<0>(la).c_str());
|
|
2673
|
-
}
|
|
2674
|
-
fprintf(stream, "lora_scaled:\n");
|
|
2675
|
-
for (std::tuple<std::string, float> la : params.lora_adapter) {
|
|
2676
|
-
if (std::get<1>(la) == 1.0f) {
|
|
2677
|
-
continue;
|
|
2678
|
-
}
|
|
2679
|
-
fprintf(stream, " - %s: %f\n", std::get<0>(la).c_str(), std::get<1>(la));
|
|
2680
|
-
}
|
|
2681
|
-
fprintf(stream, "lora_base: %s\n", params.lora_base.c_str());
|
|
2682
|
-
fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu);
|
|
2683
|
-
fprintf(stream, "min_keep: %d # default: 0 (disabled)\n", sparams.min_keep);
|
|
2684
|
-
fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", sparams.mirostat);
|
|
2685
|
-
fprintf(stream, "mirostat_ent: %f # default: 5.0\n", sparams.mirostat_tau);
|
|
2686
|
-
fprintf(stream, "mirostat_lr: %f # default: 0.1\n", sparams.mirostat_eta);
|
|
2687
|
-
fprintf(stream, "mlock: %s # default: false\n", params.use_mlock ? "true" : "false");
|
|
2688
|
-
fprintf(stream, "model: %s # default: %s\n", params.model.c_str(), DEFAULT_MODEL_PATH);
|
|
2689
|
-
fprintf(stream, "model_draft: %s # default:\n", params.model_draft.c_str());
|
|
2690
|
-
fprintf(stream, "multiline_input: %s # default: false\n", params.multiline_input ? "true" : "false");
|
|
2691
|
-
fprintf(stream, "n_gpu_layers: %d # default: -1\n", params.n_gpu_layers);
|
|
2692
|
-
fprintf(stream, "n_predict: %d # default: -1 (unlimited)\n", params.n_predict);
|
|
2693
|
-
fprintf(stream, "n_probs: %d # only used by server binary, default: 0\n", sparams.n_probs);
|
|
2694
|
-
fprintf(stream, "no_mmap: %s # default: false\n", !params.use_mmap ? "true" : "false");
|
|
2695
|
-
fprintf(stream, "penalize_nl: %s # default: false\n", sparams.penalize_nl ? "true" : "false");
|
|
2696
|
-
fprintf(stream, "ppl_output_type: %d # default: 0\n", params.ppl_output_type);
|
|
2697
|
-
fprintf(stream, "ppl_stride: %d # default: 0\n", params.ppl_stride);
|
|
2698
|
-
fprintf(stream, "presence_penalty: %f # default: 0.0\n", sparams.penalty_present);
|
|
2699
|
-
dump_string_yaml_multiline(stream, "prompt", params.prompt.c_str());
|
|
2700
|
-
fprintf(stream, "prompt_cache: %s\n", params.path_prompt_cache.c_str());
|
|
2701
|
-
fprintf(stream, "prompt_cache_all: %s # default: false\n", params.prompt_cache_all ? "true" : "false");
|
|
2702
|
-
fprintf(stream, "prompt_cache_ro: %s # default: false\n", params.prompt_cache_ro ? "true" : "false");
|
|
2703
|
-
dump_vector_int_yaml(stream, "prompt_tokens", prompt_tokens);
|
|
2704
|
-
fprintf(stream, "random_prompt: %s # default: false\n", params.random_prompt ? "true" : "false");
|
|
2705
|
-
fprintf(stream, "repeat_penalty: %f # default: 1.1\n", sparams.penalty_repeat);
|
|
2706
|
-
|
|
2707
|
-
fprintf(stream, "reverse_prompt:\n");
|
|
2708
|
-
for (std::string ap : params.antiprompt) {
|
|
2709
|
-
size_t pos = 0;
|
|
2710
|
-
while ((pos = ap.find('\n', pos)) != std::string::npos) {
|
|
2711
|
-
ap.replace(pos, 1, "\\n");
|
|
2712
|
-
pos += 1;
|
|
+
+    for (size_t i = 0; i < tokens.size(); ++i) {
+        piece = llama_token_to_piece(ctx, tokens[i]);
+
+        // remove the leading space of the first non-BOS token
+        if (((tokens[0] == bos_id && i == 1) || (tokens[0] != bos_id && i == 0)) && piece[0] == ' ') {
+            piece = piece.substr(1);
         }
 
-        fprintf(stream, "  - %s\n", ap.c_str());
+        result += piece;
     }
 
-    fprintf(stream, "rope_freq_base: %f # default: 10000.0\n", params.rope_freq_base);
-    fprintf(stream, "rope_freq_scale: %f # default: 1.0\n", params.rope_freq_scale);
-    fprintf(stream, "seed: %u # default: -1 (random seed)\n", params.seed);
-    fprintf(stream, "simple_io: %s # default: false\n", params.simple_io ? "true" : "false");
-    fprintf(stream, "cont_batching: %s # default: false\n", params.cont_batching ? "true" : "false");
-    fprintf(stream, "flash_attn: %s # default: false\n", params.flash_attn ? "true" : "false");
-    fprintf(stream, "temp: %f # default: 0.8\n", sparams.temp);
+    return result;
+}
 
-    const std::vector<float> tensor_split_vector(params.tensor_split, params.tensor_split + llama_max_devices());
-    dump_vector_float_yaml(stream, "tensor_split", tensor_split_vector);
+std::string llama_detokenize_bpe(llama_context * ctx, const std::vector<llama_token> & tokens) {
+    std::string piece;
+    std::string result;
 
-    fprintf(stream, "tfs: %f # default: 1.0\n", sparams.tfs_z);
-    fprintf(stream, "threads: %d # default: %u\n", params.n_threads, std::thread::hardware_concurrency());
-    fprintf(stream, "top_k: %d # default: 40\n", sparams.top_k);
-    fprintf(stream, "top_p: %f # default: 0.95\n", sparams.top_p);
-    fprintf(stream, "min_p: %f # default: 0.0\n", sparams.min_p);
-    fprintf(stream, "typical_p: %f # default: 1.0\n", sparams.typical_p);
-    fprintf(stream, "verbose_prompt: %s # default: false\n", params.verbose_prompt ? "true" : "false");
-    fprintf(stream, "display_prompt: %s # default: true\n", params.display_prompt ? "true" : "false");
+    for (size_t i = 0; i < tokens.size(); ++i) {
+        piece = llama_token_to_piece(ctx, tokens[i]);
+
+        result += piece;
+    }
+
+    // NOTE: the original tokenizer decodes bytes after collecting the pieces.
+    return result;
+}
+
+bool llama_should_add_bos_token(const llama_model * model) {
+    const int add_bos = llama_add_bos_token(model);
+
+    return add_bos != -1 ? bool(add_bos) : (llama_vocab_type(model) == LLAMA_VOCAB_TYPE_SPM);
 }
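The hunk above finishes `llama_detokenize_spm` (stripping the leading space SentencePiece places on the first real token), adds a plain-concatenation `llama_detokenize_bpe`, and introduces `llama_should_add_bos_token`, which falls back on the vocab type when the model metadata leaves BOS behavior unspecified. A minimal sketch of how a caller might tie these together; the `roundtrip` wrapper is hypothetical, not part of the package:

```cpp
#include <string>
#include <vector>
#include "common.h"
#include "llama.h"

// Pick the detokenizer that matches the model's vocab and honor its BOS
// preference when tokenizing (sketch only; assumes a loaded context).
static std::string roundtrip(llama_context * ctx, const std::string & text) {
    const llama_model * model = llama_get_model(ctx);

    const bool add_bos = llama_should_add_bos_token(model);
    const std::vector<llama_token> toks = llama_tokenize(ctx, text, add_bos);

    return llama_vocab_type(model) == LLAMA_VOCAB_TYPE_SPM
        ? llama_detokenize_spm(ctx, toks)
        : llama_detokenize_bpe(ctx, toks);
}
```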
 
 //
 // KV cache utils
 //
 
-void dump_kv_cache_view(const llama_kv_cache_view & view, int row_size) {
+void llama_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size) {
     static const char slot_chars[] = ".123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz+";
 
     printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d",
@@ -2763,7 +2530,7 @@ void dump_kv_cache_view(const llama_kv_cache_view & view, int row_size) {
     printf("\n=== Done dumping\n");
 }
 
-void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size) {
+void llama_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size) {
     static const char slot_chars[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
 
     printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d\n",
@@ -2811,6 +2578,10 @@ void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size) {
     printf("\n=== Done dumping\n");
 }
 
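Only the names change in these two hunks; the bodies of the dump helpers are untouched. Under the new `llama_`-prefixed names, a debugging helper built on the `llama_kv_cache_view` API from llama.h might look like this (a sketch assuming a live context; `debug_dump_kv` is illustrative):

```cpp
#include "common.h"
#include "llama.h"

// Visualize KV cache occupancy, e.g. after a few decode calls with up to
// four parallel sequences in flight (sketch only).
static void debug_dump_kv(llama_context * ctx) {
    llama_kv_cache_view view = llama_kv_cache_view_init(ctx, /*n_seq_max=*/4);
    llama_kv_cache_view_update(ctx, &view);   // refresh cell/sequence counts

    llama_kv_cache_dump_view(view, 80);       // one character per cell
    llama_kv_cache_dump_view_seqs(view, 40);  // per-sequence breakdown

    llama_kv_cache_view_free(&view);
}
```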
+//
+// Embedding utils
+//
+
 void llama_embd_normalize(const float * inp, float * out, int n) {
     double sum = 0.0;
     for (int i = 0; i < n; i++) {
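The new section header groups `llama_embd_normalize` (shown in context above) with the other embedding helpers. A small usage sketch; the wrapper and the source of `raw` are assumptions:

```cpp
#include <vector>
#include "common.h"

// L2-normalize a raw embedding so that dot products act as cosine
// similarity. `raw` would typically come from llama_get_embeddings_seq().
static std::vector<float> normalize_embd(const float * raw, int n_embd) {
    std::vector<float> out(n_embd);
    llama_embd_normalize(raw, out.data(), n_embd);
    return out;
}
```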
@@ -2995,3 +2766,226 @@ llama_control_vector_data llama_control_vector_load(const std::vector<llama_cont
 
     return result;
 }
+
+//
+// YAML utils
+//
+
+void yaml_dump_vector_float(FILE * stream, const char * prop_name, const std::vector<float> & data) {
+    if (data.empty()) {
+        fprintf(stream, "%s:\n", prop_name);
+        return;
+    }
+
+    fprintf(stream, "%s: [", prop_name);
+    for (size_t i = 0; i < data.size() - 1; ++i) {
+        fprintf(stream, "%e, ", data[i]);
+    }
+    fprintf(stream, "%e]\n", data.back());
+}
+
+void yaml_dump_vector_int(FILE * stream, const char * prop_name, const std::vector<int> & data) {
+    if (data.empty()) {
+        fprintf(stream, "%s:\n", prop_name);
+        return;
+    }
+
+    fprintf(stream, "%s: [", prop_name);
+    for (size_t i = 0; i < data.size() - 1; ++i) {
+        fprintf(stream, "%d, ", data[i]);
+    }
+    fprintf(stream, "%d]\n", data.back());
+}
+
+void yaml_dump_string_multiline(FILE * stream, const char * prop_name, const char * data) {
+    std::string data_str(data == NULL ? "" : data);
+
+    if (data_str.empty()) {
+        fprintf(stream, "%s:\n", prop_name);
+        return;
+    }
+
+    size_t pos_start = 0;
+    size_t pos_found = 0;
+
+    if (std::isspace(data_str[0]) || std::isspace(data_str.back())) {
+        data_str = std::regex_replace(data_str, std::regex("\n"), "\\n");
+        data_str = std::regex_replace(data_str, std::regex("\""), "\\\"");
+        data_str = std::regex_replace(data_str, std::regex(R"(\\[^n"])"), R"(\$&)");
+        data_str = "\"" + data_str + "\"";
+        fprintf(stream, "%s: %s\n", prop_name, data_str.c_str());
+        return;
+    }
+
+    if (data_str.find('\n') == std::string::npos) {
+        fprintf(stream, "%s: %s\n", prop_name, data_str.c_str());
+        return;
+    }
+
+    fprintf(stream, "%s: |\n", prop_name);
+    while ((pos_found = data_str.find('\n', pos_start)) != std::string::npos) {
+        fprintf(stream, "  %s\n", data_str.substr(pos_start, pos_found-pos_start).c_str());
+        pos_start = pos_found + 1;
+    }
+}
+
+void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const llama_context * lctx,
+                               const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc) {
+    const llama_sampling_params & sparams = params.sparams;
+
+    fprintf(stream, "build_commit: %s\n", LLAMA_COMMIT);
+    fprintf(stream, "build_number: %d\n", LLAMA_BUILD_NUMBER);
+    fprintf(stream, "cpu_has_arm_fma: %s\n", ggml_cpu_has_arm_fma() ? "true" : "false");
+    fprintf(stream, "cpu_has_avx: %s\n", ggml_cpu_has_avx() ? "true" : "false");
+    fprintf(stream, "cpu_has_avx_vnni: %s\n", ggml_cpu_has_avx_vnni() ? "true" : "false");
+    fprintf(stream, "cpu_has_avx2: %s\n", ggml_cpu_has_avx2() ? "true" : "false");
+    fprintf(stream, "cpu_has_avx512: %s\n", ggml_cpu_has_avx512() ? "true" : "false");
+    fprintf(stream, "cpu_has_avx512_vbmi: %s\n", ggml_cpu_has_avx512_vbmi() ? "true" : "false");
+    fprintf(stream, "cpu_has_avx512_vnni: %s\n", ggml_cpu_has_avx512_vnni() ? "true" : "false");
+    fprintf(stream, "cpu_has_cuda: %s\n", ggml_cpu_has_cuda() ? "true" : "false");
+    fprintf(stream, "cpu_has_vulkan: %s\n", ggml_cpu_has_vulkan() ? "true" : "false");
+    fprintf(stream, "cpu_has_clblast: %s\n", ggml_cpu_has_clblast() ? "true" : "false");
+    fprintf(stream, "cpu_has_kompute: %s\n", ggml_cpu_has_kompute() ? "true" : "false");
+    fprintf(stream, "cpu_has_fma: %s\n", ggml_cpu_has_fma() ? "true" : "false");
+    fprintf(stream, "cpu_has_gpublas: %s\n", ggml_cpu_has_gpublas() ? "true" : "false");
+    fprintf(stream, "cpu_has_neon: %s\n", ggml_cpu_has_neon() ? "true" : "false");
+    fprintf(stream, "cpu_has_sve: %s\n", ggml_cpu_has_sve() ? "true" : "false");
+    fprintf(stream, "cpu_has_f16c: %s\n", ggml_cpu_has_f16c() ? "true" : "false");
+    fprintf(stream, "cpu_has_fp16_va: %s\n", ggml_cpu_has_fp16_va() ? "true" : "false");
+    fprintf(stream, "cpu_has_wasm_simd: %s\n", ggml_cpu_has_wasm_simd() ? "true" : "false");
+    fprintf(stream, "cpu_has_blas: %s\n", ggml_cpu_has_blas() ? "true" : "false");
+    fprintf(stream, "cpu_has_sse3: %s\n", ggml_cpu_has_sse3() ? "true" : "false");
+    fprintf(stream, "cpu_has_vsx: %s\n", ggml_cpu_has_vsx() ? "true" : "false");
+    fprintf(stream, "cpu_has_matmul_int8: %s\n", ggml_cpu_has_matmul_int8() ? "true" : "false");
+
+#ifdef NDEBUG
+    fprintf(stream, "debug: false\n");
+#else
+    fprintf(stream, "debug: true\n");
+#endif // NDEBUG
+
+    fprintf(stream, "model_desc: %s\n", model_desc);
+    fprintf(stream, "n_vocab: %d # output size of the final layer, 32001 for some models\n", llama_n_vocab(llama_get_model(lctx)));
+
+#ifdef __OPTIMIZE__
+    fprintf(stream, "optimize: true\n");
+#else
+    fprintf(stream, "optimize: false\n");
+#endif // __OPTIMIZE__
+
+    fprintf(stream, "time: %s\n", timestamp.c_str());
+
+    fprintf(stream, "\n");
+    fprintf(stream, "###############\n");
+    fprintf(stream, "# User Inputs #\n");
+    fprintf(stream, "###############\n");
+    fprintf(stream, "\n");
+
+    fprintf(stream, "alias: %s # default: unknown\n", params.model_alias.c_str());
+    fprintf(stream, "batch_size: %d # default: 512\n", params.n_batch);
+    yaml_dump_string_multiline(stream, "cfg_negative_prompt", sparams.cfg_negative_prompt.c_str());
+    fprintf(stream, "cfg_scale: %f # default: 1.0\n", sparams.cfg_scale);
+    fprintf(stream, "chunks: %d # default: -1 (unlimited)\n", params.n_chunks);
+    fprintf(stream, "color: %s # default: false\n", params.use_color ? "true" : "false");
+    fprintf(stream, "ctx_size: %d # default: 512\n", params.n_ctx);
+    fprintf(stream, "escape: %s # default: false\n", params.escape ? "true" : "false");
+    fprintf(stream, "file: # never logged, see prompt instead. Can still be specified for input.\n");
+    fprintf(stream, "frequency_penalty: %f # default: 0.0 \n", sparams.penalty_freq);
+    yaml_dump_string_multiline(stream, "grammar", sparams.grammar.c_str());
+    fprintf(stream, "grammar-file: # never logged, see grammar instead. Can still be specified for input.\n");
+    fprintf(stream, "hellaswag: %s # default: false\n", params.hellaswag ? "true" : "false");
+    fprintf(stream, "hellaswag_tasks: %zu # default: 400\n", params.hellaswag_tasks);
+
+    const auto logit_bias_eos = sparams.logit_bias.find(llama_token_eos(llama_get_model(lctx)));
+    const bool ignore_eos = logit_bias_eos != sparams.logit_bias.end() && logit_bias_eos->second == -INFINITY;
+    fprintf(stream, "ignore_eos: %s # default: false\n", ignore_eos ? "true" : "false");
+
+    yaml_dump_string_multiline(stream, "in_prefix", params.input_prefix.c_str());
+    fprintf(stream, "in_prefix_bos: %s # default: false\n", params.input_prefix_bos ? "true" : "false");
+    yaml_dump_string_multiline(stream, "in_suffix", params.input_prefix.c_str());
+    fprintf(stream, "instruct: %s # default: false\n", params.instruct ? "true" : "false");
+    fprintf(stream, "interactive: %s # default: false\n", params.interactive ? "true" : "false");
+    fprintf(stream, "interactive_specials: %s # default: false\n", params.interactive_specials ? "true" : "false");
+    fprintf(stream, "interactive_first: %s # default: false\n", params.interactive_first ? "true" : "false");
+    fprintf(stream, "keep: %d # default: 0\n", params.n_keep);
+    fprintf(stream, "logdir: %s # default: unset (no logging)\n", params.logdir.c_str());
+
+    fprintf(stream, "logit_bias:\n");
+    for (std::pair<llama_token, float> lb : sparams.logit_bias) {
+        if (ignore_eos && lb.first == logit_bias_eos->first) {
+            continue;
+        }
+        fprintf(stream, "  %d: %f", lb.first, lb.second);
+    }
+
+    fprintf(stream, "lora:\n");
+    for (std::tuple<std::string, float> la : params.lora_adapter) {
+        if (std::get<1>(la) != 1.0f) {
+            continue;
+        }
+        fprintf(stream, "  - %s\n", std::get<0>(la).c_str());
+    }
+    fprintf(stream, "lora_scaled:\n");
+    for (std::tuple<std::string, float> la : params.lora_adapter) {
+        if (std::get<1>(la) == 1.0f) {
+            continue;
+        }
+        fprintf(stream, "  - %s: %f\n", std::get<0>(la).c_str(), std::get<1>(la));
+    }
+    fprintf(stream, "lora_base: %s\n", params.lora_base.c_str());
+    fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu);
+    fprintf(stream, "min_keep: %d # default: 0 (disabled)\n", sparams.min_keep);
+    fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", sparams.mirostat);
+    fprintf(stream, "mirostat_ent: %f # default: 5.0\n", sparams.mirostat_tau);
+    fprintf(stream, "mirostat_lr: %f # default: 0.1\n", sparams.mirostat_eta);
+    fprintf(stream, "mlock: %s # default: false\n", params.use_mlock ? "true" : "false");
+    fprintf(stream, "model: %s # default: %s\n", params.model.c_str(), DEFAULT_MODEL_PATH);
+    fprintf(stream, "model_draft: %s # default:\n", params.model_draft.c_str());
+    fprintf(stream, "multiline_input: %s # default: false\n", params.multiline_input ? "true" : "false");
+    fprintf(stream, "n_gpu_layers: %d # default: -1\n", params.n_gpu_layers);
+    fprintf(stream, "n_predict: %d # default: -1 (unlimited)\n", params.n_predict);
+    fprintf(stream, "n_probs: %d # only used by server binary, default: 0\n", sparams.n_probs);
+    fprintf(stream, "no_mmap: %s # default: false\n", !params.use_mmap ? "true" : "false");
+    fprintf(stream, "penalize_nl: %s # default: false\n", sparams.penalize_nl ? "true" : "false");
+    fprintf(stream, "ppl_output_type: %d # default: 0\n", params.ppl_output_type);
+    fprintf(stream, "ppl_stride: %d # default: 0\n", params.ppl_stride);
+    fprintf(stream, "presence_penalty: %f # default: 0.0\n", sparams.penalty_present);
+    yaml_dump_string_multiline(stream, "prompt", params.prompt.c_str());
+    fprintf(stream, "prompt_cache: %s\n", params.path_prompt_cache.c_str());
+    fprintf(stream, "prompt_cache_all: %s # default: false\n", params.prompt_cache_all ? "true" : "false");
+    fprintf(stream, "prompt_cache_ro: %s # default: false\n", params.prompt_cache_ro ? "true" : "false");
+    yaml_dump_vector_int(stream, "prompt_tokens", prompt_tokens);
+    fprintf(stream, "random_prompt: %s # default: false\n", params.random_prompt ? "true" : "false");
+    fprintf(stream, "repeat_penalty: %f # default: 1.1\n", sparams.penalty_repeat);
+
+    fprintf(stream, "reverse_prompt:\n");
+    for (std::string ap : params.antiprompt) {
+        size_t pos = 0;
+        while ((pos = ap.find('\n', pos)) != std::string::npos) {
+            ap.replace(pos, 1, "\\n");
+            pos += 1;
+        }
+
+        fprintf(stream, "  - %s\n", ap.c_str());
+    }
+
+    fprintf(stream, "rope_freq_base: %f # default: 10000.0\n", params.rope_freq_base);
+    fprintf(stream, "rope_freq_scale: %f # default: 1.0\n", params.rope_freq_scale);
+    fprintf(stream, "seed: %u # default: -1 (random seed)\n", params.seed);
+    fprintf(stream, "simple_io: %s # default: false\n", params.simple_io ? "true" : "false");
+    fprintf(stream, "cont_batching: %s # default: false\n", params.cont_batching ? "true" : "false");
+    fprintf(stream, "flash_attn: %s # default: false\n", params.flash_attn ? "true" : "false");
+    fprintf(stream, "temp: %f # default: 0.8\n", sparams.temp);
+
+    const std::vector<float> tensor_split_vector(params.tensor_split, params.tensor_split + llama_max_devices());
+    yaml_dump_vector_float(stream, "tensor_split", tensor_split_vector);
+
+    fprintf(stream, "tfs: %f # default: 1.0\n", sparams.tfs_z);
+    fprintf(stream, "threads: %d # default: %u\n", params.n_threads, std::thread::hardware_concurrency());
+    fprintf(stream, "top_k: %d # default: 40\n", sparams.top_k);
+    fprintf(stream, "top_p: %f # default: 0.95\n", sparams.top_p);
+    fprintf(stream, "min_p: %f # default: 0.0\n", sparams.min_p);
+    fprintf(stream, "typical_p: %f # default: 1.0\n", sparams.typical_p);
+    fprintf(stream, "verbose_prompt: %s # default: false\n", params.verbose_prompt ? "true" : "false");
+    fprintf(stream, "display_prompt: %s # default: true\n", params.display_prompt ? "true" : "false");
+}
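For reference, the relocated helpers emit the same YAML their old `dump_*_yaml` names did. A short illustration of the output format (values are made up; note that the block-literal branch of `yaml_dump_string_multiline` only emits newline-terminated segments, hence the trailing `\n` on the sample string):

```cpp
#include <cstdio>
#include <vector>
#include "common.h"

int main() {
    const std::vector<int> toks = {1, 15043, 3186};
    yaml_dump_vector_int(stdout, "prompt_tokens", toks);
    // prompt_tokens: [1, 15043, 3186]

    yaml_dump_string_multiline(stdout, "prompt", "Hello\nworld\n");
    // prompt: |
    //   Hello
    //   world
    return 0;
}
```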