@fugood/llama.node 0.2.1 → 0.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/darwin/arm64/default.metallib +0 -0
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/default.metallib +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/package.json +1 -1
- package/src/LlamaContext.cpp +2 -2
- package/src/llama.cpp/CMakeLists.txt +72 -46
- package/src/llama.cpp/cmake/arm64-windows-llvm.cmake +16 -0
- package/src/llama.cpp/cmake/arm64-windows-msvc.cmake +6 -0
- package/src/llama.cpp/common/common.cpp +732 -752
- package/src/llama.cpp/common/common.h +47 -41
- package/src/llama.cpp/common/grammar-parser.cpp +1 -1
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +6 -6
- package/src/llama.cpp/common/log.h +5 -5
- package/src/llama.cpp/common/sampling.cpp +89 -7
- package/src/llama.cpp/common/sampling.h +5 -0
- package/src/llama.cpp/common/train.cpp +2 -2
- package/src/llama.cpp/examples/batched/batched.cpp +1 -1
- package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +1 -1
- package/src/llama.cpp/examples/embedding/embedding.cpp +3 -2
- package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +2 -2
- package/src/llama.cpp/examples/finetune/finetune.cpp +4 -3
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +2 -2
- package/src/llama.cpp/examples/infill/infill.cpp +8 -8
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +2 -2
- package/src/llama.cpp/examples/llama.android/llama/CMakeLists.txt +13 -8
- package/src/llama.cpp/examples/llava/clip.h +1 -1
- package/src/llama.cpp/examples/llava/llava-cli.cpp +1 -1
- package/src/llama.cpp/examples/llava/llava.cpp +0 -15
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +1 -1
- package/src/llama.cpp/examples/lookup/lookup.cpp +1 -1
- package/src/llama.cpp/examples/main/main.cpp +24 -16
- package/src/llama.cpp/examples/parallel/parallel.cpp +1 -1
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +9 -9
- package/src/llama.cpp/examples/quantize/quantize.cpp +2 -2
- package/src/llama.cpp/examples/retrieval/retrieval.cpp +2 -2
- package/src/llama.cpp/examples/rpc/rpc-server.cpp +78 -14
- package/src/llama.cpp/examples/server/server.cpp +21 -9
- package/src/llama.cpp/examples/tokenize/tokenize.cpp +359 -9
- package/src/llama.cpp/examples/train-text-from-scratch/train-text-from-scratch.cpp +4 -3
- package/src/llama.cpp/ggml-backend.c +0 -1
- package/src/llama.cpp/ggml-common.h +0 -54
- package/src/llama.cpp/ggml-cuda.h +1 -0
- package/src/llama.cpp/ggml-impl.h +51 -0
- package/src/llama.cpp/ggml-kompute.cpp +4 -0
- package/src/llama.cpp/ggml-opencl.cpp +4 -1
- package/src/llama.cpp/ggml-quants.c +3700 -2041
- package/src/llama.cpp/ggml-rpc.cpp +188 -56
- package/src/llama.cpp/ggml-sycl.cpp +99 -530
- package/src/llama.cpp/ggml-vulkan-shaders.hpp +9351 -5627
- package/src/llama.cpp/ggml-vulkan.cpp +202 -225
- package/src/llama.cpp/ggml.c +1034 -1154
- package/src/llama.cpp/ggml.h +59 -31
- package/src/llama.cpp/llama.cpp +859 -609
- package/src/llama.cpp/llama.h +19 -6
- package/src/llama.cpp/requirements.txt +0 -1
- package/src/llama.cpp/tests/test-backend-ops.cpp +113 -47
- package/src/llama.cpp/tests/test-chat-template.cpp +16 -4
- package/src/llama.cpp/tests/test-grad0.cpp +43 -83
- package/src/llama.cpp/unicode-data.cpp +6969 -2169
- package/src/llama.cpp/unicode-data.h +15 -12
- package/src/llama.cpp/unicode.cpp +89 -111
- package/src/llama.cpp/unicode.h +44 -12
- package/src/llama.cpp/build.zig +0 -172
- package/src/llama.cpp/ggml-mpi.c +0 -216
- package/src/llama.cpp/ggml-mpi.h +0 -39
- package/src/llama.cpp/requirements/requirements-convert-persimmon-to-gguf.txt +0 -2
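The binary and build-file churn above is routine for a vendored llama.cpp bump; the one file whose diff is rendered below is common/common.cpp (+732 -752). That refactor regroups the shared helpers under section banners (CPU utils, CLI argument parsing, String utils, Filesystem utils, Model utils, Batch utils) and renames them with matching prefixes. As a reading aid, here is the rename mapping collected from the hunks below (comment-only summary, not part of the diff itself):

```cpp
// Renames in common/common.cpp, old -> new, per the diff below:
//   get_num_physical_cores()        -> cpu_get_num_physical_cores()
//   count_math_cpus()               -> cpu_count_math_cpus()
//   get_math_cpu_count()            -> cpu_get_num_math()
//   process_escapes()               -> string_process_escapes()
//   parse_kv_override()             -> string_parse_kv_override()
//   gpt_random_prompt()             -> string_random_prompt()
//   get_sortable_timestamp()        -> string_get_sortable_timestamp()
//   validate_file_name()            -> fs_validate_filename()
//   create_directory_with_parents() -> fs_create_directory_with_parents()
//   gpt_print_usage()               -> gpt_params_print_usage()
//   get_system_info()               -> gpt_params_get_system_info()
//   sampler_types_from_names()      -> llama_sampling_types_from_names()
//   sampler_types_from_chars()      -> llama_sampling_types_from_chars()
//   sampler_type_to_name_string()   -> llama_sampling_type_to_str()
```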
package/src/llama.cpp/common/common.cpp

@@ -73,7 +73,11 @@
 
 using json = nlohmann::ordered_json;
 
-int32_t get_num_physical_cores() {
+//
+// CPU utils
+//
+
+int32_t cpu_get_num_physical_cores() {
 #ifdef __linux__
     // enumerate the set of thread siblings, num entries is num cores
     std::unordered_set<std::string> siblings;
@@ -142,9 +146,9 @@ static bool is_running_on_efficiency_core(void) {
     return core_type == intel_atom;
 }
 
-static int count_math_cpus(int cpu_count) {
+static int cpu_count_math_cpus(int n_cpu) {
     int result = 0;
-    for (int cpu = 0; cpu < cpu_count; ++cpu) {
+    for (int cpu = 0; cpu < n_cpu; ++cpu) {
         if (pin_cpu(cpu)) {
             return -1;
         }
@@ -162,16 +166,16 @@ static int count_math_cpus(int cpu_count) {
 /**
  * Returns number of CPUs on system that are useful for math.
  */
-int get_math_cpu_count() {
+int32_t cpu_get_num_math() {
 #if defined(__x86_64__) && defined(__linux__) && !defined(__ANDROID__)
-    int cpu_count = sysconf(_SC_NPROCESSORS_ONLN);
-    if (cpu_count < 1) {
-        return get_num_physical_cores();
+    int n_cpu = sysconf(_SC_NPROCESSORS_ONLN);
+    if (n_cpu < 1) {
+        return cpu_get_num_physical_cores();
     }
     if (is_hybrid_cpu()) {
         cpu_set_t affinity;
         if (!pthread_getaffinity_np(pthread_self(), sizeof(affinity), &affinity)) {
-            int result = count_math_cpus(cpu_count);
+            int result = cpu_count_math_cpus(n_cpu);
             pthread_setaffinity_np(pthread_self(), sizeof(affinity), &affinity);
             if (result > 0) {
                 return result;
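The renamed CPU helpers keep their old behavior: on hybrid x86 Linux systems `cpu_get_num_math` counts the performance cores useful for math, and everywhere else it falls back to `cpu_get_num_physical_cores`. A minimal caller sketch (assuming these declarations are exported via common.h, as the rest of the diff suggests):

```cpp
#include <cstdio>
#include "common.h"  // assumption: declares cpu_get_num_math()

int main() {
    // Counts performance cores on hybrid x86 Linux; falls back to
    // cpu_get_num_physical_cores() elsewhere, per the hunks above.
    const int32_t n_threads = cpu_get_num_math();
    printf("using %d threads for math-heavy work\n", n_threads);
    return 0;
}
```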
@@ -179,108 +183,103 @@ int get_math_cpu_count() {
         }
     }
 #endif
-    return get_num_physical_cores();
+    return cpu_get_num_physical_cores();
 }
 
-void process_escapes(std::string & input) {
-    std::size_t input_len = input.length();
-    std::size_t output_idx = 0;
+//
+// CLI argument parsing
+//
 
-    for (std::size_t input_idx = 0; input_idx < input_len; ++input_idx) {
-        if (input[input_idx] == '\\' && input_idx + 1 < input_len) {
-            switch (input[++input_idx]) {
-                case 'n':  input[output_idx++] = '\n'; break;
-                case 'r':  input[output_idx++] = '\r'; break;
-                case 't':  input[output_idx++] = '\t'; break;
-                case '\'': input[output_idx++] = '\''; break;
-                case '\"': input[output_idx++] = '\"'; break;
-                case '\\': input[output_idx++] = '\\'; break;
-                case 'x':
-                    // Handle \x12, etc
-                    if (input_idx + 2 < input_len) {
-                        const char x[3] = { input[input_idx + 1], input[input_idx + 2], 0 };
-                        char *err_p = nullptr;
-                        const long val = std::strtol(x, &err_p, 16);
-                        if (err_p == x + 2) {
-                            input_idx += 2;
-                            input[output_idx++] = char(val);
-                            break;
-                        }
-                    }
-                    // fall through
-                default:   input[output_idx++] = '\\';
-                           input[output_idx++] = input[input_idx]; break;
+void gpt_params_handle_model_default(gpt_params & params) {
+    if (!params.hf_repo.empty()) {
+        // short-hand to avoid specifying --hf-file -> default it to --model
+        if (params.hf_file.empty()) {
+            if (params.model.empty()) {
+                throw std::invalid_argument("error: --hf-repo requires either --hf-file or --model\n");
             }
-        } else {
-            input[output_idx++] = input[input_idx];
+            params.hf_file = params.model;
+        } else if (params.model.empty()) {
+            std::string cache_directory = fs_get_cache_directory();
+            const bool success = fs_create_directory_with_parents(cache_directory);
+            if (!success) {
+                throw std::runtime_error("failed to create cache directory: " + cache_directory);
+            }
+            params.model = cache_directory + string_split(params.hf_file, '/').back();
+        }
+    } else if (!params.model_url.empty()) {
+        if (params.model.empty()) {
+            auto f = string_split(params.model_url, '#').front();
+            f = string_split(f, '?').front();
+            f = string_split(f, '/').back();
+            params.model = "models/" + f;
         }
+    } else if (params.model.empty()) {
+        params.model = DEFAULT_MODEL_PATH;
     }
+}
 
-    input.resize(output_idx);
+bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
+    bool invalid_param = false;
+    std::string arg;
+    const std::string arg_prefix = "--";
+    llama_sampling_params & sparams = params.sparams;
+
+    for (int i = 1; i < argc; i++) {
+        arg = argv[i];
+        if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
+            std::replace(arg.begin(), arg.end(), '_', '-');
+        }
+        if (!gpt_params_find_arg(argc, argv, arg, params, i, invalid_param)) {
+            throw std::invalid_argument("error: unknown argument: " + arg);
+        }
+        if (invalid_param) {
+            throw std::invalid_argument("error: invalid parameter for argument: " + arg);
+        }
+    }
+
+    if (params.prompt_cache_all &&
+            (params.interactive || params.interactive_first ||
+             params.instruct)) {
+
+        throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n");
+    }
+
+    gpt_params_handle_model_default(params);
+
+    if (params.escape) {
+        string_process_escapes(params.prompt);
+        string_process_escapes(params.input_prefix);
+        string_process_escapes(params.input_suffix);
+        string_process_escapes(sparams.cfg_negative_prompt);
+        for (auto & antiprompt : params.antiprompt) {
+            string_process_escapes(antiprompt);
+        }
+    }
+
+    if (!params.kv_overrides.empty()) {
+        params.kv_overrides.emplace_back();
+        params.kv_overrides.back().key[0] = 0;
+    }
+
+    return true;
 }
 
 bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
     bool result = true;
     try {
         if (!gpt_params_parse_ex(argc, argv, params)) {
-            gpt_print_usage(argc, argv, gpt_params());
+            gpt_params_print_usage(argc, argv, gpt_params());
             exit(0);
         }
     }
     catch (const std::invalid_argument & ex) {
         fprintf(stderr, "%s\n", ex.what());
-        gpt_print_usage(argc, argv, gpt_params());
+        gpt_params_print_usage(argc, argv, gpt_params());
         exit(1);
     }
     return result;
 }
 
-bool parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides) {
-    const char * sep = strchr(data, '=');
-    if (sep == nullptr || sep - data >= 128) {
-        fprintf(stderr, "%s: malformed KV override '%s'\n", __func__, data);
-        return false;
-    }
-    llama_model_kv_override kvo;
-    std::strncpy(kvo.key, data, sep - data);
-    kvo.key[sep - data] = 0;
-    sep++;
-    if (strncmp(sep, "int:", 4) == 0) {
-        sep += 4;
-        kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT;
-        kvo.val_i64 = std::atol(sep);
-    } else if (strncmp(sep, "float:", 6) == 0) {
-        sep += 6;
-        kvo.tag = LLAMA_KV_OVERRIDE_TYPE_FLOAT;
-        kvo.val_f64 = std::atof(sep);
-    } else if (strncmp(sep, "bool:", 5) == 0) {
-        sep += 5;
-        kvo.tag = LLAMA_KV_OVERRIDE_TYPE_BOOL;
-        if (std::strcmp(sep, "true") == 0) {
-            kvo.val_bool = true;
-        } else if (std::strcmp(sep, "false") == 0) {
-            kvo.val_bool = false;
-        } else {
-            fprintf(stderr, "%s: invalid boolean value for KV override '%s'\n", __func__, data);
-            return false;
-        }
-    } else if (strncmp(sep, "str:", 4) == 0) {
-        sep += 4;
-        kvo.tag = LLAMA_KV_OVERRIDE_TYPE_STR;
-        if (strlen(sep) > 127) {
-            fprintf(stderr, "%s: malformed KV override '%s', value cannot exceed 127 chars\n", __func__, data);
-            return false;
-        }
-        strncpy(kvo.val_str, sep, 127);
-        kvo.val_str[127] = '\0';
-    } else {
-        fprintf(stderr, "%s: invalid type for KV override '%s'\n", __func__, data);
-        return false;
-    }
-    overrides.emplace_back(std::move(kvo));
-    return true;
-}
-
 bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param) {
     llama_sampling_params & sparams = params.sparams;
 
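The relocated `gpt_params_handle_model_default` above now also knows about the llama.cpp cache directory: with `--hf-repo` plus `--hf-file` and no `--model`, the model path defaults into `fs_get_cache_directory()` (defined later in this diff) instead of the old hard-coded `models/` prefix. A hedged sketch of the resulting behavior (the repo and file names are hypothetical, and the function is assumed to be declared in common.h):

```cpp
#include "common.h"  // assumption: declares gpt_params and gpt_params_handle_model_default()

int main() {
    gpt_params params;
    params.hf_repo = "TheOrg/some-model-GGUF";  // hypothetical repo
    params.hf_file = "model.Q4_K_M.gguf";       // hypothetical file
    params.model.clear();

    gpt_params_handle_model_default(params);
    // params.model is now "<fs_get_cache_directory()>model.Q4_K_M.gguf";
    // with no repo/URL/model given at all, it falls back to DEFAULT_MODEL_PATH.
    return 0;
}
```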
@@ -546,7 +545,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param) {
             return true;
         }
         const auto sampler_names = string_split(argv[i], ';');
-        sparams.samplers_sequence = sampler_types_from_names(sampler_names, true);
+        sparams.samplers_sequence = llama_sampling_types_from_names(sampler_names, true);
         return true;
     }
     if (arg == "--sampling-seq") {
@@ -554,7 +553,7 @@
             invalid_param = true;
             return true;
         }
-        sparams.samplers_sequence = sampler_types_from_chars(argv[i]);
+        sparams.samplers_sequence = llama_sampling_types_from_chars(argv[i]);
         return true;
     }
     if (arg == "--top-p") {
@@ -905,6 +904,10 @@
         params.interactive_specials = true;
         return true;
     }
+    if (arg == "--special") {
+        params.special = true;
+        return true;
+    }
     if (arg == "--embedding") {
         params.embedding = true;
         return true;
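The hunk above adds the new `--special` flag alongside the existing `--interactive-specials`: it only sets `params.special`, and the matching help line added later in this diff describes it as "special tokens output enabled" (presumably consumed by examples/main, which also changes in this release).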
@@ -1240,7 +1243,7 @@
         return true;
     }
     if (arg == "-h" || arg == "--help") {
-        gpt_print_usage(argc, argv, gpt_params());
+        gpt_params_print_usage(argc, argv, gpt_params());
         exit(0);
     }
     if (arg == "--version") {
@@ -1311,7 +1314,7 @@
             invalid_param = true;
             return true;
         }
-        if (!parse_kv_override(argv[i], params.kv_overrides)) {
+        if (!string_parse_kv_override(argv[i], params.kv_overrides)) {
             fprintf(stderr, "error: Invalid type for KV override: %s\n", argv[i]);
             invalid_param = true;
             return true;
@@ -1345,83 +1348,14 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param) {
     return false;
 }
 
-void gpt_params_handle_model_default(gpt_params & params) {
-    if (!params.hf_repo.empty()) {
-        // short-hand to avoid specifying --hf-file -> default it to --model
-        if (params.hf_file.empty()) {
-            if (params.model.empty()) {
-                throw std::invalid_argument("error: --hf-repo requires either --hf-file or --model\n");
-            }
-            params.hf_file = params.model;
-        } else if (params.model.empty()) {
-            params.model = "models/" + string_split(params.hf_file, '/').back();
-        }
-    } else if (!params.model_url.empty()) {
-        if (params.model.empty()) {
-            auto f = string_split(params.model_url, '#').front();
-            f = string_split(f, '?').front();
-            f = string_split(f, '/').back();
-            params.model = "models/" + f;
-        }
-    } else if (params.model.empty()) {
-        params.model = DEFAULT_MODEL_PATH;
-    }
-}
-
-bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
-    bool invalid_param = false;
-    std::string arg;
-    const std::string arg_prefix = "--";
-    llama_sampling_params & sparams = params.sparams;
-
-    for (int i = 1; i < argc; i++) {
-        arg = argv[i];
-        if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
-            std::replace(arg.begin(), arg.end(), '_', '-');
-        }
-        if (!gpt_params_find_arg(argc, argv, arg, params, i, invalid_param)) {
-            throw std::invalid_argument("error: unknown argument: " + arg);
-        }
-        if (invalid_param) {
-            throw std::invalid_argument("error: invalid parameter for argument: " + arg);
-        }
-    }
-
-    if (params.prompt_cache_all &&
-            (params.interactive || params.interactive_first ||
-             params.instruct)) {
-
-        throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n");
-    }
-
-    gpt_params_handle_model_default(params);
-
-    if (params.escape) {
-        process_escapes(params.prompt);
-        process_escapes(params.input_prefix);
-        process_escapes(params.input_suffix);
-        process_escapes(sparams.cfg_negative_prompt);
-        for (auto & antiprompt : params.antiprompt) {
-            process_escapes(antiprompt);
-        }
-    }
-
-    if (!params.kv_overrides.empty()) {
-        params.kv_overrides.emplace_back();
-        params.kv_overrides.back().key[0] = 0;
-    }
-
-    return true;
-}
-
-void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
+void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     const llama_sampling_params & sparams = params.sparams;
 
     std::string sampler_type_chars;
     std::string sampler_type_names;
     for (const auto sampler_type : sparams.samplers_sequence) {
         sampler_type_chars += static_cast<char>(sampler_type);
-        sampler_type_names += sampler_type_to_name_string(sampler_type) + ";";
+        sampler_type_names += llama_sampling_type_to_str(sampler_type) + ";";
     }
     sampler_type_names.pop_back();
 
@@ -1432,6 +1366,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf("  -h, --help            show this help message and exit\n");
     printf("  --version             show version and build info\n");
     printf("  -i, --interactive     run in interactive mode\n");
+    printf("  --special             special tokens output enabled\n");
     printf("  --interactive-specials allow special tokens in user text, in interactive mode\n");
     printf("  --interactive-first   run in interactive mode and wait for input right away\n");
     printf("  -cnv, --conversation  run in conversation mode (does not print special tokens and suffix/prefix)\n");
@@ -1618,7 +1553,7 @@
 #endif // LOG_DISABLE_LOGS
 }
 
-std::string get_system_info(const gpt_params & params) {
+std::string gpt_params_get_system_info(const gpt_params & params) {
     std::ostringstream os;
 
     os << "system_info: n_threads = " << params.n_threads;
@@ -1630,7 +1565,52 @@ std::string get_system_info(const gpt_params & params) {
     return os.str();
 }
 
-std::string gpt_random_prompt(std::mt19937 & rng) {
+//
+// String utils
+//
+
+std::vector<std::string> string_split(std::string input, char separator) {
+    std::vector<std::string> parts;
+    size_t separator_pos = input.find(separator);
+    while (separator_pos != std::string::npos) {
+        std::string part = input.substr(0, separator_pos);
+        parts.emplace_back(part);
+        input = input.substr(separator_pos + 1);
+        separator_pos = input.find(separator);
+    }
+    parts.emplace_back(input);
+    return parts;
+}
+
+std::string string_strip(const std::string & str) {
+    size_t start = 0;
+    size_t end = str.size();
+    while (start < end && std::isspace(str[start])) {
+        start++;
+    }
+    while (end > start && std::isspace(str[end - 1])) {
+        end--;
+    }
+    return str.substr(start, end - start);
+}
+
+std::string string_get_sortable_timestamp() {
+    using clock = std::chrono::system_clock;
+
+    const clock::time_point current_time = clock::now();
+    const time_t as_time_t = clock::to_time_t(current_time);
+    char timestamp_no_ns[100];
+    std::strftime(timestamp_no_ns, 100, "%Y_%m_%d-%H_%M_%S", std::localtime(&as_time_t));
+
+    const int64_t ns = std::chrono::duration_cast<std::chrono::nanoseconds>(
+        current_time.time_since_epoch() % 1000000000).count();
+    char timestamp_ns[11];
+    snprintf(timestamp_ns, 11, "%09" PRId64, ns);
+
+    return std::string(timestamp_no_ns) + "." + std::string(timestamp_ns);
+}
+
+std::string string_random_prompt(std::mt19937 & rng) {
     const int r = rng() % 10;
     switch (r) {
         case 0: return "So";
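The relocated `string_split` keeps empty fields and always returns at least one element (the trailing remainder), which is what lets the model-path code above take `.back()` safely. A standalone sketch mirroring the implementation, runnable without the rest of common.cpp:

```cpp
#include <iostream>
#include <string>
#include <vector>

// Mirrors the string_split shown above: split on every separator and keep
// empty parts, so the result has (number of separators + 1) elements.
static std::vector<std::string> split(std::string input, char separator) {
    std::vector<std::string> parts;
    size_t pos = input.find(separator);
    while (pos != std::string::npos) {
        parts.emplace_back(input.substr(0, pos));
        input = input.substr(pos + 1);
        pos = input.find(separator);
    }
    parts.emplace_back(input);
    return parts;
}

int main() {
    for (const auto & p : split("top_k;top_p;;min_p", ';')) {
        std::cout << "[" << p << "]\n";  // prints [top_k] [top_p] [] [min_p]
    }
    return 0;
}
```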
@@ -1648,17 +1628,104 @@ std::string gpt_random_prompt(std::mt19937 & rng) {
     GGML_UNREACHABLE();
 }
 
-// Validate if a filename is safe to use
-// To validate a full path, split the path by the OS-specific path separator, and validate each part with this function
-bool validate_file_name(const std::string & filename) {
-    if (!filename.length()) {
-        // Empty filename invalid
-        return false;
-    }
-    if (filename.length() > 255) {
-        // Limit at common largest possible filename on Linux filesystems
-        // to avoid unnecessary further validation
-        // (On systems with smaller limits it will be caught by the OS)
+void string_process_escapes(std::string & input) {
+    std::size_t input_len = input.length();
+    std::size_t output_idx = 0;
+
+    for (std::size_t input_idx = 0; input_idx < input_len; ++input_idx) {
+        if (input[input_idx] == '\\' && input_idx + 1 < input_len) {
+            switch (input[++input_idx]) {
+                case 'n':  input[output_idx++] = '\n'; break;
+                case 'r':  input[output_idx++] = '\r'; break;
+                case 't':  input[output_idx++] = '\t'; break;
+                case '\'': input[output_idx++] = '\''; break;
+                case '\"': input[output_idx++] = '\"'; break;
+                case '\\': input[output_idx++] = '\\'; break;
+                case 'x':
+                    // Handle \x12, etc
+                    if (input_idx + 2 < input_len) {
+                        const char x[3] = { input[input_idx + 1], input[input_idx + 2], 0 };
+                        char *err_p = nullptr;
+                        const long val = std::strtol(x, &err_p, 16);
+                        if (err_p == x + 2) {
+                            input_idx += 2;
+                            input[output_idx++] = char(val);
+                            break;
+                        }
+                    }
+                    // fall through
+                default:   input[output_idx++] = '\\';
+                           input[output_idx++] = input[input_idx]; break;
+            }
+        } else {
+            input[output_idx++] = input[input_idx];
+        }
+    }
+
+    input.resize(output_idx);
+}
+
+bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides) {
+    const char * sep = strchr(data, '=');
+    if (sep == nullptr || sep - data >= 128) {
+        fprintf(stderr, "%s: malformed KV override '%s'\n", __func__, data);
+        return false;
+    }
+    llama_model_kv_override kvo;
+    std::strncpy(kvo.key, data, sep - data);
+    kvo.key[sep - data] = 0;
+    sep++;
+    if (strncmp(sep, "int:", 4) == 0) {
+        sep += 4;
+        kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT;
+        kvo.val_i64 = std::atol(sep);
+    } else if (strncmp(sep, "float:", 6) == 0) {
+        sep += 6;
+        kvo.tag = LLAMA_KV_OVERRIDE_TYPE_FLOAT;
+        kvo.val_f64 = std::atof(sep);
+    } else if (strncmp(sep, "bool:", 5) == 0) {
+        sep += 5;
+        kvo.tag = LLAMA_KV_OVERRIDE_TYPE_BOOL;
+        if (std::strcmp(sep, "true") == 0) {
+            kvo.val_bool = true;
+        } else if (std::strcmp(sep, "false") == 0) {
+            kvo.val_bool = false;
+        } else {
+            fprintf(stderr, "%s: invalid boolean value for KV override '%s'\n", __func__, data);
+            return false;
+        }
+    } else if (strncmp(sep, "str:", 4) == 0) {
+        sep += 4;
+        kvo.tag = LLAMA_KV_OVERRIDE_TYPE_STR;
+        if (strlen(sep) > 127) {
+            fprintf(stderr, "%s: malformed KV override '%s', value cannot exceed 127 chars\n", __func__, data);
+            return false;
+        }
+        strncpy(kvo.val_str, sep, 127);
+        kvo.val_str[127] = '\0';
+    } else {
+        fprintf(stderr, "%s: invalid type for KV override '%s'\n", __func__, data);
+        return false;
+    }
+    overrides.emplace_back(std::move(kvo));
+    return true;
+}
+
+//
+// Filesystem utils
+//
+
+// Validate if a filename is safe to use
+// To validate a full path, split the path by the OS-specific path separator, and validate each part with this function
+bool fs_validate_filename(const std::string & filename) {
+    if (!filename.length()) {
+        // Empty filename invalid
+        return false;
+    }
+    if (filename.length() > 255) {
+        // Limit at common largest possible filename on Linux filesystems
+        // to avoid unnecessary further validation
+        // (On systems with smaller limits it will be caught by the OS)
         return false;
     }
 
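`string_parse_kv_override` above accepts `KEY=TYPE:VALUE`, with the key limited to 127 characters and exactly four type tags. A usage sketch, under the assumption that common.h declares the function and llama.h provides `llama_model_kv_override` ("some.key" is a hypothetical key):

```cpp
#include <cstdio>
#include <vector>
#include "common.h"  // assumption: declares string_parse_kv_override()
#include "llama.h"   // llama_model_kv_override

int main() {
    std::vector<llama_model_kv_override> overrides;

    // The four accepted forms, per the parser above:
    string_parse_kv_override("some.key=int:42",    overrides);
    string_parse_kv_override("some.key=float:1.5", overrides);
    string_parse_kv_override("some.key=bool:true", overrides);
    string_parse_kv_override("some.key=str:hello", overrides);  // value <= 127 chars

    // Anything else (missing '=', unknown tag, bool not true/false) is rejected:
    if (!string_parse_kv_override("some.key=oops:1", overrides)) {
        fprintf(stderr, "invalid type tag rejected, as expected\n");
    }
    return 0;
}
```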
@@ -1719,174 +1786,252 @@ bool validate_file_name(const std::string & filename) {
     return true;
 }
 
-//
-// String utils
-//
+// returns true if successful, false otherwise
+bool fs_create_directory_with_parents(const std::string & path) {
+#ifdef _WIN32
+    std::wstring_convert<std::codecvt_utf8<wchar_t>> converter;
+    std::wstring wpath = converter.from_bytes(path);
 
-std::vector<std::string> string_split(std::string input, char separator) {
-    std::vector<std::string> parts;
-    size_t separator_pos = input.find(separator);
-    while (separator_pos != std::string::npos) {
-        std::string part = input.substr(0, separator_pos);
-        parts.emplace_back(part);
-        input = input.substr(separator_pos + 1);
-        separator_pos = input.find(separator);
+    // if the path already exists, check whether it's a directory
+    const DWORD attributes = GetFileAttributesW(wpath.c_str());
+    if ((attributes != INVALID_FILE_ATTRIBUTES) && (attributes & FILE_ATTRIBUTE_DIRECTORY)) {
+        return true;
     }
-    parts.emplace_back(input);
-    return parts;
-}
 
-std::string string_strip(const std::string & str) {
-    size_t start = 0;
-    size_t end = str.size();
-    while (start < end && std::isspace(str[start])) {
-        start++;
-    }
-    while (end > start && std::isspace(str[end - 1])) {
-        end--;
-    }
-    return str.substr(start, end - start);
-}
+    size_t pos_slash = 0;
 
-std::vector<llama_sampler_type> sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names) {
-    std::unordered_map<std::string, llama_sampler_type> sampler_canonical_name_map {
-        {"top_k",       llama_sampler_type::TOP_K},
-        {"top_p",       llama_sampler_type::TOP_P},
-        {"typical_p",   llama_sampler_type::TYPICAL_P},
-        {"min_p",       llama_sampler_type::MIN_P},
-        {"tfs_z",       llama_sampler_type::TFS_Z},
-        {"temperature", llama_sampler_type::TEMPERATURE}
-    };
+    // process path from front to back, procedurally creating directories
+    while ((pos_slash = path.find('\\', pos_slash)) != std::string::npos) {
+        const std::wstring subpath = wpath.substr(0, pos_slash);
+        const wchar_t * test = subpath.c_str();
 
-    // since samplers names are written multiple ways
-    // make it ready for both system names and input names
-    std::unordered_map<std::string, llama_sampler_type> sampler_alt_name_map {
-        {"top-k",       llama_sampler_type::TOP_K},
-        {"top-p",       llama_sampler_type::TOP_P},
-        {"nucleus",     llama_sampler_type::TOP_P},
-        {"typical-p",   llama_sampler_type::TYPICAL_P},
-        {"typical",     llama_sampler_type::TYPICAL_P},
-        {"min-p",       llama_sampler_type::MIN_P},
-        {"tfs-z",       llama_sampler_type::TFS_Z},
-        {"tfs",         llama_sampler_type::TFS_Z},
-        {"temp",        llama_sampler_type::TEMPERATURE}
-    };
+        const bool success = CreateDirectoryW(test, NULL);
+        if (!success) {
+            const DWORD error = GetLastError();
 
-    std::vector<llama_sampler_type> sampler_types;
-    sampler_types.reserve(names.size());
-    for (const auto & name : names)
-    {
-        auto sampler_item = sampler_canonical_name_map.find(name);
-        if (sampler_item != sampler_canonical_name_map.end())
-        {
-            sampler_types.push_back(sampler_item->second);
-        }
-        else
-        {
-            if (allow_alt_names)
-            {
-                sampler_item = sampler_alt_name_map.find(name);
-                if (sampler_item != sampler_alt_name_map.end())
-                {
-                    sampler_types.push_back(sampler_item->second);
+            // if the path already exists, ensure that it's a directory
+            if (error == ERROR_ALREADY_EXISTS) {
+                const DWORD attributes = GetFileAttributesW(subpath.c_str());
+                if (attributes == INVALID_FILE_ATTRIBUTES || !(attributes & FILE_ATTRIBUTE_DIRECTORY)) {
+                    return false;
                 }
+            } else {
+                return false;
             }
         }
+
+        pos_slash += 1;
     }
-    return sampler_types;
-}
 
-std::vector<llama_sampler_type> sampler_types_from_chars(const std::string & names_string) {
-    std::unordered_map<char, llama_sampler_type> sampler_name_map {
-        {'k', llama_sampler_type::TOP_K},
-        {'p', llama_sampler_type::TOP_P},
-        {'y', llama_sampler_type::TYPICAL_P},
-        {'m', llama_sampler_type::MIN_P},
-        {'f', llama_sampler_type::TFS_Z},
-        {'t', llama_sampler_type::TEMPERATURE}
-    };
+    return true;
+#else
+    // if the path already exists, check whether it's a directory
+    struct stat info;
+    if (stat(path.c_str(), &info) == 0) {
+        return S_ISDIR(info.st_mode);
+    }
+
+    size_t pos_slash = 1; // skip leading slashes for directory creation
+
+    // process path from front to back, procedurally creating directories
+    while ((pos_slash = path.find('/', pos_slash)) != std::string::npos) {
+        const std::string subpath = path.substr(0, pos_slash);
+        struct stat info;
 
-    std::vector<llama_sampler_type> sampler_types;
-    sampler_types.reserve(names_string.size());
-    for (const auto & c : names_string) {
-        const auto sampler_item = sampler_name_map.find(c);
-        if (sampler_item != sampler_name_map.end()) {
-            sampler_types.push_back(sampler_item->second);
+        // if the path already exists, ensure that it's a directory
+        if (stat(subpath.c_str(), &info) == 0) {
+            if (!S_ISDIR(info.st_mode)) {
+                return false;
+            }
+        } else {
+            // create parent directories
+            const int ret = mkdir(subpath.c_str(), 0755);
+            if (ret != 0) {
+                return false;
+            }
         }
+
+        pos_slash += 1;
     }
-    return sampler_types;
+
+    return true;
+#endif // _WIN32
 }
 
-std::string sampler_type_to_name_string(llama_sampler_type sampler_type) {
-    switch (sampler_type) {
-        case llama_sampler_type::TOP_K:       return "top_k";
-        case llama_sampler_type::TFS_Z:       return "tfs_z";
-        case llama_sampler_type::TYPICAL_P:   return "typical_p";
-        case llama_sampler_type::TOP_P:       return "top_p";
-        case llama_sampler_type::MIN_P:       return "min_p";
-        case llama_sampler_type::TEMPERATURE: return "temperature";
-        default : return "";
+std::string fs_get_cache_directory() {
+    std::string cache_directory = "";
+    auto ensure_trailing_slash = [](std::string p) {
+        // Make sure to add trailing slash
+        if (p.back() != DIRECTORY_SEPARATOR) {
+            p += DIRECTORY_SEPARATOR;
+        }
+        return p;
+    };
+    if (getenv("LLAMA_CACHE")) {
+        cache_directory = std::getenv("LLAMA_CACHE");
+    } else {
+#ifdef __linux__
+        if (std::getenv("XDG_CACHE_HOME")) {
+            cache_directory = std::getenv("XDG_CACHE_HOME");
+        } else {
+            cache_directory = std::getenv("HOME") + std::string("/.cache/");
+        }
+#elif defined(__APPLE__)
+        cache_directory = std::getenv("HOME") + std::string("/Library/Caches/");
+#elif defined(_WIN32)
+        cache_directory = std::getenv("LOCALAPPDATA");
+#endif // __linux__
+        cache_directory = ensure_trailing_slash(cache_directory);
+        cache_directory += "llama.cpp";
     }
+    return ensure_trailing_slash(cache_directory);
 }
 
+
 //
 // Model utils
 //
 
-struct llama_model_params llama_model_params_from_gpt_params(const gpt_params & params) {
-    auto mparams = llama_model_default_params();
+std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(gpt_params & params) {
+    auto mparams = llama_model_params_from_gpt_params(params);
 
-    if (params.n_gpu_layers != -1) {
-        mparams.n_gpu_layers = params.n_gpu_layers;
-    }
-    mparams.rpc_servers     = params.rpc_servers.c_str();
-    mparams.main_gpu        = params.main_gpu;
-    mparams.split_mode      = params.split_mode;
-    mparams.tensor_split    = params.tensor_split;
-    mparams.use_mmap        = params.use_mmap;
-    mparams.use_mlock       = params.use_mlock;
-    mparams.check_tensors   = params.check_tensors;
-    if (params.kv_overrides.empty()) {
-        mparams.kv_overrides = NULL;
+    llama_model * model = nullptr;
+
+    if (!params.hf_repo.empty() && !params.hf_file.empty()) {
+        model = llama_load_model_from_hf(params.hf_repo.c_str(), params.hf_file.c_str(), params.model.c_str(), mparams);
+    } else if (!params.model_url.empty()) {
+        model = llama_load_model_from_url(params.model_url.c_str(), params.model.c_str(), mparams);
     } else {
-        GGML_ASSERT(params.kv_overrides.back().key[0] == 0 && "KV overrides not terminated with empty key");
-        mparams.kv_overrides = params.kv_overrides.data();
+        model = llama_load_model_from_file(params.model.c_str(), mparams);
     }
 
-    return mparams;
-}
-
-static ggml_type kv_cache_type_from_str(const std::string & s) {
-    if (s == "f32") {
-        return GGML_TYPE_F32;
-    }
-    if (s == "f16") {
-        return GGML_TYPE_F16;
-    }
-    if (s == "q8_0") {
-        return GGML_TYPE_Q8_0;
-    }
-    if (s == "q4_0") {
-        return GGML_TYPE_Q4_0;
-    }
-    if (s == "q4_1") {
-        return GGML_TYPE_Q4_1;
-    }
-    if (s == "iq4_nl") {
-        return GGML_TYPE_IQ4_NL;
-    }
-    if (s == "q5_0") {
-        return GGML_TYPE_Q5_0;
-    }
-    if (s == "q5_1") {
-        return GGML_TYPE_Q5_1;
+    if (model == NULL) {
+        fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
+        return std::make_tuple(nullptr, nullptr);
     }
 
-    throw std::runtime_error("Invalid cache type: " + s);
-}
+    auto cparams = llama_context_params_from_gpt_params(params);
 
-struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params) {
-    auto cparams = llama_context_default_params();
+    llama_context * lctx = llama_new_context_with_model(model, cparams);
+    if (lctx == NULL) {
+        fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, params.model.c_str());
+        llama_free_model(model);
+        return std::make_tuple(nullptr, nullptr);
+    }
+
+    if (!params.control_vectors.empty()) {
+        if (params.control_vector_layer_start <= 0) params.control_vector_layer_start = 1;
+        if (params.control_vector_layer_end   <= 0) params.control_vector_layer_end   = llama_n_layer(model);
+
+        const auto cvec = llama_control_vector_load(params.control_vectors);
+        if (cvec.n_embd == -1) {
+            llama_free(lctx);
+            llama_free_model(model);
+            return std::make_tuple(nullptr, nullptr);
+        }
+
+        int err = llama_control_vector_apply(lctx,
+                                             cvec.data.data(),
+                                             cvec.data.size(),
+                                             cvec.n_embd,
+                                             params.control_vector_layer_start,
+                                             params.control_vector_layer_end);
+        if (err) {
+            llama_free(lctx);
+            llama_free_model(model);
+            return std::make_tuple(nullptr, nullptr);
+        }
+    }
+
+    for (unsigned int i = 0; i < params.lora_adapter.size(); ++i) {
+        const std::string & lora_adapter = std::get<0>(params.lora_adapter[i]);
+        float lora_scale = std::get<1>(params.lora_adapter[i]);
+        int err = llama_model_apply_lora_from_file(model,
+                                             lora_adapter.c_str(),
+                                             lora_scale,
+                                             ((i > 0) || params.lora_base.empty())
+                                                ? NULL
+                                                : params.lora_base.c_str(),
+                                             params.n_threads);
+        if (err != 0) {
+            fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
+            llama_free(lctx);
+            llama_free_model(model);
+            return std::make_tuple(nullptr, nullptr);
+        }
+    }
+
+    if (params.ignore_eos) {
+        params.sparams.logit_bias[llama_token_eos(model)] = -INFINITY;
+    }
+
+    if (params.warmup) {
+        LOG("warming up the model with an empty run\n");
+
+        std::vector<llama_token> tmp = { llama_token_bos(model), llama_token_eos(model), };
+        llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, 0));
+        llama_kv_cache_clear(lctx);
+        llama_synchronize(lctx);
+        llama_reset_timings(lctx);
+    }
+
+    return std::make_tuple(model, lctx);
+}
+
+struct llama_model_params llama_model_params_from_gpt_params(const gpt_params & params) {
+    auto mparams = llama_model_default_params();
+
+    if (params.n_gpu_layers != -1) {
+        mparams.n_gpu_layers = params.n_gpu_layers;
+    }
+    mparams.rpc_servers     = params.rpc_servers.c_str();
+    mparams.main_gpu        = params.main_gpu;
+    mparams.split_mode      = params.split_mode;
+    mparams.tensor_split    = params.tensor_split;
+    mparams.use_mmap        = params.use_mmap;
+    mparams.use_mlock       = params.use_mlock;
+    mparams.check_tensors   = params.check_tensors;
+    if (params.kv_overrides.empty()) {
+        mparams.kv_overrides = NULL;
+    } else {
+        GGML_ASSERT(params.kv_overrides.back().key[0] == 0 && "KV overrides not terminated with empty key");
+        mparams.kv_overrides = params.kv_overrides.data();
+    }
+
+    return mparams;
+}
+
+static ggml_type kv_cache_type_from_str(const std::string & s) {
+    if (s == "f32") {
+        return GGML_TYPE_F32;
+    }
+    if (s == "f16") {
+        return GGML_TYPE_F16;
+    }
+    if (s == "q8_0") {
+        return GGML_TYPE_Q8_0;
+    }
+    if (s == "q4_0") {
+        return GGML_TYPE_Q4_0;
+    }
+    if (s == "q4_1") {
+        return GGML_TYPE_Q4_1;
+    }
+    if (s == "iq4_nl") {
+        return GGML_TYPE_IQ4_NL;
+    }
+    if (s == "q5_0") {
+        return GGML_TYPE_Q5_0;
+    }
+    if (s == "q5_1") {
+        return GGML_TYPE_Q5_1;
+    }
+
+    throw std::runtime_error("Invalid cache type: " + s);
+}
+
+struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params) {
+    auto cparams = llama_context_default_params();
 
     cparams.n_ctx     = params.n_ctx;
     cparams.n_seq_max = params.n_parallel;
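For reference, the resolution order implemented by the new `fs_get_cache_directory` above:

```cpp
// Cache root resolution, as written in fs_get_cache_directory above:
//   1. $LLAMA_CACHE, if set (used as-is; only a trailing slash is ensured).
//   2. Otherwise a per-OS default with "llama.cpp" appended:
//        Linux:   $XDG_CACHE_HOME or $HOME/.cache/
//        macOS:   $HOME/Library/Caches/
//        Windows: %LOCALAPPDATA%
// Note: the "llama.cpp" suffix is only added in case 2, not when LLAMA_CACHE is set.
```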
@@ -1918,27 +2063,6 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params) {
     return cparams;
 }
 
-void llama_batch_clear(struct llama_batch & batch) {
-    batch.n_tokens = 0;
-}
-
-void llama_batch_add(
-                 struct llama_batch & batch,
-                        llama_token   id,
-                          llama_pos   pos,
-    const std::vector<llama_seq_id> & seq_ids,
-                               bool   logits) {
-    batch.token   [batch.n_tokens] = id;
-    batch.pos     [batch.n_tokens] = pos;
-    batch.n_seq_id[batch.n_tokens] = seq_ids.size();
-    for (size_t i = 0; i < seq_ids.size(); ++i) {
-        batch.seq_id[batch.n_tokens][i] = seq_ids[i];
-    }
-    batch.logits  [batch.n_tokens] = logits;
-
-    batch.n_tokens++;
-}
-
 #ifdef LLAMA_USE_CURL
 
 static bool starts_with(const std::string & str, const std::string & prefix) {
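`llama_batch_clear` and `llama_batch_add` are only moved (removed here, re-added under the new Batch utils banner in the next hunk); their behavior is unchanged. A minimal usage sketch for feeding a tokenized prompt as sequence 0, assuming an initialized `llama_context` (`llama_batch_init`, `llama_batch_free`, and `llama_decode` are llama.h API):

```cpp
#include <vector>
#include "common.h"  // llama_batch_clear / llama_batch_add, as in this diff
#include "llama.h"

// Decode `tokens` as sequence 0, requesting logits only for the last
// position (the usual setup before sampling the next token).
static bool decode_prompt(llama_context * ctx, const std::vector<llama_token> & tokens) {
    llama_batch batch = llama_batch_init((int32_t) tokens.size(), 0, 1);

    llama_batch_clear(batch);
    for (size_t i = 0; i < tokens.size(); ++i) {
        llama_batch_add(batch, tokens[i], (llama_pos) i, { 0 }, i == tokens.size() - 1);
    }

    const bool ok = llama_decode(ctx, batch) == 0;
    llama_batch_free(batch);
    return ok;
}
```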
@@ -2269,90 +2393,29 @@ struct llama_model * llama_load_model_from_hf(
 
 #endif // LLAMA_USE_CURL
 
-std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(gpt_params & params) {
-    auto mparams = llama_model_params_from_gpt_params(params);
-
-    llama_model * model = nullptr;
-
-    if (!params.hf_repo.empty() && !params.hf_file.empty()) {
-        model = llama_load_model_from_hf(params.hf_repo.c_str(), params.hf_file.c_str(), params.model.c_str(), mparams);
-    } else if (!params.model_url.empty()) {
-        model = llama_load_model_from_url(params.model_url.c_str(), params.model.c_str(), mparams);
-    } else {
-        model = llama_load_model_from_file(params.model.c_str(), mparams);
-    }
-
-    if (model == NULL) {
-        fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
-        return std::make_tuple(nullptr, nullptr);
-    }
-
-    auto cparams = llama_context_params_from_gpt_params(params);
-
-    llama_context * lctx = llama_new_context_with_model(model, cparams);
-    if (lctx == NULL) {
-        fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, params.model.c_str());
-        llama_free_model(model);
-        return std::make_tuple(nullptr, nullptr);
-    }
-
-    if (!params.control_vectors.empty()) {
-        if (params.control_vector_layer_start <= 0) params.control_vector_layer_start = 1;
-        if (params.control_vector_layer_end   <= 0) params.control_vector_layer_end   = llama_n_layer(model);
-
-        const auto cvec = llama_control_vector_load(params.control_vectors);
-        if (cvec.n_embd == -1) {
-            llama_free(lctx);
-            llama_free_model(model);
-            return std::make_tuple(nullptr, nullptr);
-        }
-
-        int err = llama_control_vector_apply(lctx,
-                                             cvec.data.data(),
-                                             cvec.data.size(),
-                                             cvec.n_embd,
-                                             params.control_vector_layer_start,
-                                             params.control_vector_layer_end);
-        if (err) {
-            llama_free(lctx);
-            llama_free_model(model);
-            return std::make_tuple(nullptr, nullptr);
-        }
-    }
-
-    for (unsigned int i = 0; i < params.lora_adapter.size(); ++i) {
-        const std::string & lora_adapter = std::get<0>(params.lora_adapter[i]);
-        float lora_scale = std::get<1>(params.lora_adapter[i]);
-        int err = llama_model_apply_lora_from_file(model,
-                                             lora_adapter.c_str(),
-                                             lora_scale,
-                                             ((i > 0) || params.lora_base.empty())
-                                                ? NULL
-                                                : params.lora_base.c_str(),
-                                             params.n_threads);
-        if (err != 0) {
-            fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
-            llama_free(lctx);
-            llama_free_model(model);
-            return std::make_tuple(nullptr, nullptr);
-        }
-    }
-
-    if (params.ignore_eos) {
-        params.sparams.logit_bias[llama_token_eos(model)] = -INFINITY;
-    }
+//
+// Batch utils
+//
 
-    if (params.warmup) {
-        LOG("warming up the model with an empty run\n");
+void llama_batch_clear(struct llama_batch & batch) {
+    batch.n_tokens = 0;
+}
 
-        std::vector<llama_token> tmp = { llama_token_bos(model), llama_token_eos(model), };
-        llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, 0));
-        llama_kv_cache_clear(lctx);
-        llama_synchronize(lctx);
-        llama_reset_timings(lctx);
+void llama_batch_add(
+                 struct llama_batch & batch,
+                        llama_token   id,
+                          llama_pos   pos,
+    const std::vector<llama_seq_id> & seq_ids,
+                               bool   logits) {
+    batch.token   [batch.n_tokens] = id;
+    batch.pos     [batch.n_tokens] = pos;
+    batch.n_seq_id[batch.n_tokens] = seq_ids.size();
+    for (size_t i = 0; i < seq_ids.size(); ++i) {
+        batch.seq_id[batch.n_tokens][i] = seq_ids[i];
     }
+    batch.logits  [batch.n_tokens] = logits;
 
-    return std::make_tuple(model, lctx);
+    batch.n_tokens++;
 }
 
 //
@@ -2406,355 +2469,45 @@ std::string llama_detokenize_spm(llama_context * ctx, const std::vector<llama_token> & tokens) {
     std::string piece;
     std::string result;
 
-    for (size_t i = 0; i < tokens.size(); ++i) {
-        piece = llama_token_to_piece(ctx, tokens[i]);
-
-        // remove the leading space of the first non-BOS token
-        if (((tokens[0] == bos_id && i == 1) || (tokens[0] != bos_id && i == 0)) && piece[0] == ' ') {
-            piece = piece.substr(1);
-        }
-
-        result += piece;
-    }
-
-    return result;
-}
-
-std::string llama_detokenize_bpe(llama_context * ctx, const std::vector<llama_token> & tokens) {
-    std::string piece;
-    std::string result;
-
-    for (size_t i = 0; i < tokens.size(); ++i) {
-        piece = llama_token_to_piece(ctx, tokens[i]);
-
-        result += piece;
-    }
-
-    // NOTE: the original tokenizer decodes bytes after collecting the pieces.
-    return result;
-}
-
-bool llama_should_add_bos_token(const llama_model * model) {
-    const int add_bos = llama_add_bos_token(model);
-
-    return add_bos != -1 ? bool(add_bos) : (llama_vocab_type(model) == LLAMA_VOCAB_TYPE_SPM);
-}
-
-//
-// YAML utils
-//
-
-// returns true if successful, false otherwise
-bool create_directory_with_parents(const std::string & path) {
-#ifdef _WIN32
-    std::wstring_convert<std::codecvt_utf8<wchar_t>> converter;
-    std::wstring wpath = converter.from_bytes(path);
-
-    // if the path already exists, check whether it's a directory
-    const DWORD attributes = GetFileAttributesW(wpath.c_str());
-    if ((attributes != INVALID_FILE_ATTRIBUTES) && (attributes & FILE_ATTRIBUTE_DIRECTORY)) {
-        return true;
-    }
-
-    size_t pos_slash = 0;
-
-    // process path from front to back, procedurally creating directories
-    while ((pos_slash = path.find('\\', pos_slash)) != std::string::npos) {
-        const std::wstring subpath = wpath.substr(0, pos_slash);
-        const wchar_t * test = subpath.c_str();
-
-        const bool success = CreateDirectoryW(test, NULL);
-        if (!success) {
-            const DWORD error = GetLastError();
-
-            // if the path already exists, ensure that it's a directory
-            if (error == ERROR_ALREADY_EXISTS) {
-                const DWORD attributes = GetFileAttributesW(subpath.c_str());
-                if (attributes == INVALID_FILE_ATTRIBUTES || !(attributes & FILE_ATTRIBUTE_DIRECTORY)) {
-                    return false;
-                }
-            } else {
-                return false;
-            }
-        }
-
-        pos_slash += 1;
-    }
-
-    return true;
-#else
-    // if the path already exists, check whether it's a directory
-    struct stat info;
-    if (stat(path.c_str(), &info) == 0) {
-        return S_ISDIR(info.st_mode);
-    }
-
-    size_t pos_slash = 1; // skip leading slashes for directory creation
-
-    // process path from front to back, procedurally creating directories
-    while ((pos_slash = path.find('/', pos_slash)) != std::string::npos) {
-        const std::string subpath = path.substr(0, pos_slash);
-        struct stat info;
-
-        // if the path already exists, ensure that it's a directory
-        if (stat(subpath.c_str(), &info) == 0) {
-            if (!S_ISDIR(info.st_mode)) {
-                return false;
-            }
-        } else {
-            // create parent directories
-            const int ret = mkdir(subpath.c_str(), 0755);
-            if (ret != 0) {
-                return false;
-            }
-        }
-
-        pos_slash += 1;
-    }
-
-    return true;
-#endif // _WIN32
-}
-
-void dump_vector_float_yaml(FILE * stream, const char * prop_name, const std::vector<float> & data) {
-    if (data.empty()) {
-        fprintf(stream, "%s:\n", prop_name);
-        return;
-    }
-
-    fprintf(stream, "%s: [", prop_name);
-    for (size_t i = 0; i < data.size() - 1; ++i) {
-        fprintf(stream, "%e, ", data[i]);
-    }
-    fprintf(stream, "%e]\n", data.back());
-}
-
-void dump_vector_int_yaml(FILE * stream, const char * prop_name, const std::vector<int> & data) {
-    if (data.empty()) {
-        fprintf(stream, "%s:\n", prop_name);
-        return;
-    }
-
-    fprintf(stream, "%s: [", prop_name);
-    for (size_t i = 0; i < data.size() - 1; ++i) {
-        fprintf(stream, "%d, ", data[i]);
-    }
-    fprintf(stream, "%d]\n", data.back());
-}
-
-void dump_string_yaml_multiline(FILE * stream, const char * prop_name, const char * data) {
-    std::string data_str(data == NULL ? "" : data);
-
-    if (data_str.empty()) {
-        fprintf(stream, "%s:\n", prop_name);
-        return;
-    }
-
-    size_t pos_start = 0;
-    size_t pos_found = 0;
-
-    if (!data_str.empty() && (std::isspace(data_str[0]) || std::isspace(data_str.back()))) {
-        data_str = std::regex_replace(data_str, std::regex("\n"), "\\n");
-        data_str = std::regex_replace(data_str, std::regex("\""), "\\\"");
-        data_str = std::regex_replace(data_str, std::regex(R"(\\[^n"])"), R"(\$&)");
-        data_str = "\"" + data_str + "\"";
-        fprintf(stream, "%s: %s\n", prop_name, data_str.c_str());
-        return;
-    }
-
-    if (data_str.find('\n') == std::string::npos) {
-        fprintf(stream, "%s: %s\n", prop_name, data_str.c_str());
-        return;
-    }
-
-    fprintf(stream, "%s: |\n", prop_name);
-    while ((pos_found = data_str.find('\n', pos_start)) != std::string::npos) {
-        fprintf(stream, "  %s\n", data_str.substr(pos_start, pos_found-pos_start).c_str());
-        pos_start = pos_found + 1;
-    }
-}
-
-std::string get_sortable_timestamp() {
-    using clock = std::chrono::system_clock;
-
-    const clock::time_point current_time = clock::now();
-    const time_t as_time_t = clock::to_time_t(current_time);
-    char timestamp_no_ns[100];
-    std::strftime(timestamp_no_ns, 100, "%Y_%m_%d-%H_%M_%S", std::localtime(&as_time_t));
-
-    const int64_t ns = std::chrono::duration_cast<std::chrono::nanoseconds>(
-        current_time.time_since_epoch() % 1000000000).count();
-    char timestamp_ns[11];
-    snprintf(timestamp_ns, 11, "%09" PRId64, ns);
-
-    return std::string(timestamp_no_ns) + "." + std::string(timestamp_ns);
-}
-
-void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const llama_context * lctx,
-                               const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc) {
-    const llama_sampling_params & sparams = params.sparams;
-
-    fprintf(stream, "build_commit: %s\n", LLAMA_COMMIT);
-    fprintf(stream, "build_number: %d\n", LLAMA_BUILD_NUMBER);
-    fprintf(stream, "cpu_has_arm_fma: %s\n", ggml_cpu_has_arm_fma() ? "true" : "false");
-    fprintf(stream, "cpu_has_avx: %s\n", ggml_cpu_has_avx() ? "true" : "false");
-    fprintf(stream, "cpu_has_avx_vnni: %s\n", ggml_cpu_has_avx_vnni() ? "true" : "false");
-    fprintf(stream, "cpu_has_avx2: %s\n", ggml_cpu_has_avx2() ? "true" : "false");
-    fprintf(stream, "cpu_has_avx512: %s\n", ggml_cpu_has_avx512() ? "true" : "false");
-    fprintf(stream, "cpu_has_avx512_vbmi: %s\n", ggml_cpu_has_avx512_vbmi() ? "true" : "false");
-    fprintf(stream, "cpu_has_avx512_vnni: %s\n", ggml_cpu_has_avx512_vnni() ? "true" : "false");
-    fprintf(stream, "cpu_has_cuda: %s\n", ggml_cpu_has_cuda() ? "true" : "false");
-    fprintf(stream, "cpu_has_vulkan: %s\n", ggml_cpu_has_vulkan() ? "true" : "false");
-    fprintf(stream, "cpu_has_clblast: %s\n", ggml_cpu_has_clblast() ? "true" : "false");
-    fprintf(stream, "cpu_has_kompute: %s\n", ggml_cpu_has_kompute() ? "true" : "false");
-    fprintf(stream, "cpu_has_fma: %s\n", ggml_cpu_has_fma() ? "true" : "false");
-    fprintf(stream, "cpu_has_gpublas: %s\n", ggml_cpu_has_gpublas() ? "true" : "false");
-    fprintf(stream, "cpu_has_neon: %s\n", ggml_cpu_has_neon() ? "true" : "false");
-    fprintf(stream, "cpu_has_f16c: %s\n", ggml_cpu_has_f16c() ? "true" : "false");
-    fprintf(stream, "cpu_has_fp16_va: %s\n", ggml_cpu_has_fp16_va() ? "true" : "false");
-    fprintf(stream, "cpu_has_wasm_simd: %s\n", ggml_cpu_has_wasm_simd() ? "true" : "false");
-    fprintf(stream, "cpu_has_blas: %s\n", ggml_cpu_has_blas() ? "true" : "false");
-    fprintf(stream, "cpu_has_sse3: %s\n", ggml_cpu_has_sse3() ? "true" : "false");
-    fprintf(stream, "cpu_has_vsx: %s\n", ggml_cpu_has_vsx() ? "true" : "false");
-    fprintf(stream, "cpu_has_matmul_int8: %s\n", ggml_cpu_has_matmul_int8() ? "true" : "false");
-
-#ifdef NDEBUG
-    fprintf(stream, "debug: false\n");
-#else
-    fprintf(stream, "debug: true\n");
-#endif // NDEBUG
-
-    fprintf(stream, "model_desc: %s\n", model_desc);
-    fprintf(stream, "n_vocab: %d # output size of the final layer, 32001 for some models\n", llama_n_vocab(llama_get_model(lctx)));
-
-#ifdef __OPTIMIZE__
-    fprintf(stream, "optimize: true\n");
-#else
-    fprintf(stream, "optimize: false\n");
-#endif // __OPTIMIZE__
-
-    fprintf(stream, "time: %s\n", timestamp.c_str());
-
-    fprintf(stream, "\n");
-    fprintf(stream, "###############\n");
-    fprintf(stream, "# User Inputs #\n");
-    fprintf(stream, "###############\n");
-    fprintf(stream, "\n");
-
-    fprintf(stream, "alias: %s # default: unknown\n", params.model_alias.c_str());
-    fprintf(stream, "batch_size: %d # default: 512\n", params.n_batch);
-    dump_string_yaml_multiline(stream, "cfg_negative_prompt", sparams.cfg_negative_prompt.c_str());
-    fprintf(stream, "cfg_scale: %f # default: 1.0\n", sparams.cfg_scale);
-    fprintf(stream, "chunks: %d # default: -1 (unlimited)\n", params.n_chunks);
-    fprintf(stream, "color: %s # default: false\n", params.use_color ? "true" : "false");
-    fprintf(stream, "ctx_size: %d # default: 512\n", params.n_ctx);
-    fprintf(stream, "escape: %s # default: false\n", params.escape ? "true" : "false");
-    fprintf(stream, "file: # never logged, see prompt instead. Can still be specified for input.\n");
-    fprintf(stream, "frequency_penalty: %f # default: 0.0 \n", sparams.penalty_freq);
-    dump_string_yaml_multiline(stream, "grammar", sparams.grammar.c_str());
-    fprintf(stream, "grammar-file: # never logged, see grammar instead. Can still be specified for input.\n");
-    fprintf(stream, "hellaswag: %s # default: false\n", params.hellaswag ? "true" : "false");
-    fprintf(stream, "hellaswag_tasks: %zu # default: 400\n", params.hellaswag_tasks);
-
-    const auto logit_bias_eos = sparams.logit_bias.find(llama_token_eos(llama_get_model(lctx)));
-    const bool ignore_eos = logit_bias_eos != sparams.logit_bias.end() && logit_bias_eos->second == -INFINITY;
-    fprintf(stream, "ignore_eos: %s # default: false\n", ignore_eos ? "true" : "false");
-
-    dump_string_yaml_multiline(stream, "in_prefix", params.input_prefix.c_str());
-    fprintf(stream, "in_prefix_bos: %s # default: false\n", params.input_prefix_bos ? "true" : "false");
-    dump_string_yaml_multiline(stream, "in_suffix", params.input_prefix.c_str());
-    fprintf(stream, "instruct: %s # default: false\n", params.instruct ? "true" : "false");
-    fprintf(stream, "interactive: %s # default: false\n", params.interactive ? "true" : "false");
-    fprintf(stream, "interactive_specials: %s # default: false\n", params.interactive_specials ? "true" : "false");
-    fprintf(stream, "interactive_first: %s # default: false\n", params.interactive_first ? "true" : "false");
-    fprintf(stream, "keep: %d # default: 0\n", params.n_keep);
-    fprintf(stream, "logdir: %s # default: unset (no logging)\n", params.logdir.c_str());
-
-    fprintf(stream, "logit_bias:\n");
-    for (std::pair<llama_token, float> lb : sparams.logit_bias) {
-        if (ignore_eos && lb.first == logit_bias_eos->first) {
-            continue;
-        }
-        fprintf(stream, "  %d: %f", lb.first, lb.second);
-    }
-
-    fprintf(stream, "lora:\n");
-    for (std::tuple<std::string, float> la : params.lora_adapter) {
|
|
2683
|
-
if (std::get<1>(la) != 1.0f) {
|
|
2684
|
-
continue;
|
|
2685
|
-
}
|
|
2686
|
-
fprintf(stream, " - %s\n", std::get<0>(la).c_str());
|
|
2687
|
-
}
|
|
2688
|
-
fprintf(stream, "lora_scaled:\n");
|
|
2689
|
-
for (std::tuple<std::string, float> la : params.lora_adapter) {
|
|
2690
|
-
if (std::get<1>(la) == 1.0f) {
|
|
2691
|
-
continue;
|
|
2692
|
-
}
|
|
2693
|
-
fprintf(stream, " - %s: %f\n", std::get<0>(la).c_str(), std::get<1>(la));
|
|
2694
|
-
}
|
|
2695
|
-
fprintf(stream, "lora_base: %s\n", params.lora_base.c_str());
|
|
2696
|
-
fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu);
|
|
2697
|
-
fprintf(stream, "min_keep: %d # default: 0 (disabled)\n", sparams.min_keep);
|
|
2698
|
-
fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", sparams.mirostat);
|
|
2699
|
-
fprintf(stream, "mirostat_ent: %f # default: 5.0\n", sparams.mirostat_tau);
|
|
2700
|
-
fprintf(stream, "mirostat_lr: %f # default: 0.1\n", sparams.mirostat_eta);
|
|
2701
|
-
fprintf(stream, "mlock: %s # default: false\n", params.use_mlock ? "true" : "false");
|
|
2702
|
-
fprintf(stream, "model: %s # default: %s\n", params.model.c_str(), DEFAULT_MODEL_PATH);
|
|
2703
|
-
fprintf(stream, "model_draft: %s # default:\n", params.model_draft.c_str());
|
|
2704
|
-
fprintf(stream, "multiline_input: %s # default: false\n", params.multiline_input ? "true" : "false");
|
|
2705
|
-
fprintf(stream, "n_gpu_layers: %d # default: -1\n", params.n_gpu_layers);
|
|
2706
|
-
fprintf(stream, "n_predict: %d # default: -1 (unlimited)\n", params.n_predict);
|
|
2707
|
-
fprintf(stream, "n_probs: %d # only used by server binary, default: 0\n", sparams.n_probs);
|
|
2708
|
-
fprintf(stream, "no_mmap: %s # default: false\n", !params.use_mmap ? "true" : "false");
|
|
2709
|
-
fprintf(stream, "penalize_nl: %s # default: false\n", sparams.penalize_nl ? "true" : "false");
|
|
2710
|
-
fprintf(stream, "ppl_output_type: %d # default: 0\n", params.ppl_output_type);
|
|
2711
|
-
fprintf(stream, "ppl_stride: %d # default: 0\n", params.ppl_stride);
|
|
2712
|
-
fprintf(stream, "presence_penalty: %f # default: 0.0\n", sparams.penalty_present);
|
|
2713
|
-
dump_string_yaml_multiline(stream, "prompt", params.prompt.c_str());
|
|
2714
|
-
fprintf(stream, "prompt_cache: %s\n", params.path_prompt_cache.c_str());
|
|
2715
|
-
fprintf(stream, "prompt_cache_all: %s # default: false\n", params.prompt_cache_all ? "true" : "false");
|
|
2716
|
-
fprintf(stream, "prompt_cache_ro: %s # default: false\n", params.prompt_cache_ro ? "true" : "false");
|
|
2717
|
-
dump_vector_int_yaml(stream, "prompt_tokens", prompt_tokens);
|
|
2718
|
-
fprintf(stream, "random_prompt: %s # default: false\n", params.random_prompt ? "true" : "false");
|
|
2719
|
-
fprintf(stream, "repeat_penalty: %f # default: 1.1\n", sparams.penalty_repeat);
|
|
2720
|
-
|
|
2721
|
-
fprintf(stream, "reverse_prompt:\n");
|
|
2722
|
-
for (std::string ap : params.antiprompt) {
|
|
2723
|
-
size_t pos = 0;
|
|
2724
|
-
while ((pos = ap.find('\n', pos)) != std::string::npos) {
|
|
2725
|
-
ap.replace(pos, 1, "\\n");
|
|
2726
|
-
pos += 1;
|
|
+    for (size_t i = 0; i < tokens.size(); ++i) {
+        piece = llama_token_to_piece(ctx, tokens[i]);
+
+        // remove the leading space of the first non-BOS token
+        if (((tokens[0] == bos_id && i == 1) || (tokens[0] != bos_id && i == 0)) && piece[0] == ' ') {
+            piece = piece.substr(1);
         }
 
-        fprintf(stream, "  - %s\n", ap.c_str());
+        result += piece;
     }
 
-    fprintf(stream, "rope_freq_base: %f # default: 10000.0\n", params.rope_freq_base);
-    fprintf(stream, "rope_freq_scale: %f # default: 1.0\n", params.rope_freq_scale);
-    fprintf(stream, "seed: %u # default: -1 (random seed)\n", params.seed);
-    fprintf(stream, "simple_io: %s # default: false\n", params.simple_io ? "true" : "false");
-    fprintf(stream, "cont_batching: %s # default: false\n", params.cont_batching ? "true" : "false");
-    fprintf(stream, "flash_attn: %s # default: false\n", params.flash_attn ? "true" : "false");
-    fprintf(stream, "temp: %f # default: 0.8\n", sparams.temp);
+    return result;
+}
 
-    const std::vector<float> tensor_split_vector(params.tensor_split, params.tensor_split + llama_max_devices());
-    dump_vector_float_yaml(stream, "tensor_split", tensor_split_vector);
+std::string llama_detokenize_bpe(llama_context * ctx, const std::vector<llama_token> & tokens) {
+    std::string piece;
+    std::string result;
 
-    fprintf(stream, "tfs: %f # default: 1.0\n", sparams.tfs_z);
-    fprintf(stream, "threads: %d # default: %u\n", params.n_threads, std::thread::hardware_concurrency());
-    fprintf(stream, "top_k: %d # default: 40\n", sparams.top_k);
-    fprintf(stream, "top_p: %f # default: 0.95\n", sparams.top_p);
-    fprintf(stream, "min_p: %f # default: 0.0\n", sparams.min_p);
-    fprintf(stream, "typical_p: %f # default: 1.0\n", sparams.typical_p);
-    fprintf(stream, "verbose_prompt: %s # default: false\n", params.verbose_prompt ? "true" : "false");
-    fprintf(stream, "display_prompt: %s # default: true\n", params.display_prompt ? "true" : "false");
+    for (size_t i = 0; i < tokens.size(); ++i) {
+        piece = llama_token_to_piece(ctx, tokens[i]);
+
+        result += piece;
+    }
+
+    // NOTE: the original tokenizer decodes bytes after collecting the pieces.
+    return result;
+}
+
+bool llama_should_add_bos_token(const llama_model * model) {
+    const int add_bos = llama_add_bos_token(model);
+
+    return add_bos != -1 ? bool(add_bos) : (llama_vocab_type(model) == LLAMA_VOCAB_TYPE_SPM);
 }
 
 //
 // KV cache utils
 //
 
-void dump_kv_cache_view(const llama_kv_cache_view & view, int row_size) {
+void llama_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size) {
     static const char slot_chars[] = ".123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz+";
 
     printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d",
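
Note: llama_detokenize_spm and llama_detokenize_bpe (added above) differ only in whether the leading space after BOS is stripped. A minimal caller sketch, assuming a loaded model/context and the declarations added to common.h in this release; the wrapper name detokenize_any is hypothetical:

    // Hypothetical helper: pick the detokenizer that matches the vocab type.
    // SPM-style vocabs need the leading space stripped after BOS;
    // BPE-style vocabs concatenate the pieces as-is.
    static std::string detokenize_any(llama_context * ctx, const std::vector<llama_token> & tokens) {
        const llama_model * model = llama_get_model(ctx);
        if (llama_vocab_type(model) == LLAMA_VOCAB_TYPE_SPM) {
            return llama_detokenize_spm(ctx, tokens);
        }
        return llama_detokenize_bpe(ctx, tokens);
    }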
@@ -2777,7 +2530,7 @@ void dump_kv_cache_view(const llama_kv_cache_view & view, int row_size) {
     printf("\n=== Done dumping\n");
 }
 
-void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size) {
+void llama_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size) {
     static const char slot_chars[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
 
     printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d\n",
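
Note: a minimal usage sketch for the two dump helpers above. It assumes an initialized llama_context and the llama_kv_cache_view_* API from the llama.h of this vintage (llama_kv_cache_view_init / _update / _free); treat those signatures as an assumption:

    // Sketch: build a KV-cache view, refresh it, print it, free it.
    static void print_kv_cache(llama_context * ctx) {
        llama_kv_cache_view view = llama_kv_cache_view_init(ctx, 1); // track up to 1 seq per cell
        llama_kv_cache_view_update(ctx, &view);
        llama_kv_cache_dump_view(view, 40);       // one printed row = 40 cells
        llama_kv_cache_dump_view_seqs(view, 40);
        llama_kv_cache_view_free(&view);
    }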
@@ -2825,6 +2578,10 @@ void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size) {
     printf("\n=== Done dumping\n");
 }
 
+//
+// Embedding utils
+//
+
 void llama_embd_normalize(const float * inp, float * out, int n) {
     double sum = 0.0;
     for (int i = 0; i < n; i++) {
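
Note: llama_embd_normalize (context lines above) L2-normalizes an embedding so that dot products between normalized vectors are cosine similarities. A self-contained sketch of the same computation; the stand-in name embd_normalize_sketch is hypothetical, the real implementation is the one in this file:

    #include <cmath>
    #include <cstdio>
    #include <vector>

    // Mirrors the signature above: out = inp / ||inp||_2 (zero-safe).
    static void embd_normalize_sketch(const float * inp, float * out, int n) {
        double sum = 0.0;
        for (int i = 0; i < n; i++) {
            sum += inp[i] * inp[i];
        }
        const float norm = sum > 0.0 ? (float) std::sqrt(sum) : 0.0f;
        for (int i = 0; i < n; i++) {
            out[i] = norm > 0.0f ? inp[i] / norm : 0.0f;
        }
    }

    int main() {
        std::vector<float> emb = {3.0f, 4.0f}; // toy "embedding"
        std::vector<float> unit(emb.size());
        embd_normalize_sketch(emb.data(), unit.data(), (int) emb.size());
        std::printf("%.1f %.1f\n", unit[0], unit[1]); // prints: 0.6 0.8
    }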
@@ -3009,3 +2766,226 @@ llama_control_vector_data llama_control_vector_load(const std::vector<llama_cont
 
     return result;
 }
+
+//
+// YAML utils
+//
+
+void yaml_dump_vector_float(FILE * stream, const char * prop_name, const std::vector<float> & data) {
+    if (data.empty()) {
+        fprintf(stream, "%s:\n", prop_name);
+        return;
+    }
+
+    fprintf(stream, "%s: [", prop_name);
+    for (size_t i = 0; i < data.size() - 1; ++i) {
+        fprintf(stream, "%e, ", data[i]);
+    }
+    fprintf(stream, "%e]\n", data.back());
+}
+
+void yaml_dump_vector_int(FILE * stream, const char * prop_name, const std::vector<int> & data) {
+    if (data.empty()) {
+        fprintf(stream, "%s:\n", prop_name);
+        return;
+    }
+
+    fprintf(stream, "%s: [", prop_name);
+    for (size_t i = 0; i < data.size() - 1; ++i) {
+        fprintf(stream, "%d, ", data[i]);
+    }
+    fprintf(stream, "%d]\n", data.back());
+}
+
+void yaml_dump_string_multiline(FILE * stream, const char * prop_name, const char * data) {
+    std::string data_str(data == NULL ? "" : data);
+
+    if (data_str.empty()) {
+        fprintf(stream, "%s:\n", prop_name);
+        return;
+    }
+
+    size_t pos_start = 0;
+    size_t pos_found = 0;
+
+    if (std::isspace(data_str[0]) || std::isspace(data_str.back())) {
+        data_str = std::regex_replace(data_str, std::regex("\n"), "\\n");
+        data_str = std::regex_replace(data_str, std::regex("\""), "\\\"");
+        data_str = std::regex_replace(data_str, std::regex(R"(\\[^n"])"), R"(\$&)");
+        data_str = "\"" + data_str + "\"";
+        fprintf(stream, "%s: %s\n", prop_name, data_str.c_str());
+        return;
+    }
+
+    if (data_str.find('\n') == std::string::npos) {
+        fprintf(stream, "%s: %s\n", prop_name, data_str.c_str());
+        return;
+    }
+
+    fprintf(stream, "%s: |\n", prop_name);
+    while ((pos_found = data_str.find('\n', pos_start)) != std::string::npos) {
+        fprintf(stream, "  %s\n", data_str.substr(pos_start, pos_found-pos_start).c_str());
+        pos_start = pos_found + 1;
+    }
+}
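
Note on yaml_dump_string_multiline (above): it has three output shapes. Strings with leading or trailing whitespace are escaped and double-quoted, single-line strings are printed inline, and strings with embedded newlines become a YAML block scalar. Illustrative calls with expected output in comments; note that, as written, the block-scalar loop only emits segments terminated by '\n', so any tail after the last newline is dropped:

    yaml_dump_string_multiline(stdout, "a", "hello");    // a: hello
    yaml_dump_string_multiline(stdout, "b", " hi\n");    // b: " hi\n"   (escaped + quoted)
    yaml_dump_string_multiline(stdout, "c", "one\ntwo"); // c: |
                                                         //   one
                                                         // ("two" is not emitted by the loop)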
+
+void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const llama_context * lctx,
+                               const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc) {
+    const llama_sampling_params & sparams = params.sparams;
+
+    fprintf(stream, "build_commit: %s\n", LLAMA_COMMIT);
+    fprintf(stream, "build_number: %d\n", LLAMA_BUILD_NUMBER);
+    fprintf(stream, "cpu_has_arm_fma: %s\n", ggml_cpu_has_arm_fma() ? "true" : "false");
+    fprintf(stream, "cpu_has_avx: %s\n", ggml_cpu_has_avx() ? "true" : "false");
+    fprintf(stream, "cpu_has_avx_vnni: %s\n", ggml_cpu_has_avx_vnni() ? "true" : "false");
+    fprintf(stream, "cpu_has_avx2: %s\n", ggml_cpu_has_avx2() ? "true" : "false");
+    fprintf(stream, "cpu_has_avx512: %s\n", ggml_cpu_has_avx512() ? "true" : "false");
+    fprintf(stream, "cpu_has_avx512_vbmi: %s\n", ggml_cpu_has_avx512_vbmi() ? "true" : "false");
+    fprintf(stream, "cpu_has_avx512_vnni: %s\n", ggml_cpu_has_avx512_vnni() ? "true" : "false");
+    fprintf(stream, "cpu_has_cuda: %s\n", ggml_cpu_has_cuda() ? "true" : "false");
+    fprintf(stream, "cpu_has_vulkan: %s\n", ggml_cpu_has_vulkan() ? "true" : "false");
+    fprintf(stream, "cpu_has_clblast: %s\n", ggml_cpu_has_clblast() ? "true" : "false");
+    fprintf(stream, "cpu_has_kompute: %s\n", ggml_cpu_has_kompute() ? "true" : "false");
+    fprintf(stream, "cpu_has_fma: %s\n", ggml_cpu_has_fma() ? "true" : "false");
+    fprintf(stream, "cpu_has_gpublas: %s\n", ggml_cpu_has_gpublas() ? "true" : "false");
+    fprintf(stream, "cpu_has_neon: %s\n", ggml_cpu_has_neon() ? "true" : "false");
+    fprintf(stream, "cpu_has_sve: %s\n", ggml_cpu_has_sve() ? "true" : "false");
+    fprintf(stream, "cpu_has_f16c: %s\n", ggml_cpu_has_f16c() ? "true" : "false");
+    fprintf(stream, "cpu_has_fp16_va: %s\n", ggml_cpu_has_fp16_va() ? "true" : "false");
+    fprintf(stream, "cpu_has_wasm_simd: %s\n", ggml_cpu_has_wasm_simd() ? "true" : "false");
+    fprintf(stream, "cpu_has_blas: %s\n", ggml_cpu_has_blas() ? "true" : "false");
+    fprintf(stream, "cpu_has_sse3: %s\n", ggml_cpu_has_sse3() ? "true" : "false");
+    fprintf(stream, "cpu_has_vsx: %s\n", ggml_cpu_has_vsx() ? "true" : "false");
+    fprintf(stream, "cpu_has_matmul_int8: %s\n", ggml_cpu_has_matmul_int8() ? "true" : "false");
+
+#ifdef NDEBUG
+    fprintf(stream, "debug: false\n");
+#else
+    fprintf(stream, "debug: true\n");
+#endif // NDEBUG
+
+    fprintf(stream, "model_desc: %s\n", model_desc);
+    fprintf(stream, "n_vocab: %d # output size of the final layer, 32001 for some models\n", llama_n_vocab(llama_get_model(lctx)));
+
+#ifdef __OPTIMIZE__
+    fprintf(stream, "optimize: true\n");
+#else
+    fprintf(stream, "optimize: false\n");
+#endif // __OPTIMIZE__
+
+    fprintf(stream, "time: %s\n", timestamp.c_str());
+
+    fprintf(stream, "\n");
+    fprintf(stream, "###############\n");
+    fprintf(stream, "# User Inputs #\n");
+    fprintf(stream, "###############\n");
+    fprintf(stream, "\n");
+
+    fprintf(stream, "alias: %s # default: unknown\n", params.model_alias.c_str());
+    fprintf(stream, "batch_size: %d # default: 512\n", params.n_batch);
+    yaml_dump_string_multiline(stream, "cfg_negative_prompt", sparams.cfg_negative_prompt.c_str());
+    fprintf(stream, "cfg_scale: %f # default: 1.0\n", sparams.cfg_scale);
+    fprintf(stream, "chunks: %d # default: -1 (unlimited)\n", params.n_chunks);
+    fprintf(stream, "color: %s # default: false\n", params.use_color ? "true" : "false");
+    fprintf(stream, "ctx_size: %d # default: 512\n", params.n_ctx);
+    fprintf(stream, "escape: %s # default: false\n", params.escape ? "true" : "false");
+    fprintf(stream, "file: # never logged, see prompt instead. Can still be specified for input.\n");
+    fprintf(stream, "frequency_penalty: %f # default: 0.0 \n", sparams.penalty_freq);
+    yaml_dump_string_multiline(stream, "grammar", sparams.grammar.c_str());
+    fprintf(stream, "grammar-file: # never logged, see grammar instead. Can still be specified for input.\n");
+    fprintf(stream, "hellaswag: %s # default: false\n", params.hellaswag ? "true" : "false");
+    fprintf(stream, "hellaswag_tasks: %zu # default: 400\n", params.hellaswag_tasks);
+
+    const auto logit_bias_eos = sparams.logit_bias.find(llama_token_eos(llama_get_model(lctx)));
+    const bool ignore_eos = logit_bias_eos != sparams.logit_bias.end() && logit_bias_eos->second == -INFINITY;
+    fprintf(stream, "ignore_eos: %s # default: false\n", ignore_eos ? "true" : "false");
+
+    yaml_dump_string_multiline(stream, "in_prefix", params.input_prefix.c_str());
+    fprintf(stream, "in_prefix_bos: %s # default: false\n", params.input_prefix_bos ? "true" : "false");
+    yaml_dump_string_multiline(stream, "in_suffix", params.input_prefix.c_str());
+    fprintf(stream, "instruct: %s # default: false\n", params.instruct ? "true" : "false");
+    fprintf(stream, "interactive: %s # default: false\n", params.interactive ? "true" : "false");
+    fprintf(stream, "interactive_specials: %s # default: false\n", params.interactive_specials ? "true" : "false");
+    fprintf(stream, "interactive_first: %s # default: false\n", params.interactive_first ? "true" : "false");
+    fprintf(stream, "keep: %d # default: 0\n", params.n_keep);
+    fprintf(stream, "logdir: %s # default: unset (no logging)\n", params.logdir.c_str());
+
+    fprintf(stream, "logit_bias:\n");
+    for (std::pair<llama_token, float> lb : sparams.logit_bias) {
+        if (ignore_eos && lb.first == logit_bias_eos->first) {
+            continue;
+        }
+        fprintf(stream, "  %d: %f", lb.first, lb.second);
+    }
+
+    fprintf(stream, "lora:\n");
+    for (std::tuple<std::string, float> la : params.lora_adapter) {
+        if (std::get<1>(la) != 1.0f) {
+            continue;
+        }
+        fprintf(stream, "  - %s\n", std::get<0>(la).c_str());
+    }
+    fprintf(stream, "lora_scaled:\n");
+    for (std::tuple<std::string, float> la : params.lora_adapter) {
+        if (std::get<1>(la) == 1.0f) {
+            continue;
+        }
+        fprintf(stream, "  - %s: %f\n", std::get<0>(la).c_str(), std::get<1>(la));
+    }
+    fprintf(stream, "lora_base: %s\n", params.lora_base.c_str());
+    fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu);
+    fprintf(stream, "min_keep: %d # default: 0 (disabled)\n", sparams.min_keep);
+    fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", sparams.mirostat);
+    fprintf(stream, "mirostat_ent: %f # default: 5.0\n", sparams.mirostat_tau);
+    fprintf(stream, "mirostat_lr: %f # default: 0.1\n", sparams.mirostat_eta);
+    fprintf(stream, "mlock: %s # default: false\n", params.use_mlock ? "true" : "false");
+    fprintf(stream, "model: %s # default: %s\n", params.model.c_str(), DEFAULT_MODEL_PATH);
+    fprintf(stream, "model_draft: %s # default:\n", params.model_draft.c_str());
+    fprintf(stream, "multiline_input: %s # default: false\n", params.multiline_input ? "true" : "false");
+    fprintf(stream, "n_gpu_layers: %d # default: -1\n", params.n_gpu_layers);
+    fprintf(stream, "n_predict: %d # default: -1 (unlimited)\n", params.n_predict);
+    fprintf(stream, "n_probs: %d # only used by server binary, default: 0\n", sparams.n_probs);
+    fprintf(stream, "no_mmap: %s # default: false\n", !params.use_mmap ? "true" : "false");
+    fprintf(stream, "penalize_nl: %s # default: false\n", sparams.penalize_nl ? "true" : "false");
+    fprintf(stream, "ppl_output_type: %d # default: 0\n", params.ppl_output_type);
+    fprintf(stream, "ppl_stride: %d # default: 0\n", params.ppl_stride);
+    fprintf(stream, "presence_penalty: %f # default: 0.0\n", sparams.penalty_present);
+    yaml_dump_string_multiline(stream, "prompt", params.prompt.c_str());
+    fprintf(stream, "prompt_cache: %s\n", params.path_prompt_cache.c_str());
+    fprintf(stream, "prompt_cache_all: %s # default: false\n", params.prompt_cache_all ? "true" : "false");
+    fprintf(stream, "prompt_cache_ro: %s # default: false\n", params.prompt_cache_ro ? "true" : "false");
+    yaml_dump_vector_int(stream, "prompt_tokens", prompt_tokens);
+    fprintf(stream, "random_prompt: %s # default: false\n", params.random_prompt ? "true" : "false");
+    fprintf(stream, "repeat_penalty: %f # default: 1.1\n", sparams.penalty_repeat);
+
+    fprintf(stream, "reverse_prompt:\n");
+    for (std::string ap : params.antiprompt) {
+        size_t pos = 0;
+        while ((pos = ap.find('\n', pos)) != std::string::npos) {
+            ap.replace(pos, 1, "\\n");
+            pos += 1;
+        }
+
+        fprintf(stream, "  - %s\n", ap.c_str());
+    }
+
+    fprintf(stream, "rope_freq_base: %f # default: 10000.0\n", params.rope_freq_base);
+    fprintf(stream, "rope_freq_scale: %f # default: 1.0\n", params.rope_freq_scale);
+    fprintf(stream, "seed: %u # default: -1 (random seed)\n", params.seed);
+    fprintf(stream, "simple_io: %s # default: false\n", params.simple_io ? "true" : "false");
+    fprintf(stream, "cont_batching: %s # default: false\n", params.cont_batching ? "true" : "false");
+    fprintf(stream, "flash_attn: %s # default: false\n", params.flash_attn ? "true" : "false");
+    fprintf(stream, "temp: %f # default: 0.8\n", sparams.temp);
+
+    const std::vector<float> tensor_split_vector(params.tensor_split, params.tensor_split + llama_max_devices());
+    yaml_dump_vector_float(stream, "tensor_split", tensor_split_vector);
+
+    fprintf(stream, "tfs: %f # default: 1.0\n", sparams.tfs_z);
+    fprintf(stream, "threads: %d # default: %u\n", params.n_threads, std::thread::hardware_concurrency());
+    fprintf(stream, "top_k: %d # default: 40\n", sparams.top_k);
+    fprintf(stream, "top_p: %f # default: 0.95\n", sparams.top_p);
+    fprintf(stream, "min_p: %f # default: 0.0\n", sparams.min_p);
+    fprintf(stream, "typical_p: %f # default: 1.0\n", sparams.typical_p);
+    fprintf(stream, "verbose_prompt: %s # default: false\n", params.verbose_prompt ? "true" : "false");
+    fprintf(stream, "display_prompt: %s # default: true\n", params.display_prompt ? "true" : "false");
+}
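
Note: yaml_dump_non_result_info (above) writes a flat YAML document of build/CPU flags followed by user inputs. An illustrative fragment of its output, with placeholder values rather than results from a real run:

    build_commit: 0000000
    build_number: 0
    cpu_has_avx: true
    debug: false

    ###############
    # User Inputs #
    ###############

    batch_size: 512 # default: 512
    ctx_size: 512 # default: 512
    seed: 4294967295 # default: -1 (random seed)
    temp: 0.800000 # default: 0.8
    top_k: 40 # default: 40
    top_p: 0.950000 # default: 0.95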