@fugood/llama.node 0.0.1-alpha.4 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +42 -7
- package/README.md +10 -0
- package/bin/darwin/arm64/default.metallib +0 -0
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/default.metallib +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/lib/binding.js +1 -1
- package/lib/binding.ts +16 -2
- package/lib/index.ts +2 -2
- package/package.json +15 -3
- package/src/DetokenizeWorker.cpp +22 -0
- package/src/DetokenizeWorker.h +19 -0
- package/src/EmbeddingWorker.cpp +46 -0
- package/src/EmbeddingWorker.h +23 -0
- package/src/LlamaCompletionWorker.cpp +5 -1
- package/src/LlamaCompletionWorker.h +4 -0
- package/src/LlamaContext.cpp +80 -1
- package/src/LlamaContext.h +3 -0
- package/src/TokenizeWorker.cpp +26 -0
- package/src/TokenizeWorker.h +23 -0
- package/src/common.hpp +12 -7
- package/src/llama.cpp/CMakeLists.txt +13 -7
- package/src/llama.cpp/common/common.cpp +221 -173
- package/src/llama.cpp/common/common.h +19 -8
- package/src/llama.cpp/common/json-schema-to-grammar.h +4 -0
- package/src/llama.cpp/common/log.h +2 -2
- package/src/llama.cpp/common/sampling.cpp +17 -1
- package/src/llama.cpp/common/sampling.h +28 -20
- package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +17 -11
- package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +5 -5
- package/src/llama.cpp/examples/finetune/finetune.cpp +1 -1
- package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +15 -4
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +72 -39
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +42 -3
- package/src/llama.cpp/examples/llava/clip.cpp +74 -23
- package/src/llama.cpp/examples/llava/llava-cli.cpp +37 -28
- package/src/llama.cpp/examples/lookup/lookup-stats.cpp +0 -1
- package/src/llama.cpp/examples/lookup/lookup.cpp +0 -1
- package/src/llama.cpp/examples/main/main.cpp +10 -8
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +175 -55
- package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/quantize/quantize.cpp +74 -47
- package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +1 -1
- package/src/llama.cpp/examples/server/server.cpp +97 -86
- package/src/llama.cpp/examples/server/utils.hpp +17 -15
- package/src/llama.cpp/ggml-backend.c +7 -5
- package/src/llama.cpp/ggml-impl.h +339 -4
- package/src/llama.cpp/ggml-kompute.cpp +7 -0
- package/src/llama.cpp/ggml-opencl.cpp +1 -0
- package/src/llama.cpp/ggml-quants.c +302 -293
- package/src/llama.cpp/ggml-sycl.cpp +28 -16
- package/src/llama.cpp/ggml-vulkan-shaders.hpp +46843 -39205
- package/src/llama.cpp/ggml-vulkan.cpp +951 -263
- package/src/llama.cpp/ggml.c +1469 -116
- package/src/llama.cpp/ggml.h +37 -7
- package/src/llama.cpp/llama.cpp +969 -432
- package/src/llama.cpp/llama.h +46 -14
- package/src/llama.cpp/requirements/requirements-convert-hf-to-gguf-update.txt +2 -0
- package/src/llama.cpp/requirements/requirements-convert-hf-to-gguf.txt +0 -1
- package/src/llama.cpp/requirements/requirements-convert.txt +2 -2
- package/src/llama.cpp/requirements.txt +1 -0
- package/src/llama.cpp/sgemm.cpp +134 -103
- package/src/llama.cpp/sgemm.h +4 -2
- package/src/llama.cpp/tests/CMakeLists.txt +96 -36
- package/src/llama.cpp/tests/test-backend-ops.cpp +56 -6
- package/src/llama.cpp/tests/test-chat-template.cpp +4 -0
- package/src/llama.cpp/tests/test-grammar-integration.cpp +225 -136
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +1 -0
- package/src/llama.cpp/tests/test-tokenizer-0.cpp +292 -0
- package/src/llama.cpp/tests/{test-tokenizer-1-llama.cpp → test-tokenizer-1-spm.cpp} +1 -1
- package/src/llama.cpp/unicode-data.cpp +1188 -656
- package/src/llama.cpp/unicode-data.h +4 -3
- package/src/llama.cpp/unicode.cpp +590 -49
- package/src/llama.cpp/unicode.h +6 -3
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/src/llama.cpp/tests/test-tokenizer-0-falcon.cpp +0 -187
- package/src/llama.cpp/tests/test-tokenizer-0-llama.cpp +0 -190
package/src/llama.cpp/common/common.cpp

@@ -1,4 +1,6 @@
 #include "common.h"
+// Change JSON_ASSERT from assert() to GGML_ASSERT:
+#define JSON_ASSERT GGML_ASSERT
 #include "json.hpp"
 #include "json-schema-to-grammar.h"
 #include "llama.h"
@@ -67,7 +69,6 @@
 #include <sys/syslimits.h>
 #endif
 #define LLAMA_CURL_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083
-#define LLAMA_CURL_MAX_HEADER_LENGTH 256
 #endif // LLAMA_USE_CURL
 
 using json = nlohmann::ordered_json;
@@ -77,7 +78,7 @@ int32_t get_num_physical_cores() {
     // enumerate the set of thread siblings, num entries is num cores
     std::unordered_set<std::string> siblings;
     for (uint32_t cpu=0; cpu < UINT32_MAX; ++cpu) {
-        std::ifstream thread_siblings("/sys/devices/system/cpu"
+        std::ifstream thread_siblings("/sys/devices/system/cpu/cpu"
             + std::to_string(cpu) + "/topology/thread_siblings");
         if (!thread_siblings.is_open()) {
             break; // no more cpus
@@ -234,15 +235,63 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
     return result;
 }
 
+bool parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides) {
+    const char * sep = strchr(data, '=');
+    if (sep == nullptr || sep - data >= 128) {
+        fprintf(stderr, "%s: malformed KV override '%s'\n", __func__, data);
+        return false;
+    }
+    llama_model_kv_override kvo;
+    std::strncpy(kvo.key, data, sep - data);
+    kvo.key[sep - data] = 0;
+    sep++;
+    if (strncmp(sep, "int:", 4) == 0) {
+        sep += 4;
+        kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT;
+        kvo.val_i64 = std::atol(sep);
+    } else if (strncmp(sep, "float:", 6) == 0) {
+        sep += 6;
+        kvo.tag = LLAMA_KV_OVERRIDE_TYPE_FLOAT;
+        kvo.val_f64 = std::atof(sep);
+    } else if (strncmp(sep, "bool:", 5) == 0) {
+        sep += 5;
+        kvo.tag = LLAMA_KV_OVERRIDE_TYPE_BOOL;
+        if (std::strcmp(sep, "true") == 0) {
+            kvo.val_bool = true;
+        } else if (std::strcmp(sep, "false") == 0) {
+            kvo.val_bool = false;
+        } else {
+            fprintf(stderr, "%s: invalid boolean value for KV override '%s'\n", __func__, data);
+            return false;
+        }
+    } else if (strncmp(sep, "str:", 4) == 0) {
+        sep += 4;
+        kvo.tag = LLAMA_KV_OVERRIDE_TYPE_STR;
+        if (strlen(sep) > 127) {
+            fprintf(stderr, "%s: malformed KV override '%s', value cannot exceed 127 chars\n", __func__, data);
+            return false;
+        }
+        strncpy(kvo.val_str, sep, 127);
+        kvo.val_str[127] = '\0';
+    } else {
+        fprintf(stderr, "%s: invalid type for KV override '%s'\n", __func__, data);
+        return false;
+    }
+    overrides.emplace_back(std::move(kvo));
+    return true;
+}
+
 bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param) {
-    llama_sampling_params& sparams = params.sparams;
+    llama_sampling_params & sparams = params.sparams;
 
     if (arg == "-s" || arg == "--seed") {
         if (++i >= argc) {
             invalid_param = true;
             return true;
         }
+        // This is temporary, in the future the samplign state will be moved fully to llama_sampling_context.
         params.seed = std::stoul(argv[i]);
+        sparams.seed = std::stoul(argv[i]);
         return true;
     }
     if (arg == "-t" || arg == "--threads") {
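The extracted `parse_kv_override` above accepts `KEY=TYPE:VALUE` strings with four value types: `int`, `float`, `bool`, and the newly added `str`. A standalone sketch of the same syntax check follows; `kv_override_sketch` and `parse_kv_override_sketch` are simplified stand-ins (not package code) so the snippet compiles without llama.h:

```cpp
#include <cstdio>
#include <cstring>
#include <string>
#include <vector>

struct kv_override_sketch {   // stand-in for llama_model_kv_override
    std::string key;
    std::string type;
    std::string value;
};

static bool parse_kv_override_sketch(const char * data, std::vector<kv_override_sketch> & out) {
    const char * sep = std::strchr(data, '=');
    if (sep == nullptr || sep - data >= 128) {
        return false;                        // malformed: no '=' or key too long
    }
    std::string key(data, sep - data);
    sep++;                                   // skip '='
    const char * colon = std::strchr(sep, ':');
    if (colon == nullptr) {
        return false;                        // malformed: missing TYPE:VALUE
    }
    std::string type(sep, colon - sep);
    if (type != "int" && type != "float" && type != "bool" && type != "str") {
        return false;                        // "str" is the type this release adds
    }
    out.push_back({key, type, std::string(colon + 1)});
    return true;
}

int main() {
    std::vector<kv_override_sketch> overrides;
    // The same forms --override-kv accepts after this change:
    const char * samples[] = {
        "tokenizer.ggml.add_bos_token=bool:false",
        "general.name=str:my-model",
        "some.int.key=int:42",
    };
    for (const char * s : samples) {
        printf("%s -> %s\n", s, parse_kv_override_sketch(s, overrides) ? "ok" : "rejected");
    }
    return 0;
}
```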
@@ -845,7 +894,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
             invalid_param = true;
             return true;
         }
-        params.image = argv[i];
+        params.image.emplace_back(argv[i]);
         return true;
     }
     if (arg == "-i" || arg == "--interactive") {
@@ -864,6 +913,10 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         params.instruct = true;
         return true;
     }
+    if (arg == "-cnv" || arg == "--conversation") {
+        params.conversation = true;
+        return true;
+    }
     if (arg == "-cml" || arg == "--chatml") {
         params.chatml = true;
         return true;
@@ -900,6 +953,10 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         params.cont_batching = true;
        return true;
     }
+    if (arg == "-fa" || arg == "--flash-attn") {
+        params.flash_attn = true;
+        return true;
+    }
     if (arg == "--color") {
         params.use_color = true;
         return true;
@@ -1087,6 +1144,10 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         params.n_print = std::stoi(argv[i]);
         return true;
     }
+    if (arg == "--check-tensors") {
+        params.check_tensors = true;
+        return true;
+    }
     if (arg == "--ppl-output-type") {
         if (++i >= argc) {
             invalid_param = true;
@@ -1238,47 +1299,11 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
             invalid_param = true;
             return true;
         }
-        char * sep = strchr(argv[i], '=');
-        if (sep == nullptr || sep - argv[i] >= 128) {
-            fprintf(stderr, "error: Malformed KV override: %s\n", argv[i]);
-            invalid_param = true;
-            return true;
-        }
-        struct llama_model_kv_override kvo;
-        std::strncpy(kvo.key, argv[i], sep - argv[i]);
-        kvo.key[sep - argv[i]] = 0;
-        sep++;
-        if (strncmp(sep, "int:", 4) == 0) {
-            sep += 4;
-            kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT;
-            kvo.int_value = std::atol(sep);
-        }
-        else if (strncmp(sep, "float:", 6) == 0) {
-            sep += 6;
-            kvo.tag = LLAMA_KV_OVERRIDE_TYPE_FLOAT;
-            kvo.float_value = std::atof(sep);
-        }
-        else if (strncmp(sep, "bool:", 5) == 0) {
-            sep += 5;
-            kvo.tag = LLAMA_KV_OVERRIDE_TYPE_BOOL;
-            if (std::strcmp(sep, "true") == 0) {
-                kvo.bool_value = true;
-            }
-            else if (std::strcmp(sep, "false") == 0) {
-                kvo.bool_value = false;
-            }
-            else {
-                fprintf(stderr, "error: Invalid boolean value for KV override: %s\n", argv[i]);
-                invalid_param = true;
-                return true;
-            }
-        }
-        else {
+        if (!parse_kv_override(argv[i], params.kv_overrides)) {
             fprintf(stderr, "error: Invalid type for KV override: %s\n", argv[i]);
             invalid_param = true;
             return true;
         }
-        params.kv_overrides.push_back(kvo);
         return true;
     }
 #ifndef LOG_DISABLE_LOGS
@@ -1308,6 +1333,29 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
     return false;
 }
 
+void gpt_params_handle_model_default(gpt_params & params) {
+    if (!params.hf_repo.empty()) {
+        // short-hand to avoid specifying --hf-file -> default it to --model
+        if (params.hf_file.empty()) {
+            if (params.model.empty()) {
+                throw std::invalid_argument("error: --hf-repo requires either --hf-file or --model\n");
+            }
+            params.hf_file = params.model;
+        } else if (params.model.empty()) {
+            params.model = "models/" + string_split(params.hf_file, '/').back();
+        }
+    } else if (!params.model_url.empty()) {
+        if (params.model.empty()) {
+            auto f = string_split(params.model_url, '#').front();
+            f = string_split(f, '?').front();
+            f = string_split(f, '/').back();
+            params.model = "models/" + f;
+        }
+    } else if (params.model.empty()) {
+        params.model = DEFAULT_MODEL_PATH;
+    }
+}
+
 bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
     bool invalid_param = false;
     std::string arg;
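The new `gpt_params_handle_model_default` derives a local path from `--model-url` by stripping any `#fragment` and `?query` parts and keeping the last `/` segment. A standalone sketch of that derivation; `string_split` here is a local stand-in for the helper common.cpp already has, and the URL is made up:

```cpp
#include <cstdio>
#include <string>
#include <vector>

// Local stand-in for common.cpp's string_split helper.
static std::vector<std::string> string_split(const std::string & input, char sep) {
    std::vector<std::string> parts;
    size_t begin = 0;
    while (true) {
        size_t end = input.find(sep, begin);
        parts.push_back(input.substr(begin, end - begin));
        if (end == std::string::npos) break;
        begin = end + 1;
    }
    return parts;
}

int main() {
    // Hypothetical URL, chosen only to exercise the '?' and '#' stripping.
    std::string model_url = "https://example.com/repo/model.gguf?download=true#frag";
    auto f = string_split(model_url, '#').front();  // drop the fragment
    f = string_split(f, '?').front();               // drop the query string
    f = string_split(f, '/').back();                // keep the file name
    std::string model = "models/" + f;
    printf("%s\n", model.c_str());                  // -> models/model.gguf
    return 0;
}
```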
@@ -1336,10 +1384,7 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
         throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n");
     }
 
-
-    if (!params.hf_repo.empty() && params.hf_file.empty()) {
-        params.hf_file = params.model;
-    }
+    gpt_params_handle_model_default(params);
 
     if (params.escape) {
         process_escapes(params.prompt);
@@ -1378,6 +1423,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf("  --version             show version and build info\n");
     printf("  -i, --interactive     run in interactive mode\n");
     printf("  --interactive-first   run in interactive mode and wait for input right away\n");
+    printf("  -cnv, --conversation  run in conversation mode (does not print special tokens and suffix/prefix)\n");
     printf("  -ins, --instruct      run in instruction mode (use with Alpaca models)\n");
     printf("  -cml, --chatml        run in chatml mode (use with ChatML-compatible models)\n");
     printf("  --multiline-input     allows you to write or paste multiple lines without ending each in '\\'\n");
@@ -1478,8 +1524,9 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf("  -ns N, --sequences N  number of sequences to decode (default: %d)\n", params.n_sequences);
     printf("  -ps N, --p-split N    speculative decoding split probability (default: %.1f)\n", (double)params.p_split);
     printf("  -cb, --cont-batching  enable continuous batching (a.k.a dynamic batching) (default: disabled)\n");
+    printf("  -fa, --flash-attn     enable Flash Attention (default: %s)\n", params.flash_attn ? "enabled" : "disabled");
     printf("  --mmproj MMPROJ_FILE  path to a multimodal projector file for LLaVA. see examples/llava/README.md\n");
-    printf("  --image IMAGE_FILE    path to an image file. use with multimodal models\n");
+    printf("  --image IMAGE_FILE    path to an image file. use with multimodal models. Specify multiple times for batching\n");
     if (llama_supports_mlock()) {
         printf("  --mlock               force system to keep model in RAM rather than swapping or compressing\n");
     }
@@ -1532,7 +1579,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf("  --control-vector-layer-range START END\n");
     printf("                        layer range to apply the control vector(s) to, start and end inclusive\n");
     printf("  -m FNAME, --model FNAME\n");
-    printf("                        model path (default: %s)\n", params.model.c_str());
+    printf("                        model path (default: models/$filename with filename from --hf-file or --model-url if set, otherwise %s)\n", DEFAULT_MODEL_PATH);
     printf("  -md FNAME, --model-draft FNAME\n");
     printf("                        draft model for speculative decoding (default: unused)\n");
     printf("  -mu MODEL_URL, --model-url MODEL_URL\n");
@@ -1549,9 +1596,10 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf("                        path to dynamic lookup cache to use for lookup decoding (updated by generation)\n");
     printf("  --override-kv KEY=TYPE:VALUE\n");
     printf("                        advanced option to override model metadata by key. may be specified multiple times.\n");
-    printf("                        types: int, float, bool. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n");
+    printf("                        types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n");
     printf("  -ptc N, --print-token-count N\n");
     printf("                        print token count every N tokens (default: %d)\n", params.n_print);
+    printf("  --check-tensors       check model tensor data for invalid values\n");
     printf("\n");
 #ifndef LOG_DISABLE_LOGS
     log_print_usage();
@@ -1676,6 +1724,18 @@ std::vector<std::string> string_split(std::string input, char separator) {
     return parts;
 }
 
+std::string string_strip(const std::string & str) {
+    size_t start = 0;
+    size_t end = str.size();
+    while (start < end && std::isspace(str[start])) {
+        start++;
+    }
+    while (end > start && std::isspace(str[end - 1])) {
+        end--;
+    }
+    return str.substr(start, end - start);
+}
+
 std::vector<llama_sampler_type> sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names) {
     std::unordered_map<std::string, llama_sampler_type> sampler_canonical_name_map {
         {"top_k", llama_sampler_type::TOP_K},
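A quick standalone check of the new `string_strip` helper; the function body is copied from the hunk above so the snippet compiles on its own:

```cpp
#include <cctype>
#include <cstdio>
#include <string>

// Copy of the string_strip added in this release.
static std::string string_strip(const std::string & str) {
    size_t start = 0;
    size_t end = str.size();
    while (start < end && std::isspace(str[start])) {
        start++;
    }
    while (end > start && std::isspace(str[end - 1])) {
        end--;
    }
    return str.substr(start, end - start);
}

int main() {
    printf("[%s]\n", string_strip(" \t top_k;top_p \n").c_str());  // -> [top_k;top_p]
    return 0;
}
```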
@@ -1772,6 +1832,7 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params &
     mparams.tensor_split  = params.tensor_split;
     mparams.use_mmap      = params.use_mmap;
     mparams.use_mlock     = params.use_mlock;
+    mparams.check_tensors = params.check_tensors;
     if (params.kv_overrides.empty()) {
         mparams.kv_overrides = NULL;
     } else {
@@ -1836,6 +1897,7 @@ struct llama_context_params llama_context_params_from_gpt_param
     cparams.cb_eval           = params.cb_eval;
     cparams.cb_eval_user_data = params.cb_eval_user_data;
     cparams.offload_kqv       = !params.no_kv_offload;
+    cparams.flash_attn        = params.flash_attn;
 
     cparams.type_k = kv_cache_type_from_str(params.cache_type_k);
     cparams.type_v = kv_cache_type_from_str(params.cache_type_v);
@@ -1866,59 +1928,75 @@ void llama_batch_add(
 
 #ifdef LLAMA_USE_CURL
 
-static bool llama_download_file(CURL * curl, const char * url, const char * path) {
+static bool starts_with(const std::string & str, const std::string & prefix) {
+    // While we wait for C++20's std::string::starts_with...
+    return str.rfind(prefix, 0) == 0;
+}
+
+static bool llama_download_file(const std::string & url, const std::string & path) {
+
+    // Initialize libcurl
+    std::unique_ptr<CURL, decltype(&curl_easy_cleanup)> curl(curl_easy_init(), &curl_easy_cleanup);
+    if (!curl) {
+        fprintf(stderr, "%s: error initializing libcurl\n", __func__);
+        return false;
+    }
+
     bool force_download = false;
 
     // Set the URL, allow to follow http redirection
-    curl_easy_setopt(curl, CURLOPT_URL, url);
-    curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);
+    curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str());
+    curl_easy_setopt(curl.get(), CURLOPT_FOLLOWLOCATION, 1L);
 
 #if defined(_WIN32)
     // CURLSSLOPT_NATIVE_CA tells libcurl to use standard certificate store of
     // operating system. Currently implemented under MS-Windows.
-    curl_easy_setopt(curl, CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA);
+    curl_easy_setopt(curl.get(), CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA);
 #endif
 
     // Check if the file already exists locally
     struct stat model_file_info;
-    auto file_exists = (stat(path, &model_file_info) == 0);
+    auto file_exists = (stat(path.c_str(), &model_file_info) == 0);
 
-    // If the file exists, check for ${path}.etag or ${path}.lastModified files
-    char etag[LLAMA_CURL_MAX_HEADER_LENGTH] = {0};
-    char etag_path[PATH_MAX] = {0};
-    snprintf(etag_path, sizeof(etag_path), "%s.etag", path);
-
-    char last_modified[LLAMA_CURL_MAX_HEADER_LENGTH] = {0};
-    char last_modified_path[PATH_MAX] = {0};
-    snprintf(last_modified_path, sizeof(last_modified_path), "%s.lastModified", path);
+    // If the file exists, check its JSON metadata companion file.
+    std::string metadata_path = path + ".json";
+    nlohmann::json metadata;
+    std::string etag;
+    std::string last_modified;
 
     if (file_exists) {
-        auto * f_etag = fopen(etag_path, "r");
-        if (f_etag) {
-            if (!fgets(etag, sizeof(etag), f_etag)) {
-                fprintf(stderr, "%s: unable to read file %s\n", __func__, etag_path);
-            } else {
-                fprintf(stderr, "%s: previous file found %s: %s\n", __func__, etag_path, etag);
-            }
-            fclose(f_etag);
-        }
-
-        auto * f_last_modified = fopen(last_modified_path, "r");
-        if (f_last_modified) {
-            if (!fgets(last_modified, sizeof(last_modified), f_last_modified)) {
-                fprintf(stderr, "%s: unable to read file %s\n", __func__, last_modified_path);
-            } else {
-                fprintf(stderr, "%s: previous file found %s: %s\n", __func__, last_modified_path,
-                        last_modified);
+        // Try and read the JSON metadata file (note: stream autoclosed upon exiting this block).
+        std::ifstream metadata_in(metadata_path);
+        if (metadata_in.good()) {
+            try {
+                metadata_in >> metadata;
+                fprintf(stderr, "%s: previous metadata file found %s: %s\n", __func__, metadata_path.c_str(), metadata.dump().c_str());
+                if (metadata.contains("url") && metadata.at("url").is_string()) {
+                    auto previous_url = metadata.at("url").get<std::string>();
+                    if (previous_url != url) {
+                        fprintf(stderr, "%s: Model URL mismatch: %s != %s\n", __func__, url.c_str(), previous_url.c_str());
+                        return false;
+                    }
+                }
+                if (metadata.contains("etag") && metadata.at("etag").is_string()) {
+                    etag = metadata.at("etag");
+                }
+                if (metadata.contains("lastModified") && metadata.at("lastModified").is_string()) {
+                    last_modified = metadata.at("lastModified");
+                }
+            } catch (const nlohmann::json::exception & e) {
+                fprintf(stderr, "%s: error reading metadata file %s: %s\n", __func__, metadata_path.c_str(), e.what());
+                return false;
             }
-            fclose(f_last_modified);
         }
+    } else {
+        fprintf(stderr, "%s: no previous model file found %s\n", __func__, path.c_str());
     }
 
     // Send a HEAD request to retrieve the etag and last-modified headers
     struct llama_load_model_from_url_headers {
-        char etag[LLAMA_CURL_MAX_HEADER_LENGTH] = {0};
-        char last_modified[LLAMA_CURL_MAX_HEADER_LENGTH] = {0};
+        std::string etag;
+        std::string last_modified;
     };
     llama_load_model_from_url_headers headers;
     {
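The main refactor in this hunk is ownership: the `CURL *` handle is now held by a `std::unique_ptr` whose deleter is `curl_easy_cleanup`, so the cleanup calls previously scattered before each early `return false` can be dropped. A minimal standalone sketch of the same idiom, using only `FILE *` and a hypothetical file name:

```cpp
#include <cstdio>
#include <memory>

int main() {
    // RAII handle: fclose runs automatically on every return path
    // (and is skipped entirely if fopen returned null).
    std::unique_ptr<FILE, decltype(&fclose)> outfile(fopen("demo.tmp", "wb"), fclose);
    if (!outfile) {
        return 1;           // early return: nothing to clean up by hand
    }
    fputs("payload", outfile.get());
    outfile.reset();        // close explicitly, e.g. before rename()-ing the file
    return 0;
}
```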
@@ -1926,38 +2004,37 @@ static bool llama_download_file(CURL * curl, const char * url, const char * path
         auto header_callback = [](char * buffer, size_t /*size*/, size_t n_items, void * userdata) -> size_t {
             llama_load_model_from_url_headers *headers = (llama_load_model_from_url_headers *) userdata;
 
-            // Convert header field name to lowercase
-            for (size_t i = 0; i < n_items && buffer[i] != ':'; ++i) {
-                buffer[i] = tolower(buffer[i]);
-            }
-
-            const char * etag_prefix = "etag: ";
-            if (strncmp(buffer, etag_prefix, strlen(etag_prefix)) == 0) {
-                strncpy(headers->etag, buffer + strlen(etag_prefix), n_items - strlen(etag_prefix) - 2); // Remove CRLF
-            }
-
-            const char * last_modified_prefix = "last-modified: ";
-            if (strncmp(buffer, last_modified_prefix, strlen(last_modified_prefix)) == 0) {
-                strncpy(headers->last_modified, buffer + strlen(last_modified_prefix),
-                        n_items - strlen(last_modified_prefix) - 2); // Remove CRLF
+            static std::regex header_regex("([^:]+): (.*)\r\n");
+            static std::regex etag_regex("ETag", std::regex_constants::icase);
+            static std::regex last_modified_regex("Last-Modified", std::regex_constants::icase);
+
+            std::string header(buffer, n_items);
+            std::smatch match;
+            if (std::regex_match(header, match, header_regex)) {
+                const std::string & key = match[1];
+                const std::string & value = match[2];
+                if (std::regex_match(key, match, etag_regex)) {
+                    headers->etag = value;
+                } else if (std::regex_match(key, match, last_modified_regex)) {
+                    headers->last_modified = value;
+                }
             }
             return n_items;
         };
 
-        curl_easy_setopt(curl, CURLOPT_NOBODY, 1L); // will trigger the HEAD verb
-        curl_easy_setopt(curl, CURLOPT_NOPROGRESS, 1L); // hide head request progress
-        curl_easy_setopt(curl, CURLOPT_HEADERFUNCTION, static_cast<CURLOPT_HEADERFUNCTION_PTR>(header_callback));
-        curl_easy_setopt(curl, CURLOPT_HEADERDATA, &headers);
+        curl_easy_setopt(curl.get(), CURLOPT_NOBODY, 1L); // will trigger the HEAD verb
+        curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 1L); // hide head request progress
+        curl_easy_setopt(curl.get(), CURLOPT_HEADERFUNCTION, static_cast<CURLOPT_HEADERFUNCTION_PTR>(header_callback));
+        curl_easy_setopt(curl.get(), CURLOPT_HEADERDATA, &headers);
 
-        CURLcode res = curl_easy_perform(curl);
+        CURLcode res = curl_easy_perform(curl.get());
         if (res != CURLE_OK) {
-            curl_easy_cleanup(curl);
             fprintf(stderr, "%s: curl_easy_perform() failed: %s\n", __func__, curl_easy_strerror(res));
             return false;
         }
 
         long http_code = 0;
-        curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &http_code);
+        curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &http_code);
         if (http_code != 200) {
             // HEAD not supported, we don't know if the file has changed
             // force trigger downloading
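The rewritten `header_callback` replaces manual lowercasing and `strncmp` prefix checks with `std::regex`. A standalone run of the same patterns against one synthetic header line (the ETag value is made up):

```cpp
#include <cstdio>
#include <regex>
#include <string>

int main() {
    // Same patterns the new header_callback uses.
    static std::regex header_regex("([^:]+): (.*)\r\n");
    static std::regex etag_regex("ETag", std::regex_constants::icase);

    std::string header = "etag: \"abc123\"\r\n";  // header names are case-insensitive
    std::smatch match;
    if (std::regex_match(header, match, header_regex)) {
        std::string key   = match[1];              // "etag"
        std::string value = match[2];              // "\"abc123\""
        if (std::regex_match(key, match, etag_regex)) {
            printf("etag value: %s\n", value.c_str());
        }
    }
    return 0;
}
```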
@@ -1966,28 +2043,30 @@ static bool llama_download_file(CURL * curl, const char * url, const char * path
         }
     }
 
-    // If the ETag or the Last-Modified headers are different: trigger a new download
-    bool should_download = !file_exists
-        || force_download
-        || (strlen(headers.etag) > 0 && strcmp(etag, headers.etag) != 0)
-        || (strlen(headers.last_modified) > 0 && strcmp(last_modified, headers.last_modified) != 0);
+    bool should_download = !file_exists || force_download;
+    if (!should_download) {
+        if (!etag.empty() && etag != headers.etag) {
+            fprintf(stderr, "%s: ETag header is different (%s != %s): triggering a new download\n", __func__, etag.c_str(), headers.etag.c_str());
+            should_download = true;
+        } else if (!last_modified.empty() && last_modified != headers.last_modified) {
+            fprintf(stderr, "%s: Last-Modified header is different (%s != %s): triggering a new download\n", __func__, last_modified.c_str(), headers.last_modified.c_str());
+            should_download = true;
+        }
+    }
     if (should_download) {
-        char path_temporary[PATH_MAX] = {0};
-        snprintf(path_temporary, sizeof(path_temporary), "%s.downloadInProgress", path);
+        std::string path_temporary = path + ".downloadInProgress";
         if (file_exists) {
-            fprintf(stderr, "%s: deleting previous downloaded file: %s\n", __func__, path);
-            if (remove(path) != 0) {
-                curl_easy_cleanup(curl);
-                fprintf(stderr, "%s: unable to delete file: %s\n", __func__, path);
+            fprintf(stderr, "%s: deleting previous downloaded file: %s\n", __func__, path.c_str());
+            if (remove(path.c_str()) != 0) {
+                fprintf(stderr, "%s: unable to delete file: %s\n", __func__, path.c_str());
                 return false;
             }
         }
 
         // Set the output file
-        auto * outfile = fopen(path_temporary, "wb");
+        std::unique_ptr<FILE, decltype(&fclose)> outfile(fopen(path_temporary.c_str(), "wb"), fclose);
         if (!outfile) {
-            curl_easy_cleanup(curl);
-            fprintf(stderr, "%s: error opening local file for writing: %s\n", __func__, path);
+            fprintf(stderr, "%s: error opening local file for writing: %s\n", __func__, path.c_str());
             return false;
         }
 
@@ -1995,12 +2074,12 @@ static bool llama_download_file(CURL * curl, const char * url, const char * path
         auto write_callback = [](void * data, size_t size, size_t nmemb, void * fd) -> size_t {
             return fwrite(data, size, nmemb, (FILE *)fd);
         };
-        curl_easy_setopt(curl, CURLOPT_NOBODY, 0L);
-        curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, static_cast<CURLOPT_WRITEFUNCTION_PTR>(write_callback));
-        curl_easy_setopt(curl, CURLOPT_WRITEDATA, outfile);
+        curl_easy_setopt(curl.get(), CURLOPT_NOBODY, 0L);
+        curl_easy_setopt(curl.get(), CURLOPT_WRITEFUNCTION, static_cast<CURLOPT_WRITEFUNCTION_PTR>(write_callback));
+        curl_easy_setopt(curl.get(), CURLOPT_WRITEDATA, outfile.get());
 
         // display download progress
-        curl_easy_setopt(curl, CURLOPT_NOPROGRESS, 0L);
+        curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 0L);
 
         // helper function to hide password in URL
         auto llama_download_hide_password_in_url = [](const std::string & url) -> std::string {
@@ -2019,51 +2098,34 @@ static bool llama_download_file(CURL * curl, const char * url, const char * path
 
         // start the download
         fprintf(stderr, "%s: downloading from %s to %s (server_etag:%s, server_last_modified:%s)...\n", __func__,
-            llama_download_hide_password_in_url(url).c_str(), path, headers.etag, headers.last_modified);
-        auto res = curl_easy_perform(curl);
+            llama_download_hide_password_in_url(url).c_str(), path.c_str(), headers.etag.c_str(), headers.last_modified.c_str());
+        auto res = curl_easy_perform(curl.get());
         if (res != CURLE_OK) {
-            fclose(outfile);
-            curl_easy_cleanup(curl);
             fprintf(stderr, "%s: curl_easy_perform() failed: %s\n", __func__, curl_easy_strerror(res));
             return false;
         }
 
         long http_code = 0;
-        curl_easy_getinfo (curl, CURLINFO_RESPONSE_CODE, &http_code);
+        curl_easy_getinfo (curl.get(), CURLINFO_RESPONSE_CODE, &http_code);
         if (http_code < 200 || http_code >= 400) {
-            fclose(outfile);
-            curl_easy_cleanup(curl);
             fprintf(stderr, "%s: invalid http status code received: %ld\n", __func__, http_code);
             return false;
         }
 
-        // Clean up
-        fclose(outfile);
+        // Causes file to be closed explicitly here before we rename it.
+        outfile.reset();
 
-        // Write the new ETag to the .etag file
-        if (strlen(headers.etag) > 0) {
-            auto * etag_file = fopen(etag_path, "w");
-            if (etag_file) {
-                fputs(headers.etag, etag_file);
-                fclose(etag_file);
-                fprintf(stderr, "%s: file etag saved %s: %s\n", __func__, etag_path, headers.etag);
-            }
-        }
+        // Write the updated JSON metadata file.
+        metadata.update({
+            {"url", url},
+            {"etag", headers.etag},
+            {"lastModified", headers.last_modified}
+        });
+        std::ofstream(metadata_path) << metadata.dump(4);
+        fprintf(stderr, "%s: file metadata saved: %s\n", __func__, metadata_path.c_str());
 
-        // Write the new lastModified to the .lastModified file
-        if (strlen(headers.last_modified) > 0) {
-            auto * last_modified_file = fopen(last_modified_path, "w");
-            if (last_modified_file) {
-                fputs(headers.last_modified, last_modified_file);
-                fclose(last_modified_file);
-                fprintf(stderr, "%s: file last modified saved %s: %s\n", __func__, last_modified_path,
-                        headers.last_modified);
-            }
-        }
-
-        if (rename(path_temporary, path) != 0) {
-            curl_easy_cleanup(curl);
-            fprintf(stderr, "%s: unable to rename file: %s to %s\n", __func__, path_temporary, path);
+        if (rename(path_temporary.c_str(), path.c_str()) != 0) {
+            fprintf(stderr, "%s: unable to rename file: %s to %s\n", __func__, path_temporary.c_str(), path.c_str());
             return false;
         }
     }
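Download state now lives in a single `<model>.json` companion file instead of the old `.etag`/`.lastModified` pair. A sketch of the file the new code writes, using the nlohmann/json single header that common.cpp already includes; the URL, ETag, and date are made-up values:

```cpp
#include <fstream>
#include "json.hpp"  // nlohmann/json, vendored alongside common.cpp in llama.cpp

int main() {
    // All three values are illustrative, not real download metadata.
    nlohmann::ordered_json metadata = {
        {"url",          "https://example.com/model.gguf"},
        {"etag",         "\"5d41402abc4b2a76b9719d911017c592\""},
        {"lastModified", "Tue, 23 Apr 2024 00:00:00 GMT"},
    };
    // The companion file sits next to the model: model.gguf -> model.gguf.json
    std::ofstream("model.gguf.json") << metadata.dump(4);
    return 0;
}
```

On the next download, a mismatch between the stored `etag`/`lastModified` and the server's HEAD response triggers a fresh download; a mismatched `url` aborts instead.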
@@ -2081,15 +2143,7 @@ struct llama_model * llama_load_model_from_url(
         return NULL;
     }
 
-    // Initialize libcurl
-    auto * curl = curl_easy_init();
-
-    if (!curl) {
-        fprintf(stderr, "%s: error initializing libcurl\n", __func__);
-        return NULL;
-    }
-
-    if (!llama_download_file(curl, model_url, path_model)) {
+    if (!llama_download_file(model_url, path_model)) {
         return NULL;
     }
 
@@ -2103,7 +2157,6 @@ struct llama_model * llama_load_model_from_url(
     auto * ctx_gguf = gguf_init_from_file(path_model, gguf_params);
     if (!ctx_gguf) {
         fprintf(stderr, "\n%s: failed to load input GGUF from %s\n", __func__, path_model);
-        curl_easy_cleanup(curl);
         return NULL;
     }
 
@@ -2115,8 +2168,6 @@ struct llama_model * llama_load_model_from_url(
         gguf_free(ctx_gguf);
     }
 
-    curl_easy_cleanup(curl);
-
     if (n_split > 1) {
         char split_prefix[PATH_MAX] = {0};
         char split_url_prefix[LLAMA_CURL_MAX_URL_LENGTH] = {0};
@@ -2147,11 +2198,7 @@ struct llama_model * llama_load_model_from_url(
             char split_url[LLAMA_CURL_MAX_URL_LENGTH] = {0};
             llama_split_path(split_url, sizeof(split_url), split_url_prefix, download_idx, n_split);
 
-            auto * curl = curl_easy_init();
-            bool res = llama_download_file(curl, split_url, split_path);
-            curl_easy_cleanup(curl);
-
-            return res;
+            return llama_download_file(split_url, split_path);
         }, idx));
     }
 
@@ -2326,12 +2373,12 @@ std::vector<llama_token> llama_tokenize(
     return result;
 }
 
-std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token) {
+std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) {
     std::vector<char> result(8, 0);
-    const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size());
+    const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), special);
     if (n_tokens < 0) {
         result.resize(-n_tokens);
-        int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size());
+        int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), special);
         GGML_ASSERT(check == -n_tokens);
     } else {
         result.resize(n_tokens);
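`llama_token_to_piece` keeps its two-call convention: the first call may return the negative of the required buffer size, and the caller resizes and retries. A standalone sketch of that convention with `fake_token_to_piece` standing in for the real API:

```cpp
#include <cstdio>
#include <cstring>
#include <string>
#include <vector>

// Stand-in for llama_token_to_piece: returns -needed_size when the buffer is
// too small, otherwise the number of bytes written.
static int fake_token_to_piece(char * buf, int buf_size) {
    const char * piece = "a-piece-longer-than-8-bytes";
    const int needed = (int) std::strlen(piece);
    if (needed > buf_size) {
        return -needed;
    }
    std::memcpy(buf, piece, needed);
    return needed;
}

int main() {
    std::vector<char> result(8, 0);
    int n_tokens = fake_token_to_piece(result.data(), (int) result.size());
    if (n_tokens < 0) {
        result.resize(-n_tokens);   // grow to the required size and retry
        n_tokens = fake_token_to_piece(result.data(), (int) result.size());
    }
    printf("%s\n", std::string(result.data(), n_tokens).c_str());
    return 0;
}
```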
@@ -2638,7 +2685,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
     fprintf(stream, "mirostat_ent: %f # default: 5.0\n", sparams.mirostat_tau);
     fprintf(stream, "mirostat_lr: %f # default: 0.1\n", sparams.mirostat_eta);
     fprintf(stream, "mlock: %s # default: false\n", params.use_mlock ? "true" : "false");
-    fprintf(stream, "model: %s # default: models/7B/ggml-model-f16.gguf\n", params.model.c_str());
+    fprintf(stream, "model: %s # default: %s\n", params.model.c_str(), DEFAULT_MODEL_PATH);
     fprintf(stream, "model_draft: %s # default:\n", params.model_draft.c_str());
     fprintf(stream, "multiline_input: %s # default: false\n", params.multiline_input ? "true" : "false");
     fprintf(stream, "n_gpu_layers: %d # default: -1\n", params.n_gpu_layers);
@@ -2673,6 +2720,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
     fprintf(stream, "seed: %u # default: -1 (random seed)\n", params.seed);
     fprintf(stream, "simple_io: %s # default: false\n", params.simple_io ? "true" : "false");
     fprintf(stream, "cont_batching: %s # default: false\n", params.cont_batching ? "true" : "false");
+    fprintf(stream, "flash_attn: %s # default: false\n", params.flash_attn ? "true" : "false");
     fprintf(stream, "temp: %f # default: 0.8\n", sparams.temp);
 
     const std::vector<float> tensor_split_vector(params.tensor_split, params.tensor_split + llama_max_devices());