@fugood/llama.node 0.0.1-alpha.4 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (84)
  1. package/CMakeLists.txt +42 -7
  2. package/README.md +10 -0
  3. package/bin/darwin/arm64/default.metallib +0 -0
  4. package/bin/darwin/arm64/llama-node.node +0 -0
  5. package/bin/darwin/x64/default.metallib +0 -0
  6. package/bin/darwin/x64/llama-node.node +0 -0
  7. package/bin/linux/arm64/llama-node.node +0 -0
  8. package/bin/linux/x64/llama-node.node +0 -0
  9. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  10. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  11. package/lib/binding.js +1 -1
  12. package/lib/binding.ts +16 -2
  13. package/lib/index.ts +2 -2
  14. package/package.json +15 -3
  15. package/src/DetokenizeWorker.cpp +22 -0
  16. package/src/DetokenizeWorker.h +19 -0
  17. package/src/EmbeddingWorker.cpp +46 -0
  18. package/src/EmbeddingWorker.h +23 -0
  19. package/src/LlamaCompletionWorker.cpp +5 -1
  20. package/src/LlamaCompletionWorker.h +4 -0
  21. package/src/LlamaContext.cpp +80 -1
  22. package/src/LlamaContext.h +3 -0
  23. package/src/TokenizeWorker.cpp +26 -0
  24. package/src/TokenizeWorker.h +23 -0
  25. package/src/common.hpp +12 -7
  26. package/src/llama.cpp/CMakeLists.txt +13 -7
  27. package/src/llama.cpp/common/common.cpp +221 -173
  28. package/src/llama.cpp/common/common.h +19 -8
  29. package/src/llama.cpp/common/json-schema-to-grammar.h +4 -0
  30. package/src/llama.cpp/common/log.h +2 -2
  31. package/src/llama.cpp/common/sampling.cpp +17 -1
  32. package/src/llama.cpp/common/sampling.h +28 -20
  33. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +17 -11
  34. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +5 -5
  35. package/src/llama.cpp/examples/finetune/finetune.cpp +1 -1
  36. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +15 -4
  37. package/src/llama.cpp/examples/imatrix/imatrix.cpp +72 -39
  38. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +42 -3
  39. package/src/llama.cpp/examples/llava/clip.cpp +74 -23
  40. package/src/llama.cpp/examples/llava/llava-cli.cpp +37 -28
  41. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +0 -1
  42. package/src/llama.cpp/examples/lookup/lookup.cpp +0 -1
  43. package/src/llama.cpp/examples/main/main.cpp +10 -8
  44. package/src/llama.cpp/examples/perplexity/perplexity.cpp +175 -55
  45. package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
  46. package/src/llama.cpp/examples/quantize/quantize.cpp +74 -47
  47. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +1 -1
  48. package/src/llama.cpp/examples/server/server.cpp +97 -86
  49. package/src/llama.cpp/examples/server/utils.hpp +17 -15
  50. package/src/llama.cpp/ggml-backend.c +7 -5
  51. package/src/llama.cpp/ggml-impl.h +339 -4
  52. package/src/llama.cpp/ggml-kompute.cpp +7 -0
  53. package/src/llama.cpp/ggml-opencl.cpp +1 -0
  54. package/src/llama.cpp/ggml-quants.c +302 -293
  55. package/src/llama.cpp/ggml-sycl.cpp +28 -16
  56. package/src/llama.cpp/ggml-vulkan-shaders.hpp +46843 -39205
  57. package/src/llama.cpp/ggml-vulkan.cpp +951 -263
  58. package/src/llama.cpp/ggml.c +1469 -116
  59. package/src/llama.cpp/ggml.h +37 -7
  60. package/src/llama.cpp/llama.cpp +969 -432
  61. package/src/llama.cpp/llama.h +46 -14
  62. package/src/llama.cpp/requirements/requirements-convert-hf-to-gguf-update.txt +2 -0
  63. package/src/llama.cpp/requirements/requirements-convert-hf-to-gguf.txt +0 -1
  64. package/src/llama.cpp/requirements/requirements-convert.txt +2 -2
  65. package/src/llama.cpp/requirements.txt +1 -0
  66. package/src/llama.cpp/sgemm.cpp +134 -103
  67. package/src/llama.cpp/sgemm.h +4 -2
  68. package/src/llama.cpp/tests/CMakeLists.txt +96 -36
  69. package/src/llama.cpp/tests/test-backend-ops.cpp +56 -6
  70. package/src/llama.cpp/tests/test-chat-template.cpp +4 -0
  71. package/src/llama.cpp/tests/test-grammar-integration.cpp +225 -136
  72. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +1 -0
  73. package/src/llama.cpp/tests/test-tokenizer-0.cpp +292 -0
  74. package/src/llama.cpp/tests/{test-tokenizer-1-llama.cpp → test-tokenizer-1-spm.cpp} +1 -1
  75. package/src/llama.cpp/unicode-data.cpp +1188 -656
  76. package/src/llama.cpp/unicode-data.h +4 -3
  77. package/src/llama.cpp/unicode.cpp +590 -49
  78. package/src/llama.cpp/unicode.h +6 -3
  79. package/bin/win32/arm64/llama-node.node +0 -0
  80. package/bin/win32/arm64/node.lib +0 -0
  81. package/bin/win32/x64/llama-node.node +0 -0
  82. package/bin/win32/x64/node.lib +0 -0
  83. package/src/llama.cpp/tests/test-tokenizer-0-falcon.cpp +0 -187
  84. package/src/llama.cpp/tests/test-tokenizer-0-llama.cpp +0 -190
package/src/llama.cpp/common/common.cpp
@@ -1,4 +1,6 @@
  #include "common.h"
+ // Change JSON_ASSERT from assert() to GGML_ASSERT:
+ #define JSON_ASSERT GGML_ASSERT
  #include "json.hpp"
  #include "json-schema-to-grammar.h"
  #include "llama.h"
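
The two added lines only take effect because the #define precedes the include: json.hpp defines JSON_ASSERT itself only when the macro is not already set, falling back to plain assert(). A minimal sketch of the mechanism, under that assumption:

    #define JSON_ASSERT GGML_ASSERT   // must come before the include below
    #include "json.hpp"               // nlohmann::json's internal checks now fail
                                      // through GGML_ASSERT, consistent with ggml/llama.cpp
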
@@ -67,7 +69,6 @@
  #include <sys/syslimits.h>
  #endif
  #define LLAMA_CURL_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083
- #define LLAMA_CURL_MAX_HEADER_LENGTH 256
  #endif // LLAMA_USE_CURL

  using json = nlohmann::ordered_json;
@@ -77,7 +78,7 @@ int32_t get_num_physical_cores() {
  // enumerate the set of thread siblings, num entries is num cores
  std::unordered_set<std::string> siblings;
  for (uint32_t cpu=0; cpu < UINT32_MAX; ++cpu) {
- std::ifstream thread_siblings("/sys/devices/system/cpu"
+ std::ifstream thread_siblings("/sys/devices/system/cpu/cpu"
  + std::to_string(cpu) + "/topology/thread_siblings");
  if (!thread_siblings.is_open()) {
  break; // no more cpus
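
The one-character path fix above is load-bearing: the old concatenation produced /sys/devices/system/cpu0/topology/thread_siblings (the cpu/ directory component was missing), a path that does not exist, so the loop broke on the first iteration and get_num_physical_cores() fell through to its fallback heuristics. A condensed, self-contained sketch of the corrected logic (Linux-only; same dedup idea as the surrounding function):

    #include <cstdint>
    #include <fstream>
    #include <string>
    #include <unordered_set>

    // Every hyperthread of a physical core reports the same sibling string
    // (e.g. "0,4"), so inserting the raw line into a set yields one entry per core.
    static int count_physical_cores_sketch() {
        std::unordered_set<std::string> siblings;
        for (uint32_t cpu = 0; cpu < UINT32_MAX; ++cpu) {
            std::ifstream f("/sys/devices/system/cpu/cpu" + std::to_string(cpu)
                            + "/topology/thread_siblings");
            if (!f.is_open()) {
                break; // no more cpus
            }
            std::string line;
            if (std::getline(f, line)) {
                siblings.insert(line);
            }
        }
        return (int) siblings.size();
    }
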
@@ -234,15 +235,63 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
  return result;
  }

+ bool parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides) {
+ const char * sep = strchr(data, '=');
+ if (sep == nullptr || sep - data >= 128) {
+ fprintf(stderr, "%s: malformed KV override '%s'\n", __func__, data);
+ return false;
+ }
+ llama_model_kv_override kvo;
+ std::strncpy(kvo.key, data, sep - data);
+ kvo.key[sep - data] = 0;
+ sep++;
+ if (strncmp(sep, "int:", 4) == 0) {
+ sep += 4;
+ kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT;
+ kvo.val_i64 = std::atol(sep);
+ } else if (strncmp(sep, "float:", 6) == 0) {
+ sep += 6;
+ kvo.tag = LLAMA_KV_OVERRIDE_TYPE_FLOAT;
+ kvo.val_f64 = std::atof(sep);
+ } else if (strncmp(sep, "bool:", 5) == 0) {
+ sep += 5;
+ kvo.tag = LLAMA_KV_OVERRIDE_TYPE_BOOL;
+ if (std::strcmp(sep, "true") == 0) {
+ kvo.val_bool = true;
+ } else if (std::strcmp(sep, "false") == 0) {
+ kvo.val_bool = false;
+ } else {
+ fprintf(stderr, "%s: invalid boolean value for KV override '%s'\n", __func__, data);
+ return false;
+ }
+ } else if (strncmp(sep, "str:", 4) == 0) {
+ sep += 4;
+ kvo.tag = LLAMA_KV_OVERRIDE_TYPE_STR;
+ if (strlen(sep) > 127) {
+ fprintf(stderr, "%s: malformed KV override '%s', value cannot exceed 127 chars\n", __func__, data);
+ return false;
+ }
+ strncpy(kvo.val_str, sep, 127);
+ kvo.val_str[127] = '\0';
+ } else {
+ fprintf(stderr, "%s: invalid type for KV override '%s'\n", __func__, data);
+ return false;
+ }
+ overrides.emplace_back(std::move(kvo));
+ return true;
+ }
+
  bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param) {
- llama_sampling_params& sparams = params.sparams;
+ llama_sampling_params & sparams = params.sparams;

  if (arg == "-s" || arg == "--seed") {
  if (++i >= argc) {
  invalid_param = true;
  return true;
  }
+ // This is temporary, in the future the samplign state will be moved fully to llama_sampling_context.
  params.seed = std::stoul(argv[i]);
+ sparams.seed = std::stoul(argv[i]);
  return true;
  }
  if (arg == "-t" || arg == "--threads") {
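
The extracted helper accepts the same KEY=TYPE:VALUE syntax as --override-kv, now including a str type whose value is capped at 127 characters by the fixed-size val_str field. A minimal driver sketch, assuming the function is exported through common.h as that header's diff stats suggest (keys are illustrative):

    #include <cstdio>
    #include <vector>
    #include "common.h"

    int main() {
        std::vector<llama_model_kv_override> overrides;
        const char * args[] = {                        // each entry mirrors one --override-kv argument
            "tokenizer.ggml.add_bos_token=bool:false", // -> LLAMA_KV_OVERRIDE_TYPE_BOOL
            "general.name=str:my-model",               // -> LLAMA_KV_OVERRIDE_TYPE_STR (new type)
            "some.numeric.key=int:42",                 // -> LLAMA_KV_OVERRIDE_TYPE_INT, val_i64 = 42
        };
        for (const char * a : args) {
            if (!parse_kv_override(a, overrides)) {
                fprintf(stderr, "rejected: %s\n", a);
                return 1;
            }
        }
        return 0;
    }
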
@@ -845,7 +894,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
  invalid_param = true;
  return true;
  }
- params.image = argv[i];
+ params.image.emplace_back(argv[i]);
  return true;
  }
  if (arg == "-i" || arg == "--interactive") {
@@ -864,6 +913,10 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
  params.instruct = true;
  return true;
  }
+ if (arg == "-cnv" || arg == "--conversation") {
+ params.conversation = true;
+ return true;
+ }
  if (arg == "-cml" || arg == "--chatml") {
  params.chatml = true;
  return true;
@@ -900,6 +953,10 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
  params.cont_batching = true;
  return true;
  }
+ if (arg == "-fa" || arg == "--flash-attn") {
+ params.flash_attn = true;
+ return true;
+ }
  if (arg == "--color") {
  params.use_color = true;
  return true;
@@ -1087,6 +1144,10 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
  params.n_print = std::stoi(argv[i]);
  return true;
  }
+ if (arg == "--check-tensors") {
+ params.check_tensors = true;
+ return true;
+ }
  if (arg == "--ppl-output-type") {
  if (++i >= argc) {
  invalid_param = true;
@@ -1238,47 +1299,11 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
  invalid_param = true;
  return true;
  }
- char* sep = strchr(argv[i], '=');
- if (sep == nullptr || sep - argv[i] >= 128) {
- fprintf(stderr, "error: Malformed KV override: %s\n", argv[i]);
- invalid_param = true;
- return true;
- }
- struct llama_model_kv_override kvo;
- std::strncpy(kvo.key, argv[i], sep - argv[i]);
- kvo.key[sep - argv[i]] = 0;
- sep++;
- if (strncmp(sep, "int:", 4) == 0) {
- sep += 4;
- kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT;
- kvo.int_value = std::atol(sep);
- }
- else if (strncmp(sep, "float:", 6) == 0) {
- sep += 6;
- kvo.tag = LLAMA_KV_OVERRIDE_TYPE_FLOAT;
- kvo.float_value = std::atof(sep);
- }
- else if (strncmp(sep, "bool:", 5) == 0) {
- sep += 5;
- kvo.tag = LLAMA_KV_OVERRIDE_TYPE_BOOL;
- if (std::strcmp(sep, "true") == 0) {
- kvo.bool_value = true;
- }
- else if (std::strcmp(sep, "false") == 0) {
- kvo.bool_value = false;
- }
- else {
- fprintf(stderr, "error: Invalid boolean value for KV override: %s\n", argv[i]);
- invalid_param = true;
- return true;
- }
- }
- else {
+ if (!parse_kv_override(argv[i], params.kv_overrides)) {
  fprintf(stderr, "error: Invalid type for KV override: %s\n", argv[i]);
  invalid_param = true;
  return true;
  }
- params.kv_overrides.push_back(kvo);
  return true;
  }
  #ifndef LOG_DISABLE_LOGS
@@ -1308,6 +1333,29 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
  return false;
  }

+ void gpt_params_handle_model_default(gpt_params & params) {
+ if (!params.hf_repo.empty()) {
+ // short-hand to avoid specifying --hf-file -> default it to --model
+ if (params.hf_file.empty()) {
+ if (params.model.empty()) {
+ throw std::invalid_argument("error: --hf-repo requires either --hf-file or --model\n");
+ }
+ params.hf_file = params.model;
+ } else if (params.model.empty()) {
+ params.model = "models/" + string_split(params.hf_file, '/').back();
+ }
+ } else if (!params.model_url.empty()) {
+ if (params.model.empty()) {
+ auto f = string_split(params.model_url, '#').front();
+ f = string_split(f, '?').front();
+ f = string_split(f, '/').back();
+ params.model = "models/" + f;
+ }
+ } else if (params.model.empty()) {
+ params.model = DEFAULT_MODEL_PATH;
+ }
+ }
+
  bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
  bool invalid_param = false;
  std::string arg;
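
Traced with illustrative arguments (repo, file, and host names are hypothetical), the new defaulting rules resolve as follows:

    // --hf-repo user/repo --hf-file llama-q4.gguf      -> model = "models/llama-q4.gguf"
    // --hf-repo user/repo --model my.gguf              -> hf_file = "my.gguf", model unchanged
    // --hf-repo user/repo alone                        -> throws std::invalid_argument
    // --model-url https://host/d/llama.gguf?x=1#frag   -> model = "models/llama.gguf"
    //                                                     (fragment, then query string, stripped first)
    // none of -m, -mu, --hf-repo                       -> model = DEFAULT_MODEL_PATH
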
@@ -1336,10 +1384,7 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
  throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n");
  }

- // short-hand to avoid specifying --hf-file -> default it to --model
- if (!params.hf_repo.empty() && params.hf_file.empty()) {
- params.hf_file = params.model;
- }
+ gpt_params_handle_model_default(params);

  if (params.escape) {
  process_escapes(params.prompt);
@@ -1378,6 +1423,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
  printf(" --version show version and build info\n");
  printf(" -i, --interactive run in interactive mode\n");
  printf(" --interactive-first run in interactive mode and wait for input right away\n");
+ printf(" -cnv, --conversation run in conversation mode (does not print special tokens and suffix/prefix)\n");
  printf(" -ins, --instruct run in instruction mode (use with Alpaca models)\n");
  printf(" -cml, --chatml run in chatml mode (use with ChatML-compatible models)\n");
  printf(" --multiline-input allows you to write or paste multiple lines without ending each in '\\'\n");
@@ -1478,8 +1524,9 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
  printf(" -ns N, --sequences N number of sequences to decode (default: %d)\n", params.n_sequences);
  printf(" -ps N, --p-split N speculative decoding split probability (default: %.1f)\n", (double)params.p_split);
  printf(" -cb, --cont-batching enable continuous batching (a.k.a dynamic batching) (default: disabled)\n");
+ printf(" -fa, --flash-attn enable Flash Attention (default: %s)\n", params.flash_attn ? "enabled" : "disabled");
  printf(" --mmproj MMPROJ_FILE path to a multimodal projector file for LLaVA. see examples/llava/README.md\n");
- printf(" --image IMAGE_FILE path to an image file. use with multimodal models\n");
+ printf(" --image IMAGE_FILE path to an image file. use with multimodal models. Specify multiple times for batching\n");
  if (llama_supports_mlock()) {
  printf(" --mlock force system to keep model in RAM rather than swapping or compressing\n");
  }
@@ -1532,7 +1579,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
  printf(" --control-vector-layer-range START END\n");
  printf(" layer range to apply the control vector(s) to, start and end inclusive\n");
  printf(" -m FNAME, --model FNAME\n");
- printf(" model path (default: %s)\n", params.model.c_str());
+ printf(" model path (default: models/$filename with filename from --hf-file or --model-url if set, otherwise %s)\n", DEFAULT_MODEL_PATH);
  printf(" -md FNAME, --model-draft FNAME\n");
  printf(" draft model for speculative decoding (default: unused)\n");
  printf(" -mu MODEL_URL, --model-url MODEL_URL\n");
@@ -1549,9 +1596,10 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
  printf(" path to dynamic lookup cache to use for lookup decoding (updated by generation)\n");
  printf(" --override-kv KEY=TYPE:VALUE\n");
  printf(" advanced option to override model metadata by key. may be specified multiple times.\n");
- printf(" types: int, float, bool. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n");
+ printf(" types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n");
  printf(" -ptc N, --print-token-count N\n");
  printf(" print token count every N tokens (default: %d)\n", params.n_print);
+ printf(" --check-tensors check model tensor data for invalid values\n");
  printf("\n");
  #ifndef LOG_DISABLE_LOGS
  log_print_usage();
@@ -1676,6 +1724,18 @@ std::vector<std::string> string_split(std::string input, char separator) {
  return parts;
  }

+ std::string string_strip(const std::string & str) {
+ size_t start = 0;
+ size_t end = str.size();
+ while (start < end && std::isspace(str[start])) {
+ start++;
+ }
+ while (end > start && std::isspace(str[end - 1])) {
+ end--;
+ }
+ return str.substr(start, end - start);
+ }
+
  std::vector<llama_sampler_type> sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names) {
  std::unordered_map<std::string, llama_sampler_type> sampler_canonical_name_map {
  {"top_k", llama_sampler_type::TOP_K},
@@ -1772,6 +1832,7 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params &
  mparams.tensor_split = params.tensor_split;
  mparams.use_mmap = params.use_mmap;
  mparams.use_mlock = params.use_mlock;
+ mparams.check_tensors = params.check_tensors;
  if (params.kv_overrides.empty()) {
  mparams.kv_overrides = NULL;
  } else {
@@ -1836,6 +1897,7 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
  cparams.cb_eval = params.cb_eval;
  cparams.cb_eval_user_data = params.cb_eval_user_data;
  cparams.offload_kqv = !params.no_kv_offload;
+ cparams.flash_attn = params.flash_attn;

  cparams.type_k = kv_cache_type_from_str(params.cache_type_k);
  cparams.type_v = kv_cache_type_from_str(params.cache_type_v);
@@ -1866,59 +1928,75 @@ void llama_batch_add(

  #ifdef LLAMA_USE_CURL

- static bool llama_download_file(CURL * curl, const char * url, const char * path) {
+ static bool starts_with(const std::string & str, const std::string & prefix) {
+ // While we wait for C++20's std::string::starts_with...
+ return str.rfind(prefix, 0) == 0;
+ }
+
+ static bool llama_download_file(const std::string & url, const std::string & path) {
+
+ // Initialize libcurl
+ std::unique_ptr<CURL, decltype(&curl_easy_cleanup)> curl(curl_easy_init(), &curl_easy_cleanup);
+ if (!curl) {
+ fprintf(stderr, "%s: error initializing libcurl\n", __func__);
+ return false;
+ }
+
  bool force_download = false;

  // Set the URL, allow to follow http redirection
- curl_easy_setopt(curl, CURLOPT_URL, url);
- curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);
+ curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str());
+ curl_easy_setopt(curl.get(), CURLOPT_FOLLOWLOCATION, 1L);

  #if defined(_WIN32)
  // CURLSSLOPT_NATIVE_CA tells libcurl to use standard certificate store of
  // operating system. Currently implemented under MS-Windows.
- curl_easy_setopt(curl, CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA);
+ curl_easy_setopt(curl.get(), CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA);
  #endif

  // Check if the file already exists locally
  struct stat model_file_info;
- auto file_exists = (stat(path, &model_file_info) == 0);
+ auto file_exists = (stat(path.c_str(), &model_file_info) == 0);

- // If the file exists, check for ${path_model}.etag or ${path_model}.lastModified files
- char etag[LLAMA_CURL_MAX_HEADER_LENGTH] = {0};
- char etag_path[PATH_MAX] = {0};
- snprintf(etag_path, sizeof(etag_path), "%s.etag", path);
-
- char last_modified[LLAMA_CURL_MAX_HEADER_LENGTH] = {0};
- char last_modified_path[PATH_MAX] = {0};
- snprintf(last_modified_path, sizeof(last_modified_path), "%s.lastModified", path);
+ // If the file exists, check its JSON metadata companion file.
+ std::string metadata_path = path + ".json";
+ nlohmann::json metadata;
+ std::string etag;
+ std::string last_modified;

  if (file_exists) {
- auto * f_etag = fopen(etag_path, "r");
- if (f_etag) {
- if (!fgets(etag, sizeof(etag), f_etag)) {
- fprintf(stderr, "%s: unable to read file %s\n", __func__, etag_path);
- } else {
- fprintf(stderr, "%s: previous file found %s: %s\n", __func__, etag_path, etag);
- }
- fclose(f_etag);
- }
-
- auto * f_last_modified = fopen(last_modified_path, "r");
- if (f_last_modified) {
- if (!fgets(last_modified, sizeof(last_modified), f_last_modified)) {
- fprintf(stderr, "%s: unable to read file %s\n", __func__, last_modified_path);
- } else {
- fprintf(stderr, "%s: previous file found %s: %s\n", __func__, last_modified_path,
- last_modified);
+ // Try and read the JSON metadata file (note: stream autoclosed upon exiting this block).
+ std::ifstream metadata_in(metadata_path);
+ if (metadata_in.good()) {
+ try {
+ metadata_in >> metadata;
+ fprintf(stderr, "%s: previous metadata file found %s: %s\n", __func__, metadata_path.c_str(), metadata.dump().c_str());
+ if (metadata.contains("url") && metadata.at("url").is_string()) {
+ auto previous_url = metadata.at("url").get<std::string>();
+ if (previous_url != url) {
+ fprintf(stderr, "%s: Model URL mismatch: %s != %s\n", __func__, url.c_str(), previous_url.c_str());
+ return false;
+ }
+ }
+ if (metadata.contains("etag") && metadata.at("etag").is_string()) {
+ etag = metadata.at("etag");
+ }
+ if (metadata.contains("lastModified") && metadata.at("lastModified").is_string()) {
+ last_modified = metadata.at("lastModified");
+ }
+ } catch (const nlohmann::json::exception & e) {
+ fprintf(stderr, "%s: error reading metadata file %s: %s\n", __func__, metadata_path.c_str(), e.what());
+ return false;
  }
- fclose(f_last_modified);
  }
+ } else {
+ fprintf(stderr, "%s: no previous model file found %s\n", __func__, path.c_str());
  }

  // Send a HEAD request to retrieve the etag and last-modified headers
  struct llama_load_model_from_url_headers {
- char etag[LLAMA_CURL_MAX_HEADER_LENGTH] = {0};
- char last_modified[LLAMA_CURL_MAX_HEADER_LENGTH] = {0};
+ std::string etag;
+ std::string last_modified;
  };
  llama_load_model_from_url_headers headers;
  {
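
The unique_ptr with curl_easy_cleanup as its custom deleter is what allows every later early return to drop its manual cleanup call, several of which the old code had to repeat. The idiom in isolation, as a sketch:

    #include <curl/curl.h>
    #include <memory>

    static bool head_request_sketch(const char * url) {
        // the handle is released on every exit path, including early returns
        std::unique_ptr<CURL, decltype(&curl_easy_cleanup)> curl(curl_easy_init(), &curl_easy_cleanup);
        if (!curl) {
            return false;
        }
        curl_easy_setopt(curl.get(), CURLOPT_URL, url);   // raw handle via .get()
        curl_easy_setopt(curl.get(), CURLOPT_NOBODY, 1L);
        return curl_easy_perform(curl.get()) == CURLE_OK; // cleanup also runs here
    }
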
@@ -1926,38 +2004,37 @@ static bool llama_download_file(CURL * curl, const char * url, const char * path
  auto header_callback = [](char * buffer, size_t /*size*/, size_t n_items, void * userdata) -> size_t {
  llama_load_model_from_url_headers *headers = (llama_load_model_from_url_headers *) userdata;

- // Convert header field name to lowercase
- for (size_t i = 0; i < n_items && buffer[i] != ':'; ++i) {
- buffer[i] = tolower(buffer[i]);
- }
-
- const char * etag_prefix = "etag: ";
- if (strncmp(buffer, etag_prefix, strlen(etag_prefix)) == 0) {
- strncpy(headers->etag, buffer + strlen(etag_prefix), n_items - strlen(etag_prefix) - 2); // Remove CRLF
- }
-
- const char * last_modified_prefix = "last-modified: ";
- if (strncmp(buffer, last_modified_prefix, strlen(last_modified_prefix)) == 0) {
- strncpy(headers->last_modified, buffer + strlen(last_modified_prefix),
- n_items - strlen(last_modified_prefix) - 2); // Remove CRLF
+ static std::regex header_regex("([^:]+): (.*)\r\n");
+ static std::regex etag_regex("ETag", std::regex_constants::icase);
+ static std::regex last_modified_regex("Last-Modified", std::regex_constants::icase);
+
+ std::string header(buffer, n_items);
+ std::smatch match;
+ if (std::regex_match(header, match, header_regex)) {
+ const std::string & key = match[1];
+ const std::string & value = match[2];
+ if (std::regex_match(key, match, etag_regex)) {
+ headers->etag = value;
+ } else if (std::regex_match(key, match, last_modified_regex)) {
+ headers->last_modified = value;
+ }
  }
  return n_items;
  };

- curl_easy_setopt(curl, CURLOPT_NOBODY, 1L); // will trigger the HEAD verb
- curl_easy_setopt(curl, CURLOPT_NOPROGRESS, 1L); // hide head request progress
- curl_easy_setopt(curl, CURLOPT_HEADERFUNCTION, static_cast<CURLOPT_HEADERFUNCTION_PTR>(header_callback));
- curl_easy_setopt(curl, CURLOPT_HEADERDATA, &headers);
+ curl_easy_setopt(curl.get(), CURLOPT_NOBODY, 1L); // will trigger the HEAD verb
+ curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 1L); // hide head request progress
+ curl_easy_setopt(curl.get(), CURLOPT_HEADERFUNCTION, static_cast<CURLOPT_HEADERFUNCTION_PTR>(header_callback));
+ curl_easy_setopt(curl.get(), CURLOPT_HEADERDATA, &headers);

- CURLcode res = curl_easy_perform(curl);
+ CURLcode res = curl_easy_perform(curl.get());
  if (res != CURLE_OK) {
- curl_easy_cleanup(curl);
  fprintf(stderr, "%s: curl_easy_perform() failed: %s\n", __func__, curl_easy_strerror(res));
  return false;
  }

  long http_code = 0;
- curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &http_code);
+ curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &http_code);
  if (http_code != 200) {
  // HEAD not supported, we don't know if the file has changed
  // force trigger downloading
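
To make the regex rewrite concrete: each header line reaches the callback verbatim, CRLF included; header_regex splits it into key and value, and the two icase regexes match the key whatever the server's capitalization (the old code lowercased the buffer in place and compared prefixes). Illustrative traces:

    // "ETag: \"5c3f0a9\"\r\n"           -> key = "ETag",          value = "\"5c3f0a9\""
    // "last-modified: Thu, 25 ...\r\n"  -> key = "last-modified", value = "Thu, 25 ..."
    // "X-Anything: else\r\n"            -> key matches neither regex; header ignored
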
@@ -1966,28 +2043,30 @@ static bool llama_download_file(CURL * curl, const char * url, const char * path
  }
  }

- // If the ETag or the Last-Modified headers are different: trigger a new download
- bool should_download = !file_exists
- || force_download
- || (strlen(headers.etag) > 0 && strcmp(etag, headers.etag) != 0)
- || (strlen(headers.last_modified) > 0 && strcmp(last_modified, headers.last_modified) != 0);
+ bool should_download = !file_exists || force_download;
+ if (!should_download) {
+ if (!etag.empty() && etag != headers.etag) {
+ fprintf(stderr, "%s: ETag header is different (%s != %s): triggering a new download\n", __func__, etag.c_str(), headers.etag.c_str());
+ should_download = true;
+ } else if (!last_modified.empty() && last_modified != headers.last_modified) {
+ fprintf(stderr, "%s: Last-Modified header is different (%s != %s): triggering a new download\n", __func__, last_modified.c_str(), headers.last_modified.c_str());
+ should_download = true;
+ }
+ }
  if (should_download) {
- char path_temporary[PATH_MAX] = {0};
- snprintf(path_temporary, sizeof(path_temporary), "%s.downloadInProgress", path);
+ std::string path_temporary = path + ".downloadInProgress";
  if (file_exists) {
- fprintf(stderr, "%s: deleting previous downloaded file: %s\n", __func__, path);
- if (remove(path) != 0) {
- curl_easy_cleanup(curl);
- fprintf(stderr, "%s: unable to delete file: %s\n", __func__, path);
+ fprintf(stderr, "%s: deleting previous downloaded file: %s\n", __func__, path.c_str());
+ if (remove(path.c_str()) != 0) {
+ fprintf(stderr, "%s: unable to delete file: %s\n", __func__, path.c_str());
  return false;
  }
  }

  // Set the output file
- auto * outfile = fopen(path_temporary, "wb");
+ std::unique_ptr<FILE, decltype(&fclose)> outfile(fopen(path_temporary.c_str(), "wb"), fclose);
  if (!outfile) {
- curl_easy_cleanup(curl);
- fprintf(stderr, "%s: error opening local file for writing: %s\n", __func__, path);
+ fprintf(stderr, "%s: error opening local file for writing: %s\n", __func__, path.c_str());
  return false;
  }
@@ -1995,12 +2074,12 @@ static bool llama_download_file(CURL * curl, const char * url, const char * path
  auto write_callback = [](void * data, size_t size, size_t nmemb, void * fd) -> size_t {
  return fwrite(data, size, nmemb, (FILE *)fd);
  };
- curl_easy_setopt(curl, CURLOPT_NOBODY, 0L);
- curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, static_cast<CURLOPT_WRITEFUNCTION_PTR>(write_callback));
- curl_easy_setopt(curl, CURLOPT_WRITEDATA, outfile);
+ curl_easy_setopt(curl.get(), CURLOPT_NOBODY, 0L);
+ curl_easy_setopt(curl.get(), CURLOPT_WRITEFUNCTION, static_cast<CURLOPT_WRITEFUNCTION_PTR>(write_callback));
+ curl_easy_setopt(curl.get(), CURLOPT_WRITEDATA, outfile.get());

  // display download progress
- curl_easy_setopt(curl, CURLOPT_NOPROGRESS, 0L);
+ curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 0L);

  // helper function to hide password in URL
  auto llama_download_hide_password_in_url = [](const std::string & url) -> std::string {
@@ -2019,51 +2098,34 @@ static bool llama_download_file(CURL * curl, const char * url, const char * path

  // start the download
  fprintf(stderr, "%s: downloading from %s to %s (server_etag:%s, server_last_modified:%s)...\n", __func__,
- llama_download_hide_password_in_url(url).c_str(), path, headers.etag, headers.last_modified);
- auto res = curl_easy_perform(curl);
+ llama_download_hide_password_in_url(url).c_str(), path.c_str(), headers.etag.c_str(), headers.last_modified.c_str());
+ auto res = curl_easy_perform(curl.get());
  if (res != CURLE_OK) {
- fclose(outfile);
- curl_easy_cleanup(curl);
  fprintf(stderr, "%s: curl_easy_perform() failed: %s\n", __func__, curl_easy_strerror(res));
  return false;
  }

  long http_code = 0;
- curl_easy_getinfo (curl, CURLINFO_RESPONSE_CODE, &http_code);
+ curl_easy_getinfo (curl.get(), CURLINFO_RESPONSE_CODE, &http_code);
  if (http_code < 200 || http_code >= 400) {
- fclose(outfile);
- curl_easy_cleanup(curl);
  fprintf(stderr, "%s: invalid http status code received: %ld\n", __func__, http_code);
  return false;
  }

- // Clean up
- fclose(outfile);
+ // Causes file to be closed explicitly here before we rename it.
+ outfile.reset();

- // Write the new ETag to the .etag file
- if (strlen(headers.etag) > 0) {
- auto * etag_file = fopen(etag_path, "w");
- if (etag_file) {
- fputs(headers.etag, etag_file);
- fclose(etag_file);
- fprintf(stderr, "%s: file etag saved %s: %s\n", __func__, etag_path, headers.etag);
- }
- }
+ // Write the updated JSON metadata file.
+ metadata.update({
+ {"url", url},
+ {"etag", headers.etag},
+ {"lastModified", headers.last_modified}
+ });
+ std::ofstream(metadata_path) << metadata.dump(4);
+ fprintf(stderr, "%s: file metadata saved: %s\n", __func__, metadata_path.c_str());

- // Write the new lastModified to the .etag file
- if (strlen(headers.last_modified) > 0) {
- auto * last_modified_file = fopen(last_modified_path, "w");
- if (last_modified_file) {
- fputs(headers.last_modified, last_modified_file);
- fclose(last_modified_file);
- fprintf(stderr, "%s: file last modified saved %s: %s\n", __func__, last_modified_path,
- headers.last_modified);
- }
- }
-
- if (rename(path_temporary, path) != 0) {
- curl_easy_cleanup(curl);
- fprintf(stderr, "%s: unable to rename file: %s to %s\n", __func__, path_temporary, path);
+ if (rename(path_temporary.c_str(), path.c_str()) != 0) {
+ fprintf(stderr, "%s: unable to rename file: %s to %s\n", __func__, path_temporary.c_str(), path.c_str());
  return false;
  }
  }
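
The single <path>.json companion replaces the previous pair of .etag and .lastModified sidecar files, and additionally records the source URL so a cached file is never silently reused for a different download. Roughly what metadata.dump(4) produces, with illustrative values:

    // models/llama.gguf.json
    // {
    //     "etag": "\"0123abcd\"",
    //     "lastModified": "Thu, 25 Apr 2024 12:00:00 GMT",
    //     "url": "https://example.com/llama.gguf"
    // }
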
@@ -2081,15 +2143,7 @@ struct llama_model * llama_load_model_from_url(
  return NULL;
  }

- // Initialize libcurl
- auto * curl = curl_easy_init();
-
- if (!curl) {
- fprintf(stderr, "%s: error initializing libcurl\n", __func__);
- return NULL;
- }
-
- if (!llama_download_file(curl, model_url, path_model)) {
+ if (!llama_download_file(model_url, path_model)) {
  return NULL;
  }

@@ -2103,7 +2157,6 @@ struct llama_model * llama_load_model_from_url(
  auto * ctx_gguf = gguf_init_from_file(path_model, gguf_params);
  if (!ctx_gguf) {
  fprintf(stderr, "\n%s: failed to load input GGUF from %s\n", __func__, path_model);
- curl_easy_cleanup(curl);
  return NULL;
  }

@@ -2115,8 +2168,6 @@ struct llama_model * llama_load_model_from_url(
  gguf_free(ctx_gguf);
  }

- curl_easy_cleanup(curl);
-
  if (n_split > 1) {
  char split_prefix[PATH_MAX] = {0};
  char split_url_prefix[LLAMA_CURL_MAX_URL_LENGTH] = {0};
@@ -2147,11 +2198,7 @@ struct llama_model * llama_load_model_from_url(
  char split_url[LLAMA_CURL_MAX_URL_LENGTH] = {0};
  llama_split_path(split_url, sizeof(split_url), split_url_prefix, download_idx, n_split);

- auto * curl = curl_easy_init();
- bool res = llama_download_file(curl, split_url, split_path);
- curl_easy_cleanup(curl);
-
- return res;
+ return llama_download_file(split_url, split_path);
  }, idx));
  }

@@ -2326,12 +2373,12 @@ std::vector<llama_token> llama_tokenize(
  return result;
  }

- std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token) {
+ std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) {
  std::vector<char> result(8, 0);
- const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), true);
+ const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), special);
  if (n_tokens < 0) {
  result.resize(-n_tokens);
- int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), true);
+ int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), special);
  GGML_ASSERT(check == -n_tokens);
  } else {
  result.resize(n_tokens);
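
The wrapper keeps the C API's two-pass sizing convention: the inner llama_token_to_piece returns the piece length, or the negated required length when the initial 8-byte buffer is too small, in which case the buffer is resized and the call repeated. The change threads a special flag through (previously hardcoded to true) so callers decide whether special/control tokens are rendered. Call-site sketch, assuming an existing ctx and token:

    // std::string shown  = llama_token_to_piece(ctx, token, /*special=*/true);  // special tokens rendered as text
    // std::string hidden = llama_token_to_piece(ctx, token, /*special=*/false); // typically empty for control tokens
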
@@ -2638,7 +2685,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
  fprintf(stream, "mirostat_ent: %f # default: 5.0\n", sparams.mirostat_tau);
  fprintf(stream, "mirostat_lr: %f # default: 0.1\n", sparams.mirostat_eta);
  fprintf(stream, "mlock: %s # default: false\n", params.use_mlock ? "true" : "false");
- fprintf(stream, "model: %s # default: models/7B/ggml-model.bin\n", params.model.c_str());
+ fprintf(stream, "model: %s # default: %s\n", params.model.c_str(), DEFAULT_MODEL_PATH);
  fprintf(stream, "model_draft: %s # default:\n", params.model_draft.c_str());
  fprintf(stream, "multiline_input: %s # default: false\n", params.multiline_input ? "true" : "false");
  fprintf(stream, "n_gpu_layers: %d # default: -1\n", params.n_gpu_layers);
@@ -2673,6 +2720,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
  fprintf(stream, "seed: %u # default: -1 (random seed)\n", params.seed);
  fprintf(stream, "simple_io: %s # default: false\n", params.simple_io ? "true" : "false");
  fprintf(stream, "cont_batching: %s # default: false\n", params.cont_batching ? "true" : "false");
+ fprintf(stream, "flash_attn: %s # default: false\n", params.flash_attn ? "true" : "false");
  fprintf(stream, "temp: %f # default: 0.8\n", sparams.temp);

  const std::vector<float> tensor_split_vector(params.tensor_split, params.tensor_split + llama_max_devices());