@fugood/llama.node 1.2.2 → 1.2.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66)
  1. package/package.json +14 -14
  2. package/scripts/llama.cpp.patch +33 -11
  3. package/src/llama.cpp/CMakeLists.txt +1 -0
  4. package/src/llama.cpp/common/CMakeLists.txt +46 -2
  5. package/src/llama.cpp/common/arg.cpp +423 -186
  6. package/src/llama.cpp/common/arg.h +0 -1
  7. package/src/llama.cpp/common/chat-parser.cpp +154 -13
  8. package/src/llama.cpp/common/chat-parser.h +3 -0
  9. package/src/llama.cpp/common/chat.cpp +217 -6
  10. package/src/llama.cpp/common/chat.h +5 -3
  11. package/src/llama.cpp/common/common.cpp +23 -6
  12. package/src/llama.cpp/common/common.h +6 -4
  13. package/src/llama.cpp/common/http.h +73 -0
  14. package/src/llama.cpp/common/sampling.cpp +1 -0
  15. package/src/llama.cpp/ggml/CMakeLists.txt +7 -6
  16. package/src/llama.cpp/ggml/include/ggml-backend.h +4 -1
  17. package/src/llama.cpp/ggml/include/ggml-rpc.h +8 -9
  18. package/src/llama.cpp/ggml/include/ggml-zdnn.h +3 -0
  19. package/src/llama.cpp/ggml/include/ggml.h +22 -0
  20. package/src/llama.cpp/ggml/src/CMakeLists.txt +3 -0
  21. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +12 -2
  22. package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +1 -0
  23. package/src/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +12 -12
  24. package/src/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +100 -3
  25. package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +2 -2
  26. package/src/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +0 -1
  27. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +18 -3
  28. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +10 -0
  29. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +209 -96
  30. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +32 -44
  31. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +107 -83
  32. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +27 -19
  33. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +8 -8
  34. package/src/llama.cpp/ggml/src/ggml-cpu/spacemit/ime.cpp +1024 -0
  35. package/src/llama.cpp/ggml/src/ggml-cpu/spacemit/ime.h +13 -0
  36. package/src/llama.cpp/ggml/src/ggml-cpu/spacemit/ime1_kernels.cpp +3196 -0
  37. package/src/llama.cpp/ggml/src/ggml-cpu/spacemit/ime_kernels.h +26 -0
  38. package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.cpp +103 -0
  39. package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.h +1 -0
  40. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +66 -0
  41. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +6 -5
  42. package/src/llama.cpp/include/llama.h +23 -11
  43. package/src/llama.cpp/src/llama-arch.cpp +93 -0
  44. package/src/llama.cpp/src/llama-arch.h +22 -0
  45. package/src/llama.cpp/src/llama-chat.cpp +1 -1
  46. package/src/llama.cpp/src/llama-context.cpp +157 -0
  47. package/src/llama.cpp/src/llama-context.h +10 -0
  48. package/src/llama.cpp/src/llama-graph.cpp +57 -22
  49. package/src/llama.cpp/src/llama-graph.h +10 -1
  50. package/src/llama.cpp/src/llama-hparams.h +17 -2
  51. package/src/llama.cpp/src/llama-kv-cache-iswa.cpp +10 -2
  52. package/src/llama.cpp/src/llama-kv-cache-iswa.h +2 -0
  53. package/src/llama.cpp/src/llama-kv-cache.cpp +10 -5
  54. package/src/llama.cpp/src/llama-kv-cache.h +2 -0
  55. package/src/llama.cpp/src/llama-memory-hybrid.cpp +19 -9
  56. package/src/llama.cpp/src/llama-memory-hybrid.h +2 -0
  57. package/src/llama.cpp/src/llama-memory-recurrent.cpp +19 -3
  58. package/src/llama.cpp/src/llama-memory-recurrent.h +3 -0
  59. package/src/llama.cpp/src/llama-memory.h +3 -0
  60. package/src/llama.cpp/src/llama-model-loader.cpp +2 -0
  61. package/src/llama.cpp/src/llama-model.cpp +582 -45
  62. package/src/llama.cpp/src/llama-model.h +23 -1
  63. package/src/llama.cpp/src/llama-sampling.cpp +5 -0
  64. package/src/llama.cpp/src/llama-vocab.cpp +7 -1
  65. package/src/llama.cpp/src/llama-vocab.h +41 -40
  66. package/src/llama.cpp/src/unicode.h +43 -0
package/src/llama.cpp/common/arg.cpp +423 -186

@@ -24,6 +24,7 @@
  #include <cstdarg>
  #include <filesystem>
  #include <fstream>
+ #include <future>
  #include <list>
  #include <regex>
  #include <set>
@@ -31,12 +32,31 @@
  #include <thread>
  #include <vector>

- //#define LLAMA_USE_CURL
-
  #if defined(LLAMA_USE_CURL)
  #include <curl/curl.h>
  #include <curl/easy.h>
- #include <future>
+ #else
+ #include "http.h"
+ #endif
+
+ #ifdef __linux__
+ #include <linux/limits.h>
+ #elif defined(_WIN32)
+ #    if !defined(PATH_MAX)
+ #        define PATH_MAX MAX_PATH
+ #    endif
+ #elif defined(_AIX)
+ #include <sys/limits.h>
+ #else
+ #include <sys/syslimits.h>
+ #endif
+ #define LLAMA_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083
+
+ // isatty
+ #if defined(_WIN32)
+ #include <io.h>
+ #else
+ #include <unistd.h>
  #endif

  using json = nlohmann::ordered_json;
@@ -85,6 +105,14 @@ static void write_file(const std::string & fname, const std::string & content) {
      }
  }

+ static bool is_output_a_tty() {
+ #if defined(_WIN32)
+     return _isatty(_fileno(stdout));
+ #else
+     return isatty(1);
+ #endif
+ }
+
  common_arg & common_arg::set_examples(std::initializer_list<enum llama_example> examples) {
      this->examples = std::move(examples);
      return *this;
@@ -202,24 +230,54 @@ struct common_hf_file_res {
      std::string mmprojFile;
  };

- #ifdef LLAMA_USE_CURL
+ static void write_etag(const std::string & path, const std::string & etag) {
+     const std::string etag_path = path + ".etag";
+     write_file(etag_path, etag);
+     LOG_DBG("%s: file etag saved: %s\n", __func__, etag_path.c_str());
+ }

- bool common_has_curl() {
-     return true;
+ static std::string read_etag(const std::string & path) {
+     std::string none;
+     const std::string etag_path = path + ".etag";
+
+     if (std::filesystem::exists(etag_path)) {
+         std::ifstream etag_in(etag_path);
+         if (!etag_in) {
+             LOG_ERR("%s: could not open .etag file for reading: %s\n", __func__, etag_path.c_str());
+             return none;
+         }
+         std::string etag;
+         std::getline(etag_in, etag);
+         return etag;
+     }
+
+     // no etag file, but maybe there is an old .json
+     // remove this code later
+     const std::string metadata_path = path + ".json";
+
+     if (std::filesystem::exists(metadata_path)) {
+         std::ifstream metadata_in(metadata_path);
+         try {
+             nlohmann::json metadata_json;
+             metadata_in >> metadata_json;
+             LOG_DBG("%s: previous metadata file found %s: %s\n", __func__, metadata_path.c_str(),
+                     metadata_json.dump().c_str());
+             if (metadata_json.contains("etag") && metadata_json.at("etag").is_string()) {
+                 std::string etag = metadata_json.at("etag");
+                 write_etag(path, etag);
+                 if (!std::filesystem::remove(metadata_path)) {
+                     LOG_WRN("%s: failed to delete old .json metadata file: %s\n", __func__, metadata_path.c_str());
+                 }
+                 return etag;
+             }
+         } catch (const nlohmann::json::exception & e) {
+             LOG_ERR("%s: error reading metadata file %s: %s\n", __func__, metadata_path.c_str(), e.what());
+         }
+     }
+     return none;
  }

- #ifdef __linux__
- #include <linux/limits.h>
- #elif defined(_WIN32)
- #    if !defined(PATH_MAX)
- #        define PATH_MAX MAX_PATH
- #    endif
- #elif defined(_AIX)
- #include <sys/limits.h>
- #else
- #include <sys/syslimits.h>
- #endif
- #define LLAMA_CURL_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083
+ #ifdef LLAMA_USE_CURL

  //
  // CURL utils
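
Note on the new cache layout: the revalidation state for each downloaded artifact now lives in a one-line plain-text sidecar next to it (model.gguf plus model.gguf.etag), and read_etag transparently migrates the validator out of the legacy model.gguf.json metadata blob the first time it runs. A minimal standalone sketch of the same sidecar convention, with hypothetical helper names and simplified error handling:

    #include <fstream>
    #include <string>

    // "<path>.etag" holds a single line: the validator returned by the server.
    static void save_etag(const std::string & path, const std::string & etag) {
        std::ofstream(path + ".etag") << etag;
    }

    // An absent or unreadable sidecar yields "" and forces a fresh download.
    static std::string load_etag(const std::string & path) {
        std::ifstream in(path + ".etag");
        std::string etag;
        if (in) { std::getline(in, etag); }
        return etag;
    }

    // usage: save_etag("models/model.gguf", server_etag);
    //        later, compare load_etag("models/model.gguf") with the HEAD response's ETag.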
@@ -368,49 +426,19 @@ static bool common_download_head(CURL * curl,
  }

  // download one single file from remote URL to local path
- static bool common_download_file_single(const std::string & url,
-                                         const std::string & path,
-                                         const std::string & bearer_token,
-                                         bool offline) {
-     // If the file exists, check its JSON metadata companion file.
-     std::string metadata_path = path + ".json";
+ static bool common_download_file_single_online(const std::string & url,
+                                                const std::string & path,
+                                                const std::string & bearer_token) {
      static const int max_attempts = 3;
      static const int retry_delay_seconds = 2;
      for (int i = 0; i < max_attempts; ++i) {
-         nlohmann::json metadata; // TODO @ngxson : get rid of this json, use regex instead
-         std::string etag;
-         std::string last_modified;
+         std::string etag;

          // Check if the file already exists locally
          const auto file_exists = std::filesystem::exists(path);
          if (file_exists) {
-             if (offline) {
-                 LOG_INF("%s: using cached file (offline mode): %s\n", __func__, path.c_str());
-                 return true; // skip verification/downloading
-             }
-             // Try and read the JSON metadata file (note: stream autoclosed upon exiting this block).
-             std::ifstream metadata_in(metadata_path);
-             if (metadata_in.good()) {
-                 try {
-                     metadata_in >> metadata;
-                     LOG_DBG("%s: previous metadata file found %s: %s\n", __func__, metadata_path.c_str(),
-                             metadata.dump().c_str());
-                     if (metadata.contains("etag") && metadata.at("etag").is_string()) {
-                         etag = metadata.at("etag");
-                     }
-                     if (metadata.contains("lastModified") && metadata.at("lastModified").is_string()) {
-                         last_modified = metadata.at("lastModified");
-                     }
-                 } catch (const nlohmann::json::exception & e) {
-                     LOG_ERR("%s: error reading metadata file %s: %s\n", __func__, metadata_path.c_str(), e.what());
-                 }
-             }
-             // if we cannot open the metadata file, we assume that the downloaded file is not valid (etag and last-modified are left empty, so we will download it again)
+             etag = read_etag(path);
          } else {
-             if (offline) {
-                 LOG_ERR("%s: required file is not available in cache (offline mode): %s\n", __func__, path.c_str());
-                 return false;
-             }
              LOG_INF("%s: no previous model file found %s\n", __func__, path.c_str());
          }

@@ -447,11 +475,6 @@
                      headers.etag.c_str());
              should_download = true;
              should_download_from_scratch = true;
-         } else if (!last_modified.empty() && last_modified != headers.last_modified) {
-             LOG_WRN("%s: Last-Modified header is different (%s != %s): triggering a new download\n", __func__,
-                     last_modified.c_str(), headers.last_modified.c_str());
-             should_download = true;
-             should_download_from_scratch = true;
          }
      }

@@ -482,15 +505,9 @@
              }
          }
      }
-
-     // Write the updated JSON metadata file.
-     metadata.update({
-         { "url", url },
-         { "etag", headers.etag },
-         { "lastModified", headers.last_modified }
-     });
-     write_file(metadata_path, metadata.dump(4));
-     LOG_DBG("%s: file metadata saved: %s\n", __func__, metadata_path.c_str());
+     if (head_request_ok) {
+         write_etag(path, headers.etag);
+     }

      // start the download
      LOG_INF("%s: trying to download model from %s to %s (server_etag:%s, server_last_modified:%s)...\n",
@@ -530,6 +547,306 @@ static bool common_download_file_single(const std::string & url,
      return true;
  }

+ std::pair<long, std::vector<char>> common_remote_get_content(const std::string & url, const common_remote_params & params) {
+     curl_ptr curl(curl_easy_init(), &curl_easy_cleanup);
+     curl_slist_ptr http_headers;
+     std::vector<char> res_buffer;
+
+     curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str());
+     curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 1L);
+     curl_easy_setopt(curl.get(), CURLOPT_FOLLOWLOCATION, 1L);
+     curl_easy_setopt(curl.get(), CURLOPT_VERBOSE, 1L);
+     typedef size_t(*CURLOPT_WRITEFUNCTION_PTR)(void * ptr, size_t size, size_t nmemb, void * data);
+     auto write_callback = [](void * ptr, size_t size, size_t nmemb, void * data) -> size_t {
+         auto data_vec = static_cast<std::vector<char> *>(data);
+         data_vec->insert(data_vec->end(), (char *)ptr, (char *)ptr + size * nmemb);
+         return size * nmemb;
+     };
+     curl_easy_setopt(curl.get(), CURLOPT_WRITEFUNCTION, static_cast<CURLOPT_WRITEFUNCTION_PTR>(write_callback));
+     curl_easy_setopt(curl.get(), CURLOPT_WRITEDATA, &res_buffer);
+ #if defined(_WIN32)
+     curl_easy_setopt(curl.get(), CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA);
+ #endif
+     if (params.timeout > 0) {
+         curl_easy_setopt(curl.get(), CURLOPT_TIMEOUT, params.timeout);
+     }
+     if (params.max_size > 0) {
+         curl_easy_setopt(curl.get(), CURLOPT_MAXFILESIZE, params.max_size);
+     }
+     http_headers.ptr = curl_slist_append(http_headers.ptr, "User-Agent: llama-cpp");
+     for (const auto & header : params.headers) {
+         http_headers.ptr = curl_slist_append(http_headers.ptr, header.c_str());
+     }
+     curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.ptr);
+
+     CURLcode res = curl_easy_perform(curl.get());
+
+     if (res != CURLE_OK) {
+         std::string error_msg = curl_easy_strerror(res);
+         throw std::runtime_error("error: cannot make GET request: " + error_msg);
+     }
+
+     long res_code;
+     curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &res_code);
+
+     return { res_code, std::move(res_buffer) };
+ }
+
+ #else
+
+ static void print_progress(size_t current, size_t total) {
+     if (!is_output_a_tty()) {
+         return;
+     }
+
+     if (!total) {
+         return;
+     }
+
+     size_t width = 50;
+     size_t pct = (100 * current) / total;
+     size_t pos = (width * current) / total;
+
+     std::cout << "["
+               << std::string(pos, '=')
+               << (pos < width ? ">" : "")
+               << std::string(width - pos, ' ')
+               << "] " << std::setw(3) << pct << "% ("
+               << current / (1024 * 1024) << " MB / "
+               << total / (1024 * 1024) << " MB)\r";
+     std::cout.flush();
+ }
+
+ static bool common_pull_file(httplib::Client & cli,
+                              const std::string & resolve_path,
+                              const std::string & path_tmp,
+                              bool supports_ranges,
+                              size_t existing_size,
+                              size_t & total_size) {
+     std::ofstream ofs(path_tmp, std::ios::binary | std::ios::app);
+     if (!ofs.is_open()) {
+         LOG_ERR("%s: error opening local file for writing: %s\n", __func__, path_tmp.c_str());
+         return false;
+     }
+
+     httplib::Headers headers;
+     if (supports_ranges && existing_size > 0) {
+         headers.emplace("Range", "bytes=" + std::to_string(existing_size) + "-");
+     }
+
+     std::atomic<size_t> downloaded{existing_size};
+
+     auto res = cli.Get(resolve_path, headers,
+         [&](const httplib::Response &response) {
+             if (existing_size > 0 && response.status != 206) {
+                 LOG_WRN("%s: server did not respond with 206 Partial Content for a resume request. Status: %d\n", __func__, response.status);
+                 return false;
+             }
+             if (existing_size == 0 && response.status != 200) {
+                 LOG_WRN("%s: download received non-successful status code: %d\n", __func__, response.status);
+                 return false;
+             }
+             if (total_size == 0 && response.has_header("Content-Length")) {
+                 try {
+                     size_t content_length = std::stoull(response.get_header_value("Content-Length"));
+                     total_size = existing_size + content_length;
+                 } catch (const std::exception &e) {
+                     LOG_WRN("%s: invalid Content-Length header: %s\n", __func__, e.what());
+                 }
+             }
+             return true;
+         },
+         [&](const char *data, size_t len) {
+             ofs.write(data, len);
+             if (!ofs) {
+                 LOG_ERR("%s: error writing to file: %s\n", __func__, path_tmp.c_str());
+                 return false;
+             }
+             downloaded += len;
+             print_progress(downloaded, total_size);
+             return true;
+         },
+         nullptr
+     );
+
+     std::cout << "\n";
+
+     if (!res) {
+         LOG_ERR("%s: error during download. Status: %d\n", __func__, res ? res->status : -1);
+         return false;
+     }
+
+     return true;
+ }
+
+ // download one single file from remote URL to local path
+ static bool common_download_file_single_online(const std::string & url,
+                                                const std::string & path,
+                                                const std::string & bearer_token) {
+     static const int max_attempts = 3;
+     static const int retry_delay_seconds = 2;
+
+     auto [cli, parts] = common_http_client(url);
+
+     httplib::Headers default_headers = {{"User-Agent", "llama-cpp"}};
+     if (!bearer_token.empty()) {
+         default_headers.insert({"Authorization", "Bearer " + bearer_token});
+     }
+     cli.set_default_headers(default_headers);
+
+     const bool file_exists = std::filesystem::exists(path);
+
+     std::string last_etag;
+     if (file_exists) {
+         last_etag = read_etag(path);
+     } else {
+         LOG_INF("%s: no previous model file found %s\n", __func__, path.c_str());
+     }
+
+     for (int i = 0; i < max_attempts; ++i) {
+         auto head = cli.Head(parts.path);
+         bool head_ok = head && head->status >= 200 && head->status < 300;
+         if (!head_ok) {
+             LOG_WRN("%s: HEAD invalid http status code received: %d\n", __func__, head ? head->status : -1);
+             if (file_exists) {
+                 LOG_INF("%s: Using cached file (HEAD failed): %s\n", __func__, path.c_str());
+                 return true;
+             }
+         }
+
+         std::string etag;
+         if (head_ok && head->has_header("ETag")) {
+             etag = head->get_header_value("ETag");
+         }
+
+         size_t total_size = 0;
+         if (head_ok && head->has_header("Content-Length")) {
+             try {
+                 total_size = std::stoull(head->get_header_value("Content-Length"));
+             } catch (const std::exception& e) {
+                 LOG_WRN("%s: Invalid Content-Length in HEAD response: %s\n", __func__, e.what());
+             }
+         }
+
+         bool supports_ranges = false;
+         if (head_ok && head->has_header("Accept-Ranges")) {
+             supports_ranges = head->get_header_value("Accept-Ranges") != "none";
+         }
+
+         bool should_download_from_scratch = false;
+         if (!last_etag.empty() && !etag.empty() && last_etag != etag) {
+             LOG_WRN("%s: ETag header is different (%s != %s): triggering a new download\n", __func__,
+                     last_etag.c_str(), etag.c_str());
+             should_download_from_scratch = true;
+         }
+
+         if (file_exists) {
+             if (!should_download_from_scratch) {
+                 LOG_INF("%s: using cached file: %s\n", __func__, path.c_str());
+                 return true;
+             }
+             LOG_WRN("%s: deleting previous downloaded file: %s\n", __func__, path.c_str());
+             if (remove(path.c_str()) != 0) {
+                 LOG_ERR("%s: unable to delete file: %s\n", __func__, path.c_str());
+                 return false;
+             }
+         }
+
+         const std::string path_temporary = path + ".downloadInProgress";
+         size_t existing_size = 0;
+
+         if (std::filesystem::exists(path_temporary)) {
+             if (supports_ranges && !should_download_from_scratch) {
+                 existing_size = std::filesystem::file_size(path_temporary);
+             } else if (remove(path_temporary.c_str()) != 0) {
+                 LOG_ERR("%s: unable to delete file: %s\n", __func__, path_temporary.c_str());
+                 return false;
+             }
+         }
+
+         // start the download
+         LOG_INF("%s: trying to download model from %s to %s (etag:%s)...\n",
+                 __func__, common_http_show_masked_url(parts).c_str(), path_temporary.c_str(), etag.c_str());
+         const bool was_pull_successful = common_pull_file(cli, parts.path, path_temporary, supports_ranges, existing_size, total_size);
+         if (!was_pull_successful) {
+             if (i + 1 < max_attempts) {
+                 const int exponential_backoff_delay = std::pow(retry_delay_seconds, i) * 1000;
+                 LOG_WRN("%s: retrying after %d milliseconds...\n", __func__, exponential_backoff_delay);
+                 std::this_thread::sleep_for(std::chrono::milliseconds(exponential_backoff_delay));
+             } else {
+                 LOG_ERR("%s: download failed after %d attempts\n", __func__, max_attempts);
+             }
+             continue;
+         }
+
+         if (std::rename(path_temporary.c_str(), path.c_str()) != 0) {
+             LOG_ERR("%s: unable to rename file: %s to %s\n", __func__, path_temporary.c_str(), path.c_str());
+             return false;
+         }
+         if (!etag.empty()) {
+             write_etag(path, etag);
+         }
+         break;
+     }
+
+     return true;
+ }
+
+ std::pair<long, std::vector<char>> common_remote_get_content(const std::string & url,
+                                                              const common_remote_params & params) {
+     auto [cli, parts] = common_http_client(url);
+
+     httplib::Headers headers = {{"User-Agent", "llama-cpp"}};
+     for (const auto & header : params.headers) {
+         size_t pos = header.find(':');
+         if (pos != std::string::npos) {
+             headers.emplace(header.substr(0, pos), header.substr(pos + 1));
+         } else {
+             headers.emplace(header, "");
+         }
+     }
+
+     if (params.timeout > 0) {
+         cli.set_read_timeout(params.timeout, 0);
+         cli.set_write_timeout(params.timeout, 0);
+     }
+
+     std::vector<char> buf;
+     auto res = cli.Get(parts.path, headers,
+         [&](const char *data, size_t len) {
+             buf.insert(buf.end(), data, data + len);
+             return params.max_size == 0 ||
+                    buf.size() <= static_cast<size_t>(params.max_size);
+         },
+         nullptr
+     );
+
+     if (!res) {
+         throw std::runtime_error("error: cannot make GET request");
+     }
+
+     return { res->status, std::move(buf) };
+ }
+
+ #endif // LLAMA_USE_CURL
+
+ static bool common_download_file_single(const std::string & url,
+                                         const std::string & path,
+                                         const std::string & bearer_token,
+                                         bool offline) {
+     if (!offline) {
+         return common_download_file_single_online(url, path, bearer_token);
+     }
+
+     if (!std::filesystem::exists(path)) {
+         LOG_ERR("%s: required file is not available in cache (offline mode): %s\n", __func__, path.c_str());
+         return false;
+     }
+
+     LOG_INF("%s: using cached file (offline mode): %s\n", __func__, path.c_str());
+     return true;
+ }
+
  // download multiple files from remote URLs to local paths
  // the input is a vector of pairs <url, path>
  static bool common_download_file_multiple(const std::vector<std::pair<std::string, std::string>> & urls, const std::string & bearer_token, bool offline) {
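
With this split, common_remote_get_content has the same signature in both builds, so callers are agnostic to whether libcurl or the httplib-based client from common/http.h performs the transfer. A hedged usage sketch; the common_remote_params field names are read off the code above, and the declaring header is an assumption:

    #include "arg.h" // assumed to declare common_remote_get_content / common_remote_params

    static std::string fetch_text(const std::string & url) {
        common_remote_params params;
        params.headers  = { "Accept: application/json" }; // "Name: value" strings (split on ':' in the httplib path)
        params.timeout  = 30;                             // seconds; 0 keeps the client default
        params.max_size = 1024 * 1024;                    // cap the body at 1 MiB; 0 means unlimited
        auto [status, body] = common_remote_get_content(url, params);
        if (status != 200) {
            throw std::runtime_error("unexpected HTTP status: " + std::to_string(status));
        }
        return std::string(body.begin(), body.end());
    }

Note the two backends enforce max_size differently: the curl path sets CURLOPT_MAXFILESIZE, while the httplib path aborts the body callback once the buffer exceeds the limit, which surfaces as the thrown "cannot make GET request".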
@@ -588,7 +905,7 @@ static bool common_download_model(

      if (n_split > 1) {
          char split_prefix[PATH_MAX] = {0};
-         char split_url_prefix[LLAMA_CURL_MAX_URL_LENGTH] = {0};
+         char split_url_prefix[LLAMA_MAX_URL_LENGTH] = {0};

          // Verify the first split file format
          // and extract split URL and PATH prefixes
@@ -609,7 +926,7 @@
              char split_path[PATH_MAX] = {0};
              llama_split_path(split_path, sizeof(split_path), split_prefix, idx, n_split);

-             char split_url[LLAMA_CURL_MAX_URL_LENGTH] = {0};
+             char split_url[LLAMA_MAX_URL_LENGTH] = {0};
              llama_split_path(split_url, sizeof(split_url), split_url_prefix, idx, n_split);

              if (std::string(split_path) == model.path) {
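
Only the constant is renamed in these two hunks (LLAMA_CURL_MAX_URL_LENGTH → LLAMA_MAX_URL_LENGTH, reflecting that downloads no longer imply CURL); the sharding logic is unchanged. For reference, llama_split_path expands a prefix into the standard split-GGUF name, so with a hypothetical prefix "https://host/repo/model", idx = 1 and n_split = 4:

    char split_url[LLAMA_MAX_URL_LENGTH] = {0};
    llama_split_path(split_url, sizeof(split_url), split_url_prefix, idx, n_split);
    // split_url == "https://host/repo/model-00002-of-00004.gguf"
    // (indices in the file name are 1-based and zero-padded to five digits)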
@@ -626,50 +943,6 @@
      return true;
  }

- std::pair<long, std::vector<char>> common_remote_get_content(const std::string & url, const common_remote_params & params) {
-     curl_ptr curl(curl_easy_init(), &curl_easy_cleanup);
-     curl_slist_ptr http_headers;
-     std::vector<char> res_buffer;
-
-     curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str());
-     curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 1L);
-     curl_easy_setopt(curl.get(), CURLOPT_FOLLOWLOCATION, 1L);
-     typedef size_t(*CURLOPT_WRITEFUNCTION_PTR)(void * ptr, size_t size, size_t nmemb, void * data);
-     auto write_callback = [](void * ptr, size_t size, size_t nmemb, void * data) -> size_t {
-         auto data_vec = static_cast<std::vector<char> *>(data);
-         data_vec->insert(data_vec->end(), (char *)ptr, (char *)ptr + size * nmemb);
-         return size * nmemb;
-     };
-     curl_easy_setopt(curl.get(), CURLOPT_WRITEFUNCTION, static_cast<CURLOPT_WRITEFUNCTION_PTR>(write_callback));
-     curl_easy_setopt(curl.get(), CURLOPT_WRITEDATA, &res_buffer);
- #if defined(_WIN32)
-     curl_easy_setopt(curl.get(), CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA);
- #endif
-     if (params.timeout > 0) {
-         curl_easy_setopt(curl.get(), CURLOPT_TIMEOUT, params.timeout);
-     }
-     if (params.max_size > 0) {
-         curl_easy_setopt(curl.get(), CURLOPT_MAXFILESIZE, params.max_size);
-     }
-     http_headers.ptr = curl_slist_append(http_headers.ptr, "User-Agent: llama-cpp");
-     for (const auto & header : params.headers) {
-         http_headers.ptr = curl_slist_append(http_headers.ptr, header.c_str());
-     }
-     curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.ptr);
-
-     CURLcode res = curl_easy_perform(curl.get());
-
-     if (res != CURLE_OK) {
-         std::string error_msg = curl_easy_strerror(res);
-         throw std::runtime_error("error: cannot make GET request: " + error_msg);
-     }
-
-     long res_code;
-     curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &res_code);
-
-     return { res_code, std::move(res_buffer) };
- }
-
  /**
   * Allow getting the HF file from the HF repo with tag (like ollama), for example:
   * - bartowski/Llama-3.2-3B-Instruct-GGUF:q4
@@ -736,21 +1009,17 @@ static struct common_hf_file_res common_get_hf_file(const std::string & hf_repo_
      std::string mmprojFile;

      if (res_code == 200 || res_code == 304) {
-         // extract ggufFile.rfilename in json, using regex
-         {
-             std::regex pattern("\"ggufFile\"[\\s\\S]*?\"rfilename\"\\s*:\\s*\"([^\"]+)\"");
-             std::smatch match;
-             if (std::regex_search(res_str, match, pattern)) {
-                 ggufFile = match[1].str();
+         try {
+             auto j = json::parse(res_str);
+
+             if (j.contains("ggufFile") && j["ggufFile"].contains("rfilename")) {
+                 ggufFile = j["ggufFile"]["rfilename"].get<std::string>();
              }
-         }
-         // extract mmprojFile.rfilename in json, using regex
-         {
-             std::regex pattern("\"mmprojFile\"[\\s\\S]*?\"rfilename\"\\s*:\\s*\"([^\"]+)\"");
-             std::smatch match;
-             if (std::regex_search(res_str, match, pattern)) {
-                 mmprojFile = match[1].str();
+             if (j.contains("mmprojFile") && j["mmprojFile"].contains("rfilename")) {
+                 mmprojFile = j["mmprojFile"]["rfilename"].get<std::string>();
              }
+         } catch (const std::exception & e) {
+             throw std::runtime_error(std::string("error parsing manifest JSON: ") + e.what());
          }
          if (!use_cache) {
              // if not using cached response, update the cache file
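
The regex scrape of the manifest response is replaced with real JSON parsing. For reference, the shape this code consumes (abridged; the field set is inferred only from the keys accessed above) and an equivalent defensive read:

    // manifest excerpt:
    //   { "ggufFile":   { "rfilename": "model-Q4_K_M.gguf" },
    //     "mmprojFile": { "rfilename": "mmproj-model-f16.gguf" } }
    auto j = json::parse(res_str);
    std::string gguf   = j.value("ggufFile",   json::object()).value("rfilename", "");
    std::string mmproj = j.value("mmprojFile", json::object()).value("rfilename", "");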
@@ -770,45 +1039,6 @@
      return { hf_repo, ggufFile, mmprojFile };
  }

- #else
-
- bool common_has_curl() {
-     return false;
- }
-
- static bool common_download_file_single(const std::string &, const std::string &, const std::string &, bool) {
-     LOG_ERR("error: built without CURL, cannot download model from internet\n");
-     return false;
- }
-
- static bool common_download_file_multiple(const std::vector<std::pair<std::string, std::string>> &, const std::string &, bool) {
-     LOG_ERR("error: built without CURL, cannot download model from the internet\n");
-     return false;
- }
-
- static bool common_download_model(
-         const common_params_model &,
-         const std::string &,
-         bool) {
-     LOG_ERR("error: built without CURL, cannot download model from the internet\n");
-     return false;
- }
-
- static struct common_hf_file_res common_get_hf_file(const std::string &, const std::string &, bool) {
-     LOG_ERR("error: built without CURL, cannot download model from the internet\n");
-     return {};
- }
-
- std::pair<long, std::vector<char>> common_remote_get_content(const std::string & url, const common_remote_params &) {
-     if (!url.empty()) {
-         throw std::runtime_error("error: built without CURL, cannot download model from the internet");
-     }
-
-     return {};
- }
-
- #endif // LLAMA_USE_CURL
-

  //
  // Docker registry functions
@@ -1068,8 +1298,6 @@ static std::string get_all_kv_cache_types() {
  //

  static bool common_params_parse_ex(int argc, char ** argv, common_params_context & ctx_arg) {
-     std::string arg;
-     const std::string arg_prefix = "--";
      common_params & params = ctx_arg.params;

      std::unordered_map<std::string, common_arg *> arg_to_options;
@@ -1387,18 +1615,14 @@ static void add_rpc_devices(const std::string & servers) {
      if (!rpc_reg) {
          throw std::invalid_argument("failed to find RPC backend");
      }
-     typedef ggml_backend_dev_t (*ggml_backend_rpc_add_device_t)(const char * endpoint);
-     ggml_backend_rpc_add_device_t ggml_backend_rpc_add_device_fn = (ggml_backend_rpc_add_device_t) ggml_backend_reg_get_proc_address(rpc_reg, "ggml_backend_rpc_add_device");
-     if (!ggml_backend_rpc_add_device_fn) {
-         throw std::invalid_argument("failed to find RPC device add function");
+     typedef ggml_backend_reg_t (*ggml_backend_rpc_add_server_t)(const char * endpoint);
+     ggml_backend_rpc_add_server_t ggml_backend_rpc_add_server_fn = (ggml_backend_rpc_add_server_t) ggml_backend_reg_get_proc_address(rpc_reg, "ggml_backend_rpc_add_server");
+     if (!ggml_backend_rpc_add_server_fn) {
+         throw std::invalid_argument("failed to find RPC add server function");
      }
      for (const auto & server : rpc_servers) {
-         ggml_backend_dev_t dev = ggml_backend_rpc_add_device_fn(server.c_str());
-         if (dev) {
-             ggml_backend_device_register(dev);
-         } else {
-             throw std::invalid_argument("failed to register RPC device");
-         }
+         auto reg = ggml_backend_rpc_add_server_fn(server.c_str());
+         ggml_backend_register(reg);
      }
  }

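The registration granularity changes here: instead of one device per endpoint (ggml_backend_rpc_add_device + ggml_backend_device_register), each RPC server now contributes a whole backend registration whose devices are enumerated from it. A condensed sketch of the lookup-and-register pattern, assuming the registry API used above and the "host:port" endpoint format:

    #include <stdexcept>
    #include "ggml-backend.h"

    static void register_rpc_server(const char * endpoint) { // e.g. "192.168.1.10:50052"
        ggml_backend_reg_t rpc_reg = ggml_backend_reg_by_name("RPC");
        if (!rpc_reg) {
            throw std::invalid_argument("failed to find RPC backend");
        }
        // resolve the backend-specific entry point through the generic registry
        typedef ggml_backend_reg_t (*add_server_t)(const char * endpoint);
        auto add_server = (add_server_t) ggml_backend_reg_get_proc_address(rpc_reg, "ggml_backend_rpc_add_server");
        if (!add_server) {
            throw std::invalid_argument("failed to find RPC add server function");
        }
        ggml_backend_register(add_server(endpoint)); // one registration per server
    }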
@@ -1704,13 +1928,21 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
          }
      ).set_env("LLAMA_ARG_SWA_FULL"));
      add_opt(common_arg(
-         {"--swa-checkpoints"}, "N",
-         string_format("max number of SWA checkpoints per slot to create (default: %d)\n"
-             "[(more info)](https://github.com/ggml-org/llama.cpp/pull/15293)", params.n_swa_checkpoints),
+         {"--ctx-checkpoints", "--swa-checkpoints"}, "N",
+         string_format("max number of context checkpoints to create per slot (default: %d)\n"
+             "[(more info)](https://github.com/ggml-org/llama.cpp/pull/15293)", params.n_ctx_checkpoints),
          [](common_params & params, int value) {
-             params.n_swa_checkpoints = value;
+             params.n_ctx_checkpoints = value;
          }
-     ).set_env("LLAMA_ARG_SWA_CHECKPOINTS").set_examples({LLAMA_EXAMPLE_SERVER}));
+     ).set_env("LLAMA_ARG_CTX_CHECKPOINTS").set_examples({LLAMA_EXAMPLE_SERVER}));
+     add_opt(common_arg(
+         {"--cache-ram", "-cram"}, "N",
+         string_format("set the maximum cache size in MiB (default: %d, -1 - no limit, 0 - disable)\n"
+             "[(more info)](https://github.com/ggml-org/llama.cpp/pull/16391)", params.cache_ram_mib),
+         [](common_params & params, int value) {
+             params.cache_ram_mib = value;
+         }
+     ).set_env("LLAMA_ARG_CACHE_RAM").set_examples({LLAMA_EXAMPLE_SERVER}));
      add_opt(common_arg(
          {"--kv-unified", "-kvu"},
          string_format("use single unified KV buffer for the KV cache of all sequences (default: %s)\n"
@@ -2360,6 +2592,13 @@
              params.no_extra_bufts = true;
          }
      ).set_env("LLAMA_ARG_NO_REPACK"));
+     add_opt(common_arg(
+         {"--no-host"},
+         "bypass host buffer allowing extra buffers to be used",
+         [](common_params & params) {
+             params.no_host = true;
+         }
+     ).set_env("LLAMA_ARG_NO_HOST"));
      add_opt(common_arg(
          {"-ctk", "--cache-type-k"}, "TYPE",
          string_format(
@@ -3201,7 +3440,8 @@
          {"--reasoning-format"}, "FORMAT",
          "controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:\n"
          "- none: leaves thoughts unparsed in `message.content`\n"
-         "- deepseek: puts thoughts in `message.reasoning_content` (except in streaming mode, which behaves as `none`)\n"
+         "- deepseek: puts thoughts in `message.reasoning_content`\n"
+         "- deepseek-legacy: keeps `<think>` tags in `message.content` while also populating `message.reasoning_content`\n"
          "(default: auto)",
          [](common_params & params, const std::string & value) {
              params.reasoning_format = common_reasoning_format_from_name(value);
@@ -3628,7 +3868,6 @@
          [](common_params & params) {
              params.model.hf_repo = "ggml-org/bge-small-en-v1.5-Q8_0-GGUF";
              params.model.hf_file = "bge-small-en-v1.5-q8_0.gguf";
-             params.pooling_type = LLAMA_POOLING_TYPE_NONE;
              params.embd_normalize = 2;
              params.n_ctx = 512;
              params.verbose_prompt = true;
@@ -3642,7 +3881,6 @@
          [](common_params & params) {
              params.model.hf_repo = "ggml-org/e5-small-v2-Q8_0-GGUF";
              params.model.hf_file = "e5-small-v2-q8_0.gguf";
-             params.pooling_type = LLAMA_POOLING_TYPE_NONE;
              params.embd_normalize = 2;
              params.n_ctx = 512;
              params.verbose_prompt = true;
@@ -3656,7 +3894,6 @@
          [](common_params & params) {
              params.model.hf_repo = "ggml-org/gte-small-Q8_0-GGUF";
              params.model.hf_file = "gte-small-q8_0.gguf";
-             params.pooling_type = LLAMA_POOLING_TYPE_NONE;
              params.embd_normalize = 2;
              params.n_ctx = 512;
              params.verbose_prompt = true;