@fugood/llama.node 1.3.0 → 1.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (143):
  1. package/package.json +14 -14
  2. package/scripts/llama.cpp.patch +8 -8
  3. package/src/llama.cpp/common/CMakeLists.txt +2 -0
  4. package/src/llama.cpp/common/arg.cpp +44 -999
  5. package/src/llama.cpp/common/arg.h +2 -2
  6. package/src/llama.cpp/common/chat.cpp +17 -2
  7. package/src/llama.cpp/common/common.cpp +33 -0
  8. package/src/llama.cpp/common/common.h +15 -1
  9. package/src/llama.cpp/common/download.cpp +1054 -0
  10. package/src/llama.cpp/common/download.h +55 -0
  11. package/src/llama.cpp/ggml/CMakeLists.txt +1 -1
  12. package/src/llama.cpp/ggml/include/ggml.h +2 -0
  13. package/src/llama.cpp/ggml/src/CMakeLists.txt +6 -3
  14. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +29 -11
  15. package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +428 -26
  16. package/src/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +4 -5
  17. package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +108 -49
  18. package/src/llama.cpp/ggml/src/ggml-cpu/arch/s390/cpu-feats.cpp +50 -0
  19. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +3 -1
  20. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +21 -21
  21. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +172 -75
  22. package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +0 -4
  23. package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +82 -21
  24. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +25 -25
  25. package/src/llama.cpp/include/llama.h +7 -3
  26. package/src/llama.cpp/src/CMakeLists.txt +95 -0
  27. package/src/llama.cpp/src/llama-arch.cpp +108 -0
  28. package/src/llama.cpp/src/llama-arch.h +11 -0
  29. package/src/llama.cpp/src/llama-batch.cpp +63 -31
  30. package/src/llama.cpp/src/llama-batch.h +12 -1
  31. package/src/llama.cpp/src/llama-chat.cpp +32 -0
  32. package/src/llama.cpp/src/llama-chat.h +1 -0
  33. package/src/llama.cpp/src/llama-context.cpp +36 -13
  34. package/src/llama.cpp/src/llama-context.h +5 -5
  35. package/src/llama.cpp/src/llama-cparams.h +1 -0
  36. package/src/llama.cpp/src/llama-graph.cpp +3 -3
  37. package/src/llama.cpp/src/llama-hparams.cpp +11 -1
  38. package/src/llama.cpp/src/llama-hparams.h +6 -0
  39. package/src/llama.cpp/src/llama-kv-cache-iswa.cpp +3 -1
  40. package/src/llama.cpp/src/llama-kv-cache.cpp +33 -1
  41. package/src/llama.cpp/src/llama-kv-cells.h +44 -2
  42. package/src/llama.cpp/src/llama-memory-recurrent.cpp +4 -3
  43. package/src/llama.cpp/src/llama-model.cpp +320 -13171
  44. package/src/llama.cpp/src/llama-model.h +8 -0
  45. package/src/llama.cpp/src/llama-quant.cpp +1 -1
  46. package/src/llama.cpp/src/llama-vocab.cpp +5 -0
  47. package/src/llama.cpp/src/llama-vocab.h +1 -0
  48. package/src/llama.cpp/src/models/apertus.cpp +125 -0
  49. package/src/llama.cpp/src/models/arcee.cpp +135 -0
  50. package/src/llama.cpp/src/models/arctic.cpp +138 -0
  51. package/src/llama.cpp/src/models/arwkv7.cpp +86 -0
  52. package/src/llama.cpp/src/models/baichuan.cpp +122 -0
  53. package/src/llama.cpp/src/models/bailingmoe.cpp +144 -0
  54. package/src/llama.cpp/src/models/bailingmoe2.cpp +135 -0
  55. package/src/llama.cpp/src/models/bert.cpp +176 -0
  56. package/src/llama.cpp/src/models/bitnet.cpp +160 -0
  57. package/src/llama.cpp/src/models/bloom.cpp +101 -0
  58. package/src/llama.cpp/src/models/chameleon.cpp +178 -0
  59. package/src/llama.cpp/src/models/chatglm.cpp +132 -0
  60. package/src/llama.cpp/src/models/codeshell.cpp +111 -0
  61. package/src/llama.cpp/src/models/cogvlm.cpp +100 -0
  62. package/src/llama.cpp/src/models/cohere2-iswa.cpp +131 -0
  63. package/src/llama.cpp/src/models/command-r.cpp +122 -0
  64. package/src/llama.cpp/src/models/dbrx.cpp +123 -0
  65. package/src/llama.cpp/src/models/deci.cpp +135 -0
  66. package/src/llama.cpp/src/models/deepseek.cpp +144 -0
  67. package/src/llama.cpp/src/models/deepseek2.cpp +236 -0
  68. package/src/llama.cpp/src/models/dots1.cpp +134 -0
  69. package/src/llama.cpp/src/models/dream.cpp +105 -0
  70. package/src/llama.cpp/src/models/ernie4-5-moe.cpp +150 -0
  71. package/src/llama.cpp/src/models/ernie4-5.cpp +110 -0
  72. package/src/llama.cpp/src/models/exaone.cpp +114 -0
  73. package/src/llama.cpp/src/models/exaone4.cpp +123 -0
  74. package/src/llama.cpp/src/models/falcon-h1.cpp +113 -0
  75. package/src/llama.cpp/src/models/falcon.cpp +120 -0
  76. package/src/llama.cpp/src/models/gemma-embedding.cpp +120 -0
  77. package/src/llama.cpp/src/models/gemma.cpp +112 -0
  78. package/src/llama.cpp/src/models/gemma2-iswa.cpp +125 -0
  79. package/src/llama.cpp/src/models/gemma3-iswa.cpp +131 -0
  80. package/src/llama.cpp/src/models/gemma3n-iswa.cpp +377 -0
  81. package/src/llama.cpp/src/models/glm4-moe.cpp +153 -0
  82. package/src/llama.cpp/src/models/glm4.cpp +127 -0
  83. package/src/llama.cpp/src/models/gpt2.cpp +105 -0
  84. package/src/llama.cpp/src/models/gptneox.cpp +144 -0
  85. package/src/llama.cpp/src/models/granite-hybrid.cpp +196 -0
  86. package/src/llama.cpp/src/models/granite.cpp +211 -0
  87. package/src/llama.cpp/src/models/graph-context-mamba.cpp +283 -0
  88. package/src/llama.cpp/src/models/grok.cpp +159 -0
  89. package/src/llama.cpp/src/models/grovemoe.cpp +141 -0
  90. package/src/llama.cpp/src/models/hunyuan-dense.cpp +132 -0
  91. package/src/llama.cpp/src/models/hunyuan-moe.cpp +154 -0
  92. package/src/llama.cpp/src/models/internlm2.cpp +120 -0
  93. package/src/llama.cpp/src/models/jais.cpp +86 -0
  94. package/src/llama.cpp/src/models/jamba.cpp +106 -0
  95. package/src/llama.cpp/src/models/lfm2.cpp +173 -0
  96. package/src/llama.cpp/src/models/llada-moe.cpp +122 -0
  97. package/src/llama.cpp/src/models/llada.cpp +99 -0
  98. package/src/llama.cpp/src/models/llama-iswa.cpp +174 -0
  99. package/src/llama.cpp/src/models/llama.cpp +155 -0
  100. package/src/llama.cpp/src/models/mamba.cpp +55 -0
  101. package/src/llama.cpp/src/models/minicpm3.cpp +199 -0
  102. package/src/llama.cpp/src/models/minimax-m2.cpp +124 -0
  103. package/src/llama.cpp/src/models/models.h +481 -0
  104. package/src/llama.cpp/src/models/mpt.cpp +126 -0
  105. package/src/llama.cpp/src/models/nemotron-h.cpp +121 -0
  106. package/src/llama.cpp/src/models/nemotron.cpp +122 -0
  107. package/src/llama.cpp/src/models/neo-bert.cpp +104 -0
  108. package/src/llama.cpp/src/models/olmo.cpp +121 -0
  109. package/src/llama.cpp/src/models/olmo2.cpp +150 -0
  110. package/src/llama.cpp/src/models/olmoe.cpp +124 -0
  111. package/src/llama.cpp/src/models/openai-moe-iswa.cpp +124 -0
  112. package/src/llama.cpp/src/models/openelm.cpp +124 -0
  113. package/src/llama.cpp/src/models/orion.cpp +123 -0
  114. package/src/llama.cpp/src/models/pangu-embedded.cpp +121 -0
  115. package/src/llama.cpp/src/models/phi2.cpp +121 -0
  116. package/src/llama.cpp/src/models/phi3.cpp +152 -0
  117. package/src/llama.cpp/src/models/plamo.cpp +110 -0
  118. package/src/llama.cpp/src/models/plamo2.cpp +316 -0
  119. package/src/llama.cpp/src/models/plm.cpp +168 -0
  120. package/src/llama.cpp/src/models/qwen.cpp +108 -0
  121. package/src/llama.cpp/src/models/qwen2.cpp +117 -0
  122. package/src/llama.cpp/src/models/qwen2moe.cpp +151 -0
  123. package/src/llama.cpp/src/models/qwen2vl.cpp +117 -0
  124. package/src/llama.cpp/src/models/qwen3.cpp +117 -0
  125. package/src/llama.cpp/src/models/qwen3moe.cpp +124 -0
  126. package/src/llama.cpp/src/models/qwen3vl-moe.cpp +149 -0
  127. package/src/llama.cpp/src/models/qwen3vl.cpp +141 -0
  128. package/src/llama.cpp/src/models/refact.cpp +94 -0
  129. package/src/llama.cpp/src/models/rwkv6-base.cpp +162 -0
  130. package/src/llama.cpp/src/models/rwkv6.cpp +94 -0
  131. package/src/llama.cpp/src/models/rwkv6qwen2.cpp +86 -0
  132. package/src/llama.cpp/src/models/rwkv7-base.cpp +135 -0
  133. package/src/llama.cpp/src/models/rwkv7.cpp +90 -0
  134. package/src/llama.cpp/src/models/seed-oss.cpp +124 -0
  135. package/src/llama.cpp/src/models/smallthinker.cpp +120 -0
  136. package/src/llama.cpp/src/models/smollm3.cpp +128 -0
  137. package/src/llama.cpp/src/models/stablelm.cpp +146 -0
  138. package/src/llama.cpp/src/models/starcoder.cpp +100 -0
  139. package/src/llama.cpp/src/models/starcoder2.cpp +121 -0
  140. package/src/llama.cpp/src/models/t5-dec.cpp +166 -0
  141. package/src/llama.cpp/src/models/t5-enc.cpp +96 -0
  142. package/src/llama.cpp/src/models/wavtokenizer-dec.cpp +149 -0
  143. package/src/llama.cpp/src/models/xverse.cpp +108 -0
@@ -2,10 +2,10 @@
2
2
 
3
3
  #include "chat.h"
4
4
  #include "common.h"
5
- #include "gguf.h" // for reading GGUF splits
6
5
  #include "json-schema-to-grammar.h"
7
6
  #include "log.h"
8
7
  #include "sampling.h"
8
+ #include "download.h"
9
9
 
10
10
  // fix problem with std::min and std::max
11
11
  #if defined(_WIN32)
@@ -22,23 +22,14 @@
22
22
  #include <algorithm>
23
23
  #include <climits>
24
24
  #include <cstdarg>
25
- #include <filesystem>
26
25
  #include <fstream>
27
- #include <future>
28
26
  #include <list>
29
27
  #include <regex>
30
28
  #include <set>
31
29
  #include <string>
32
- #include <thread>
30
+ #include <thread> // for hardware_concurrency
33
31
  #include <vector>
34
32
 
35
- #if defined(LLAMA_USE_CURL)
36
- #include <curl/curl.h>
37
- #include <curl/easy.h>
38
- #else
39
- #include "http.h"
40
- #endif
41
-
42
33
  #ifdef __linux__
43
34
  #include <linux/limits.h>
44
35
  #elif defined(_WIN32)
@@ -52,16 +43,9 @@
52
43
  #endif
53
44
  #define LLAMA_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083
54
45
 
55
- // isatty
56
- #if defined(_WIN32)
57
- #include <io.h>
58
- #else
59
- #include <unistd.h>
60
- #endif
61
-
62
46
  using json = nlohmann::ordered_json;
63
47
 
64
- std::initializer_list<enum llama_example> mmproj_examples = {
48
+ static std::initializer_list<enum llama_example> mmproj_examples = {
65
49
  LLAMA_EXAMPLE_MTMD,
66
50
  LLAMA_EXAMPLE_SERVER,
67
51
  };
@@ -76,50 +60,13 @@ static std::string read_file(const std::string & fname) {
76
60
  return content;
77
61
  }
78
62
 
79
- static void write_file(const std::string & fname, const std::string & content) {
80
- const std::string fname_tmp = fname + ".tmp";
81
- std::ofstream file(fname_tmp);
82
- if (!file) {
83
- throw std::runtime_error(string_format("error: failed to open file '%s'\n", fname.c_str()));
84
- }
85
-
86
- try {
87
- file << content;
88
- file.close();
89
-
90
- // Makes write atomic
91
- if (rename(fname_tmp.c_str(), fname.c_str()) != 0) {
92
- LOG_ERR("%s: unable to rename file: %s to %s\n", __func__, fname_tmp.c_str(), fname.c_str());
93
- // If rename fails, try to delete the temporary file
94
- if (remove(fname_tmp.c_str()) != 0) {
95
- LOG_ERR("%s: unable to delete temporary file: %s\n", __func__, fname_tmp.c_str());
96
- }
97
- }
98
- } catch (...) {
99
- // If anything fails, try to delete the temporary file
100
- if (remove(fname_tmp.c_str()) != 0) {
101
- LOG_ERR("%s: unable to delete temporary file: %s\n", __func__, fname_tmp.c_str());
102
- }
103
-
104
- throw std::runtime_error(string_format("error: failed to write file '%s'\n", fname.c_str()));
105
- }
106
- }
107
-
108
- static bool is_output_a_tty() {
109
- #if defined(_WIN32)
110
- return _isatty(_fileno(stdout));
111
- #else
112
- return isatty(1);
113
- #endif
114
- }
115
-
116
63
  common_arg & common_arg::set_examples(std::initializer_list<enum llama_example> examples) {
117
- this->examples = std::move(examples);
64
+ this->examples = examples;
118
65
  return *this;
119
66
  }
120
67
 
121
68
  common_arg & common_arg::set_excludes(std::initializer_list<enum llama_example> excludes) {
122
- this->excludes = std::move(excludes);
69
+ this->excludes = excludes;
123
70
  return *this;
124
71
  }
125
72
 
@@ -142,7 +89,7 @@ bool common_arg::is_exclude(enum llama_example ex) {
142
89
  return excludes.find(ex) != excludes.end();
143
90
  }
144
91
 
145
- bool common_arg::get_value_from_env(std::string & output) {
92
+ bool common_arg::get_value_from_env(std::string & output) const {
146
93
  if (env == nullptr) return false;
147
94
  char * value = std::getenv(env);
148
95
  if (value) {
@@ -152,7 +99,7 @@ bool common_arg::get_value_from_env(std::string & output) {
152
99
  return false;
153
100
  }
154
101
 
155
- bool common_arg::has_value_from_env() {
102
+ bool common_arg::has_value_from_env() const {
156
103
  return env != nullptr && std::getenv(env);
157
104
  }
158
105
 
@@ -220,943 +167,6 @@ std::string common_arg::to_string() {
220
167
  return ss.str();
221
168
  }
222
169
 
223
- //
224
- // downloader
225
- //
226
-
227
- struct common_hf_file_res {
228
- std::string repo; // repo name with ":tag" removed
229
- std::string ggufFile;
230
- std::string mmprojFile;
231
- };
232
-
233
- static void write_etag(const std::string & path, const std::string & etag) {
234
- const std::string etag_path = path + ".etag";
235
- write_file(etag_path, etag);
236
- LOG_DBG("%s: file etag saved: %s\n", __func__, etag_path.c_str());
237
- }
238
-
239
- static std::string read_etag(const std::string & path) {
240
- std::string none;
241
- const std::string etag_path = path + ".etag";
242
-
243
- if (std::filesystem::exists(etag_path)) {
244
- std::ifstream etag_in(etag_path);
245
- if (!etag_in) {
246
- LOG_ERR("%s: could not open .etag file for reading: %s\n", __func__, etag_path.c_str());
247
- return none;
248
- }
249
- std::string etag;
250
- std::getline(etag_in, etag);
251
- return etag;
252
- }
253
-
254
- // no etag file, but maybe there is an old .json
255
- // remove this code later
256
- const std::string metadata_path = path + ".json";
257
-
258
- if (std::filesystem::exists(metadata_path)) {
259
- std::ifstream metadata_in(metadata_path);
260
- try {
261
- nlohmann::json metadata_json;
262
- metadata_in >> metadata_json;
263
- LOG_DBG("%s: previous metadata file found %s: %s\n", __func__, metadata_path.c_str(),
264
- metadata_json.dump().c_str());
265
- if (metadata_json.contains("etag") && metadata_json.at("etag").is_string()) {
266
- std::string etag = metadata_json.at("etag");
267
- write_etag(path, etag);
268
- if (!std::filesystem::remove(metadata_path)) {
269
- LOG_WRN("%s: failed to delete old .json metadata file: %s\n", __func__, metadata_path.c_str());
270
- }
271
- return etag;
272
- }
273
- } catch (const nlohmann::json::exception & e) {
274
- LOG_ERR("%s: error reading metadata file %s: %s\n", __func__, metadata_path.c_str(), e.what());
275
- }
276
- }
277
- return none;
278
- }
279
-
280
- #ifdef LLAMA_USE_CURL
281
-
282
- //
283
- // CURL utils
284
- //
285
-
286
- using curl_ptr = std::unique_ptr<CURL, decltype(&curl_easy_cleanup)>;
287
-
288
- // cannot use unique_ptr for curl_slist, because we cannot update without destroying the old one
289
- struct curl_slist_ptr {
290
- struct curl_slist * ptr = nullptr;
291
- ~curl_slist_ptr() {
292
- if (ptr) {
293
- curl_slist_free_all(ptr);
294
- }
295
- }
296
- };
297
-
298
- static CURLcode common_curl_perf(CURL * curl) {
299
- CURLcode res = curl_easy_perform(curl);
300
- if (res != CURLE_OK) {
301
- LOG_ERR("%s: curl_easy_perform() failed\n", __func__);
302
- }
303
-
304
- return res;
305
- }
306
-
307
- // Send a HEAD request to retrieve the etag and last-modified headers
308
- struct common_load_model_from_url_headers {
309
- std::string etag;
310
- std::string last_modified;
311
- std::string accept_ranges;
312
- };
313
-
314
- struct FILE_deleter {
315
- void operator()(FILE * f) const { fclose(f); }
316
- };
317
-
318
- static size_t common_header_callback(char * buffer, size_t, size_t n_items, void * userdata) {
319
- common_load_model_from_url_headers * headers = (common_load_model_from_url_headers *) userdata;
320
- static std::regex header_regex("([^:]+): (.*)\r\n");
321
- static std::regex etag_regex("ETag", std::regex_constants::icase);
322
- static std::regex last_modified_regex("Last-Modified", std::regex_constants::icase);
323
- static std::regex accept_ranges_regex("Accept-Ranges", std::regex_constants::icase);
324
- std::string header(buffer, n_items);
325
- std::smatch match;
326
- if (std::regex_match(header, match, header_regex)) {
327
- const std::string & key = match[1];
328
- const std::string & value = match[2];
329
- if (std::regex_match(key, match, etag_regex)) {
330
- headers->etag = value;
331
- } else if (std::regex_match(key, match, last_modified_regex)) {
332
- headers->last_modified = value;
333
- } else if (std::regex_match(key, match, accept_ranges_regex)) {
334
- headers->accept_ranges = value;
335
- }
336
- }
337
-
338
- return n_items;
339
- }
340
-
341
- static size_t common_write_callback(void * data, size_t size, size_t nmemb, void * fd) {
342
- return std::fwrite(data, size, nmemb, static_cast<FILE *>(fd));
343
- }
344
-
345
- // helper function to hide password in URL
346
- static std::string llama_download_hide_password_in_url(const std::string & url) {
347
- // Use regex to match and replace the user[:password]@ pattern in URLs
348
- // Pattern: scheme://[user[:password]@]host[...]
349
- static const std::regex url_regex(R"(^(?:[A-Za-z][A-Za-z0-9+.-]://)(?:[^/@]+@)?.$)");
350
- std::smatch match;
351
-
352
- if (std::regex_match(url, match, url_regex)) {
353
- // match[1] = scheme (e.g., "https://")
354
- // match[2] = user[:password]@ part
355
- // match[3] = rest of URL (host and path)
356
- return match[1].str() + "********@" + match[3].str();
357
- }
358
-
359
- return url; // No credentials found or malformed URL
360
- }
361
-
362
- static void common_curl_easy_setopt_head(CURL * curl, const std::string & url) {
363
- // Set the URL, allow to follow http redirection
364
- curl_easy_setopt(curl, CURLOPT_URL, url.c_str());
365
- curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);
366
-
367
- # if defined(_WIN32)
368
- // CURLSSLOPT_NATIVE_CA tells libcurl to use standard certificate store of
369
- // operating system. Currently implemented under MS-Windows.
370
- curl_easy_setopt(curl, CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA);
371
- # endif
372
-
373
- curl_easy_setopt(curl, CURLOPT_NOBODY, 1L); // will trigger the HEAD verb
374
- curl_easy_setopt(curl, CURLOPT_NOPROGRESS, 1L); // hide head request progress
375
- curl_easy_setopt(curl, CURLOPT_HEADERFUNCTION, common_header_callback);
376
- }
377
-
378
- static void common_curl_easy_setopt_get(CURL * curl) {
379
- curl_easy_setopt(curl, CURLOPT_NOBODY, 0L);
380
- curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, common_write_callback);
381
-
382
- // display download progress
383
- curl_easy_setopt(curl, CURLOPT_NOPROGRESS, 0L);
384
- }
385
-
386
- static bool common_pull_file(CURL * curl, const std::string & path_temporary) {
387
- if (std::filesystem::exists(path_temporary)) {
388
- const std::string partial_size = std::to_string(std::filesystem::file_size(path_temporary));
389
- LOG_INF("%s: server supports range requests, resuming download from byte %s\n", __func__, partial_size.c_str());
390
- const std::string range_str = partial_size + "-";
391
- curl_easy_setopt(curl, CURLOPT_RANGE, range_str.c_str());
392
- }
393
-
394
- // Always open file in append mode could be resuming
395
- std::unique_ptr<FILE, FILE_deleter> outfile(fopen(path_temporary.c_str(), "ab"));
396
- if (!outfile) {
397
- LOG_ERR("%s: error opening local file for writing: %s\n", __func__, path_temporary.c_str());
398
- return false;
399
- }
400
-
401
- common_curl_easy_setopt_get(curl);
402
- curl_easy_setopt(curl, CURLOPT_WRITEDATA, outfile.get());
403
-
404
- return common_curl_perf(curl) == CURLE_OK;
405
- }
406
-
407
- static bool common_download_head(CURL * curl,
408
- curl_slist_ptr & http_headers,
409
- const std::string & url,
410
- const std::string & bearer_token) {
411
- if (!curl) {
412
- LOG_ERR("%s: error initializing libcurl\n", __func__);
413
- return false;
414
- }
415
-
416
- http_headers.ptr = curl_slist_append(http_headers.ptr, "User-Agent: llama-cpp");
417
- // Check if hf-token or bearer-token was specified
418
- if (!bearer_token.empty()) {
419
- std::string auth_header = "Authorization: Bearer " + bearer_token;
420
- http_headers.ptr = curl_slist_append(http_headers.ptr, auth_header.c_str());
421
- }
422
-
423
- curl_easy_setopt(curl, CURLOPT_HTTPHEADER, http_headers.ptr);
424
- common_curl_easy_setopt_head(curl, url);
425
- return common_curl_perf(curl) == CURLE_OK;
426
- }
427
-
428
- // download one single file from remote URL to local path
429
- static bool common_download_file_single_online(const std::string & url,
430
- const std::string & path,
431
- const std::string & bearer_token) {
432
- static const int max_attempts = 3;
433
- static const int retry_delay_seconds = 2;
434
- for (int i = 0; i < max_attempts; ++i) {
435
- std::string etag;
436
-
437
- // Check if the file already exists locally
438
- const auto file_exists = std::filesystem::exists(path);
439
- if (file_exists) {
440
- etag = read_etag(path);
441
- } else {
442
- LOG_INF("%s: no previous model file found %s\n", __func__, path.c_str());
443
- }
444
-
445
- bool head_request_ok = false;
446
- bool should_download = !file_exists; // by default, we should download if the file does not exist
447
-
448
- // Initialize libcurl
449
- curl_ptr curl(curl_easy_init(), &curl_easy_cleanup);
450
- common_load_model_from_url_headers headers;
451
- curl_easy_setopt(curl.get(), CURLOPT_HEADERDATA, &headers);
452
- curl_slist_ptr http_headers;
453
- const bool was_perform_successful = common_download_head(curl.get(), http_headers, url, bearer_token);
454
- if (!was_perform_successful) {
455
- head_request_ok = false;
456
- }
457
-
458
- long http_code = 0;
459
- curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &http_code);
460
- if (http_code == 200) {
461
- head_request_ok = true;
462
- } else {
463
- LOG_WRN("%s: HEAD invalid http status code received: %ld\n", __func__, http_code);
464
- head_request_ok = false;
465
- }
466
-
467
- // if head_request_ok is false, we don't have the etag or last-modified headers
468
- // we leave should_download as-is, which is true if the file does not exist
469
- bool should_download_from_scratch = false;
470
- if (head_request_ok) {
471
- // check if ETag or Last-Modified headers are different
472
- // if it is, we need to download the file again
473
- if (!etag.empty() && etag != headers.etag) {
474
- LOG_WRN("%s: ETag header is different (%s != %s): triggering a new download\n", __func__, etag.c_str(),
475
- headers.etag.c_str());
476
- should_download = true;
477
- should_download_from_scratch = true;
478
- }
479
- }
480
-
481
- const bool accept_ranges_supported = !headers.accept_ranges.empty() && headers.accept_ranges != "none";
482
- if (should_download) {
483
- if (file_exists &&
484
- !accept_ranges_supported) { // Resumable downloads not supported, delete and start again.
485
- LOG_WRN("%s: deleting previous downloaded file: %s\n", __func__, path.c_str());
486
- if (remove(path.c_str()) != 0) {
487
- LOG_ERR("%s: unable to delete file: %s\n", __func__, path.c_str());
488
- return false;
489
- }
490
- }
491
-
492
- const std::string path_temporary = path + ".downloadInProgress";
493
- if (should_download_from_scratch) {
494
- if (std::filesystem::exists(path_temporary)) {
495
- if (remove(path_temporary.c_str()) != 0) {
496
- LOG_ERR("%s: unable to delete file: %s\n", __func__, path_temporary.c_str());
497
- return false;
498
- }
499
- }
500
-
501
- if (std::filesystem::exists(path)) {
502
- if (remove(path.c_str()) != 0) {
503
- LOG_ERR("%s: unable to delete file: %s\n", __func__, path.c_str());
504
- return false;
505
- }
506
- }
507
- }
508
- if (head_request_ok) {
509
- write_etag(path, headers.etag);
510
- }
511
-
512
- // start the download
513
- LOG_INF("%s: trying to download model from %s to %s (server_etag:%s, server_last_modified:%s)...\n",
514
- __func__, llama_download_hide_password_in_url(url).c_str(), path_temporary.c_str(),
515
- headers.etag.c_str(), headers.last_modified.c_str());
516
- const bool was_pull_successful = common_pull_file(curl.get(), path_temporary);
517
- if (!was_pull_successful) {
518
- if (i + 1 < max_attempts) {
519
- const int exponential_backoff_delay = std::pow(retry_delay_seconds, i) * 1000;
520
- LOG_WRN("%s: retrying after %d milliseconds...\n", __func__, exponential_backoff_delay);
521
- std::this_thread::sleep_for(std::chrono::milliseconds(exponential_backoff_delay));
522
- } else {
523
- LOG_ERR("%s: curl_easy_perform() failed after %d attempts\n", __func__, max_attempts);
524
- }
525
-
526
- continue;
527
- }
528
-
529
- long http_code = 0;
530
- curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &http_code);
531
- if (http_code < 200 || http_code >= 400) {
532
- LOG_ERR("%s: invalid http status code received: %ld\n", __func__, http_code);
533
- return false;
534
- }
535
-
536
- if (rename(path_temporary.c_str(), path.c_str()) != 0) {
537
- LOG_ERR("%s: unable to rename file: %s to %s\n", __func__, path_temporary.c_str(), path.c_str());
538
- return false;
539
- }
540
- } else {
541
- LOG_INF("%s: using cached file: %s\n", __func__, path.c_str());
542
- }
543
-
544
- break;
545
- }
546
-
547
- return true;
548
- }
549
-
550
- std::pair<long, std::vector<char>> common_remote_get_content(const std::string & url, const common_remote_params & params) {
551
- curl_ptr curl(curl_easy_init(), &curl_easy_cleanup);
552
- curl_slist_ptr http_headers;
553
- std::vector<char> res_buffer;
554
-
555
- curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str());
556
- curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 1L);
557
- curl_easy_setopt(curl.get(), CURLOPT_FOLLOWLOCATION, 1L);
558
- curl_easy_setopt(curl.get(), CURLOPT_VERBOSE, 1L);
559
- typedef size_t(*CURLOPT_WRITEFUNCTION_PTR)(void * ptr, size_t size, size_t nmemb, void * data);
560
- auto write_callback = [](void * ptr, size_t size, size_t nmemb, void * data) -> size_t {
561
- auto data_vec = static_cast<std::vector<char> *>(data);
562
- data_vec->insert(data_vec->end(), (char *)ptr, (char *)ptr + size * nmemb);
563
- return size * nmemb;
564
- };
565
- curl_easy_setopt(curl.get(), CURLOPT_WRITEFUNCTION, static_cast<CURLOPT_WRITEFUNCTION_PTR>(write_callback));
566
- curl_easy_setopt(curl.get(), CURLOPT_WRITEDATA, &res_buffer);
567
- #if defined(_WIN32)
568
- curl_easy_setopt(curl.get(), CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA);
569
- #endif
570
- if (params.timeout > 0) {
571
- curl_easy_setopt(curl.get(), CURLOPT_TIMEOUT, params.timeout);
572
- }
573
- if (params.max_size > 0) {
574
- curl_easy_setopt(curl.get(), CURLOPT_MAXFILESIZE, params.max_size);
575
- }
576
- http_headers.ptr = curl_slist_append(http_headers.ptr, "User-Agent: llama-cpp");
577
- for (const auto & header : params.headers) {
578
- http_headers.ptr = curl_slist_append(http_headers.ptr, header.c_str());
579
- }
580
- curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.ptr);
581
-
582
- CURLcode res = curl_easy_perform(curl.get());
583
-
584
- if (res != CURLE_OK) {
585
- std::string error_msg = curl_easy_strerror(res);
586
- throw std::runtime_error("error: cannot make GET request: " + error_msg);
587
- }
588
-
589
- long res_code;
590
- curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &res_code);
591
-
592
- return { res_code, std::move(res_buffer) };
593
- }
594
-
595
- #else
596
-
597
- static void print_progress(size_t current, size_t total) {
598
- if (!is_output_a_tty()) {
599
- return;
600
- }
601
-
602
- if (!total) {
603
- return;
604
- }
605
-
606
- size_t width = 50;
607
- size_t pct = (100 * current) / total;
608
- size_t pos = (width * current) / total;
609
-
610
- std::cout << "["
611
- << std::string(pos, '=')
612
- << (pos < width ? ">" : "")
613
- << std::string(width - pos, ' ')
614
- << "] " << std::setw(3) << pct << "% ("
615
- << current / (1024 * 1024) << " MB / "
616
- << total / (1024 * 1024) << " MB)\r";
617
- std::cout.flush();
618
- }
619
-
620
- static bool common_pull_file(httplib::Client & cli,
621
- const std::string & resolve_path,
622
- const std::string & path_tmp,
623
- bool supports_ranges,
624
- size_t existing_size,
625
- size_t & total_size) {
626
- std::ofstream ofs(path_tmp, std::ios::binary | std::ios::app);
627
- if (!ofs.is_open()) {
628
- LOG_ERR("%s: error opening local file for writing: %s\n", __func__, path_tmp.c_str());
629
- return false;
630
- }
631
-
632
- httplib::Headers headers;
633
- if (supports_ranges && existing_size > 0) {
634
- headers.emplace("Range", "bytes=" + std::to_string(existing_size) + "-");
635
- }
636
-
637
- std::atomic<size_t> downloaded{existing_size};
638
-
639
- auto res = cli.Get(resolve_path, headers,
640
- [&](const httplib::Response &response) {
641
- if (existing_size > 0 && response.status != 206) {
642
- LOG_WRN("%s: server did not respond with 206 Partial Content for a resume request. Status: %d\n", __func__, response.status);
643
- return false;
644
- }
645
- if (existing_size == 0 && response.status != 200) {
646
- LOG_WRN("%s: download received non-successful status code: %d\n", __func__, response.status);
647
- return false;
648
- }
649
- if (total_size == 0 && response.has_header("Content-Length")) {
650
- try {
651
- size_t content_length = std::stoull(response.get_header_value("Content-Length"));
652
- total_size = existing_size + content_length;
653
- } catch (const std::exception &e) {
654
- LOG_WRN("%s: invalid Content-Length header: %s\n", __func__, e.what());
655
- }
656
- }
657
- return true;
658
- },
659
- [&](const char *data, size_t len) {
660
- ofs.write(data, len);
661
- if (!ofs) {
662
- LOG_ERR("%s: error writing to file: %s\n", __func__, path_tmp.c_str());
663
- return false;
664
- }
665
- downloaded += len;
666
- print_progress(downloaded, total_size);
667
- return true;
668
- },
669
- nullptr
670
- );
671
-
672
- std::cout << "\n";
673
-
674
- if (!res) {
675
- LOG_ERR("%s: error during download. Status: %d\n", __func__, res ? res->status : -1);
676
- return false;
677
- }
678
-
679
- return true;
680
- }
681
-
682
- // download one single file from remote URL to local path
683
- static bool common_download_file_single_online(const std::string & url,
684
- const std::string & path,
685
- const std::string & bearer_token) {
686
- static const int max_attempts = 3;
687
- static const int retry_delay_seconds = 2;
688
-
689
- auto [cli, parts] = common_http_client(url);
690
-
691
- httplib::Headers default_headers = {{"User-Agent", "llama-cpp"}};
692
- if (!bearer_token.empty()) {
693
- default_headers.insert({"Authorization", "Bearer " + bearer_token});
694
- }
695
- cli.set_default_headers(default_headers);
696
-
697
- const bool file_exists = std::filesystem::exists(path);
698
-
699
- std::string last_etag;
700
- if (file_exists) {
701
- last_etag = read_etag(path);
702
- } else {
703
- LOG_INF("%s: no previous model file found %s\n", __func__, path.c_str());
704
- }
705
-
706
- for (int i = 0; i < max_attempts; ++i) {
707
- auto head = cli.Head(parts.path);
708
- bool head_ok = head && head->status >= 200 && head->status < 300;
709
- if (!head_ok) {
710
- LOG_WRN("%s: HEAD invalid http status code received: %d\n", __func__, head ? head->status : -1);
711
- if (file_exists) {
712
- LOG_INF("%s: Using cached file (HEAD failed): %s\n", __func__, path.c_str());
713
- return true;
714
- }
715
- }
716
-
717
- std::string etag;
718
- if (head_ok && head->has_header("ETag")) {
719
- etag = head->get_header_value("ETag");
720
- }
721
-
722
- size_t total_size = 0;
723
- if (head_ok && head->has_header("Content-Length")) {
724
- try {
725
- total_size = std::stoull(head->get_header_value("Content-Length"));
726
- } catch (const std::exception& e) {
727
- LOG_WRN("%s: Invalid Content-Length in HEAD response: %s\n", __func__, e.what());
728
- }
729
- }
730
-
731
- bool supports_ranges = false;
732
- if (head_ok && head->has_header("Accept-Ranges")) {
733
- supports_ranges = head->get_header_value("Accept-Ranges") != "none";
734
- }
735
-
736
- bool should_download_from_scratch = false;
737
- if (!last_etag.empty() && !etag.empty() && last_etag != etag) {
738
- LOG_WRN("%s: ETag header is different (%s != %s): triggering a new download\n", __func__,
739
- last_etag.c_str(), etag.c_str());
740
- should_download_from_scratch = true;
741
- }
742
-
743
- if (file_exists) {
744
- if (!should_download_from_scratch) {
745
- LOG_INF("%s: using cached file: %s\n", __func__, path.c_str());
746
- return true;
747
- }
748
- LOG_WRN("%s: deleting previous downloaded file: %s\n", __func__, path.c_str());
749
- if (remove(path.c_str()) != 0) {
750
- LOG_ERR("%s: unable to delete file: %s\n", __func__, path.c_str());
751
- return false;
752
- }
753
- }
754
-
755
- const std::string path_temporary = path + ".downloadInProgress";
756
- size_t existing_size = 0;
757
-
758
- if (std::filesystem::exists(path_temporary)) {
759
- if (supports_ranges && !should_download_from_scratch) {
760
- existing_size = std::filesystem::file_size(path_temporary);
761
- } else if (remove(path_temporary.c_str()) != 0) {
762
- LOG_ERR("%s: unable to delete file: %s\n", __func__, path_temporary.c_str());
763
- return false;
764
- }
765
- }
766
-
767
- // start the download
768
- LOG_INF("%s: trying to download model from %s to %s (etag:%s)...\n",
769
- __func__, common_http_show_masked_url(parts).c_str(), path_temporary.c_str(), etag.c_str());
770
- const bool was_pull_successful = common_pull_file(cli, parts.path, path_temporary, supports_ranges, existing_size, total_size);
771
- if (!was_pull_successful) {
772
- if (i + 1 < max_attempts) {
773
- const int exponential_backoff_delay = std::pow(retry_delay_seconds, i) * 1000;
774
- LOG_WRN("%s: retrying after %d milliseconds...\n", __func__, exponential_backoff_delay);
775
- std::this_thread::sleep_for(std::chrono::milliseconds(exponential_backoff_delay));
776
- } else {
777
- LOG_ERR("%s: download failed after %d attempts\n", __func__, max_attempts);
778
- }
779
- continue;
780
- }
781
-
782
- if (std::rename(path_temporary.c_str(), path.c_str()) != 0) {
783
- LOG_ERR("%s: unable to rename file: %s to %s\n", __func__, path_temporary.c_str(), path.c_str());
784
- return false;
785
- }
786
- if (!etag.empty()) {
787
- write_etag(path, etag);
788
- }
789
- break;
790
- }
791
-
792
- return true;
793
- }
794
-
795
- std::pair<long, std::vector<char>> common_remote_get_content(const std::string & url,
796
- const common_remote_params & params) {
797
- auto [cli, parts] = common_http_client(url);
798
-
799
- httplib::Headers headers = {{"User-Agent", "llama-cpp"}};
800
- for (const auto & header : params.headers) {
801
- size_t pos = header.find(':');
802
- if (pos != std::string::npos) {
803
- headers.emplace(header.substr(0, pos), header.substr(pos + 1));
804
- } else {
805
- headers.emplace(header, "");
806
- }
807
- }
808
-
809
- if (params.timeout > 0) {
810
- cli.set_read_timeout(params.timeout, 0);
811
- cli.set_write_timeout(params.timeout, 0);
812
- }
813
-
814
- std::vector<char> buf;
815
- auto res = cli.Get(parts.path, headers,
816
- [&](const char *data, size_t len) {
817
- buf.insert(buf.end(), data, data + len);
818
- return params.max_size == 0 ||
819
- buf.size() <= static_cast<size_t>(params.max_size);
820
- },
821
- nullptr
822
- );
823
-
824
- if (!res) {
825
- throw std::runtime_error("error: cannot make GET request");
826
- }
827
-
828
- return { res->status, std::move(buf) };
829
- }
830
-
831
- #endif // LLAMA_USE_CURL
832
-
833
- static bool common_download_file_single(const std::string & url,
834
- const std::string & path,
835
- const std::string & bearer_token,
836
- bool offline) {
837
- if (!offline) {
838
- return common_download_file_single_online(url, path, bearer_token);
839
- }
840
-
841
- if (!std::filesystem::exists(path)) {
842
- LOG_ERR("%s: required file is not available in cache (offline mode): %s\n", __func__, path.c_str());
843
- return false;
844
- }
845
-
846
- LOG_INF("%s: using cached file (offline mode): %s\n", __func__, path.c_str());
847
- return true;
848
- }
849
-
850
- // download multiple files from remote URLs to local paths
851
- // the input is a vector of pairs <url, path>
852
- static bool common_download_file_multiple(const std::vector<std::pair<std::string, std::string>> & urls, const std::string & bearer_token, bool offline) {
853
- // Prepare download in parallel
854
- std::vector<std::future<bool>> futures_download;
855
- for (auto const & item : urls) {
856
- futures_download.push_back(std::async(std::launch::async, [bearer_token, offline](const std::pair<std::string, std::string> & it) -> bool {
857
- return common_download_file_single(it.first, it.second, bearer_token, offline);
858
- }, item));
859
- }
860
-
861
- // Wait for all downloads to complete
862
- for (auto & f : futures_download) {
863
- if (!f.get()) {
864
- return false;
865
- }
866
- }
867
-
868
- return true;
869
- }
870
-
871
- static bool common_download_model(
872
- const common_params_model & model,
873
- const std::string & bearer_token,
874
- bool offline) {
875
- // Basic validation of the model.url
876
- if (model.url.empty()) {
877
- LOG_ERR("%s: invalid model url\n", __func__);
878
- return false;
879
- }
880
-
881
- if (!common_download_file_single(model.url, model.path, bearer_token, offline)) {
882
- return false;
883
- }
884
-
885
- // check for additional GGUFs split to download
886
- int n_split = 0;
887
- {
888
- struct gguf_init_params gguf_params = {
889
- /*.no_alloc = */ true,
890
- /*.ctx = */ NULL,
891
- };
892
- auto * ctx_gguf = gguf_init_from_file(model.path.c_str(), gguf_params);
893
- if (!ctx_gguf) {
894
- LOG_ERR("\n%s: failed to load input GGUF from %s\n", __func__, model.path.c_str());
895
- return false;
896
- }
897
-
898
- auto key_n_split = gguf_find_key(ctx_gguf, LLM_KV_SPLIT_COUNT);
899
- if (key_n_split >= 0) {
900
- n_split = gguf_get_val_u16(ctx_gguf, key_n_split);
901
- }
902
-
903
- gguf_free(ctx_gguf);
904
- }
905
-
906
- if (n_split > 1) {
907
- char split_prefix[PATH_MAX] = {0};
908
- char split_url_prefix[LLAMA_MAX_URL_LENGTH] = {0};
909
-
910
- // Verify the first split file format
911
- // and extract split URL and PATH prefixes
912
- {
913
- if (!llama_split_prefix(split_prefix, sizeof(split_prefix), model.path.c_str(), 0, n_split)) {
914
- LOG_ERR("\n%s: unexpected model file name: %s n_split=%d\n", __func__, model.path.c_str(), n_split);
915
- return false;
916
- }
917
-
918
- if (!llama_split_prefix(split_url_prefix, sizeof(split_url_prefix), model.url.c_str(), 0, n_split)) {
919
- LOG_ERR("\n%s: unexpected model url: %s n_split=%d\n", __func__, model.url.c_str(), n_split);
920
- return false;
921
- }
922
- }
923
-
924
- std::vector<std::pair<std::string, std::string>> urls;
925
- for (int idx = 1; idx < n_split; idx++) {
926
- char split_path[PATH_MAX] = {0};
927
- llama_split_path(split_path, sizeof(split_path), split_prefix, idx, n_split);
928
-
929
- char split_url[LLAMA_MAX_URL_LENGTH] = {0};
930
- llama_split_path(split_url, sizeof(split_url), split_url_prefix, idx, n_split);
931
-
932
- if (std::string(split_path) == model.path) {
933
- continue; // skip the already downloaded file
934
- }
935
-
936
- urls.push_back({split_url, split_path});
937
- }
938
-
939
- // Download in parallel
940
- common_download_file_multiple(urls, bearer_token, offline);
941
- }
942
-
943
- return true;
944
- }
945
-
946
- /**
947
- * Allow getting the HF file from the HF repo with tag (like ollama), for example:
948
- * - bartowski/Llama-3.2-3B-Instruct-GGUF:q4
949
- * - bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M
950
- * - bartowski/Llama-3.2-3B-Instruct-GGUF:q5_k_s
951
- * Tag is optional, default to "latest" (meaning it checks for Q4_K_M first, then Q4, then if not found, return the first GGUF file in repo)
952
- *
953
- * Return pair of <repo, file> (with "repo" already having tag removed)
954
- *
955
- * Note: we use the Ollama-compatible HF API, but not using the blobId. Instead, we use the special "ggufFile" field which returns the value for "hf_file". This is done to be backward-compatible with existing cache files.
956
- */
957
- static struct common_hf_file_res common_get_hf_file(const std::string & hf_repo_with_tag, const std::string & bearer_token, bool offline) {
958
- auto parts = string_split<std::string>(hf_repo_with_tag, ':');
959
- std::string tag = parts.size() > 1 ? parts.back() : "latest";
960
- std::string hf_repo = parts[0];
961
- if (string_split<std::string>(hf_repo, '/').size() != 2) {
962
- throw std::invalid_argument("error: invalid HF repo format, expected <user>/<model>[:quant]\n");
963
- }
964
-
965
- std::string url = get_model_endpoint() + "v2/" + hf_repo + "/manifests/" + tag;
966
-
967
- // headers
968
- std::vector<std::string> headers;
969
- headers.push_back("Accept: application/json");
970
- if (!bearer_token.empty()) {
971
- headers.push_back("Authorization: Bearer " + bearer_token);
972
- }
973
- // Important: the User-Agent must be "llama-cpp" to get the "ggufFile" field in the response
974
- // User-Agent header is already set in common_remote_get_content, no need to set it here
975
-
976
- // we use "=" to avoid clashing with other component, while still being allowed on windows
977
- std::string cached_response_fname = "manifest=" + hf_repo + "=" + tag + ".json";
978
- string_replace_all(cached_response_fname, "/", "_");
979
- std::string cached_response_path = fs_get_cache_file(cached_response_fname);
980
-
981
- // make the request
982
- common_remote_params params;
983
- params.headers = headers;
984
- long res_code = 0;
985
- std::string res_str;
986
- bool use_cache = false;
987
- if (!offline) {
988
- try {
989
- auto res = common_remote_get_content(url, params);
990
- res_code = res.first;
991
- res_str = std::string(res.second.data(), res.second.size());
992
- } catch (const std::exception & e) {
993
- LOG_WRN("error: failed to get manifest at %s: %s\n", url.c_str(), e.what());
994
- }
995
- }
996
- if (res_code == 0) {
997
- if (std::filesystem::exists(cached_response_path)) {
998
- LOG_WRN("trying to read manifest from cache: %s\n", cached_response_path.c_str());
999
- res_str = read_file(cached_response_path);
1000
- res_code = 200;
1001
- use_cache = true;
1002
- } else {
1003
- throw std::runtime_error(
1004
- offline ? "error: failed to get manifest (offline mode)"
1005
- : "error: failed to get manifest (check your internet connection)");
1006
- }
1007
- }
1008
- std::string ggufFile;
1009
- std::string mmprojFile;
1010
-
1011
- if (res_code == 200 || res_code == 304) {
1012
- try {
1013
- auto j = json::parse(res_str);
1014
-
1015
- if (j.contains("ggufFile") && j["ggufFile"].contains("rfilename")) {
1016
- ggufFile = j["ggufFile"]["rfilename"].get<std::string>();
1017
- }
1018
- if (j.contains("mmprojFile") && j["mmprojFile"].contains("rfilename")) {
1019
- mmprojFile = j["mmprojFile"]["rfilename"].get<std::string>();
1020
- }
1021
- } catch (const std::exception & e) {
1022
- throw std::runtime_error(std::string("error parsing manifest JSON: ") + e.what());
1023
- }
1024
- if (!use_cache) {
1025
- // if not using cached response, update the cache file
1026
- write_file(cached_response_path, res_str);
1027
- }
1028
- } else if (res_code == 401) {
1029
- throw std::runtime_error("error: model is private or does not exist; if you are accessing a gated model, please provide a valid HF token");
1030
- } else {
1031
- throw std::runtime_error(string_format("error from HF API, response code: %ld, data: %s", res_code, res_str.c_str()));
1032
- }
1033
-
1034
- // check response
1035
- if (ggufFile.empty()) {
1036
- throw std::runtime_error("error: model does not have ggufFile");
1037
- }
1038
-
1039
- return { hf_repo, ggufFile, mmprojFile };
1040
- }
1041
-
1042
- //
1043
- // Docker registry functions
1044
- //
1045
-
1046
- static std::string common_docker_get_token(const std::string & repo) {
1047
- std::string url = "https://auth.docker.io/token?service=registry.docker.io&scope=repository:" + repo + ":pull";
1048
-
1049
- common_remote_params params;
1050
- auto res = common_remote_get_content(url, params);
1051
-
1052
- if (res.first != 200) {
1053
- throw std::runtime_error("Failed to get Docker registry token, HTTP code: " + std::to_string(res.first));
1054
- }
1055
-
1056
- std::string response_str(res.second.begin(), res.second.end());
1057
- nlohmann::ordered_json response = nlohmann::ordered_json::parse(response_str);
1058
-
1059
- if (!response.contains("token")) {
1060
- throw std::runtime_error("Docker registry token response missing 'token' field");
1061
- }
1062
-
1063
- return response["token"].get<std::string>();
1064
- }
1065
-
1066
- static std::string common_docker_resolve_model(const std::string & docker) {
1067
- // Parse ai/smollm2:135M-Q4_0
1068
- size_t colon_pos = docker.find(':');
1069
- std::string repo, tag;
1070
- if (colon_pos != std::string::npos) {
1071
- repo = docker.substr(0, colon_pos);
1072
- tag = docker.substr(colon_pos + 1);
1073
- } else {
1074
- repo = docker;
1075
- tag = "latest";
1076
- }
1077
-
1078
- // ai/ is the default
1079
- size_t slash_pos = docker.find('/');
1080
- if (slash_pos == std::string::npos) {
1081
- repo.insert(0, "ai/");
1082
- }
1083
-
1084
- LOG_INF("%s: Downloading Docker Model: %s:%s\n", __func__, repo.c_str(), tag.c_str());
1085
- try {
1086
- // --- helper: digest validation ---
1087
- auto validate_oci_digest = [](const std::string & digest) -> std::string {
1088
- // Expected: algo:hex ; start with sha256 (64 hex chars)
1089
- // You can extend this map if supporting other algorithms in future.
1090
- static const std::regex re("^sha256:([a-fA-F0-9]{64})$");
1091
- std::smatch m;
1092
- if (!std::regex_match(digest, m, re)) {
1093
- throw std::runtime_error("Invalid OCI digest format received in manifest: " + digest);
1094
- }
1095
- // normalize hex to lowercase
1096
- std::string normalized = digest;
1097
- std::transform(normalized.begin()+7, normalized.end(), normalized.begin()+7, [](unsigned char c){
1098
- return std::tolower(c);
1099
- });
1100
- return normalized;
1101
- };
1102
-
1103
- std::string token = common_docker_get_token(repo); // Get authentication token
1104
-
1105
- // Get manifest
1106
- const std::string url_prefix = "https://registry-1.docker.io/v2/" + repo;
1107
- std::string manifest_url = url_prefix + "/manifests/" + tag;
1108
- common_remote_params manifest_params;
1109
- manifest_params.headers.push_back("Authorization: Bearer " + token);
1110
- manifest_params.headers.push_back(
1111
- "Accept: application/vnd.docker.distribution.manifest.v2+json,application/vnd.oci.image.manifest.v1+json");
1112
- auto manifest_res = common_remote_get_content(manifest_url, manifest_params);
1113
- if (manifest_res.first != 200) {
1114
- throw std::runtime_error("Failed to get Docker manifest, HTTP code: " + std::to_string(manifest_res.first));
1115
- }
1116
-
1117
- std::string manifest_str(manifest_res.second.begin(), manifest_res.second.end());
1118
- nlohmann::ordered_json manifest = nlohmann::ordered_json::parse(manifest_str);
1119
- std::string gguf_digest; // Find the GGUF layer
1120
- if (manifest.contains("layers")) {
1121
- for (const auto & layer : manifest["layers"]) {
1122
- if (layer.contains("mediaType")) {
1123
- std::string media_type = layer["mediaType"].get<std::string>();
1124
- if (media_type == "application/vnd.docker.ai.gguf.v3" ||
1125
- media_type.find("gguf") != std::string::npos) {
1126
- gguf_digest = layer["digest"].get<std::string>();
1127
- break;
1128
- }
1129
- }
1130
- }
1131
- }
1132
-
1133
- if (gguf_digest.empty()) {
1134
- throw std::runtime_error("No GGUF layer found in Docker manifest");
1135
- }
1136
-
1137
- // Validate & normalize digest
1138
- gguf_digest = validate_oci_digest(gguf_digest);
1139
- LOG_DBG("%s: Using validated digest: %s\n", __func__, gguf_digest.c_str());
1140
-
1141
- // Prepare local filename
1142
- std::string model_filename = repo;
1143
- std::replace(model_filename.begin(), model_filename.end(), '/', '_');
1144
- model_filename += "_" + tag + ".gguf";
1145
- std::string local_path = fs_get_cache_file(model_filename);
1146
-
1147
- const std::string blob_url = url_prefix + "/blobs/" + gguf_digest;
1148
- if (!common_download_file_single(blob_url, local_path, token, false)) {
1149
- throw std::runtime_error("Failed to download Docker Model");
1150
- }
1151
-
1152
- LOG_INF("%s: Downloaded Docker Model to: %s\n", __func__, local_path.c_str());
1153
- return local_path;
1154
- } catch (const std::exception & e) {
1155
- LOG_ERR("%s: Docker Model download failed: %s\n", __func__, e.what());
1156
- throw;
1157
- }
1158
- }
1159
-
1160
170
  //
1161
171
  // utils
1162
172
  //
@@ -1730,6 +740,20 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
1730
740
  exit(0);
1731
741
  }
1732
742
  ));
743
+ add_opt(common_arg(
744
+ {"-cl", "--cache-list"},
745
+ "show list of models in cache",
746
+ [](common_params &) {
747
+ printf("model cache directory: %s\n", fs_get_cache_directory().c_str());
748
+ auto models = common_list_cached_models();
749
+ printf("number of models in cache: %zu\n", models.size());
750
+ for (size_t i = 0; i < models.size(); i++) {
751
+ auto & model = models[i];
752
+ printf("%4d. %s\n", (int) i + 1, model.to_string().c_str());
753
+ }
754
+ exit(0);
755
+ }
756
+ ));
1733
757
  add_opt(common_arg(
1734
758
  {"--completion-bash"},
1735
759
  "print source-able bash completion script for llama.cpp",
@@ -2030,7 +1054,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
2030
1054
  params.system_prompt.pop_back();
2031
1055
  }
2032
1056
  }
2033
- ).set_examples({LLAMA_EXAMPLE_MAIN}));
1057
+ ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_DIFFUSION}));
2034
1058
  add_opt(common_arg(
2035
1059
  {"--in-file"}, "FNAME",
2036
1060
  "an input file (repeat to specify multiple files)",
@@ -2768,6 +1792,20 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
2768
1792
  params.image.emplace_back(value);
2769
1793
  }
2770
1794
  ).set_examples({LLAMA_EXAMPLE_MTMD}));
1795
+ add_opt(common_arg(
1796
+ {"--image-min-tokens"}, "N",
1797
+ "minimum number of tokens each image can take, only used by vision models with dynamic resolution (default: read from model)",
1798
+ [](common_params & params, int value) {
1799
+ params.image_min_tokens = value;
1800
+ }
1801
+ ).set_examples(mmproj_examples).set_env("LLAMA_ARG_IMAGE_MIN_TOKENS"));
1802
+ add_opt(common_arg(
1803
+ {"--image-max-tokens"}, "N",
1804
+ "maximum number of tokens each image can take, only used by vision models with dynamic resolution (default: read from model)",
1805
+ [](common_params & params, int value) {
1806
+ params.image_max_tokens = value;
1807
+ }
1808
+ ).set_examples(mmproj_examples).set_env("LLAMA_ARG_IMAGE_MAX_TOKENS"));
2771
1809
  if (llama_supports_rpc()) {
2772
1810
  add_opt(common_arg(
2773
1811
  {"--rpc"}, "SERVERS",
@@ -3203,7 +2241,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
3203
2241
  ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
3204
2242
  add_opt(common_arg(
3205
2243
  {"--parse-special"},
3206
- string_format("prase special tokens (chat, tool, etc) (default: %s)", params.parse_special ? "true" : "false"),
2244
+ string_format("parse special tokens (chat, tool, etc) (default: %s)", params.parse_special ? "true" : "false"),
3207
2245
  [](common_params & params) {
3208
2246
  params.parse_special = true;
3209
2247
  }
@@ -3215,6 +2253,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
3215
2253
  params.is_pp_shared = true;
3216
2254
  }
3217
2255
  ).set_examples({LLAMA_EXAMPLE_BENCH, LLAMA_EXAMPLE_PARALLEL}));
2256
+ add_opt(common_arg(
2257
+ {"-tgs"},
2258
+ string_format("is the text generation separated across the different sequences (default: %s)", params.is_tg_separate ? "true" : "false"),
2259
+ [](common_params & params) {
2260
+ params.is_tg_separate = true;
2261
+ }
2262
+ ).set_examples({LLAMA_EXAMPLE_BENCH, LLAMA_EXAMPLE_PARALLEL}));
3218
2263
  add_opt(common_arg(
3219
2264
  {"-npp"}, "n0,n1,...",
3220
2265
  "number of prompt tokens",