@fugood/llama.node 0.3.16 → 0.3.17

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (202)
  1. package/CMakeLists.txt +3 -0
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  7. package/bin/linux-cuda/x64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  9. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  10. package/bin/win32/arm64/llama-node.node +0 -0
  11. package/bin/win32/arm64/node.lib +0 -0
  12. package/bin/win32/x64/llama-node.node +0 -0
  13. package/bin/win32/x64/node.lib +0 -0
  14. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/arm64/node.lib +0 -0
  16. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  17. package/bin/win32-vulkan/x64/node.lib +0 -0
  18. package/lib/binding.ts +5 -0
  19. package/package.json +1 -1
  20. package/src/LlamaCompletionWorker.cpp +8 -0
  21. package/src/LlamaCompletionWorker.h +1 -0
  22. package/src/LlamaContext.cpp +3 -2
  23. package/src/llama.cpp/.github/workflows/build-linux-cross.yml +124 -0
  24. package/src/llama.cpp/.github/workflows/build.yml +70 -27
  25. package/src/llama.cpp/.github/workflows/docker.yml +6 -6
  26. package/src/llama.cpp/.github/workflows/server.yml +7 -11
  27. package/src/llama.cpp/CMakeLists.txt +23 -1
  28. package/src/llama.cpp/common/CMakeLists.txt +6 -3
  29. package/src/llama.cpp/common/arg.cpp +809 -105
  30. package/src/llama.cpp/common/arg.h +9 -0
  31. package/src/llama.cpp/common/chat.cpp +1 -1
  32. package/src/llama.cpp/common/common.cpp +31 -521
  33. package/src/llama.cpp/common/common.h +17 -36
  34. package/src/llama.cpp/common/json-schema-to-grammar.cpp +3 -0
  35. package/src/llama.cpp/common/llguidance.cpp +30 -47
  36. package/src/llama.cpp/common/minja/chat-template.hpp +15 -7
  37. package/src/llama.cpp/common/minja/minja.hpp +119 -93
  38. package/src/llama.cpp/common/sampling.cpp +3 -0
  39. package/src/llama.cpp/docs/build.md +122 -7
  40. package/src/llama.cpp/examples/CMakeLists.txt +0 -9
  41. package/src/llama.cpp/examples/batched/batched.cpp +1 -1
  42. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +1 -1
  43. package/src/llama.cpp/examples/embedding/embedding.cpp +7 -1
  44. package/src/llama.cpp/examples/export-lora/export-lora.cpp +1 -1
  45. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +15 -16
  46. package/src/llama.cpp/examples/gritlm/gritlm.cpp +1 -1
  47. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +210 -8
  48. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
  49. package/src/llama.cpp/examples/llava/CMakeLists.txt +39 -24
  50. package/src/llama.cpp/examples/llava/clip-impl.h +345 -0
  51. package/src/llama.cpp/examples/llava/clip.cpp +2152 -1803
  52. package/src/llama.cpp/examples/llava/clip.h +39 -22
  53. package/src/llama.cpp/examples/llava/deprecation-warning.cpp +22 -0
  54. package/src/llama.cpp/examples/llava/llava.cpp +64 -52
  55. package/src/llama.cpp/examples/llava/mtmd-cli.cpp +344 -0
  56. package/src/llama.cpp/examples/llava/mtmd.cpp +708 -0
  57. package/src/llama.cpp/examples/llava/mtmd.h +168 -0
  58. package/src/llama.cpp/examples/llava/{qwen2vl-cli.cpp → qwen2vl-test.cpp} +83 -31
  59. package/src/llama.cpp/examples/main/main.cpp +16 -5
  60. package/src/llama.cpp/examples/parallel/parallel.cpp +3 -1
  61. package/src/llama.cpp/examples/passkey/passkey.cpp +1 -1
  62. package/src/llama.cpp/examples/perplexity/perplexity.cpp +17 -3
  63. package/src/llama.cpp/examples/quantize/quantize.cpp +115 -2
  64. package/src/llama.cpp/examples/rpc/CMakeLists.txt +4 -2
  65. package/src/llama.cpp/examples/rpc/rpc-server.cpp +163 -8
  66. package/src/llama.cpp/examples/run/CMakeLists.txt +12 -1
  67. package/src/llama.cpp/examples/run/run.cpp +14 -28
  68. package/src/llama.cpp/examples/server/httplib.h +313 -247
  69. package/src/llama.cpp/examples/server/server.cpp +238 -139
  70. package/src/llama.cpp/examples/server/utils.hpp +51 -2
  71. package/src/llama.cpp/examples/speculative/speculative.cpp +1 -1
  72. package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +1 -1
  73. package/src/llama.cpp/examples/sycl/build.sh +2 -2
  74. package/src/llama.cpp/examples/sycl/win-build-sycl.bat +2 -2
  75. package/src/llama.cpp/examples/tts/tts.cpp +6 -9
  76. package/src/llama.cpp/ggml/CMakeLists.txt +8 -2
  77. package/src/llama.cpp/ggml/cmake/GitVars.cmake +22 -0
  78. package/src/llama.cpp/ggml/include/ggml-cpu.h +5 -0
  79. package/src/llama.cpp/ggml/include/ggml-rpc.h +6 -1
  80. package/src/llama.cpp/ggml/include/ggml.h +66 -99
  81. package/src/llama.cpp/ggml/src/CMakeLists.txt +10 -7
  82. package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +0 -2
  83. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +8 -4
  84. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +5 -5
  85. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +692 -1534
  86. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +613 -122
  87. package/src/llama.cpp/ggml/src/ggml-cann/common.h +135 -1
  88. package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +507 -137
  89. package/src/llama.cpp/ggml/src/ggml-common.h +12 -6
  90. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +48 -22
  91. package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.cpp +158 -0
  92. package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.h +16 -0
  93. package/src/llama.cpp/ggml/src/ggml-cpu/common.h +72 -0
  94. package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +1 -1
  95. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +896 -192
  96. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +2 -21
  97. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +754 -404
  98. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1003 -13519
  99. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +2 -0
  100. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +2 -7
  101. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +0 -1
  102. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +3 -4
  103. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +533 -88
  104. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +8809 -0
  105. package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +110 -0
  106. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +892 -0
  107. package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.cpp +186 -0
  108. package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.h +28 -0
  109. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +258 -0
  110. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +802 -0
  111. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +7 -0
  112. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +1 -0
  113. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +0 -4
  114. package/src/llama.cpp/ggml/src/ggml-impl.h +52 -18
  115. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +70 -3
  116. package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +67 -119
  117. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +1023 -260
  118. package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +293 -40
  119. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +96 -22
  120. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
  121. package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +350 -0
  122. package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.hpp +39 -0
  123. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +0 -35
  124. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +2 -292
  125. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +79 -90
  126. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +967 -438
  127. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +22 -23
  128. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +24 -20
  129. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +1 -4
  130. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +204 -280
  131. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +84 -74
  132. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +1 -3
  133. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +37 -49
  134. package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +7 -22
  135. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +4 -14
  136. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +204 -118
  137. package/src/llama.cpp/ggml/src/ggml-sycl/rope.hpp +1 -3
  138. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +23 -0
  139. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +646 -114
  140. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +12 -0
  141. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +17 -8
  142. package/src/llama.cpp/ggml/src/ggml.c +141 -245
  143. package/src/llama.cpp/ggml/src/gguf.cpp +1 -0
  144. package/src/llama.cpp/include/llama.h +30 -11
  145. package/src/llama.cpp/models/ggml-vocab-llama4.gguf.inp +112 -0
  146. package/src/llama.cpp/models/ggml-vocab-llama4.gguf.out +46 -0
  147. package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.inp +112 -0
  148. package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.out +46 -0
  149. package/src/llama.cpp/requirements/requirements-all.txt +2 -0
  150. package/src/llama.cpp/requirements/requirements-gguf_editor_gui.txt +3 -0
  151. package/src/llama.cpp/src/CMakeLists.txt +3 -2
  152. package/src/llama.cpp/src/llama-adapter.cpp +37 -1
  153. package/src/llama.cpp/src/llama-arch.cpp +160 -17
  154. package/src/llama.cpp/src/llama-arch.h +16 -0
  155. package/src/llama.cpp/src/llama-chat.cpp +82 -17
  156. package/src/llama.cpp/src/llama-chat.h +6 -2
  157. package/src/llama.cpp/src/llama-context.cpp +108 -92
  158. package/src/llama.cpp/src/llama-context.h +1 -2
  159. package/src/llama.cpp/src/llama-graph.cpp +189 -119
  160. package/src/llama.cpp/src/llama-graph.h +26 -6
  161. package/src/llama.cpp/src/llama-hparams.h +13 -0
  162. package/src/llama.cpp/src/llama-kv-cache.cpp +70 -123
  163. package/src/llama.cpp/src/llama-kv-cache.h +41 -115
  164. package/src/llama.cpp/src/llama-memory.h +1 -1
  165. package/src/llama.cpp/src/llama-mmap.cpp +1 -1
  166. package/src/llama.cpp/src/llama-model-loader.cpp +10 -5
  167. package/src/llama.cpp/src/llama-model-loader.h +5 -3
  168. package/src/llama.cpp/src/llama-model.cpp +1760 -534
  169. package/src/llama.cpp/src/llama-model.h +13 -1
  170. package/src/llama.cpp/src/llama-quant.cpp +29 -8
  171. package/src/llama.cpp/src/llama-sampling.cpp +7 -1
  172. package/src/llama.cpp/src/llama-vocab.cpp +44 -6
  173. package/src/llama.cpp/src/llama.cpp +1 -1
  174. package/src/llama.cpp/tests/CMakeLists.txt +43 -30
  175. package/src/llama.cpp/tests/test-arg-parser.cpp +51 -4
  176. package/src/llama.cpp/tests/test-backend-ops.cpp +82 -43
  177. package/src/llama.cpp/tests/test-chat-template.cpp +34 -13
  178. package/src/llama.cpp/tests/test-chat.cpp +12 -2
  179. package/src/llama.cpp/{examples/gbnf-validator/gbnf-validator.cpp → tests/test-gbnf-validator.cpp} +2 -2
  180. package/src/llama.cpp/tests/test-grammar-integration.cpp +3 -2
  181. package/src/llama.cpp/tests/test-grammar-llguidance.cpp +63 -2
  182. package/src/llama.cpp/tests/test-grammar-parser.cpp +3 -1
  183. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +17 -1
  184. package/src/llama.cpp/tests/test-llama-grammar.cpp +2 -1
  185. package/src/llama.cpp/{examples/quantize-stats/quantize-stats.cpp → tests/test-quantize-stats.cpp} +3 -1
  186. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +2 -1
  187. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +2 -1
  188. package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +0 -5
  189. package/src/llama.cpp/examples/llava/gemma3-cli.cpp +0 -341
  190. package/src/llama.cpp/examples/llava/llava-cli.cpp +0 -332
  191. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +0 -354
  192. package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +0 -6
  193. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +0 -30
  194. package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +0 -19
  195. package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +0 -234
  196. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +0 -197
  197. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +0 -190
  198. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +0 -204
  199. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +0 -191
  200. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +0 -218
  201. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +0 -216
  202. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +0 -295
package/src/llama.cpp/common/arg.cpp
@@ -1,12 +1,24 @@
+ #include "gguf.h" // for reading GGUF splits
  #include "arg.h"

+ #include "common.h"
  #include "log.h"
  #include "sampling.h"
  #include "chat.h"

+ // fix problem with std::min and std::max
+ #if defined(_WIN32)
+ #define WIN32_LEAN_AND_MEAN
+ #ifndef NOMINMAX
+ #   define NOMINMAX
+ #endif
+ #include <windows.h>
+ #endif
+
  #include <algorithm>
  #include <climits>
  #include <cstdarg>
+ #include <filesystem>
  #include <fstream>
  #include <regex>
  #include <set>
@@ -14,10 +26,42 @@
  #include <thread>
  #include <vector>

+ //#define LLAMA_USE_CURL
+
+ #if defined(LLAMA_USE_CURL)
+ #include <curl/curl.h>
+ #include <curl/easy.h>
+ #include <future>
+ #endif
+
  #include "json-schema-to-grammar.h"

  using json = nlohmann::ordered_json;

+ std::initializer_list<enum llama_example> mmproj_examples = {
+     LLAMA_EXAMPLE_LLAVA,
+     // TODO: add LLAMA_EXAMPLE_SERVER when it's ready
+ };
+
+ static std::string read_file(const std::string & fname) {
+     std::ifstream file(fname);
+     if (!file) {
+         throw std::runtime_error(string_format("error: failed to open file '%s'\n", fname.c_str()));
+     }
+     std::string content((std::istreambuf_iterator<char>(file)), std::istreambuf_iterator<char>());
+     file.close();
+     return content;
+ }
+
+ static void write_file(const std::string & fname, const std::string & content) {
+     std::ofstream file(fname);
+     if (!file) {
+         throw std::runtime_error(string_format("error: failed to open file '%s'\n", fname.c_str()));
+     }
+     file << content;
+     file.close();
+ }
+
  common_arg & common_arg::set_examples(std::initializer_list<enum llama_example> examples) {
      this->examples = std::move(examples);
      return *this;
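Note on the two helpers added in the hunk above: `read_file` slurps the whole file through a pair of `std::istreambuf_iterator`s, and the extra parentheses around the first iterator argument are what keep the declaration from being parsed as a function prototype (the "most vexing parse"). A standalone sketch of the same idiom, not taken from the package, using an arbitrary file name:

    #include <fstream>
    #include <iterator>
    #include <string>

    int main() {
        std::ofstream("demo.txt") << "hello";                       // same role as write_file
        std::ifstream in("demo.txt");
        std::string content((std::istreambuf_iterator<char>(in)),   // extra parens avoid the most vexing parse
                            std::istreambuf_iterator<char>());
        return content == "hello" ? 0 : 1;                          // same role as read_file
    }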
@@ -126,47 +170,637 @@ std::string common_arg::to_string() {
  }

  //
- // utils
+ // downloader
+ //
+
+ struct common_hf_file_res {
+     std::string repo; // repo name with ":tag" removed
+     std::string ggufFile;
+     std::string mmprojFile;
+ };
+
+ #ifdef LLAMA_USE_CURL
+
+ bool common_has_curl() {
+     return true;
+ }
+
+ #ifdef __linux__
+ #include <linux/limits.h>
+ #elif defined(_WIN32)
+ #   if !defined(PATH_MAX)
+ #       define PATH_MAX MAX_PATH
+ #   endif
+ #elif defined(_AIX)
+ #include <sys/limits.h>
+ #else
+ #include <sys/syslimits.h>
+ #endif
+ #define LLAMA_CURL_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083
+
+ //
+ // CURL utils
  //

- static void common_params_handle_model_default(
-         std::string & model,
-         const std::string & model_url,
-         std::string & hf_repo,
-         std::string & hf_file,
-         const std::string & hf_token,
-         const std::string & model_default) {
-     if (!hf_repo.empty()) {
-         // short-hand to avoid specifying --hf-file -> default it to --model
-         if (hf_file.empty()) {
-             if (model.empty()) {
-                 auto auto_detected = common_get_hf_file(hf_repo, hf_token);
-                 if (auto_detected.first.empty() || auto_detected.second.empty()) {
-                     exit(1); // built without CURL, error message already printed
+ using curl_ptr = std::unique_ptr<CURL, decltype(&curl_easy_cleanup)>;
+
+ // cannot use unique_ptr for curl_slist, because we cannot update without destroying the old one
+ struct curl_slist_ptr {
+     struct curl_slist * ptr = nullptr;
+     ~curl_slist_ptr() {
+         if (ptr) {
+             curl_slist_free_all(ptr);
+         }
+     }
+ };
+
+ #define CURL_MAX_RETRY 3
+ #define CURL_RETRY_DELAY_SECONDS 2
+
+ static bool curl_perform_with_retry(const std::string & url, CURL * curl, int max_attempts, int retry_delay_seconds) {
+     int remaining_attempts = max_attempts;
+     char * method = nullptr;
+     curl_easy_getinfo(curl, CURLINFO_EFFECTIVE_METHOD, &method);
+
+     while (remaining_attempts > 0) {
+         LOG_INF("%s: %s %s (attempt %d of %d)...\n", __func__ , method, url.c_str(), max_attempts - remaining_attempts + 1, max_attempts);
+
+         CURLcode res = curl_easy_perform(curl);
+         if (res == CURLE_OK) {
+             return true;
+         }
+
+         int exponential_backoff_delay = std::pow(retry_delay_seconds, max_attempts - remaining_attempts) * 1000;
+         LOG_WRN("%s: curl_easy_perform() failed: %s, retrying after %d milliseconds...\n", __func__, curl_easy_strerror(res), exponential_backoff_delay);
+
+         remaining_attempts--;
+         if (remaining_attempts == 0) break;
+         std::this_thread::sleep_for(std::chrono::milliseconds(exponential_backoff_delay));
+     }
+
+     LOG_ERR("%s: curl_easy_perform() failed after %d attempts\n", __func__, max_attempts);
+
+     return false;
+ }
+
+ // download one single file from remote URL to local path
+ static bool common_download_file_single(const std::string & url, const std::string & path, const std::string & bearer_token) {
+     // Initialize libcurl
+     curl_ptr curl(curl_easy_init(), &curl_easy_cleanup);
+     curl_slist_ptr http_headers;
+     if (!curl) {
+         LOG_ERR("%s: error initializing libcurl\n", __func__);
+         return false;
+     }
+
+     // Set the URL, allow to follow http redirection
+     curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str());
+     curl_easy_setopt(curl.get(), CURLOPT_FOLLOWLOCATION, 1L);
+
+     http_headers.ptr = curl_slist_append(http_headers.ptr, "User-Agent: llama-cpp");
+     // Check if hf-token or bearer-token was specified
+     if (!bearer_token.empty()) {
+         std::string auth_header = "Authorization: Bearer " + bearer_token;
+         http_headers.ptr = curl_slist_append(http_headers.ptr, auth_header.c_str());
+     }
+     curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.ptr);
+
+ #if defined(_WIN32)
+     // CURLSSLOPT_NATIVE_CA tells libcurl to use standard certificate store of
+     // operating system. Currently implemented under MS-Windows.
+     curl_easy_setopt(curl.get(), CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA);
+ #endif
+
+     // Check if the file already exists locally
+     auto file_exists = std::filesystem::exists(path);
+
+     // If the file exists, check its JSON metadata companion file.
+     std::string metadata_path = path + ".json";
+     nlohmann::json metadata; // TODO @ngxson : get rid of this json, use regex instead
+     std::string etag;
+     std::string last_modified;
+
+     if (file_exists) {
+         // Try and read the JSON metadata file (note: stream autoclosed upon exiting this block).
+         std::ifstream metadata_in(metadata_path);
+         if (metadata_in.good()) {
+             try {
+                 metadata_in >> metadata;
+                 LOG_DBG("%s: previous metadata file found %s: %s\n", __func__, metadata_path.c_str(), metadata.dump().c_str());
+                 if (metadata.contains("etag") && metadata.at("etag").is_string()) {
+                     etag = metadata.at("etag");
                  }
-                 hf_repo = auto_detected.first;
-                 hf_file = auto_detected.second;
-             } else {
-                 hf_file = model;
+                 if (metadata.contains("lastModified") && metadata.at("lastModified").is_string()) {
+                     last_modified = metadata.at("lastModified");
+                 }
+             } catch (const nlohmann::json::exception & e) {
+                 LOG_ERR("%s: error reading metadata file %s: %s\n", __func__, metadata_path.c_str(), e.what());
+             }
+         }
+         // if we cannot open the metadata file, we assume that the downloaded file is not valid (etag and last-modified are left empty, so we will download it again)
+     } else {
+         LOG_INF("%s: no previous model file found %s\n", __func__, path.c_str());
+     }
+
+     // Send a HEAD request to retrieve the etag and last-modified headers
+     struct common_load_model_from_url_headers {
+         std::string etag;
+         std::string last_modified;
+     };
+
+     common_load_model_from_url_headers headers;
+     bool head_request_ok = false;
+     bool should_download = !file_exists; // by default, we should download if the file does not exist
+
+     // get ETag to see if the remote file has changed
+     {
+         typedef size_t(*CURLOPT_HEADERFUNCTION_PTR)(char *, size_t, size_t, void *);
+         auto header_callback = [](char * buffer, size_t /*size*/, size_t n_items, void * userdata) -> size_t {
+             common_load_model_from_url_headers * headers = (common_load_model_from_url_headers *) userdata;
+
+             static std::regex header_regex("([^:]+): (.*)\r\n");
+             static std::regex etag_regex("ETag", std::regex_constants::icase);
+             static std::regex last_modified_regex("Last-Modified", std::regex_constants::icase);
+
+             std::string header(buffer, n_items);
+             std::smatch match;
+             if (std::regex_match(header, match, header_regex)) {
+                 const std::string & key = match[1];
+                 const std::string & value = match[2];
+                 if (std::regex_match(key, match, etag_regex)) {
+                     headers->etag = value;
+                 } else if (std::regex_match(key, match, last_modified_regex)) {
+                     headers->last_modified = value;
+                 }
+             }
+             return n_items;
+         };
+
+         curl_easy_setopt(curl.get(), CURLOPT_NOBODY, 1L); // will trigger the HEAD verb
+         curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 1L); // hide head request progress
+         curl_easy_setopt(curl.get(), CURLOPT_HEADERFUNCTION, static_cast<CURLOPT_HEADERFUNCTION_PTR>(header_callback));
+         curl_easy_setopt(curl.get(), CURLOPT_HEADERDATA, &headers);
+
+         // we only allow retrying once for HEAD requests
+         // this is for the use case of using running offline (no internet), retrying can be annoying
+         bool was_perform_successful = curl_perform_with_retry(url, curl.get(), 1, 0);
+         if (!was_perform_successful) {
+             head_request_ok = false;
+         }
+
+         long http_code = 0;
+         curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &http_code);
+         if (http_code == 200) {
+             head_request_ok = true;
+         } else {
+             LOG_WRN("%s: HEAD invalid http status code received: %ld\n", __func__, http_code);
+             head_request_ok = false;
+         }
+     }
+
+     // if head_request_ok is false, we don't have the etag or last-modified headers
+     // we leave should_download as-is, which is true if the file does not exist
+     if (head_request_ok) {
+         // check if ETag or Last-Modified headers are different
+         // if it is, we need to download the file again
+         if (!etag.empty() && etag != headers.etag) {
+             LOG_WRN("%s: ETag header is different (%s != %s): triggering a new download\n", __func__, etag.c_str(), headers.etag.c_str());
+             should_download = true;
+         } else if (!last_modified.empty() && last_modified != headers.last_modified) {
+             LOG_WRN("%s: Last-Modified header is different (%s != %s): triggering a new download\n", __func__, last_modified.c_str(), headers.last_modified.c_str());
+             should_download = true;
+         }
+     }
+
+     if (should_download) {
+         std::string path_temporary = path + ".downloadInProgress";
+         if (file_exists) {
+             LOG_WRN("%s: deleting previous downloaded file: %s\n", __func__, path.c_str());
+             if (remove(path.c_str()) != 0) {
+                 LOG_ERR("%s: unable to delete file: %s\n", __func__, path.c_str());
+                 return false;
+             }
+         }
+
+         // Set the output file
+
+         struct FILE_deleter {
+             void operator()(FILE * f) const {
+                 fclose(f);
+             }
+         };
+
+         std::unique_ptr<FILE, FILE_deleter> outfile(fopen(path_temporary.c_str(), "wb"));
+         if (!outfile) {
+             LOG_ERR("%s: error opening local file for writing: %s\n", __func__, path.c_str());
+             return false;
+         }
+
+         typedef size_t(*CURLOPT_WRITEFUNCTION_PTR)(void * data, size_t size, size_t nmemb, void * fd);
+         auto write_callback = [](void * data, size_t size, size_t nmemb, void * fd) -> size_t {
+             return fwrite(data, size, nmemb, (FILE *)fd);
+         };
+         curl_easy_setopt(curl.get(), CURLOPT_NOBODY, 0L);
+         curl_easy_setopt(curl.get(), CURLOPT_WRITEFUNCTION, static_cast<CURLOPT_WRITEFUNCTION_PTR>(write_callback));
+         curl_easy_setopt(curl.get(), CURLOPT_WRITEDATA, outfile.get());
+
+         // display download progress
+         curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 0L);
+
+         // helper function to hide password in URL
+         auto llama_download_hide_password_in_url = [](const std::string & url) -> std::string {
+             std::size_t protocol_pos = url.find("://");
+             if (protocol_pos == std::string::npos) {
+                 return url; // Malformed URL
+             }
+
+             std::size_t at_pos = url.find('@', protocol_pos + 3);
+             if (at_pos == std::string::npos) {
+                 return url; // No password in URL
+             }
+
+             return url.substr(0, protocol_pos + 3) + "********" + url.substr(at_pos);
+         };
+
+         // start the download
+         LOG_INF("%s: trying to download model from %s to %s (server_etag:%s, server_last_modified:%s)...\n", __func__,
+             llama_download_hide_password_in_url(url).c_str(), path.c_str(), headers.etag.c_str(), headers.last_modified.c_str());
+         bool was_perform_successful = curl_perform_with_retry(url, curl.get(), CURL_MAX_RETRY, CURL_RETRY_DELAY_SECONDS);
+         if (!was_perform_successful) {
+             return false;
+         }
+
+         long http_code = 0;
+         curl_easy_getinfo (curl.get(), CURLINFO_RESPONSE_CODE, &http_code);
+         if (http_code < 200 || http_code >= 400) {
+             LOG_ERR("%s: invalid http status code received: %ld\n", __func__, http_code);
+             return false;
+         }
+
+         // Causes file to be closed explicitly here before we rename it.
+         outfile.reset();
+
+         // Write the updated JSON metadata file.
+         metadata.update({
+             {"url", url},
+             {"etag", headers.etag},
+             {"lastModified", headers.last_modified}
+         });
+         write_file(metadata_path, metadata.dump(4));
+         LOG_DBG("%s: file metadata saved: %s\n", __func__, metadata_path.c_str());
+
+         if (rename(path_temporary.c_str(), path.c_str()) != 0) {
+             LOG_ERR("%s: unable to rename file: %s to %s\n", __func__, path_temporary.c_str(), path.c_str());
+             return false;
+         }
+     } else {
+         LOG_INF("%s: using cached file: %s\n", __func__, path.c_str());
+     }
+
+     return true;
+ }
+
+ // download multiple files from remote URLs to local paths
+ // the input is a vector of pairs <url, path>
+ static bool common_download_file_multiple(const std::vector<std::pair<std::string, std::string>> & urls, const std::string & bearer_token) {
+     // Prepare download in parallel
+     std::vector<std::future<bool>> futures_download;
+     for (auto const & item : urls) {
+         futures_download.push_back(std::async(std::launch::async, [bearer_token](const std::pair<std::string, std::string> & it) -> bool {
+             return common_download_file_single(it.first, it.second, bearer_token);
+         }, item));
+     }
+
+     // Wait for all downloads to complete
+     for (auto & f : futures_download) {
+         if (!f.get()) {
+             return false;
+         }
+     }
+
+     return true;
+ }
+
+ static bool common_download_model(
+         const common_params_model & model,
+         const std::string & bearer_token) {
+     // Basic validation of the model.url
+     if (model.url.empty()) {
+         LOG_ERR("%s: invalid model url\n", __func__);
+         return false;
+     }
+
+     if (!common_download_file_single(model.url, model.path, bearer_token)) {
+         return false;
+     }
+
+     // check for additional GGUFs split to download
+     int n_split = 0;
+     {
+         struct gguf_init_params gguf_params = {
+             /*.no_alloc = */ true,
+             /*.ctx      = */ NULL,
+         };
+         auto * ctx_gguf = gguf_init_from_file(model.path.c_str(), gguf_params);
+         if (!ctx_gguf) {
+             LOG_ERR("\n%s: failed to load input GGUF from %s\n", __func__, model.path.c_str());
+             return false;
+         }
+
+         auto key_n_split = gguf_find_key(ctx_gguf, LLM_KV_SPLIT_COUNT);
+         if (key_n_split >= 0) {
+             n_split = gguf_get_val_u16(ctx_gguf, key_n_split);
+         }
+
+         gguf_free(ctx_gguf);
+     }
+
+     if (n_split > 1) {
+         char split_prefix[PATH_MAX] = {0};
+         char split_url_prefix[LLAMA_CURL_MAX_URL_LENGTH] = {0};
+
+         // Verify the first split file format
+         // and extract split URL and PATH prefixes
+         {
+             if (!llama_split_prefix(split_prefix, sizeof(split_prefix), model.path.c_str(), 0, n_split)) {
+                 LOG_ERR("\n%s: unexpected model file name: %s n_split=%d\n", __func__, model.path.c_str(), n_split);
+                 return false;
+             }
+
+             if (!llama_split_prefix(split_url_prefix, sizeof(split_url_prefix), model.url.c_str(), 0, n_split)) {
+                 LOG_ERR("\n%s: unexpected model url: %s n_split=%d\n", __func__, model.url.c_str(), n_split);
+                 return false;
+             }
+         }
+
+         std::vector<std::pair<std::string, std::string>> urls;
+         for (int idx = 1; idx < n_split; idx++) {
+             char split_path[PATH_MAX] = {0};
+             llama_split_path(split_path, sizeof(split_path), split_prefix, idx, n_split);
+
+             char split_url[LLAMA_CURL_MAX_URL_LENGTH] = {0};
+             llama_split_path(split_url, sizeof(split_url), split_url_prefix, idx, n_split);
+
+             if (std::string(split_path) == model.path) {
+                 continue; // skip the already downloaded file
+             }
+
+             urls.push_back({split_url, split_path});
+         }
+
+         // Download in parallel
+         common_download_file_multiple(urls, bearer_token);
+     }
+
+     return true;
+ }
+
+ std::pair<long, std::vector<char>> common_remote_get_content(const std::string & url, const common_remote_params & params) {
+     curl_ptr curl(curl_easy_init(), &curl_easy_cleanup);
+     curl_slist_ptr http_headers;
+     std::vector<char> res_buffer;
+
+     curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str());
+     curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 1L);
+     curl_easy_setopt(curl.get(), CURLOPT_FOLLOWLOCATION, 1L);
+     typedef size_t(*CURLOPT_WRITEFUNCTION_PTR)(void * ptr, size_t size, size_t nmemb, void * data);
+     auto write_callback = [](void * ptr, size_t size, size_t nmemb, void * data) -> size_t {
+         auto data_vec = static_cast<std::vector<char> *>(data);
+         data_vec->insert(data_vec->end(), (char *)ptr, (char *)ptr + size * nmemb);
+         return size * nmemb;
+     };
+     curl_easy_setopt(curl.get(), CURLOPT_WRITEFUNCTION, static_cast<CURLOPT_WRITEFUNCTION_PTR>(write_callback));
+     curl_easy_setopt(curl.get(), CURLOPT_WRITEDATA, &res_buffer);
+ #if defined(_WIN32)
+     curl_easy_setopt(curl.get(), CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA);
+ #endif
+     if (params.timeout > 0) {
+         curl_easy_setopt(curl.get(), CURLOPT_TIMEOUT, params.timeout);
+     }
+     if (params.max_size > 0) {
+         curl_easy_setopt(curl.get(), CURLOPT_MAXFILESIZE, params.max_size);
+     }
+     http_headers.ptr = curl_slist_append(http_headers.ptr, "User-Agent: llama-cpp");
+     for (const auto & header : params.headers) {
+         http_headers.ptr = curl_slist_append(http_headers.ptr, header.c_str());
+     }
+     curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.ptr);
+
+     CURLcode res = curl_easy_perform(curl.get());
+
+     if (res != CURLE_OK) {
+         std::string error_msg = curl_easy_strerror(res);
+         throw std::runtime_error("error: cannot make GET request: " + error_msg);
+     }
+
+     long res_code;
+     curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &res_code);
+
+     return { res_code, std::move(res_buffer) };
+ }
+
+ /**
+  * Allow getting the HF file from the HF repo with tag (like ollama), for example:
+  * - bartowski/Llama-3.2-3B-Instruct-GGUF:q4
+  * - bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M
+  * - bartowski/Llama-3.2-3B-Instruct-GGUF:q5_k_s
+  * Tag is optional, default to "latest" (meaning it checks for Q4_K_M first, then Q4, then if not found, return the first GGUF file in repo)
+  *
+  * Return pair of <repo, file> (with "repo" already having tag removed)
+  *
+  * Note: we use the Ollama-compatible HF API, but not using the blobId. Instead, we use the special "ggufFile" field which returns the value for "hf_file". This is done to be backward-compatible with existing cache files.
+  */
+ static struct common_hf_file_res common_get_hf_file(const std::string & hf_repo_with_tag, const std::string & bearer_token) {
+     auto parts = string_split<std::string>(hf_repo_with_tag, ':');
+     std::string tag = parts.size() > 1 ? parts.back() : "latest";
+     std::string hf_repo = parts[0];
+     if (string_split<std::string>(hf_repo, '/').size() != 2) {
+         throw std::invalid_argument("error: invalid HF repo format, expected <user>/<model>[:quant]\n");
+     }
+
+     std::string url = get_model_endpoint() + "v2/" + hf_repo + "/manifests/" + tag;
+
+     // headers
+     std::vector<std::string> headers;
+     headers.push_back("Accept: application/json");
+     if (!bearer_token.empty()) {
+         headers.push_back("Authorization: Bearer " + bearer_token);
+     }
+     // Important: the User-Agent must be "llama-cpp" to get the "ggufFile" field in the response
+     // User-Agent header is already set in common_remote_get_content, no need to set it here
+
+     // we use "=" to avoid clashing with other component, while still being allowed on windows
+     std::string cached_response_fname = "manifest=" + hf_repo + "=" + tag + ".json";
+     string_replace_all(cached_response_fname, "/", "_");
+     std::string cached_response_path = fs_get_cache_file(cached_response_fname);
+
+     // make the request
+     common_remote_params params;
+     params.headers = headers;
+     long res_code = 0;
+     std::string res_str;
+     bool use_cache = false;
+     try {
+         auto res = common_remote_get_content(url, params);
+         res_code = res.first;
+         res_str = std::string(res.second.data(), res.second.size());
+     } catch (const std::exception & e) {
+         LOG_WRN("error: failed to get manifest: %s\n", e.what());
+         LOG_WRN("try reading from cache\n");
+         // try to read from cache
+         try {
+             res_str = read_file(cached_response_path);
+             res_code = 200;
+             use_cache = true;
+         } catch (const std::exception & e) {
+             throw std::runtime_error("error: failed to get manifest (check your internet connection)");
+         }
+     }
+     std::string ggufFile;
+     std::string mmprojFile;
+
+     if (res_code == 200 || res_code == 304) {
+         // extract ggufFile.rfilename in json, using regex
+         {
+             std::regex pattern("\"ggufFile\"[\\s\\S]*?\"rfilename\"\\s*:\\s*\"([^\"]+)\"");
+             std::smatch match;
+             if (std::regex_search(res_str, match, pattern)) {
+                 ggufFile = match[1].str();
+             }
+         }
+         // extract mmprojFile.rfilename in json, using regex
+         {
+             std::regex pattern("\"mmprojFile\"[\\s\\S]*?\"rfilename\"\\s*:\\s*\"([^\"]+)\"");
+             std::smatch match;
+             if (std::regex_search(res_str, match, pattern)) {
+                 mmprojFile = match[1].str();
              }
          }
-         // make sure model path is present (for caching purposes)
-         if (model.empty()) {
-             // this is to avoid different repo having same file name, or same file name in different subdirs
-             std::string filename = hf_repo + "_" + hf_file;
-             // to make sure we don't have any slashes in the filename
-             string_replace_all(filename, "/", "_");
-             model = fs_get_cache_file(filename);
+         if (!use_cache) {
+             // if not using cached response, update the cache file
+             write_file(cached_response_path, res_str);
+         }
+     } else if (res_code == 401) {
+         throw std::runtime_error("error: model is private or does not exist; if you are accessing a gated model, please provide a valid HF token");
+     } else {
+         throw std::runtime_error(string_format("error from HF API, response code: %ld, data: %s", res_code, res_str.c_str()));
+     }
+
+     // check response
+     if (ggufFile.empty()) {
+         throw std::runtime_error("error: model does not have ggufFile");
+     }
+
+     return { hf_repo, ggufFile, mmprojFile };
+ }
+
+ #else
+
+ bool common_has_curl() {
+     return false;
+ }
+
+ static bool common_download_file_single(const std::string &, const std::string &, const std::string &) {
+     LOG_ERR("error: built without CURL, cannot download model from internet\n");
+     return false;
+ }
+
+ static bool common_download_file_multiple(const std::vector<std::pair<std::string, std::string>> &, const std::string &) {
+     LOG_ERR("error: built without CURL, cannot download model from the internet\n");
+     return false;
+ }
+
+ static bool common_download_model(
+         const common_params_model &,
+         const std::string &) {
+     LOG_ERR("error: built without CURL, cannot download model from the internet\n");
+     return false;
+ }
+
+ static struct common_hf_file_res common_get_hf_file(const std::string &, const std::string &) {
+     LOG_ERR("error: built without CURL, cannot download model from the internet\n");
+     return {};
+ }
+
+ std::pair<long, std::vector<char>> common_remote_get_content(const std::string & url, const common_remote_params &) {
+     if (!url.empty()) {
+         throw std::runtime_error("error: built without CURL, cannot download model from the internet");
+     }
+
+     return {};
+ }
+
+ #endif // LLAMA_USE_CURL
+
+ //
+ // utils
+ //
+
+ struct handle_model_result {
+     bool found_mmproj = false;
+     common_params_model mmproj;
+ };
+
+ static handle_model_result common_params_handle_model(
+         struct common_params_model & model,
+         const std::string & bearer_token,
+         const std::string & model_path_default) {
+     handle_model_result result;
+     // handle pre-fill default model path and url based on hf_repo and hf_file
+     {
+         if (!model.hf_repo.empty()) {
+             // short-hand to avoid specifying --hf-file -> default it to --model
+             if (model.hf_file.empty()) {
+                 if (model.path.empty()) {
+                     auto auto_detected = common_get_hf_file(model.hf_repo, bearer_token);
+                     if (auto_detected.repo.empty() || auto_detected.ggufFile.empty()) {
+                         exit(1); // built without CURL, error message already printed
+                     }
+                     model.hf_repo = auto_detected.repo;
+                     model.hf_file = auto_detected.ggufFile;
+                     if (!auto_detected.mmprojFile.empty()) {
+                         result.found_mmproj = true;
+                         result.mmproj.hf_repo = model.hf_repo;
+                         result.mmproj.hf_file = auto_detected.mmprojFile;
+                     }
+                 } else {
+                     model.hf_file = model.path;
+                 }
+             }
+
+             std::string model_endpoint = get_model_endpoint();
+             model.url = model_endpoint + model.hf_repo + "/resolve/main/" + model.hf_file;
+             // make sure model path is present (for caching purposes)
+             if (model.path.empty()) {
+                 // this is to avoid different repo having same file name, or same file name in different subdirs
+                 std::string filename = model.hf_repo + "_" + model.hf_file;
+                 // to make sure we don't have any slashes in the filename
+                 string_replace_all(filename, "/", "_");
+                 model.path = fs_get_cache_file(filename);
+             }
+
+         } else if (!model.url.empty()) {
+             if (model.path.empty()) {
+                 auto f = string_split<std::string>(model.url, '#').front();
+                 f = string_split<std::string>(f, '?').front();
+                 model.path = fs_get_cache_file(string_split<std::string>(f, '/').back());
+             }
+
+         } else if (model.path.empty()) {
+             model.path = model_path_default;
          }
-     } else if (!model_url.empty()) {
-         if (model.empty()) {
-             auto f = string_split<std::string>(model_url, '#').front();
-             f = string_split<std::string>(f, '?').front();
-             model = fs_get_cache_file(string_split<std::string>(f, '/').back());
+     }
+
+     // then, download it if needed
+     if (!model.url.empty()) {
+         bool ok = common_download_model(model, bearer_token);
+         if (!ok) {
+             LOG_ERR("error: failed to download model from %s\n", model.url.c_str());
+             exit(1);
          }
-     } else if (model.empty()) {
-         model = model_default;
      }
+
+     return result;
  }

  const std::vector<ggml_type> kv_cache_types = {
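One behavioral detail of the downloader added in the hunk above that is easy to miss: `curl_perform_with_retry` waits `pow(retry_delay_seconds, failed_attempts) * 1000` milliseconds between attempts, so with the defaults (`CURL_MAX_RETRY` 3, `CURL_RETRY_DELAY_SECONDS` 2) the delays grow geometrically: 1000 ms after the first failure, 2000 ms after the second, and no sleep after the final attempt. A minimal standalone sketch of that schedule (illustrative only, not code from the package):

    #include <chrono>
    #include <cmath>
    #include <cstdio>
    #include <thread>

    int main() {
        const int max_attempts = 3;          // mirrors CURL_MAX_RETRY
        const int retry_delay_seconds = 2;   // mirrors CURL_RETRY_DELAY_SECONDS
        for (int failed = 0; failed < max_attempts - 1; ++failed) {
            int delay_ms = (int) std::pow(retry_delay_seconds, failed) * 1000;   // 1000 ms, then 2000 ms
            std::printf("after failed attempt %d: sleep %d ms\n", failed + 1, delay_ms);
            std::this_thread::sleep_for(std::chrono::milliseconds(delay_ms));
        }
        return 0;
    }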
@@ -300,10 +934,25 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
          throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n");
      }

-     // TODO: refactor model params in a common struct
-     common_params_handle_model_default(params.model, params.model_url, params.hf_repo, params.hf_file, params.hf_token, DEFAULT_MODEL_PATH);
-     common_params_handle_model_default(params.speculative.model, params.speculative.model_url, params.speculative.hf_repo, params.speculative.hf_file, params.hf_token, "");
-     common_params_handle_model_default(params.vocoder.model, params.vocoder.model_url, params.vocoder.hf_repo, params.vocoder.hf_file, params.hf_token, "");
+     // handle model and download
+     {
+         auto res = common_params_handle_model(params.model, params.hf_token, DEFAULT_MODEL_PATH);
+         if (params.no_mmproj) {
+             params.mmproj = {};
+         } else if (res.found_mmproj && params.mmproj.path.empty() && params.mmproj.url.empty()) {
+             // optionally, handle mmproj model when -hf is specified
+             params.mmproj = res.mmproj;
+         }
+         // only download mmproj if the current example is using it
+         for (auto & ex : mmproj_examples) {
+             if (ctx_arg.ex == ex) {
+                 common_params_handle_model(params.mmproj, params.hf_token, "");
+                 break;
+             }
+         }
+         common_params_handle_model(params.speculative.model, params.hf_token, "");
+         common_params_handle_model(params.vocoder.model, params.hf_token, "");
+     }

      if (params.escape) {
          string_process_escapes(params.prompt);
@@ -322,6 +971,10 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
          params.kv_overrides.back().key[0] = 0;
      }

+     if (!params.tensor_buft_overrides.empty()) {
+         params.tensor_buft_overrides.push_back({nullptr, nullptr});
+     }
+
      if (params.reranking && params.embedding) {
          throw std::invalid_argument("error: either --embedding or --reranking can be specified, but not both");
      }
@@ -431,7 +1084,6 @@ static void common_params_print_completion(common_params_context & ctx_arg) {
          "llama-embedding",
          "llama-eval-callback",
          "llama-export-lora",
-         "llama-gbnf-validator",
          "llama-gen-docs",
          "llama-gguf",
          "llama-gguf-hash",
@@ -439,20 +1091,18 @@
          "llama-gritlm",
          "llama-imatrix",
          "llama-infill",
-         "llama-llava-cli",
+         "llama-mtmd-cli",
          "llama-llava-clip-quantize-cli",
          "llama-lookahead",
          "llama-lookup",
          "llama-lookup-create",
          "llama-lookup-merge",
          "llama-lookup-stats",
-         "llama-minicpmv-cli",
          "llama-parallel",
          "llama-passkey",
          "llama-perplexity",
          "llama-q8dot",
          "llama-quantize",
-         "llama-quantize-stats",
          "llama-qwen2vl-cli",
          "llama-retrieval",
          "llama-run",
@@ -541,6 +1191,9 @@ bool common_params_parse(int argc, char ** argv, common_params & params, llama_e
          fprintf(stderr, "%s\n", ex.what());
          ctx_arg.params = params_org;
          return false;
+     } catch (std::exception & ex) {
+         fprintf(stderr, "%s\n", ex.what());
+         exit(1); // for other exceptions, we exit with status code 1
      }

      return true;
@@ -841,13 +1494,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
          {"-f", "--file"}, "FNAME",
          "a file containing the prompt (default: none)",
          [](common_params & params, const std::string & value) {
-             std::ifstream file(value);
-             if (!file) {
-                 throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str()));
-             }
+             params.prompt = read_file(value);
              // store the external file name in params
              params.prompt_file = value;
-             std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.prompt));
              if (!params.prompt.empty() && params.prompt.back() == '\n') {
                  params.prompt.pop_back();
              }
@@ -857,11 +1506,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
          {"-sysf", "--system-prompt-file"}, "FNAME",
          "a file containing the system prompt (default: none)",
          [](common_params & params, const std::string & value) {
-             std::ifstream file(value);
-             if (!file) {
-                 throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str()));
-             }
-             std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.system_prompt));
+             params.system_prompt = read_file(value);
              if (!params.system_prompt.empty() && params.system_prompt.back() == '\n') {
                  params.system_prompt.pop_back();
              }
@@ -1285,23 +1930,32 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
      add_opt(common_arg(
          {"--grammar-file"}, "FNAME",
          "file to read grammar from",
+         [](common_params & params, const std::string & value) {
+             params.sampling.grammar = read_file(value);
+         }
+     ).set_sparam());
+     add_opt(common_arg(
+         {"-j", "--json-schema"}, "SCHEMA",
+         "JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object\nFor schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead",
+         [](common_params & params, const std::string & value) {
+             params.sampling.grammar = json_schema_to_grammar(json::parse(value));
+         }
+     ).set_sparam());
+     add_opt(common_arg(
+         {"-jf", "--json-schema-file"}, "FILE",
+         "File containing a JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object\nFor schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead",
          [](common_params & params, const std::string & value) {
              std::ifstream file(value);
              if (!file) {
                  throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str()));
              }
+             std::string schema;
              std::copy(
                  std::istreambuf_iterator<char>(file),
                  std::istreambuf_iterator<char>(),
-                 std::back_inserter(params.sampling.grammar)
+                 std::back_inserter(schema)
              );
-         }
-     ).set_sparam());
-     add_opt(common_arg(
-         {"-j", "--json-schema"}, "SCHEMA",
-         "JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object\nFor schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead",
-         [](common_params & params, const std::string & value) {
-             params.sampling.grammar = json_schema_to_grammar(json::parse(value));
+             params.sampling.grammar = json_schema_to_grammar(json::parse(schema));
          }
      ).set_sparam());
      add_opt(common_arg(
@@ -1559,11 +2213,32 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
      ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_CONT_BATCHING"));
      add_opt(common_arg(
          {"--mmproj"}, "FILE",
-         "path to a multimodal projector file for LLaVA. see examples/llava/README.md",
+         "path to a multimodal projector file. see examples/llava/README.md",
          [](common_params & params, const std::string & value) {
-             params.mmproj = value;
+             params.mmproj.path = value;
          }
-     ).set_examples({LLAMA_EXAMPLE_LLAVA}));
+     ).set_examples(mmproj_examples));
+     add_opt(common_arg(
+         {"--mmproj-url"}, "URL",
+         "URL to a multimodal projector file. see examples/llava/README.md",
+         [](common_params & params, const std::string & value) {
+             params.mmproj.url = value;
+         }
+     ).set_examples(mmproj_examples));
+     add_opt(common_arg(
+         {"--no-mmproj"},
+         "explicitly disable multimodal projector, useful when using -hf",
+         [](common_params & params) {
+             params.no_mmproj = true;
+         }
+     ).set_examples(mmproj_examples));
+     add_opt(common_arg(
+         {"--no-mmproj-offload"},
+         "do not offload multimodal projector to GPU",
+         [](common_params & params) {
+             params.mmproj_use_gpu = false;
+         }
+     ).set_examples(mmproj_examples));
      add_opt(common_arg(
          {"--image"}, "FILE",
          "path to an image file. use with multimodal models. Specify multiple times for batching",
@@ -1647,6 +2322,41 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
              exit(0);
          }
      ));
+     add_opt(common_arg(
+         {"--override-tensor", "-ot"}, "<tensor name pattern>=<buffer type>,...",
+         "override tensor buffer type", [](common_params & params, const std::string & value) {
+             /* static */ std::map<std::string, ggml_backend_buffer_type_t> buft_list;
+             if (buft_list.empty()) {
+                 // enumerate all the devices and add their buffer types to the list
+                 for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
+                     auto * dev = ggml_backend_dev_get(i);
+                     auto * buft = ggml_backend_dev_buffer_type(dev);
+                     if (buft) {
+                         buft_list[ggml_backend_buft_name(buft)] = buft;
+                     }
+                 }
+             }
+
+             for (const auto & override : string_split<std::string>(value, ',')) {
+                 std::string::size_type pos = override.find('=');
+                 if (pos == std::string::npos) {
+                     throw std::invalid_argument("invalid value");
+                 }
+                 std::string tensor_name = override.substr(0, pos);
+                 std::string buffer_type = override.substr(pos + 1);
+
+                 if (buft_list.find(buffer_type) == buft_list.end()) {
+                     printf("Available buffer types:\n");
+                     for (const auto & it : buft_list) {
+                         printf(" %s\n", ggml_backend_buft_name(it.second));
+                     }
+                     throw std::invalid_argument("unknown buffer type");
+                 }
+                 // FIXME: this leaks memory
+                 params.tensor_buft_overrides.push_back({strdup(tensor_name.c_str()), buft_list.at(buffer_type)});
+             }
+         }
+     ));
      add_opt(common_arg(
          {"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N",
          "number of layers to store in VRAM",
@@ -1790,51 +2500,52 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
              "or `--model-url` if set, otherwise %s)", DEFAULT_MODEL_PATH
          ),
          [](common_params & params, const std::string & value) {
-             params.model = value;
+             params.model.path = value;
          }
      ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}).set_env("LLAMA_ARG_MODEL"));
      add_opt(common_arg(
          {"-mu", "--model-url"}, "MODEL_URL",
          "model download url (default: unused)",
          [](common_params & params, const std::string & value) {
-             params.model_url = value;
+             params.model.url = value;
          }
      ).set_env("LLAMA_ARG_MODEL_URL"));
      add_opt(common_arg(
          {"-hf", "-hfr", "--hf-repo"}, "<user>/<model>[:quant]",
          "Hugging Face model repository; quant is optional, case-insensitive, default to Q4_K_M, or falls back to the first file in the repo if Q4_K_M doesn't exist.\n"
+         "mmproj is also downloaded automatically if available. to disable, add --no-mmproj\n"
          "example: unsloth/phi-4-GGUF:q4_k_m\n"
          "(default: unused)",
          [](common_params & params, const std::string & value) {
-             params.hf_repo = value;
+             params.model.hf_repo = value;
          }
      ).set_env("LLAMA_ARG_HF_REPO"));
      add_opt(common_arg(
          {"-hfd", "-hfrd", "--hf-repo-draft"}, "<user>/<model>[:quant]",
          "Same as --hf-repo, but for the draft model (default: unused)",
          [](common_params & params, const std::string & value) {
-             params.speculative.hf_repo = value;
+             params.speculative.model.hf_repo = value;
          }
      ).set_env("LLAMA_ARG_HFD_REPO"));
      add_opt(common_arg(
          {"-hff", "--hf-file"}, "FILE",
          "Hugging Face model file. If specified, it will override the quant in --hf-repo (default: unused)",
          [](common_params & params, const std::string & value) {
-             params.hf_file = value;
+             params.model.hf_file = value;
          }
      ).set_env("LLAMA_ARG_HF_FILE"));
      add_opt(common_arg(
          {"-hfv", "-hfrv", "--hf-repo-v"}, "<user>/<model>[:quant]",
          "Hugging Face model repository for the vocoder model (default: unused)",
          [](common_params & params, const std::string & value) {
-             params.vocoder.hf_repo = value;
+             params.vocoder.model.hf_repo = value;
          }
      ).set_env("LLAMA_ARG_HF_REPO_V"));
      add_opt(common_arg(
          {"-hffv", "--hf-file-v"}, "FILE",
          "Hugging Face model file for the vocoder model (default: unused)",
          [](common_params & params, const std::string & value) {
-             params.vocoder.hf_file = value;
+             params.vocoder.model.hf_file = value;
          }
      ).set_env("LLAMA_ARG_HF_FILE_V"));
      add_opt(common_arg(
@@ -1979,7 +2690,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
      ).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
      add_opt(common_arg(
          {"--host"}, "HOST",
-         string_format("ip address to listen (default: %s)", params.hostname.c_str()),
+         string_format("ip address to listen, or bind to an UNIX socket if the address ends with .sock (default: %s)", params.hostname.c_str()),
          [](common_params & params, const std::string & value) {
              params.hostname = value;
          }
@@ -2147,7 +2858,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
          [](common_params & params, const std::string & value) {
              params.chat_template = value;
          }
-     ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CHAT_TEMPLATE"));
+     ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_LLAVA}).set_env("LLAMA_ARG_CHAT_TEMPLATE"));
      add_opt(common_arg(
          {"--chat-template-file"}, "JINJA_TEMPLATE_FILE",
          string_format(
@@ -2157,14 +2868,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
2157
2868
  "list of built-in templates:\n%s", list_builtin_chat_templates().c_str()
2158
2869
  ),
2159
2870
  [](common_params & params, const std::string & value) {
2160
- std::ifstream file(value);
2161
- if (!file) {
2162
- throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str()));
2163
- }
2164
- std::copy(
2165
- std::istreambuf_iterator<char>(file),
2166
- std::istreambuf_iterator<char>(),
2167
- std::back_inserter(params.chat_template));
2871
+ params.chat_template = read_file(value);
2168
2872
  }
2169
2873
  ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CHAT_TEMPLATE_FILE"));
2170
2874
  add_opt(common_arg(
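The --chat-template-file handler now delegates to read_file. A helper equivalent to the inline std::ifstream logic it replaces would look roughly like this; the exact signature of the real read_file in common/ is an assumption, and string_format is the helper already used throughout this file:

#include <fstream>
#include <iterator>
#include <stdexcept>
#include <string>

// Sketch only: slurp a whole file into a string, mirroring the removed inline code above.
static std::string read_file(const std::string & path) {
    std::ifstream file(path);
    if (!file) {
        throw std::runtime_error(string_format("error: failed to open file '%s'\n", path.c_str()));
    }
    return std::string(std::istreambuf_iterator<char>(file),
                       std::istreambuf_iterator<char>());
}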
@@ -2454,7 +3158,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         {"-md", "--model-draft"}, "FNAME",
         "draft model for speculative decoding (default: unused)",
         [](common_params & params, const std::string & value) {
-            params.speculative.model = value;
+            params.speculative.model.path = value;
         }
     ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODEL_DRAFT"));
 
@@ -2462,7 +3166,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         {"-mv", "--model-vocoder"}, "FNAME",
         "vocoder model for audio generation (default: unused)",
         [](common_params & params, const std::string & value) {
-            params.vocoder.model = value;
+            params.vocoder.model.path = value;
         }
     ).set_examples({LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_SERVER}));
     add_opt(common_arg(
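In both the --model-draft and --model-vocoder handlers the spec is no longer a bare string, so downstream code reads the nested path field instead. A hedged sketch of what that means at a call site; llama_model_load_from_file and llama_model_default_params are taken from the public llama.h API, and the actual call sites in this package are not shown in the diff:

// Sketch only: consuming the refactored spec when loading the draft model.
llama_model_params mparams = llama_model_default_params();
// before: the spec itself was the path
//     llama_model * draft = llama_model_load_from_file(params.speculative.model.c_str(), mparams);
// after: the path is one field of the nested model spec
llama_model * draft = llama_model_load_from_file(params.speculative.model.path.c_str(), mparams);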
@@ -2485,10 +3189,10 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         {"--tts-oute-default"},
         string_format("use default OuteTTS models (note: can download weights from the internet)"),
         [](common_params & params) {
-            params.hf_repo = "OuteAI/OuteTTS-0.2-500M-GGUF";
-            params.hf_file = "OuteTTS-0.2-500M-Q8_0.gguf";
-            params.vocoder.hf_repo = "ggml-org/WavTokenizer";
-            params.vocoder.hf_file = "WavTokenizer-Large-75-F16.gguf";
+            params.model.hf_repo = "OuteAI/OuteTTS-0.2-500M-GGUF";
+            params.model.hf_file = "OuteTTS-0.2-500M-Q8_0.gguf";
+            params.vocoder.model.hf_repo = "ggml-org/WavTokenizer";
+            params.vocoder.model.hf_file = "WavTokenizer-Large-75-F16.gguf";
         }
     ).set_examples({LLAMA_EXAMPLE_TTS}));
 
@@ -2496,8 +3200,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         {"--embd-bge-small-en-default"},
         string_format("use default bge-small-en-v1.5 model (note: can download weights from the internet)"),
         [](common_params & params) {
-            params.hf_repo = "ggml-org/bge-small-en-v1.5-Q8_0-GGUF";
-            params.hf_file = "bge-small-en-v1.5-q8_0.gguf";
+            params.model.hf_repo = "ggml-org/bge-small-en-v1.5-Q8_0-GGUF";
+            params.model.hf_file = "bge-small-en-v1.5-q8_0.gguf";
             params.pooling_type = LLAMA_POOLING_TYPE_NONE;
             params.embd_normalize = 2;
             params.n_ctx = 512;
@@ -2510,8 +3214,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         {"--embd-e5-small-en-default"},
         string_format("use default e5-small-v2 model (note: can download weights from the internet)"),
         [](common_params & params) {
-            params.hf_repo = "ggml-org/e5-small-v2-Q8_0-GGUF";
-            params.hf_file = "e5-small-v2-q8_0.gguf";
+            params.model.hf_repo = "ggml-org/e5-small-v2-Q8_0-GGUF";
+            params.model.hf_file = "e5-small-v2-q8_0.gguf";
             params.pooling_type = LLAMA_POOLING_TYPE_NONE;
             params.embd_normalize = 2;
             params.n_ctx = 512;
@@ -2524,8 +3228,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         {"--embd-gte-small-default"},
         string_format("use default gte-small model (note: can download weights from the internet)"),
         [](common_params & params) {
-            params.hf_repo = "ggml-org/gte-small-Q8_0-GGUF";
-            params.hf_file = "gte-small-q8_0.gguf";
+            params.model.hf_repo = "ggml-org/gte-small-Q8_0-GGUF";
+            params.model.hf_file = "gte-small-q8_0.gguf";
             params.pooling_type = LLAMA_POOLING_TYPE_NONE;
             params.embd_normalize = 2;
             params.n_ctx = 512;
@@ -2538,8 +3242,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         {"--fim-qwen-1.5b-default"},
         string_format("use default Qwen 2.5 Coder 1.5B (note: can download weights from the internet)"),
         [](common_params & params) {
-            params.hf_repo = "ggml-org/Qwen2.5-Coder-1.5B-Q8_0-GGUF";
-            params.hf_file = "qwen2.5-coder-1.5b-q8_0.gguf";
+            params.model.hf_repo = "ggml-org/Qwen2.5-Coder-1.5B-Q8_0-GGUF";
+            params.model.hf_file = "qwen2.5-coder-1.5b-q8_0.gguf";
             params.port = 8012;
             params.n_gpu_layers = 99;
             params.flash_attn = true;
@@ -2554,8 +3258,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         {"--fim-qwen-3b-default"},
         string_format("use default Qwen 2.5 Coder 3B (note: can download weights from the internet)"),
         [](common_params & params) {
-            params.hf_repo = "ggml-org/Qwen2.5-Coder-3B-Q8_0-GGUF";
-            params.hf_file = "qwen2.5-coder-3b-q8_0.gguf";
+            params.model.hf_repo = "ggml-org/Qwen2.5-Coder-3B-Q8_0-GGUF";
+            params.model.hf_file = "qwen2.5-coder-3b-q8_0.gguf";
             params.port = 8012;
             params.n_gpu_layers = 99;
             params.flash_attn = true;
@@ -2570,8 +3274,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         {"--fim-qwen-7b-default"},
         string_format("use default Qwen 2.5 Coder 7B (note: can download weights from the internet)"),
         [](common_params & params) {
-            params.hf_repo = "ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF";
-            params.hf_file = "qwen2.5-coder-7b-q8_0.gguf";
+            params.model.hf_repo = "ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF";
+            params.model.hf_file = "qwen2.5-coder-7b-q8_0.gguf";
             params.port = 8012;
             params.n_gpu_layers = 99;
             params.flash_attn = true;
@@ -2586,10 +3290,10 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         {"--fim-qwen-7b-spec"},
         string_format("use Qwen 2.5 Coder 7B + 0.5B draft for speculative decoding (note: can download weights from the internet)"),
         [](common_params & params) {
-            params.hf_repo = "ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF";
-            params.hf_file = "qwen2.5-coder-7b-q8_0.gguf";
-            params.speculative.hf_repo = "ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF";
-            params.speculative.hf_file = "qwen2.5-coder-0.5b-q8_0.gguf";
+            params.model.hf_repo = "ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF";
+            params.model.hf_file = "qwen2.5-coder-7b-q8_0.gguf";
+            params.speculative.model.hf_repo = "ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF";
+            params.speculative.model.hf_file = "qwen2.5-coder-0.5b-q8_0.gguf";
             params.speculative.n_gpu_layers = 99;
             params.port = 8012;
             params.n_gpu_layers = 99;
@@ -2605,10 +3309,10 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         {"--fim-qwen-14b-spec"},
         string_format("use Qwen 2.5 Coder 14B + 0.5B draft for speculative decoding (note: can download weights from the internet)"),
         [](common_params & params) {
-            params.hf_repo = "ggml-org/Qwen2.5-Coder-14B-Q8_0-GGUF";
-            params.hf_file = "qwen2.5-coder-14b-q8_0.gguf";
-            params.speculative.hf_repo = "ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF";
-            params.speculative.hf_file = "qwen2.5-coder-0.5b-q8_0.gguf";
+            params.model.hf_repo = "ggml-org/Qwen2.5-Coder-14B-Q8_0-GGUF";
+            params.model.hf_file = "qwen2.5-coder-14b-q8_0.gguf";
+            params.speculative.model.hf_repo = "ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF";
+            params.speculative.model.hf_file = "qwen2.5-coder-0.5b-q8_0.gguf";
             params.speculative.n_gpu_layers = 99;
             params.port = 8012;
             params.n_gpu_layers = 99;