@fugood/llama.node 1.0.0-beta.5 → 1.0.0-beta.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (113)
  1. package/lib/binding.ts +3 -1
  2. package/lib/index.js +2 -0
  3. package/lib/index.ts +3 -1
  4. package/package.json +14 -14
  5. package/scripts/llama.cpp.patch +27 -26
  6. package/src/EmbeddingWorker.cpp +1 -1
  7. package/src/LlamaCompletionWorker.cpp +28 -7
  8. package/src/LlamaCompletionWorker.h +4 -0
  9. package/src/LlamaContext.cpp +14 -17
  10. package/src/common.hpp +7 -6
  11. package/src/llama.cpp/CMakeLists.txt +15 -4
  12. package/src/llama.cpp/common/CMakeLists.txt +15 -24
  13. package/src/llama.cpp/common/arg.cpp +172 -110
  14. package/src/llama.cpp/common/chat-parser.cpp +385 -0
  15. package/src/llama.cpp/common/chat-parser.h +120 -0
  16. package/src/llama.cpp/common/chat.cpp +726 -596
  17. package/src/llama.cpp/common/chat.h +74 -8
  18. package/src/llama.cpp/common/common.cpp +56 -38
  19. package/src/llama.cpp/common/common.h +9 -3
  20. package/src/llama.cpp/common/json-partial.cpp +256 -0
  21. package/src/llama.cpp/common/json-partial.h +38 -0
  22. package/src/llama.cpp/common/json-schema-to-grammar.cpp +2 -1
  23. package/src/llama.cpp/common/json-schema-to-grammar.h +4 -4
  24. package/src/llama.cpp/common/sampling.cpp +7 -8
  25. package/src/llama.cpp/common/speculative.cpp +6 -4
  26. package/src/llama.cpp/ggml/CMakeLists.txt +48 -3
  27. package/src/llama.cpp/ggml/include/ggml.h +22 -3
  28. package/src/llama.cpp/ggml/src/CMakeLists.txt +81 -22
  29. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +131 -49
  30. package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +1 -1
  31. package/src/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp +1 -1
  32. package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +94 -0
  33. package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +4113 -0
  34. package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +2162 -0
  35. package/src/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +2638 -0
  36. package/src/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/cpu-feats.cpp +82 -0
  37. package/src/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +2731 -0
  38. package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +2068 -0
  39. package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp +396 -0
  40. package/src/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +1299 -0
  41. package/src/llama.cpp/ggml/src/ggml-cpu/arch/wasm/quants.c +1480 -0
  42. package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +4310 -0
  43. package/src/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-aarch64.cpp → arch/x86/repack.cpp} +59 -3206
  44. package/src/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +184 -0
  45. package/src/llama.cpp/ggml/src/ggml-cpu/common.h +1 -1
  46. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +12 -13
  47. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +64 -88
  48. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +8 -8
  49. package/src/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-hbm.cpp → hbm.cpp} +1 -1
  50. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +1 -1
  51. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +56 -7
  52. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.h +5 -0
  53. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +282 -100
  54. package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +1 -0
  55. package/src/llama.cpp/ggml/src/ggml-cpu/quants.c +1157 -0
  56. package/src/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-quants.h → quants.h} +26 -0
  57. package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +1570 -0
  58. package/src/llama.cpp/ggml/src/ggml-cpu/repack.h +98 -0
  59. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +119 -5
  60. package/src/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-traits.cpp → traits.cpp} +1 -1
  61. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +85 -16
  62. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +204 -49
  63. package/src/llama.cpp/include/llama.h +145 -40
  64. package/src/llama.cpp/src/CMakeLists.txt +5 -1
  65. package/src/llama.cpp/src/llama-arch.cpp +99 -3
  66. package/src/llama.cpp/src/llama-arch.h +10 -1
  67. package/src/llama.cpp/src/llama-batch.cpp +728 -272
  68. package/src/llama.cpp/src/llama-batch.h +112 -54
  69. package/src/llama.cpp/src/llama-chat.cpp +19 -2
  70. package/src/llama.cpp/src/llama-chat.h +1 -0
  71. package/src/llama.cpp/src/llama-context.cpp +525 -339
  72. package/src/llama.cpp/src/llama-context.h +38 -17
  73. package/src/llama.cpp/src/llama-cparams.cpp +4 -0
  74. package/src/llama.cpp/src/llama-cparams.h +2 -0
  75. package/src/llama.cpp/src/llama-grammar.cpp +12 -2
  76. package/src/llama.cpp/src/llama-graph.cpp +413 -353
  77. package/src/llama.cpp/src/llama-graph.h +112 -56
  78. package/src/llama.cpp/src/llama-hparams.cpp +10 -2
  79. package/src/llama.cpp/src/llama-hparams.h +13 -2
  80. package/src/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +279 -0
  81. package/src/llama.cpp/src/llama-kv-cache-unified-iswa.h +128 -0
  82. package/src/llama.cpp/src/llama-kv-cache-unified.cpp +1815 -0
  83. package/src/llama.cpp/src/llama-kv-cache-unified.h +303 -0
  84. package/src/llama.cpp/src/llama-kv-cells.h +415 -0
  85. package/src/llama.cpp/src/llama-memory-hybrid.cpp +246 -0
  86. package/src/llama.cpp/src/llama-memory-hybrid.h +138 -0
  87. package/src/llama.cpp/src/llama-memory-recurrent.cpp +1112 -0
  88. package/src/llama.cpp/src/llama-memory-recurrent.h +183 -0
  89. package/src/llama.cpp/src/llama-memory.cpp +41 -0
  90. package/src/llama.cpp/src/llama-memory.h +86 -5
  91. package/src/llama.cpp/src/llama-mmap.cpp +1 -1
  92. package/src/llama.cpp/src/llama-model-loader.cpp +42 -17
  93. package/src/llama.cpp/src/llama-model-saver.cpp +1 -0
  94. package/src/llama.cpp/src/llama-model.cpp +1137 -528
  95. package/src/llama.cpp/src/llama-model.h +4 -0
  96. package/src/llama.cpp/src/llama-quant.cpp +2 -1
  97. package/src/llama.cpp/src/llama-sampling.cpp +2 -2
  98. package/src/llama.cpp/src/llama-vocab.cpp +69 -32
  99. package/src/llama.cpp/src/llama-vocab.h +1 -0
  100. package/src/llama.cpp/src/llama.cpp +11 -7
  101. package/src/llama.cpp/src/unicode.cpp +5 -0
  102. package/src/tts_utils.h +1 -1
  103. package/src/llama.cpp/common/json.hpp +0 -24766
  104. package/src/llama.cpp/common/minja/chat-template.hpp +0 -541
  105. package/src/llama.cpp/common/minja/minja.hpp +0 -2974
  106. package/src/llama.cpp/common/stb_image.h +0 -7988
  107. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +0 -8
  108. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +0 -13326
  109. package/src/llama.cpp/src/llama-kv-cache.cpp +0 -2827
  110. package/src/llama.cpp/src/llama-kv-cache.h +0 -515
  111. /package/src/llama.cpp/ggml/src/ggml-cpu/{cpu-feats-x86.cpp → arch/x86/cpu-feats.cpp} +0 -0
  112. /package/src/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-hbm.h → hbm.h} +0 -0
  113. /package/src/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-traits.h → traits.h} +0 -0
package/src/llama.cpp/common/arg.cpp
@@ -1,10 +1,11 @@
-#include "gguf.h" // for reading GGUF splits
 #include "arg.h"
 
+#include "chat.h"
 #include "common.h"
+#include "gguf.h" // for reading GGUF splits
+#include "json-schema-to-grammar.h"
 #include "log.h"
 #include "sampling.h"
-#include "chat.h"
 
 // fix problem with std::min and std::max
 #if defined(_WIN32)
@@ -15,6 +16,9 @@
 #include <windows.h>
 #endif
 
+#define JSON_ASSERT GGML_ASSERT
+#include <nlohmann/json.hpp>
+
 #include <algorithm>
 #include <climits>
 #include <cstdarg>
@@ -34,8 +38,6 @@
 #include <future>
 #endif
 
-#include "json-schema-to-grammar.h"
-
 using json = nlohmann::ordered_json;
 
 std::initializer_list<enum llama_example> mmproj_examples = {
@@ -242,33 +244,7 @@ static bool curl_perform_with_retry(const std::string & url, CURL * curl, int ma
 }
 
 // download one single file from remote URL to local path
-static bool common_download_file_single(const std::string & url, const std::string & path, const std::string & bearer_token) {
-    // Initialize libcurl
-    curl_ptr curl(curl_easy_init(), &curl_easy_cleanup);
-    curl_slist_ptr http_headers;
-    if (!curl) {
-        LOG_ERR("%s: error initializing libcurl\n", __func__);
-        return false;
-    }
-
-    // Set the URL, allow to follow http redirection
-    curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str());
-    curl_easy_setopt(curl.get(), CURLOPT_FOLLOWLOCATION, 1L);
-
-    http_headers.ptr = curl_slist_append(http_headers.ptr, "User-Agent: llama-cpp");
-    // Check if hf-token or bearer-token was specified
-    if (!bearer_token.empty()) {
-        std::string auth_header = "Authorization: Bearer " + bearer_token;
-        http_headers.ptr = curl_slist_append(http_headers.ptr, auth_header.c_str());
-    }
-    curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.ptr);
-
-#if defined(_WIN32)
-    // CURLSSLOPT_NATIVE_CA tells libcurl to use standard certificate store of
-    // operating system. Currently implemented under MS-Windows.
-    curl_easy_setopt(curl.get(), CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA);
-#endif
-
+static bool common_download_file_single(const std::string & url, const std::string & path, const std::string & bearer_token, bool offline) {
     // Check if the file already exists locally
     auto file_exists = std::filesystem::exists(path);
 
@@ -279,6 +255,10 @@ static bool common_download_file_single(const std::string & url, const std::stri
     std::string last_modified;
 
     if (file_exists) {
+        if (offline) {
+            LOG_INF("%s: using cached file (offline mode): %s\n", __func__, path.c_str());
+            return true; // skip verification/downloading
+        }
         // Try and read the JSON metadata file (note: stream autoclosed upon exiting this block).
         std::ifstream metadata_in(metadata_path);
         if (metadata_in.good()) {
@@ -297,6 +277,10 @@ static bool common_download_file_single(const std::string & url, const std::stri
         }
         // if we cannot open the metadata file, we assume that the downloaded file is not valid (etag and last-modified are left empty, so we will download it again)
     } else {
+        if (offline) {
+            LOG_ERR("%s: required file is not available in cache (offline mode): %s\n", __func__, path.c_str());
+            return false;
+        }
         LOG_INF("%s: no previous model file found %s\n", __func__, path.c_str());
     }
 
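Note: the two hunks above add the offline fast-paths to common_download_file_single: a file already in the cache is trusted as-is, and a missing file fails immediately instead of reaching for the network. A minimal standalone sketch of that decision (resolve_local_file is a hypothetical helper, not part of the package):

#include <filesystem>
#include <iostream>
#include <string>

// Condensed form of the offline checks added above: in offline mode a
// cached file short-circuits to success and a missing file to failure.
static bool resolve_local_file(const std::string & path, bool offline) {
    const bool file_exists = std::filesystem::exists(path);
    if (offline) {
        if (file_exists) {
            std::cout << "using cached file (offline mode): " << path << "\n";
            return true; // skip ETag verification and downloading entirely
        }
        std::cerr << "required file is not available in cache (offline mode): " << path << "\n";
        return false;
    }
    return true; // online: fall through to the usual HEAD-request/download logic
}

int main() {
    return resolve_local_file("model.gguf", /*offline=*/true) ? 0 : 1;
}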
@@ -310,50 +294,73 @@ static bool common_download_file_single(const std::string & url, const std::stri
     bool head_request_ok = false;
     bool should_download = !file_exists; // by default, we should download if the file does not exist
 
-    // get ETag to see if the remote file has changed
-    {
-        typedef size_t(*CURLOPT_HEADERFUNCTION_PTR)(char *, size_t, size_t, void *);
-        auto header_callback = [](char * buffer, size_t /*size*/, size_t n_items, void * userdata) -> size_t {
-            common_load_model_from_url_headers * headers = (common_load_model_from_url_headers *) userdata;
+    // Initialize libcurl
+    curl_ptr curl(curl_easy_init(), &curl_easy_cleanup);
+    curl_slist_ptr http_headers;
+    if (!curl) {
+        LOG_ERR("%s: error initializing libcurl\n", __func__);
+        return false;
+    }
 
-            static std::regex header_regex("([^:]+): (.*)\r\n");
-            static std::regex etag_regex("ETag", std::regex_constants::icase);
-            static std::regex last_modified_regex("Last-Modified", std::regex_constants::icase);
+    // Set the URL, allow to follow http redirection
+    curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str());
+    curl_easy_setopt(curl.get(), CURLOPT_FOLLOWLOCATION, 1L);
 
-            std::string header(buffer, n_items);
-            std::smatch match;
-            if (std::regex_match(header, match, header_regex)) {
-                const std::string & key = match[1];
-                const std::string & value = match[2];
-                if (std::regex_match(key, match, etag_regex)) {
-                    headers->etag = value;
-                } else if (std::regex_match(key, match, last_modified_regex)) {
-                    headers->last_modified = value;
-                }
-            }
-            return n_items;
-        };
+    http_headers.ptr = curl_slist_append(http_headers.ptr, "User-Agent: llama-cpp");
+    // Check if hf-token or bearer-token was specified
+    if (!bearer_token.empty()) {
+        std::string auth_header = "Authorization: Bearer " + bearer_token;
+        http_headers.ptr = curl_slist_append(http_headers.ptr, auth_header.c_str());
+    }
+    curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.ptr);
 
-        curl_easy_setopt(curl.get(), CURLOPT_NOBODY, 1L); // will trigger the HEAD verb
-        curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 1L); // hide head request progress
-        curl_easy_setopt(curl.get(), CURLOPT_HEADERFUNCTION, static_cast<CURLOPT_HEADERFUNCTION_PTR>(header_callback));
-        curl_easy_setopt(curl.get(), CURLOPT_HEADERDATA, &headers);
+#if defined(_WIN32)
+    // CURLSSLOPT_NATIVE_CA tells libcurl to use standard certificate store of
+    // operating system. Currently implemented under MS-Windows.
+    curl_easy_setopt(curl.get(), CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA);
+#endif
 
-        // we only allow retrying once for HEAD requests
-        // this is for the use case of using running offline (no internet), retrying can be annoying
-        bool was_perform_successful = curl_perform_with_retry(url, curl.get(), 1, 0, "HEAD");
-        if (!was_perform_successful) {
-            head_request_ok = false;
-        }
+    typedef size_t(*CURLOPT_HEADERFUNCTION_PTR)(char *, size_t, size_t, void *);
+    auto header_callback = [](char * buffer, size_t /*size*/, size_t n_items, void * userdata) -> size_t {
+        common_load_model_from_url_headers * headers = (common_load_model_from_url_headers *) userdata;
 
-        long http_code = 0;
-        curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &http_code);
-        if (http_code == 200) {
-            head_request_ok = true;
-        } else {
-            LOG_WRN("%s: HEAD invalid http status code received: %ld\n", __func__, http_code);
-            head_request_ok = false;
+        static std::regex header_regex("([^:]+): (.*)\r\n");
+        static std::regex etag_regex("ETag", std::regex_constants::icase);
+        static std::regex last_modified_regex("Last-Modified", std::regex_constants::icase);
+
+        std::string header(buffer, n_items);
+        std::smatch match;
+        if (std::regex_match(header, match, header_regex)) {
+            const std::string & key = match[1];
+            const std::string & value = match[2];
+            if (std::regex_match(key, match, etag_regex)) {
+                headers->etag = value;
+            } else if (std::regex_match(key, match, last_modified_regex)) {
+                headers->last_modified = value;
+            }
         }
+        return n_items;
+    };
+
+    curl_easy_setopt(curl.get(), CURLOPT_NOBODY, 1L); // will trigger the HEAD verb
+    curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 1L); // hide head request progress
+    curl_easy_setopt(curl.get(), CURLOPT_HEADERFUNCTION, static_cast<CURLOPT_HEADERFUNCTION_PTR>(header_callback));
+    curl_easy_setopt(curl.get(), CURLOPT_HEADERDATA, &headers);
+
+    // we only allow retrying once for HEAD requests
+    // this is for the use case of using running offline (no internet), retrying can be annoying
+    bool was_perform_successful = curl_perform_with_retry(url, curl.get(), 1, 0, "HEAD");
+    if (!was_perform_successful) {
+        head_request_ok = false;
+    }
+
+    long http_code = 0;
+    curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &http_code);
+    if (http_code == 200) {
+        head_request_ok = true;
+    } else {
+        LOG_WRN("%s: HEAD invalid http status code received: %ld\n", __func__, http_code);
+        head_request_ok = false;
     }
 
     // if head_request_ok is false, we don't have the etag or last-modified headers
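Note: the relocated HEAD-request block keeps its parsing strategy: each raw response header is matched against the regex ([^:]+): (.*)\r\n, and the captured key is compared case-insensitively against ETag and Last-Modified. A self-contained sketch of just that parsing, with made-up header values:

#include <iostream>
#include <regex>
#include <string>

// Standalone sketch of the header_callback logic above: split each raw
// header line into key/value, then match the key case-insensitively.
int main() {
    static std::regex header_regex("([^:]+): (.*)\r\n");
    static std::regex etag_regex("ETag", std::regex_constants::icase);
    static std::regex last_modified_regex("Last-Modified", std::regex_constants::icase);

    std::string etag, last_modified;
    const std::string raw_headers[] = {
        "etag: \"abc123\"\r\n",                              // matched despite lowercase key
        "Last-Modified: Tue, 03 Jun 2025 10:00:00 GMT\r\n",
        "Content-Length: 42\r\n",                            // ignored
    };
    for (const std::string & header : raw_headers) {
        std::smatch match;
        if (std::regex_match(header, match, header_regex)) {
            const std::string key   = match[1];
            const std::string value = match[2];
            if (std::regex_match(key, match, etag_regex)) {
                etag = value;
            } else if (std::regex_match(key, match, last_modified_regex)) {
                last_modified = value;
            }
        }
    }
    std::cout << "etag: " << etag << "\nlast-modified: " << last_modified << "\n";
}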
@@ -460,12 +467,12 @@ static bool common_download_file_single(const std::string & url, const std::stri
 
 // download multiple files from remote URLs to local paths
 // the input is a vector of pairs <url, path>
-static bool common_download_file_multiple(const std::vector<std::pair<std::string, std::string>> & urls, const std::string & bearer_token) {
+static bool common_download_file_multiple(const std::vector<std::pair<std::string, std::string>> & urls, const std::string & bearer_token, bool offline) {
     // Prepare download in parallel
     std::vector<std::future<bool>> futures_download;
     for (auto const & item : urls) {
-        futures_download.push_back(std::async(std::launch::async, [bearer_token](const std::pair<std::string, std::string> & it) -> bool {
-            return common_download_file_single(it.first, it.second, bearer_token);
+        futures_download.push_back(std::async(std::launch::async, [bearer_token, offline](const std::pair<std::string, std::string> & it) -> bool {
+            return common_download_file_single(it.first, it.second, bearer_token, offline);
         }, item));
     }
 
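Note: common_download_file_multiple fans the per-file downloads out with std::async and joins the futures further down (outside this hunk); the new offline flag simply rides along in the lambda capture. A reduced, runnable sketch of the fan-out/join pattern, with download_one standing in for common_download_file_single:

#include <future>
#include <iostream>
#include <string>
#include <utility>
#include <vector>

// Stand-in for common_download_file_single in this sketch.
static bool download_one(const std::pair<std::string, std::string> & it, bool offline) {
    std::cout << (offline ? "[offline] " : "") << it.first << " -> " << it.second << "\n";
    return true;
}

int main() {
    const std::vector<std::pair<std::string, std::string>> urls = {
        {"https://example.com/a.gguf", "/tmp/a.gguf"},
        {"https://example.com/b.gguf", "/tmp/b.gguf"},
    };
    const bool offline = false;
    // Fan out: one async task per <url, path> pair.
    std::vector<std::future<bool>> futures_download;
    for (auto const & item : urls) {
        futures_download.push_back(std::async(std::launch::async,
            [offline](const std::pair<std::string, std::string> & it) -> bool {
                return download_one(it, offline);
            }, item));
    }
    // Join: any failed download fails the whole batch.
    bool all_ok = true;
    for (auto & f : futures_download) {
        all_ok = f.get() && all_ok;
    }
    return all_ok ? 0 : 1;
}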
@@ -481,14 +488,15 @@ static bool common_download_file_multiple(const std::vector<std::pair<std::strin
 
 static bool common_download_model(
         const common_params_model & model,
-        const std::string & bearer_token) {
+        const std::string & bearer_token,
+        bool offline) {
     // Basic validation of the model.url
     if (model.url.empty()) {
         LOG_ERR("%s: invalid model url\n", __func__);
         return false;
     }
 
-    if (!common_download_file_single(model.url, model.path, bearer_token)) {
+    if (!common_download_file_single(model.url, model.path, bearer_token, offline)) {
         return false;
     }
 
@@ -547,7 +555,7 @@ static bool common_download_model(
         }
 
         // Download in parallel
-        common_download_file_multiple(urls, bearer_token);
+        common_download_file_multiple(urls, bearer_token, offline);
     }
 
     return true;
@@ -608,7 +616,7 @@ std::pair<long, std::vector<char>> common_remote_get_content(const std::string &
  *
  * Note: we use the Ollama-compatible HF API, but not using the blobId. Instead, we use the special "ggufFile" field which returns the value for "hf_file". This is done to be backward-compatible with existing cache files.
  */
-static struct common_hf_file_res common_get_hf_file(const std::string & hf_repo_with_tag, const std::string & bearer_token) {
+static struct common_hf_file_res common_get_hf_file(const std::string & hf_repo_with_tag, const std::string & bearer_token, bool offline) {
     auto parts = string_split<std::string>(hf_repo_with_tag, ':');
     std::string tag = parts.size() > 1 ? parts.back() : "latest";
     std::string hf_repo = parts[0];
@@ -638,20 +646,25 @@ static struct common_hf_file_res common_get_hf_file(const std::string & hf_repo_
     long res_code = 0;
     std::string res_str;
     bool use_cache = false;
-    try {
-        auto res = common_remote_get_content(url, params);
-        res_code = res.first;
-        res_str = std::string(res.second.data(), res.second.size());
-    } catch (const std::exception & e) {
-        LOG_WRN("error: failed to get manifest: %s\n", e.what());
-        LOG_WRN("try reading from cache\n");
-        // try to read from cache
+    if (!offline) {
         try {
+            auto res = common_remote_get_content(url, params);
+            res_code = res.first;
+            res_str = std::string(res.second.data(), res.second.size());
+        } catch (const std::exception & e) {
+            LOG_WRN("error: failed to get manifest at %s: %s\n", url.c_str(), e.what());
+        }
+    }
+    if (res_code == 0) {
+        if (std::filesystem::exists(cached_response_path)) {
+            LOG_WRN("trying to read manifest from cache: %s\n", cached_response_path.c_str());
             res_str = read_file(cached_response_path);
             res_code = 200;
             use_cache = true;
-        } catch (const std::exception & e) {
-            throw std::runtime_error("error: failed to get manifest (check your internet connection)");
+        } else {
+            throw std::runtime_error(
+                offline ? "error: failed to get manifest (offline mode)"
+                        : "error: failed to get manifest (check your internet connection)");
         }
     }
     std::string ggufFile;
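Note: the rewritten manifest lookup separates "try the network" from "fall back to the cache": the remote fetch is only attempted when not offline, and res_code staying 0 afterwards means the cached response is the last resort. A condensed, self-contained sketch of that control flow (fetch_remote is a hypothetical stand-in for common_remote_get_content):

#include <filesystem>
#include <fstream>
#include <iostream>
#include <sstream>
#include <stdexcept>
#include <string>

// Hypothetical stand-in for common_remote_get_content(); throws on failure.
static std::string fetch_remote(const std::string &) {
    throw std::runtime_error("network unavailable in this sketch");
}

// Condensed network-then-cache flow mirroring the hunk above.
static std::string get_manifest(const std::string & url, const std::string & cached_response_path, bool offline) {
    std::string res_str;
    long res_code = 0;
    if (!offline) {
        try {
            res_str = fetch_remote(url);
            res_code = 200;
        } catch (const std::exception &) {
            // swallow and fall through to the cache below
        }
    }
    if (res_code == 0) {
        if (std::filesystem::exists(cached_response_path)) {
            std::ifstream in(cached_response_path);
            std::ostringstream ss;
            ss << in.rdbuf();
            return ss.str(); // cached manifest
        }
        throw std::runtime_error(offline ? "failed to get manifest (offline mode)"
                                         : "failed to get manifest (check your internet connection)");
    }
    return res_str;
}

int main() {
    try {
        get_manifest("https://example.com/manifest", "/tmp/manifest.cache", /*offline=*/true);
    } catch (const std::exception & e) {
        std::cerr << e.what() << "\n";
    }
}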
@@ -698,24 +711,25 @@ bool common_has_curl() {
     return false;
 }
 
-static bool common_download_file_single(const std::string &, const std::string &, const std::string &) {
+static bool common_download_file_single(const std::string &, const std::string &, const std::string &, bool) {
     LOG_ERR("error: built without CURL, cannot download model from internet\n");
     return false;
 }
 
-static bool common_download_file_multiple(const std::vector<std::pair<std::string, std::string>> &, const std::string &) {
+static bool common_download_file_multiple(const std::vector<std::pair<std::string, std::string>> &, const std::string &, bool) {
     LOG_ERR("error: built without CURL, cannot download model from the internet\n");
     return false;
 }
 
 static bool common_download_model(
         const common_params_model &,
-        const std::string &) {
+        const std::string &,
+        bool) {
     LOG_ERR("error: built without CURL, cannot download model from the internet\n");
     return false;
 }
 
-static struct common_hf_file_res common_get_hf_file(const std::string &, const std::string &) {
+static struct common_hf_file_res common_get_hf_file(const std::string &, const std::string &, bool) {
     LOG_ERR("error: built without CURL, cannot download model from the internet\n");
     return {};
 }
@@ -742,7 +756,8 @@ struct handle_model_result {
 static handle_model_result common_params_handle_model(
         struct common_params_model & model,
         const std::string & bearer_token,
-        const std::string & model_path_default) {
+        const std::string & model_path_default,
+        bool offline) {
     handle_model_result result;
     // handle pre-fill default model path and url based on hf_repo and hf_file
     {
@@ -750,7 +765,7 @@ static handle_model_result common_params_handle_model(
             // short-hand to avoid specifying --hf-file -> default it to --model
             if (model.hf_file.empty()) {
                 if (model.path.empty()) {
-                    auto auto_detected = common_get_hf_file(model.hf_repo, bearer_token);
+                    auto auto_detected = common_get_hf_file(model.hf_repo, bearer_token, offline);
                     if (auto_detected.repo.empty() || auto_detected.ggufFile.empty()) {
                         exit(1); // built without CURL, error message already printed
                     }
@@ -791,7 +806,7 @@
 
     // then, download it if needed
     if (!model.url.empty()) {
-        bool ok = common_download_model(model, bearer_token);
+        bool ok = common_download_model(model, bearer_token, offline);
         if (!ok) {
             LOG_ERR("error: failed to download model from %s\n", model.url.c_str());
             exit(1);
@@ -934,7 +949,7 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
 
     // handle model and download
     {
-        auto res = common_params_handle_model(params.model, params.hf_token, DEFAULT_MODEL_PATH);
+        auto res = common_params_handle_model(params.model, params.hf_token, DEFAULT_MODEL_PATH, params.offline);
         if (params.no_mmproj) {
            params.mmproj = {};
        } else if (res.found_mmproj && params.mmproj.path.empty() && params.mmproj.url.empty()) {
@@ -944,12 +959,12 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
         // only download mmproj if the current example is using it
         for (auto & ex : mmproj_examples) {
             if (ctx_arg.ex == ex) {
-                common_params_handle_model(params.mmproj, params.hf_token, "");
+                common_params_handle_model(params.mmproj, params.hf_token, "", params.offline);
                 break;
             }
         }
-        common_params_handle_model(params.speculative.model, params.hf_token, "");
-        common_params_handle_model(params.vocoder.model, params.hf_token, "");
+        common_params_handle_model(params.speculative.model, params.hf_token, "", params.offline);
+        common_params_handle_model(params.vocoder.model, params.hf_token, "", params.offline);
     }
 
     if (params.escape) {
@@ -973,10 +988,6 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
         params.tensor_buft_overrides.push_back({nullptr, nullptr});
     }
 
-    if (params.reranking && params.embedding) {
-        throw std::invalid_argument("error: either --embedding or --reranking can be specified, but not both");
-    }
-
     if (!params.chat_template.empty() && !common_chat_verify_template(params.chat_template, params.use_jinja)) {
         throw std::runtime_error(string_format(
             "error: the supplied chat template is not supported: %s%s\n",
@@ -1333,9 +1344,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ));
     add_opt(common_arg(
         {"--prio"}, "N",
-        string_format("set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.cpuparams.priority),
+        string_format("set process/thread priority : low(-1), normal(0), medium(1), high(2), realtime(3) (default: %d)\n", params.cpuparams.priority),
         [](common_params & params, int prio) {
-            if (prio < 0 || prio > 3) {
+            if (prio < GGML_SCHED_PRIO_LOW || prio > GGML_SCHED_PRIO_REALTIME) {
                 throw std::invalid_argument("invalid value");
             }
             params.cpuparams.priority = (enum ggml_sched_priority) prio;
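Note: the --prio bounds check now compares against the ggml_sched_priority enum rather than hard-coded 0..3, which is what lets the new low priority (-1) through. A sketch under the assumption that this ggml revision defines the enum with a LOW value of -1 ahead of the existing entries (sched_prio_sketch and parse_prio are illustrative names):

#include <stdexcept>

// Mirror of ggml's scheduler priorities, assuming this ggml revision adds
// a low priority at -1 ahead of the existing values.
enum sched_prio_sketch {
    SCHED_PRIO_LOW = -1,
    SCHED_PRIO_NORMAL,
    SCHED_PRIO_MEDIUM,
    SCHED_PRIO_HIGH,
    SCHED_PRIO_REALTIME,
};

// Same validation shape as the --prio handler above: comparing against the
// enum endpoints instead of hard-coded 0..3 makes -1 (low) a legal value.
static sched_prio_sketch parse_prio(int prio) {
    if (prio < SCHED_PRIO_LOW || prio > SCHED_PRIO_REALTIME) {
        throw std::invalid_argument("invalid value");
    }
    return (sched_prio_sketch) prio;
}

int main() {
    return parse_prio(-1) == SCHED_PRIO_LOW ? 0 : 1; // -1 is now accepted
}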
@@ -2695,6 +2706,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.embd_sep = value;
         }
     ).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
+    add_opt(common_arg(
+        {"--cls-separator"}, "STRING",
+        "separator of classification sequences (default \\t) for example \"<#seq#>\"",
+        [](common_params & params, const std::string & value) {
+            params.cls_sep = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
     add_opt(common_arg(
         {"--host"}, "HOST",
         string_format("ip address to listen, or bind to an UNIX socket if the address ends with .sock (default: %s)", params.hostname.c_str()),
@@ -2732,9 +2750,10 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_EMBEDDINGS"));
     add_opt(common_arg(
         {"--reranking", "--rerank"},
-        string_format("enable reranking endpoint on server (default: %s)", params.reranking ? "enabled" : "disabled"),
+        string_format("enable reranking endpoint on server (default: %s)", "disabled"),
         [](common_params & params) {
-            params.reranking = true;
+            params.embedding = true;
+            params.pooling_type = LLAMA_POOLING_TYPE_RANK;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_RERANKING"));
     add_opt(common_arg(
@@ -2848,15 +2867,25 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_JINJA"));
     add_opt(common_arg(
         {"--reasoning-format"}, "FORMAT",
-        "reasoning format (default: deepseek; allowed values: deepseek, none)\n"
-        "controls whether thought tags are extracted from the response, and in which format they're returned. 'none' leaves thoughts unparsed in `message.content`, 'deepseek' puts them in `message.reasoning_content` (for DeepSeek R1 & Command R7B only).\n"
-        "only supported for non-streamed responses",
+        "controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:\n"
+        "- none: leaves thoughts unparsed in `message.content`\n"
+        "- deepseek: puts thoughts in `message.reasoning_content` (except in streaming mode, which behaves as `none`)\n"
+        "(default: deepseek)",
         [](common_params & params, const std::string & value) {
             /**/ if (value == "deepseek") { params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK; }
+            else if (value == "deepseek-legacy") { params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY; }
             else if (value == "none") { params.reasoning_format = COMMON_REASONING_FORMAT_NONE; }
-            else { std::invalid_argument("invalid value"); }
+            else { throw std::invalid_argument("invalid value"); }
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_THINK"));
+    add_opt(common_arg(
+        {"--reasoning-budget"}, "N",
+        "controls the amount of thinking allowed; currently only one of: -1 for unrestricted thinking budget, or 0 to disable thinking (default: -1)",
+        [](common_params & params, int value) {
+            if (value != 0 && value != -1) { throw std::invalid_argument("invalid value"); }
+            params.reasoning_budget = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_THINK_BUDGET"));
     add_opt(common_arg(
         {"--chat-template"}, "JINJA_TEMPLATE",
         string_format(
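Note the bug fixed in the else branch above and again in the next hunk: the old code wrote std::invalid_argument("invalid value") without throw, which constructs the exception object and immediately destroys it, so invalid values were silently accepted. A two-line illustration of the difference:

#include <stdexcept>

int main() {
    std::invalid_argument("invalid value");       // constructed, discarded: no effect (the old bug)
    throw std::invalid_argument("invalid value"); // actually raised: what the fix adds
}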
@@ -2955,7 +2984,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, const std::string & value) {
             /**/ if (value == "jsonl") { params.batched_bench_output_jsonl = true; }
             else if (value == "md") { params.batched_bench_output_jsonl = false; }
-            else { std::invalid_argument("invalid value"); }
+            else { throw std::invalid_argument("invalid value"); }
         }
     ).set_examples({LLAMA_EXAMPLE_BENCH}));
     add_opt(common_arg(
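The next hunk introduces the --offline switch that feeds the offline flag threaded through the download helpers above; via set_env it can also be enabled through the LLAMA_OFFLINE environment variable. A sketch of the assumed flag-or-env semantics (params_sketch and apply_flag_or_env are illustrative, not package API):

#include <cstdlib>
#include <functional>

struct params_sketch { bool offline = false; };

// Assumed semantics of common_arg::set_env: if the named environment variable
// is present, the option's handler runs just as if the flag had been passed.
static void apply_flag_or_env(params_sketch & params, bool flag_seen, const char * env_name,
                              const std::function<void(params_sketch &)> & handler) {
    if (flag_seen || std::getenv(env_name) != nullptr) {
        handler(params);
    }
}

int main() {
    params_sketch params;
    apply_flag_or_env(params, /*flag_seen=*/false, "LLAMA_OFFLINE",
                      [](params_sketch & p) { p.offline = true; });
    return params.offline ? 0 : 1;
}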
@@ -2987,6 +3016,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             common_log_set_verbosity_thold(INT_MAX);
         }
     ));
+    add_opt(common_arg(
+        {"--offline"},
+        "Offline mode: forces use of cache, prevents network access",
+        [](common_params & params) {
+            params.offline = true;
+        }
+    ).set_env("LLAMA_OFFLINE"));
     add_opt(common_arg(
         {"-lv", "--verbosity", "--log-verbosity"}, "N",
         "Set the verbosity threshold. Messages with a higher verbosity will be ignored.",
@@ -3181,6 +3217,32 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.speculative.model.path = value;
         }
     ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODEL_DRAFT"));
+    add_opt(common_arg(
+        {"-ctkd", "--cache-type-k-draft"}, "TYPE",
+        string_format(
+            "KV cache data type for K for the draft model\n"
+            "allowed values: %s\n"
+            "(default: %s)",
+            get_all_kv_cache_types().c_str(),
+            ggml_type_name(params.speculative.cache_type_k)
+        ),
+        [](common_params & params, const std::string & value) {
+            params.speculative.cache_type_k = kv_cache_type_from_str(value);
+        }
+    ).set_env("LLAMA_ARG_CACHE_TYPE_K_DRAFT"));
+    add_opt(common_arg(
+        {"-ctvd", "--cache-type-v-draft"}, "TYPE",
+        string_format(
+            "KV cache data type for V for the draft model\n"
+            "allowed values: %s\n"
+            "(default: %s)",
+            get_all_kv_cache_types().c_str(),
+            ggml_type_name(params.speculative.cache_type_v)
+        ),
+        [](common_params & params, const std::string & value) {
+            params.speculative.cache_type_v = kv_cache_type_from_str(value);
+        }
+    ).set_env("LLAMA_ARG_CACHE_TYPE_V_DRAFT"));
 
     add_opt(common_arg(
         {"-mv", "--model-vocoder"}, "FNAME",