@fugood/llama.node 1.2.3 → 1.2.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59)
  1. package/package.json +14 -14
  2. package/scripts/llama.cpp.patch +33 -11
  3. package/src/llama.cpp/CMakeLists.txt +1 -0
  4. package/src/llama.cpp/common/CMakeLists.txt +46 -2
  5. package/src/llama.cpp/common/arg.cpp +484 -204
  6. package/src/llama.cpp/common/arg.h +0 -1
  7. package/src/llama.cpp/common/chat-parser.cpp +156 -15
  8. package/src/llama.cpp/common/chat-parser.h +3 -0
  9. package/src/llama.cpp/common/chat.cpp +217 -6
  10. package/src/llama.cpp/common/chat.h +5 -3
  11. package/src/llama.cpp/common/common.cpp +22 -6
  12. package/src/llama.cpp/common/common.h +6 -4
  13. package/src/llama.cpp/common/http.h +73 -0
  14. package/src/llama.cpp/common/json-partial.cpp +51 -0
  15. package/src/llama.cpp/ggml/CMakeLists.txt +7 -6
  16. package/src/llama.cpp/ggml/include/ggml-backend.h +2 -0
  17. package/src/llama.cpp/ggml/include/ggml-rpc.h +8 -9
  18. package/src/llama.cpp/ggml/include/ggml.h +22 -0
  19. package/src/llama.cpp/ggml/src/CMakeLists.txt +3 -0
  20. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +12 -2
  21. package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +1 -0
  22. package/src/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +12 -12
  23. package/src/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +100 -3
  24. package/src/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +0 -1
  25. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1 -0
  26. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +10 -0
  27. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +209 -96
  28. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +32 -44
  29. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +107 -83
  30. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +17 -17
  31. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +8 -8
  32. package/src/llama.cpp/ggml/src/ggml-cpu/spacemit/ime.cpp +1024 -0
  33. package/src/llama.cpp/ggml/src/ggml-cpu/spacemit/ime.h +13 -0
  34. package/src/llama.cpp/ggml/src/ggml-cpu/spacemit/ime1_kernels.cpp +3196 -0
  35. package/src/llama.cpp/ggml/src/ggml-cpu/spacemit/ime_kernels.h +26 -0
  36. package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.cpp +103 -0
  37. package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.h +1 -0
  38. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +66 -0
  39. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +11 -9
  40. package/src/llama.cpp/include/llama.h +8 -0
  41. package/src/llama.cpp/src/llama-arch.cpp +93 -0
  42. package/src/llama.cpp/src/llama-arch.h +22 -0
  43. package/src/llama.cpp/src/llama-chat.cpp +1 -1
  44. package/src/llama.cpp/src/llama-context.cpp +6 -0
  45. package/src/llama.cpp/src/llama-graph.cpp +57 -22
  46. package/src/llama.cpp/src/llama-graph.h +10 -1
  47. package/src/llama.cpp/src/llama-hparams.cpp +5 -1
  48. package/src/llama.cpp/src/llama-hparams.h +17 -2
  49. package/src/llama.cpp/src/llama-kv-cache-iswa.cpp +2 -2
  50. package/src/llama.cpp/src/llama-kv-cache.cpp +2 -5
  51. package/src/llama.cpp/src/llama-memory-hybrid.cpp +11 -9
  52. package/src/llama.cpp/src/llama-memory-recurrent.cpp +11 -3
  53. package/src/llama.cpp/src/llama-model-loader.cpp +2 -0
  54. package/src/llama.cpp/src/llama-model.cpp +572 -45
  55. package/src/llama.cpp/src/llama-model.h +18 -0
  56. package/src/llama.cpp/src/llama-sampling.cpp +5 -0
  57. package/src/llama.cpp/src/llama-vocab.cpp +7 -1
  58. package/src/llama.cpp/src/llama-vocab.h +41 -40
  59. package/src/llama.cpp/src/unicode.h +43 -0
@@ -32,11 +32,11 @@
  #include <thread>
  #include <vector>
 
- //#define LLAMA_USE_CURL
-
  #if defined(LLAMA_USE_CURL)
  #include <curl/curl.h>
  #include <curl/easy.h>
+ #else
+ #include "http.h"
  #endif
 
  #ifdef __linux__
@@ -52,6 +52,13 @@
  #endif
  #define LLAMA_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083
 
+ // isatty
+ #if defined(_WIN32)
+ #include <io.h>
+ #else
+ #include <unistd.h>
+ #endif
+
  using json = nlohmann::ordered_json;
 
  std::initializer_list<enum llama_example> mmproj_examples = {
@@ -98,6 +105,14 @@ static void write_file(const std::string & fname, const std::string & content) {
  }
  }
 
+ static bool is_output_a_tty() {
+ #if defined(_WIN32)
+ return _isatty(_fileno(stdout));
+ #else
+ return isatty(1);
+ #endif
+ }
+
  common_arg & common_arg::set_examples(std::initializer_list<enum llama_example> examples) {
  this->examples = std::move(examples);
  return *this;
@@ -215,12 +230,55 @@ struct common_hf_file_res {
  std::string mmprojFile;
  };
 
- #ifdef LLAMA_USE_CURL
+ static void write_etag(const std::string & path, const std::string & etag) {
+ const std::string etag_path = path + ".etag";
+ write_file(etag_path, etag);
+ LOG_DBG("%s: file etag saved: %s\n", __func__, etag_path.c_str());
+ }
 
- bool common_has_curl() {
- return true;
+ static std::string read_etag(const std::string & path) {
+ std::string none;
+ const std::string etag_path = path + ".etag";
+
+ if (std::filesystem::exists(etag_path)) {
+ std::ifstream etag_in(etag_path);
+ if (!etag_in) {
+ LOG_ERR("%s: could not open .etag file for reading: %s\n", __func__, etag_path.c_str());
+ return none;
+ }
+ std::string etag;
+ std::getline(etag_in, etag);
+ return etag;
+ }
+
+ // no etag file, but maybe there is an old .json
+ // remove this code later
+ const std::string metadata_path = path + ".json";
+
+ if (std::filesystem::exists(metadata_path)) {
+ std::ifstream metadata_in(metadata_path);
+ try {
+ nlohmann::json metadata_json;
+ metadata_in >> metadata_json;
+ LOG_DBG("%s: previous metadata file found %s: %s\n", __func__, metadata_path.c_str(),
+ metadata_json.dump().c_str());
+ if (metadata_json.contains("etag") && metadata_json.at("etag").is_string()) {
+ std::string etag = metadata_json.at("etag");
+ write_etag(path, etag);
+ if (!std::filesystem::remove(metadata_path)) {
+ LOG_WRN("%s: failed to delete old .json metadata file: %s\n", __func__, metadata_path.c_str());
+ }
+ return etag;
+ }
+ } catch (const nlohmann::json::exception & e) {
+ LOG_ERR("%s: error reading metadata file %s: %s\n", __func__, metadata_path.c_str(), e.what());
+ }
+ }
+ return none;
  }
 
+ #ifdef LLAMA_USE_CURL
+
  //
  // CURL utils
  //
@@ -371,36 +429,15 @@ static bool common_download_head(CURL * curl,
  static bool common_download_file_single_online(const std::string & url,
  const std::string & path,
  const std::string & bearer_token) {
- // If the file exists, check its JSON metadata companion file.
- std::string metadata_path = path + ".json";
  static const int max_attempts = 3;
  static const int retry_delay_seconds = 2;
  for (int i = 0; i < max_attempts; ++i) {
- nlohmann::json metadata; // TODO @ngxson : get rid of this json, use regex instead
- std::string etag;
- std::string last_modified;
+ std::string etag;
 
  // Check if the file already exists locally
  const auto file_exists = std::filesystem::exists(path);
  if (file_exists) {
- // Try and read the JSON metadata file (note: stream autoclosed upon exiting this block).
- std::ifstream metadata_in(metadata_path);
- if (metadata_in.good()) {
- try {
- metadata_in >> metadata;
- LOG_DBG("%s: previous metadata file found %s: %s\n", __func__, metadata_path.c_str(),
- metadata.dump().c_str());
- if (metadata.contains("etag") && metadata.at("etag").is_string()) {
- etag = metadata.at("etag");
- }
- if (metadata.contains("lastModified") && metadata.at("lastModified").is_string()) {
- last_modified = metadata.at("lastModified");
- }
- } catch (const nlohmann::json::exception & e) {
- LOG_ERR("%s: error reading metadata file %s: %s\n", __func__, metadata_path.c_str(), e.what());
- }
- }
- // if we cannot open the metadata file, we assume that the downloaded file is not valid (etag and last-modified are left empty, so we will download it again)
+ etag = read_etag(path);
  } else {
  LOG_INF("%s: no previous model file found %s\n", __func__, path.c_str());
  }
@@ -438,11 +475,6 @@ static bool common_download_file_single_online(const std::string & url,
  headers.etag.c_str());
  should_download = true;
  should_download_from_scratch = true;
- } else if (!last_modified.empty() && last_modified != headers.last_modified) {
- LOG_WRN("%s: Last-Modified header is different (%s != %s): triggering a new download\n", __func__,
- last_modified.c_str(), headers.last_modified.c_str());
- should_download = true;
- should_download_from_scratch = true;
  }
  }
 
@@ -473,15 +505,9 @@ static bool common_download_file_single_online(const std::string & url,
  }
  }
  }
-
- // Write the updated JSON metadata file.
- metadata.update({
- { "url", url },
- { "etag", headers.etag },
- { "lastModified", headers.last_modified }
- });
- write_file(metadata_path, metadata.dump(4));
- LOG_DBG("%s: file metadata saved: %s\n", __func__, metadata_path.c_str());
+ if (head_request_ok) {
+ write_etag(path, headers.etag);
+ }
 
  // start the download
  LOG_INF("%s: trying to download model from %s to %s (server_etag:%s, server_last_modified:%s)...\n",
@@ -568,21 +594,238 @@ std::pair<long, std::vector<char>> common_remote_get_content(const std::string &
 
  #else
 
- bool common_has_curl() {
- return false;
+ static void print_progress(size_t current, size_t total) {
+ if (!is_output_a_tty()) {
+ return;
+ }
+
+ if (!total) {
+ return;
+ }
+
+ size_t width = 50;
+ size_t pct = (100 * current) / total;
+ size_t pos = (width * current) / total;
+
+ std::cout << "["
+ << std::string(pos, '=')
+ << (pos < width ? ">" : "")
+ << std::string(width - pos, ' ')
+ << "] " << std::setw(3) << pct << "% ("
+ << current / (1024 * 1024) << " MB / "
+ << total / (1024 * 1024) << " MB)\r";
+ std::cout.flush();
  }
 
- static bool common_download_file_single_online(const std::string &, const std::string &, const std::string &) {
- LOG_ERR("error: built without CURL, cannot download model from internet\n");
- return false;
+ static bool common_pull_file(httplib::Client & cli,
+ const std::string & resolve_path,
+ const std::string & path_tmp,
+ bool supports_ranges,
+ size_t existing_size,
+ size_t & total_size) {
+ std::ofstream ofs(path_tmp, std::ios::binary | std::ios::app);
+ if (!ofs.is_open()) {
+ LOG_ERR("%s: error opening local file for writing: %s\n", __func__, path_tmp.c_str());
+ return false;
+ }
+
+ httplib::Headers headers;
+ if (supports_ranges && existing_size > 0) {
+ headers.emplace("Range", "bytes=" + std::to_string(existing_size) + "-");
+ }
+
+ std::atomic<size_t> downloaded{existing_size};
+
+ auto res = cli.Get(resolve_path, headers,
+ [&](const httplib::Response &response) {
+ if (existing_size > 0 && response.status != 206) {
+ LOG_WRN("%s: server did not respond with 206 Partial Content for a resume request. Status: %d\n", __func__, response.status);
+ return false;
+ }
+ if (existing_size == 0 && response.status != 200) {
+ LOG_WRN("%s: download received non-successful status code: %d\n", __func__, response.status);
+ return false;
+ }
+ if (total_size == 0 && response.has_header("Content-Length")) {
+ try {
+ size_t content_length = std::stoull(response.get_header_value("Content-Length"));
+ total_size = existing_size + content_length;
+ } catch (const std::exception &e) {
+ LOG_WRN("%s: invalid Content-Length header: %s\n", __func__, e.what());
+ }
+ }
+ return true;
+ },
+ [&](const char *data, size_t len) {
+ ofs.write(data, len);
+ if (!ofs) {
+ LOG_ERR("%s: error writing to file: %s\n", __func__, path_tmp.c_str());
+ return false;
+ }
+ downloaded += len;
+ print_progress(downloaded, total_size);
+ return true;
+ },
+ nullptr
+ );
+
+ std::cout << "\n";
+
+ if (!res) {
+ LOG_ERR("%s: error during download. Status: %d\n", __func__, res ? res->status : -1);
+ return false;
+ }
+
+ return true;
+ }
+
+ // download one single file from remote URL to local path
+ static bool common_download_file_single_online(const std::string & url,
+ const std::string & path,
+ const std::string & bearer_token) {
+ static const int max_attempts = 3;
+ static const int retry_delay_seconds = 2;
+
+ auto [cli, parts] = common_http_client(url);
+
+ httplib::Headers default_headers = {{"User-Agent", "llama-cpp"}};
+ if (!bearer_token.empty()) {
+ default_headers.insert({"Authorization", "Bearer " + bearer_token});
+ }
+ cli.set_default_headers(default_headers);
+
+ const bool file_exists = std::filesystem::exists(path);
+
+ std::string last_etag;
+ if (file_exists) {
+ last_etag = read_etag(path);
+ } else {
+ LOG_INF("%s: no previous model file found %s\n", __func__, path.c_str());
+ }
+
+ for (int i = 0; i < max_attempts; ++i) {
+ auto head = cli.Head(parts.path);
+ bool head_ok = head && head->status >= 200 && head->status < 300;
+ if (!head_ok) {
+ LOG_WRN("%s: HEAD invalid http status code received: %d\n", __func__, head ? head->status : -1);
+ if (file_exists) {
+ LOG_INF("%s: Using cached file (HEAD failed): %s\n", __func__, path.c_str());
+ return true;
+ }
+ }
+
+ std::string etag;
+ if (head_ok && head->has_header("ETag")) {
+ etag = head->get_header_value("ETag");
+ }
+
+ size_t total_size = 0;
+ if (head_ok && head->has_header("Content-Length")) {
+ try {
+ total_size = std::stoull(head->get_header_value("Content-Length"));
+ } catch (const std::exception& e) {
+ LOG_WRN("%s: Invalid Content-Length in HEAD response: %s\n", __func__, e.what());
+ }
+ }
+
+ bool supports_ranges = false;
+ if (head_ok && head->has_header("Accept-Ranges")) {
+ supports_ranges = head->get_header_value("Accept-Ranges") != "none";
+ }
+
+ bool should_download_from_scratch = false;
+ if (!last_etag.empty() && !etag.empty() && last_etag != etag) {
+ LOG_WRN("%s: ETag header is different (%s != %s): triggering a new download\n", __func__,
+ last_etag.c_str(), etag.c_str());
+ should_download_from_scratch = true;
+ }
+
+ if (file_exists) {
+ if (!should_download_from_scratch) {
+ LOG_INF("%s: using cached file: %s\n", __func__, path.c_str());
+ return true;
+ }
+ LOG_WRN("%s: deleting previous downloaded file: %s\n", __func__, path.c_str());
+ if (remove(path.c_str()) != 0) {
+ LOG_ERR("%s: unable to delete file: %s\n", __func__, path.c_str());
+ return false;
+ }
+ }
+
+ const std::string path_temporary = path + ".downloadInProgress";
+ size_t existing_size = 0;
+
+ if (std::filesystem::exists(path_temporary)) {
+ if (supports_ranges && !should_download_from_scratch) {
+ existing_size = std::filesystem::file_size(path_temporary);
+ } else if (remove(path_temporary.c_str()) != 0) {
+ LOG_ERR("%s: unable to delete file: %s\n", __func__, path_temporary.c_str());
+ return false;
+ }
+ }
+
+ // start the download
+ LOG_INF("%s: trying to download model from %s to %s (etag:%s)...\n",
+ __func__, common_http_show_masked_url(parts).c_str(), path_temporary.c_str(), etag.c_str());
+ const bool was_pull_successful = common_pull_file(cli, parts.path, path_temporary, supports_ranges, existing_size, total_size);
+ if (!was_pull_successful) {
+ if (i + 1 < max_attempts) {
+ const int exponential_backoff_delay = std::pow(retry_delay_seconds, i) * 1000;
+ LOG_WRN("%s: retrying after %d milliseconds...\n", __func__, exponential_backoff_delay);
+ std::this_thread::sleep_for(std::chrono::milliseconds(exponential_backoff_delay));
+ } else {
+ LOG_ERR("%s: download failed after %d attempts\n", __func__, max_attempts);
+ }
+ continue;
+ }
+
+ if (std::rename(path_temporary.c_str(), path.c_str()) != 0) {
+ LOG_ERR("%s: unable to rename file: %s to %s\n", __func__, path_temporary.c_str(), path.c_str());
+ return false;
+ }
+ if (!etag.empty()) {
+ write_etag(path, etag);
+ }
+ break;
+ }
+
+ return true;
  }
 
- std::pair<long, std::vector<char>> common_remote_get_content(const std::string & url, const common_remote_params &) {
- if (!url.empty()) {
- throw std::runtime_error("error: built without CURL, cannot download model from the internet");
+ std::pair<long, std::vector<char>> common_remote_get_content(const std::string & url,
+ const common_remote_params & params) {
+ auto [cli, parts] = common_http_client(url);
+
+ httplib::Headers headers = {{"User-Agent", "llama-cpp"}};
+ for (const auto & header : params.headers) {
+ size_t pos = header.find(':');
+ if (pos != std::string::npos) {
+ headers.emplace(header.substr(0, pos), header.substr(pos + 1));
+ } else {
+ headers.emplace(header, "");
+ }
+ }
+
+ if (params.timeout > 0) {
+ cli.set_read_timeout(params.timeout, 0);
+ cli.set_write_timeout(params.timeout, 0);
+ }
+
+ std::vector<char> buf;
+ auto res = cli.Get(parts.path, headers,
+ [&](const char *data, size_t len) {
+ buf.insert(buf.end(), data, data + len);
+ return params.max_size == 0 ||
+ buf.size() <= static_cast<size_t>(params.max_size);
+ },
+ nullptr
+ );
+
+ if (!res) {
+ throw std::runtime_error("error: cannot make GET request");
  }
 
- return {};
+ return { res->status, std::move(buf) };
  }
 
  #endif // LLAMA_USE_CURL
@@ -1372,18 +1615,14 @@ static void add_rpc_devices(const std::string & servers) {
  if (!rpc_reg) {
  throw std::invalid_argument("failed to find RPC backend");
  }
- typedef ggml_backend_dev_t (*ggml_backend_rpc_add_device_t)(const char * endpoint);
- ggml_backend_rpc_add_device_t ggml_backend_rpc_add_device_fn = (ggml_backend_rpc_add_device_t) ggml_backend_reg_get_proc_address(rpc_reg, "ggml_backend_rpc_add_device");
- if (!ggml_backend_rpc_add_device_fn) {
- throw std::invalid_argument("failed to find RPC device add function");
+ typedef ggml_backend_reg_t (*ggml_backend_rpc_add_server_t)(const char * endpoint);
+ ggml_backend_rpc_add_server_t ggml_backend_rpc_add_server_fn = (ggml_backend_rpc_add_server_t) ggml_backend_reg_get_proc_address(rpc_reg, "ggml_backend_rpc_add_server");
+ if (!ggml_backend_rpc_add_server_fn) {
+ throw std::invalid_argument("failed to find RPC add server function");
  }
  for (const auto & server : rpc_servers) {
- ggml_backend_dev_t dev = ggml_backend_rpc_add_device_fn(server.c_str());
- if (dev) {
- ggml_backend_device_register(dev);
- } else {
- throw std::invalid_argument("failed to register RPC device");
- }
+ auto reg = ggml_backend_rpc_add_server_fn(server.c_str());
+ ggml_backend_register(reg);
  }
  }
 
@@ -1689,13 +1928,21 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  }
  ).set_env("LLAMA_ARG_SWA_FULL"));
  add_opt(common_arg(
- {"--swa-checkpoints"}, "N",
- string_format("max number of SWA checkpoints per slot to create (default: %d)\n"
- "[(more info)](https://github.com/ggml-org/llama.cpp/pull/15293)", params.n_swa_checkpoints),
+ {"--ctx-checkpoints", "--swa-checkpoints"}, "N",
+ string_format("max number of context checkpoints to create per slot (default: %d)\n"
+ "[(more info)](https://github.com/ggml-org/llama.cpp/pull/15293)", params.n_ctx_checkpoints),
+ [](common_params & params, int value) {
+ params.n_ctx_checkpoints = value;
+ }
+ ).set_env("LLAMA_ARG_CTX_CHECKPOINTS").set_examples({LLAMA_EXAMPLE_SERVER}));
+ add_opt(common_arg(
+ {"--cache-ram", "-cram"}, "N",
+ string_format("set the maximum cache size in MiB (default: %d, -1 - no limit, 0 - disable)\n"
+ "[(more info)](https://github.com/ggml-org/llama.cpp/pull/16391)", params.cache_ram_mib),
  [](common_params & params, int value) {
- params.n_swa_checkpoints = value;
+ params.cache_ram_mib = value;
  }
- ).set_env("LLAMA_ARG_SWA_CHECKPOINTS").set_examples({LLAMA_EXAMPLE_SERVER}));
+ ).set_env("LLAMA_ARG_CACHE_RAM").set_examples({LLAMA_EXAMPLE_SERVER}));
  add_opt(common_arg(
  {"--kv-unified", "-kvu"},
  string_format("use single unified KV buffer for the KV cache of all sequences (default: %s)\n"
@@ -2345,6 +2592,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  params.no_extra_bufts = true;
  }
  ).set_env("LLAMA_ARG_NO_REPACK"));
+ add_opt(common_arg(
+ {"--no-host"},
+ "bypass host buffer allowing extra buffers to be used",
+ [](common_params & params) {
+ params.no_host = true;
+ }
+ ).set_env("LLAMA_ARG_NO_HOST"));
  add_opt(common_arg(
  {"-ctk", "--cache-type-k"}, "TYPE",
  string_format(
@@ -3104,7 +3358,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  add_opt(common_arg(
  {"--chat-template-kwargs"}, "STRING",
  string_format("sets additional params for the json template parser"),
- [](common_params & params, const std::string & value) {
+ [](common_params & params, const std::string & value) {
  auto parsed = json::parse(value);
  for (const auto & item : parsed.items()) {
  params.default_template_kwargs[item.key()] = item.value().dump();
@@ -3186,7 +3440,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  {"--reasoning-format"}, "FORMAT",
  "controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:\n"
  "- none: leaves thoughts unparsed in `message.content`\n"
- "- deepseek: puts thoughts in `message.reasoning_content` (except in streaming mode, which behaves as `none`)\n"
+ "- deepseek: puts thoughts in `message.reasoning_content`\n"
+ "- deepseek-legacy: keeps `<think>` tags in `message.content` while also populating `message.reasoning_content`\n"
  "(default: auto)",
  [](common_params & params, const std::string & value) {
  params.reasoning_format = common_reasoning_format_from_name(value);
@@ -3315,21 +3570,23 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  common_log_set_file(common_log_main(), value.c_str());
  }
  ));
- add_opt(common_arg({ "--log-colors" }, "[on|off|auto]",
- "Set colored logging ('on', 'off', or 'auto', default: 'auto')\n"
- "'auto' enables colors when output is to a terminal",
- [](common_params &, const std::string & value) {
- if (is_truthy(value)) {
- common_log_set_colors(common_log_main(), LOG_COLORS_ENABLED);
- } else if (is_falsey(value)) {
- common_log_set_colors(common_log_main(), LOG_COLORS_DISABLED);
- } else if (is_autoy(value)) {
- common_log_set_colors(common_log_main(), LOG_COLORS_AUTO);
- } else {
- throw std::invalid_argument(
- string_format("error: unkown value for --log-colors: '%s'\n", value.c_str()));
- }
- }).set_env("LLAMA_LOG_COLORS"));
+ add_opt(common_arg(
+ {"--log-colors"}, "[on|off|auto]",
+ "Set colored logging ('on', 'off', or 'auto', default: 'auto')\n"
+ "'auto' enables colors when output is to a terminal",
+ [](common_params &, const std::string & value) {
+ if (is_truthy(value)) {
+ common_log_set_colors(common_log_main(), LOG_COLORS_ENABLED);
+ } else if (is_falsey(value)) {
+ common_log_set_colors(common_log_main(), LOG_COLORS_DISABLED);
+ } else if (is_autoy(value)) {
+ common_log_set_colors(common_log_main(), LOG_COLORS_AUTO);
+ } else {
+ throw std::invalid_argument(
+ string_format("error: unkown value for --log-colors: '%s'\n", value.c_str()));
+ }
+ }
+ ).set_env("LLAMA_LOG_COLORS"));
  add_opt(common_arg(
  {"-v", "--verbose", "--log-verbose"},
  "Set verbosity level to infinity (i.e. log all messages, useful for debugging)",
@@ -3595,7 +3852,87 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  }
  ).set_examples({LLAMA_EXAMPLE_TTS}));
 
- // model-specific
+ add_opt(common_arg(
+ {"--diffusion-steps"}, "N",
+ string_format("number of diffusion steps (default: %d)", params.diffusion.steps),
+ [](common_params & params, int value) { params.diffusion.steps = value; }
+ ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+ add_opt(common_arg(
+ {"--diffusion-visual"},
+ string_format("enable visual diffusion mode (show progressive generation) (default: %s)", params.diffusion.visual_mode ? "true" : "false"),
+ [](common_params & params) { params.diffusion.visual_mode = true; }
+ ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+ add_opt(common_arg(
+ {"--diffusion-eps"}, "F",
+ string_format("epsilon for timesteps (default: %.6f)", (double) params.diffusion.eps),
+ [](common_params & params, const std::string & value) { params.diffusion.eps = std::stof(value); }
+ ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+ add_opt(common_arg(
+ {"--diffusion-algorithm"}, "N",
+ string_format("diffusion algorithm: 0=ORIGIN, 1=ENTROPY_BASED, 2=MARGIN_BASED, 3=RANDOM, 4=LOW_CONFIDENCE (default: %d)", params.diffusion.algorithm),
+ [](common_params & params, int value) { params.diffusion.algorithm = value; }
+ ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+ add_opt(common_arg(
+ {"--diffusion-alg-temp"}, "F",
+ string_format("dream algorithm temperature (default: %.3f)", (double) params.diffusion.alg_temp),
+ [](common_params & params, const std::string & value) { params.diffusion.alg_temp = std::stof(value); }
+ ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+ add_opt(common_arg(
+ {"--diffusion-block-length"}, "N",
+ string_format("llada block length for generation (default: %d)", params.diffusion.block_length),
+ [](common_params & params, int value) { params.diffusion.block_length = value; }
+ ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+ add_opt(common_arg(
+ {"--diffusion-cfg-scale"}, "F",
+ string_format("llada classifier-free guidance scale (default: %.3f)", (double) params.diffusion.cfg_scale),
+ [](common_params & params, const std::string & value) { params.diffusion.cfg_scale = std::stof(value); }
+ ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+ add_opt(common_arg(
+ {"--diffusion-add-gumbel-noise"}, "F",
+ string_format("add gumbel noise to the logits if temp > 0.0 (default: %s)", params.diffusion.add_gumbel_noise ? "true" : "false"),
+ [](common_params & params, const std::string & value) { params.diffusion.add_gumbel_noise = std::stof(value); }
+ ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+ add_opt(common_arg(
+ { "-lr", "--learning-rate" }, "ALPHA",
+ string_format("adamw or sgd optimizer alpha (default: %.2g); note: sgd alpha recommended ~10x (no momentum)", (double) params.lr.lr0),
+ [](common_params & params, const std::string & value) { params.lr.lr0 = std::stof(value); }
+ ).set_examples({ LLAMA_EXAMPLE_FINETUNE }));
+ add_opt(common_arg({ "-lr-min", "--learning-rate-min" }, "ALPHA",
+ string_format("(if >0) final learning rate after decay (if -decay-epochs is set, default=%.2g)",
+ (double) params.lr.lr_min),
+ [](common_params & params, const std::string & value) { params.lr.lr_min = std::stof(value); }
+ ).set_examples({ LLAMA_EXAMPLE_FINETUNE }));
+ add_opt(common_arg(
+ {"-decay-epochs", "--learning-rate-decay-epochs"}, "ALPHA",
+ string_format("(if >0) decay learning rate to -lr-min after this many epochs (exponential decay, default=%.2g)", (double) params.lr.decay_epochs),
+ [](common_params & params, const std::string & value) { params.lr.decay_epochs = std::stof(value); }
+ ).set_examples({ LLAMA_EXAMPLE_FINETUNE }));
+ add_opt(common_arg(
+ {"-wd", "--weight-decay"}, "WD",
+ string_format("adamw or sgd optimizer weight decay (0 is off; recommend very small e.g. 1e-9) (default: %.2g).", (double) params.lr.wd),
+ [](common_params & params, const std::string & value) { params.lr.wd = std::stof(value); }
+ ).set_examples({ LLAMA_EXAMPLE_FINETUNE }));
+ add_opt(common_arg(
+ {"-val-split", "--val-split"}, "FRACTION",
+ string_format("fraction of data to use as validation set for training (default: %.2g).", (double) params.val_split),
+ [](common_params & params, const std::string & value) { params.val_split = std::stof(value); }
+ ).set_examples({ LLAMA_EXAMPLE_FINETUNE }));
+ add_opt(common_arg(
+ {"-epochs", "--epochs"}, "N",
+ string_format("optimizer max # of epochs (default: %d)", params.lr.epochs),
+ [](common_params & params, int epochs) { params.lr.epochs = epochs; }
+ ).set_examples({ LLAMA_EXAMPLE_FINETUNE }));
+ add_opt(common_arg(
+ {"-opt", "--optimizer"}, "sgd|adamw", "adamw or sgd",
+ [](common_params & params, const std::string & name) {
+ params.optimizer = common_opt_get_optimizer(name.c_str());
+ if (params.optimizer == GGML_OPT_OPTIMIZER_TYPE_COUNT) {
+ throw std::invalid_argument("invalid --optimizer, valid options: adamw, sgd");
+ }
+ }
+ ).set_examples({ LLAMA_EXAMPLE_FINETUNE }));
+
+ // presets
  add_opt(common_arg(
  {"--tts-oute-default"},
  string_format("use default OuteTTS models (note: can download weights from the internet)"),
@@ -3608,42 +3945,16 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  ).set_examples({LLAMA_EXAMPLE_TTS}));
 
  add_opt(common_arg(
- {"--embd-bge-small-en-default"},
- string_format("use default bge-small-en-v1.5 model (note: can download weights from the internet)"),
+ {"--embd-gemma-default"},
+ string_format("use default EmbeddingGemma model (note: can download weights from the internet)"),
  [](common_params & params) {
- params.model.hf_repo = "ggml-org/bge-small-en-v1.5-Q8_0-GGUF";
- params.model.hf_file = "bge-small-en-v1.5-q8_0.gguf";
- params.pooling_type = LLAMA_POOLING_TYPE_NONE;
- params.embd_normalize = 2;
- params.n_ctx = 512;
- params.verbose_prompt = true;
- params.embedding = true;
- }
- ).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_SERVER}));
-
- add_opt(common_arg(
- {"--embd-e5-small-en-default"},
- string_format("use default e5-small-v2 model (note: can download weights from the internet)"),
- [](common_params & params) {
- params.model.hf_repo = "ggml-org/e5-small-v2-Q8_0-GGUF";
- params.model.hf_file = "e5-small-v2-q8_0.gguf";
- params.pooling_type = LLAMA_POOLING_TYPE_NONE;
- params.embd_normalize = 2;
- params.n_ctx = 512;
- params.verbose_prompt = true;
- params.embedding = true;
- }
- ).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_SERVER}));
-
- add_opt(common_arg(
- {"--embd-gte-small-default"},
- string_format("use default gte-small model (note: can download weights from the internet)"),
- [](common_params & params) {
- params.model.hf_repo = "ggml-org/gte-small-Q8_0-GGUF";
- params.model.hf_file = "gte-small-q8_0.gguf";
- params.pooling_type = LLAMA_POOLING_TYPE_NONE;
- params.embd_normalize = 2;
- params.n_ctx = 512;
+ params.model.hf_repo = "ggml-org/embeddinggemma-300M-qat-q4_0-GGUF";
+ params.model.hf_file = "embeddinggemma-300M-qat-Q4_0.gguf";
+ params.port = 8011;
+ params.n_ubatch = 2048;
+ params.n_batch = 2048;
+ params.n_parallel = 32;
+ params.n_ctx = 2048*params.n_parallel;
  params.verbose_prompt = true;
  params.embedding = true;
  }
@@ -3738,96 +4049,65 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  ).set_examples({LLAMA_EXAMPLE_SERVER}));
 
  add_opt(common_arg(
- { "--diffusion-steps" }, "N",
- string_format("number of diffusion steps (default: %d)", params.diffusion.steps),
- [](common_params & params, int value) { params.diffusion.steps = value; }
- ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
- add_opt(common_arg(
- { "--diffusion-visual" },
- string_format("enable visual diffusion mode (show progressive generation) (default: %s)",
- params.diffusion.visual_mode ? "true" : "false"),
- [](common_params & params) { params.diffusion.visual_mode = true; }
- ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+ {"--gpt-oss-20b-default"},
+ string_format("use gpt-oss-20b (note: can download weights from the internet)"),
+ [](common_params & params) {
+ params.model.hf_repo = "ggml-org/gpt-oss-20b-GGUF";
+ params.model.hf_file = "gpt-oss-20b-mxfp4.gguf";
+ params.port = 8013;
+ params.n_ubatch = 2048;
+ params.n_batch = 32768;
+ params.n_parallel = 2;
+ params.n_ctx = 131072*params.n_parallel;
+ params.sampling.temp = 1.0f;
+ params.sampling.top_p = 1.0f;
+ params.sampling.top_k = 0;
+ params.sampling.min_p = 0.01f;
+ params.use_jinja = true;
+ //params.default_template_kwargs["reasoning_effort"] = "\"high\"";
+ }
+ ).set_examples({LLAMA_EXAMPLE_SERVER}));
 
  add_opt(common_arg(
- { "--diffusion-eps" }, "F",
- string_format("epsilon for timesteps (default: %.6f)", (double) params.diffusion.eps),
- [](common_params & params, const std::string & value) { params.diffusion.eps = std::stof(value); }
- ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
- add_opt(common_arg(
- { "--diffusion-algorithm" }, "N",
- string_format("diffusion algorithm: 0=ORIGIN, 1=ENTROPY_BASED, 2=MARGIN_BASED, 3=RANDOM, 4=LOW_CONFIDENCE (default: %d)",
- params.diffusion.algorithm),
- [](common_params & params, int value) { params.diffusion.algorithm = value; }
- ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
- add_opt(common_arg(
- { "--diffusion-alg-temp" }, "F",
- string_format("dream algorithm temperature (default: %.3f)", (double) params.diffusion.alg_temp),
- [](common_params & params, const std::string & value) { params.diffusion.alg_temp = std::stof(value); }
- ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+ {"--gpt-oss-120b-default"},
+ string_format("use gpt-oss-120b (note: can download weights from the internet)"),
+ [](common_params & params) {
+ params.model.hf_repo = "ggml-org/gpt-oss-120b-GGUF";
+ params.port = 8013;
+ params.n_ubatch = 2048;
+ params.n_batch = 32768;
+ params.n_parallel = 2;
+ params.n_ctx = 131072*params.n_parallel;
+ params.sampling.temp = 1.0f;
+ params.sampling.top_p = 1.0f;
+ params.sampling.top_k = 0;
+ params.sampling.min_p = 0.01f;
+ params.use_jinja = true;
+ //params.default_template_kwargs["reasoning_effort"] = "\"high\"";
+ }
+ ).set_examples({LLAMA_EXAMPLE_SERVER}));
 
  add_opt(common_arg(
- { "--diffusion-block-length" }, "N",
- string_format("llada block length for generation (default: %d)", params.diffusion.block_length),
- [](common_params & params, int value) { params.diffusion.block_length = value; }
- ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
- add_opt(common_arg(
- { "--diffusion-cfg-scale" }, "F",
- string_format("llada classifier-free guidance scale (default: %.3f)", (double) params.diffusion.cfg_scale),
- [](common_params & params, const std::string & value) { params.diffusion.cfg_scale = std::stof(value); }
- ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
- add_opt(common_arg(
- { "--diffusion-add-gumbel-noise" }, "F",
- string_format("add gumbel noise to the logits if temp > 0.0 (default: %s)", params.diffusion.add_gumbel_noise ? "true" : "false"),
- [](common_params & params, const std::string & value) { params.diffusion.add_gumbel_noise = std::stof(value); }
- ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
-
+ {"--vision-gemma-4b-default"},
+ string_format("use Gemma 3 4B QAT (note: can download weights from the internet)"),
+ [](common_params & params) {
+ params.model.hf_repo = "ggml-org/gemma-3-4b-it-qat-GGUF";
+ params.port = 8014;
+ params.n_ctx = 0;
+ params.use_jinja = true;
+ }
+ ).set_examples({LLAMA_EXAMPLE_SERVER}));
 
- add_opt(
- common_arg({ "-lr", "--learning-rate" }, "ALPHA",
- string_format(
- "adamw or sgd optimizer alpha (default: %.2g); note: sgd alpha recommended ~10x (no momentum)",
- (double) params.lr.lr0),
- [](common_params & params, const std::string & value) { params.lr.lr0 = std::stof(value); })
- .set_examples({ LLAMA_EXAMPLE_FINETUNE }));
- add_opt(
- common_arg({ "-lr-min", "--learning-rate-min" }, "ALPHA",
- string_format(
- "(if >0) final learning rate after decay (if -decay-epochs is set, default=%.2g)",
- (double) params.lr.lr_min),
- [](common_params & params, const std::string & value) { params.lr.lr_min = std::stof(value); })
- .set_examples({ LLAMA_EXAMPLE_FINETUNE }));
- add_opt(
- common_arg({ "-decay-epochs", "--learning-rate-decay-epochs" }, "ALPHA",
- string_format(
- "(if >0) decay learning rate to -lr-min after this many epochs (exponential decay, default=%.2g)",
- (double) params.lr.decay_epochs),
- [](common_params & params, const std::string & value) { params.lr.decay_epochs = std::stof(value); })
- .set_examples({ LLAMA_EXAMPLE_FINETUNE }));
- add_opt(common_arg(
- { "-wd", "--weight-decay" }, "WD",
- string_format(
- "adamw or sgd optimizer weight decay (0 is off; recommend very small e.g. 1e-9) (default: %.2g).",
- (double) params.lr.wd),
- [](common_params & params, const std::string & value) { params.lr.wd = std::stof(value); })
- .set_examples({ LLAMA_EXAMPLE_FINETUNE }));
- add_opt(common_arg({ "-val-split", "--val-split" }, "FRACTION",
- string_format("fraction of data to use as validation set for training (default: %.2g).",
- (double) params.val_split),
- [](common_params & params, const std::string & value) { params.val_split = std::stof(value); })
- .set_examples({ LLAMA_EXAMPLE_FINETUNE }));
- add_opt(common_arg({ "-epochs", "--epochs" }, "N",
- string_format("optimizer max # of epochs (default: %d)", params.lr.epochs),
- [](common_params & params, int epochs) { params.lr.epochs = epochs; })
- .set_examples({ LLAMA_EXAMPLE_FINETUNE }));
- add_opt(common_arg({ "-opt", "--optimizer" }, "sgd|adamw", "adamw or sgd",
- [](common_params & params, const std::string & name) {
- params.optimizer = common_opt_get_optimizer(name.c_str());
- if (params.optimizer == GGML_OPT_OPTIMIZER_TYPE_COUNT) {
- throw std::invalid_argument("invalid --optimizer, valid options: adamw, sgd");
- }
- })
- .set_examples({ LLAMA_EXAMPLE_FINETUNE }));
+ add_opt(common_arg(
+ {"--vision-gemma-12b-default"},
+ string_format("use Gemma 3 12B QAT (note: can download weights from the internet)"),
+ [](common_params & params) {
+ params.model.hf_repo = "ggml-org/gemma-3-12b-it-qat-GGUF";
+ params.port = 8014;
+ params.n_ctx = 0;
+ params.use_jinja = true;
+ }
+ ).set_examples({LLAMA_EXAMPLE_SERVER}));
 
  return ctx_arg;
  }