@fugood/llama.node 1.2.2 → 1.2.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,7 +1,7 @@
  {
  "name": "@fugood/llama.node",
  "access": "public",
- "version": "1.2.2",
+ "version": "1.2.3",
  "description": "An another Node binding of llama.cpp",
  "main": "lib/index.js",
  "scripts": {
@@ -72,19 +72,19 @@
  "CMakeLists.txt"
  ],
  "optionalDependencies": {
- "@fugood/node-llama-linux-x64": "1.2.2",
- "@fugood/node-llama-linux-x64-vulkan": "1.2.2",
- "@fugood/node-llama-linux-x64-cuda": "1.2.2",
- "@fugood/node-llama-linux-arm64": "1.2.2",
- "@fugood/node-llama-linux-arm64-vulkan": "1.2.2",
- "@fugood/node-llama-linux-arm64-cuda": "1.2.2",
- "@fugood/node-llama-win32-x64": "1.2.2",
- "@fugood/node-llama-win32-x64-vulkan": "1.2.2",
- "@fugood/node-llama-win32-x64-cuda": "1.2.2",
- "@fugood/node-llama-win32-arm64": "1.2.2",
- "@fugood/node-llama-win32-arm64-vulkan": "1.2.2",
- "@fugood/node-llama-darwin-x64": "1.2.2",
- "@fugood/node-llama-darwin-arm64": "1.2.2"
+ "@fugood/node-llama-linux-x64": "1.2.3",
+ "@fugood/node-llama-linux-x64-vulkan": "1.2.3",
+ "@fugood/node-llama-linux-x64-cuda": "1.2.3",
+ "@fugood/node-llama-linux-arm64": "1.2.3",
+ "@fugood/node-llama-linux-arm64-vulkan": "1.2.3",
+ "@fugood/node-llama-linux-arm64-cuda": "1.2.3",
+ "@fugood/node-llama-win32-x64": "1.2.3",
+ "@fugood/node-llama-win32-x64-vulkan": "1.2.3",
+ "@fugood/node-llama-win32-x64-cuda": "1.2.3",
+ "@fugood/node-llama-win32-arm64": "1.2.3",
+ "@fugood/node-llama-win32-arm64-vulkan": "1.2.3",
+ "@fugood/node-llama-darwin-x64": "1.2.3",
+ "@fugood/node-llama-darwin-arm64": "1.2.3"
  },
  "devDependencies": {
  "@babel/preset-env": "^7.24.4",
@@ -24,6 +24,7 @@
  #include <cstdarg>
  #include <filesystem>
  #include <fstream>
+ #include <future>
  #include <list>
  #include <regex>
  #include <set>
@@ -36,9 +37,21 @@
  #if defined(LLAMA_USE_CURL)
  #include <curl/curl.h>
  #include <curl/easy.h>
- #include <future>
  #endif

+ #ifdef __linux__
+ #include <linux/limits.h>
+ #elif defined(_WIN32)
+ # if !defined(PATH_MAX)
+ # define PATH_MAX MAX_PATH
+ # endif
+ #elif defined(_AIX)
+ #include <sys/limits.h>
+ #else
+ #include <sys/syslimits.h>
+ #endif
+ #define LLAMA_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083
+
  using json = nlohmann::ordered_json;

  std::initializer_list<enum llama_example> mmproj_examples = {
@@ -208,19 +221,6 @@ bool common_has_curl() {
  return true;
  }

- #ifdef __linux__
- #include <linux/limits.h>
- #elif defined(_WIN32)
- # if !defined(PATH_MAX)
- # define PATH_MAX MAX_PATH
- # endif
- #elif defined(_AIX)
- #include <sys/limits.h>
- #else
- #include <sys/syslimits.h>
- #endif
- #define LLAMA_CURL_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083
-
  //
  // CURL utils
  //
@@ -368,10 +368,9 @@ static bool common_download_head(CURL * curl,
  }

  // download one single file from remote URL to local path
- static bool common_download_file_single(const std::string & url,
- const std::string & path,
- const std::string & bearer_token,
- bool offline) {
+ static bool common_download_file_single_online(const std::string & url,
+ const std::string & path,
+ const std::string & bearer_token) {
  // If the file exists, check its JSON metadata companion file.
  std::string metadata_path = path + ".json";
  static const int max_attempts = 3;
@@ -384,10 +383,6 @@ static bool common_download_file_single(const std::string & url,
  // Check if the file already exists locally
  const auto file_exists = std::filesystem::exists(path);
  if (file_exists) {
- if (offline) {
- LOG_INF("%s: using cached file (offline mode): %s\n", __func__, path.c_str());
- return true; // skip verification/downloading
- }
  // Try and read the JSON metadata file (note: stream autoclosed upon exiting this block).
  std::ifstream metadata_in(metadata_path);
  if (metadata_in.good()) {
@@ -407,10 +402,6 @@ static bool common_download_file_single(const std::string & url,
  }
  // if we cannot open the metadata file, we assume that the downloaded file is not valid (etag and last-modified are left empty, so we will download it again)
  } else {
- if (offline) {
- LOG_ERR("%s: required file is not available in cache (offline mode): %s\n", __func__, path.c_str());
- return false;
- }
  LOG_INF("%s: no previous model file found %s\n", __func__, path.c_str());
  }

@@ -530,6 +521,89 @@ static bool common_download_file_single(const std::string & url,
  return true;
  }

+ std::pair<long, std::vector<char>> common_remote_get_content(const std::string & url, const common_remote_params & params) {
+ curl_ptr curl(curl_easy_init(), &curl_easy_cleanup);
+ curl_slist_ptr http_headers;
+ std::vector<char> res_buffer;
+
+ curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str());
+ curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 1L);
+ curl_easy_setopt(curl.get(), CURLOPT_FOLLOWLOCATION, 1L);
+ curl_easy_setopt(curl.get(), CURLOPT_VERBOSE, 1L);
+ typedef size_t(*CURLOPT_WRITEFUNCTION_PTR)(void * ptr, size_t size, size_t nmemb, void * data);
+ auto write_callback = [](void * ptr, size_t size, size_t nmemb, void * data) -> size_t {
+ auto data_vec = static_cast<std::vector<char> *>(data);
+ data_vec->insert(data_vec->end(), (char *)ptr, (char *)ptr + size * nmemb);
+ return size * nmemb;
+ };
+ curl_easy_setopt(curl.get(), CURLOPT_WRITEFUNCTION, static_cast<CURLOPT_WRITEFUNCTION_PTR>(write_callback));
+ curl_easy_setopt(curl.get(), CURLOPT_WRITEDATA, &res_buffer);
+ #if defined(_WIN32)
+ curl_easy_setopt(curl.get(), CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA);
+ #endif
+ if (params.timeout > 0) {
+ curl_easy_setopt(curl.get(), CURLOPT_TIMEOUT, params.timeout);
+ }
+ if (params.max_size > 0) {
+ curl_easy_setopt(curl.get(), CURLOPT_MAXFILESIZE, params.max_size);
+ }
+ http_headers.ptr = curl_slist_append(http_headers.ptr, "User-Agent: llama-cpp");
+ for (const auto & header : params.headers) {
+ http_headers.ptr = curl_slist_append(http_headers.ptr, header.c_str());
+ }
+ curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.ptr);
+
+ CURLcode res = curl_easy_perform(curl.get());
+
+ if (res != CURLE_OK) {
+ std::string error_msg = curl_easy_strerror(res);
+ throw std::runtime_error("error: cannot make GET request: " + error_msg);
+ }
+
+ long res_code;
+ curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &res_code);
+
+ return { res_code, std::move(res_buffer) };
+ }
+
+ #else
+
+ bool common_has_curl() {
+ return false;
+ }
+
+ static bool common_download_file_single_online(const std::string &, const std::string &, const std::string &) {
+ LOG_ERR("error: built without CURL, cannot download model from internet\n");
+ return false;
+ }
+
+ std::pair<long, std::vector<char>> common_remote_get_content(const std::string & url, const common_remote_params &) {
+ if (!url.empty()) {
+ throw std::runtime_error("error: built without CURL, cannot download model from the internet");
+ }
+
+ return {};
+ }
+
+ #endif // LLAMA_USE_CURL
+
+ static bool common_download_file_single(const std::string & url,
+ const std::string & path,
+ const std::string & bearer_token,
+ bool offline) {
+ if (!offline) {
+ return common_download_file_single_online(url, path, bearer_token);
+ }
+
+ if (!std::filesystem::exists(path)) {
+ LOG_ERR("%s: required file is not available in cache (offline mode): %s\n", __func__, path.c_str());
+ return false;
+ }
+
+ LOG_INF("%s: using cached file (offline mode): %s\n", __func__, path.c_str());
+ return true;
+ }
+
  // download multiple files from remote URLs to local paths
  // the input is a vector of pairs <url, path>
  static bool common_download_file_multiple(const std::vector<std::pair<std::string, std::string>> & urls, const std::string & bearer_token, bool offline) {
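A usage sketch for the relocated common_remote_get_content() shown above, which now exists in both CURL and non-CURL builds. Illustrative only: the timeout, max_size, and headers fields are taken from the calls in the hunk, while the exact container type behind headers is an assumption here.

    common_remote_params params;
    params.timeout  = 30;                // seconds, forwarded to CURLOPT_TIMEOUT
    params.max_size = 10 * 1024 * 1024;  // bytes, forwarded to CURLOPT_MAXFILESIZE
    params.headers.push_back("Accept: application/json");  // assumed: headers is a vector of strings

    // returns { HTTP status code, raw response body }
    auto [res_code, body] = common_remote_get_content("https://example.com/manifest.json", params);
    if (res_code == 200) {
        std::string text(body.begin(), body.end());
        // parse text as needed
    }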
@@ -588,7 +662,7 @@ static bool common_download_model(

  if (n_split > 1) {
  char split_prefix[PATH_MAX] = {0};
- char split_url_prefix[LLAMA_CURL_MAX_URL_LENGTH] = {0};
+ char split_url_prefix[LLAMA_MAX_URL_LENGTH] = {0};

  // Verify the first split file format
  // and extract split URL and PATH prefixes
@@ -609,7 +683,7 @@
  char split_path[PATH_MAX] = {0};
  llama_split_path(split_path, sizeof(split_path), split_prefix, idx, n_split);

- char split_url[LLAMA_CURL_MAX_URL_LENGTH] = {0};
+ char split_url[LLAMA_MAX_URL_LENGTH] = {0};
  llama_split_path(split_url, sizeof(split_url), split_url_prefix, idx, n_split);

  if (std::string(split_path) == model.path) {
@@ -626,50 +700,6 @@ static bool common_download_model(
  return true;
  }

- std::pair<long, std::vector<char>> common_remote_get_content(const std::string & url, const common_remote_params & params) {
- curl_ptr curl(curl_easy_init(), &curl_easy_cleanup);
- curl_slist_ptr http_headers;
- std::vector<char> res_buffer;
-
- curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str());
- curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 1L);
- curl_easy_setopt(curl.get(), CURLOPT_FOLLOWLOCATION, 1L);
- typedef size_t(*CURLOPT_WRITEFUNCTION_PTR)(void * ptr, size_t size, size_t nmemb, void * data);
- auto write_callback = [](void * ptr, size_t size, size_t nmemb, void * data) -> size_t {
- auto data_vec = static_cast<std::vector<char> *>(data);
- data_vec->insert(data_vec->end(), (char *)ptr, (char *)ptr + size * nmemb);
- return size * nmemb;
- };
- curl_easy_setopt(curl.get(), CURLOPT_WRITEFUNCTION, static_cast<CURLOPT_WRITEFUNCTION_PTR>(write_callback));
- curl_easy_setopt(curl.get(), CURLOPT_WRITEDATA, &res_buffer);
- #if defined(_WIN32)
- curl_easy_setopt(curl.get(), CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA);
- #endif
- if (params.timeout > 0) {
- curl_easy_setopt(curl.get(), CURLOPT_TIMEOUT, params.timeout);
- }
- if (params.max_size > 0) {
- curl_easy_setopt(curl.get(), CURLOPT_MAXFILESIZE, params.max_size);
- }
- http_headers.ptr = curl_slist_append(http_headers.ptr, "User-Agent: llama-cpp");
- for (const auto & header : params.headers) {
- http_headers.ptr = curl_slist_append(http_headers.ptr, header.c_str());
- }
- curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.ptr);
-
- CURLcode res = curl_easy_perform(curl.get());
-
- if (res != CURLE_OK) {
- std::string error_msg = curl_easy_strerror(res);
- throw std::runtime_error("error: cannot make GET request: " + error_msg);
- }
-
- long res_code;
- curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &res_code);
-
- return { res_code, std::move(res_buffer) };
- }
-
  /**
  * Allow getting the HF file from the HF repo with tag (like ollama), for example:
  * - bartowski/Llama-3.2-3B-Instruct-GGUF:q4
@@ -736,21 +766,17 @@ static struct common_hf_file_res common_get_hf_file(const std::string & hf_repo_
  std::string mmprojFile;

  if (res_code == 200 || res_code == 304) {
- // extract ggufFile.rfilename in json, using regex
- {
- std::regex pattern("\"ggufFile\"[\\s\\S]*?\"rfilename\"\\s*:\\s*\"([^\"]+)\"");
- std::smatch match;
- if (std::regex_search(res_str, match, pattern)) {
- ggufFile = match[1].str();
+ try {
+ auto j = json::parse(res_str);
+
+ if (j.contains("ggufFile") && j["ggufFile"].contains("rfilename")) {
+ ggufFile = j["ggufFile"]["rfilename"].get<std::string>();
  }
- }
- // extract mmprojFile.rfilename in json, using regex
- {
- std::regex pattern("\"mmprojFile\"[\\s\\S]*?\"rfilename\"\\s*:\\s*\"([^\"]+)\"");
- std::smatch match;
- if (std::regex_search(res_str, match, pattern)) {
- mmprojFile = match[1].str();
+ if (j.contains("mmprojFile") && j["mmprojFile"].contains("rfilename")) {
+ mmprojFile = j["mmprojFile"]["rfilename"].get<std::string>();
  }
+ } catch (const std::exception & e) {
+ throw std::runtime_error(std::string("error parsing manifest JSON: ") + e.what());
  }
  if (!use_cache) {
  // if not using cached response, update the cache file
@@ -770,45 +796,6 @@ static struct common_hf_file_res common_get_hf_file(const std::string & hf_repo_
  return { hf_repo, ggufFile, mmprojFile };
  }

- #else
-
- bool common_has_curl() {
- return false;
- }
-
- static bool common_download_file_single(const std::string &, const std::string &, const std::string &, bool) {
- LOG_ERR("error: built without CURL, cannot download model from internet\n");
- return false;
- }
-
- static bool common_download_file_multiple(const std::vector<std::pair<std::string, std::string>> &, const std::string &, bool) {
- LOG_ERR("error: built without CURL, cannot download model from the internet\n");
- return false;
- }
-
- static bool common_download_model(
- const common_params_model &,
- const std::string &,
- bool) {
- LOG_ERR("error: built without CURL, cannot download model from the internet\n");
- return false;
- }
-
- static struct common_hf_file_res common_get_hf_file(const std::string &, const std::string &, bool) {
- LOG_ERR("error: built without CURL, cannot download model from the internet\n");
- return {};
- }
-
- std::pair<long, std::vector<char>> common_remote_get_content(const std::string & url, const common_remote_params &) {
- if (!url.empty()) {
- throw std::runtime_error("error: built without CURL, cannot download model from the internet");
- }
-
- return {};
- }
-
- #endif // LLAMA_USE_CURL
-
  //
  // Docker registry functions
  //
@@ -1068,8 +1055,6 @@ static std::string get_all_kv_cache_types() {
  //

  static bool common_params_parse_ex(int argc, char ** argv, common_params_context & ctx_arg) {
- std::string arg;
- const std::string arg_prefix = "--";
  common_params & params = ctx_arg.params;

  std::unordered_map<std::string, common_arg *> arg_to_options;
@@ -14,6 +14,7 @@
  #include <climits>
  #include <cmath>
  #include <codecvt>
+ #include <chrono>
  #include <cstdarg>
  #include <cstring>
  #include <ctime>
@@ -332,6 +332,7 @@ void common_perf_print(const struct llama_context * ctx, const struct common_sam
  }
  if (ctx) {
  llama_perf_context_print(ctx);
+ llama_memory_breakdown_print(ctx);
  }
  }

@@ -314,7 +314,8 @@ extern "C" {
  GGML_API int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched);
  GGML_API int ggml_backend_sched_get_n_copies(ggml_backend_sched_t sched);

- GGML_API size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);
+ GGML_API ggml_backend_buffer_type_t ggml_backend_sched_get_buffer_type(ggml_backend_sched_t sched, ggml_backend_t backend);
+ GGML_API size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);

  GGML_API void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
  GGML_API ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);
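The new ggml_backend_sched_get_buffer_type() declared above pairs with the existing ggml_backend_sched_get_buffer_size() so that the scheduler's compute buffers can be attributed to a buffer type; the llama_context::memory_breakdown() addition later in this diff uses exactly that pattern. A minimal sketch, assuming a valid sched and a container of backends is already at hand:

    std::map<ggml_backend_buffer_type_t, size_t> compute_bytes;
    for (ggml_backend_t backend : backends) {  // backends: assumed std::vector<ggml_backend_t>
        ggml_backend_buffer_type_t buft = ggml_backend_sched_get_buffer_type(sched, backend);
        compute_bytes[buft] += ggml_backend_sched_get_buffer_size(sched, backend);
    }
    for (const auto & [buft, bytes] : compute_bytes) {
        printf("%s: %zu MiB\n", ggml_backend_buft_name(buft), bytes / (1024 * 1024));
    }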
@@ -7,6 +7,9 @@
  extern "C" {
  #endif

+ // device buffer
+ GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_zdnn_buffer_type(void);
+
  GGML_BACKEND_API ggml_backend_reg_t ggml_backend_zdnn_reg(void);

  #ifdef __cplusplus
@@ -878,7 +878,7 @@ static void gemm_q4_b32_8x8_q8_0_lut_avx(int n, float * GGML_RESTRICT s, size_t
  const __m256i rhs_raw_mat_89AB_1 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].qs + 64));
  const __m256i rhs_raw_mat_CDEF_1 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].qs + 96));

- // Save the values in the following vectors in the formats B0B1B4B5, B2B3B6B7 for further processing and storing of valuess
+ // Save the values in the following vectors in the formats B0B1B4B5, B2B3B6B7 for further processing and storing of values
  const __m256i rhs_raw_mat_0145_0 = _mm256_blend_epi32(rhs_raw_mat_0123_0, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_0, requiredOrder), 240);
  const __m256i rhs_raw_mat_2367_0 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_0, requiredOrder), rhs_raw_mat_4567_0, 240);
  const __m256i rhs_raw_mat_0145_1 = _mm256_blend_epi32(rhs_raw_mat_0123_1, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_1, requiredOrder), 240);
@@ -1231,7 +1231,7 @@ static void gemm_q4_b32_8x8_q8_0_lut_avx(int n, float * GGML_RESTRICT s, size_t
  const __m256i rhs_raw_mat_0123_1 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs + 64));
  const __m256i rhs_raw_mat_4567_1 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs + 96));

- // Save the values in the following vectors in the formats B0B1B4B5, B2B3B6B7 for further processing and storing of valuess
+ // Save the values in the following vectors in the formats B0B1B4B5, B2B3B6B7 for further processing and storing of values
  const __m256i rhs_raw_mat_0145_0 = _mm256_blend_epi32(rhs_raw_mat_0123_0, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_0, requiredOrder), 240);
  const __m256i rhs_raw_mat_2367_0 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_0, requiredOrder), rhs_raw_mat_4567_0, 240);
  const __m256i rhs_raw_mat_0145_1 = _mm256_blend_epi32(rhs_raw_mat_0123_1, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_1, requiredOrder), 240);
@@ -473,10 +473,10 @@ struct ggml_threadpool {
  struct ggml_compute_state {
  #ifndef GGML_USE_OPENMP
  ggml_thread_t thrd;
- bool cpumask[GGML_MAX_N_THREADS];
  int last_graph;
  bool pending;
  #endif
+ bool cpumask[GGML_MAX_N_THREADS];
  struct ggml_threadpool * threadpool;
  int ith;
  };
@@ -3081,7 +3081,14 @@ static struct ggml_threadpool * ggml_threadpool_new_impl(

  threadpool->workers = workers;

- #ifndef GGML_USE_OPENMP
+ #ifdef GGML_USE_OPENMP
+ int32_t cpumask_iter = 0;
+
+ // Compute CPU masks for each thread
+ for (int j = 0; j < tpp->n_threads; j++) {
+ ggml_thread_cpumask_next(tpp->cpumask, workers[j].cpumask, tpp->strict_cpu, &cpumask_iter);
+ }
+ #else // GGML_USE_OPENMP
  ggml_mutex_init(&threadpool->mutex);
  ggml_cond_init(&threadpool->cond);

@@ -3154,7 +3161,14 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
  atomic_store_explicit(&threadpool->n_threads_cur, n_threads, memory_order_relaxed);
  }

- ggml_graph_compute_thread(&threadpool->workers[omp_get_thread_num()]);
+ // Apply thread CPU mask and priority
+ int ith = omp_get_thread_num();
+
+ ggml_thread_apply_priority(threadpool->prio);
+ if (ggml_thread_cpumask_is_valid(threadpool->workers[ith].cpumask)) {
+ ggml_thread_apply_affinity(threadpool->workers[ith].cpumask);
+ }
+ ggml_graph_compute_thread(&threadpool->workers[ith]);
  }
  } else {
  atomic_store_explicit(&threadpool->n_threads_cur, 1, memory_order_relaxed);
@@ -4739,6 +4739,7 @@ void ggml_compute_forward_get_rows(
  //}
  }

+ template<typename idx_t>
  static void ggml_compute_forward_set_rows_f32(
  const ggml_compute_params * params,
  ggml_tensor * dst) {
@@ -4777,7 +4778,7 @@ static void ggml_compute_forward_set_rows_f32(
  const int64_t i11 = i02%ne11;
  const int64_t i10 = i;

- const int64_t i1 = *(int64_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
+ const int64_t i1 = *(idx_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);

  GGML_ASSERT(i1 >= 0 && i1 < ne1);

@@ -4794,11 +4795,18 @@ void ggml_compute_forward_set_rows(
  ggml_tensor * dst) {

  const ggml_tensor * src0 = dst->src[0];
+ const ggml_tensor * src1 = dst->src[1];

  switch (src0->type) {
  case GGML_TYPE_F32:
  {
- ggml_compute_forward_set_rows_f32(params, dst);
+ if (src1->type == GGML_TYPE_I64) {
+ ggml_compute_forward_set_rows_f32<int64_t>(params, dst);
+ } else if (src1->type == GGML_TYPE_I32) {
+ ggml_compute_forward_set_rows_f32<int32_t>(params, dst);
+ } else {
+ GGML_ABORT("src1->type = %d (%s) not supported", src1->type, ggml_type_name(src1->type));
+ }
  } break;
  default:
  {
@@ -1329,24 +1329,25 @@ extern "C" {
  //
  // Performance utils
  //
- // NOTE: Used by llama.cpp examples, avoid using in third-party apps. Instead, do your own performance measurements.
+ // NOTE: Used by llama.cpp examples/tools, avoid using in third-party apps. Instead, do your own performance measurements.
  //

  struct llama_perf_context_data {
- double t_start_ms;
- double t_load_ms;
- double t_p_eval_ms;
- double t_eval_ms;
-
- int32_t n_p_eval;
- int32_t n_eval;
- int32_t n_reused; // number of times a ggml compute graph had been reused
+ // ms == milliseconds
+ double t_start_ms; // absolute start time
+ double t_load_ms; // time needed for loading the model
+ double t_p_eval_ms; // time needed for processing the prompt
+ double t_eval_ms; // time needed for generating tokens
+
+ int32_t n_p_eval; // number of prompt tokens
+ int32_t n_eval; // number of generated tokens
+ int32_t n_reused; // number of times a ggml compute graph had been reused
  };

  struct llama_perf_sampler_data {
- double t_sample_ms;
+ double t_sample_ms; // time needed for sampling in ms

- int32_t n_sample;
+ int32_t n_sample; // number of sampled tokens

  };

  LLAMA_API struct llama_perf_context_data llama_perf_context (const struct llama_context * ctx);
@@ -1358,6 +1359,9 @@ extern "C" {
  LLAMA_API void llama_perf_sampler_print(const struct llama_sampler * chain);
  LLAMA_API void llama_perf_sampler_reset( struct llama_sampler * chain);

+ // print a breakdown of per-device memory use via LLAMA_LOG:
+ LLAMA_API void llama_memory_breakdown_print(const struct llama_context * ctx);
+
  //
  // training
  //
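The llama_memory_breakdown_print() declared above is already hooked into common_perf_print() earlier in this diff, so llama.cpp tools print the table automatically; applications can also call it directly once a context has done some work. A minimal sketch (ctx is assumed to be an already-initialized llama_context *):

    // ... load the model and context, run llama_decode() on some batches ...
    llama_perf_context_print(ctx);      // existing timing summary
    llama_memory_breakdown_print(ctx);  // new per-device memory table, emitted via LLAMA_LOG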
@@ -2027,6 +2027,21 @@ void llama_context::perf_reset() {
  n_reused = 0;
  }

+ std::map<ggml_backend_buffer_type_t, llama_memory_breakdown_data> llama_context::memory_breakdown() const {
+ std::map<ggml_backend_buffer_type_t, llama_memory_breakdown_data> ret;
+ for (const auto & buft_size : model.memory_breakdown()) {
+ ret[buft_size.first].model += buft_size.second;
+ }
+ for (const auto & buft_size : memory->memory_breakdown()) {
+ ret[buft_size.first].context += buft_size.second;
+ }
+ for (const auto & backend_ptr : backends) {
+ ggml_backend_t backend = backend_ptr.get();
+ ret[ggml_backend_sched_get_buffer_type(sched.get(), backend)].compute += ggml_backend_sched_get_buffer_size(sched.get(), backend);
+ }
+ return ret;
+ }
+
  //
  // training
  //
@@ -2765,6 +2780,142 @@ void llama_perf_context_reset(llama_context * ctx) {
  ctx->perf_reset();
  }

+ void llama_memory_breakdown_print(const struct llama_context * ctx) {
+ const std::vector<ggml_backend_dev_t> & devices = ctx->get_model().devices;
+
+ std::map<ggml_backend_buffer_type_t, llama_memory_breakdown_data> memory_breakdown = ctx->memory_breakdown();
+
+ std::vector<std::array<std::string, 9>> table_data;
+ table_data.reserve(devices.size());
+ const std::string template_header = "%s: | %s | %s %s %s %s %s %s %s |\n";
+ const std::string template_gpu = "%s: | %s | %s = %s + (%s = %s + %s + %s) + %s |\n";
+ const std::string template_other = "%s: | %s | %s %s %s = %s + %s + %s %s |\n";
+
+ table_data.push_back({template_header, "memory breakdown [MiB]", "total", "free", "self", "model", "context", "compute", "unaccounted"});
+
+ constexpr size_t MiB = 1024 * 1024;
+ const std::vector<std::string> desc_prefixes_strip = {"NVIDIA ", "GeForce ", "Tesla ", "AMD ", "Radeon ", "Instinct "};
+
+ // track seen buffer types to avoid double counting:
+ std::set<ggml_backend_buffer_type_t> seen_buffer_types;
+
+ // accumulative memory breakdown for each device and for host:
+ std::vector<llama_memory_breakdown_data> mb_dev(devices.size());
+ llama_memory_breakdown_data mb_host;
+
+ for (const auto & buft_mb : memory_breakdown) {
+ ggml_backend_buffer_type_t buft = buft_mb.first;
+ const llama_memory_breakdown_data & mb = buft_mb.second;
+ if (ggml_backend_buft_is_host(buft)) {
+ mb_host.model += mb.model;
+ mb_host.context += mb.context;
+ mb_host.compute += mb.compute;
+ seen_buffer_types.insert(buft);
+ continue;
+ }
+ ggml_backend_dev_t dev = ggml_backend_buft_get_device(buft);
+ if (dev) {
+ int i_dev = -1;
+ for (size_t i = 0; i < devices.size(); i++) {
+ if (devices[i] == dev) {
+ i_dev = i;
+ break;
+ }
+ }
+ if (i_dev != -1) {
+ mb_dev[i_dev].model += mb.model;
+ mb_dev[i_dev].context += mb.context;
+ mb_dev[i_dev].compute += mb.compute;
+ seen_buffer_types.insert(buft);
+ continue;
+ }
+ }
+ }
+
+ // print memory breakdown for each device:
+ for (size_t i = 0; i < devices.size(); i++) {
+ ggml_backend_dev_t dev = devices[i];
+ llama_memory_breakdown_data mb = mb_dev[i];
+
+ const std::string name = ggml_backend_dev_name(dev);
+ std::string desc = ggml_backend_dev_description(dev);
+ for (const std::string & prefix : desc_prefixes_strip) {
+ if (desc.length() >= prefix.length() && desc.substr(0, prefix.length()) == prefix) {
+ desc = desc.substr(prefix.length());
+ }
+ }
+
+ size_t free, total;
+ ggml_backend_dev_memory(dev, &free, &total);
+
+ const size_t self = mb.model + mb.context + mb.compute;
+ const size_t unaccounted = total - self - free;
+
+ table_data.push_back({
+ template_gpu,
+ " - " + name + " (" + desc + ")",
+ std::to_string(total / MiB),
+ std::to_string(free / MiB),
+ std::to_string(self / MiB),
+ std::to_string(mb.model / MiB),
+ std::to_string(mb.context / MiB),
+ std::to_string(mb.compute / MiB),
+ std::to_string(unaccounted / MiB)});
+ }
+
+ // print memory breakdown for host:
+ {
+ const size_t self = mb_host.model + mb_host.context + mb_host.compute;
+ table_data.push_back({
+ template_other,
+ " - Host",
+ "", // total
+ "", // free
+ std::to_string(self / MiB),
+ std::to_string(mb_host.model / MiB),
+ std::to_string(mb_host.context / MiB),
+ std::to_string(mb_host.compute / MiB),
+ ""}); // unaccounted
+ }
+
+ // print memory breakdown for all remaining buffer types:
+ for (const auto & buft_mb : memory_breakdown) {
+ ggml_backend_buffer_type_t buft = buft_mb.first;
+ const llama_memory_breakdown_data & mb = buft_mb.second;
+ if (seen_buffer_types.count(buft) == 1) {
+ continue;
+ }
+ const std::string name = ggml_backend_buft_name(buft);
+ const size_t self = mb.model + mb.context + mb.compute;
+ table_data.push_back({
+ template_other,
+ " - " + name,
+ "", // total
+ "", // free
+ std::to_string(self / MiB),
+ std::to_string(mb.model / MiB),
+ std::to_string(mb.context / MiB),
+ std::to_string(mb.compute / MiB),
+ ""}); // unaccounted
+ seen_buffer_types.insert(buft);
+ }
+
+ for (size_t j = 1; j < table_data[0].size(); j++) {
+ size_t max_len = 0;
+ for (const auto & td : table_data) {
+ max_len = std::max(max_len, td[j].length());
+ }
+ for (auto & td : table_data) {
+ td[j].insert(j == 1 ? td[j].length() : 0, max_len - td[j].length(), ' ');
+ }
+ }
+ for (const auto & td : table_data) {
+ LLAMA_LOG_INFO(td[0].c_str(),
+ __func__, td[1].c_str(), td[2].c_str(), td[3].c_str(), td[4].c_str(), td[5].c_str(),
+ td[6].c_str(), td[7].c_str(), td[8].c_str());
+ }
+ }
+
  //
  // training
  //
@@ -17,9 +17,17 @@ class llama_batch_allocr;
  class llama_io_read_i;
  class llama_io_write_i;

+ // "memory" as in abstract memory for the context
  struct llama_memory_i;
  struct llama_memory_context_i;

+ // "memory" as in physical memory for a buffer type, in bytes
+ struct llama_memory_breakdown_data {
+ size_t model = 0; // memory allocated for the model
+ size_t context = 0; // memory allocated for the context
+ size_t compute = 0; // memory allocated for temporary compute buffers
+ };
+
  struct llama_context {
  // init scheduler and compute buffers, reserve worst-case graphs
  llama_context(
@@ -144,6 +152,8 @@ struct llama_context {
  llama_perf_context_data perf_get_data() const;
  void perf_reset();

+ std::map<ggml_backend_buffer_type_t, llama_memory_breakdown_data> memory_breakdown() const;
+
  //
  // training
  //
@@ -113,6 +113,14 @@ llama_pos llama_kv_cache_iswa::seq_pos_max(llama_seq_id seq_id) const {
  return kv_swa->seq_pos_max(seq_id);
  }

+ std::map<ggml_backend_buffer_type_t, size_t> llama_kv_cache_iswa::memory_breakdown() const {
+ std::map<ggml_backend_buffer_type_t, size_t> mb = kv_base->memory_breakdown();
+ for (const auto & buft_size : kv_swa->memory_breakdown()) {
+ mb[buft_size.first] += buft_size.second;
+ }
+ return mb;
+ }
+
  llama_memory_context_ptr llama_kv_cache_iswa::init_batch(llama_batch_allocr & balloc, uint32_t n_ubatch, bool embd_all) {
  GGML_UNUSED(embd_all);

@@ -56,6 +56,8 @@ public:
  llama_pos seq_pos_min(llama_seq_id seq_id) const override;
  llama_pos seq_pos_max(llama_seq_id seq_id) const override;

+ std::map<ggml_backend_buffer_type_t, size_t> memory_breakdown() const override;
+
  // state write/load

  void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) const override;
@@ -473,6 +473,14 @@ llama_pos llama_kv_cache::seq_pos_max(llama_seq_id seq_id) const {
  return cells.seq_pos_max(seq_id);
  }

+ std::map<ggml_backend_buffer_type_t, size_t> llama_kv_cache::memory_breakdown() const {
+ std::map<ggml_backend_buffer_type_t, size_t> ret;
+ for (const ggml_backend_buffer_ptr & buf_ptr : bufs) {
+ ret[ggml_backend_buffer_get_type(buf_ptr.get())] += ggml_backend_buffer_get_size(buf_ptr.get());
+ }
+ return ret;
+ }
+
  llama_memory_context_ptr llama_kv_cache::init_batch(
  llama_batch_allocr & balloc,
  uint32_t n_ubatch,
@@ -121,6 +121,8 @@ public:
  llama_pos seq_pos_min(llama_seq_id seq_id) const override;
  llama_pos seq_pos_max(llama_seq_id seq_id) const override;

+ std::map<ggml_backend_buffer_type_t, size_t> memory_breakdown() const override;
+
  // state write/load

  void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) const override;
@@ -166,6 +166,14 @@ llama_pos llama_memory_hybrid::seq_pos_max(llama_seq_id seq_id) const {
  return std::min(mem_attn->seq_pos_max(seq_id), mem_recr->seq_pos_max(seq_id));
  }

+ std::map<ggml_backend_buffer_type_t, size_t> llama_memory_hybrid::memory_breakdown() const {
+ std::map<ggml_backend_buffer_type_t, size_t> mb = mem_attn->memory_breakdown();
+ for (const auto & buft_size : mem_recr->memory_breakdown()) {
+ mb[buft_size.first] += buft_size.second;
+ }
+ return mb;
+ }
+
  void llama_memory_hybrid::state_write(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) const {
  GGML_UNUSED(flags);

@@ -68,6 +68,8 @@ public:
  llama_pos seq_pos_min(llama_seq_id seq_id) const override;
  llama_pos seq_pos_max(llama_seq_id seq_id) const override;

+ std::map<ggml_backend_buffer_type_t, size_t> memory_breakdown() const override;
+
  // state write/load

  void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) const override;
@@ -359,6 +359,14 @@ llama_pos llama_memory_recurrent::seq_pos_max(llama_seq_id seq_id) const {
  return result;
  }

+ std::map<ggml_backend_buffer_type_t, size_t> llama_memory_recurrent::memory_breakdown() const {
+ std::map<ggml_backend_buffer_type_t, size_t> ret;
+ for (const ggml_backend_buffer_ptr & buf_ptr : bufs) {
+ ret[ggml_backend_buffer_get_type(buf_ptr.get())] += ggml_backend_buffer_get_size(buf_ptr.get());
+ }
+ return ret;
+ }
+
  llama_memory_context_ptr llama_memory_recurrent::init_batch(llama_batch_allocr & balloc, uint32_t n_ubatch, bool embd_all) {
  do {
  balloc.split_reset();
@@ -4,6 +4,7 @@
  #include "llama-graph.h"
  #include "llama-memory.h"

+ #include <map>
  #include <set>
  #include <vector>

@@ -50,6 +51,8 @@ public:
  llama_pos seq_pos_min(llama_seq_id seq_id) const override;
  llama_pos seq_pos_max(llama_seq_id seq_id) const override;

+ std::map<ggml_backend_buffer_type_t, size_t> memory_breakdown() const override;
+
  bool prepare(const std::vector<llama_ubatch> & ubatches);

  // find a contiguous slot of memory cells and emplace the ubatch there
@@ -2,6 +2,7 @@

  #include "llama.h"

+ #include <map>
  #include <memory>
  #include <functional>

@@ -108,6 +109,8 @@ struct llama_memory_i {
  virtual llama_pos seq_pos_min(llama_seq_id seq_id) const = 0;
  virtual llama_pos seq_pos_max(llama_seq_id seq_id) const = 0;

+ virtual std::map<ggml_backend_buffer_type_t, size_t> memory_breakdown() const = 0;
+
  //
  // state write/read
  //
@@ -66,6 +66,7 @@ const char * llm_type_name(llm_type type) {
  case LLM_TYPE_1_7B: return "1.7B";
  case LLM_TYPE_1_8B: return "1.8B";
  case LLM_TYPE_2B: return "2B";
+ case LLM_TYPE_2_6B: return "2.6B";
  case LLM_TYPE_2_8B: return "2.8B";
  case LLM_TYPE_2_9B: return "2.9B";
  case LLM_TYPE_3B: return "3B";
@@ -1977,10 +1978,11 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  for (uint32_t il = 0; il < hparams.n_layer; ++il) {
  hparams.recurrent_layer_arr[il] = hparams.n_head_kv(il) == 0;
  }
- switch (hparams.n_embd) {
- case 1024: type = LLM_TYPE_350M; break;
- case 1536: type = LLM_TYPE_700M; break;
- case 2048: type = LLM_TYPE_1_2B; break;
+ switch (hparams.n_ff()) {
+ case 4608: type = LLM_TYPE_350M; break;
+ case 6912: type = LLM_TYPE_700M; break;
+ case 8192: type = LLM_TYPE_1_2B; break;
+ case 10752: type = LLM_TYPE_2_6B; break;
  default: type = LLM_TYPE_UNKNOWN;
  }
  } break;
@@ -6003,6 +6005,14 @@ size_t llama_model::n_devices() const {
  return devices.size();
  }

+ std::map<ggml_backend_buffer_type_t, size_t> llama_model::memory_breakdown() const {
+ std::map<ggml_backend_buffer_type_t, size_t> ret;
+ for (const ggml_backend_buffer_ptr & buf_ptr : pimpl->bufs) {
+ ret[ggml_backend_buffer_get_type(buf_ptr.get())] += ggml_backend_buffer_get_size(buf_ptr.get());
+ }
+ return ret;
+ }
+
  uint64_t llama_model::n_elements() const {
  return pimpl->n_elements;
  }
@@ -7,6 +7,7 @@
  #include "llama-memory.h"
  #include "llama-vocab.h"

+ #include <map>
  #include <memory>
  #include <string>
  #include <unordered_map>
@@ -58,6 +59,7 @@ enum llm_type {
  LLM_TYPE_1_7B,
  LLM_TYPE_1_8B,
  LLM_TYPE_2B,
+ LLM_TYPE_2_6B,
  LLM_TYPE_2_8B,
  LLM_TYPE_2_9B,
  LLM_TYPE_3B,
@@ -452,10 +454,12 @@ struct llama_model {

  std::string desc() const;

- size_t size() const;
+ size_t size() const; // file size
  size_t n_tensors() const;
  size_t n_devices() const;

+ std::map<ggml_backend_buffer_type_t, size_t> memory_breakdown() const;
+
  // total number of parameters in the model
  uint64_t n_elements() const;