@fugood/llama.node 1.2.2 → 1.2.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +14 -14
- package/src/llama.cpp/common/arg.cpp +111 -126
- package/src/llama.cpp/common/common.cpp +1 -0
- package/src/llama.cpp/common/sampling.cpp +1 -0
- package/src/llama.cpp/ggml/include/ggml-backend.h +2 -1
- package/src/llama.cpp/ggml/include/ggml-zdnn.h +3 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +2 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +17 -3
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +10 -2
- package/src/llama.cpp/include/llama.h +15 -11
- package/src/llama.cpp/src/llama-context.cpp +151 -0
- package/src/llama.cpp/src/llama-context.h +10 -0
- package/src/llama.cpp/src/llama-kv-cache-iswa.cpp +8 -0
- package/src/llama.cpp/src/llama-kv-cache-iswa.h +2 -0
- package/src/llama.cpp/src/llama-kv-cache.cpp +8 -0
- package/src/llama.cpp/src/llama-kv-cache.h +2 -0
- package/src/llama.cpp/src/llama-memory-hybrid.cpp +8 -0
- package/src/llama.cpp/src/llama-memory-hybrid.h +2 -0
- package/src/llama.cpp/src/llama-memory-recurrent.cpp +8 -0
- package/src/llama.cpp/src/llama-memory-recurrent.h +3 -0
- package/src/llama.cpp/src/llama-memory.h +3 -0
- package/src/llama.cpp/src/llama-model.cpp +14 -4
- package/src/llama.cpp/src/llama-model.h +5 -1
package/package.json
CHANGED

@@ -1,7 +1,7 @@
 {
   "name": "@fugood/llama.node",
   "access": "public",
-  "version": "1.2.2",
+  "version": "1.2.3",
   "description": "An another Node binding of llama.cpp",
   "main": "lib/index.js",
   "scripts": {
@@ -72,19 +72,19 @@
     "CMakeLists.txt"
   ],
   "optionalDependencies": {
-    "@fugood/node-llama-linux-x64": "1.2.2",
-    "@fugood/node-llama-linux-x64-vulkan": "1.2.2",
-    "@fugood/node-llama-linux-x64-cuda": "1.2.2",
-    "@fugood/node-llama-linux-arm64": "1.2.2",
-    "@fugood/node-llama-linux-arm64-vulkan": "1.2.2",
-    "@fugood/node-llama-linux-arm64-cuda": "1.2.2",
-    "@fugood/node-llama-win32-x64": "1.2.2",
-    "@fugood/node-llama-win32-x64-vulkan": "1.2.2",
-    "@fugood/node-llama-win32-x64-cuda": "1.2.2",
-    "@fugood/node-llama-win32-arm64": "1.2.2",
-    "@fugood/node-llama-win32-arm64-vulkan": "1.2.2",
-    "@fugood/node-llama-darwin-x64": "1.2.2",
-    "@fugood/node-llama-darwin-arm64": "1.2.2"
+    "@fugood/node-llama-linux-x64": "1.2.3",
+    "@fugood/node-llama-linux-x64-vulkan": "1.2.3",
+    "@fugood/node-llama-linux-x64-cuda": "1.2.3",
+    "@fugood/node-llama-linux-arm64": "1.2.3",
+    "@fugood/node-llama-linux-arm64-vulkan": "1.2.3",
+    "@fugood/node-llama-linux-arm64-cuda": "1.2.3",
+    "@fugood/node-llama-win32-x64": "1.2.3",
+    "@fugood/node-llama-win32-x64-vulkan": "1.2.3",
+    "@fugood/node-llama-win32-x64-cuda": "1.2.3",
+    "@fugood/node-llama-win32-arm64": "1.2.3",
+    "@fugood/node-llama-win32-arm64-vulkan": "1.2.3",
+    "@fugood/node-llama-darwin-x64": "1.2.3",
+    "@fugood/node-llama-darwin-arm64": "1.2.3"
   },
   "devDependencies": {
     "@babel/preset-env": "^7.24.4",
package/src/llama.cpp/common/arg.cpp
CHANGED

@@ -24,6 +24,7 @@
 #include <cstdarg>
 #include <filesystem>
 #include <fstream>
+#include <future>
 #include <list>
 #include <regex>
 #include <set>
@@ -36,9 +37,21 @@
 #if defined(LLAMA_USE_CURL)
 #include <curl/curl.h>
 #include <curl/easy.h>
-#include <future>
 #endif
 
+#ifdef __linux__
+#include <linux/limits.h>
+#elif defined(_WIN32)
+#   if !defined(PATH_MAX)
+#       define PATH_MAX MAX_PATH
+#   endif
+#elif defined(_AIX)
+#include <sys/limits.h>
+#else
+#include <sys/syslimits.h>
+#endif
+#define LLAMA_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083
+
 using json = nlohmann::ordered_json;
 
 std::initializer_list<enum llama_example> mmproj_examples = {
@@ -208,19 +221,6 @@ bool common_has_curl() {
     return true;
 }
 
-#ifdef __linux__
-#include <linux/limits.h>
-#elif defined(_WIN32)
-#   if !defined(PATH_MAX)
-#       define PATH_MAX MAX_PATH
-#   endif
-#elif defined(_AIX)
-#include <sys/limits.h>
-#else
-#include <sys/syslimits.h>
-#endif
-#define LLAMA_CURL_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083
-
 //
 // CURL utils
 //
@@ -368,10 +368,9 @@ static bool common_download_head(CURL * curl,
 }
 
 // download one single file from remote URL to local path
-static bool common_download_file_single(const std::string & url,
-                                        const std::string & path,
-                                        const std::string & bearer_token,
-                                        bool offline) {
+static bool common_download_file_single_online(const std::string & url,
+                                               const std::string & path,
+                                               const std::string & bearer_token) {
     // If the file exists, check its JSON metadata companion file.
     std::string metadata_path = path + ".json";
     static const int max_attempts = 3;
@@ -384,10 +383,6 @@ static bool common_download_file_single(const std::string & url,
     // Check if the file already exists locally
     const auto file_exists = std::filesystem::exists(path);
     if (file_exists) {
-        if (offline) {
-            LOG_INF("%s: using cached file (offline mode): %s\n", __func__, path.c_str());
-            return true; // skip verification/downloading
-        }
         // Try and read the JSON metadata file (note: stream autoclosed upon exiting this block).
         std::ifstream metadata_in(metadata_path);
         if (metadata_in.good()) {
@@ -407,10 +402,6 @@ static bool common_download_file_single(const std::string & url,
         }
         // if we cannot open the metadata file, we assume that the downloaded file is not valid (etag and last-modified are left empty, so we will download it again)
     } else {
-        if (offline) {
-            LOG_ERR("%s: required file is not available in cache (offline mode): %s\n", __func__, path.c_str());
-            return false;
-        }
         LOG_INF("%s: no previous model file found %s\n", __func__, path.c_str());
     }
 
@@ -530,6 +521,89 @@ static bool common_download_file_single(const std::string & url,
     return true;
 }
 
+std::pair<long, std::vector<char>> common_remote_get_content(const std::string & url, const common_remote_params & params) {
+    curl_ptr curl(curl_easy_init(), &curl_easy_cleanup);
+    curl_slist_ptr http_headers;
+    std::vector<char> res_buffer;
+
+    curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str());
+    curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 1L);
+    curl_easy_setopt(curl.get(), CURLOPT_FOLLOWLOCATION, 1L);
+    curl_easy_setopt(curl.get(), CURLOPT_VERBOSE, 1L);
+    typedef size_t(*CURLOPT_WRITEFUNCTION_PTR)(void * ptr, size_t size, size_t nmemb, void * data);
+    auto write_callback = [](void * ptr, size_t size, size_t nmemb, void * data) -> size_t {
+        auto data_vec = static_cast<std::vector<char> *>(data);
+        data_vec->insert(data_vec->end(), (char *)ptr, (char *)ptr + size * nmemb);
+        return size * nmemb;
+    };
+    curl_easy_setopt(curl.get(), CURLOPT_WRITEFUNCTION, static_cast<CURLOPT_WRITEFUNCTION_PTR>(write_callback));
+    curl_easy_setopt(curl.get(), CURLOPT_WRITEDATA, &res_buffer);
+#if defined(_WIN32)
+    curl_easy_setopt(curl.get(), CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA);
+#endif
+    if (params.timeout > 0) {
+        curl_easy_setopt(curl.get(), CURLOPT_TIMEOUT, params.timeout);
+    }
+    if (params.max_size > 0) {
+        curl_easy_setopt(curl.get(), CURLOPT_MAXFILESIZE, params.max_size);
+    }
+    http_headers.ptr = curl_slist_append(http_headers.ptr, "User-Agent: llama-cpp");
+    for (const auto & header : params.headers) {
+        http_headers.ptr = curl_slist_append(http_headers.ptr, header.c_str());
+    }
+    curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.ptr);
+
+    CURLcode res = curl_easy_perform(curl.get());
+
+    if (res != CURLE_OK) {
+        std::string error_msg = curl_easy_strerror(res);
+        throw std::runtime_error("error: cannot make GET request: " + error_msg);
+    }
+
+    long res_code;
+    curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &res_code);
+
+    return { res_code, std::move(res_buffer) };
+}
+
+#else
+
+bool common_has_curl() {
+    return false;
+}
+
+static bool common_download_file_single_online(const std::string &, const std::string &, const std::string &) {
+    LOG_ERR("error: built without CURL, cannot download model from internet\n");
+    return false;
+}
+
+std::pair<long, std::vector<char>> common_remote_get_content(const std::string & url, const common_remote_params &) {
+    if (!url.empty()) {
+        throw std::runtime_error("error: built without CURL, cannot download model from the internet");
+    }
+
+    return {};
+}
+
+#endif // LLAMA_USE_CURL
+
+static bool common_download_file_single(const std::string & url,
+                                        const std::string & path,
+                                        const std::string & bearer_token,
+                                        bool offline) {
+    if (!offline) {
+        return common_download_file_single_online(url, path, bearer_token);
+    }
+
+    if (!std::filesystem::exists(path)) {
+        LOG_ERR("%s: required file is not available in cache (offline mode): %s\n", __func__, path.c_str());
+        return false;
+    }
+
+    LOG_INF("%s: using cached file (offline mode): %s\n", __func__, path.c_str());
+    return true;
+}
+
 // download multiple files from remote URLs to local paths
 // the input is a vector of pairs <url, path>
 static bool common_download_file_multiple(const std::vector<std::pair<std::string, std::string>> & urls, const std::string & bearer_token, bool offline) {
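With this refactor, `common_remote_get_content` and the non-CURL stubs sit above a single `common_download_file_single` wrapper, so the offline cache check is shared by both build flavors. Below is a hedged usage sketch for `common_remote_get_content`; the URL is a placeholder, the `headers`/`timeout`/`max_size` field names are taken from the code above, and everything else is assumed.

```cpp
// Sketch: fetch a small JSON document into memory (requires a build with
// LLAMA_USE_CURL; the function throws std::runtime_error on transport errors).
common_remote_params params;
params.headers.push_back("Accept: application/json"); // extra HTTP headers
params.timeout  = 30;                                 // seconds, maps to CURLOPT_TIMEOUT
params.max_size = 1024 * 1024;                        // bytes, maps to CURLOPT_MAXFILESIZE

auto res = common_remote_get_content("https://example.com/manifest.json", params);
if (res.first == 200) { // HTTP status code
    std::string body(res.second.begin(), res.second.end());
    // ... parse body ...
}
```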
@@ -588,7 +662,7 @@ static bool common_download_model(
 
     if (n_split > 1) {
         char split_prefix[PATH_MAX] = {0};
-        char split_url_prefix[LLAMA_CURL_MAX_URL_LENGTH] = {0};
+        char split_url_prefix[LLAMA_MAX_URL_LENGTH] = {0};
 
         // Verify the first split file format
         // and extract split URL and PATH prefixes
@@ -609,7 +683,7 @@ static bool common_download_model(
             char split_path[PATH_MAX] = {0};
             llama_split_path(split_path, sizeof(split_path), split_prefix, idx, n_split);
 
-            char split_url[LLAMA_CURL_MAX_URL_LENGTH] = {0};
+            char split_url[LLAMA_MAX_URL_LENGTH] = {0};
             llama_split_path(split_url, sizeof(split_url), split_url_prefix, idx, n_split);
 
             if (std::string(split_path) == model.path) {
@@ -626,50 +700,6 @@ static bool common_download_model(
     return true;
 }
 
-std::pair<long, std::vector<char>> common_remote_get_content(const std::string & url, const common_remote_params & params) {
-    curl_ptr curl(curl_easy_init(), &curl_easy_cleanup);
-    curl_slist_ptr http_headers;
-    std::vector<char> res_buffer;
-
-    curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str());
-    curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 1L);
-    curl_easy_setopt(curl.get(), CURLOPT_FOLLOWLOCATION, 1L);
-    typedef size_t(*CURLOPT_WRITEFUNCTION_PTR)(void * ptr, size_t size, size_t nmemb, void * data);
-    auto write_callback = [](void * ptr, size_t size, size_t nmemb, void * data) -> size_t {
-        auto data_vec = static_cast<std::vector<char> *>(data);
-        data_vec->insert(data_vec->end(), (char *)ptr, (char *)ptr + size * nmemb);
-        return size * nmemb;
-    };
-    curl_easy_setopt(curl.get(), CURLOPT_WRITEFUNCTION, static_cast<CURLOPT_WRITEFUNCTION_PTR>(write_callback));
-    curl_easy_setopt(curl.get(), CURLOPT_WRITEDATA, &res_buffer);
-#if defined(_WIN32)
-    curl_easy_setopt(curl.get(), CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA);
-#endif
-    if (params.timeout > 0) {
-        curl_easy_setopt(curl.get(), CURLOPT_TIMEOUT, params.timeout);
-    }
-    if (params.max_size > 0) {
-        curl_easy_setopt(curl.get(), CURLOPT_MAXFILESIZE, params.max_size);
-    }
-    http_headers.ptr = curl_slist_append(http_headers.ptr, "User-Agent: llama-cpp");
-    for (const auto & header : params.headers) {
-        http_headers.ptr = curl_slist_append(http_headers.ptr, header.c_str());
-    }
-    curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.ptr);
-
-    CURLcode res = curl_easy_perform(curl.get());
-
-    if (res != CURLE_OK) {
-        std::string error_msg = curl_easy_strerror(res);
-        throw std::runtime_error("error: cannot make GET request: " + error_msg);
-    }
-
-    long res_code;
-    curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &res_code);
-
-    return { res_code, std::move(res_buffer) };
-}
-
 /**
  * Allow getting the HF file from the HF repo with tag (like ollama), for example:
  * - bartowski/Llama-3.2-3B-Instruct-GGUF:q4
@@ -736,21 +766,17 @@ static struct common_hf_file_res common_get_hf_file(const std::string & hf_repo_
     std::string mmprojFile;
 
     if (res_code == 200 || res_code == 304) {
-        // extract ggufFile.rfilename in json, using regex
-        {
-            std::regex pattern("\"ggufFile\"[\\s\\S]*?\"rfilename\"\\s*:\\s*\"([^\"]+)\"");
-            std::smatch match;
-            if (std::regex_search(res_str, match, pattern)) {
-                ggufFile = match[1].str();
+        try {
+            auto j = json::parse(res_str);
+
+            if (j.contains("ggufFile") && j["ggufFile"].contains("rfilename")) {
+                ggufFile = j["ggufFile"]["rfilename"].get<std::string>();
             }
-        }
-        // extract mmprojFile.rfilename in json, using regex
-        {
-            std::regex pattern("\"mmprojFile\"[\\s\\S]*?\"rfilename\"\\s*:\\s*\"([^\"]+)\"");
-            std::smatch match;
-            if (std::regex_search(res_str, match, pattern)) {
-                mmprojFile = match[1].str();
+            if (j.contains("mmprojFile") && j["mmprojFile"].contains("rfilename")) {
+                mmprojFile = j["mmprojFile"]["rfilename"].get<std::string>();
             }
+        } catch (const std::exception & e) {
+            throw std::runtime_error(std::string("error parsing manifest JSON: ") + e.what());
         }
         if (!use_cache) {
             // if not using cached response, update the cache file
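The fragile regex scraping of the Hugging Face manifest is replaced with real JSON parsing through nlohmann's `ordered_json` (aliased as `json` at the top of the file). A standalone sketch of the same pattern, using a made-up manifest string:

```cpp
#include <nlohmann/json.hpp> // assumes the single-header nlohmann/json is available
#include <iostream>
#include <string>

using json = nlohmann::ordered_json;

int main() {
    // hypothetical manifest body, shaped like the HF response parsed above
    const std::string res_str = R"({
        "ggufFile":   { "rfilename": "model-Q4_K_M.gguf" },
        "mmprojFile": { "rfilename": "mmproj-F16.gguf" }
    })";

    try {
        auto j = json::parse(res_str);
        if (j.contains("ggufFile") && j["ggufFile"].contains("rfilename")) {
            std::cout << j["ggufFile"]["rfilename"].get<std::string>() << "\n";
        }
    } catch (const std::exception & e) {
        std::cerr << "error parsing manifest JSON: " << e.what() << "\n";
    }
    return 0;
}
```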
@@ -770,45 +796,6 @@ static struct common_hf_file_res common_get_hf_file(const std::string & hf_repo_
     return { hf_repo, ggufFile, mmprojFile };
 }
 
-#else
-
-bool common_has_curl() {
-    return false;
-}
-
-static bool common_download_file_single(const std::string &, const std::string &, const std::string &, bool) {
-    LOG_ERR("error: built without CURL, cannot download model from internet\n");
-    return false;
-}
-
-static bool common_download_file_multiple(const std::vector<std::pair<std::string, std::string>> &, const std::string &, bool) {
-    LOG_ERR("error: built without CURL, cannot download model from the internet\n");
-    return false;
-}
-
-static bool common_download_model(
-        const common_params_model &,
-        const std::string &,
-        bool) {
-    LOG_ERR("error: built without CURL, cannot download model from the internet\n");
-    return false;
-}
-
-static struct common_hf_file_res common_get_hf_file(const std::string &, const std::string &, bool) {
-    LOG_ERR("error: built without CURL, cannot download model from the internet\n");
-    return {};
-}
-
-std::pair<long, std::vector<char>> common_remote_get_content(const std::string & url, const common_remote_params &) {
-    if (!url.empty()) {
-        throw std::runtime_error("error: built without CURL, cannot download model from the internet");
-    }
-
-    return {};
-}
-
-#endif // LLAMA_USE_CURL
-
 //
 // Docker registry functions
 //
@@ -1068,8 +1055,6 @@ static std::string get_all_kv_cache_types() {
 //
 
 static bool common_params_parse_ex(int argc, char ** argv, common_params_context & ctx_arg) {
-    std::string arg;
-    const std::string arg_prefix = "--";
     common_params & params = ctx_arg.params;
 
     std::unordered_map<std::string, common_arg *> arg_to_options;
@@ -314,7 +314,8 @@ extern "C" {
|
|
|
314
314
|
GGML_API int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched);
|
|
315
315
|
GGML_API int ggml_backend_sched_get_n_copies(ggml_backend_sched_t sched);
|
|
316
316
|
|
|
317
|
-
GGML_API
|
|
317
|
+
GGML_API ggml_backend_buffer_type_t ggml_backend_sched_get_buffer_type(ggml_backend_sched_t sched, ggml_backend_t backend);
|
|
318
|
+
GGML_API size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);
|
|
318
319
|
|
|
319
320
|
GGML_API void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
|
|
320
321
|
GGML_API ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);
|
|
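`ggml_backend_sched_get_buffer_size` appears to have existed before (the removed line is truncated in this diff); the new `ggml_backend_sched_get_buffer_type` reports which buffer type a backend's compute buffer was allocated from, which the memory-breakdown code further down keys its map on. A hedged usage sketch, assuming `sched`, `backends`, and `n_backends` are already set up:

```cpp
// Sketch: report the compute-buffer size per backend managed by a scheduler.
for (int i = 0; i < n_backends; i++) {
    ggml_backend_t backend = backends[i];
    ggml_backend_buffer_type_t buft = ggml_backend_sched_get_buffer_type(sched, backend);
    size_t size = ggml_backend_sched_get_buffer_size(sched, backend);
    printf("%s: %zu bytes of compute buffer in %s\n",
           ggml_backend_name(backend), size, ggml_backend_buft_name(buft));
}
```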
package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp
CHANGED

@@ -878,7 +878,7 @@ static void gemm_q4_b32_8x8_q8_0_lut_avx(int n, float * GGML_RESTRICT s, size_t
     const __m256i rhs_raw_mat_89AB_1 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].qs + 64));
     const __m256i rhs_raw_mat_CDEF_1 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].qs + 96));
 
-    // Save the values in the following vectors in the formats B0B1B4B5, B2B3B6B7 for further processing and storing of
+    // Save the values in the following vectors in the formats B0B1B4B5, B2B3B6B7 for further processing and storing of values
     const __m256i rhs_raw_mat_0145_0 = _mm256_blend_epi32(rhs_raw_mat_0123_0, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_0, requiredOrder), 240);
     const __m256i rhs_raw_mat_2367_0 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_0, requiredOrder), rhs_raw_mat_4567_0, 240);
     const __m256i rhs_raw_mat_0145_1 = _mm256_blend_epi32(rhs_raw_mat_0123_1, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_1, requiredOrder), 240);
@@ -1231,7 +1231,7 @@ static void gemm_q4_b32_8x8_q8_0_lut_avx(int n, float * GGML_RESTRICT s, size_t
     const __m256i rhs_raw_mat_0123_1 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs + 64));
     const __m256i rhs_raw_mat_4567_1 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs + 96));
 
-    // Save the values in the following vectors in the formats B0B1B4B5, B2B3B6B7 for further processing and storing of
+    // Save the values in the following vectors in the formats B0B1B4B5, B2B3B6B7 for further processing and storing of values
     const __m256i rhs_raw_mat_0145_0 = _mm256_blend_epi32(rhs_raw_mat_0123_0, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_0, requiredOrder), 240);
     const __m256i rhs_raw_mat_2367_0 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_0, requiredOrder), rhs_raw_mat_4567_0, 240);
     const __m256i rhs_raw_mat_0145_1 = _mm256_blend_epi32(rhs_raw_mat_0123_1, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_1, requiredOrder), 240);
package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c
CHANGED

@@ -473,10 +473,10 @@ struct ggml_threadpool {
 struct ggml_compute_state {
 #ifndef GGML_USE_OPENMP
     ggml_thread_t thrd;
-    bool cpumask[GGML_MAX_N_THREADS];
     int last_graph;
     bool pending;
 #endif
+    bool cpumask[GGML_MAX_N_THREADS];
     struct ggml_threadpool * threadpool;
     int ith;
 };
@@ -3081,7 +3081,14 @@ static struct ggml_threadpool * ggml_threadpool_new_impl(
 
     threadpool->workers = workers;
 
-#ifndef GGML_USE_OPENMP
+#ifdef GGML_USE_OPENMP
+    int32_t cpumask_iter = 0;
+
+    // Compute CPU masks for each thread
+    for (int j = 0; j < tpp->n_threads; j++) {
+        ggml_thread_cpumask_next(tpp->cpumask, workers[j].cpumask, tpp->strict_cpu, &cpumask_iter);
+    }
+#else // GGML_USE_OPENMP
     ggml_mutex_init(&threadpool->mutex);
     ggml_cond_init(&threadpool->cond);
 
@@ -3154,7 +3161,14 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
                 atomic_store_explicit(&threadpool->n_threads_cur, n_threads, memory_order_relaxed);
             }
 
-            ggml_graph_compute_thread(&threadpool->workers[omp_get_thread_num()]);
+            // Apply thread CPU mask and priority
+            int ith = omp_get_thread_num();
+
+            ggml_thread_apply_priority(threadpool->prio);
+            if (ggml_thread_cpumask_is_valid(threadpool->workers[ith].cpumask)) {
+                ggml_thread_apply_affinity(threadpool->workers[ith].cpumask);
+            }
+            ggml_graph_compute_thread(&threadpool->workers[ith]);
         }
     } else {
         atomic_store_explicit(&threadpool->n_threads_cur, 1, memory_order_relaxed);
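With `cpumask` moved out of the `#ifndef GGML_USE_OPENMP` section, OpenMP builds now also compute one CPU mask per worker at pool creation and apply mask plus priority inside the parallel region. A self-contained sketch of that pattern; the `ggml_thread_*` helpers are internal, so the ones below are simplified stand-ins:

```cpp
#include <omp.h>
#include <array>
#include <cstdio>
#include <vector>

constexpr int MAX_CPUS = 512;
using cpumask_t = std::array<bool, MAX_CPUS>;

// stand-in for ggml_thread_cpumask_next: round-robin one CPU per worker
static void next_mask(cpumask_t & mask, int & iter) {
    mask.fill(false);
    mask[iter++ % MAX_CPUS] = true;
}

// stand-in for ggml_thread_apply_affinity (the real code calls the OS affinity API)
static void apply_affinity(const cpumask_t & mask) {
    for (int c = 0; c < MAX_CPUS; c++) {
        if (mask[c]) std::printf("thread %d pinned to cpu %d\n", omp_get_thread_num(), c);
    }
}

int main() {
    const int n_threads = omp_get_max_threads();
    std::vector<cpumask_t> masks(n_threads);

    int iter = 0;
    for (int j = 0; j < n_threads; j++) {
        next_mask(masks[j], iter); // computed once, at "pool creation"
    }

    #pragma omp parallel num_threads(n_threads)
    {
        const int ith = omp_get_thread_num();
        apply_affinity(masks[ith]); // previously only the non-OpenMP path did this
        // ... per-thread graph compute ...
    }
    return 0;
}
```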
package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp
CHANGED

@@ -4739,6 +4739,7 @@ void ggml_compute_forward_get_rows(
     //}
 }
 
+template<typename idx_t>
 static void ggml_compute_forward_set_rows_f32(
         const ggml_compute_params * params,
         ggml_tensor * dst) {
@@ -4777,7 +4778,7 @@ static void ggml_compute_forward_set_rows_f32(
                 const int64_t i11 = i02%ne11;
                 const int64_t i10 = i;
 
-                const int64_t i1 = *(int64_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
+                const int64_t i1 = *(idx_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
 
                 GGML_ASSERT(i1 >= 0 && i1 < ne1);
 
@@ -4794,11 +4795,18 @@ void ggml_compute_forward_set_rows(
         ggml_tensor * dst) {
 
     const ggml_tensor * src0 = dst->src[0];
+    const ggml_tensor * src1 = dst->src[1];
 
     switch (src0->type) {
         case GGML_TYPE_F32:
             {
-                ggml_compute_forward_set_rows_f32(params, dst);
+                if (src1->type == GGML_TYPE_I64) {
+                    ggml_compute_forward_set_rows_f32<int64_t>(params, dst);
+                } else if (src1->type == GGML_TYPE_I32) {
+                    ggml_compute_forward_set_rows_f32<int32_t>(params, dst);
+                } else {
+                    GGML_ABORT("src1->type = %d (%s) not supported", src1->type, ggml_type_name(src1->type));
+                }
             } break;
         default:
             {
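`ggml_compute_forward_set_rows_f32` is now templated on the index type, so SET_ROWS accepts both `GGML_TYPE_I64` and `GGML_TYPE_I32` row indices without duplicating the kernel. The dispatch pattern in isolation (a self-contained sketch with hypothetical names, not ggml's API):

```cpp
#include <cstdint>

// Kernel templated on the index type: the same body serves 32- and 64-bit indices.
template <typename idx_t>
static void set_rows(float * dst, int ncols, const float * src, const idx_t * idx, int nrows) {
    for (int r = 0; r < nrows; r++) {
        const int64_t i1 = (int64_t) idx[r]; // destination row, widened for bounds checks
        for (int c = 0; c < ncols; c++) {
            dst[i1 * ncols + c] = src[r * ncols + c];
        }
    }
}

enum idx_type { IDX_I32, IDX_I64 };

// The runtime tensor type is mapped to a compile-time template instantiation.
static void set_rows_dispatch(float * dst, int ncols, const float * src,
                              const void * idx, idx_type t, int nrows) {
    switch (t) {
        case IDX_I32: set_rows<int32_t>(dst, ncols, src, (const int32_t *) idx, nrows); break;
        case IDX_I64: set_rows<int64_t>(dst, ncols, src, (const int64_t *) idx, nrows); break;
    }
}
```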
@@ -1329,24 +1329,25 @@ extern "C" {
|
|
|
1329
1329
|
//
|
|
1330
1330
|
// Performance utils
|
|
1331
1331
|
//
|
|
1332
|
-
// NOTE: Used by llama.cpp examples, avoid using in third-party apps. Instead, do your own performance measurements.
|
|
1332
|
+
// NOTE: Used by llama.cpp examples/tools, avoid using in third-party apps. Instead, do your own performance measurements.
|
|
1333
1333
|
//
|
|
1334
1334
|
|
|
1335
1335
|
struct llama_perf_context_data {
|
|
1336
|
-
|
|
1337
|
-
double
|
|
1338
|
-
double
|
|
1339
|
-
double
|
|
1340
|
-
|
|
1341
|
-
|
|
1342
|
-
int32_t
|
|
1343
|
-
int32_t
|
|
1336
|
+
// ms == milliseconds
|
|
1337
|
+
double t_start_ms; // absolute start time
|
|
1338
|
+
double t_load_ms; // time needed for loading the model
|
|
1339
|
+
double t_p_eval_ms; // time needed for processing the prompt
|
|
1340
|
+
double t_eval_ms; // time needed for generating tokens
|
|
1341
|
+
|
|
1342
|
+
int32_t n_p_eval; // number of prompt tokens
|
|
1343
|
+
int32_t n_eval; // number of generated tokens
|
|
1344
|
+
int32_t n_reused; // number of times a ggml compute graph had been reused
|
|
1344
1345
|
};
|
|
1345
1346
|
|
|
1346
1347
|
struct llama_perf_sampler_data {
|
|
1347
|
-
double t_sample_ms;
|
|
1348
|
+
double t_sample_ms; // time needed for sampling in ms
|
|
1348
1349
|
|
|
1349
|
-
int32_t n_sample;
|
|
1350
|
+
int32_t n_sample; // number of sampled tokens
|
|
1350
1351
|
};
|
|
1351
1352
|
|
|
1352
1353
|
LLAMA_API struct llama_perf_context_data llama_perf_context (const struct llama_context * ctx);
|
|
@@ -1358,6 +1359,9 @@ extern "C" {
|
|
|
1358
1359
|
LLAMA_API void llama_perf_sampler_print(const struct llama_sampler * chain);
|
|
1359
1360
|
LLAMA_API void llama_perf_sampler_reset( struct llama_sampler * chain);
|
|
1360
1361
|
|
|
1362
|
+
// print a breakdown of per-device memory use via LLAMA_LOG:
|
|
1363
|
+
LLAMA_API void llama_memory_breakdown_print(const struct llama_context * ctx);
|
|
1364
|
+
|
|
1361
1365
|
//
|
|
1362
1366
|
// training
|
|
1363
1367
|
//
|
|
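Besides the field comments, the header gains `llama_memory_breakdown_print`. A hedged end-to-end sketch using standard llama.h entry points (the model path is a placeholder; error handling omitted):

```cpp
#include "llama.h"

int main() {
    llama_model * model = llama_model_load_from_file("model.gguf", llama_model_default_params());
    llama_context * ctx = llama_init_from_model(model, llama_context_default_params());

    // ... prompt processing / token generation ...

    llama_perf_context_print(ctx);     // existing counters, now including n_reused
    llama_memory_breakdown_print(ctx); // new: per-device model/context/compute table

    llama_free(ctx);
    llama_model_free(model);
    return 0;
}
```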
package/src/llama.cpp/src/llama-context.cpp
CHANGED

@@ -2027,6 +2027,21 @@ void llama_context::perf_reset() {
     n_reused = 0;
 }
 
+std::map<ggml_backend_buffer_type_t, llama_memory_breakdown_data> llama_context::memory_breakdown() const {
+    std::map<ggml_backend_buffer_type_t, llama_memory_breakdown_data> ret;
+    for (const auto & buft_size : model.memory_breakdown()) {
+        ret[buft_size.first].model += buft_size.second;
+    }
+    for (const auto & buft_size : memory->memory_breakdown()) {
+        ret[buft_size.first].context += buft_size.second;
+    }
+    for (const auto & backend_ptr : backends) {
+        ggml_backend_t backend = backend_ptr.get();
+        ret[ggml_backend_sched_get_buffer_type(sched.get(), backend)].compute += ggml_backend_sched_get_buffer_size(sched.get(), backend);
+    }
+    return ret;
+}
+
 //
 // training
 //
@@ -2765,6 +2780,142 @@ void llama_perf_context_reset(llama_context * ctx) {
     ctx->perf_reset();
 }
 
+void llama_memory_breakdown_print(const struct llama_context * ctx) {
+    const std::vector<ggml_backend_dev_t> & devices = ctx->get_model().devices;
+
+    std::map<ggml_backend_buffer_type_t, llama_memory_breakdown_data> memory_breakdown = ctx->memory_breakdown();
+
+    std::vector<std::array<std::string, 9>> table_data;
+    table_data.reserve(devices.size());
+    const std::string template_header = "%s: | %s | %s %s %s %s %s %s %s |\n";
+    const std::string template_gpu    = "%s: | %s | %s = %s + (%s = %s + %s + %s) + %s |\n";
+    const std::string template_other  = "%s: | %s | %s %s %s = %s + %s + %s %s |\n";
+
+    table_data.push_back({template_header, "memory breakdown [MiB]", "total", "free", "self", "model", "context", "compute", "unaccounted"});
+
+    constexpr size_t MiB = 1024 * 1024;
+    const std::vector<std::string> desc_prefixes_strip = {"NVIDIA ", "GeForce ", "Tesla ", "AMD ", "Radeon ", "Instinct "};
+
+    // track seen buffer types to avoid double counting:
+    std::set<ggml_backend_buffer_type_t> seen_buffer_types;
+
+    // accumulative memory breakdown for each device and for host:
+    std::vector<llama_memory_breakdown_data> mb_dev(devices.size());
+    llama_memory_breakdown_data mb_host;
+
+    for (const auto & buft_mb : memory_breakdown) {
+        ggml_backend_buffer_type_t buft = buft_mb.first;
+        const llama_memory_breakdown_data & mb = buft_mb.second;
+        if (ggml_backend_buft_is_host(buft)) {
+            mb_host.model   += mb.model;
+            mb_host.context += mb.context;
+            mb_host.compute += mb.compute;
+            seen_buffer_types.insert(buft);
+            continue;
+        }
+        ggml_backend_dev_t dev = ggml_backend_buft_get_device(buft);
+        if (dev) {
+            int i_dev = -1;
+            for (size_t i = 0; i < devices.size(); i++) {
+                if (devices[i] == dev) {
+                    i_dev = i;
+                    break;
+                }
+            }
+            if (i_dev != -1) {
+                mb_dev[i_dev].model   += mb.model;
+                mb_dev[i_dev].context += mb.context;
+                mb_dev[i_dev].compute += mb.compute;
+                seen_buffer_types.insert(buft);
+                continue;
+            }
+        }
+    }
+
+    // print memory breakdown for each device:
+    for (size_t i = 0; i < devices.size(); i++) {
+        ggml_backend_dev_t dev = devices[i];
+        llama_memory_breakdown_data mb = mb_dev[i];
+
+        const std::string name = ggml_backend_dev_name(dev);
+        std::string desc = ggml_backend_dev_description(dev);
+        for (const std::string & prefix : desc_prefixes_strip) {
+            if (desc.length() >= prefix.length() && desc.substr(0, prefix.length()) == prefix) {
+                desc = desc.substr(prefix.length());
+            }
+        }
+
+        size_t free, total;
+        ggml_backend_dev_memory(dev, &free, &total);
+
+        const size_t self = mb.model + mb.context + mb.compute;
+        const size_t unaccounted = total - self - free;
+
+        table_data.push_back({
+            template_gpu,
+            " - " + name + " (" + desc + ")",
+            std::to_string(total / MiB),
+            std::to_string(free / MiB),
+            std::to_string(self / MiB),
+            std::to_string(mb.model / MiB),
+            std::to_string(mb.context / MiB),
+            std::to_string(mb.compute / MiB),
+            std::to_string(unaccounted / MiB)});
+    }
+
+    // print memory breakdown for host:
+    {
+        const size_t self = mb_host.model + mb_host.context + mb_host.compute;
+        table_data.push_back({
+            template_other,
+            " - Host",
+            "", // total
+            "", // free
+            std::to_string(self / MiB),
+            std::to_string(mb_host.model / MiB),
+            std::to_string(mb_host.context / MiB),
+            std::to_string(mb_host.compute / MiB),
+            ""}); // unaccounted
+    }
+
+    // print memory breakdown for all remaining buffer types:
+    for (const auto & buft_mb : memory_breakdown) {
+        ggml_backend_buffer_type_t buft = buft_mb.first;
+        const llama_memory_breakdown_data & mb = buft_mb.second;
+        if (seen_buffer_types.count(buft) == 1) {
+            continue;
+        }
+        const std::string name = ggml_backend_buft_name(buft);
+        const size_t self = mb.model + mb.context + mb.compute;
+        table_data.push_back({
+            template_other,
+            " - " + name,
+            "", // total
+            "", // free
+            std::to_string(self / MiB),
+            std::to_string(mb.model / MiB),
+            std::to_string(mb.context / MiB),
+            std::to_string(mb.compute / MiB),
+            ""}); // unaccounted
+        seen_buffer_types.insert(buft);
+    }
+
+    for (size_t j = 1; j < table_data[0].size(); j++) {
+        size_t max_len = 0;
+        for (const auto & td : table_data) {
+            max_len = std::max(max_len, td[j].length());
+        }
+        for (auto & td : table_data) {
+            td[j].insert(j == 1 ? td[j].length() : 0, max_len - td[j].length(), ' ');
+        }
+    }
+    for (const auto & td : table_data) {
+        LLAMA_LOG_INFO(td[0].c_str(),
+            __func__, td[1].c_str(), td[2].c_str(), td[3].c_str(), td[4].c_str(), td[5].c_str(),
+            td[6].c_str(), td[7].c_str(), td[8].c_str());
+    }
+}
+
 //
 // training
 //
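`llama_context::memory_breakdown()` folds three sources into one map keyed by buffer type: model weights, the memory module's buffers (counted as `context`), and the scheduler's compute buffers. The `+=` accumulation needs no explicit initialization because `std::map::operator[]` value-initializes a missing entry and the struct's members default to zero; in isolation:

```cpp
#include <cstddef>
#include <cstdio>
#include <map>

// same zero defaults as llama_memory_breakdown_data
struct breakdown { size_t model = 0; size_t context = 0; size_t compute = 0; };

int main() {
    std::map<int, breakdown> per_buft; // int stands in for ggml_backend_buffer_type_t
    per_buft[0].model   += 4096;       // first access default-constructs the entry
    per_buft[0].context += 512;
    per_buft[1].compute += 256;
    for (const auto & kv : per_buft) {
        std::printf("buft %d: model=%zu context=%zu compute=%zu\n",
                    kv.first, kv.second.model, kv.second.context, kv.second.compute);
    }
    return 0;
}
```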
package/src/llama.cpp/src/llama-context.h
CHANGED

@@ -17,9 +17,17 @@ class llama_batch_allocr;
 class llama_io_read_i;
 class llama_io_write_i;
 
+// "memory" as in abstract memory for the context
 struct llama_memory_i;
 struct llama_memory_context_i;
 
+// "memory" as in physical memory for a buffer type, in bytes
+struct llama_memory_breakdown_data {
+    size_t model = 0;   // memory allocated for the model
+    size_t context = 0; // memory allocated for the context
+    size_t compute = 0; // memory allocated for temporary compute buffers
+};
+
 struct llama_context {
     // init scheduler and compute buffers, reserve worst-case graphs
     llama_context(
@@ -144,6 +152,8 @@ struct llama_context {
     llama_perf_context_data perf_get_data() const;
     void perf_reset();
 
+    std::map<ggml_backend_buffer_type_t, llama_memory_breakdown_data> memory_breakdown() const;
+
     //
     // training
     //
package/src/llama.cpp/src/llama-kv-cache-iswa.cpp
CHANGED

@@ -113,6 +113,14 @@ llama_pos llama_kv_cache_iswa::seq_pos_max(llama_seq_id seq_id) const {
     return kv_swa->seq_pos_max(seq_id);
 }
 
+std::map<ggml_backend_buffer_type_t, size_t> llama_kv_cache_iswa::memory_breakdown() const {
+    std::map<ggml_backend_buffer_type_t, size_t> mb = kv_base->memory_breakdown();
+    for (const auto & buft_size : kv_swa->memory_breakdown()) {
+        mb[buft_size.first] += buft_size.second;
+    }
+    return mb;
+}
+
 llama_memory_context_ptr llama_kv_cache_iswa::init_batch(llama_batch_allocr & balloc, uint32_t n_ubatch, bool embd_all) {
     GGML_UNUSED(embd_all);
 
package/src/llama.cpp/src/llama-kv-cache-iswa.h
CHANGED

@@ -56,6 +56,8 @@ public:
     llama_pos seq_pos_min(llama_seq_id seq_id) const override;
     llama_pos seq_pos_max(llama_seq_id seq_id) const override;
 
+    std::map<ggml_backend_buffer_type_t, size_t> memory_breakdown() const override;
+
     // state write/load
 
     void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) const override;
package/src/llama.cpp/src/llama-kv-cache.cpp
CHANGED

@@ -473,6 +473,14 @@ llama_pos llama_kv_cache::seq_pos_max(llama_seq_id seq_id) const {
     return cells.seq_pos_max(seq_id);
 }
 
+std::map<ggml_backend_buffer_type_t, size_t> llama_kv_cache::memory_breakdown() const {
+    std::map<ggml_backend_buffer_type_t, size_t> ret;
+    for (const ggml_backend_buffer_ptr & buf_ptr : bufs) {
+        ret[ggml_backend_buffer_get_type(buf_ptr.get())] += ggml_backend_buffer_get_size(buf_ptr.get());
+    }
+    return ret;
+}
+
 llama_memory_context_ptr llama_kv_cache::init_batch(
         llama_batch_allocr & balloc,
         uint32_t n_ubatch,
package/src/llama.cpp/src/llama-kv-cache.h
CHANGED

@@ -121,6 +121,8 @@ public:
     llama_pos seq_pos_min(llama_seq_id seq_id) const override;
     llama_pos seq_pos_max(llama_seq_id seq_id) const override;
 
+    std::map<ggml_backend_buffer_type_t, size_t> memory_breakdown() const override;
+
     // state write/load
 
     void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) const override;
package/src/llama.cpp/src/llama-memory-hybrid.cpp
CHANGED

@@ -166,6 +166,14 @@ llama_pos llama_memory_hybrid::seq_pos_max(llama_seq_id seq_id) const {
     return std::min(mem_attn->seq_pos_max(seq_id), mem_recr->seq_pos_max(seq_id));
 }
 
+std::map<ggml_backend_buffer_type_t, size_t> llama_memory_hybrid::memory_breakdown() const {
+    std::map<ggml_backend_buffer_type_t, size_t> mb = mem_attn->memory_breakdown();
+    for (const auto & buft_size : mem_recr->memory_breakdown()) {
+        mb[buft_size.first] += buft_size.second;
+    }
+    return mb;
+}
+
 void llama_memory_hybrid::state_write(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) const {
     GGML_UNUSED(flags);
 
package/src/llama.cpp/src/llama-memory-hybrid.h
CHANGED

@@ -68,6 +68,8 @@ public:
     llama_pos seq_pos_min(llama_seq_id seq_id) const override;
     llama_pos seq_pos_max(llama_seq_id seq_id) const override;
 
+    std::map<ggml_backend_buffer_type_t, size_t> memory_breakdown() const override;
+
     // state write/load
 
     void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) const override;
package/src/llama.cpp/src/llama-memory-recurrent.cpp
CHANGED

@@ -359,6 +359,14 @@ llama_pos llama_memory_recurrent::seq_pos_max(llama_seq_id seq_id) const {
     return result;
 }
 
+std::map<ggml_backend_buffer_type_t, size_t> llama_memory_recurrent::memory_breakdown() const {
+    std::map<ggml_backend_buffer_type_t, size_t> ret;
+    for (const ggml_backend_buffer_ptr & buf_ptr : bufs) {
+        ret[ggml_backend_buffer_get_type(buf_ptr.get())] += ggml_backend_buffer_get_size(buf_ptr.get());
+    }
+    return ret;
+}
+
 llama_memory_context_ptr llama_memory_recurrent::init_batch(llama_batch_allocr & balloc, uint32_t n_ubatch, bool embd_all) {
     do {
         balloc.split_reset();
package/src/llama.cpp/src/llama-memory-recurrent.h
CHANGED

@@ -4,6 +4,7 @@
 #include "llama-graph.h"
 #include "llama-memory.h"
 
+#include <map>
 #include <set>
 #include <vector>
 
@@ -50,6 +51,8 @@ public:
     llama_pos seq_pos_min(llama_seq_id seq_id) const override;
     llama_pos seq_pos_max(llama_seq_id seq_id) const override;
 
+    std::map<ggml_backend_buffer_type_t, size_t> memory_breakdown() const override;
+
     bool prepare(const std::vector<llama_ubatch> & ubatches);
 
     // find a contiguous slot of memory cells and emplace the ubatch there
package/src/llama.cpp/src/llama-memory.h
CHANGED

@@ -2,6 +2,7 @@
 
 #include "llama.h"
 
+#include <map>
 #include <memory>
 #include <functional>
 
@@ -108,6 +109,8 @@ struct llama_memory_i {
     virtual llama_pos seq_pos_min(llama_seq_id seq_id) const = 0;
     virtual llama_pos seq_pos_max(llama_seq_id seq_id) const = 0;
 
+    virtual std::map<ggml_backend_buffer_type_t, size_t> memory_breakdown() const = 0;
+
     //
     // state write/read
     //
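The remaining files repeat one pattern: every concrete memory module must now answer `memory_breakdown()`. Leaves (the plain KV cache, the recurrent cache, the model itself) sum their allocated buffers; composites (iswa, hybrid) merge their children's maps. A stand-in sketch of the shape:

```cpp
#include <cstddef>
#include <map>
#include <memory>

using buft_t = int; // stand-in for ggml_backend_buffer_type_t

struct memory_i {
    virtual ~memory_i() = default;
    virtual std::map<buft_t, size_t> memory_breakdown() const = 0;
};

// leaf: reports the buffers it allocated itself
struct leaf_cache : memory_i {
    std::map<buft_t, size_t> bufs; // filled as buffers are allocated
    std::map<buft_t, size_t> memory_breakdown() const override { return bufs; }
};

// composite: merges its children, summing sizes for shared buffer types
struct hybrid_cache : memory_i {
    std::unique_ptr<memory_i> attn, recr;
    std::map<buft_t, size_t> memory_breakdown() const override {
        std::map<buft_t, size_t> mb = attn->memory_breakdown();
        for (const auto & kv : recr->memory_breakdown()) {
            mb[kv.first] += kv.second;
        }
        return mb;
    }
};
```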
package/src/llama.cpp/src/llama-model.cpp
CHANGED

@@ -66,6 +66,7 @@ const char * llm_type_name(llm_type type) {
         case LLM_TYPE_1_7B: return "1.7B";
         case LLM_TYPE_1_8B: return "1.8B";
         case LLM_TYPE_2B: return "2B";
+        case LLM_TYPE_2_6B: return "2.6B";
         case LLM_TYPE_2_8B: return "2.8B";
         case LLM_TYPE_2_9B: return "2.9B";
         case LLM_TYPE_3B: return "3B";
@@ -1977,10 +1978,11 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                 for (uint32_t il = 0; il < hparams.n_layer; ++il) {
                     hparams.recurrent_layer_arr[il] = hparams.n_head_kv(il) == 0;
                 }
-                switch (hparams.n_embd) {
-                    case 1024: type = LLM_TYPE_350M; break;
-                    case 1536: type = LLM_TYPE_700M; break;
-                    case 2048: type = LLM_TYPE_1_2B; break;
+                switch (hparams.n_ff()) {
+                    case 4608: type = LLM_TYPE_350M; break;
+                    case 6912: type = LLM_TYPE_700M; break;
+                    case 8192: type = LLM_TYPE_1_2B; break;
+                    case 10752: type = LLM_TYPE_2_6B; break;
                     default: type = LLM_TYPE_UNKNOWN;
                 }
             } break;
@@ -6003,6 +6005,14 @@ size_t llama_model::n_devices() const {
     return devices.size();
 }
 
+std::map<ggml_backend_buffer_type_t, size_t> llama_model::memory_breakdown() const {
+    std::map<ggml_backend_buffer_type_t, size_t> ret;
+    for (const ggml_backend_buffer_ptr & buf_ptr : pimpl->bufs) {
+        ret[ggml_backend_buffer_get_type(buf_ptr.get())] += ggml_backend_buffer_get_size(buf_ptr.get());
+    }
+    return ret;
+}
+
 uint64_t llama_model::n_elements() const {
     return pimpl->n_elements;
 }
package/src/llama.cpp/src/llama-model.h
CHANGED

@@ -7,6 +7,7 @@
 #include "llama-memory.h"
 #include "llama-vocab.h"
 
+#include <map>
 #include <memory>
 #include <string>
 #include <unordered_map>
@@ -58,6 +59,7 @@ enum llm_type {
     LLM_TYPE_1_7B,
     LLM_TYPE_1_8B,
     LLM_TYPE_2B,
+    LLM_TYPE_2_6B,
     LLM_TYPE_2_8B,
     LLM_TYPE_2_9B,
     LLM_TYPE_3B,
@@ -452,10 +454,12 @@ struct llama_model {
 
     std::string desc() const;
 
-    size_t size() const;
+    size_t size() const; // file size
     size_t n_tensors() const;
     size_t n_devices() const;
 
+    std::map<ggml_backend_buffer_type_t, size_t> memory_breakdown() const;
+
     // total number of parameters in the model
     uint64_t n_elements() const;
 