@fugood/llama.node 1.2.1 → 1.2.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +14 -14
- package/src/llama.cpp/common/arg.cpp +359 -310
- package/src/llama.cpp/common/chat.cpp +27 -15
- package/src/llama.cpp/common/common.cpp +1 -0
- package/src/llama.cpp/common/sampling.cpp +1 -0
- package/src/llama.cpp/ggml/CMakeLists.txt +37 -21
- package/src/llama.cpp/ggml/include/ggml-backend.h +2 -1
- package/src/llama.cpp/ggml/include/ggml-zdnn.h +3 -0
- package/src/llama.cpp/ggml/src/CMakeLists.txt +3 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +4 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +2 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/common.h +14 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +17 -3
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +93 -862
- package/src/llama.cpp/include/llama.h +15 -11
- package/src/llama.cpp/src/llama-context.cpp +151 -0
- package/src/llama.cpp/src/llama-context.h +10 -0
- package/src/llama.cpp/src/llama-cparams.h +1 -1
- package/src/llama.cpp/src/llama-kv-cache-iswa.cpp +8 -0
- package/src/llama.cpp/src/llama-kv-cache-iswa.h +2 -0
- package/src/llama.cpp/src/llama-kv-cache.cpp +8 -0
- package/src/llama.cpp/src/llama-kv-cache.h +2 -0
- package/src/llama.cpp/src/llama-memory-hybrid.cpp +8 -0
- package/src/llama.cpp/src/llama-memory-hybrid.h +2 -0
- package/src/llama.cpp/src/llama-memory-recurrent.cpp +8 -0
- package/src/llama.cpp/src/llama-memory-recurrent.h +3 -0
- package/src/llama.cpp/src/llama-memory.h +3 -0
- package/src/llama.cpp/src/llama-model.cpp +14 -4
- package/src/llama.cpp/src/llama-model.h +5 -1

package/src/llama.cpp/include/llama.h
@@ -1329,24 +1329,25 @@ extern "C" {
     //
     // Performance utils
     //
-    // NOTE: Used by llama.cpp examples, avoid using in third-party apps. Instead, do your own performance measurements.
+    // NOTE: Used by llama.cpp examples/tools, avoid using in third-party apps. Instead, do your own performance measurements.
     //
 
     struct llama_perf_context_data {
-        double t_start_ms;
-        double t_load_ms;
-        double t_p_eval_ms;
-        double t_eval_ms;
-
-        int32_t n_p_eval;
-        int32_t n_eval;
-        int32_t n_reused; // number of times a ggml compute graph had been reused
+        // ms == milliseconds
+        double t_start_ms; // absolute start time
+        double t_load_ms; // time needed for loading the model
+        double t_p_eval_ms; // time needed for processing the prompt
+        double t_eval_ms; // time needed for generating tokens
+
+        int32_t n_p_eval; // number of prompt tokens
+        int32_t n_eval; // number of generated tokens
+        int32_t n_reused; // number of times a ggml compute graph had been reused
     };
 
     struct llama_perf_sampler_data {
-        double t_sample_ms;
+        double t_sample_ms; // time needed for sampling in ms
 
-        int32_t n_sample;
+        int32_t n_sample; // number of sampled tokens
     };
 
     LLAMA_API struct llama_perf_context_data llama_perf_context (const struct llama_context * ctx);

@@ -1358,6 +1359,9 @@ extern "C" {
     LLAMA_API void llama_perf_sampler_print(const struct llama_sampler * chain);
     LLAMA_API void llama_perf_sampler_reset( struct llama_sampler * chain);
 
+    // print a breakdown of per-device memory use via LLAMA_LOG:
+    LLAMA_API void llama_memory_breakdown_print(const struct llama_context * ctx);
+
     //
     // training
     //
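
For orientation, a minimal usage sketch of the API surface touched by the llama.h hunks above. This is not part of the diff; it assumes an already-initialized llama_context * ctx and uses only the declarations shown in the hunks (the helper name report_stats is hypothetical):

    #include "llama.h"
    #include <cstdio>

    // Report the performance counters, then the new per-device memory table.
    static void report_stats(const struct llama_context * ctx) {
        const struct llama_perf_context_data pd = llama_perf_context(ctx);
        std::fprintf(stderr, "prompt: %d tokens in %.2f ms, generated: %d tokens in %.2f ms, graph reuses: %d\n",
                     (int) pd.n_p_eval, pd.t_p_eval_ms, (int) pd.n_eval, pd.t_eval_ms, (int) pd.n_reused);

        // New in this version: prints a per-device breakdown via LLAMA_LOG.
        llama_memory_breakdown_print(ctx);
    }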

package/src/llama.cpp/src/llama-context.cpp
@@ -2027,6 +2027,21 @@ void llama_context::perf_reset() {
     n_reused = 0;
 }
 
+std::map<ggml_backend_buffer_type_t, llama_memory_breakdown_data> llama_context::memory_breakdown() const {
+    std::map<ggml_backend_buffer_type_t, llama_memory_breakdown_data> ret;
+    for (const auto & buft_size : model.memory_breakdown()) {
+        ret[buft_size.first].model += buft_size.second;
+    }
+    for (const auto & buft_size : memory->memory_breakdown()) {
+        ret[buft_size.first].context += buft_size.second;
+    }
+    for (const auto & backend_ptr : backends) {
+        ggml_backend_t backend = backend_ptr.get();
+        ret[ggml_backend_sched_get_buffer_type(sched.get(), backend)].compute += ggml_backend_sched_get_buffer_size(sched.get(), backend);
+    }
+    return ret;
+}
+
 //
 // training
 //
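
The accessor added above returns one llama_memory_breakdown_data entry per ggml buffer type, with bytes for model weights, context state (KV cache or recurrent state), and compute buffers accumulated separately. A hedged sketch, not taken from the diff, of collapsing such a map into a single total from library-internal code:

    // Assumes a llama_context `ctx` visible to code inside the library.
    size_t total_bytes = 0;
    for (const auto & entry : ctx->memory_breakdown()) {
        const llama_memory_breakdown_data & mb = entry.second;
        total_bytes += mb.model + mb.context + mb.compute;
    }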

@@ -2765,6 +2780,142 @@ void llama_perf_context_reset(llama_context * ctx) {
     ctx->perf_reset();
 }
 
+void llama_memory_breakdown_print(const struct llama_context * ctx) {
+    const std::vector<ggml_backend_dev_t> & devices = ctx->get_model().devices;
+
+    std::map<ggml_backend_buffer_type_t, llama_memory_breakdown_data> memory_breakdown = ctx->memory_breakdown();
+
+    std::vector<std::array<std::string, 9>> table_data;
+    table_data.reserve(devices.size());
+    const std::string template_header = "%s: | %s | %s %s %s %s %s %s %s |\n";
+    const std::string template_gpu = "%s: | %s | %s = %s + (%s = %s + %s + %s) + %s |\n";
+    const std::string template_other = "%s: | %s | %s %s %s = %s + %s + %s %s |\n";
+
+    table_data.push_back({template_header, "memory breakdown [MiB]", "total", "free", "self", "model", "context", "compute", "unaccounted"});
+
+    constexpr size_t MiB = 1024 * 1024;
+    const std::vector<std::string> desc_prefixes_strip = {"NVIDIA ", "GeForce ", "Tesla ", "AMD ", "Radeon ", "Instinct "};
+
+    // track seen buffer types to avoid double counting:
+    std::set<ggml_backend_buffer_type_t> seen_buffer_types;
+
+    // accumulative memory breakdown for each device and for host:
+    std::vector<llama_memory_breakdown_data> mb_dev(devices.size());
+    llama_memory_breakdown_data mb_host;
+
+    for (const auto & buft_mb : memory_breakdown) {
+        ggml_backend_buffer_type_t buft = buft_mb.first;
+        const llama_memory_breakdown_data & mb = buft_mb.second;
+        if (ggml_backend_buft_is_host(buft)) {
+            mb_host.model += mb.model;
+            mb_host.context += mb.context;
+            mb_host.compute += mb.compute;
+            seen_buffer_types.insert(buft);
+            continue;
+        }
+        ggml_backend_dev_t dev = ggml_backend_buft_get_device(buft);
+        if (dev) {
+            int i_dev = -1;
+            for (size_t i = 0; i < devices.size(); i++) {
+                if (devices[i] == dev) {
+                    i_dev = i;
+                    break;
+                }
+            }
+            if (i_dev != -1) {
+                mb_dev[i_dev].model += mb.model;
+                mb_dev[i_dev].context += mb.context;
+                mb_dev[i_dev].compute += mb.compute;
+                seen_buffer_types.insert(buft);
+                continue;
+            }
+        }
+    }
+
+    // print memory breakdown for each device:
+    for (size_t i = 0; i < devices.size(); i++) {
+        ggml_backend_dev_t dev = devices[i];
+        llama_memory_breakdown_data mb = mb_dev[i];
+
+        const std::string name = ggml_backend_dev_name(dev);
+        std::string desc = ggml_backend_dev_description(dev);
+        for (const std::string & prefix : desc_prefixes_strip) {
+            if (desc.length() >= prefix.length() && desc.substr(0, prefix.length()) == prefix) {
+                desc = desc.substr(prefix.length());
+            }
+        }
+
+        size_t free, total;
+        ggml_backend_dev_memory(dev, &free, &total);
+
+        const size_t self = mb.model + mb.context + mb.compute;
+        const size_t unaccounted = total - self - free;
+
+        table_data.push_back({
+            template_gpu,
+            " - " + name + " (" + desc + ")",
+            std::to_string(total / MiB),
+            std::to_string(free / MiB),
+            std::to_string(self / MiB),
+            std::to_string(mb.model / MiB),
+            std::to_string(mb.context / MiB),
+            std::to_string(mb.compute / MiB),
+            std::to_string(unaccounted / MiB)});
+    }
+
+    // print memory breakdown for host:
+    {
+        const size_t self = mb_host.model + mb_host.context + mb_host.compute;
+        table_data.push_back({
+            template_other,
+            " - Host",
+            "", // total
+            "", // free
+            std::to_string(self / MiB),
+            std::to_string(mb_host.model / MiB),
+            std::to_string(mb_host.context / MiB),
+            std::to_string(mb_host.compute / MiB),
+            ""}); // unaccounted
+    }
+
+    // print memory breakdown for all remaining buffer types:
+    for (const auto & buft_mb : memory_breakdown) {
+        ggml_backend_buffer_type_t buft = buft_mb.first;
+        const llama_memory_breakdown_data & mb = buft_mb.second;
+        if (seen_buffer_types.count(buft) == 1) {
+            continue;
+        }
+        const std::string name = ggml_backend_buft_name(buft);
+        const size_t self = mb.model + mb.context + mb.compute;
+        table_data.push_back({
+            template_other,
+            " - " + name,
+            "", // total
+            "", // free
+            std::to_string(self / MiB),
+            std::to_string(mb.model / MiB),
+            std::to_string(mb.context / MiB),
+            std::to_string(mb.compute / MiB),
+            ""}); // unaccounted
+        seen_buffer_types.insert(buft);
+    }
+
+    for (size_t j = 1; j < table_data[0].size(); j++) {
+        size_t max_len = 0;
+        for (const auto & td : table_data) {
+            max_len = std::max(max_len, td[j].length());
+        }
+        for (auto & td : table_data) {
+            td[j].insert(j == 1 ? td[j].length() : 0, max_len - td[j].length(), ' ');
+        }
+    }
+    for (const auto & td : table_data) {
+        LLAMA_LOG_INFO(td[0].c_str(),
+            __func__, td[1].c_str(), td[2].c_str(), td[3].c_str(), td[4].c_str(), td[5].c_str(),
+            td[6].c_str(), td[7].c_str(), td[8].c_str());
+    }
+}
+
 //
 // training
 //
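
One detail of the implementation above worth calling out: before logging, every column of table_data is padded to a common width with std::string::insert, the name column on the right and the numeric columns on the left, so the figures line up under the header row. A small standalone illustration of that left-padding trick (ours, not part of the diff):

    #include <algorithm>
    #include <cstdio>
    #include <string>
    #include <vector>

    int main() {
        std::vector<std::string> col = {"total", "24576", "512"};
        size_t max_len = 0;
        for (const auto & s : col) {
            max_len = std::max(max_len, s.length());
        }
        for (auto & s : col) {
            s.insert(0, max_len - s.length(), ' '); // inserting at position 0 == left-pad
        }
        for (const auto & s : col) {
            std::printf("| %s |\n", s.c_str());
        }
        return 0;
    }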

package/src/llama.cpp/src/llama-context.h
@@ -17,9 +17,17 @@ class llama_batch_allocr;
 class llama_io_read_i;
 class llama_io_write_i;
 
+// "memory" as in abstract memory for the context
 struct llama_memory_i;
 struct llama_memory_context_i;
 
+// "memory" as in physical memory for a buffer type, in bytes
+struct llama_memory_breakdown_data {
+    size_t model = 0; // memory allocated for the model
+    size_t context = 0; // memory allocated for the context
+    size_t compute = 0; // memory allocated for temporary compute buffers
+};
+
 struct llama_context {
     // init scheduler and compute buffers, reserve worst-case graphs
     llama_context(

@@ -144,6 +152,8 @@ struct llama_context {
     llama_perf_context_data perf_get_data() const;
     void perf_reset();
 
+    std::map<ggml_backend_buffer_type_t, llama_memory_breakdown_data> memory_breakdown() const;
+
     //
     // training
     //

package/src/llama.cpp/src/llama-kv-cache-iswa.cpp
@@ -113,6 +113,14 @@ llama_pos llama_kv_cache_iswa::seq_pos_max(llama_seq_id seq_id) const {
     return kv_swa->seq_pos_max(seq_id);
 }
 
+std::map<ggml_backend_buffer_type_t, size_t> llama_kv_cache_iswa::memory_breakdown() const {
+    std::map<ggml_backend_buffer_type_t, size_t> mb = kv_base->memory_breakdown();
+    for (const auto & buft_size : kv_swa->memory_breakdown()) {
+        mb[buft_size.first] += buft_size.second;
+    }
+    return mb;
+}
+
 llama_memory_context_ptr llama_kv_cache_iswa::init_batch(llama_batch_allocr & balloc, uint32_t n_ubatch, bool embd_all) {
     GGML_UNUSED(embd_all);
 

package/src/llama.cpp/src/llama-kv-cache-iswa.h
@@ -56,6 +56,8 @@ public:
     llama_pos seq_pos_min(llama_seq_id seq_id) const override;
     llama_pos seq_pos_max(llama_seq_id seq_id) const override;
 
+    std::map<ggml_backend_buffer_type_t, size_t> memory_breakdown() const override;
+
     // state write/load
 
     void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) const override;

package/src/llama.cpp/src/llama-kv-cache.cpp
@@ -473,6 +473,14 @@ llama_pos llama_kv_cache::seq_pos_max(llama_seq_id seq_id) const {
     return cells.seq_pos_max(seq_id);
 }
 
+std::map<ggml_backend_buffer_type_t, size_t> llama_kv_cache::memory_breakdown() const {
+    std::map<ggml_backend_buffer_type_t, size_t> ret;
+    for (const ggml_backend_buffer_ptr & buf_ptr : bufs) {
+        ret[ggml_backend_buffer_get_type(buf_ptr.get())] += ggml_backend_buffer_get_size(buf_ptr.get());
+    }
+    return ret;
+}
+
 llama_memory_context_ptr llama_kv_cache::init_batch(
         llama_batch_allocr & balloc,
         uint32_t n_ubatch,

package/src/llama.cpp/src/llama-kv-cache.h
@@ -121,6 +121,8 @@ public:
     llama_pos seq_pos_min(llama_seq_id seq_id) const override;
     llama_pos seq_pos_max(llama_seq_id seq_id) const override;
 
+    std::map<ggml_backend_buffer_type_t, size_t> memory_breakdown() const override;
+
     // state write/load
 
     void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) const override;

package/src/llama.cpp/src/llama-memory-hybrid.cpp
@@ -166,6 +166,14 @@ llama_pos llama_memory_hybrid::seq_pos_max(llama_seq_id seq_id) const {
     return std::min(mem_attn->seq_pos_max(seq_id), mem_recr->seq_pos_max(seq_id));
 }
 
+std::map<ggml_backend_buffer_type_t, size_t> llama_memory_hybrid::memory_breakdown() const {
+    std::map<ggml_backend_buffer_type_t, size_t> mb = mem_attn->memory_breakdown();
+    for (const auto & buft_size : mem_recr->memory_breakdown()) {
+        mb[buft_size.first] += buft_size.second;
+    }
+    return mb;
+}
+
 void llama_memory_hybrid::state_write(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) const {
     GGML_UNUSED(flags);
 

package/src/llama.cpp/src/llama-memory-hybrid.h
@@ -68,6 +68,8 @@ public:
     llama_pos seq_pos_min(llama_seq_id seq_id) const override;
     llama_pos seq_pos_max(llama_seq_id seq_id) const override;
 
+    std::map<ggml_backend_buffer_type_t, size_t> memory_breakdown() const override;
+
     // state write/load
 
     void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) const override;

package/src/llama.cpp/src/llama-memory-recurrent.cpp
@@ -359,6 +359,14 @@ llama_pos llama_memory_recurrent::seq_pos_max(llama_seq_id seq_id) const {
     return result;
 }
 
+std::map<ggml_backend_buffer_type_t, size_t> llama_memory_recurrent::memory_breakdown() const {
+    std::map<ggml_backend_buffer_type_t, size_t> ret;
+    for (const ggml_backend_buffer_ptr & buf_ptr : bufs) {
+        ret[ggml_backend_buffer_get_type(buf_ptr.get())] += ggml_backend_buffer_get_size(buf_ptr.get());
+    }
+    return ret;
+}
+
 llama_memory_context_ptr llama_memory_recurrent::init_batch(llama_batch_allocr & balloc, uint32_t n_ubatch, bool embd_all) {
     do {
         balloc.split_reset();

package/src/llama.cpp/src/llama-memory-recurrent.h
@@ -4,6 +4,7 @@
 #include "llama-graph.h"
 #include "llama-memory.h"
 
+#include <map>
 #include <set>
 #include <vector>
 

@@ -50,6 +51,8 @@ public:
     llama_pos seq_pos_min(llama_seq_id seq_id) const override;
     llama_pos seq_pos_max(llama_seq_id seq_id) const override;
 
+    std::map<ggml_backend_buffer_type_t, size_t> memory_breakdown() const override;
+
     bool prepare(const std::vector<llama_ubatch> & ubatches);
 
     // find a contiguous slot of memory cells and emplace the ubatch there

package/src/llama.cpp/src/llama-memory.h
@@ -2,6 +2,7 @@
 
 #include "llama.h"
 
+#include <map>
 #include <memory>
 #include <functional>
 

@@ -108,6 +109,8 @@ struct llama_memory_i {
     virtual llama_pos seq_pos_min(llama_seq_id seq_id) const = 0;
     virtual llama_pos seq_pos_max(llama_seq_id seq_id) const = 0;
 
+    virtual std::map<ggml_backend_buffer_type_t, size_t> memory_breakdown() const = 0;
+
     //
     // state write/read
     //

package/src/llama.cpp/src/llama-model.cpp
@@ -66,6 +66,7 @@ const char * llm_type_name(llm_type type) {
         case LLM_TYPE_1_7B: return "1.7B";
         case LLM_TYPE_1_8B: return "1.8B";
         case LLM_TYPE_2B: return "2B";
+        case LLM_TYPE_2_6B: return "2.6B";
         case LLM_TYPE_2_8B: return "2.8B";
         case LLM_TYPE_2_9B: return "2.9B";
         case LLM_TYPE_3B: return "3B";

@@ -1977,10 +1978,11 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                 for (uint32_t il = 0; il < hparams.n_layer; ++il) {
                     hparams.recurrent_layer_arr[il] = hparams.n_head_kv(il) == 0;
                 }
-                switch (hparams.
-                    case
-                    case
-                    case
+                switch (hparams.n_ff()) {
+                    case 4608: type = LLM_TYPE_350M; break;
+                    case 6912: type = LLM_TYPE_700M; break;
+                    case 8192: type = LLM_TYPE_1_2B; break;
+                    case 10752: type = LLM_TYPE_2_6B; break;
                     default: type = LLM_TYPE_UNKNOWN;
                 }
             } break;

@@ -6003,6 +6005,14 @@ size_t llama_model::n_devices() const {
     return devices.size();
 }
 
+std::map<ggml_backend_buffer_type_t, size_t> llama_model::memory_breakdown() const {
+    std::map<ggml_backend_buffer_type_t, size_t> ret;
+    for (const ggml_backend_buffer_ptr & buf_ptr : pimpl->bufs) {
+        ret[ggml_backend_buffer_get_type(buf_ptr.get())] += ggml_backend_buffer_get_size(buf_ptr.get());
+    }
+    return ret;
+}
+
 uint64_t llama_model::n_elements() const {
     return pimpl->n_elements;
 }

package/src/llama.cpp/src/llama-model.h
@@ -7,6 +7,7 @@
 #include "llama-memory.h"
 #include "llama-vocab.h"
 
+#include <map>
 #include <memory>
 #include <string>
 #include <unordered_map>

@@ -58,6 +59,7 @@ enum llm_type {
     LLM_TYPE_1_7B,
     LLM_TYPE_1_8B,
     LLM_TYPE_2B,
+    LLM_TYPE_2_6B,
     LLM_TYPE_2_8B,
     LLM_TYPE_2_9B,
     LLM_TYPE_3B,

@@ -452,10 +454,12 @@ struct llama_model {
 
     std::string desc() const;
 
-    size_t size() const;
+    size_t size() const; // file size
     size_t n_tensors() const;
     size_t n_devices() const;
 
+    std::map<ggml_backend_buffer_type_t, size_t> memory_breakdown() const;
+
     // total number of parameters in the model
     uint64_t n_elements() const;
 