@fugood/llama.node 1.2.1 → 1.2.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (30)
  1. package/package.json +14 -14
  2. package/src/llama.cpp/common/arg.cpp +359 -310
  3. package/src/llama.cpp/common/chat.cpp +27 -15
  4. package/src/llama.cpp/common/common.cpp +1 -0
  5. package/src/llama.cpp/common/sampling.cpp +1 -0
  6. package/src/llama.cpp/ggml/CMakeLists.txt +37 -21
  7. package/src/llama.cpp/ggml/include/ggml-backend.h +2 -1
  8. package/src/llama.cpp/ggml/include/ggml-zdnn.h +3 -0
  9. package/src/llama.cpp/ggml/src/CMakeLists.txt +3 -0
  10. package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +4 -2
  11. package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +2 -2
  12. package/src/llama.cpp/ggml/src/ggml-cpu/common.h +14 -0
  13. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +17 -3
  14. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +1 -1
  15. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +93 -862
  16. package/src/llama.cpp/include/llama.h +15 -11
  17. package/src/llama.cpp/src/llama-context.cpp +151 -0
  18. package/src/llama.cpp/src/llama-context.h +10 -0
  19. package/src/llama.cpp/src/llama-cparams.h +1 -1
  20. package/src/llama.cpp/src/llama-kv-cache-iswa.cpp +8 -0
  21. package/src/llama.cpp/src/llama-kv-cache-iswa.h +2 -0
  22. package/src/llama.cpp/src/llama-kv-cache.cpp +8 -0
  23. package/src/llama.cpp/src/llama-kv-cache.h +2 -0
  24. package/src/llama.cpp/src/llama-memory-hybrid.cpp +8 -0
  25. package/src/llama.cpp/src/llama-memory-hybrid.h +2 -0
  26. package/src/llama.cpp/src/llama-memory-recurrent.cpp +8 -0
  27. package/src/llama.cpp/src/llama-memory-recurrent.h +3 -0
  28. package/src/llama.cpp/src/llama-memory.h +3 -0
  29. package/src/llama.cpp/src/llama-model.cpp +14 -4
  30. package/src/llama.cpp/src/llama-model.h +5 -1
package/src/llama.cpp/include/llama.h

@@ -1329,24 +1329,25 @@ extern "C" {
     //
     // Performance utils
     //
-    // NOTE: Used by llama.cpp examples, avoid using in third-party apps. Instead, do your own performance measurements.
+    // NOTE: Used by llama.cpp examples/tools, avoid using in third-party apps. Instead, do your own performance measurements.
     //
 
     struct llama_perf_context_data {
-        double t_start_ms;
-        double t_load_ms;
-        double t_p_eval_ms;
-        double t_eval_ms;
-
-        int32_t n_p_eval;
-        int32_t n_eval;
-        int32_t n_reused; // number of times a ggml compute graph had been reused
+        // ms == milliseconds
+        double t_start_ms;  // absolute start time
+        double t_load_ms;   // time needed for loading the model
+        double t_p_eval_ms; // time needed for processing the prompt
+        double t_eval_ms;   // time needed for generating tokens
+
+        int32_t n_p_eval; // number of prompt tokens
+        int32_t n_eval;   // number of generated tokens
+        int32_t n_reused; // number of times a ggml compute graph had been reused
     };
 
     struct llama_perf_sampler_data {
-        double  t_sample_ms;
+        double  t_sample_ms; // time needed for sampling in ms
 
-        int32_t n_sample;
+        int32_t n_sample; // number of sampled tokens
     };
 
     LLAMA_API struct llama_perf_context_data llama_perf_context (const struct llama_context * ctx);
@@ -1358,6 +1359,9 @@ extern "C" {
     LLAMA_API void llama_perf_sampler_print(const struct llama_sampler * chain);
     LLAMA_API void llama_perf_sampler_reset(      struct llama_sampler * chain);
 
+    // print a breakdown of per-device memory use via LLAMA_LOG:
+    LLAMA_API void llama_memory_breakdown_print(const struct llama_context * ctx);
+
     //
     // training
     //
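
For orientation, here is a minimal sketch of how an application might exercise these utilities, assuming a fully initialized llama_context * ctx already exists (setup and error handling omitted; the reporting helper is hypothetical and not part of the package):

    #include "llama.h"
    #include <cstdio>

    // Hedged sketch: report timings and memory use for an existing context.
    static void report_usage(const struct llama_context * ctx) {
        const struct llama_perf_context_data perf = llama_perf_context(ctx);
        std::printf("prompt: %d tokens in %.2f ms, generation: %d tokens in %.2f ms\n",
                    perf.n_p_eval, perf.t_p_eval_ms, perf.n_eval, perf.t_eval_ms);

        // new in this release: per-device memory table, emitted through LLAMA_LOG
        llama_memory_breakdown_print(ctx);
    }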
package/src/llama.cpp/src/llama-context.cpp

@@ -2027,6 +2027,21 @@ void llama_context::perf_reset() {
     n_reused = 0;
 }
 
+std::map<ggml_backend_buffer_type_t, llama_memory_breakdown_data> llama_context::memory_breakdown() const {
+    std::map<ggml_backend_buffer_type_t, llama_memory_breakdown_data> ret;
+    for (const auto & buft_size : model.memory_breakdown()) {
+        ret[buft_size.first].model += buft_size.second;
+    }
+    for (const auto & buft_size : memory->memory_breakdown()) {
+        ret[buft_size.first].context += buft_size.second;
+    }
+    for (const auto & backend_ptr : backends) {
+        ggml_backend_t backend = backend_ptr.get();
+        ret[ggml_backend_sched_get_buffer_type(sched.get(), backend)].compute += ggml_backend_sched_get_buffer_size(sched.get(), backend);
+    }
+    return ret;
+}
+
 //
 // training
 //
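
The aggregation above leans on std::map::operator[] value-initializing a llama_memory_breakdown_data entry (all counters zero) on first access, so each source can simply add into its own field. A stripped-down illustration of that idiom, with string keys and made-up byte counts standing in for buffer types and real sizes:

    #include <cstdio>
    #include <map>
    #include <string>

    struct breakdown {
        size_t model = 0, context = 0, compute = 0; // bytes
    };

    int main() {
        std::map<std::string, breakdown> ret;          // key stands in for ggml_backend_buffer_type_t
        ret["CUDA0"].model   += 800ull * 1024 * 1024;  // operator[] inserts a zeroed entry first
        ret["CUDA0"].context += 256ull * 1024 * 1024;
        ret["CPU"].compute   +=  64ull * 1024 * 1024;
        for (const auto & kv : ret) {
            std::printf("%s: model=%zu context=%zu compute=%zu\n",
                        kv.first.c_str(), kv.second.model, kv.second.context, kv.second.compute);
        }
        return 0;
    }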
@@ -2765,6 +2780,142 @@ void llama_perf_context_reset(llama_context * ctx) {
     ctx->perf_reset();
 }
 
+void llama_memory_breakdown_print(const struct llama_context * ctx) {
+    const std::vector<ggml_backend_dev_t> & devices = ctx->get_model().devices;
+
+    std::map<ggml_backend_buffer_type_t, llama_memory_breakdown_data> memory_breakdown = ctx->memory_breakdown();
+
+    std::vector<std::array<std::string, 9>> table_data;
+    table_data.reserve(devices.size());
+    const std::string template_header = "%s: | %s | %s %s %s %s %s %s %s |\n";
+    const std::string template_gpu    = "%s: | %s | %s = %s + (%s = %s + %s + %s) + %s |\n";
+    const std::string template_other  = "%s: | %s | %s %s %s = %s + %s + %s %s |\n";
+
+    table_data.push_back({template_header, "memory breakdown [MiB]", "total", "free", "self", "model", "context", "compute", "unaccounted"});
+
+    constexpr size_t MiB = 1024 * 1024;
+    const std::vector<std::string> desc_prefixes_strip = {"NVIDIA ", "GeForce ", "Tesla ", "AMD ", "Radeon ", "Instinct "};
+
+    // track seen buffer types to avoid double counting:
+    std::set<ggml_backend_buffer_type_t> seen_buffer_types;
+
+    // accumulative memory breakdown for each device and for host:
+    std::vector<llama_memory_breakdown_data> mb_dev(devices.size());
+    llama_memory_breakdown_data mb_host;
+
+    for (const auto & buft_mb : memory_breakdown) {
+        ggml_backend_buffer_type_t buft = buft_mb.first;
+        const llama_memory_breakdown_data & mb = buft_mb.second;
+        if (ggml_backend_buft_is_host(buft)) {
+            mb_host.model   += mb.model;
+            mb_host.context += mb.context;
+            mb_host.compute += mb.compute;
+            seen_buffer_types.insert(buft);
+            continue;
+        }
+        ggml_backend_dev_t dev = ggml_backend_buft_get_device(buft);
+        if (dev) {
+            int i_dev = -1;
+            for (size_t i = 0; i < devices.size(); i++) {
+                if (devices[i] == dev) {
+                    i_dev = i;
+                    break;
+                }
+            }
+            if (i_dev != -1) {
+                mb_dev[i_dev].model   += mb.model;
+                mb_dev[i_dev].context += mb.context;
+                mb_dev[i_dev].compute += mb.compute;
+                seen_buffer_types.insert(buft);
+                continue;
+            }
+        }
+    }
+
+    // print memory breakdown for each device:
+    for (size_t i = 0; i < devices.size(); i++) {
+        ggml_backend_dev_t dev = devices[i];
+        llama_memory_breakdown_data mb = mb_dev[i];
+
+        const std::string name = ggml_backend_dev_name(dev);
+        std::string desc = ggml_backend_dev_description(dev);
+        for (const std::string & prefix : desc_prefixes_strip) {
+            if (desc.length() >= prefix.length() && desc.substr(0, prefix.length()) == prefix) {
+                desc = desc.substr(prefix.length());
+            }
+        }
+
+        size_t free, total;
+        ggml_backend_dev_memory(dev, &free, &total);
+
+        const size_t self = mb.model + mb.context + mb.compute;
+        const size_t unaccounted = total - self - free;
+
+        table_data.push_back({
+            template_gpu,
+            "  - " + name + " (" + desc + ")",
+            std::to_string(total / MiB),
+            std::to_string(free / MiB),
+            std::to_string(self / MiB),
+            std::to_string(mb.model / MiB),
+            std::to_string(mb.context / MiB),
+            std::to_string(mb.compute / MiB),
+            std::to_string(unaccounted / MiB)});
+    }
+
+    // print memory breakdown for host:
+    {
+        const size_t self = mb_host.model + mb_host.context + mb_host.compute;
+        table_data.push_back({
+            template_other,
+            "  - Host",
+            "", // total
+            "", // free
+            std::to_string(self / MiB),
+            std::to_string(mb_host.model / MiB),
+            std::to_string(mb_host.context / MiB),
+            std::to_string(mb_host.compute / MiB),
+            ""}); // unaccounted
+    }
+
+    // print memory breakdown for all remaining buffer types:
+    for (const auto & buft_mb : memory_breakdown) {
+        ggml_backend_buffer_type_t buft = buft_mb.first;
+        const llama_memory_breakdown_data & mb = buft_mb.second;
+        if (seen_buffer_types.count(buft) == 1) {
+            continue;
+        }
+        const std::string name = ggml_backend_buft_name(buft);
+        const size_t self = mb.model + mb.context + mb.compute;
+        table_data.push_back({
+            template_other,
+            "  - " + name,
+            "", // total
+            "", // free
+            std::to_string(self / MiB),
+            std::to_string(mb.model / MiB),
+            std::to_string(mb.context / MiB),
+            std::to_string(mb.compute / MiB),
+            ""}); // unaccounted
+        seen_buffer_types.insert(buft);
+    }
+
+    for (size_t j = 1; j < table_data[0].size(); j++) {
+        size_t max_len = 0;
+        for (const auto & td : table_data) {
+            max_len = std::max(max_len, td[j].length());
+        }
+        for (auto & td : table_data) {
+            td[j].insert(j == 1 ? td[j].length() : 0, max_len - td[j].length(), ' ');
+        }
+    }
+    for (const auto & td : table_data) {
+        LLAMA_LOG_INFO(td[0].c_str(),
+            __func__, td[1].c_str(), td[2].c_str(), td[3].c_str(), td[4].c_str(), td[5].c_str(),
+            td[6].c_str(), td[7].c_str(), td[8].c_str());
+    }
+}
+
 //
 // training
 //
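
The final alignment loop above is terse: column 0 of each row holds the printf template, column 1 (the row label) is left-aligned by appending spaces, and every numeric column is right-aligned by inserting spaces at position 0. A standalone sketch of the same idiom with hypothetical row values (not code from the package; here column 0 is the label, so the index check differs accordingly):

    #include <algorithm>
    #include <cstdio>
    #include <string>
    #include <vector>

    int main() {
        // column 0 is the label, the rest are numbers; values are illustrative only
        std::vector<std::vector<std::string>> rows = {
            {"memory breakdown [MiB]", "total", "self"},
            {"  - CUDA0 (RTX 4090)",   "24564", "9301"},
            {"  - Host",               "",      "1024"},
        };
        for (size_t j = 0; j < rows[0].size(); j++) {
            size_t max_len = 0;
            for (const auto & r : rows) {
                max_len = std::max(max_len, r[j].length());
            }
            for (auto & r : rows) {
                // label column: pad on the right; numeric columns: pad on the left
                r[j].insert(j == 0 ? r[j].length() : 0, max_len - r[j].length(), ' ');
            }
        }
        for (const auto & r : rows) {
            std::printf("| %s | %s | %s |\n", r[0].c_str(), r[1].c_str(), r[2].c_str());
        }
        return 0;
    }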
package/src/llama.cpp/src/llama-context.h

@@ -17,9 +17,17 @@ class llama_batch_allocr;
 class llama_io_read_i;
 class llama_io_write_i;
 
+// "memory" as in abstract memory for the context
 struct llama_memory_i;
 struct llama_memory_context_i;
 
+// "memory" as in physical memory for a buffer type, in bytes
+struct llama_memory_breakdown_data {
+    size_t model   = 0; // memory allocated for the model
+    size_t context = 0; // memory allocated for the context
+    size_t compute = 0; // memory allocated for temporary compute buffers
+};
+
 struct llama_context {
     // init scheduler and compute buffers, reserve worst-case graphs
     llama_context(
@@ -144,6 +152,8 @@ struct llama_context {
     llama_perf_context_data perf_get_data() const;
     void perf_reset();
 
+    std::map<ggml_backend_buffer_type_t, llama_memory_breakdown_data> memory_breakdown() const;
+
     //
     // training
     //
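
A hedged sketch of how code inside llama.cpp could consume this accessor; the helper name is hypothetical and the arithmetic simply restates the struct's semantics (self = model + context + compute, in bytes):

    #include "llama-context.h" // internal header declaring llama_context and llama_memory_breakdown_data

    // Hedged sketch (not part of the package): total bytes the context tracks for itself,
    // i.e. the sum of the "self" values that llama_memory_breakdown_print() reports.
    static size_t llama_context_self_bytes(const llama_context & lctx) {
        size_t total = 0;
        for (const auto & entry : lctx.memory_breakdown()) {
            const llama_memory_breakdown_data & mb = entry.second;
            total += mb.model + mb.context + mb.compute; // bytes tracked for this buffer type
        }
        return total;
    }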
package/src/llama.cpp/src/llama-cparams.h

@@ -4,7 +4,7 @@
 
 #include <cstdint>
 
-#define LLAMA_MAX_SEQ 64
+#define LLAMA_MAX_SEQ 256
 
 struct llama_cparams {
     uint32_t n_ctx;           // context size used during inference
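
LLAMA_MAX_SEQ is the compile-time cap on parallel sequences per context, so this bump lets a context be configured for up to 256 sequences. A hedged sketch of requesting more than the old limit through the public API (helper name and values are illustrative, error handling omitted):

    #include "llama.h"

    // Hypothetical helper: create a context for 128 parallel sequences,
    // which the previous LLAMA_MAX_SEQ of 64 would have rejected.
    static struct llama_context * make_multi_seq_context(struct llama_model * model) {
        struct llama_context_params cparams = llama_context_default_params();
        cparams.n_ctx     = 128 * 4096; // total KV budget shared by all sequences (illustrative)
        cparams.n_seq_max = 128;        // must stay <= LLAMA_MAX_SEQ (now 256)
        return llama_init_from_model(model, cparams);
    }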
package/src/llama.cpp/src/llama-kv-cache-iswa.cpp

@@ -113,6 +113,14 @@ llama_pos llama_kv_cache_iswa::seq_pos_max(llama_seq_id seq_id) const {
     return kv_swa->seq_pos_max(seq_id);
 }
 
+std::map<ggml_backend_buffer_type_t, size_t> llama_kv_cache_iswa::memory_breakdown() const {
+    std::map<ggml_backend_buffer_type_t, size_t> mb = kv_base->memory_breakdown();
+    for (const auto & buft_size : kv_swa->memory_breakdown()) {
+        mb[buft_size.first] += buft_size.second;
+    }
+    return mb;
+}
+
 llama_memory_context_ptr llama_kv_cache_iswa::init_batch(llama_batch_allocr & balloc, uint32_t n_ubatch, bool embd_all) {
     GGML_UNUSED(embd_all);
 
package/src/llama.cpp/src/llama-kv-cache-iswa.h

@@ -56,6 +56,8 @@ public:
     llama_pos seq_pos_min(llama_seq_id seq_id) const override;
     llama_pos seq_pos_max(llama_seq_id seq_id) const override;
 
+    std::map<ggml_backend_buffer_type_t, size_t> memory_breakdown() const override;
+
     // state write/load
 
     void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) const override;
package/src/llama.cpp/src/llama-kv-cache.cpp

@@ -473,6 +473,14 @@ llama_pos llama_kv_cache::seq_pos_max(llama_seq_id seq_id) const {
     return cells.seq_pos_max(seq_id);
 }
 
+std::map<ggml_backend_buffer_type_t, size_t> llama_kv_cache::memory_breakdown() const {
+    std::map<ggml_backend_buffer_type_t, size_t> ret;
+    for (const ggml_backend_buffer_ptr & buf_ptr : bufs) {
+        ret[ggml_backend_buffer_get_type(buf_ptr.get())] += ggml_backend_buffer_get_size(buf_ptr.get());
+    }
+    return ret;
+}
+
 llama_memory_context_ptr llama_kv_cache::init_batch(
         llama_batch_allocr & balloc,
         uint32_t n_ubatch,
package/src/llama.cpp/src/llama-kv-cache.h

@@ -121,6 +121,8 @@ public:
     llama_pos seq_pos_min(llama_seq_id seq_id) const override;
     llama_pos seq_pos_max(llama_seq_id seq_id) const override;
 
+    std::map<ggml_backend_buffer_type_t, size_t> memory_breakdown() const override;
+
     // state write/load
 
     void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) const override;
package/src/llama.cpp/src/llama-memory-hybrid.cpp

@@ -166,6 +166,14 @@ llama_pos llama_memory_hybrid::seq_pos_max(llama_seq_id seq_id) const {
     return std::min(mem_attn->seq_pos_max(seq_id), mem_recr->seq_pos_max(seq_id));
 }
 
+std::map<ggml_backend_buffer_type_t, size_t> llama_memory_hybrid::memory_breakdown() const {
+    std::map<ggml_backend_buffer_type_t, size_t> mb = mem_attn->memory_breakdown();
+    for (const auto & buft_size : mem_recr->memory_breakdown()) {
+        mb[buft_size.first] += buft_size.second;
+    }
+    return mb;
+}
+
 void llama_memory_hybrid::state_write(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) const {
     GGML_UNUSED(flags);
 
package/src/llama.cpp/src/llama-memory-hybrid.h

@@ -68,6 +68,8 @@ public:
     llama_pos seq_pos_min(llama_seq_id seq_id) const override;
     llama_pos seq_pos_max(llama_seq_id seq_id) const override;
 
+    std::map<ggml_backend_buffer_type_t, size_t> memory_breakdown() const override;
+
     // state write/load
 
     void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) const override;
package/src/llama.cpp/src/llama-memory-recurrent.cpp

@@ -359,6 +359,14 @@ llama_pos llama_memory_recurrent::seq_pos_max(llama_seq_id seq_id) const {
     return result;
 }
 
+std::map<ggml_backend_buffer_type_t, size_t> llama_memory_recurrent::memory_breakdown() const {
+    std::map<ggml_backend_buffer_type_t, size_t> ret;
+    for (const ggml_backend_buffer_ptr & buf_ptr : bufs) {
+        ret[ggml_backend_buffer_get_type(buf_ptr.get())] += ggml_backend_buffer_get_size(buf_ptr.get());
+    }
+    return ret;
+}
+
 llama_memory_context_ptr llama_memory_recurrent::init_batch(llama_batch_allocr & balloc, uint32_t n_ubatch, bool embd_all) {
     do {
         balloc.split_reset();
package/src/llama.cpp/src/llama-memory-recurrent.h

@@ -4,6 +4,7 @@
 #include "llama-graph.h"
 #include "llama-memory.h"
 
+#include <map>
 #include <set>
 #include <vector>
 
@@ -50,6 +51,8 @@ public:
     llama_pos seq_pos_min(llama_seq_id seq_id) const override;
     llama_pos seq_pos_max(llama_seq_id seq_id) const override;
 
+    std::map<ggml_backend_buffer_type_t, size_t> memory_breakdown() const override;
+
     bool prepare(const std::vector<llama_ubatch> & ubatches);
 
     // find a contiguous slot of memory cells and emplace the ubatch there
package/src/llama.cpp/src/llama-memory.h

@@ -2,6 +2,7 @@
 
 #include "llama.h"
 
+#include <map>
 #include <memory>
 #include <functional>
 
@@ -108,6 +109,8 @@ struct llama_memory_i {
     virtual llama_pos seq_pos_min(llama_seq_id seq_id) const = 0;
     virtual llama_pos seq_pos_max(llama_seq_id seq_id) const = 0;
 
+    virtual std::map<ggml_backend_buffer_type_t, size_t> memory_breakdown() const = 0;
+
     //
     // state write/read
     //
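
Every concrete memory implementation now has to provide this accessor. A hedged sketch of what an out-of-tree llama_memory_i implementation might look like, mirroring the in-tree overrides earlier in this diff; the class name my_memory and its bufs member (a container of ggml_backend_buffer_ptr) are hypothetical:

    // Hypothetical implementation, same pattern as llama_kv_cache::memory_breakdown():
    std::map<ggml_backend_buffer_type_t, size_t> my_memory::memory_breakdown() const {
        std::map<ggml_backend_buffer_type_t, size_t> ret;
        for (const ggml_backend_buffer_ptr & buf_ptr : bufs) {
            // attribute each buffer's allocated bytes to the buffer type that owns it
            ret[ggml_backend_buffer_get_type(buf_ptr.get())] += ggml_backend_buffer_get_size(buf_ptr.get());
        }
        return ret;
    }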
package/src/llama.cpp/src/llama-model.cpp

@@ -66,6 +66,7 @@ const char * llm_type_name(llm_type type) {
         case LLM_TYPE_1_7B: return "1.7B";
         case LLM_TYPE_1_8B: return "1.8B";
         case LLM_TYPE_2B:   return "2B";
+        case LLM_TYPE_2_6B: return "2.6B";
         case LLM_TYPE_2_8B: return "2.8B";
         case LLM_TYPE_2_9B: return "2.9B";
         case LLM_TYPE_3B:   return "3B";
@@ -1977,10 +1978,11 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                 for (uint32_t il = 0; il < hparams.n_layer; ++il) {
                     hparams.recurrent_layer_arr[il] = hparams.n_head_kv(il) == 0;
                 }
-                switch (hparams.n_embd) {
-                    case 1024: type = LLM_TYPE_350M; break;
-                    case 1536: type = LLM_TYPE_700M; break;
-                    case 2048: type = LLM_TYPE_1_2B; break;
+                switch (hparams.n_ff()) {
+                    case  4608: type = LLM_TYPE_350M; break;
+                    case  6912: type = LLM_TYPE_700M; break;
+                    case  8192: type = LLM_TYPE_1_2B; break;
+                    case 10752: type = LLM_TYPE_2_6B; break;
                     default: type = LLM_TYPE_UNKNOWN;
                 }
             } break;
@@ -6003,6 +6005,14 @@ size_t llama_model::n_devices() const {
     return devices.size();
 }
 
+std::map<ggml_backend_buffer_type_t, size_t> llama_model::memory_breakdown() const {
+    std::map<ggml_backend_buffer_type_t, size_t> ret;
+    for (const ggml_backend_buffer_ptr & buf_ptr : pimpl->bufs) {
+        ret[ggml_backend_buffer_get_type(buf_ptr.get())] += ggml_backend_buffer_get_size(buf_ptr.get());
+    }
+    return ret;
+}
+
 uint64_t llama_model::n_elements() const {
     return pimpl->n_elements;
 }
package/src/llama.cpp/src/llama-model.h

@@ -7,6 +7,7 @@
 #include "llama-memory.h"
 #include "llama-vocab.h"
 
+#include <map>
 #include <memory>
 #include <string>
 #include <unordered_map>
@@ -58,6 +59,7 @@ enum llm_type {
     LLM_TYPE_1_7B,
    LLM_TYPE_1_8B,
     LLM_TYPE_2B,
+    LLM_TYPE_2_6B,
     LLM_TYPE_2_8B,
     LLM_TYPE_2_9B,
     LLM_TYPE_3B,
@@ -452,10 +454,12 @@ struct llama_model {
 
     std::string desc() const;
 
-    size_t size() const;
+    size_t size() const; // file size
     size_t n_tensors() const;
     size_t n_devices() const;
 
+    std::map<ggml_backend_buffer_type_t, size_t> memory_breakdown() const;
+
     // total number of parameters in the model
     uint64_t n_elements() const;