@fugood/llama.node 1.0.2 → 1.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50)
  1. package/package.json +14 -14
  2. package/src/llama.cpp/CMakeLists.txt +0 -1
  3. package/src/llama.cpp/common/CMakeLists.txt +4 -5
  4. package/src/llama.cpp/common/arg.cpp +44 -0
  5. package/src/llama.cpp/common/common.cpp +22 -6
  6. package/src/llama.cpp/common/common.h +15 -1
  7. package/src/llama.cpp/ggml/CMakeLists.txt +10 -2
  8. package/src/llama.cpp/ggml/include/ggml-webgpu.h +19 -0
  9. package/src/llama.cpp/ggml/include/ggml.h +104 -10
  10. package/src/llama.cpp/ggml/src/CMakeLists.txt +1 -1
  11. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +6 -1
  12. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +12 -1
  13. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +343 -1094
  14. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +749 -163
  15. package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +5 -0
  16. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +1 -1
  17. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +12 -9
  18. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +88 -9
  19. package/src/llama.cpp/include/llama.h +13 -47
  20. package/src/llama.cpp/src/llama-arch.cpp +298 -3
  21. package/src/llama.cpp/src/llama-arch.h +22 -1
  22. package/src/llama.cpp/src/llama-batch.cpp +103 -71
  23. package/src/llama.cpp/src/llama-batch.h +31 -18
  24. package/src/llama.cpp/src/llama-chat.cpp +59 -1
  25. package/src/llama.cpp/src/llama-chat.h +3 -0
  26. package/src/llama.cpp/src/llama-context.cpp +134 -95
  27. package/src/llama.cpp/src/llama-context.h +13 -16
  28. package/src/llama.cpp/src/llama-cparams.h +3 -2
  29. package/src/llama.cpp/src/llama-graph.cpp +279 -180
  30. package/src/llama.cpp/src/llama-graph.h +183 -122
  31. package/src/llama.cpp/src/llama-hparams.cpp +47 -1
  32. package/src/llama.cpp/src/llama-hparams.h +12 -1
  33. package/src/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +38 -22
  34. package/src/llama.cpp/src/llama-kv-cache-unified-iswa.h +7 -2
  35. package/src/llama.cpp/src/llama-kv-cache-unified.cpp +849 -304
  36. package/src/llama.cpp/src/llama-kv-cache-unified.h +143 -47
  37. package/src/llama.cpp/src/llama-kv-cells.h +62 -10
  38. package/src/llama.cpp/src/llama-memory-hybrid.cpp +10 -4
  39. package/src/llama.cpp/src/llama-memory-hybrid.h +3 -1
  40. package/src/llama.cpp/src/llama-memory-recurrent.cpp +21 -11
  41. package/src/llama.cpp/src/llama-memory.cpp +17 -0
  42. package/src/llama.cpp/src/llama-memory.h +3 -0
  43. package/src/llama.cpp/src/llama-model.cpp +3373 -743
  44. package/src/llama.cpp/src/llama-model.h +20 -4
  45. package/src/llama.cpp/src/llama-quant.cpp +2 -2
  46. package/src/llama.cpp/src/llama-vocab.cpp +376 -10
  47. package/src/llama.cpp/src/llama-vocab.h +43 -0
  48. package/src/llama.cpp/src/unicode.cpp +207 -0
  49. package/src/llama.cpp/src/unicode.h +2 -0
  50. package/src/llama.cpp/ggml/include/ggml-kompute.h +0 -50
package/src/llama.cpp/src/llama-kv-cache-unified-iswa.cpp

@@ -18,16 +18,17 @@ llama_kv_cache_unified_iswa::llama_kv_cache_unified_iswa(
         bool v_trans,
         bool offload,
         bool swa_full,
+        bool unified,
         uint32_t kv_size,
         uint32_t n_seq_max,
         uint32_t n_ubatch,
-        uint32_t n_pad) : hparams(model.hparams) {
+        uint32_t n_pad) : hparams(model.hparams), unified(unified) {
     llama_kv_cache_unified::layer_filter_cb filter_base = [&](int32_t il) { return !model.hparams.is_swa(il); };
     llama_kv_cache_unified::layer_filter_cb filter_swa  = [&](int32_t il) { return  model.hparams.is_swa(il); };
 
     const uint32_t size_base = kv_size;
 
-    uint32_t size_swa = std::min(size_base, GGML_PAD(hparams.n_swa*n_seq_max + n_ubatch, n_pad));
+    uint32_t size_swa = std::min(size_base, GGML_PAD(hparams.n_swa*(unified ? n_seq_max : 1) + n_ubatch, n_pad));
 
     // when using full-size SWA cache, we set the SWA cache size to be equal to the base cache size
     if (swa_full) {
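
The SWA sizing change in this hunk is the behavioral core of the update: with per-sequence (non-unified) KV caches, the sliding-window cache holds one sequence's window rather than `n_seq_max` of them. A minimal sketch of the arithmetic, using the `GGML_PAD` round-up macro from ggml.h and illustrative parameter values (not taken from this diff):

    #include <algorithm>
    #include <cstdint>
    #include <cstdio>

    // ggml.h defines GGML_PAD as a round-up to a multiple of n
    #define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1))

    int main() {
        // illustrative values, not taken from this diff
        const uint32_t n_swa     = 4096;  // sliding-window width
        const uint32_t n_seq_max = 4;     // max parallel sequences
        const uint32_t n_ubatch  = 512;   // micro-batch size
        const uint32_t n_pad     = 256;   // cell-count alignment
        const uint32_t size_base = 32768; // base (non-SWA) cache size

        for (bool unified : {true, false}) {
            const uint32_t size_swa = std::min(size_base,
                (uint32_t) GGML_PAD(n_swa*(unified ? n_seq_max : 1) + n_ubatch, n_pad));
            std::printf("unified=%d -> size_swa=%u\n", (int) unified, size_swa);
            // unified=1 -> 16896 cells (4096*4 + 512, already 256-aligned)
            // unified=0 ->  4608 cells (4096*1 + 512)
        }
        return 0;
    }

In the non-unified case the SWA cache shrinks by roughly a factor of `n_seq_max`, which appears to be the memory saving the new flag is after.
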
@@ -41,14 +42,14 @@ llama_kv_cache_unified_iswa::llama_kv_cache_unified_iswa(
 
     kv_base = std::make_unique<llama_kv_cache_unified>(
             model, std::move(filter_base), type_k, type_v,
-            v_trans, offload, size_base, n_seq_max, n_pad,
+            v_trans, offload, unified, size_base, n_seq_max, n_pad,
             0, LLAMA_SWA_TYPE_NONE);
 
     LLAMA_LOG_INFO("%s: creating SWA KV cache, size = %u cells\n", __func__, size_swa);
 
     kv_swa = std::make_unique<llama_kv_cache_unified>(
             model, std::move(filter_swa), type_k, type_v,
-            v_trans, offload, size_swa, n_seq_max, n_pad,
+            v_trans, offload, unified, size_swa, n_seq_max, n_pad,
             hparams.n_swa, hparams.swa_type);
 }
 
@@ -100,6 +101,11 @@ llama_memory_context_ptr llama_kv_cache_unified_iswa::init_batch(llama_batch_all
 
     // first try simple split
     do {
+        if (!unified) {
+            // requires equal splits, so we skip the simple split
+            break;
+        }
+
         balloc.split_reset();
 
         std::vector<llama_ubatch> ubatches;
@@ -113,20 +119,25 @@ llama_memory_context_ptr llama_kv_cache_unified_iswa::init_batch(llama_batch_all
             ubatches.push_back(std::move(ubatch)); // NOLINT
         }
 
-        auto heads_base = kv_base->prepare(ubatches);
-        if (heads_base.empty()) {
+        if (balloc.get_n_used() < balloc.get_n_tokens()) {
+            // failed to find a suitable split
+            break;
+        }
+
+        auto sinfos_base = kv_base->prepare(ubatches);
+        if (sinfos_base.empty()) {
             break;
         }
 
-        auto heads_swa = kv_swa->prepare(ubatches);
-        if (heads_swa.empty()) {
+        auto sinfos_swa = kv_swa->prepare(ubatches);
+        if (sinfos_swa.empty()) {
             break;
         }
 
-        assert(heads_base.size() == heads_swa.size());
+        assert(sinfos_base.size() == sinfos_swa.size());
 
         return std::make_unique<llama_kv_cache_unified_iswa_context>(
-                this, std::move(heads_base), std::move(heads_swa), std::move(ubatches));
+                this, std::move(sinfos_base), std::move(sinfos_swa), std::move(ubatches));
     } while (false);
 
     // if it fails, try equal split
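
Both split paths now check `balloc.get_n_used() < balloc.get_n_tokens()` before preparing the caches: if the splitter placed fewer tokens than the batch contains, the split did not cover the whole batch, and the code falls through to the next strategy instead of preparing a partial one (the equal-split path below gains the identical guard). The rename from `heads_*` to `sinfos_*` tracks the changed return type of `llama_kv_cache_unified::prepare()`, which now returns slot metadata (`slot_info_vec_t`) rather than bare head offsets; see the header hunks at the end of this diff.
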
@@ -135,7 +146,7 @@ llama_memory_context_ptr llama_kv_cache_unified_iswa::init_batch(llama_batch_all
 
         std::vector<llama_ubatch> ubatches;
         while (true) {
-            auto ubatch = balloc.split_equal(n_ubatch);
+            auto ubatch = balloc.split_equal(n_ubatch, !unified);
 
             if (ubatch.n_tokens == 0) {
                 break;
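
`split_equal()` gains a second argument here, passed as `!unified`. The matching signature change lives in llama-batch.{cpp,h} (files 22 and 23 above) and is not shown in this diff; presumably the flag asks the splitter to keep each sequence in its own ubatch, so that per-sequence caches can place them independently.
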
@@ -144,20 +155,25 @@ llama_memory_context_ptr llama_kv_cache_unified_iswa::init_batch(llama_batch_all
             ubatches.push_back(std::move(ubatch)); // NOLINT
         }
 
-        auto heads_base = kv_base->prepare(ubatches);
-        if (heads_base.empty()) {
+        if (balloc.get_n_used() < balloc.get_n_tokens()) {
+            // failed to find a suitable split
             break;
         }
 
-        auto heads_swa = kv_swa->prepare(ubatches);
-        if (heads_swa.empty()) {
+        auto sinfos_base = kv_base->prepare(ubatches);
+        if (sinfos_base.empty()) {
             break;
         }
 
-        assert(heads_base.size() == heads_swa.size());
+        auto sinfos_swa = kv_swa->prepare(ubatches);
+        if (sinfos_swa.empty()) {
+            break;
+        }
+
+        assert(sinfos_base.size() == sinfos_swa.size());
 
         return std::make_unique<llama_kv_cache_unified_iswa_context>(
-                this, std::move(heads_base), std::move(heads_swa), std::move(ubatches));
+                this, std::move(sinfos_base), std::move(sinfos_swa), std::move(ubatches));
     } while (false);
 
     // TODO: if we fail again, we should attempt different splitting strategies
@@ -220,13 +236,13 @@ llama_kv_cache_unified_iswa_context::llama_kv_cache_unified_iswa_context(
 
 llama_kv_cache_unified_iswa_context::llama_kv_cache_unified_iswa_context(
         llama_kv_cache_unified_iswa * kv,
-        std::vector<uint32_t> heads_base,
-        std::vector<uint32_t> heads_swa,
+        slot_info_vec_t sinfos_base,
+        slot_info_vec_t sinfos_swa,
         std::vector<llama_ubatch> ubatches) :
     ubatches(std::move(ubatches)),
     // note: here we copy the ubatches. not sure if this is ideal
-    ctx_base(new llama_kv_cache_unified_context(kv->get_base(), std::move(heads_base), this->ubatches)),
-    ctx_swa (new llama_kv_cache_unified_context(kv->get_swa (), std::move(heads_swa), this->ubatches)),
+    ctx_base(new llama_kv_cache_unified_context(kv->get_base(), std::move(sinfos_base), this->ubatches)),
+    ctx_swa (new llama_kv_cache_unified_context(kv->get_swa (), std::move(sinfos_swa), this->ubatches)),
     status(llama_memory_status_combine(ctx_base->get_status(), ctx_swa->get_status())) {
 }
 
@@ -246,7 +262,7 @@ bool llama_kv_cache_unified_iswa_context::next() {
 }
 
 bool llama_kv_cache_unified_iswa_context::apply() {
-    assert(status == LLAMA_MEMORY_STATUS_SUCCESS);
+    assert(!llama_memory_status_is_fail(status));
 
     bool res = true;
 
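
Relaxing this assert lets `apply()` proceed for any non-failure status (for example a no-op update), not only `LLAMA_MEMORY_STATUS_SUCCESS`. The helper itself is added in llama-memory.{cpp,h} (files 41 and 42 above) and is not shown here; a sketch consistent with the `llama_memory_status` values in llama.h, assuming the two `FAILED_*` states are the only failures:

    // sketch, assuming FAILED_PREPARE and FAILED_COMPUTE are the only failure states
    bool llama_memory_status_is_fail(llama_memory_status status) {
        switch (status) {
            case LLAMA_MEMORY_STATUS_SUCCESS:
            case LLAMA_MEMORY_STATUS_NO_UPDATE:
                return false;
            case LLAMA_MEMORY_STATUS_FAILED_PREPARE:
            case LLAMA_MEMORY_STATUS_FAILED_COMPUTE:
                return true;
        }
        return false;
    }
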
package/src/llama.cpp/src/llama-kv-cache-unified-iswa.h

@@ -20,6 +20,7 @@ public:
             bool v_trans,
             bool offload,
             bool swa_full,
+            bool unified,
             uint32_t kv_size,
             uint32_t n_seq_max,
             uint32_t n_ubatch,
@@ -68,12 +69,16 @@ public:
 private:
     const llama_hparams & hparams;
 
+    const bool unified;
+
     std::unique_ptr<llama_kv_cache_unified> kv_base;
     std::unique_ptr<llama_kv_cache_unified> kv_swa;
 };
 
 class llama_kv_cache_unified_iswa_context : public llama_memory_context_i {
 public:
+    using slot_info_vec_t = llama_kv_cache_unified::slot_info_vec_t;
+
     // used for errors
     llama_kv_cache_unified_iswa_context(llama_memory_status status);
 
@@ -90,8 +95,8 @@ public:
 
     // used to create a batch processing context from a batch
     llama_kv_cache_unified_iswa_context(
             llama_kv_cache_unified_iswa * kv,
-            std::vector<uint32_t> heads_base,
-            std::vector<uint32_t> heads_swa,
+            slot_info_vec_t sinfos_base,
+            slot_info_vec_t sinfos_swa,
             std::vector<llama_ubatch> ubatches);
 
     virtual ~llama_kv_cache_unified_iswa_context();
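
The new `slot_info_vec_t` alias re-exports `llama_kv_cache_unified::slot_info_vec_t`, whose definition lives in llama-kv-cache-unified.h (file 36 above) and is not part of this diff. To make the signature change concrete: a bare `uint32_t` head can only name one contiguous placement, while a slot-info record can carry a richer cell assignment per ubatch. The shape below is hypothetical, for illustration only:

    #include <cstdint>
    #include <vector>

    // hypothetical sketch -- the real slot_info in llama-kv-cache-unified.h may differ;
    // the point is that one record per ubatch replaces one bare head offset
    struct slot_info {
        std::vector<uint32_t> idxs;  // KV cells assigned to the ubatch, possibly non-contiguous

        uint32_t head()  const { return idxs.empty() ? 0 : idxs[0]; }  // old-style head offset
        bool     empty() const { return idxs.empty(); }
    };

    using slot_info_vec_t = std::vector<slot_info>;
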