@fugood/llama.node 1.0.1 → 1.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. package/package.json +14 -14
  2. package/scripts/llama.cpp.patch +12 -12
  3. package/src/llama.cpp/CMakeLists.txt +0 -1
  4. package/src/llama.cpp/common/arg.cpp +17 -0
  5. package/src/llama.cpp/common/chat.cpp +37 -20
  6. package/src/llama.cpp/common/chat.h +2 -0
  7. package/src/llama.cpp/common/common.h +4 -0
  8. package/src/llama.cpp/ggml/CMakeLists.txt +7 -2
  9. package/src/llama.cpp/ggml/include/ggml-backend.h +1 -1
  10. package/src/llama.cpp/ggml/include/ggml-cpu.h +1 -0
  11. package/src/llama.cpp/ggml/include/ggml.h +181 -10
  12. package/src/llama.cpp/ggml/src/CMakeLists.txt +0 -1
  13. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +6 -1
  14. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +38 -1
  15. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +1 -0
  16. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +1297 -211
  17. package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +7 -0
  18. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +1 -1
  19. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +33 -9
  20. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +103 -9
  21. package/src/llama.cpp/include/llama.h +1 -0
  22. package/src/llama.cpp/src/llama-arch.cpp +108 -2
  23. package/src/llama.cpp/src/llama-arch.h +7 -0
  24. package/src/llama.cpp/src/llama-batch.cpp +27 -1
  25. package/src/llama.cpp/src/llama-batch.h +8 -1
  26. package/src/llama.cpp/src/llama-chat.cpp +15 -0
  27. package/src/llama.cpp/src/llama-chat.h +1 -0
  28. package/src/llama.cpp/src/llama-graph.cpp +95 -81
  29. package/src/llama.cpp/src/llama-graph.h +43 -16
  30. package/src/llama.cpp/src/llama-hparams.cpp +2 -1
  31. package/src/llama.cpp/src/llama-hparams.h +1 -0
  32. package/src/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +28 -18
  33. package/src/llama.cpp/src/llama-kv-cache-unified-iswa.h +4 -2
  34. package/src/llama.cpp/src/llama-kv-cache-unified.cpp +214 -65
  35. package/src/llama.cpp/src/llama-kv-cache-unified.h +62 -24
  36. package/src/llama.cpp/src/llama-kv-cells.h +62 -10
  37. package/src/llama.cpp/src/llama-memory-hybrid.cpp +9 -4
  38. package/src/llama.cpp/src/llama-memory-hybrid.h +3 -1
  39. package/src/llama.cpp/src/llama-memory-recurrent.cpp +34 -16
  40. package/src/llama.cpp/src/llama-memory.cpp +17 -0
  41. package/src/llama.cpp/src/llama-memory.h +3 -0
  42. package/src/llama.cpp/src/llama-model.cpp +1374 -210
  43. package/src/llama.cpp/src/llama-model.h +3 -0
  44. package/src/llama.cpp/src/llama-vocab.cpp +8 -1
  45. package/src/llama.cpp/ggml/include/ggml-kompute.h +0 -50
package/src/llama.cpp/src/llama-kv-cells.h
@@ -105,10 +105,30 @@ public:
         res.resize(n);
 
         for (uint32_t j = 0; j < n; ++j) {
-            res.pos[j] = pos[i + j];
-            res.seq[j] = seq[i + j];
+            const auto idx = i + j;
 
-            assert(shift[i + j] == 0);
+            res.pos[j] = pos[idx];
+            res.seq[j] = seq[idx];
+
+            assert(shift[idx] == 0);
+        }
+
+        return res;
+    }
+
+    // copy the state of cells [idxs[0], idxs[1], ..., idxs[idxs.size() - 1])
+    llama_kv_cells_unified cp(const std::vector<uint32_t> & idxs) const {
+        llama_kv_cells_unified res;
+
+        res.resize(idxs.size());
+
+        for (uint32_t j = 0; j < idxs.size(); ++j) {
+            const auto idx = idxs[j];
+
+            res.pos[j] = pos[idx];
+            res.seq[j] = seq[idx];
+
+            assert(shift[idx] == 0);
         }
 
         return res;
@@ -119,26 +139,58 @@ public:
         assert(i + other.pos.size() <= pos.size());
 
         for (uint32_t j = 0; j < other.pos.size(); ++j) {
-            if (pos[i + j] == -1 && other.pos[j] != -1) {
+            const auto idx = i + j;
+
+            if (pos[idx] == -1 && other.pos[j] != -1) {
                 used.insert(i + j);
             }
 
-            if (pos[i + j] != -1 && other.pos[j] == -1) {
+            if (pos[idx] != -1 && other.pos[j] == -1) {
                 used.erase(i + j);
             }
 
-            if (pos[i + j] != -1) {
+            if (pos[idx] != -1) {
                 seq_pos_rm(i + j);
             }
 
-            pos[i + j] = other.pos[j];
-            seq[i + j] = other.seq[j];
+            pos[idx] = other.pos[j];
+            seq[idx] = other.seq[j];
 
-            if (pos[i + j] != -1) {
+            if (pos[idx] != -1) {
                 seq_pos_add(i + j);
             }
 
-            assert(shift[i + j] == 0);
+            assert(shift[idx] == 0);
+        }
+    }
+
+    // set the state of cells [idxs[0], idxs[1], ..., idxs[idxs.size() - 1])
+    void set(const std::vector<uint32_t> & idxs, const llama_kv_cells_unified & other) {
+        assert(idxs.size() == other.pos.size());
+
+        for (uint32_t j = 0; j < other.pos.size(); ++j) {
+            const auto idx = idxs[j];
+
+            if (pos[idx] == -1 && other.pos[j] != -1) {
+                used.insert(idx);
+            }
+
+            if (pos[idx] != -1 && other.pos[j] == -1) {
+                used.erase(idx);
+            }
+
+            if (pos[idx] != -1) {
+                seq_pos_rm(idx);
+            }
+
+            pos[idx] = other.pos[j];
+            seq[idx] = other.seq[j];
+
+            if (pos[idx] != -1) {
+                seq_pos_add(idx);
+            }
+
+            assert(shift[idx] == 0);
         }
     }
 
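Note on the two hunks above: the new cp/set overloads take an explicit list of cell indices instead of a contiguous [i, i + n) range, which lets callers snapshot and restore a scattered set of KV cells. A minimal usage sketch follows (illustrative only, not part of the diff; the function, the cells object, and the index values are hypothetical, and llama-kv-cells.h is assumed to be included):

    // illustrative only: snapshot and restore a non-contiguous set of cells
    static void snapshot_restore_example(llama_kv_cells_unified & cells) {
        const std::vector<uint32_t> idxs = { 3, 7, 42, 100 };   // hypothetical scattered slots

        // copy the pos/seq state of exactly these cells
        llama_kv_cells_unified backup = cells.cp(idxs);

        // ... speculative modification of the cells would go here ...

        // write the saved state back into the same cells
        cells.set(idxs, backup);
    }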
package/src/llama.cpp/src/llama-memory-hybrid.cpp
@@ -70,7 +70,7 @@ llama_memory_context_ptr llama_memory_hybrid::init_batch(llama_batch_allocr & ba
                 // if all tokens are output, split by sequence
                 ubatch = balloc.split_seq(n_ubatch);
             } else {
-                ubatch = balloc.split_equal(n_ubatch);
+                ubatch = balloc.split_equal(n_ubatch, false);
             }
 
             if (ubatch.n_tokens == 0) {
@@ -80,6 +80,11 @@ llama_memory_context_ptr llama_memory_hybrid::init_batch(llama_batch_allocr & ba
             ubatches.push_back(std::move(ubatch)); // NOLINT
         }
 
+        if (balloc.get_n_used() < balloc.get_n_tokens()) {
+            // failed to find a suitable split
+            break;
+        }
+
         // prepare the recurrent batches first
         if (!mem_recr->prepare(ubatches)) {
             // TODO: will the recurrent cache be in an undefined context at this point?
@@ -195,11 +200,11 @@ llama_memory_hybrid_context::llama_memory_hybrid_context(
 
 llama_memory_hybrid_context::llama_memory_hybrid_context(
         llama_memory_hybrid * mem,
-        std::vector<uint32_t> heads_attn,
+        slot_info_vec_t sinfos_attn,
         std::vector<llama_ubatch> ubatches) :
     ubatches(std::move(ubatches)),
     // note: here we copy the ubatches. not sure if this is ideal
-    ctx_attn(new llama_kv_cache_unified_context(mem->get_mem_attn(), std::move(heads_attn), this->ubatches)),
+    ctx_attn(new llama_kv_cache_unified_context(mem->get_mem_attn(), std::move(sinfos_attn), this->ubatches)),
    ctx_recr(new llama_memory_recurrent_context(mem->get_mem_recr(), this->ubatches)),
    status(llama_memory_status_combine(ctx_attn->get_status(), ctx_recr->get_status())) {
 }
@@ -218,7 +223,7 @@ bool llama_memory_hybrid_context::next() {
 }
 
 bool llama_memory_hybrid_context::apply() {
-    assert(status == LLAMA_MEMORY_STATUS_SUCCESS);
+    assert(!llama_memory_status_is_fail(status));
 
     bool res = true;
 
package/src/llama.cpp/src/llama-memory-hybrid.h
@@ -92,6 +92,8 @@ private:
 
 class llama_memory_hybrid_context : public llama_memory_context_i {
 public:
+    using slot_info_vec_t = llama_kv_cache_unified::slot_info_vec_t;
+
     // init failure
     explicit llama_memory_hybrid_context(llama_memory_status status);
 
@@ -107,7 +109,7 @@ public:
     // init success
     llama_memory_hybrid_context(
             llama_memory_hybrid * mem,
-            std::vector<uint32_t> heads_attn,
+            slot_info_vec_t sinfos_attn,
             std::vector<llama_ubatch> ubatches);
 
     ~llama_memory_hybrid_context() = default;
package/src/llama.cpp/src/llama-memory-recurrent.cpp
@@ -363,30 +363,40 @@ llama_pos llama_memory_recurrent::seq_pos_max(llama_seq_id seq_id) const {
 }
 
 llama_memory_context_ptr llama_memory_recurrent::init_batch(llama_batch_allocr & balloc, uint32_t n_ubatch, bool embd_all) {
-    std::vector<llama_ubatch> ubatches;
+    do {
+        balloc.split_reset();
 
-    while (true) {
-        llama_ubatch ubatch;
+        std::vector<llama_ubatch> ubatches;
+        while (true) {
+            llama_ubatch ubatch;
 
-        if (embd_all) {
-            // if all tokens are output, split by sequence
-            ubatch = balloc.split_seq(n_ubatch);
-        } else {
-            ubatch = balloc.split_equal(n_ubatch);
+            if (embd_all) {
+                // if all tokens are output, split by sequence
+                ubatch = balloc.split_seq(n_ubatch);
+            } else {
+                ubatch = balloc.split_equal(n_ubatch, false);
+            }
+
+            if (ubatch.n_tokens == 0) {
+                break;
+            }
+
+            ubatches.push_back(std::move(ubatch)); // NOLINT
        }
 
-        if (ubatch.n_tokens == 0) {
+        if (balloc.get_n_used() < balloc.get_n_tokens()) {
+            // failed to find a suitable split
            break;
        }
 
-        ubatches.push_back(std::move(ubatch)); // NOLINT
-    }
+        if (!prepare(ubatches)) {
+            break;
+        }
 
-    if (!prepare(ubatches)) {
-        return std::make_unique<llama_memory_recurrent_context>(LLAMA_MEMORY_STATUS_FAILED_PREPARE);
-    }
+        return std::make_unique<llama_memory_recurrent_context>(this, std::move(ubatches));
+    } while (false);
 
-    return std::make_unique<llama_memory_recurrent_context>(this, std::move(ubatches));
+    return std::make_unique<llama_memory_recurrent_context>(LLAMA_MEMORY_STATUS_FAILED_PREPARE);
 }
 
 llama_memory_context_ptr llama_memory_recurrent::init_full() {
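For readers of the hunk above: init_batch is rewritten around a do { ... } while (false) block so that every failure path (tokens left unplaced per balloc.get_n_used(), or a failed prepare()) can simply break to the single FAILED_PREPARE return at the bottom. A stripped-down illustration of that idiom follows (hypothetical stand-in types and names, not the actual function):

    #include <memory>

    // hypothetical sketch of the early-exit idiom used by init_batch above
    static std::unique_ptr<int> init_sketch(bool split_ok, bool prepare_ok) {
        do {
            if (!split_ok) {
                break;  // failed to find a suitable split
            }

            if (!prepare_ok) {
                break;  // failed to reserve the required slots
            }

            return std::make_unique<int>(1);  // success: return a real context
        } while (false);

        return nullptr;  // single failure exit, analogous to LLAMA_MEMORY_STATUS_FAILED_PREPARE
    }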
@@ -1066,7 +1076,15 @@ bool llama_memory_recurrent_context::next() {
 }
 
 bool llama_memory_recurrent_context::apply() {
-    assert(status == LLAMA_MEMORY_STATUS_SUCCESS);
+    assert(!llama_memory_status_is_fail(status));
+
+    // no ubatches -> this is an update
+    if (ubatches.empty()) {
+        // recurrent cache never performs updates
+        assert(status == LLAMA_MEMORY_STATUS_NO_UPDATE);
+
+        return true;
+    }
 
     mem->find_slot(ubatches[i_next]);
 
package/src/llama.cpp/src/llama-memory.cpp
@@ -40,3 +40,20 @@ llama_memory_status llama_memory_status_combine(llama_memory_status s0, llama_me
     // if either status has an update, then the combined status has an update
     return has_update ? LLAMA_MEMORY_STATUS_SUCCESS : LLAMA_MEMORY_STATUS_NO_UPDATE;
 }
+
+bool llama_memory_status_is_fail(llama_memory_status status) {
+    switch (status) {
+        case LLAMA_MEMORY_STATUS_SUCCESS:
+        case LLAMA_MEMORY_STATUS_NO_UPDATE:
+            {
+                return false;
+            }
+        case LLAMA_MEMORY_STATUS_FAILED_PREPARE:
+        case LLAMA_MEMORY_STATUS_FAILED_COMPUTE:
+            {
+                return true;
+            }
+    }
+
+    return false;
+}
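The new helper gives every apply() implementation a single place to decide what counts as a failure: SUCCESS and NO_UPDATE remain usable states, while FAILED_PREPARE and FAILED_COMPUTE do not. A minimal caller sketch (hypothetical function, assuming the declaration from llama-memory.h shown in the next hunk):

    // hypothetical caller: skip work when the memory context reports a failed status
    static bool can_apply(llama_memory_status status) {
        if (llama_memory_status_is_fail(status)) {
            return false;  // LLAMA_MEMORY_STATUS_FAILED_PREPARE or LLAMA_MEMORY_STATUS_FAILED_COMPUTE
        }

        return true;       // LLAMA_MEMORY_STATUS_SUCCESS or LLAMA_MEMORY_STATUS_NO_UPDATE
    }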
package/src/llama.cpp/src/llama-memory.h
@@ -31,6 +31,9 @@ enum llama_memory_status {
 // useful for implementing hybrid memory types (e.g. iSWA)
 llama_memory_status llama_memory_status_combine(llama_memory_status s0, llama_memory_status s1);
 
+// helper function for checking if a memory status indicates a failure
+bool llama_memory_status_is_fail(llama_memory_status status);
+
 // the interface for managing the memory context during batch processing
 // this interface is implemented per memory type. see:
 //   - llama_kv_cache_unified_context