@fugood/llama.node 1.0.3 → 1.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +14 -14
- package/src/llama.cpp/common/CMakeLists.txt +4 -5
- package/src/llama.cpp/common/arg.cpp +37 -0
- package/src/llama.cpp/common/common.cpp +22 -6
- package/src/llama.cpp/common/common.h +14 -1
- package/src/llama.cpp/ggml/CMakeLists.txt +3 -0
- package/src/llama.cpp/ggml/include/ggml-webgpu.h +19 -0
- package/src/llama.cpp/ggml/include/ggml.h +13 -0
- package/src/llama.cpp/ggml/src/CMakeLists.txt +1 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +343 -1094
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +23 -8
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +3 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +39 -0
- package/src/llama.cpp/include/llama.h +13 -48
- package/src/llama.cpp/src/llama-arch.cpp +222 -15
- package/src/llama.cpp/src/llama-arch.h +16 -1
- package/src/llama.cpp/src/llama-batch.cpp +76 -70
- package/src/llama.cpp/src/llama-batch.h +24 -18
- package/src/llama.cpp/src/llama-chat.cpp +44 -1
- package/src/llama.cpp/src/llama-chat.h +2 -0
- package/src/llama.cpp/src/llama-context.cpp +134 -95
- package/src/llama.cpp/src/llama-context.h +13 -16
- package/src/llama.cpp/src/llama-cparams.h +3 -2
- package/src/llama.cpp/src/llama-graph.cpp +239 -154
- package/src/llama.cpp/src/llama-graph.h +162 -126
- package/src/llama.cpp/src/llama-hparams.cpp +45 -0
- package/src/llama.cpp/src/llama-hparams.h +11 -1
- package/src/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +11 -5
- package/src/llama.cpp/src/llama-kv-cache-unified-iswa.h +3 -0
- package/src/llama.cpp/src/llama-kv-cache-unified.cpp +698 -302
- package/src/llama.cpp/src/llama-kv-cache-unified.h +89 -31
- package/src/llama.cpp/src/llama-memory-hybrid.cpp +1 -0
- package/src/llama.cpp/src/llama-memory-recurrent.cpp +6 -9
- package/src/llama.cpp/src/llama-model.cpp +2309 -665
- package/src/llama.cpp/src/llama-model.h +18 -4
- package/src/llama.cpp/src/llama-quant.cpp +2 -2
- package/src/llama.cpp/src/llama-vocab.cpp +368 -9
- package/src/llama.cpp/src/llama-vocab.h +43 -0
- package/src/llama.cpp/src/unicode.cpp +207 -0
- package/src/llama.cpp/src/unicode.h +2 -0
--- package/src/llama.cpp/src/llama-graph.h (1.0.3)
+++ package/src/llama.cpp/src/llama-graph.h (1.0.4)
@@ -1,6 +1,7 @@
 #pragma once
 
 #include "llama-arch.h"
+#include "llama-batch.h"
 #include "llama-hparams.h"
 #include "llama-adapter.h"
 
@@ -14,7 +15,6 @@ struct ggml_cgraph;
 struct ggml_context;
 struct ggml_tensor;
 
-struct llama_ubatch;
 struct llama_cparams;
 
 struct llama_memory_context_i;
@@ -69,6 +69,8 @@ struct llama_cross {
     std::vector<std::set<llama_seq_id>> seq_ids_enc;
 };
 
+struct llm_graph_params;
+
 //
 // llm_graph_input
 //
@@ -78,11 +80,19 @@ public:
     virtual ~llm_graph_input_i() = default;
 
     virtual void set_input(const llama_ubatch * ubatch) = 0;
+
+    // return true if the resulting input tensors using the provided graph parameters would be
+    // the same as the previous input tensors that we have currently stored in the object
+    virtual bool can_reuse(const llm_graph_params & params) {
+        // returning false here by default will prevent from reusing the graph if the check
+        // for the input type has not been implemented yet
+        GGML_UNUSED(params);
+        return false;
+    }
 };
 
 using llm_graph_input_ptr = std::unique_ptr<llm_graph_input_i>;
 
-
 class llm_graph_input_embd : public llm_graph_input_i {
 public:
     llm_graph_input_embd() = default;
@@ -90,6 +100,8 @@ public:
 
     void set_input(const llama_ubatch * ubatch) override;
 
+    bool can_reuse(const llm_graph_params & params) override;
+
     ggml_tensor * tokens = nullptr; // I32 [n_batch]
     ggml_tensor * embd = nullptr; // F32 [n_embd, n_batch]
 };
@@ -101,6 +113,8 @@ public:
 
     void set_input(const llama_ubatch * ubatch) override;
 
+    bool can_reuse(const llm_graph_params & params) override;
+
     ggml_tensor * pos = nullptr; // I32 [n_batch]
 
     const uint32_t n_pos_per_embd = 1;
@@ -154,17 +168,19 @@ public:
     llm_graph_input_out_ids(
             const llama_hparams & hparams,
             const llama_cparams & cparams,
-            int32_t n_outputs) : hparams(hparams), cparams(cparams), n_outputs(n_outputs) {}
+            uint32_t n_outputs) : hparams(hparams), cparams(cparams), n_outputs(n_outputs) {}
     virtual ~llm_graph_input_out_ids() = default;
 
     void set_input(const llama_ubatch * ubatch) override;
 
+    bool can_reuse(const llm_graph_params & params) override;
+
     ggml_tensor * out_ids; // I32 [n_outputs]
 
     const llama_hparams & hparams;
     const llama_cparams & cparams;
 
-    const int32_t n_outputs;
+    const uint32_t n_outputs;
 };
 
 class llm_graph_input_mean : public llm_graph_input_i {
@@ -249,16 +265,18 @@ public:
 
     void set_input(const llama_ubatch * ubatch) override;
 
+    bool can_reuse(const llm_graph_params & params) override;
+
     ggml_tensor * get_k_idxs() const { return self_k_idxs; }
     ggml_tensor * get_v_idxs() const { return self_v_idxs; }
 
     ggml_tensor * get_kq_mask() const { return self_kq_mask_cnv; }
 
     ggml_tensor * self_k_idxs = nullptr; // I64 [n_batch]
-    ggml_tensor * self_v_idxs = nullptr; // I64 [n_batch]
+    ggml_tensor * self_v_idxs = nullptr; // I64 [n_batch] or [n_batch*n_embd_v_gqa]
 
-    ggml_tensor * self_kq_mask = nullptr; // F32 [n_kv, n_batch, 1, 1]
-    ggml_tensor * self_kq_mask_cnv = nullptr; //     [n_kv, n_batch, 1, 1]
+    ggml_tensor * self_kq_mask = nullptr; // F32 [n_kv, n_batch/n_stream, 1, n_stream]
+    ggml_tensor * self_kq_mask_cnv = nullptr; //     [n_kv, n_batch/n_stream, 1, n_stream]
 
     const llama_hparams & hparams;
     const llama_cparams & cparams;
@@ -280,6 +298,8 @@ public:
 
     void set_input(const llama_ubatch * ubatch) override;
 
+    bool can_reuse(const llm_graph_params & params) override;
+
     ggml_tensor * get_k_idxs() const { return self_k_idxs; }
     ggml_tensor * get_v_idxs() const { return self_v_idxs; }
     ggml_tensor * get_k_idxs_swa() const { return self_k_idxs_swa; }
@@ -289,14 +309,14 @@ public:
     ggml_tensor * get_kq_mask_swa() const { return self_kq_mask_swa_cnv; }
 
     ggml_tensor * self_k_idxs = nullptr; // I64 [n_batch]
-    ggml_tensor * self_v_idxs = nullptr; // I64 [n_batch]
+    ggml_tensor * self_v_idxs = nullptr; // I64 [n_batch] or [n_batch*n_embd_v_gqa]
     ggml_tensor * self_k_idxs_swa = nullptr; // I64 [n_batch]
-    ggml_tensor * self_v_idxs_swa = nullptr; // I64 [n_batch]
+    ggml_tensor * self_v_idxs_swa = nullptr; // I64 [n_batch] or [n_batch*n_embd_v_gqa]
 
-    ggml_tensor * self_kq_mask = nullptr; // F32 [n_kv, n_batch, 1, 1]
-    ggml_tensor * self_kq_mask_cnv = nullptr; //     [n_kv, n_batch, 1, 1]
-    ggml_tensor * self_kq_mask_swa = nullptr; // F32 [n_kv, n_batch, 1, 1]
-    ggml_tensor * self_kq_mask_swa_cnv = nullptr; //     [n_kv, n_batch, 1, 1]
+    ggml_tensor * self_kq_mask = nullptr; // F32 [n_kv, n_batch/n_stream, 1, n_stream]
+    ggml_tensor * self_kq_mask_cnv = nullptr; //     [n_kv, n_batch/n_stream, 1, n_stream]
+    ggml_tensor * self_kq_mask_swa = nullptr; // F32 [n_kv, n_batch/n_stream, 1, n_stream]
+    ggml_tensor * self_kq_mask_swa_cnv = nullptr; //     [n_kv, n_batch/n_stream, 1, n_stream]
 
     const llama_hparams & hparams;
     const llama_cparams & cparams;
@@ -322,47 +342,25 @@ public:
 class llm_graph_input_mem_hybrid : public llm_graph_input_i {
 public:
     llm_graph_input_mem_hybrid(
-            const llama_hparams & hparams,
-            const llama_cparams & cparams,
-            const llama_memory_hybrid_context * mctx) :
-        hparams(hparams),
-        cparams(cparams),
-        mctx(mctx) {
-    }
+            std::unique_ptr<llm_graph_input_attn_kv_unified> inp_attn,
+            std::unique_ptr<llm_graph_input_rs> inp_rs,
+            const llama_memory_hybrid_context * mctx) :
+        inp_attn(std::move(inp_attn)),
+        inp_rs(std::move(inp_rs)),
+        mctx(mctx) { }
     virtual ~llm_graph_input_mem_hybrid() = default;
 
     void set_input(const llama_ubatch * ubatch) override;
 
-    ggml_tensor * s_copy; // I32 [kv_size]
-
-    ggml_tensor * get_k_idxs() const { return self_k_idxs; }
-    ggml_tensor * get_v_idxs() const { return self_v_idxs; }
-
-    ggml_tensor * get_kq_mask() const { return self_kq_mask_cnv; }
-
-    ggml_tensor * self_k_idxs = nullptr; // I64 [n_batch]
-    ggml_tensor * self_v_idxs = nullptr; // I64 [n_batch]
-
-    ggml_tensor * self_kq_mask = nullptr; // F32 [n_kv, n_batch, 1, 1]
-    ggml_tensor * self_kq_mask_cnv = nullptr; //     [n_kv, n_batch, 1, 1]
+    std::unique_ptr<llm_graph_input_attn_kv_unified> inp_attn;
+    std::unique_ptr<llm_graph_input_rs> inp_rs;
 
-    const llama_hparams & hparams;
-    const llama_cparams & cparams;
+    llm_graph_input_attn_kv_unified * get_attn() const { return inp_attn.get(); }
+    llm_graph_input_rs * get_recr() const { return inp_rs.get(); }
 
     const llama_memory_hybrid_context * mctx;
 };
 
-// TODO: remove this when ggml_scale_add is implemented
-class llm_graph_input_one : public llm_graph_input_i {
-public:
-    llm_graph_input_one() {}
-    virtual ~llm_graph_input_one() = default;
-
-    void set_input(const llama_ubatch * ubatch) override;
-
-    ggml_tensor * one = nullptr; // F32
-};
-
 //
 // llm_graph_result
 //
@@ -373,40 +371,108 @@ public:
 // along with the input tensors, the object also provides commonly used outputs tensors, such as logits, embeddings, etc.
 // these are used by the llama_context to extact the relevant data, based on the compute parameters
 
-class llm_graph_result_i {
-public:
-    virtual ~llm_graph_result_i() = default;
+// callback that allows us to apply custom logic to each tensor (e.g. ggml-alloc, offloading, etc.)
+using llm_graph_cb = std::function<void(const llama_ubatch & ubatch, ggml_tensor * cur, const char * name, int il)>;
 
-    virtual ggml_tensor * get_tokens() = 0;
-    virtual ggml_tensor * get_logits() = 0;
-    virtual ggml_tensor * get_embd() = 0;
-    virtual ggml_tensor * get_embd_pooled() = 0;
+class llm_graph_result;
 
-    virtual void set_inputs(const llama_ubatch * ubatch) = 0;
-};
+struct llm_graph_params {
+    llm_arch arch = LLM_ARCH_UNKNOWN;
 
-using llm_graph_result_ptr = std::unique_ptr<llm_graph_result_i>;
+    llama_hparams hparams;
+    llama_cparams cparams;
 
+    llama_ubatch ubatch; // note: intentionally make a copy
 
-class llm_graph_result : public llm_graph_result_i {
-public:
-    virtual ~llm_graph_result() = default;
+    llm_graph_type gtype;
 
-    ggml_tensor * get_tokens() override { return t_tokens; }
-    ggml_tensor * get_logits() override { return t_logits; }
-    ggml_tensor * get_embd() override { return t_embd; }
-    ggml_tensor * get_embd_pooled() override { return t_embd_pooled; }
+    ggml_backend_sched_t sched;
+    ggml_backend_t backend_cpu;
 
-    void set_inputs(const llama_ubatch * ubatch) override {
-        for (auto & input : inputs) {
-            input->set_input(ubatch);
+    const llama_adapter_cvec * cvec;
+    const llama_adapter_loras * loras;
+    const llama_memory_context_i * mctx;
+    const llama_cross * cross;
+
+    uint32_t n_outputs;
+
+    llm_graph_cb cb;
+
+    llm_graph_result * res;
+
+    // return true if the "other" params would result in a graph with the same topology as with the current params
+    // having the same topology allows us to reuse the graph in some cases
+    bool allow_reuse(const llm_graph_params & other) const {
+        // first check the ubatch
+        bool can_reuse_ubatch =
+            ubatch.equal_seqs() == other.ubatch.equal_seqs() &&
+            ubatch.n_tokens == other.ubatch.n_tokens &&
+            ubatch.n_seq_tokens == other.ubatch.n_seq_tokens &&
+            ubatch.n_seqs == other.ubatch.n_seqs &&
+            ubatch.n_seqs_unq == other.ubatch.n_seqs_unq &&
+            (
+                (!ubatch.token && !other.ubatch.token) ||
+                (!ubatch.embd && !other.ubatch.embd)
+            );
+
+        if (can_reuse_ubatch && !ubatch.equal_seqs()) {
+            if (!ubatch.data) {
+                // if the old ubatch does not own it's data, then we cannot guarantee that it is still alive, and
+                // therefore we cannot perform the sequence id check. normally should never happen
+                can_reuse_ubatch = false;
+            } else {
+                for (uint32_t s = 0; s < ubatch.n_seqs_unq; ++s) {
+                    can_reuse_ubatch &= ubatch.seq_id_unq[s] == other.ubatch.seq_id_unq[s];
+                }
+            }
+        }
+
+        if (!can_reuse_ubatch) {
+            return false;
         }
-    }
 
-    llm_graph_input_i * add_input(llm_graph_input_ptr input) {
-        inputs.emplace_back(std::move(input));
-        return inputs.back().get();
+        return
+            cparams.embeddings == other.cparams.embeddings &&
+            cparams.causal_attn == other.cparams.causal_attn &&
+            arch == other.arch &&
+            gtype == other.gtype &&
+            cvec == other.cvec &&
+            loras == other.loras &&
+            cross == other.cross &&
+            n_outputs == other.n_outputs;
     }
+};
+
+class llm_graph_result {
+public:
+    llm_graph_result(int64_t max_nodes);
+
+    virtual ~llm_graph_result() = default;
+
+    ggml_tensor * get_tokens() const { return t_tokens; }
+    ggml_tensor * get_logits() const { return t_logits; }
+    ggml_tensor * get_embd() const { return t_embd; }
+    ggml_tensor * get_embd_pooled() const { return t_embd_pooled; }
+
+    ggml_cgraph * get_gf() const { return gf; }
+    ggml_context * get_ctx() const { return ctx_compute.get(); }
+
+    int64_t get_max_nodes() const;
+
+    void reset();
+
+    void set_inputs(const llama_ubatch * ubatch);
+
+    // try to update the existing graph result using the new graph parameters in order to reuse it
+    // this can only be done if we determine that the resulting graph using the new graph parameters
+    // would be identical to the existing graph. in that case, we simply have to update the memory
+    // contexts of the input tensors of the graph and we can reuse it for another computation
+    // return true if the graph was updated and can be reused
+    bool can_reuse(const llm_graph_params & params);
+
+    llm_graph_input_i * add_input(llm_graph_input_ptr input);
+
+    void set_params(const llm_graph_params & params);
 
     // important graph nodes
     ggml_tensor * t_tokens = nullptr;
@@ -415,36 +481,31 @@ public:
     ggml_tensor * t_embd_pooled = nullptr;
 
     std::vector<llm_graph_input_ptr> inputs;
-};
 
-//
-// llm_graph_context
-//
+    ggml_context_ptr ctx_compute;
 
-// callback that allows us to apply custom logic to each tensor (e.g. ggml-alloc, offloading, etc.)
-using llm_graph_cb = std::function<void(const llama_ubatch & ubatch, ggml_tensor * cur, const char * name, int il)>;
+    // memory buffers used to evaluate the model
+    std::vector<uint8_t> buf_compute_meta;
 
-struct llm_graph_params {
-    ggml_context * ctx;
+    ggml_cgraph * gf;
 
-    const llm_arch arch;
+    int64_t max_nodes;
 
-    const llama_hparams & hparams;
-    const llama_cparams & cparams;
-    const llama_ubatch & ubatch;
+private:
+    // keep a copy of the previous graph parameters
+    // we will use this to determine whether the graph can be reused by comparing them with the new parameters
+    // note: these are updated after constructing the new graph
+    llm_graph_params params;
 
-    ggml_backend_sched_t sched;
-    ggml_backend_t backend_cpu;
-
-    const llama_adapter_cvec * cvec;
-    const llama_adapter_loras * loras;
-    const llama_memory_context_i * mctx;
-    const llama_cross * cross;
+    // env: LLAMA_GRAPH_RESULT_DEBUG
+    int debug = 0;
+};
 
-    uint32_t n_outputs;
+using llm_graph_result_ptr = std::unique_ptr<llm_graph_result>;
 
-    const llm_graph_cb & cb;
-};
+//
+// llm_graph_context
+//
 
 // used in build_rs to properly order writes and avoid unnecessary copies
 using llm_graph_get_rows_fn = std::function<ggml_tensor * (ggml_context *, ggml_tensor * states, ggml_tensor * ids)>;
@@ -485,8 +546,6 @@ struct llm_graph_context {
     const enum llama_pooling_type pooling_type;
     const enum llama_rope_type rope_type;
 
-    ggml_context * ctx0 = nullptr;
-
     ggml_backend_sched_t sched;
 
     ggml_backend_t backend_cpu; // TODO: needed by build_attn_mha, figure out a way to remove?
@@ -498,7 +557,10 @@ struct llm_graph_context {
 
     const llm_graph_cb & cb_func;
 
-    std::unique_ptr<llm_graph_result> res;
+    llm_graph_result * res;
+
+    ggml_context * ctx0 = nullptr;
+    ggml_cgraph * gf = nullptr;
 
     llm_graph_context(const llm_graph_params & params);
     virtual ~llm_graph_context() = default;
@@ -579,14 +641,11 @@ struct llm_graph_context {
     ggml_tensor * build_inp_pos_bucket_dec() const;
     ggml_tensor * build_pos_bias(ggml_tensor * pos_bucket, ggml_tensor * attn_rel_b) const;
 
-    llm_graph_input_mem_hybrid * build_inp_mem_hybrid() const;
-
     //
     // attention
     //
 
     ggml_tensor * build_attn_mha(
-            ggml_cgraph * gf,
             ggml_tensor * q, // [n_embd_head_q, n_head_q, n_tokens]
             ggml_tensor * k, // [n_embd_head_k, n_head_k, n_tokens]
             ggml_tensor * v, // [n_embd_head_v, n_head_v, n_tokens] (v_trans == false)
@@ -599,7 +658,6 @@ struct llm_graph_context {
 
     ggml_tensor * build_attn(
             llm_graph_input_attn_no_cache * inp,
-            ggml_cgraph * gf,
            ggml_tensor * wo,
             ggml_tensor * wo_b,
             ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens]
@@ -614,7 +672,6 @@ struct llm_graph_context {
 
     ggml_tensor * build_attn(
             llm_graph_input_attn_kv_unified * inp,
-            ggml_cgraph * gf,
             ggml_tensor * wo,
             ggml_tensor * wo_b,
             ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens]
@@ -630,7 +687,6 @@ struct llm_graph_context {
     // note: if k_cur or v_cur are not provided, they will not be stored in the memory
     ggml_tensor * build_attn(
             llm_graph_input_attn_kv_unified_iswa * inp,
-            ggml_cgraph * gf,
             ggml_tensor * wo,
             ggml_tensor * wo_b,
             ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens]
@@ -645,7 +701,6 @@ struct llm_graph_context {
 
     ggml_tensor * build_attn(
             llm_graph_input_attn_cross * inp,
-            ggml_cgraph * gf,
             ggml_tensor * wo,
             ggml_tensor * wo_b,
             ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens]
@@ -656,18 +711,6 @@ struct llm_graph_context {
             float kq_scale,
             int il) const;
 
-    ggml_tensor * build_attn(
-            llm_graph_input_mem_hybrid * inp,
-            ggml_cgraph * gf,
-            ggml_tensor * wo,
-            ggml_tensor * wo_b,
-            ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens]
-            ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens]
-            ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens]
-            ggml_tensor * kq_b,
-            ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
-            float kq_scale,
-            int il) const;
     //
     // recurrent
     //
@@ -679,7 +722,6 @@ struct llm_graph_context {
     // implementation in 2 separate methods. the goal is to avoid calling `ggml_build_forward_expand` in
     // `llama_memory_recurrent`
     ggml_tensor * build_rs(
-            ggml_cgraph * gf,
             ggml_tensor * s,
             ggml_tensor * state_copy,
             int32_t state_size,
@@ -694,15 +736,6 @@ struct llm_graph_context {
 
     ggml_tensor * build_rs(
             llm_graph_input_rs * inp,
-            ggml_cgraph * gf,
-            ggml_tensor * s,
-            int32_t state_size,
-            int32_t n_seqs,
-            const llm_graph_get_rows_fn & get_state_rows = ggml_get_rows) const;
-
-    ggml_tensor * build_rs(
-            llm_graph_input_mem_hybrid * inp,
-            ggml_cgraph * gf,
             ggml_tensor * s,
             int32_t state_size,
             int32_t n_seqs,
@@ -710,21 +743,24 @@ struct llm_graph_context {
 
     ggml_tensor * build_rwkv_token_shift_load(
             llm_graph_input_rs * inp,
-            ggml_cgraph * gf,
             const llama_ubatch & ubatch,
-                    int il) const;
+            int il) const;
 
     ggml_tensor * build_rwkv_token_shift_store(
             ggml_tensor * token_shift,
             const llama_ubatch & ubatch,
             int il) const;
+    //
+    // hybrid
+    //
+
+    llm_graph_input_mem_hybrid * build_inp_mem_hybrid() const;
 
     //
     // pooling
     //
 
     void build_pooling(
-            ggml_cgraph * gf,
             ggml_tensor * cls,
             ggml_tensor * cls_b,
             ggml_tensor * cls_out,
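Note on the llama-graph.h changes above: graph construction now appears to snapshot the llm_graph_params a graph was built with inside llm_graph_result, and on the next call llm_graph_params::allow_reuse() plus the per-input can_reuse() overrides decide whether the stored graph can be kept with only its inputs refreshed. A minimal standalone sketch of that comparison pattern, using hypothetical stand-in types (graph_params/graph_result) rather than the real llama.cpp internals:

// Illustrative sketch only: stand-in types loosely mimicking the reuse check visible in
// llama-graph.h above. graph_params/graph_result are hypothetical names, not llama.cpp API.
#include <cstdint>

struct graph_params {
    uint32_t n_tokens   = 0;
    uint32_t n_outputs  = 0;
    bool     embeddings = false;

    // same "topology" of work => a previously built graph could be reused
    bool allow_reuse(const graph_params & other) const {
        return n_tokens   == other.n_tokens  &&
               n_outputs  == other.n_outputs &&
               embeddings == other.embeddings;
    }
};

struct graph_result {
    graph_params params; // copy of the params the graph was built with

    // analogue of llm_graph_result::can_reuse(): refresh inputs instead of rebuilding
    bool can_reuse(const graph_params & next) {
        if (!params.allow_reuse(next)) {
            return false; // topology changed -> the caller has to rebuild the graph
        }
        params = next;    // remember the latest params for the next comparison
        return true;      // the caller only has to set the new input data
    }
};

int main() {
    graph_result res;
    res.params = {512, 1, false};

    const graph_params next = {512, 1, false};
    return res.can_reuse(next) ? 0 : 1; // 0: the "graph" would be reused
}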
--- package/src/llama.cpp/src/llama-hparams.cpp (1.0.3)
+++ package/src/llama.cpp/src/llama-hparams.cpp (1.0.4)
@@ -65,12 +65,57 @@ uint32_t llama_hparams::n_embd_v_gqa(uint32_t il) const {
     return n_embd_head_v * n_head_kv;
 }
 
+bool llama_hparams::is_n_embd_k_gqa_variable() const {
+    const uint32_t val = n_embd_k_gqa();
+    for (uint32_t il = 0; il < n_layer; ++il) {
+        if (val != n_embd_k_gqa(il)) {
+            return true;
+        }
+    }
+
+    return false;
+}
+
+bool llama_hparams::is_n_embd_v_gqa_variable() const {
+    const uint32_t val = n_embd_v_gqa();
+    for (uint32_t il = 0; il < n_layer; ++il) {
+        if (val != n_embd_v_gqa(il)) {
+            return true;
+        }
+    }
+
+    return false;
+}
+
+uint32_t llama_hparams::n_embd_k_gqa_max() const {
+    uint32_t val = n_embd_k_gqa();
+    for (uint32_t il = 0; il < n_layer; ++il) {
+        val = std::max(val, n_embd_k_gqa(il));
+    }
+
+    return val;
+}
+
+uint32_t llama_hparams::n_embd_v_gqa_max() const {
+    uint32_t val = n_embd_v_gqa();
+    for (uint32_t il = 0; il < n_layer; ++il) {
+        val = std::max(val, n_embd_v_gqa(il));
+    }
+
+    return val;
+}
+
 uint32_t llama_hparams::n_embd_r() const {
     if (wkv_head_size != 0) {
         // for RWKV models
         return token_shift_count * n_embd;
     }
 
+    if (n_shortconv_l_cache != 0) {
+        // for LFM2 models
+        return n_embd * (n_shortconv_l_cache - 1);
+    }
+
     // TODO: maybe support other convolution strides than 1
     // NOTE: since the first column of the conv_state is shifted out each time, it's not actually needed
     // Corresponds to Mamba's conv_states size
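The new llama_hparams helpers above let callers detect models whose per-layer K/V projection widths differ and size shared buffers for the largest layer. A throwaway sketch of the same two checks over a made-up per-layer table (the widths below are invented, not from any real model):

// Illustrative sketch only: what "variable" and "max" mean for per-layer K widths.
#include <algorithm>
#include <cstdint>
#include <vector>

int main() {
    const std::vector<uint32_t> n_embd_k_gqa = {1024, 1024, 512, 1024}; // hypothetical per-layer widths

    bool     variable = false;
    uint32_t max_val  = n_embd_k_gqa[0];
    for (const uint32_t v : n_embd_k_gqa) {
        variable = variable || (v != n_embd_k_gqa[0]); // analogue of is_n_embd_k_gqa_variable()
        max_val  = std::max(max_val, v);               // analogue of n_embd_k_gqa_max()
    }

    return (variable && max_val == 1024) ? 0 : 1; // 0: mixed widths, buffers sized for the max
}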
--- package/src/llama.cpp/src/llama-hparams.h (1.0.3)
+++ package/src/llama.cpp/src/llama-hparams.h (1.0.4)
@@ -6,7 +6,7 @@
 
 // bump if necessary
 #define LLAMA_MAX_LAYERS 512
-#define LLAMA_MAX_EXPERTS 256 // DeepSeekV3
+#define LLAMA_MAX_EXPERTS 384 // Kimi-K2
 
 enum llama_expert_gating_func_type {
     LLAMA_EXPERT_GATING_FUNC_TYPE_NONE = 0,
@@ -55,6 +55,8 @@ struct llama_hparams {
     struct llama_hparams_posnet posnet;
     struct llama_hparams_convnext convnext;
 
+    uint32_t n_shortconv_l_cache = 0;
+
     std::array<uint32_t, LLAMA_MAX_LAYERS> n_head_arr;
     std::array<uint32_t, LLAMA_MAX_LAYERS> n_head_kv_arr;
    std::array<uint32_t, LLAMA_MAX_LAYERS> n_ff_arr;
@@ -189,6 +191,14 @@ struct llama_hparams {
     // dimension of value embeddings across all k-v heads
     uint32_t n_embd_v_gqa(uint32_t il = 0) const;
 
+    // true if any layer has a different n_embd_k_gqa/n_embd_v_gqa
+    bool is_n_embd_k_gqa_variable() const;
+    bool is_n_embd_v_gqa_variable() const;
+
+    // return the maximum n_embd_k_gqa/n_embd_v_gqa across all layers
+    uint32_t n_embd_k_gqa_max() const;
+    uint32_t n_embd_v_gqa_max() const;
+
     // dimension of the rolling state embeddings
     // corresponds to Mamba's conv_states size or RWKV's token_shift states size
     uint32_t n_embd_r() const;
--- package/src/llama.cpp/src/llama-kv-cache-unified-iswa.cpp (1.0.3)
+++ package/src/llama.cpp/src/llama-kv-cache-unified-iswa.cpp (1.0.4)
@@ -18,16 +18,17 @@ llama_kv_cache_unified_iswa::llama_kv_cache_unified_iswa(
         bool v_trans,
         bool offload,
         bool swa_full,
+        bool unified,
         uint32_t kv_size,
         uint32_t n_seq_max,
         uint32_t n_ubatch,
-        uint32_t n_pad) : hparams(model.hparams) {
+        uint32_t n_pad) : hparams(model.hparams), unified(unified) {
     llama_kv_cache_unified::layer_filter_cb filter_base = [&](int32_t il) { return !model.hparams.is_swa(il); };
     llama_kv_cache_unified::layer_filter_cb filter_swa = [&](int32_t il) { return model.hparams.is_swa(il); };
 
     const uint32_t size_base = kv_size;
 
-    uint32_t size_swa = std::min(size_base, GGML_PAD(hparams.n_swa*n_seq_max + n_ubatch, n_pad));
+    uint32_t size_swa = std::min(size_base, GGML_PAD(hparams.n_swa*(unified ? n_seq_max : 1) + n_ubatch, n_pad));
 
     // when using full-size SWA cache, we set the SWA cache size to be equal to the base cache size
     if (swa_full) {
@@ -41,14 +42,14 @@ llama_kv_cache_unified_iswa::llama_kv_cache_unified_iswa(
 
     kv_base = std::make_unique<llama_kv_cache_unified>(
             model, std::move(filter_base), type_k, type_v,
-            v_trans, offload, size_base, n_seq_max, n_pad,
+            v_trans, offload, unified, size_base, n_seq_max, n_pad,
             0, LLAMA_SWA_TYPE_NONE);
 
     LLAMA_LOG_INFO("%s: creating SWA KV cache, size = %u cells\n", __func__, size_swa);
 
     kv_swa = std::make_unique<llama_kv_cache_unified>(
             model, std::move(filter_swa), type_k, type_v,
-            v_trans, offload, size_swa, n_seq_max, n_pad,
+            v_trans, offload, unified, size_swa, n_seq_max, n_pad,
             hparams.n_swa, hparams.swa_type);
 }
 
@@ -100,6 +101,11 @@ llama_memory_context_ptr llama_kv_cache_unified_iswa::init_batch(llama_batch_allocr & balloc, uint32_t n_ubatch, bool embd_all) {
 
     // first try simple split
     do {
+        if (!unified) {
+            // requires equal splits, so we skip the simple split
+            break;
+        }
+
         balloc.split_reset();
 
         std::vector<llama_ubatch> ubatches;
@@ -140,7 +146,7 @@ llama_memory_context_ptr llama_kv_cache_unified_iswa::init_batch(llama_batch_allocr & balloc, uint32_t n_ubatch, bool embd_all) {
 
         std::vector<llama_ubatch> ubatches;
         while (true) {
-            auto ubatch = balloc.split_equal(n_ubatch, false);
+            auto ubatch = balloc.split_equal(n_ubatch, !unified);
 
             if (ubatch.n_tokens == 0) {
                 break;
--- package/src/llama.cpp/src/llama-kv-cache-unified-iswa.h (1.0.3)
+++ package/src/llama.cpp/src/llama-kv-cache-unified-iswa.h (1.0.4)
@@ -20,6 +20,7 @@ public:
             bool v_trans,
             bool offload,
             bool swa_full,
+            bool unified,
             uint32_t kv_size,
             uint32_t n_seq_max,
             uint32_t n_ubatch,
@@ -68,6 +69,8 @@ public:
 private:
     const llama_hparams & hparams;
 
+    const bool unified;
+
     std::unique_ptr<llama_kv_cache_unified> kv_base;
     std::unique_ptr<llama_kv_cache_unified> kv_swa;
 };
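For scale, the new unified flag in llama-kv-cache-unified-iswa above changes how the SWA cache is sized: the size_swa expression suggests a unified cache holds the sliding windows of all sequences in one buffer, while the non-unified layout only sizes for a single sequence per stream. A rough worked example with assumed values (n_swa = 4096, n_seq_max = 8, n_ubatch = 512, n_pad = 256), ignoring the std::min() cap against the base cache size:

// Rough arithmetic only, with assumed values; mirrors the size_swa expression shown above.
#include <cstdint>
#include <cstdio>

// rounds x up to a multiple of n (the effect of GGML_PAD for these values)
static uint32_t pad_up(uint32_t x, uint32_t n) { return ((x + n - 1) / n) * n; }

int main() {
    const uint32_t n_swa = 4096, n_seq_max = 8, n_ubatch = 512, n_pad = 256; // assumed values

    const uint32_t swa_unified    = pad_up(n_swa * n_seq_max + n_ubatch, n_pad); // 33280 cells, shared
    const uint32_t swa_per_stream = pad_up(n_swa * 1         + n_ubatch, n_pad); //  4608 cells per stream

    std::printf("unified: %u cells, per-stream: %u cells\n", swa_unified, swa_per_stream);
    return 0;
}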