@fugood/llama.node 1.0.2 → 1.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +14 -14
- package/src/llama.cpp/CMakeLists.txt +0 -1
- package/src/llama.cpp/common/CMakeLists.txt +4 -5
- package/src/llama.cpp/common/arg.cpp +44 -0
- package/src/llama.cpp/common/common.cpp +22 -6
- package/src/llama.cpp/common/common.h +15 -1
- package/src/llama.cpp/ggml/CMakeLists.txt +10 -2
- package/src/llama.cpp/ggml/include/ggml-webgpu.h +19 -0
- package/src/llama.cpp/ggml/include/ggml.h +104 -10
- package/src/llama.cpp/ggml/src/CMakeLists.txt +1 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +6 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +12 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +343 -1094
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +749 -163
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +5 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +1 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +12 -9
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +88 -9
- package/src/llama.cpp/include/llama.h +13 -47
- package/src/llama.cpp/src/llama-arch.cpp +298 -3
- package/src/llama.cpp/src/llama-arch.h +22 -1
- package/src/llama.cpp/src/llama-batch.cpp +103 -71
- package/src/llama.cpp/src/llama-batch.h +31 -18
- package/src/llama.cpp/src/llama-chat.cpp +59 -1
- package/src/llama.cpp/src/llama-chat.h +3 -0
- package/src/llama.cpp/src/llama-context.cpp +134 -95
- package/src/llama.cpp/src/llama-context.h +13 -16
- package/src/llama.cpp/src/llama-cparams.h +3 -2
- package/src/llama.cpp/src/llama-graph.cpp +279 -180
- package/src/llama.cpp/src/llama-graph.h +183 -122
- package/src/llama.cpp/src/llama-hparams.cpp +47 -1
- package/src/llama.cpp/src/llama-hparams.h +12 -1
- package/src/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +38 -22
- package/src/llama.cpp/src/llama-kv-cache-unified-iswa.h +7 -2
- package/src/llama.cpp/src/llama-kv-cache-unified.cpp +849 -304
- package/src/llama.cpp/src/llama-kv-cache-unified.h +143 -47
- package/src/llama.cpp/src/llama-kv-cells.h +62 -10
- package/src/llama.cpp/src/llama-memory-hybrid.cpp +10 -4
- package/src/llama.cpp/src/llama-memory-hybrid.h +3 -1
- package/src/llama.cpp/src/llama-memory-recurrent.cpp +21 -11
- package/src/llama.cpp/src/llama-memory.cpp +17 -0
- package/src/llama.cpp/src/llama-memory.h +3 -0
- package/src/llama.cpp/src/llama-model.cpp +3373 -743
- package/src/llama.cpp/src/llama-model.h +20 -4
- package/src/llama.cpp/src/llama-quant.cpp +2 -2
- package/src/llama.cpp/src/llama-vocab.cpp +376 -10
- package/src/llama.cpp/src/llama-vocab.h +43 -0
- package/src/llama.cpp/src/unicode.cpp +207 -0
- package/src/llama.cpp/src/unicode.h +2 -0
- package/src/llama.cpp/ggml/include/ggml-kompute.h +0 -50
package/src/llama.cpp/src/llama-graph.h

@@ -1,6 +1,7 @@
 #pragma once
 
 #include "llama-arch.h"
+#include "llama-batch.h"
 #include "llama-hparams.h"
 #include "llama-adapter.h"
 
@@ -14,7 +15,6 @@ struct ggml_cgraph;
 struct ggml_context;
 struct ggml_tensor;
 
-struct llama_ubatch;
 struct llama_cparams;
 
 struct llama_memory_context_i;
@@ -69,6 +69,8 @@ struct llama_cross {
     std::vector<std::set<llama_seq_id>> seq_ids_enc;
 };
 
+struct llm_graph_params;
+
 //
 // llm_graph_input
 //
@@ -78,11 +80,19 @@ public:
     virtual ~llm_graph_input_i() = default;
 
     virtual void set_input(const llama_ubatch * ubatch) = 0;
+
+    // return true if the resulting input tensors using the provided graph parameters would be
+    // the same as the previous input tensors that we have currently stored in the object
+    virtual bool can_reuse(const llm_graph_params & params) {
+        // returning false here by default will prevent from reusing the graph if the check
+        // for the input type has not been implemented yet
+        GGML_UNUSED(params);
+        return false;
+    }
 };
 
 using llm_graph_input_ptr = std::unique_ptr<llm_graph_input_i>;
 
-
 class llm_graph_input_embd : public llm_graph_input_i {
 public:
     llm_graph_input_embd() = default;
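The new virtual `can_reuse` hook lets each graph input type report whether the tensors it created for the previous graph are still valid under a new set of graph parameters. The diff only shows the declarations; as a rough illustration, a hypothetical input type (not part of this diff, the real implementations live in llama-graph.cpp) could implement the check by comparing the stored tensor shape against the incoming ubatch:

```cpp
// Illustrative sketch only; llm_graph_input_example is a made-up type.
#include "llama-graph.h"

class llm_graph_input_example : public llm_graph_input_i {
public:
    void set_input(const llama_ubatch * ubatch) override {
        (void) ubatch; // nothing to upload in this toy example
    }

    bool can_reuse(const llm_graph_params & params) override {
        // the previously built tensor can only be reused if the new ubatch
        // has the same number of tokens as the one the tensor was sized for
        return pos != nullptr && pos->ne[0] == params.ubatch.n_tokens;
    }

    ggml_tensor * pos = nullptr; // I32 [n_batch]
};
```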
@@ -90,6 +100,8 @@ public:
 
     void set_input(const llama_ubatch * ubatch) override;
 
+    bool can_reuse(const llm_graph_params & params) override;
+
     ggml_tensor * tokens = nullptr; // I32 [n_batch]
     ggml_tensor * embd = nullptr; // F32 [n_embd, n_batch]
 };
@@ -101,6 +113,8 @@ public:
 
     void set_input(const llama_ubatch * ubatch) override;
 
+    bool can_reuse(const llm_graph_params & params) override;
+
     ggml_tensor * pos = nullptr; // I32 [n_batch]
 
     const uint32_t n_pos_per_embd = 1;
@@ -154,17 +168,19 @@ public:
     llm_graph_input_out_ids(
             const llama_hparams & hparams,
             const llama_cparams & cparams,
-
+            uint32_t n_outputs) : hparams(hparams), cparams(cparams), n_outputs(n_outputs) {}
     virtual ~llm_graph_input_out_ids() = default;
 
     void set_input(const llama_ubatch * ubatch) override;
 
+    bool can_reuse(const llm_graph_params & params) override;
+
     ggml_tensor * out_ids; // I32 [n_outputs]
 
     const llama_hparams & hparams;
     const llama_cparams & cparams;
 
-    const
+    const uint32_t n_outputs;
 };
 
 class llm_graph_input_mean : public llm_graph_input_i {
@@ -228,8 +244,8 @@ public:
 
     ggml_tensor * get_kq_mask() const { return kq_mask_cnv; }
 
-    ggml_tensor * kq_mask = nullptr; // F32 [n_tokens, n_batch]
-    ggml_tensor * kq_mask_cnv = nullptr; // [n_tokens, n_batch]
+    ggml_tensor * kq_mask = nullptr; // F32 [n_tokens, n_batch, 1, 1]
+    ggml_tensor * kq_mask_cnv = nullptr; // [n_tokens, n_batch, 1, 1]
 
     const llama_hparams & hparams;
     const llama_cparams & cparams;
@@ -249,10 +265,18 @@ public:
 
     void set_input(const llama_ubatch * ubatch) override;
 
+    bool can_reuse(const llm_graph_params & params) override;
+
+    ggml_tensor * get_k_idxs() const { return self_k_idxs; }
+    ggml_tensor * get_v_idxs() const { return self_v_idxs; }
+
     ggml_tensor * get_kq_mask() const { return self_kq_mask_cnv; }
 
-    ggml_tensor *
-    ggml_tensor *
+    ggml_tensor * self_k_idxs = nullptr; // I64 [n_batch]
+    ggml_tensor * self_v_idxs = nullptr; // I64 [n_batch] or [n_batch*n_embd_v_gqa]
+
+    ggml_tensor * self_kq_mask = nullptr; // F32 [n_kv, n_batch/n_stream, 1, n_stream]
+    ggml_tensor * self_kq_mask_cnv = nullptr; // [n_kv, n_batch/n_stream, 1, n_stream]
 
     const llama_hparams & hparams;
     const llama_cparams & cparams;
@@ -274,13 +298,25 @@ public:
 
     void set_input(const llama_ubatch * ubatch) override;
 
+    bool can_reuse(const llm_graph_params & params) override;
+
+    ggml_tensor * get_k_idxs() const { return self_k_idxs; }
+    ggml_tensor * get_v_idxs() const { return self_v_idxs; }
+    ggml_tensor * get_k_idxs_swa() const { return self_k_idxs_swa; }
+    ggml_tensor * get_v_idxs_swa() const { return self_v_idxs_swa; }
+
     ggml_tensor * get_kq_mask() const { return self_kq_mask_cnv; }
     ggml_tensor * get_kq_mask_swa() const { return self_kq_mask_swa_cnv; }
 
-    ggml_tensor *
-    ggml_tensor *
-    ggml_tensor *
-    ggml_tensor *
+    ggml_tensor * self_k_idxs = nullptr; // I64 [n_batch]
+    ggml_tensor * self_v_idxs = nullptr; // I64 [n_batch] or [n_batch*n_embd_v_gqa]
+    ggml_tensor * self_k_idxs_swa = nullptr; // I64 [n_batch]
+    ggml_tensor * self_v_idxs_swa = nullptr; // I64 [n_batch] or [n_batch*n_embd_v_gqa]
+
+    ggml_tensor * self_kq_mask = nullptr; // F32 [n_kv, n_batch/n_stream, 1, n_stream]
+    ggml_tensor * self_kq_mask_cnv = nullptr; // [n_kv, n_batch/n_stream, 1, n_stream]
+    ggml_tensor * self_kq_mask_swa = nullptr; // F32 [n_kv, n_batch/n_stream, 1, n_stream]
+    ggml_tensor * self_kq_mask_swa_cnv = nullptr; // [n_kv, n_batch/n_stream, 1, n_stream]
 
     const llama_hparams & hparams;
     const llama_cparams & cparams;
@@ -297,8 +333,8 @@ public:
 
     ggml_tensor * get_kq_mask_cross() const { return cross_kq_mask_cnv; }
 
-    ggml_tensor * cross_kq_mask = nullptr; // F32 [n_outputs_enc, n_batch]
-    ggml_tensor * cross_kq_mask_cnv = nullptr; // F32 [n_outputs_enc, n_batch]
+    ggml_tensor * cross_kq_mask = nullptr; // F32 [n_outputs_enc, n_batch, 1, 1]
+    ggml_tensor * cross_kq_mask_cnv = nullptr; // F32 [n_outputs_enc, n_batch, 1, 1]
 
     const llama_cross * cross = nullptr;
 };
@@ -306,41 +342,25 @@ public:
 class llm_graph_input_mem_hybrid : public llm_graph_input_i {
 public:
     llm_graph_input_mem_hybrid(
-
-
-            const llama_memory_hybrid_context *
-
-
-        mctx(mctx) {
-    }
+            std::unique_ptr<llm_graph_input_attn_kv_unified> inp_attn,
+            std::unique_ptr<llm_graph_input_rs> inp_rs,
+            const llama_memory_hybrid_context * mctx) :
+        inp_attn(std::move(inp_attn)),
+        inp_rs(std::move(inp_rs)),
+        mctx(mctx) { }
     virtual ~llm_graph_input_mem_hybrid() = default;
 
     void set_input(const llama_ubatch * ubatch) override;
 
-
-
-    ggml_tensor * get_kq_mask() const { return self_kq_mask_cnv; }
-
-    ggml_tensor * self_kq_mask = nullptr; // F32 [n_kv, n_batch]
-    ggml_tensor * self_kq_mask_cnv = nullptr; // [n_kv, n_batch]
+    std::unique_ptr<llm_graph_input_attn_kv_unified> inp_attn;
+    std::unique_ptr<llm_graph_input_rs> inp_rs;
 
-    const
-    const
+    llm_graph_input_attn_kv_unified * get_attn() const { return inp_attn.get(); }
+    llm_graph_input_rs * get_recr() const { return inp_rs.get(); }
 
     const llama_memory_hybrid_context * mctx;
 };
 
-// TODO: remove this when ggml_scale_add is implemented
-class llm_graph_input_one : public llm_graph_input_i {
-public:
-    llm_graph_input_one() {}
-    virtual ~llm_graph_input_one() = default;
-
-    void set_input(const llama_ubatch *) override;
-
-    ggml_tensor * one = nullptr; // F32
-};
-
 //
 // llm_graph_result
 //
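After this refactor `llm_graph_input_mem_hybrid` no longer carries its own KQ mask; it simply owns an attention input and a recurrent-state input and exposes them through `get_attn()`/`get_recr()`. A hedged sketch of how model-building code might consume the composed input (the helper below is hypothetical; the real call sites are in llama-model.cpp, which is not shown here):

```cpp
#include "llama-graph.h"

// Hypothetical helper: gctx is assumed to be a live llm_graph_context inside a
// model build function for a hybrid (attention + recurrent) architecture.
static void build_hybrid_layers(const llm_graph_context & gctx) {
    llm_graph_input_mem_hybrid * inp = gctx.build_inp_mem_hybrid();

    // attention layers read/write the unified KV cache through this part ...
    llm_graph_input_attn_kv_unified * inp_attn = inp->get_attn();

    // ... while recurrent layers use the recurrent-state part
    llm_graph_input_rs * inp_rs = inp->get_recr();

    (void) inp_attn;
    (void) inp_rs;
}
```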
@@ -351,40 +371,108 @@ public:
 // along with the input tensors, the object also provides commonly used outputs tensors, such as logits, embeddings, etc.
 // these are used by the llama_context to extact the relevant data, based on the compute parameters
 
-
-
-    virtual ~llm_graph_result_i() = default;
+// callback that allows us to apply custom logic to each tensor (e.g. ggml-alloc, offloading, etc.)
+using llm_graph_cb = std::function<void(const llama_ubatch & ubatch, ggml_tensor * cur, const char * name, int il)>;
 
-
-    virtual ggml_tensor * get_logits() = 0;
-    virtual ggml_tensor * get_embd() = 0;
-    virtual ggml_tensor * get_embd_pooled() = 0;
+class llm_graph_result;
 
-
-
+struct llm_graph_params {
+    llm_arch arch = LLM_ARCH_UNKNOWN;
 
-
+    llama_hparams hparams;
+    llama_cparams cparams;
 
+    llama_ubatch ubatch; // note: intentionally make a copy
 
-
-
-
+    llm_graph_type gtype;
+
+    ggml_backend_sched_t sched;
+    ggml_backend_t backend_cpu;
+
+    const llama_adapter_cvec * cvec;
+    const llama_adapter_loras * loras;
+    const llama_memory_context_i * mctx;
+    const llama_cross * cross;
 
-
-    ggml_tensor * get_logits() override { return t_logits; }
-    ggml_tensor * get_embd() override { return t_embd; }
-    ggml_tensor * get_embd_pooled() override { return t_embd_pooled; }
+    uint32_t n_outputs;
 
-
-
-
+    llm_graph_cb cb;
+
+    llm_graph_result * res;
+
+    // return true if the "other" params would result in a graph with the same topology as with the current params
+    // having the same topology allows us to reuse the graph in some cases
+    bool allow_reuse(const llm_graph_params & other) const {
+        // first check the ubatch
+        bool can_reuse_ubatch =
+            ubatch.equal_seqs() == other.ubatch.equal_seqs() &&
+            ubatch.n_tokens == other.ubatch.n_tokens &&
+            ubatch.n_seq_tokens == other.ubatch.n_seq_tokens &&
+            ubatch.n_seqs == other.ubatch.n_seqs &&
+            ubatch.n_seqs_unq == other.ubatch.n_seqs_unq &&
+            (
+                (!ubatch.token && !other.ubatch.token) ||
+                (!ubatch.embd && !other.ubatch.embd)
+            );
+
+        if (can_reuse_ubatch && !ubatch.equal_seqs()) {
+            if (!ubatch.data) {
+                // if the old ubatch does not own it's data, then we cannot guarantee that it is still alive, and
+                // therefore we cannot perform the sequence id check. normally should never happen
+                can_reuse_ubatch = false;
+            } else {
+                for (uint32_t s = 0; s < ubatch.n_seqs_unq; ++s) {
+                    can_reuse_ubatch &= ubatch.seq_id_unq[s] == other.ubatch.seq_id_unq[s];
+                }
+            }
+        }
+
+        if (!can_reuse_ubatch) {
+            return false;
         }
-    }
 
-
-
-
+        return
+            cparams.embeddings == other.cparams.embeddings &&
+            cparams.causal_attn == other.cparams.causal_attn &&
+            arch == other.arch &&
+            gtype == other.gtype &&
+            cvec == other.cvec &&
+            loras == other.loras &&
+            cross == other.cross &&
+            n_outputs == other.n_outputs;
     }
+};
+
+class llm_graph_result {
+public:
+    llm_graph_result(int64_t max_nodes);
+
+    virtual ~llm_graph_result() = default;
+
+    ggml_tensor * get_tokens() const { return t_tokens; }
+    ggml_tensor * get_logits() const { return t_logits; }
+    ggml_tensor * get_embd() const { return t_embd; }
+    ggml_tensor * get_embd_pooled() const { return t_embd_pooled; }
+
+    ggml_cgraph * get_gf() const { return gf; }
+    ggml_context * get_ctx() const { return ctx_compute.get(); }
+
+    int64_t get_max_nodes() const;
+
+    void reset();
+
+    void set_inputs(const llama_ubatch * ubatch);
+
+    // try to update the existing graph result using the new graph parameters in order to reuse it
+    // this can only be done if we determine that the resulting graph using the new graph parameters
+    // would be identical to the existing graph. in that case, we simply have to update the memory
+    // contexts of the input tensors of the graph and we can reuse it for another computation
+    // return true if the graph was updated and can be reused
+    bool can_reuse(const llm_graph_params & params);
+
+    llm_graph_input_i * add_input(llm_graph_input_ptr input);
+
+    void set_params(const llm_graph_params & params);
 
     // important graph nodes
     ggml_tensor * t_tokens = nullptr;
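Together, `llm_graph_params::allow_reuse()` and `llm_graph_result::can_reuse()` implement the graph-reuse path: the result keeps the parameters it was built with, and a new computation first asks whether updating the stored inputs is enough before rebuilding. A hedged sketch of the calling pattern (the driver function below is hypothetical; the real logic sits in llama-context.cpp):

```cpp
#include "llama-graph.h"

// Hypothetical sketch: res holds the graph built for the previous decode call.
// Returns the cgraph to evaluate, or nullptr if the caller must rebuild it.
static ggml_cgraph * try_reuse_graph(llm_graph_result * res, const llm_graph_params & params) {
    // can_reuse() compares the stored params (see allow_reuse) and then asks every
    // registered input whether its tensors are still valid for the new ubatch
    if (res->can_reuse(params)) {
        return res->get_gf();
    }

    // topology changed (different token count, memory layout, adapters, ...):
    // drop the old nodes; the caller builds a fresh graph with set_params() etc.
    res->reset();
    return nullptr;
}
```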
@@ -393,36 +481,34 @@ public:
     ggml_tensor * t_embd_pooled = nullptr;
 
     std::vector<llm_graph_input_ptr> inputs;
-};
 
-
-// llm_graph_context
-//
+    ggml_context_ptr ctx_compute;
 
-//
-
+    // memory buffers used to evaluate the model
+    std::vector<uint8_t> buf_compute_meta;
 
-
-    ggml_context * ctx;
+    ggml_cgraph * gf;
 
-
+    int64_t max_nodes;
 
-
-
-
+private:
+    // keep a copy of the previous graph parameters
+    // we will use this to determine whether the graph can be reused by comparing them with the new parameters
+    // note: these are updated after constructing the new graph
+    llm_graph_params params;
 
-
-
+    // env: LLAMA_GRAPH_RESULT_DEBUG
+    int debug = 0;
+};
 
-
-    const llama_adapter_loras * loras;
-    const llama_memory_context_i * mctx;
-    const llama_cross * cross;
+using llm_graph_result_ptr = std::unique_ptr<llm_graph_result>;
 
-
+//
+// llm_graph_context
+//
 
-
-
+// used in build_rs to properly order writes and avoid unnecessary copies
+using llm_graph_get_rows_fn = std::function<ggml_tensor * (ggml_context *, ggml_tensor * states, ggml_tensor * ids)>;
 
 struct llm_graph_context {
     const llm_arch arch;
@@ -460,8 +546,6 @@ struct llm_graph_context {
     const enum llama_pooling_type pooling_type;
     const enum llama_rope_type rope_type;
 
-    ggml_context * ctx0 = nullptr;
-
     ggml_backend_sched_t sched;
 
     ggml_backend_t backend_cpu; // TODO: needed by build_attn_mha, figure out a way to remove?
@@ -473,7 +557,10 @@ struct llm_graph_context {
 
     const llm_graph_cb & cb_func;
 
-
+    llm_graph_result * res;
+
+    ggml_context * ctx0 = nullptr;
+    ggml_cgraph * gf = nullptr;
 
     llm_graph_context(const llm_graph_params & params);
     virtual ~llm_graph_context() = default;
@@ -554,14 +641,11 @@ struct llm_graph_context {
     ggml_tensor * build_inp_pos_bucket_dec() const;
     ggml_tensor * build_pos_bias(ggml_tensor * pos_bucket, ggml_tensor * attn_rel_b) const;
 
-    llm_graph_input_mem_hybrid * build_inp_mem_hybrid() const;
-
     //
     // attention
     //
 
     ggml_tensor * build_attn_mha(
-            ggml_cgraph * gf,
             ggml_tensor * q, // [n_embd_head_q, n_head_q, n_tokens]
             ggml_tensor * k, // [n_embd_head_k, n_head_k, n_tokens]
             ggml_tensor * v, // [n_embd_head_v, n_head_v, n_tokens] (v_trans == false)
@@ -574,7 +658,6 @@ struct llm_graph_context {
 
     ggml_tensor * build_attn(
             llm_graph_input_attn_no_cache * inp,
-            ggml_cgraph * gf,
             ggml_tensor * wo,
             ggml_tensor * wo_b,
             ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens]
@@ -589,7 +672,6 @@ struct llm_graph_context {
 
     ggml_tensor * build_attn(
             llm_graph_input_attn_kv_unified * inp,
-            ggml_cgraph * gf,
             ggml_tensor * wo,
             ggml_tensor * wo_b,
             ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens]
@@ -605,7 +687,6 @@ struct llm_graph_context {
     // note: if k_cur or v_cur are not provided, they will not be stored in the memory
     ggml_tensor * build_attn(
             llm_graph_input_attn_kv_unified_iswa * inp,
-            ggml_cgraph * gf,
             ggml_tensor * wo,
             ggml_tensor * wo_b,
             ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens]
@@ -620,7 +701,6 @@ struct llm_graph_context {
 
     ggml_tensor * build_attn(
             llm_graph_input_attn_cross * inp,
-            ggml_cgraph * gf,
             ggml_tensor * wo,
             ggml_tensor * wo_b,
             ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens]
@@ -631,18 +711,6 @@ struct llm_graph_context {
             float kq_scale,
             int il) const;
 
-    ggml_tensor * build_attn(
-            llm_graph_input_mem_hybrid * inp,
-            ggml_cgraph * gf,
-            ggml_tensor * wo,
-            ggml_tensor * wo_b,
-            ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens]
-            ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens]
-            ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens]
-            ggml_tensor * kq_b,
-            ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
-            float kq_scale,
-            int il) const;
     //
     // recurrent
     //
@@ -654,7 +722,6 @@ struct llm_graph_context {
     // implementation in 2 separate methods. the goal is to avoid calling `ggml_build_forward_expand` in
     // `llama_memory_recurrent`
     ggml_tensor * build_rs(
-            ggml_cgraph * gf,
             ggml_tensor * s,
             ggml_tensor * state_copy,
             int32_t state_size,
@@ -663,43 +730,37 @@ struct llm_graph_context {
             uint32_t kv_head,
             uint32_t kv_size,
             int32_t rs_zero,
-
+            const llm_graph_get_rows_fn & get_state_rows = ggml_get_rows) const;
 
     llm_graph_input_rs * build_rs_inp() const;
 
     ggml_tensor * build_rs(
             llm_graph_input_rs * inp,
-            ggml_cgraph * gf,
             ggml_tensor * s,
             int32_t state_size,
             int32_t n_seqs,
-
-
-    ggml_tensor * build_rs(
-            llm_graph_input_mem_hybrid * inp,
-            ggml_cgraph * gf,
-            ggml_tensor * s,
-            int32_t state_size,
-            int32_t n_seqs,
-            bool avoid_copies = false) const;
+            const llm_graph_get_rows_fn & get_state_rows = ggml_get_rows) const;
 
     ggml_tensor * build_rwkv_token_shift_load(
             llm_graph_input_rs * inp,
-            ggml_cgraph * gf,
             const llama_ubatch & ubatch,
-
+            int il) const;
 
     ggml_tensor * build_rwkv_token_shift_store(
             ggml_tensor * token_shift,
             const llama_ubatch & ubatch,
             int il) const;
+    //
+    // hybrid
+    //
+
+    llm_graph_input_mem_hybrid * build_inp_mem_hybrid() const;
 
     //
     // pooling
     //
 
     void build_pooling(
-            ggml_cgraph * gf,
             ggml_tensor * cls,
             ggml_tensor * cls_b,
             ggml_tensor * cls_out,
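`build_rs()` now takes an `llm_graph_get_rows_fn` callback (defaulting to `ggml_get_rows`) so callers can control how the recurrent states are gathered. A hedged illustration of a custom callback matching that signature; the wrapper itself is made up for the example, only `ggml_get_rows` and `ggml_set_name` are real ggml calls:

```cpp
#include "llama-graph.h"

// Example callback with the llm_graph_get_rows_fn signature: gather the selected
// recurrent states as usual, but tag the node so it is easy to spot when dumping
// the graph. Purely illustrative.
static ggml_tensor * get_rows_tagged(ggml_context * ctx, ggml_tensor * states, ggml_tensor * ids) {
    ggml_tensor * rows = ggml_get_rows(ctx, states, ids);
    ggml_set_name(rows, "rs_state_rows");
    return rows;
}

// usage inside a model build function (sketch):
//   ggml_tensor * cur = build_rs(inp_rs, s, state_size, n_seqs, get_rows_tagged);
```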
package/src/llama.cpp/src/llama-hparams.cpp

@@ -65,15 +65,61 @@ uint32_t llama_hparams::n_embd_v_gqa(uint32_t il) const {
     return n_embd_head_v * n_head_kv;
 }
 
+bool llama_hparams::is_n_embd_k_gqa_variable() const {
+    const uint32_t val = n_embd_k_gqa();
+    for (uint32_t il = 0; il < n_layer; ++il) {
+        if (val != n_embd_k_gqa(il)) {
+            return true;
+        }
+    }
+
+    return false;
+}
+
+bool llama_hparams::is_n_embd_v_gqa_variable() const {
+    const uint32_t val = n_embd_v_gqa();
+    for (uint32_t il = 0; il < n_layer; ++il) {
+        if (val != n_embd_v_gqa(il)) {
+            return true;
+        }
+    }
+
+    return false;
+}
+
+uint32_t llama_hparams::n_embd_k_gqa_max() const {
+    uint32_t val = n_embd_k_gqa();
+    for (uint32_t il = 0; il < n_layer; ++il) {
+        val = std::max(val, n_embd_k_gqa(il));
+    }
+
+    return val;
+}
+
+uint32_t llama_hparams::n_embd_v_gqa_max() const {
+    uint32_t val = n_embd_v_gqa();
+    for (uint32_t il = 0; il < n_layer; ++il) {
+        val = std::max(val, n_embd_v_gqa(il));
+    }
+
+    return val;
+}
+
 uint32_t llama_hparams::n_embd_r() const {
     if (wkv_head_size != 0) {
         // for RWKV models
         return token_shift_count * n_embd;
     }
 
+    if (n_shortconv_l_cache != 0) {
+        // for LFM2 models
+        return n_embd * (n_shortconv_l_cache - 1);
+    }
+
     // TODO: maybe support other convolution strides than 1
     // NOTE: since the first column of the conv_state is shifted out each time, it's not actually needed
-
+    // Corresponds to Mamba's conv_states size
+    return (ssm_d_conv > 0 ? ssm_d_conv - 1 : 0) * (ssm_d_inner + 2*ssm_n_group*ssm_d_state);
 }
 
 uint32_t llama_hparams::n_embd_s() const {
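The new helpers answer two questions the KV-cache code needs for models whose per-layer K/V widths differ: are the per-layer sizes uniform, and what is the largest one. A hedged sketch of how a caller could use them to size a shared buffer (the function and `bytes_per_element` parameter below are hypothetical):

```cpp
#include <cstddef>

#include "llama-hparams.h"

// Hypothetical helper: upper bound, in bytes, for one K row plus one V row of any
// layer, so a single allocation can serve layers with different GQA widths.
static size_t kv_row_bytes_upper_bound(const llama_hparams & hparams, size_t bytes_per_element) {
    // when is_n_embd_k_gqa_variable()/is_n_embd_v_gqa_variable() return false,
    // the max simply equals the common per-layer value
    const uint32_t n_embd_k = hparams.n_embd_k_gqa_max();
    const uint32_t n_embd_v = hparams.n_embd_v_gqa_max();

    return (size_t) (n_embd_k + n_embd_v) * bytes_per_element;
}
```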
package/src/llama.cpp/src/llama-hparams.h

@@ -6,7 +6,7 @@
 
 // bump if necessary
 #define LLAMA_MAX_LAYERS 512
-#define LLAMA_MAX_EXPERTS
+#define LLAMA_MAX_EXPERTS 384 // Kimi-K2
 
 enum llama_expert_gating_func_type {
     LLAMA_EXPERT_GATING_FUNC_TYPE_NONE = 0,
@@ -55,6 +55,8 @@ struct llama_hparams {
     struct llama_hparams_posnet posnet;
     struct llama_hparams_convnext convnext;
 
+    uint32_t n_shortconv_l_cache = 0;
+
     std::array<uint32_t, LLAMA_MAX_LAYERS> n_head_arr;
     std::array<uint32_t, LLAMA_MAX_LAYERS> n_head_kv_arr;
     std::array<uint32_t, LLAMA_MAX_LAYERS> n_ff_arr;
@@ -114,6 +116,7 @@ struct llama_hparams {
     uint32_t ssm_d_inner = 0;
     uint32_t ssm_d_state = 0;
     uint32_t ssm_dt_rank = 0;
+    uint32_t ssm_n_group = 0;
 
     // for hybrid state space models
     std::array<bool, LLAMA_MAX_LAYERS> recurrent_layer_arr;
@@ -188,6 +191,14 @@ struct llama_hparams {
     // dimension of value embeddings across all k-v heads
     uint32_t n_embd_v_gqa(uint32_t il = 0) const;
 
+    // true if any layer has a different n_embd_k_gqa/n_embd_v_gqa
+    bool is_n_embd_k_gqa_variable() const;
+    bool is_n_embd_v_gqa_variable() const;
+
+    // return the maximum n_embd_k_gqa/n_embd_v_gqa across all layers
+    uint32_t n_embd_k_gqa_max() const;
+    uint32_t n_embd_v_gqa_max() const;
+
     // dimension of the rolling state embeddings
     // corresponds to Mamba's conv_states size or RWKV's token_shift states size
     uint32_t n_embd_r() const;