@fugood/llama.node 1.0.2 → 1.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50)
  1. package/package.json +14 -14
  2. package/src/llama.cpp/CMakeLists.txt +0 -1
  3. package/src/llama.cpp/common/CMakeLists.txt +4 -5
  4. package/src/llama.cpp/common/arg.cpp +44 -0
  5. package/src/llama.cpp/common/common.cpp +22 -6
  6. package/src/llama.cpp/common/common.h +15 -1
  7. package/src/llama.cpp/ggml/CMakeLists.txt +10 -2
  8. package/src/llama.cpp/ggml/include/ggml-webgpu.h +19 -0
  9. package/src/llama.cpp/ggml/include/ggml.h +104 -10
  10. package/src/llama.cpp/ggml/src/CMakeLists.txt +1 -1
  11. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +6 -1
  12. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +12 -1
  13. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +343 -1094
  14. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +749 -163
  15. package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +5 -0
  16. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +1 -1
  17. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +12 -9
  18. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +88 -9
  19. package/src/llama.cpp/include/llama.h +13 -47
  20. package/src/llama.cpp/src/llama-arch.cpp +298 -3
  21. package/src/llama.cpp/src/llama-arch.h +22 -1
  22. package/src/llama.cpp/src/llama-batch.cpp +103 -71
  23. package/src/llama.cpp/src/llama-batch.h +31 -18
  24. package/src/llama.cpp/src/llama-chat.cpp +59 -1
  25. package/src/llama.cpp/src/llama-chat.h +3 -0
  26. package/src/llama.cpp/src/llama-context.cpp +134 -95
  27. package/src/llama.cpp/src/llama-context.h +13 -16
  28. package/src/llama.cpp/src/llama-cparams.h +3 -2
  29. package/src/llama.cpp/src/llama-graph.cpp +279 -180
  30. package/src/llama.cpp/src/llama-graph.h +183 -122
  31. package/src/llama.cpp/src/llama-hparams.cpp +47 -1
  32. package/src/llama.cpp/src/llama-hparams.h +12 -1
  33. package/src/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +38 -22
  34. package/src/llama.cpp/src/llama-kv-cache-unified-iswa.h +7 -2
  35. package/src/llama.cpp/src/llama-kv-cache-unified.cpp +849 -304
  36. package/src/llama.cpp/src/llama-kv-cache-unified.h +143 -47
  37. package/src/llama.cpp/src/llama-kv-cells.h +62 -10
  38. package/src/llama.cpp/src/llama-memory-hybrid.cpp +10 -4
  39. package/src/llama.cpp/src/llama-memory-hybrid.h +3 -1
  40. package/src/llama.cpp/src/llama-memory-recurrent.cpp +21 -11
  41. package/src/llama.cpp/src/llama-memory.cpp +17 -0
  42. package/src/llama.cpp/src/llama-memory.h +3 -0
  43. package/src/llama.cpp/src/llama-model.cpp +3373 -743
  44. package/src/llama.cpp/src/llama-model.h +20 -4
  45. package/src/llama.cpp/src/llama-quant.cpp +2 -2
  46. package/src/llama.cpp/src/llama-vocab.cpp +376 -10
  47. package/src/llama.cpp/src/llama-vocab.h +43 -0
  48. package/src/llama.cpp/src/unicode.cpp +207 -0
  49. package/src/llama.cpp/src/unicode.h +2 -0
  50. package/src/llama.cpp/ggml/include/ggml-kompute.h +0 -50
package/src/llama.cpp/src/llama-graph.h

@@ -1,6 +1,7 @@
 #pragma once
 
 #include "llama-arch.h"
+#include "llama-batch.h"
 #include "llama-hparams.h"
 #include "llama-adapter.h"
 
@@ -14,7 +15,6 @@ struct ggml_cgraph;
 struct ggml_context;
 struct ggml_tensor;
 
-struct llama_ubatch;
 struct llama_cparams;
 
 struct llama_memory_context_i;
@@ -69,6 +69,8 @@ struct llama_cross {
     std::vector<std::set<llama_seq_id>> seq_ids_enc;
 };
 
+struct llm_graph_params;
+
 //
 // llm_graph_input
 //
@@ -78,11 +80,19 @@ public:
     virtual ~llm_graph_input_i() = default;
 
     virtual void set_input(const llama_ubatch * ubatch) = 0;
+
+    // return true if the resulting input tensors using the provided graph parameters would be
+    // the same as the previous input tensors that we have currently stored in the object
+    virtual bool can_reuse(const llm_graph_params & params) {
+        // returning false here by default will prevent from reusing the graph if the check
+        // for the input type has not been implemented yet
+        GGML_UNUSED(params);
+        return false;
+    }
 };
 
 using llm_graph_input_ptr = std::unique_ptr<llm_graph_input_i>;
 
-
 class llm_graph_input_embd : public llm_graph_input_i {
 public:
     llm_graph_input_embd() = default;
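
As a rough illustration of the new can_reuse hook (the class and stored field below are hypothetical, not part of this diff), a derived input could allow reuse whenever the new parameters would produce tensors of the same shape:

    // Illustrative sketch only: an input that remembers what its tensors were built for
    // and allows reuse when the new parameters would produce the same shapes.
    class llm_graph_input_example : public llm_graph_input_i {
    public:
        void set_input(const llama_ubatch * ubatch) override {
            n_tokens = ubatch->n_tokens; // remember the token count the stored tensor was built for
        }

        bool can_reuse(const llm_graph_params & params) override {
            // same token count -> same tensor shape -> safe to keep the existing graph input
            return params.ubatch.n_tokens == n_tokens;
        }

    private:
        uint32_t n_tokens = 0;
    };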
@@ -90,6 +100,8 @@ public:
 
     void set_input(const llama_ubatch * ubatch) override;
 
+    bool can_reuse(const llm_graph_params & params) override;
+
     ggml_tensor * tokens = nullptr; // I32 [n_batch]
     ggml_tensor * embd   = nullptr; // F32 [n_embd, n_batch]
 };
@@ -101,6 +113,8 @@ public:
 
     void set_input(const llama_ubatch * ubatch) override;
 
+    bool can_reuse(const llm_graph_params & params) override;
+
     ggml_tensor * pos = nullptr; // I32 [n_batch]
 
     const uint32_t n_pos_per_embd = 1;
@@ -154,17 +168,19 @@ public:
     llm_graph_input_out_ids(
             const llama_hparams & hparams,
             const llama_cparams & cparams,
-            int32_t n_outputs) : hparams(hparams), cparams(cparams), n_outputs(n_outputs) {}
+            uint32_t n_outputs) : hparams(hparams), cparams(cparams), n_outputs(n_outputs) {}
     virtual ~llm_graph_input_out_ids() = default;
 
     void set_input(const llama_ubatch * ubatch) override;
 
+    bool can_reuse(const llm_graph_params & params) override;
+
     ggml_tensor * out_ids; // I32 [n_outputs]
 
     const llama_hparams & hparams;
     const llama_cparams & cparams;
 
-    const int32_t n_outputs;
+    const uint32_t n_outputs;
 };
 
 class llm_graph_input_mean : public llm_graph_input_i {
@@ -228,8 +244,8 @@ public:
 
     ggml_tensor * get_kq_mask() const { return kq_mask_cnv; }
 
-    ggml_tensor * kq_mask     = nullptr; // F32 [n_tokens, n_batch]
-    ggml_tensor * kq_mask_cnv = nullptr; //     [n_tokens, n_batch]
+    ggml_tensor * kq_mask     = nullptr; // F32 [n_tokens, n_batch, 1, 1]
+    ggml_tensor * kq_mask_cnv = nullptr; //     [n_tokens, n_batch, 1, 1]
 
     const llama_hparams & hparams;
     const llama_cparams & cparams;
@@ -249,10 +265,18 @@ public:
 
     void set_input(const llama_ubatch * ubatch) override;
 
+    bool can_reuse(const llm_graph_params & params) override;
+
+    ggml_tensor * get_k_idxs() const { return self_k_idxs; }
+    ggml_tensor * get_v_idxs() const { return self_v_idxs; }
+
     ggml_tensor * get_kq_mask() const { return self_kq_mask_cnv; }
 
-    ggml_tensor * self_kq_mask     = nullptr; // F32 [n_kv, n_batch]
-    ggml_tensor * self_kq_mask_cnv = nullptr; //     [n_kv, n_batch]
+    ggml_tensor * self_k_idxs = nullptr; // I64 [n_batch]
+    ggml_tensor * self_v_idxs = nullptr; // I64 [n_batch] or [n_batch*n_embd_v_gqa]
+
+    ggml_tensor * self_kq_mask     = nullptr; // F32 [n_kv, n_batch/n_stream, 1, n_stream]
+    ggml_tensor * self_kq_mask_cnv = nullptr; //     [n_kv, n_batch/n_stream, 1, n_stream]
 
     const llama_hparams & hparams;
     const llama_cparams & cparams;
@@ -274,13 +298,25 @@ public:
 
     void set_input(const llama_ubatch * ubatch) override;
 
+    bool can_reuse(const llm_graph_params & params) override;
+
+    ggml_tensor * get_k_idxs() const { return self_k_idxs; }
+    ggml_tensor * get_v_idxs() const { return self_v_idxs; }
+    ggml_tensor * get_k_idxs_swa() const { return self_k_idxs_swa; }
+    ggml_tensor * get_v_idxs_swa() const { return self_v_idxs_swa; }
+
     ggml_tensor * get_kq_mask()     const { return self_kq_mask_cnv; }
     ggml_tensor * get_kq_mask_swa() const { return self_kq_mask_swa_cnv; }
 
-    ggml_tensor * self_kq_mask         = nullptr; // F32 [n_kv, n_batch]
-    ggml_tensor * self_kq_mask_cnv     = nullptr; //     [n_kv, n_batch]
-    ggml_tensor * self_kq_mask_swa     = nullptr; // F32 [n_kv, n_batch]
-    ggml_tensor * self_kq_mask_swa_cnv = nullptr; //     [n_kv, n_batch]
+    ggml_tensor * self_k_idxs     = nullptr; // I64 [n_batch]
+    ggml_tensor * self_v_idxs     = nullptr; // I64 [n_batch] or [n_batch*n_embd_v_gqa]
+    ggml_tensor * self_k_idxs_swa = nullptr; // I64 [n_batch]
+    ggml_tensor * self_v_idxs_swa = nullptr; // I64 [n_batch] or [n_batch*n_embd_v_gqa]
+
+    ggml_tensor * self_kq_mask         = nullptr; // F32 [n_kv, n_batch/n_stream, 1, n_stream]
+    ggml_tensor * self_kq_mask_cnv     = nullptr; //     [n_kv, n_batch/n_stream, 1, n_stream]
+    ggml_tensor * self_kq_mask_swa     = nullptr; // F32 [n_kv, n_batch/n_stream, 1, n_stream]
+    ggml_tensor * self_kq_mask_swa_cnv = nullptr; //     [n_kv, n_batch/n_stream, 1, n_stream]
 
     const llama_hparams & hparams;
     const llama_cparams & cparams;
@@ -297,8 +333,8 @@ public:
 
     ggml_tensor * get_kq_mask_cross() const { return cross_kq_mask_cnv; }
 
-    ggml_tensor * cross_kq_mask     = nullptr; // F32 [n_outputs_enc, n_batch]
-    ggml_tensor * cross_kq_mask_cnv = nullptr; // F32 [n_outputs_enc, n_batch]
+    ggml_tensor * cross_kq_mask     = nullptr; // F32 [n_outputs_enc, n_batch, 1, 1]
+    ggml_tensor * cross_kq_mask_cnv = nullptr; // F32 [n_outputs_enc, n_batch, 1, 1]
 
     const llama_cross * cross = nullptr;
 };
@@ -306,41 +342,25 @@ public:
 class llm_graph_input_mem_hybrid : public llm_graph_input_i {
 public:
     llm_graph_input_mem_hybrid(
-            const llama_hparams & hparams,
-            const llama_cparams & cparams,
-            const llama_memory_hybrid_context * mctx) :
-        hparams(hparams),
-        cparams(cparams),
-        mctx(mctx) {
-    }
+            std::unique_ptr<llm_graph_input_attn_kv_unified> inp_attn,
+            std::unique_ptr<llm_graph_input_rs> inp_rs,
+            const llama_memory_hybrid_context * mctx) :
+        inp_attn(std::move(inp_attn)),
+        inp_rs(std::move(inp_rs)),
+        mctx(mctx) { }
     virtual ~llm_graph_input_mem_hybrid() = default;
 
     void set_input(const llama_ubatch * ubatch) override;
 
-    ggml_tensor * s_copy; // I32 [kv_size]
-
-    ggml_tensor * get_kq_mask() const { return self_kq_mask_cnv; }
-
-    ggml_tensor * self_kq_mask     = nullptr; // F32 [n_kv, n_batch]
-    ggml_tensor * self_kq_mask_cnv = nullptr; //     [n_kv, n_batch]
+    std::unique_ptr<llm_graph_input_attn_kv_unified> inp_attn;
+    std::unique_ptr<llm_graph_input_rs> inp_rs;
 
-    const llama_hparams & hparams;
-    const llama_cparams & cparams;
+    llm_graph_input_attn_kv_unified * get_attn() const { return inp_attn.get(); }
+    llm_graph_input_rs * get_recr() const { return inp_rs.get(); }
 
     const llama_memory_hybrid_context * mctx;
 };
 
-// TODO: remove this when ggml_scale_add is implemented
-class llm_graph_input_one : public llm_graph_input_i {
-public:
-    llm_graph_input_one() {}
-    virtual ~llm_graph_input_one() = default;
-
-    void set_input(const llama_ubatch *) override;
-
-    ggml_tensor * one = nullptr; // F32
-};
-
 //
 // llm_graph_result
 //
@@ -351,40 +371,108 @@ public:
 // along with the input tensors, the object also provides commonly used outputs tensors, such as logits, embeddings, etc.
 // these are used by the llama_context to extact the relevant data, based on the compute parameters
 
-class llm_graph_result_i {
-public:
-    virtual ~llm_graph_result_i() = default;
+// callback that allows us to apply custom logic to each tensor (e.g. ggml-alloc, offloading, etc.)
+using llm_graph_cb = std::function<void(const llama_ubatch & ubatch, ggml_tensor * cur, const char * name, int il)>;
 
-    virtual ggml_tensor * get_tokens()      = 0;
-    virtual ggml_tensor * get_logits()      = 0;
-    virtual ggml_tensor * get_embd()        = 0;
-    virtual ggml_tensor * get_embd_pooled() = 0;
+class llm_graph_result;
 
-    virtual void set_inputs(const llama_ubatch * ubatch) = 0;
-};
+struct llm_graph_params {
+    llm_arch arch = LLM_ARCH_UNKNOWN;
 
-using llm_graph_result_ptr = std::unique_ptr<llm_graph_result_i>;
+    llama_hparams hparams;
+    llama_cparams cparams;
 
+    llama_ubatch ubatch; // note: intentionally make a copy
 
-class llm_graph_result : public llm_graph_result_i {
-public:
-    virtual ~llm_graph_result() = default;
+    llm_graph_type gtype;
+
+    ggml_backend_sched_t sched;
+    ggml_backend_t backend_cpu;
+
+    const llama_adapter_cvec     * cvec;
+    const llama_adapter_loras    * loras;
+    const llama_memory_context_i * mctx;
+    const llama_cross            * cross;
 
-    ggml_tensor * get_tokens()      override { return t_tokens; }
-    ggml_tensor * get_logits()      override { return t_logits; }
-    ggml_tensor * get_embd()        override { return t_embd; }
-    ggml_tensor * get_embd_pooled() override { return t_embd_pooled; }
+    uint32_t n_outputs;
 
-    void set_inputs(const llama_ubatch * ubatch) override {
-        for (auto & input : inputs) {
-            input->set_input(ubatch);
+    llm_graph_cb cb;
+
+    llm_graph_result * res;
+
+    // return true if the "other" params would result in a graph with the same topology as with the current params
+    // having the same topology allows us to reuse the graph in some cases
+    bool allow_reuse(const llm_graph_params & other) const {
+        // first check the ubatch
+        bool can_reuse_ubatch =
+            ubatch.equal_seqs()  == other.ubatch.equal_seqs() &&
+            ubatch.n_tokens      == other.ubatch.n_tokens     &&
+            ubatch.n_seq_tokens  == other.ubatch.n_seq_tokens &&
+            ubatch.n_seqs        == other.ubatch.n_seqs       &&
+            ubatch.n_seqs_unq    == other.ubatch.n_seqs_unq   &&
+            (
+                (!ubatch.token && !other.ubatch.token) ||
+                (!ubatch.embd  && !other.ubatch.embd)
+            );
+
+        if (can_reuse_ubatch && !ubatch.equal_seqs()) {
+            if (!ubatch.data) {
+                // if the old ubatch does not own it's data, then we cannot guarantee that it is still alive, and
+                // therefore we cannot perform the sequence id check. normally should never happen
+                can_reuse_ubatch = false;
+            } else {
+                for (uint32_t s = 0; s < ubatch.n_seqs_unq; ++s) {
+                    can_reuse_ubatch &= ubatch.seq_id_unq[s] == other.ubatch.seq_id_unq[s];
+                }
+            }
+        }
+
+        if (!can_reuse_ubatch) {
+            return false;
        }
-    }
 
-    llm_graph_input_i * add_input(llm_graph_input_ptr input) {
-        inputs.emplace_back(std::move(input));
-        return inputs.back().get();
+        return
+            cparams.embeddings  == other.cparams.embeddings  &&
+            cparams.causal_attn == other.cparams.causal_attn &&
+            arch      == other.arch  &&
+            gtype     == other.gtype &&
+            cvec      == other.cvec  &&
+            loras     == other.loras &&
+            cross     == other.cross &&
+            n_outputs == other.n_outputs;
    }
+};
+
+class llm_graph_result {
+public:
+    llm_graph_result(int64_t max_nodes);
+
+    virtual ~llm_graph_result() = default;
+
+    ggml_tensor * get_tokens()      const { return t_tokens; }
+    ggml_tensor * get_logits()      const { return t_logits; }
+    ggml_tensor * get_embd()        const { return t_embd; }
+    ggml_tensor * get_embd_pooled() const { return t_embd_pooled; }
+
+    ggml_cgraph  * get_gf()  const { return gf; }
+    ggml_context * get_ctx() const { return ctx_compute.get(); }
+
+    int64_t get_max_nodes() const;
+
+    void reset();
+
+    void set_inputs(const llama_ubatch * ubatch);
+
+    // try to update the existing graph result using the new graph parameters in order to reuse it
+    // this can only be done if we determine that the resulting graph using the new graph parameters
+    // would be identical to the existing graph. in that case, we simply have to update the memory
+    // contexts of the input tensors of the graph and we can reuse it for another computation
+    // return true if the graph was updated and can be reused
+    bool can_reuse(const llm_graph_params & params);
+
+    llm_graph_input_i * add_input(llm_graph_input_ptr input);
+
+    void set_params(const llm_graph_params & params);
 
     // important graph nodes
     ggml_tensor * t_tokens = nullptr;
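
For orientation, the intended call pattern of the new reuse path can be sketched roughly as follows; the helper name is hypothetical, and only the llm_graph_result / llm_graph_params members shown in this diff are assumed:

    // Illustrative sketch only: before building a graph for the next ubatch, the caller
    // asks the previous result whether it can be reused with the new parameters.
    static bool try_reuse_previous_graph(llm_graph_result * res_prev, const llm_graph_params & params_cur) {
        if (res_prev && res_prev->can_reuse(params_cur)) {
            // topology and inputs match -> skip graph construction and scheduling
            return true;
        }
        // otherwise start over: clear the previous result and build a fresh graph
        if (res_prev) {
            res_prev->reset();
        }
        return false;
    }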
@@ -393,36 +481,34 @@ public:
     ggml_tensor * t_embd_pooled = nullptr;
 
     std::vector<llm_graph_input_ptr> inputs;
-};
 
-//
-// llm_graph_context
-//
+    ggml_context_ptr ctx_compute;
 
-// callback that allows us to apply custom logic to each tensor (e.g. ggml-alloc, offloading, etc.)
-using llm_graph_cb = std::function<void(const llama_ubatch & ubatch, ggml_tensor * cur, const char * name, int il)>;
+    // memory buffers used to evaluate the model
+    std::vector<uint8_t> buf_compute_meta;
 
-struct llm_graph_params {
-    ggml_context * ctx;
+    ggml_cgraph * gf;
 
-    const llm_arch arch;
+    int64_t max_nodes;
 
-    const llama_hparams & hparams;
-    const llama_cparams & cparams;
-    const llama_ubatch  & ubatch;
+private:
+    // keep a copy of the previous graph parameters
+    // we will use this to determine whether the graph can be reused by comparing them with the new parameters
+    // note: these are updated after constructing the new graph
+    llm_graph_params params;
 
-    ggml_backend_sched_t sched;
-    ggml_backend_t backend_cpu;
+    // env: LLAMA_GRAPH_RESULT_DEBUG
+    int debug = 0;
+};
 
-    const llama_adapter_cvec     * cvec;
-    const llama_adapter_loras    * loras;
-    const llama_memory_context_i * mctx;
-    const llama_cross            * cross;
+using llm_graph_result_ptr = std::unique_ptr<llm_graph_result>;
 
-    uint32_t n_outputs;
+//
+// llm_graph_context
+//
 
-    const llm_graph_cb & cb;
-};
+// used in build_rs to properly order writes and avoid unnecessary copies
+using llm_graph_get_rows_fn = std::function<ggml_tensor * (ggml_context *, ggml_tensor * states, ggml_tensor * ids)>;
 
 struct llm_graph_context {
     const llm_arch arch;
@@ -460,8 +546,6 @@ struct llm_graph_context {
     const enum llama_pooling_type pooling_type;
     const enum llama_rope_type    rope_type;
 
-    ggml_context * ctx0 = nullptr;
-
     ggml_backend_sched_t sched;
 
     ggml_backend_t backend_cpu; // TODO: needed by build_attn_mha, figure out a way to remove?
@@ -473,7 +557,10 @@ struct llm_graph_context {
 
     const llm_graph_cb & cb_func;
 
-    std::unique_ptr<llm_graph_result> res;
+    llm_graph_result * res;
+
+    ggml_context * ctx0 = nullptr;
+    ggml_cgraph  * gf   = nullptr;
 
     llm_graph_context(const llm_graph_params & params);
     virtual ~llm_graph_context() = default;
@@ -554,14 +641,11 @@ struct llm_graph_context {
     ggml_tensor * build_inp_pos_bucket_dec() const;
     ggml_tensor * build_pos_bias(ggml_tensor * pos_bucket, ggml_tensor * attn_rel_b) const;
 
-    llm_graph_input_mem_hybrid * build_inp_mem_hybrid() const;
-
     //
     // attention
     //
 
     ggml_tensor * build_attn_mha(
-            ggml_cgraph * gf,
             ggml_tensor * q, // [n_embd_head_q, n_head_q, n_tokens]
             ggml_tensor * k, // [n_embd_head_k, n_head_k, n_tokens]
             ggml_tensor * v, // [n_embd_head_v, n_head_v, n_tokens] (v_trans == false)
@@ -574,7 +658,6 @@ struct llm_graph_context {
 
     ggml_tensor * build_attn(
             llm_graph_input_attn_no_cache * inp,
-            ggml_cgraph * gf,
            ggml_tensor * wo,
            ggml_tensor * wo_b,
            ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens]
@@ -589,7 +672,6 @@ struct llm_graph_context {
 
     ggml_tensor * build_attn(
             llm_graph_input_attn_kv_unified * inp,
-            ggml_cgraph * gf,
            ggml_tensor * wo,
            ggml_tensor * wo_b,
            ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens]
@@ -605,7 +687,6 @@ struct llm_graph_context {
     // note: if k_cur or v_cur are not provided, they will not be stored in the memory
     ggml_tensor * build_attn(
             llm_graph_input_attn_kv_unified_iswa * inp,
-            ggml_cgraph * gf,
            ggml_tensor * wo,
            ggml_tensor * wo_b,
            ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens]
@@ -620,7 +701,6 @@ struct llm_graph_context {
 
     ggml_tensor * build_attn(
             llm_graph_input_attn_cross * inp,
-            ggml_cgraph * gf,
            ggml_tensor * wo,
            ggml_tensor * wo_b,
            ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens]
@@ -631,18 +711,6 @@ struct llm_graph_context {
            float kq_scale,
            int il) const;
 
-    ggml_tensor * build_attn(
-            llm_graph_input_mem_hybrid * inp,
-            ggml_cgraph * gf,
-            ggml_tensor * wo,
-            ggml_tensor * wo_b,
-            ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens]
-            ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens]
-            ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens]
-            ggml_tensor * kq_b,
-            ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
-            float kq_scale,
-            int il) const;
 
     //
     // recurrent
@@ -654,7 +722,6 @@ struct llm_graph_context {
     // implementation in 2 separate methods. the goal is to avoid calling `ggml_build_forward_expand` in
     // `llama_memory_recurrent`
     ggml_tensor * build_rs(
-            ggml_cgraph * gf,
            ggml_tensor * s,
            ggml_tensor * state_copy,
            int32_t state_size,
@@ -663,43 +730,37 @@ struct llm_graph_context {
            uint32_t kv_head,
            uint32_t kv_size,
            int32_t rs_zero,
-            bool avoid_copies = false) const;
+            const llm_graph_get_rows_fn & get_state_rows = ggml_get_rows) const;
 
     llm_graph_input_rs * build_rs_inp() const;
 
     ggml_tensor * build_rs(
            llm_graph_input_rs * inp,
-            ggml_cgraph * gf,
            ggml_tensor * s,
            int32_t state_size,
            int32_t n_seqs,
-            bool avoid_copies = false) const;
-
-    ggml_tensor * build_rs(
-            llm_graph_input_mem_hybrid * inp,
-            ggml_cgraph * gf,
-            ggml_tensor * s,
-            int32_t state_size,
-            int32_t n_seqs,
-            bool avoid_copies = false) const;
+            const llm_graph_get_rows_fn & get_state_rows = ggml_get_rows) const;
 
     ggml_tensor * build_rwkv_token_shift_load(
            llm_graph_input_rs * inp,
-            ggml_cgraph * gf,
            const llama_ubatch & ubatch,
-            int il) const;
+                  int il) const;
 
     ggml_tensor * build_rwkv_token_shift_store(
            ggml_tensor * token_shift,
            const llama_ubatch & ubatch,
            int il) const;
+    //
+    // hybrid
+    //
+
+    llm_graph_input_mem_hybrid * build_inp_mem_hybrid() const;
 
     //
     // pooling
     //
 
     void build_pooling(
-            ggml_cgraph * gf,
            ggml_tensor * cls,
            ggml_tensor * cls_b,
            ggml_tensor * cls_out,
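
The avoid_copies flag on build_rs is replaced by a pluggable row-gather function, with ggml_get_rows as the default. As a rough illustration (the lambda below is hypothetical, not from this diff), a caller could substitute its own gather like so:

    // Illustrative sketch only: a custom get_state_rows callback that gathers the
    // recurrent states and then forces them into a contiguous tensor.
    llm_graph_get_rows_fn get_rows_cont = [](ggml_context * ctx, ggml_tensor * states, ggml_tensor * ids) {
        ggml_tensor * rows = ggml_get_rows(ctx, states, ids);
        return ggml_cont(ctx, rows);
    };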
package/src/llama.cpp/src/llama-hparams.cpp

@@ -65,15 +65,61 @@ uint32_t llama_hparams::n_embd_v_gqa(uint32_t il) const {
     return n_embd_head_v * n_head_kv;
 }
 
+bool llama_hparams::is_n_embd_k_gqa_variable() const {
+    const uint32_t val = n_embd_k_gqa();
+    for (uint32_t il = 0; il < n_layer; ++il) {
+        if (val != n_embd_k_gqa(il)) {
+            return true;
+        }
+    }
+
+    return false;
+}
+
+bool llama_hparams::is_n_embd_v_gqa_variable() const {
+    const uint32_t val = n_embd_v_gqa();
+    for (uint32_t il = 0; il < n_layer; ++il) {
+        if (val != n_embd_v_gqa(il)) {
+            return true;
+        }
+    }
+
+    return false;
+}
+
+uint32_t llama_hparams::n_embd_k_gqa_max() const {
+    uint32_t val = n_embd_k_gqa();
+    for (uint32_t il = 0; il < n_layer; ++il) {
+        val = std::max(val, n_embd_k_gqa(il));
+    }
+
+    return val;
+}
+
+uint32_t llama_hparams::n_embd_v_gqa_max() const {
+    uint32_t val = n_embd_v_gqa();
+    for (uint32_t il = 0; il < n_layer; ++il) {
+        val = std::max(val, n_embd_v_gqa(il));
+    }
+
+    return val;
+}
+
 uint32_t llama_hparams::n_embd_r() const {
     if (wkv_head_size != 0) {
         // for RWKV models
         return token_shift_count * n_embd;
     }
 
+    if (n_shortconv_l_cache != 0) {
+        // for LFM2 models
+        return n_embd * (n_shortconv_l_cache - 1);
+    }
+
     // TODO: maybe support other convolution strides than 1
     // NOTE: since the first column of the conv_state is shifted out each time, it's not actually needed
-    return (ssm_d_conv > 0 ? ssm_d_conv - 1 : 0) * ssm_d_inner;
+    // Corresponds to Mamba's conv_states size
+    return (ssm_d_conv > 0 ? ssm_d_conv - 1 : 0) * (ssm_d_inner + 2*ssm_n_group*ssm_d_state);
 }
 
 uint32_t llama_hparams::n_embd_s() const {
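
To make the updated conv-state size concrete (the numbers below are hypothetical, chosen only for illustration): with ssm_d_conv = 4, ssm_d_inner = 4096, ssm_n_group = 8 and ssm_d_state = 128, n_embd_r() now returns (4 - 1) * (4096 + 2*8*128) = 3 * 6144 = 18432 per layer, whereas the old formula gave 3 * 4096 = 12288; the extra 2*ssm_n_group*ssm_d_state columns appear to account for Mamba-2-style layers that run the short convolution over x, B and C together.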
package/src/llama.cpp/src/llama-hparams.h

@@ -6,7 +6,7 @@
 
 // bump if necessary
 #define LLAMA_MAX_LAYERS  512
-#define LLAMA_MAX_EXPERTS 256 // DeepSeekV3
+#define LLAMA_MAX_EXPERTS 384 // Kimi-K2
 
 enum llama_expert_gating_func_type {
     LLAMA_EXPERT_GATING_FUNC_TYPE_NONE = 0,
@@ -55,6 +55,8 @@ struct llama_hparams {
     struct llama_hparams_posnet   posnet;
     struct llama_hparams_convnext convnext;
 
+    uint32_t n_shortconv_l_cache = 0;
+
     std::array<uint32_t, LLAMA_MAX_LAYERS> n_head_arr;
     std::array<uint32_t, LLAMA_MAX_LAYERS> n_head_kv_arr;
     std::array<uint32_t, LLAMA_MAX_LAYERS> n_ff_arr;
@@ -114,6 +116,7 @@ struct llama_hparams {
     uint32_t ssm_d_inner = 0;
     uint32_t ssm_d_state = 0;
     uint32_t ssm_dt_rank = 0;
+    uint32_t ssm_n_group = 0;
 
     // for hybrid state space models
     std::array<bool, LLAMA_MAX_LAYERS> recurrent_layer_arr;
@@ -188,6 +191,14 @@ struct llama_hparams {
     // dimension of value embeddings across all k-v heads
     uint32_t n_embd_v_gqa(uint32_t il = 0) const;
 
+    // true if any layer has a different n_embd_k_gqa/n_embd_v_gqa
+    bool is_n_embd_k_gqa_variable() const;
+    bool is_n_embd_v_gqa_variable() const;
+
+    // return the maximum n_embd_k_gqa/n_embd_v_gqa across all layers
+    uint32_t n_embd_k_gqa_max() const;
+    uint32_t n_embd_v_gqa_max() const;
+
     // dimension of the rolling state embeddings
     // corresponds to Mamba's conv_states size or RWKV's token_shift states size
     uint32_t n_embd_r() const;
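
These helpers let callers size buffers safely when the K/V projection width differs per layer; a rough sketch of that use (the function below is hypothetical, not part of this diff) could look like:

    // Illustrative sketch only: an upper bound on the per-layer K-cache row size when
    // n_embd_k_gqa varies across layers (e.g. hybrid models with mixed layer types).
    static size_t k_row_bytes_upper_bound(const llama_hparams & hparams, ggml_type type_k) {
        const uint32_t n_embd_k_max = hparams.is_n_embd_k_gqa_variable()
            ? hparams.n_embd_k_gqa_max()
            : hparams.n_embd_k_gqa();
        return ggml_row_size(type_k, n_embd_k_max);
    }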