@fugood/llama.node 1.0.3 → 1.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. package/package.json +14 -14
  2. package/src/llama.cpp/common/CMakeLists.txt +4 -5
  3. package/src/llama.cpp/common/arg.cpp +37 -0
  4. package/src/llama.cpp/common/common.cpp +22 -6
  5. package/src/llama.cpp/common/common.h +14 -1
  6. package/src/llama.cpp/ggml/CMakeLists.txt +3 -0
  7. package/src/llama.cpp/ggml/include/ggml-webgpu.h +19 -0
  8. package/src/llama.cpp/ggml/include/ggml.h +13 -0
  9. package/src/llama.cpp/ggml/src/CMakeLists.txt +1 -0
  10. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +343 -1094
  11. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +23 -8
  12. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +3 -0
  13. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +39 -0
  14. package/src/llama.cpp/include/llama.h +13 -48
  15. package/src/llama.cpp/src/llama-arch.cpp +222 -15
  16. package/src/llama.cpp/src/llama-arch.h +16 -1
  17. package/src/llama.cpp/src/llama-batch.cpp +76 -70
  18. package/src/llama.cpp/src/llama-batch.h +24 -18
  19. package/src/llama.cpp/src/llama-chat.cpp +44 -1
  20. package/src/llama.cpp/src/llama-chat.h +2 -0
  21. package/src/llama.cpp/src/llama-context.cpp +134 -95
  22. package/src/llama.cpp/src/llama-context.h +13 -16
  23. package/src/llama.cpp/src/llama-cparams.h +3 -2
  24. package/src/llama.cpp/src/llama-graph.cpp +239 -154
  25. package/src/llama.cpp/src/llama-graph.h +162 -126
  26. package/src/llama.cpp/src/llama-hparams.cpp +45 -0
  27. package/src/llama.cpp/src/llama-hparams.h +11 -1
  28. package/src/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +11 -5
  29. package/src/llama.cpp/src/llama-kv-cache-unified-iswa.h +3 -0
  30. package/src/llama.cpp/src/llama-kv-cache-unified.cpp +698 -302
  31. package/src/llama.cpp/src/llama-kv-cache-unified.h +89 -31
  32. package/src/llama.cpp/src/llama-memory-hybrid.cpp +1 -0
  33. package/src/llama.cpp/src/llama-memory-recurrent.cpp +6 -9
  34. package/src/llama.cpp/src/llama-model.cpp +2309 -665
  35. package/src/llama.cpp/src/llama-model.h +18 -4
  36. package/src/llama.cpp/src/llama-quant.cpp +2 -2
  37. package/src/llama.cpp/src/llama-vocab.cpp +368 -9
  38. package/src/llama.cpp/src/llama-vocab.h +43 -0
  39. package/src/llama.cpp/src/unicode.cpp +207 -0
  40. package/src/llama.cpp/src/unicode.h +2 -0
package/src/llama.cpp/src/llama-graph.h

@@ -1,6 +1,7 @@
  #pragma once

  #include "llama-arch.h"
+ #include "llama-batch.h"
  #include "llama-hparams.h"
  #include "llama-adapter.h"

@@ -14,7 +15,6 @@ struct ggml_cgraph;
  struct ggml_context;
  struct ggml_tensor;

- struct llama_ubatch;
  struct llama_cparams;

  struct llama_memory_context_i;
@@ -69,6 +69,8 @@ struct llama_cross {
  std::vector<std::set<llama_seq_id>> seq_ids_enc;
  };

+ struct llm_graph_params;
+
  //
  // llm_graph_input
  //
@@ -78,11 +80,19 @@ public:
  virtual ~llm_graph_input_i() = default;

  virtual void set_input(const llama_ubatch * ubatch) = 0;
+
+ // return true if the resulting input tensors using the provided graph parameters would be
+ // the same as the previous input tensors that we have currently stored in the object
+ virtual bool can_reuse(const llm_graph_params & params) {
+ // returning false here by default will prevent from reusing the graph if the check
+ // for the input type has not been implemented yet
+ GGML_UNUSED(params);
+ return false;
+ }
  };

  using llm_graph_input_ptr = std::unique_ptr<llm_graph_input_i>;

-
  class llm_graph_input_embd : public llm_graph_input_i {
  public:
  llm_graph_input_embd() = default;
@@ -90,6 +100,8 @@ public:

  void set_input(const llama_ubatch * ubatch) override;

+ bool can_reuse(const llm_graph_params & params) override;
+
  ggml_tensor * tokens = nullptr; // I32 [n_batch]
  ggml_tensor * embd = nullptr; // F32 [n_embd, n_batch]
  };
@@ -101,6 +113,8 @@ public:

  void set_input(const llama_ubatch * ubatch) override;

+ bool can_reuse(const llm_graph_params & params) override;
+
  ggml_tensor * pos = nullptr; // I32 [n_batch]

  const uint32_t n_pos_per_embd = 1;
@@ -154,17 +168,19 @@ public:
  llm_graph_input_out_ids(
  const llama_hparams & hparams,
  const llama_cparams & cparams,
- int32_t n_outputs) : hparams(hparams), cparams(cparams), n_outputs(n_outputs) {}
+ uint32_t n_outputs) : hparams(hparams), cparams(cparams), n_outputs(n_outputs) {}
  virtual ~llm_graph_input_out_ids() = default;

  void set_input(const llama_ubatch * ubatch) override;

+ bool can_reuse(const llm_graph_params & params) override;
+
  ggml_tensor * out_ids; // I32 [n_outputs]

  const llama_hparams & hparams;
  const llama_cparams & cparams;

- const int32_t n_outputs;
+ const uint32_t n_outputs;
  };

  class llm_graph_input_mean : public llm_graph_input_i {
@@ -249,16 +265,18 @@ public:

  void set_input(const llama_ubatch * ubatch) override;

+ bool can_reuse(const llm_graph_params & params) override;
+
  ggml_tensor * get_k_idxs() const { return self_k_idxs; }
  ggml_tensor * get_v_idxs() const { return self_v_idxs; }

  ggml_tensor * get_kq_mask() const { return self_kq_mask_cnv; }

  ggml_tensor * self_k_idxs = nullptr; // I64 [n_batch]
- ggml_tensor * self_v_idxs = nullptr; // I64 [n_batch]
+ ggml_tensor * self_v_idxs = nullptr; // I64 [n_batch] or [n_batch*n_embd_v_gqa]

- ggml_tensor * self_kq_mask = nullptr; // F32 [n_kv, n_batch, 1, 1]
- ggml_tensor * self_kq_mask_cnv = nullptr; // [n_kv, n_batch, 1, 1]
+ ggml_tensor * self_kq_mask = nullptr; // F32 [n_kv, n_batch/n_stream, 1, n_stream]
+ ggml_tensor * self_kq_mask_cnv = nullptr; // [n_kv, n_batch/n_stream, 1, n_stream]

  const llama_hparams & hparams;
  const llama_cparams & cparams;
@@ -280,6 +298,8 @@ public:

  void set_input(const llama_ubatch * ubatch) override;

+ bool can_reuse(const llm_graph_params & params) override;
+
  ggml_tensor * get_k_idxs() const { return self_k_idxs; }
  ggml_tensor * get_v_idxs() const { return self_v_idxs; }
  ggml_tensor * get_k_idxs_swa() const { return self_k_idxs_swa; }
@@ -289,14 +309,14 @@ public:
  ggml_tensor * get_kq_mask_swa() const { return self_kq_mask_swa_cnv; }

  ggml_tensor * self_k_idxs = nullptr; // I64 [n_batch]
- ggml_tensor * self_v_idxs = nullptr; // I64 [n_batch]
+ ggml_tensor * self_v_idxs = nullptr; // I64 [n_batch] or [n_batch*n_embd_v_gqa]
  ggml_tensor * self_k_idxs_swa = nullptr; // I64 [n_batch]
- ggml_tensor * self_v_idxs_swa = nullptr; // I64 [n_batch]
+ ggml_tensor * self_v_idxs_swa = nullptr; // I64 [n_batch] or [n_batch*n_embd_v_gqa]

- ggml_tensor * self_kq_mask = nullptr; // F32 [n_kv, n_batch, 1, 1]
- ggml_tensor * self_kq_mask_cnv = nullptr; // [n_kv, n_batch, 1, 1]
- ggml_tensor * self_kq_mask_swa = nullptr; // F32 [n_kv, n_batch, 1, 1]
- ggml_tensor * self_kq_mask_swa_cnv = nullptr; // [n_kv, n_batch, 1, 1]
+ ggml_tensor * self_kq_mask = nullptr; // F32 [n_kv, n_batch/n_stream, 1, n_stream]
+ ggml_tensor * self_kq_mask_cnv = nullptr; // [n_kv, n_batch/n_stream, 1, n_stream]
+ ggml_tensor * self_kq_mask_swa = nullptr; // F32 [n_kv, n_batch/n_stream, 1, n_stream]
+ ggml_tensor * self_kq_mask_swa_cnv = nullptr; // [n_kv, n_batch/n_stream, 1, n_stream]

  const llama_hparams & hparams;
  const llama_cparams & cparams;
@@ -322,47 +342,25 @@ public:
  class llm_graph_input_mem_hybrid : public llm_graph_input_i {
  public:
  llm_graph_input_mem_hybrid(
- const llama_hparams & hparams,
- const llama_cparams & cparams,
- const llama_memory_hybrid_context * mctx) :
- hparams(hparams),
- cparams(cparams),
- mctx(mctx) {
- }
+ std::unique_ptr<llm_graph_input_attn_kv_unified> inp_attn,
+ std::unique_ptr<llm_graph_input_rs> inp_rs,
+ const llama_memory_hybrid_context * mctx) :
+ inp_attn(std::move(inp_attn)),
+ inp_rs(std::move(inp_rs)),
+ mctx(mctx) { }
  virtual ~llm_graph_input_mem_hybrid() = default;

  void set_input(const llama_ubatch * ubatch) override;

- ggml_tensor * s_copy; // I32 [kv_size]
-
- ggml_tensor * get_k_idxs() const { return self_k_idxs; }
- ggml_tensor * get_v_idxs() const { return self_v_idxs; }
-
- ggml_tensor * get_kq_mask() const { return self_kq_mask_cnv; }
-
- ggml_tensor * self_k_idxs = nullptr; // I64 [n_batch]
- ggml_tensor * self_v_idxs = nullptr; // I64 [n_batch]
-
- ggml_tensor * self_kq_mask = nullptr; // F32 [n_kv, n_batch, 1, 1]
- ggml_tensor * self_kq_mask_cnv = nullptr; // [n_kv, n_batch, 1, 1]
+ std::unique_ptr<llm_graph_input_attn_kv_unified> inp_attn;
+ std::unique_ptr<llm_graph_input_rs> inp_rs;

- const llama_hparams & hparams;
- const llama_cparams & cparams;
+ llm_graph_input_attn_kv_unified * get_attn() const { return inp_attn.get(); }
+ llm_graph_input_rs * get_recr() const { return inp_rs.get(); }

  const llama_memory_hybrid_context * mctx;
  };

- // TODO: remove this when ggml_scale_add is implemented
- class llm_graph_input_one : public llm_graph_input_i {
- public:
- llm_graph_input_one() {}
- virtual ~llm_graph_input_one() = default;
-
- void set_input(const llama_ubatch * ubatch) override;
-
- ggml_tensor * one = nullptr; // F32
- };
-
  //
  // llm_graph_result
  //
@@ -373,40 +371,108 @@ public:
  // along with the input tensors, the object also provides commonly used outputs tensors, such as logits, embeddings, etc.
  // these are used by the llama_context to extact the relevant data, based on the compute parameters

- class llm_graph_result_i {
- public:
- virtual ~llm_graph_result_i() = default;
+ // callback that allows us to apply custom logic to each tensor (e.g. ggml-alloc, offloading, etc.)
+ using llm_graph_cb = std::function<void(const llama_ubatch & ubatch, ggml_tensor * cur, const char * name, int il)>;

- virtual ggml_tensor * get_tokens() = 0;
- virtual ggml_tensor * get_logits() = 0;
- virtual ggml_tensor * get_embd() = 0;
- virtual ggml_tensor * get_embd_pooled() = 0;
+ class llm_graph_result;

- virtual void set_inputs(const llama_ubatch * ubatch) = 0;
- };
+ struct llm_graph_params {
+ llm_arch arch = LLM_ARCH_UNKNOWN;

- using llm_graph_result_ptr = std::unique_ptr<llm_graph_result_i>;
+ llama_hparams hparams;
+ llama_cparams cparams;

+ llama_ubatch ubatch; // note: intentionally make a copy

- class llm_graph_result : public llm_graph_result_i {
- public:
- virtual ~llm_graph_result() = default;
+ llm_graph_type gtype;

- ggml_tensor * get_tokens() override { return t_tokens; }
- ggml_tensor * get_logits() override { return t_logits; }
- ggml_tensor * get_embd() override { return t_embd; }
- ggml_tensor * get_embd_pooled() override { return t_embd_pooled; }
+ ggml_backend_sched_t sched;
+ ggml_backend_t backend_cpu;

- void set_inputs(const llama_ubatch * ubatch) override {
- for (auto & input : inputs) {
- input->set_input(ubatch);
+ const llama_adapter_cvec * cvec;
+ const llama_adapter_loras * loras;
+ const llama_memory_context_i * mctx;
+ const llama_cross * cross;
+
+ uint32_t n_outputs;
+
+ llm_graph_cb cb;
+
+ llm_graph_result * res;
+
+ // return true if the "other" params would result in a graph with the same topology as with the current params
+ // having the same topology allows us to reuse the graph in some cases
+ bool allow_reuse(const llm_graph_params & other) const {
+ // first check the ubatch
+ bool can_reuse_ubatch =
+ ubatch.equal_seqs() == other.ubatch.equal_seqs() &&
+ ubatch.n_tokens == other.ubatch.n_tokens &&
+ ubatch.n_seq_tokens == other.ubatch.n_seq_tokens &&
+ ubatch.n_seqs == other.ubatch.n_seqs &&
+ ubatch.n_seqs_unq == other.ubatch.n_seqs_unq &&
+ (
+ (!ubatch.token && !other.ubatch.token) ||
+ (!ubatch.embd && !other.ubatch.embd)
+ );
+
+ if (can_reuse_ubatch && !ubatch.equal_seqs()) {
+ if (!ubatch.data) {
+ // if the old ubatch does not own it's data, then we cannot guarantee that it is still alive, and
+ // therefore we cannot perform the sequence id check. normally should never happen
+ can_reuse_ubatch = false;
+ } else {
+ for (uint32_t s = 0; s < ubatch.n_seqs_unq; ++s) {
+ can_reuse_ubatch &= ubatch.seq_id_unq[s] == other.ubatch.seq_id_unq[s];
+ }
+ }
+ }
+
+ if (!can_reuse_ubatch) {
+ return false;
  }
- }

- llm_graph_input_i * add_input(llm_graph_input_ptr input) {
- inputs.emplace_back(std::move(input));
- return inputs.back().get();
+ return
+ cparams.embeddings == other.cparams.embeddings &&
+ cparams.causal_attn == other.cparams.causal_attn &&
+ arch == other.arch &&
+ gtype == other.gtype &&
+ cvec == other.cvec &&
+ loras == other.loras &&
+ cross == other.cross &&
+ n_outputs == other.n_outputs;
  }
+ };
+
+ class llm_graph_result {
+ public:
+ llm_graph_result(int64_t max_nodes);
+
+ virtual ~llm_graph_result() = default;
+
+ ggml_tensor * get_tokens() const { return t_tokens; }
+ ggml_tensor * get_logits() const { return t_logits; }
+ ggml_tensor * get_embd() const { return t_embd; }
+ ggml_tensor * get_embd_pooled() const { return t_embd_pooled; }
+
+ ggml_cgraph * get_gf() const { return gf; }
+ ggml_context * get_ctx() const { return ctx_compute.get(); }
+
+ int64_t get_max_nodes() const;
+
+ void reset();
+
+ void set_inputs(const llama_ubatch * ubatch);
+
+ // try to update the existing graph result using the new graph parameters in order to reuse it
+ // this can only be done if we determine that the resulting graph using the new graph parameters
+ // would be identical to the existing graph. in that case, we simply have to update the memory
+ // contexts of the input tensors of the graph and we can reuse it for another computation
+ // return true if the graph was updated and can be reused
+ bool can_reuse(const llm_graph_params & params);
+
+ llm_graph_input_i * add_input(llm_graph_input_ptr input);
+
+ void set_params(const llm_graph_params & params);

  // important graph nodes
  ggml_tensor * t_tokens = nullptr;
@@ -415,36 +481,31 @@ public:
  ggml_tensor * t_embd_pooled = nullptr;

  std::vector<llm_graph_input_ptr> inputs;
- };

- //
- // llm_graph_context
- //
+ ggml_context_ptr ctx_compute;

- // callback that allows us to apply custom logic to each tensor (e.g. ggml-alloc, offloading, etc.)
- using llm_graph_cb = std::function<void(const llama_ubatch & ubatch, ggml_tensor * cur, const char * name, int il)>;
+ // memory buffers used to evaluate the model
+ std::vector<uint8_t> buf_compute_meta;

- struct llm_graph_params {
- ggml_context * ctx;
+ ggml_cgraph * gf;

- const llm_arch arch;
+ int64_t max_nodes;

- const llama_hparams & hparams;
- const llama_cparams & cparams;
- const llama_ubatch & ubatch;
+ private:
+ // keep a copy of the previous graph parameters
+ // we will use this to determine whether the graph can be reused by comparing them with the new parameters
+ // note: these are updated after constructing the new graph
+ llm_graph_params params;

- ggml_backend_sched_t sched;
- ggml_backend_t backend_cpu;
-
- const llama_adapter_cvec * cvec;
- const llama_adapter_loras * loras;
- const llama_memory_context_i * mctx;
- const llama_cross * cross;
+ // env: LLAMA_GRAPH_RESULT_DEBUG
+ int debug = 0;
+ };

- uint32_t n_outputs;
+ using llm_graph_result_ptr = std::unique_ptr<llm_graph_result>;

- const llm_graph_cb & cb;
- };
+ //
+ // llm_graph_context
+ //

  // used in build_rs to properly order writes and avoid unnecessary copies
  using llm_graph_get_rows_fn = std::function<ggml_tensor * (ggml_context *, ggml_tensor * states, ggml_tensor * ids)>;
@@ -485,8 +546,6 @@ struct llm_graph_context {
  const enum llama_pooling_type pooling_type;
  const enum llama_rope_type rope_type;

- ggml_context * ctx0 = nullptr;
-
  ggml_backend_sched_t sched;

  ggml_backend_t backend_cpu; // TODO: needed by build_attn_mha, figure out a way to remove?
@@ -498,7 +557,10 @@ struct llm_graph_context {

  const llm_graph_cb & cb_func;

- std::unique_ptr<llm_graph_result> res;
+ llm_graph_result * res;
+
+ ggml_context * ctx0 = nullptr;
+ ggml_cgraph * gf = nullptr;

  llm_graph_context(const llm_graph_params & params);
  virtual ~llm_graph_context() = default;
@@ -579,14 +641,11 @@ struct llm_graph_context {
  ggml_tensor * build_inp_pos_bucket_dec() const;
  ggml_tensor * build_pos_bias(ggml_tensor * pos_bucket, ggml_tensor * attn_rel_b) const;

- llm_graph_input_mem_hybrid * build_inp_mem_hybrid() const;
-
  //
  // attention
  //

  ggml_tensor * build_attn_mha(
- ggml_cgraph * gf,
  ggml_tensor * q, // [n_embd_head_q, n_head_q, n_tokens]
  ggml_tensor * k, // [n_embd_head_k, n_head_k, n_tokens]
  ggml_tensor * v, // [n_embd_head_v, n_head_v, n_tokens] (v_trans == false)
@@ -599,7 +658,6 @@ struct llm_graph_context {

  ggml_tensor * build_attn(
  llm_graph_input_attn_no_cache * inp,
- ggml_cgraph * gf,
  ggml_tensor * wo,
  ggml_tensor * wo_b,
  ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens]
@@ -614,7 +672,6 @@ struct llm_graph_context {

  ggml_tensor * build_attn(
  llm_graph_input_attn_kv_unified * inp,
- ggml_cgraph * gf,
  ggml_tensor * wo,
  ggml_tensor * wo_b,
  ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens]
@@ -630,7 +687,6 @@ struct llm_graph_context {
  // note: if k_cur or v_cur are not provided, they will not be stored in the memory
  ggml_tensor * build_attn(
  llm_graph_input_attn_kv_unified_iswa * inp,
- ggml_cgraph * gf,
  ggml_tensor * wo,
  ggml_tensor * wo_b,
  ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens]
@@ -645,7 +701,6 @@ struct llm_graph_context {

  ggml_tensor * build_attn(
  llm_graph_input_attn_cross * inp,
- ggml_cgraph * gf,
  ggml_tensor * wo,
  ggml_tensor * wo_b,
  ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens]
@@ -656,18 +711,6 @@ struct llm_graph_context {
  float kq_scale,
  int il) const;

- ggml_tensor * build_attn(
- llm_graph_input_mem_hybrid * inp,
- ggml_cgraph * gf,
- ggml_tensor * wo,
- ggml_tensor * wo_b,
- ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens]
- ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens]
- ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens]
- ggml_tensor * kq_b,
- ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
- float kq_scale,
- int il) const;
  //
  // recurrent
  //
@@ -679,7 +722,6 @@ struct llm_graph_context {
  // implementation in 2 separate methods. the goal is to avoid calling `ggml_build_forward_expand` in
  // `llama_memory_recurrent`
  ggml_tensor * build_rs(
- ggml_cgraph * gf,
  ggml_tensor * s,
  ggml_tensor * state_copy,
  int32_t state_size,
@@ -694,15 +736,6 @@ struct llm_graph_context {

  ggml_tensor * build_rs(
  llm_graph_input_rs * inp,
- ggml_cgraph * gf,
- ggml_tensor * s,
- int32_t state_size,
- int32_t n_seqs,
- const llm_graph_get_rows_fn & get_state_rows = ggml_get_rows) const;
-
- ggml_tensor * build_rs(
- llm_graph_input_mem_hybrid * inp,
- ggml_cgraph * gf,
  ggml_tensor * s,
  int32_t state_size,
  int32_t n_seqs,
@@ -710,21 +743,24 @@ struct llm_graph_context {

  ggml_tensor * build_rwkv_token_shift_load(
  llm_graph_input_rs * inp,
- ggml_cgraph * gf,
  const llama_ubatch & ubatch,
- int il) const;
+ int il) const;

  ggml_tensor * build_rwkv_token_shift_store(
  ggml_tensor * token_shift,
  const llama_ubatch & ubatch,
  int il) const;
+ //
+ // hybrid
+ //
+
+ llm_graph_input_mem_hybrid * build_inp_mem_hybrid() const;

  //
  // pooling
  //

  void build_pooling(
- ggml_cgraph * gf,
  ggml_tensor * cls,
  ggml_tensor * cls_b,
  ggml_tensor * cls_out,
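The llama-graph.h changes above center on graph reuse: llm_graph_params::allow_reuse() compares the topology-relevant parameters of two graphs, the new llm_graph_input_i::can_reuse() hook (default false) lets individual input types opt in, and llm_graph_result::can_reuse() decides whether an already-built graph can be kept for a new ubatch. The snippet below is a minimal, self-contained sketch of one plausible way these pieces compose; the names graph_params, graph_input, graph_input_embd, and graph_result are illustrative stand-ins, not the actual llama.cpp types.

```cpp
// Simplified model of the params/input/result reuse check (C++17).
#include <cstdint>
#include <iostream>
#include <memory>
#include <vector>

struct graph_params {
    uint32_t n_tokens  = 0;
    uint32_t n_outputs = 0;
    bool     causal    = true;

    // same idea as llm_graph_params::allow_reuse: identical topology-relevant fields
    bool allow_reuse(const graph_params & other) const {
        return n_tokens == other.n_tokens &&
               n_outputs == other.n_outputs &&
               causal    == other.causal;
    }
};

struct graph_input {
    virtual ~graph_input() = default;
    // mirrors llm_graph_input_i::can_reuse: default to false so that input types
    // without an explicit check never allow reuse
    virtual bool can_reuse(const graph_params &) { return false; }
};

struct graph_input_embd : graph_input {
    // illustrative: this input's tensors depend only on the compared params
    bool can_reuse(const graph_params &) override { return true; }
};

struct graph_result {
    graph_params params; // copy of the params the graph was last built/updated with
    std::vector<std::unique_ptr<graph_input>> inputs;

    // roughly models llm_graph_result::can_reuse: params must match AND every input must agree
    bool can_reuse(const graph_params & new_params) {
        bool ok = params.allow_reuse(new_params);
        for (auto & inp : inputs) {
            ok = ok && inp->can_reuse(new_params);
        }
        if (ok) {
            params = new_params; // remember the latest params for the next comparison
        }
        return ok;
    }
};

int main() {
    graph_result res;
    res.params = {512, 1, true};
    res.inputs.push_back(std::make_unique<graph_input_embd>());

    std::cout << res.can_reuse({512, 1, true}) << "\n"; // 1: same topology, graph is reusable
    std::cout << res.can_reuse({256, 1, true}) << "\n"; // 0: different n_tokens, rebuild needed
}
```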
package/src/llama.cpp/src/llama-hparams.cpp

@@ -65,12 +65,57 @@ uint32_t llama_hparams::n_embd_v_gqa(uint32_t il) const {
  return n_embd_head_v * n_head_kv;
  }

+ bool llama_hparams::is_n_embd_k_gqa_variable() const {
+ const uint32_t val = n_embd_k_gqa();
+ for (uint32_t il = 0; il < n_layer; ++il) {
+ if (val != n_embd_k_gqa(il)) {
+ return true;
+ }
+ }
+
+ return false;
+ }
+
+ bool llama_hparams::is_n_embd_v_gqa_variable() const {
+ const uint32_t val = n_embd_v_gqa();
+ for (uint32_t il = 0; il < n_layer; ++il) {
+ if (val != n_embd_v_gqa(il)) {
+ return true;
+ }
+ }
+
+ return false;
+ }
+
+ uint32_t llama_hparams::n_embd_k_gqa_max() const {
+ uint32_t val = n_embd_k_gqa();
+ for (uint32_t il = 0; il < n_layer; ++il) {
+ val = std::max(val, n_embd_k_gqa(il));
+ }
+
+ return val;
+ }
+
+ uint32_t llama_hparams::n_embd_v_gqa_max() const {
+ uint32_t val = n_embd_v_gqa();
+ for (uint32_t il = 0; il < n_layer; ++il) {
+ val = std::max(val, n_embd_v_gqa(il));
+ }
+
+ return val;
+ }
+
  uint32_t llama_hparams::n_embd_r() const {
  if (wkv_head_size != 0) {
  // for RWKV models
  return token_shift_count * n_embd;
  }

+ if (n_shortconv_l_cache != 0) {
+ // for LFM2 models
+ return n_embd * (n_shortconv_l_cache - 1);
+ }
+
  // TODO: maybe support other convolution strides than 1
  // NOTE: since the first column of the conv_state is shifted out each time, it's not actually needed
  // Corresponds to Mamba's conv_states size
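The new llama_hparams helpers above scan every layer: is_n_embd_k_gqa_variable()/is_n_embd_v_gqa_variable() report whether any layer deviates from layer 0, and n_embd_k_gqa_max()/n_embd_v_gqa_max() return the per-layer maximum, which a cache can use to size buffers when KV dimensions differ across layers. Below is a stand-alone sketch of the same scan, using a plain vector in place of the per-layer accessors (the function and variable names are illustrative only).

```cpp
// Per-layer maximum and variability scan, mirroring the shape of the new helpers.
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <vector>

static uint32_t n_embd_k_gqa_max(const std::vector<uint32_t> & per_layer) {
    uint32_t val = per_layer.empty() ? 0 : per_layer[0];
    for (uint32_t v : per_layer) {
        val = std::max(val, v); // same loop shape as llama_hparams::n_embd_k_gqa_max()
    }
    return val;
}

static bool is_n_embd_k_gqa_variable(const std::vector<uint32_t> & per_layer) {
    // true if any layer deviates from layer 0, as in llama_hparams::is_n_embd_k_gqa_variable()
    for (uint32_t v : per_layer) {
        if (v != per_layer[0]) {
            return true;
        }
    }
    return false;
}

int main() {
    std::vector<uint32_t> uniform  = {1024, 1024, 1024};
    std::vector<uint32_t> variable = {1024,  512, 1024};

    std::cout << n_embd_k_gqa_max(variable)         << "\n"; // 1024
    std::cout << is_n_embd_k_gqa_variable(uniform)  << "\n"; // 0
    std::cout << is_n_embd_k_gqa_variable(variable) << "\n"; // 1
}
```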
package/src/llama.cpp/src/llama-hparams.h

@@ -6,7 +6,7 @@

  // bump if necessary
  #define LLAMA_MAX_LAYERS 512
- #define LLAMA_MAX_EXPERTS 256 // DeepSeekV3
+ #define LLAMA_MAX_EXPERTS 384 // Kimi-K2

  enum llama_expert_gating_func_type {
  LLAMA_EXPERT_GATING_FUNC_TYPE_NONE = 0,
@@ -55,6 +55,8 @@ struct llama_hparams {
  struct llama_hparams_posnet posnet;
  struct llama_hparams_convnext convnext;

+ uint32_t n_shortconv_l_cache = 0;
+
  std::array<uint32_t, LLAMA_MAX_LAYERS> n_head_arr;
  std::array<uint32_t, LLAMA_MAX_LAYERS> n_head_kv_arr;
  std::array<uint32_t, LLAMA_MAX_LAYERS> n_ff_arr;
@@ -189,6 +191,14 @@ struct llama_hparams {
  // dimension of value embeddings across all k-v heads
  uint32_t n_embd_v_gqa(uint32_t il = 0) const;

+ // true if any layer has a different n_embd_k_gqa/n_embd_v_gqa
+ bool is_n_embd_k_gqa_variable() const;
+ bool is_n_embd_v_gqa_variable() const;
+
+ // return the maximum n_embd_k_gqa/n_embd_v_gqa across all layers
+ uint32_t n_embd_k_gqa_max() const;
+ uint32_t n_embd_v_gqa_max() const;
+
  // dimension of the rolling state embeddings
  // corresponds to Mamba's conv_states size or RWKV's token_shift states size
  uint32_t n_embd_r() const;
package/src/llama.cpp/src/llama-kv-cache-unified-iswa.cpp

@@ -18,16 +18,17 @@ llama_kv_cache_unified_iswa::llama_kv_cache_unified_iswa(
  bool v_trans,
  bool offload,
  bool swa_full,
+ bool unified,
  uint32_t kv_size,
  uint32_t n_seq_max,
  uint32_t n_ubatch,
- uint32_t n_pad) : hparams(model.hparams) {
+ uint32_t n_pad) : hparams(model.hparams), unified(unified) {
  llama_kv_cache_unified::layer_filter_cb filter_base = [&](int32_t il) { return !model.hparams.is_swa(il); };
  llama_kv_cache_unified::layer_filter_cb filter_swa = [&](int32_t il) { return model.hparams.is_swa(il); };

  const uint32_t size_base = kv_size;

- uint32_t size_swa = std::min(size_base, GGML_PAD(hparams.n_swa*n_seq_max + n_ubatch, n_pad));
+ uint32_t size_swa = std::min(size_base, GGML_PAD(hparams.n_swa*(unified ? n_seq_max : 1) + n_ubatch, n_pad));

  // when using full-size SWA cache, we set the SWA cache size to be equal to the base cache size
  if (swa_full) {
@@ -41,14 +42,14 @@ llama_kv_cache_unified_iswa::llama_kv_cache_unified_iswa(

  kv_base = std::make_unique<llama_kv_cache_unified>(
  model, std::move(filter_base), type_k, type_v,
- v_trans, offload, size_base, n_seq_max, n_pad,
+ v_trans, offload, unified, size_base, n_seq_max, n_pad,
  0, LLAMA_SWA_TYPE_NONE);

  LLAMA_LOG_INFO("%s: creating SWA KV cache, size = %u cells\n", __func__, size_swa);

  kv_swa = std::make_unique<llama_kv_cache_unified>(
  model, std::move(filter_swa), type_k, type_v,
- v_trans, offload, size_swa, n_seq_max, n_pad,
+ v_trans, offload, unified, size_swa, n_seq_max, n_pad,
  hparams.n_swa, hparams.swa_type);
  }

@@ -100,6 +101,11 @@ llama_memory_context_ptr llama_kv_cache_unified_iswa::init_batch(llama_batch_all

  // first try simple split
  do {
+ if (!unified) {
+ // requires equal splits, so we skip the simple split
+ break;
+ }
+
  balloc.split_reset();

  std::vector<llama_ubatch> ubatches;
@@ -140,7 +146,7 @@ llama_memory_context_ptr llama_kv_cache_unified_iswa::init_batch(llama_batch_all

  std::vector<llama_ubatch> ubatches;
  while (true) {
- auto ubatch = balloc.split_equal(n_ubatch, false);
+ auto ubatch = balloc.split_equal(n_ubatch, !unified);

  if (ubatch.n_tokens == 0) {
  break;
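In the SWA cache sizing above, the new unified flag reserves n_swa cells per sequence only in the unified (shared-stream) case; when the cache is not unified, a single window is reserved, presumably because each sequence then gets its own KV stream. A stand-alone sketch of the arithmetic follows; GGML_PAD is reproduced here for illustration (it rounds up to a multiple of n_pad) and the example numbers are made up.

```cpp
// Worked example of size_swa = min(size_base, GGML_PAD(n_swa*(unified ? n_seq_max : 1) + n_ubatch, n_pad)).
#include <algorithm>
#include <cstdint>
#include <iostream>

#define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1))

static uint32_t swa_size(bool unified, uint32_t n_swa, uint32_t n_seq_max,
                         uint32_t n_ubatch, uint32_t n_pad, uint32_t size_base) {
    // unified: one shared KV stream must hold the SWA window of every sequence;
    // non-unified: presumably one stream per sequence, so a single window plus one ubatch suffices
    const uint32_t want = n_swa * (unified ? n_seq_max : 1) + n_ubatch;
    return std::min(size_base, (uint32_t) GGML_PAD(want, n_pad));
}

int main() {
    // example: 4096-token SWA window, 4 sequences, 512-token ubatch, padding of 256, large base cache
    std::cout << swa_size(true,  4096, 4, 512, 256, 1u << 20) << "\n"; // 16896 = 4096*4 + 512
    std::cout << swa_size(false, 4096, 4, 512, 256, 1u << 20) << "\n"; //  4608 = 4096   + 512
}
```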
package/src/llama.cpp/src/llama-kv-cache-unified-iswa.h

@@ -20,6 +20,7 @@ public:
  bool v_trans,
  bool offload,
  bool swa_full,
+ bool unified,
  uint32_t kv_size,
  uint32_t n_seq_max,
  uint32_t n_ubatch,
@@ -68,6 +69,8 @@ public:
  private:
  const llama_hparams & hparams;

+ const bool unified;
+
  std::unique_ptr<llama_kv_cache_unified> kv_base;
  std::unique_ptr<llama_kv_cache_unified> kv_swa;
  };