@fugood/llama.node 1.0.0-beta.5 → 1.0.0-beta.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (113)
  1. package/lib/binding.ts +3 -1
  2. package/lib/index.js +2 -0
  3. package/lib/index.ts +3 -1
  4. package/package.json +14 -14
  5. package/scripts/llama.cpp.patch +27 -26
  6. package/src/EmbeddingWorker.cpp +1 -1
  7. package/src/LlamaCompletionWorker.cpp +28 -7
  8. package/src/LlamaCompletionWorker.h +4 -0
  9. package/src/LlamaContext.cpp +14 -17
  10. package/src/common.hpp +7 -6
  11. package/src/llama.cpp/CMakeLists.txt +15 -4
  12. package/src/llama.cpp/common/CMakeLists.txt +15 -24
  13. package/src/llama.cpp/common/arg.cpp +172 -110
  14. package/src/llama.cpp/common/chat-parser.cpp +385 -0
  15. package/src/llama.cpp/common/chat-parser.h +120 -0
  16. package/src/llama.cpp/common/chat.cpp +726 -596
  17. package/src/llama.cpp/common/chat.h +74 -8
  18. package/src/llama.cpp/common/common.cpp +56 -38
  19. package/src/llama.cpp/common/common.h +9 -3
  20. package/src/llama.cpp/common/json-partial.cpp +256 -0
  21. package/src/llama.cpp/common/json-partial.h +38 -0
  22. package/src/llama.cpp/common/json-schema-to-grammar.cpp +2 -1
  23. package/src/llama.cpp/common/json-schema-to-grammar.h +4 -4
  24. package/src/llama.cpp/common/sampling.cpp +7 -8
  25. package/src/llama.cpp/common/speculative.cpp +6 -4
  26. package/src/llama.cpp/ggml/CMakeLists.txt +48 -3
  27. package/src/llama.cpp/ggml/include/ggml.h +22 -3
  28. package/src/llama.cpp/ggml/src/CMakeLists.txt +81 -22
  29. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +131 -49
  30. package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +1 -1
  31. package/src/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp +1 -1
  32. package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +94 -0
  33. package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +4113 -0
  34. package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +2162 -0
  35. package/src/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +2638 -0
  36. package/src/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/cpu-feats.cpp +82 -0
  37. package/src/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +2731 -0
  38. package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +2068 -0
  39. package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp +396 -0
  40. package/src/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +1299 -0
  41. package/src/llama.cpp/ggml/src/ggml-cpu/arch/wasm/quants.c +1480 -0
  42. package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +4310 -0
  43. package/src/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-aarch64.cpp → arch/x86/repack.cpp} +59 -3206
  44. package/src/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +184 -0
  45. package/src/llama.cpp/ggml/src/ggml-cpu/common.h +1 -1
  46. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +12 -13
  47. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +64 -88
  48. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +8 -8
  49. package/src/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-hbm.cpp → hbm.cpp} +1 -1
  50. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +1 -1
  51. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +56 -7
  52. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.h +5 -0
  53. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +282 -100
  54. package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +1 -0
  55. package/src/llama.cpp/ggml/src/ggml-cpu/quants.c +1157 -0
  56. package/src/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-quants.h → quants.h} +26 -0
  57. package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +1570 -0
  58. package/src/llama.cpp/ggml/src/ggml-cpu/repack.h +98 -0
  59. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +119 -5
  60. package/src/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-traits.cpp → traits.cpp} +1 -1
  61. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +85 -16
  62. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +204 -49
  63. package/src/llama.cpp/include/llama.h +145 -40
  64. package/src/llama.cpp/src/CMakeLists.txt +5 -1
  65. package/src/llama.cpp/src/llama-arch.cpp +99 -3
  66. package/src/llama.cpp/src/llama-arch.h +10 -1
  67. package/src/llama.cpp/src/llama-batch.cpp +728 -272
  68. package/src/llama.cpp/src/llama-batch.h +112 -54
  69. package/src/llama.cpp/src/llama-chat.cpp +19 -2
  70. package/src/llama.cpp/src/llama-chat.h +1 -0
  71. package/src/llama.cpp/src/llama-context.cpp +525 -339
  72. package/src/llama.cpp/src/llama-context.h +38 -17
  73. package/src/llama.cpp/src/llama-cparams.cpp +4 -0
  74. package/src/llama.cpp/src/llama-cparams.h +2 -0
  75. package/src/llama.cpp/src/llama-grammar.cpp +12 -2
  76. package/src/llama.cpp/src/llama-graph.cpp +413 -353
  77. package/src/llama.cpp/src/llama-graph.h +112 -56
  78. package/src/llama.cpp/src/llama-hparams.cpp +10 -2
  79. package/src/llama.cpp/src/llama-hparams.h +13 -2
  80. package/src/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +279 -0
  81. package/src/llama.cpp/src/llama-kv-cache-unified-iswa.h +128 -0
  82. package/src/llama.cpp/src/llama-kv-cache-unified.cpp +1815 -0
  83. package/src/llama.cpp/src/llama-kv-cache-unified.h +303 -0
  84. package/src/llama.cpp/src/llama-kv-cells.h +415 -0
  85. package/src/llama.cpp/src/llama-memory-hybrid.cpp +246 -0
  86. package/src/llama.cpp/src/llama-memory-hybrid.h +138 -0
  87. package/src/llama.cpp/src/llama-memory-recurrent.cpp +1112 -0
  88. package/src/llama.cpp/src/llama-memory-recurrent.h +183 -0
  89. package/src/llama.cpp/src/llama-memory.cpp +41 -0
  90. package/src/llama.cpp/src/llama-memory.h +86 -5
  91. package/src/llama.cpp/src/llama-mmap.cpp +1 -1
  92. package/src/llama.cpp/src/llama-model-loader.cpp +42 -17
  93. package/src/llama.cpp/src/llama-model-saver.cpp +1 -0
  94. package/src/llama.cpp/src/llama-model.cpp +1137 -528
  95. package/src/llama.cpp/src/llama-model.h +4 -0
  96. package/src/llama.cpp/src/llama-quant.cpp +2 -1
  97. package/src/llama.cpp/src/llama-sampling.cpp +2 -2
  98. package/src/llama.cpp/src/llama-vocab.cpp +69 -32
  99. package/src/llama.cpp/src/llama-vocab.h +1 -0
  100. package/src/llama.cpp/src/llama.cpp +11 -7
  101. package/src/llama.cpp/src/unicode.cpp +5 -0
  102. package/src/tts_utils.h +1 -1
  103. package/src/llama.cpp/common/json.hpp +0 -24766
  104. package/src/llama.cpp/common/minja/chat-template.hpp +0 -541
  105. package/src/llama.cpp/common/minja/minja.hpp +0 -2974
  106. package/src/llama.cpp/common/stb_image.h +0 -7988
  107. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +0 -8
  108. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +0 -13326
  109. package/src/llama.cpp/src/llama-kv-cache.cpp +0 -2827
  110. package/src/llama.cpp/src/llama-kv-cache.h +0 -515
  111. /package/src/llama.cpp/ggml/src/ggml-cpu/{cpu-feats-x86.cpp → arch/x86/cpu-feats.cpp} +0 -0
  112. /package/src/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-hbm.h → hbm.h} +0 -0
  113. /package/src/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-traits.h → traits.h} +0 -0
package/src/llama.cpp/include/llama.h

@@ -61,7 +61,10 @@ extern "C" {
  struct llama_model;
  struct llama_context;
  struct llama_sampler;
- struct llama_kv_cache;
+
+ typedef struct llama_memory_i * llama_memory_t;
+
+ struct llama_kv_cache; // DEPRECATED (use llama_memory instead)

  typedef int32_t llama_pos;
  typedef int32_t llama_token;
@@ -240,18 +243,21 @@ extern "C" {

  typedef bool (*llama_progress_callback)(float progress, void * user_data);

- // Input data for llama_decode
+ // Input data for llama_encode/llama_decode
  // A llama_batch object can contain input about one or many sequences
  // The provided arrays (i.e. token, embd, pos, etc.) must have size of n_tokens
  //
  // - token : the token ids of the input (used when embd is NULL)
  // - embd : token embeddings (i.e. float vector of size n_embd) (used when token is NULL)
  // - pos : the positions of the respective token in the sequence
- // (if set to NULL, the token position will be tracked automatically by llama_decode)
+ // (if set to NULL, the token position will be tracked automatically by llama_encode/llama_decode)
  // - seq_id : the sequence to which the respective token belongs
  // (if set to NULL, the sequence ID will be assumed to be 0)
  // - logits : if zero, the logits (and/or the embeddings) for the respective token will not be output
- // (if set to NULL, only the logits for last token will be returned)
+ // (if set to NULL:
+ // - if embeddings: all tokens are output
+ // - if not: only the last token is output
+ // )
  //
  typedef struct llama_batch {
  int32_t n_tokens;
@@ -261,7 +267,7 @@ extern "C" {
  llama_pos * pos;
  int32_t * n_seq_id;
  llama_seq_id ** seq_id;
- int8_t * logits; // TODO: rename this to "output"
+ int8_t * logits; // TODO: rename this to "output"

  } llama_batch;

  enum llama_model_kv_override_type {
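The two hunks above only adjust the llama_batch documentation, but the logits/output semantics they describe are easy to get wrong on the caller side. A minimal sketch (not part of this package; `ctx` and the prompt tokens are assumptions) of filling a batch so that only the last token requests output:

    // Sketch: hand-filled llama_batch where only the final token produces logits.
    #include "llama.h"
    #include <vector>

    static int decode_prompt(llama_context * ctx, const std::vector<llama_token> & tokens) {
        llama_batch batch = llama_batch_init((int32_t) tokens.size(), /*embd =*/ 0, /*n_seq_max =*/ 1);

        for (size_t i = 0; i < tokens.size(); ++i) {
            batch.token   [i]    = tokens[i];
            batch.pos     [i]    = (llama_pos) i;  // explicit positions (pos == NULL would enable automatic tracking)
            batch.n_seq_id[i]    = 1;
            batch.seq_id  [i][0] = 0;              // every token belongs to sequence 0
            batch.logits  [i]    = i == tokens.size() - 1; // request output only for the last token
        }
        batch.n_tokens = (int32_t) tokens.size();

        const int ret = llama_decode(ctx, batch); // 0 on success, see the llama_decode hunk further down
        llama_batch_free(batch);
        return ret;
    }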
@@ -366,6 +372,8 @@ extern "C" {
  bool no_perf; // measure performance timings
  bool op_offload; // offload host tensor operations to device
  bool swa_full; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
+ // NOTE: setting to false when n_seq_max > 1 can cause bad performance in some cases
+ // ref: https://github.com/ggml-org/llama.cpp/pull/13845#issuecomment-2924800573
  };

  // model quantization parameters
@@ -471,6 +479,7 @@ extern "C" {
  LLAMA_API int64_t llama_time_us(void);

  LLAMA_API size_t llama_max_devices(void);
+ LLAMA_API size_t llama_max_parallel_sequences(void);

  LLAMA_API bool llama_supports_mmap (void);
  LLAMA_API bool llama_supports_mlock (void);
@@ -490,9 +499,11 @@ extern "C" {
  DEPRECATED(LLAMA_API int32_t llama_n_vocab (const struct llama_vocab * vocab), "use llama_vocab_n_tokens instead");

  LLAMA_API const struct llama_model * llama_get_model (const struct llama_context * ctx);
- LLAMA_API struct llama_kv_cache * llama_get_kv_self ( struct llama_context * ctx);
+ LLAMA_API llama_memory_t llama_get_memory (const struct llama_context * ctx);
  LLAMA_API enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx); // TODO: rename to llama_get_pooling_type

+ DEPRECATED(LLAMA_API struct llama_kv_cache * llama_get_kv_self(struct llama_context * ctx), "use llama_get_memory instead");
+
  LLAMA_API const struct llama_vocab * llama_model_get_vocab(const struct llama_model * model);
  LLAMA_API enum llama_rope_type llama_model_rope_type(const struct llama_model * model);

@@ -501,10 +512,18 @@ extern "C" {
  LLAMA_API int32_t llama_model_n_layer (const struct llama_model * model);
  LLAMA_API int32_t llama_model_n_head (const struct llama_model * model);
  LLAMA_API int32_t llama_model_n_head_kv (const struct llama_model * model);
+ LLAMA_API int32_t llama_model_n_swa (const struct llama_model * model);

  // Get the model's RoPE frequency scaling factor
  LLAMA_API float llama_model_rope_freq_scale_train(const struct llama_model * model);

+ // Returns the number of classifier outputs (only valid for classifier models)
+ // Undefined behavior for non-classifier models
+ LLAMA_API uint32_t llama_model_n_cls_out(const struct llama_model * model);
+
+ // Returns label of classifier output by index (<n_cls_out). Returns nullptr if no label provided
+ LLAMA_API const char * llama_model_cls_label(const struct llama_model * model, uint32_t i);
+
  LLAMA_API enum llama_vocab_type llama_vocab_type(const struct llama_vocab * vocab);

  LLAMA_API int32_t llama_vocab_n_tokens(const struct llama_vocab * vocab);
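The new classifier accessors above pair with llama_get_embeddings_seq() (see the LLAMA_POOLING_TYPE_RANK hunk further down). A hedged sketch of reading per-label classifier scores, assuming `model`/`ctx` hold a classifier model using rank pooling:

    // Sketch: enumerate classifier labels and their scores for sequence 0.
    #include "llama.h"
    #include <cstdio>

    static void print_cls_scores(const llama_model * model, llama_context * ctx) {
        const uint32_t n_cls = llama_model_n_cls_out(model);     // only valid for classifier models
        const float  * out   = llama_get_embeddings_seq(ctx, 0); // float[n_cls_out] when pooling == RANK

        if (out == NULL) {
            return; // e.g. pooling_type == LLAMA_POOLING_TYPE_NONE
        }
        for (uint32_t i = 0; i < n_cls; ++i) {
            const char * label = llama_model_cls_label(model, i); // may be nullptr if no label provided
            printf("%s: %.3f\n", label ? label : "(unnamed)", out[i]);
        }
    }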
@@ -605,106 +624,190 @@ extern "C" {
  int32_t il_end);

  //
- // KV cache
+ // Memory
+ //
+
+ // Clear the memory contents
+ // If data == true, the data buffers will also be cleared together with the metadata
+ LLAMA_API void llama_memory_clear(
+ llama_memory_t mem,
+ bool data);
+
+ // Removes all tokens that belong to the specified sequence and have positions in [p0, p1)
+ // Returns false if a partial sequence cannot be removed. Removing a whole sequence never fails
+ // seq_id < 0 : match any sequence
+ // p0 < 0 : [0, p1]
+ // p1 < 0 : [p0, inf)
+ LLAMA_API bool llama_memory_seq_rm(
+ llama_memory_t mem,
+ llama_seq_id seq_id,
+ llama_pos p0,
+ llama_pos p1);
+
+ // Copy all tokens that belong to the specified sequence to another sequence
+ // p0 < 0 : [0, p1]
+ // p1 < 0 : [p0, inf)
+ LLAMA_API void llama_memory_seq_cp(
+ llama_memory_t mem,
+ llama_seq_id seq_id_src,
+ llama_seq_id seq_id_dst,
+ llama_pos p0,
+ llama_pos p1);
+
+ // Removes all tokens that do not belong to the specified sequence
+ LLAMA_API void llama_memory_seq_keep(
+ llama_memory_t mem,
+ llama_seq_id seq_id);
+
+ // Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1)
+ // p0 < 0 : [0, p1]
+ // p1 < 0 : [p0, inf)
+ LLAMA_API void llama_memory_seq_add(
+ llama_memory_t mem,
+ llama_seq_id seq_id,
+ llama_pos p0,
+ llama_pos p1,
+ llama_pos delta);
+
+ // Integer division of the positions by factor of `d > 1`
+ // p0 < 0 : [0, p1]
+ // p1 < 0 : [p0, inf)
+ LLAMA_API void llama_memory_seq_div(
+ llama_memory_t mem,
+ llama_seq_id seq_id,
+ llama_pos p0,
+ llama_pos p1,
+ int d);
+
+ // Returns the smallest position present in the memory for the specified sequence
+ // This is typically non-zero only for SWA caches
+ // Note that all positions in the range [pos_min, pos_max] are guaranteed to be present in the memory
+ // Return -1 if the sequence is empty
+ LLAMA_API llama_pos llama_memory_seq_pos_min(
+ llama_memory_t mem,
+ llama_seq_id seq_id);
+
+ // Returns the largest position present in the memory for the specified sequence
+ // Note that all positions in the range [pos_min, pos_max] are guaranteed to be present in the memory
+ // Return -1 if the sequence is empty
+ LLAMA_API llama_pos llama_memory_seq_pos_max(
+ llama_memory_t mem,
+ llama_seq_id seq_id);
+
+ // Check if the memory supports shifting
+ LLAMA_API bool llama_memory_can_shift(llama_memory_t mem);
+
+ //
+ // KV cache for self-attention (TODO: deprecate in favor of llama_memory)
  //

  // Returns the number of tokens in the KV cache (slow, use only for debug)
  // If a KV cell has multiple sequences assigned to it, it will be counted multiple times
  DEPRECATED(LLAMA_API int32_t llama_kv_self_n_tokens(const struct llama_context * ctx),
- "Use llama_kv_self_seq_pos_max() instead");
+ "Use llama_kv_self_seq_pos_max() and llama_kv_self_seq_pos_min() instead (https://github.com/ggml-org/llama.cpp/issues/13793)");

  // Returns the number of used KV cells (i.e. have at least one sequence assigned to them)
  DEPRECATED(LLAMA_API int32_t llama_kv_self_used_cells(const struct llama_context * ctx),
- "Use llama_kv_self_seq_pos_max() instead");
+ "Use llama_kv_self_seq_pos_max() and llama_kv_self_seq_pos_min() instead (https://github.com/ggml-org/llama.cpp/issues/13793)");

  // Clear the KV cache - both cell info is erased and KV data is zeroed
- LLAMA_API void llama_kv_self_clear(
- struct llama_context * ctx);
+ DEPRECATED(LLAMA_API void llama_kv_self_clear(
+ struct llama_context * ctx),
+ "Use llama_memory_clear() instead");

  // Removes all tokens that belong to the specified sequence and have positions in [p0, p1)
  // Returns false if a partial sequence cannot be removed. Removing a whole sequence never fails
  // seq_id < 0 : match any sequence
  // p0 < 0 : [0, p1]
  // p1 < 0 : [p0, inf)
- LLAMA_API bool llama_kv_self_seq_rm(
+ DEPRECATED(LLAMA_API bool llama_kv_self_seq_rm(
  struct llama_context * ctx,
  llama_seq_id seq_id,
  llama_pos p0,
- llama_pos p1);
+ llama_pos p1),
+ "Use llama_memory_seq_rm() instead");

  // Copy all tokens that belong to the specified sequence to another sequence
  // Note that this does not allocate extra KV cache memory - it simply assigns the tokens to the new sequence
  // p0 < 0 : [0, p1]
  // p1 < 0 : [p0, inf)
- LLAMA_API void llama_kv_self_seq_cp(
+ DEPRECATED(LLAMA_API void llama_kv_self_seq_cp(
  struct llama_context * ctx,
  llama_seq_id seq_id_src,
  llama_seq_id seq_id_dst,
  llama_pos p0,
- llama_pos p1);
+ llama_pos p1),
+ "Use llama_memory_seq_cp() instead");

  // Removes all tokens that do not belong to the specified sequence
- LLAMA_API void llama_kv_self_seq_keep(
+ DEPRECATED(LLAMA_API void llama_kv_self_seq_keep(
  struct llama_context * ctx,
- llama_seq_id seq_id);
+ llama_seq_id seq_id),
+ "Use llama_memory_seq_keep() instead");

  // Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1)
  // If the KV cache is RoPEd, the KV data is updated accordingly:
  // - lazily on next llama_decode()
- // - explicitly with llama_kv_self_update()
  // p0 < 0 : [0, p1]
  // p1 < 0 : [p0, inf)
- LLAMA_API void llama_kv_self_seq_add(
+ DEPRECATED(LLAMA_API void llama_kv_self_seq_add(
  struct llama_context * ctx,
  llama_seq_id seq_id,
  llama_pos p0,
  llama_pos p1,
- llama_pos delta);
+ llama_pos delta),
+ "Use llama_memory_seq_add() instead");

  // Integer division of the positions by factor of `d > 1`
  // If the KV cache is RoPEd, the KV data is updated accordingly:
  // - lazily on next llama_decode()
- // - explicitly with llama_kv_self_update()
  // p0 < 0 : [0, p1]
  // p1 < 0 : [p0, inf)
- LLAMA_API void llama_kv_self_seq_div(
+ DEPRECATED(void llama_kv_self_seq_div(
  struct llama_context * ctx,
  llama_seq_id seq_id,
  llama_pos p0,
  llama_pos p1,
- int d);
+ int d),
+ "Use llama_memory_seq_div() instead");

  // Returns the smallest position present in the KV cache for the specified sequence
  // This is typically non-zero only for SWA caches
+ // Note that all positions in the range [pos_min, pos_max] are guaranteed to be present in the KV cache
  // Return -1 if the sequence is empty
- LLAMA_API llama_pos llama_kv_self_seq_pos_min(
+ DEPRECATED(LLAMA_API llama_pos llama_kv_self_seq_pos_min(
  struct llama_context * ctx,
- llama_seq_id seq_id);
+ llama_seq_id seq_id),
+ "Use llama_memory_seq_pos_min() instead");

  // Returns the largest position present in the KV cache for the specified sequence
+ // Note that all positions in the range [pos_min, pos_max] are guaranteed to be present in the KV cache
  // Return -1 if the sequence is empty
- LLAMA_API llama_pos llama_kv_self_seq_pos_max(
+ DEPRECATED(LLAMA_API llama_pos llama_kv_self_seq_pos_max(
  struct llama_context * ctx,
- llama_seq_id seq_id);
+ llama_seq_id seq_id),
+ "Use llama_memory_seq_pos_max() instead");

  // Defragment the KV cache
  // This will be applied:
  // - lazily on next llama_decode()
- // - explicitly with llama_kv_self_update()
- LLAMA_API void llama_kv_self_defrag(struct llama_context * ctx);
+ DEPRECATED(LLAMA_API void llama_kv_self_defrag(struct llama_context * ctx),
+ "simply remove this call, the context will automatically decide when to do a defragmentation based on 'defrag_thold'");

  // Check if the context supports KV cache shifting
- LLAMA_API bool llama_kv_self_can_shift(const struct llama_context * ctx);
+ DEPRECATED(LLAMA_API bool llama_kv_self_can_shift(const struct llama_context * ctx),
+ "use llama_memory_can_shift() instead");

  // Apply the KV cache updates (such as K-shifts, defragmentation, etc.)
- LLAMA_API void llama_kv_self_update(struct llama_context * ctx);
+ DEPRECATED(LLAMA_API void llama_kv_self_update(struct llama_context * ctx),
+ "simply remove this call, updates are applied lazily on the next llama_decode()");

  //
  // State / sessions
  //

  // Returns the *actual* size in bytes of the state
- // (logits, embedding and kv_cache)
+ // (logits, embedding and memory)
  // Only use when saving the state, not when restoring it, otherwise the size may be too small.
  LLAMA_API size_t llama_state_get_size(struct llama_context * ctx);
  LLAMA_API DEPRECATED(size_t llama_get_state_size(struct llama_context * ctx),
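The hunk above is the core API change in this release: the llama_kv_self_* family is deprecated in favor of handle-based llama_memory_* calls. A minimal migration sketch for a single-sequence caller (assumption: the surrounding code already owns a valid context):

    // Sketch: trimming a sequence's cached tokens, beta.5 style vs beta.7 style.
    #include "llama.h"

    static void trim_after(llama_context * ctx, llama_pos n_keep) {
        // old (now DEPRECATED):
        //   llama_kv_self_seq_rm(ctx, 0, n_keep, -1);

        // new: fetch the memory handle once, then operate on it
        llama_memory_t mem = llama_get_memory(ctx);

        llama_memory_seq_rm(mem, /*seq_id =*/ 0, /*p0 =*/ n_keep, /*p1 =*/ -1); // p1 < 0 => [p0, inf)

        // explicit llama_kv_self_update()/llama_kv_self_defrag() calls can simply be dropped:
        // per the deprecation notes above, updates and defragmentation now happen lazily on the next llama_decode()
    }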
@@ -760,12 +863,12 @@ extern "C" {
  size_t n_token_count),
  "use llama_state_save_file instead");

- // Get the exact size needed to copy the KV cache of a single sequence
+ // Get the exact size needed to copy the state of a single sequence
  LLAMA_API size_t llama_state_seq_get_size(
  struct llama_context * ctx,
  llama_seq_id seq_id);

- // Copy the KV cache of a single sequence into the specified buffer
+ // Copy the state of a single sequence into the specified buffer
  LLAMA_API size_t llama_state_seq_get_data(
  struct llama_context * ctx,
  uint8_t * dst,
@@ -831,16 +934,16 @@ extern "C" {
  // For encode-decoder contexts, processes the batch using the encoder.
  // Can store the encoder output internally for later use by the decoder's cross-attention layers.
  // 0 - success
- // < 0 - error. the KV cache state is restored to the state before this call
+ // < 0 - error. the memory state is restored to the state before this call
  LLAMA_API int32_t llama_encode(
  struct llama_context * ctx,
  struct llama_batch batch);

  // Process a batch of tokens.
- // Requires KV cache.
+ // Requires the context to have a memory.
  // For encode-decoder contexts, processes the batch using the decoder.
  // Positive return values does not mean a fatal error, but rather a warning.
- // Upon non-zero return values, the KV cache state is restored to the state before this call
+ // Upon non-zero return values, the memory state is restored to the state before this call
  // 0 - success
  // 1 - could not find a KV slot for the batch (try reducing the size of the batch or increase the context)
  // 2 - aborted
@@ -861,8 +964,8 @@ extern "C" {
  // Get the number of threads used for prompt and batch processing (multiple token).
  LLAMA_API int32_t llama_n_threads_batch(struct llama_context * ctx);

- // Set whether the model is in embeddings mode or not
- // If true, embeddings will be returned but logits will not
+ // Set whether the context outputs embeddings or not
+ // TODO: rename to avoid confusion with llama_get_embeddings()
  LLAMA_API void llama_set_embeddings(struct llama_context * ctx, bool embeddings);

  // Set whether to use causal attention or not
@@ -911,7 +1014,7 @@ extern "C" {

  // Get the embeddings for a sequence id
  // Returns NULL if pooling_type is LLAMA_POOLING_TYPE_NONE
- // when pooling_type == LLAMA_POOLING_TYPE_RANK, returns float[1] with the rank of the sequence
+ // when pooling_type == LLAMA_POOLING_TYPE_RANK, returns float[n_cls_out] with the rank(s) of the sequence
  // otherwise: float[n_embd] (1-dimensional)
  LLAMA_API float * llama_get_embeddings_seq(struct llama_context * ctx, llama_seq_id seq_id);

@@ -941,6 +1044,7 @@ extern "C" {

  LLAMA_API bool llama_vocab_get_add_bos(const struct llama_vocab * vocab);
  LLAMA_API bool llama_vocab_get_add_eos(const struct llama_vocab * vocab);
+ LLAMA_API bool llama_vocab_get_add_sep(const struct llama_vocab * vocab);

  LLAMA_API llama_token llama_vocab_fim_pre(const struct llama_vocab * vocab);
  LLAMA_API llama_token llama_vocab_fim_suf(const struct llama_vocab * vocab);
@@ -984,6 +1088,7 @@ extern "C" {
  /// @param tokens The tokens pointer must be large enough to hold the resulting tokens.
  /// @return Returns the number of tokens on success, no more than n_tokens_max
  /// @return Returns a negative number on failure - the number of tokens that would have been returned
+ /// @return Returns INT32_MIN on overflow (e.g., tokenization result size exceeds int32_t limit)
  /// @param add_special Allow to add BOS and EOS tokens if model is configured to do so.
  /// @param parse_special Allow tokenizing special and/or control tokens which otherwise are not exposed and treated
  /// as plaintext. Does not insert a leading space.
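The newly documented INT32_MIN return value above affects the usual two-pass tokenization pattern; a hedged sketch of the adjusted error handling (the helper name is illustrative, not part of the package):

    // Sketch: two-pass tokenization that distinguishes "buffer too small" from overflow.
    #include "llama.h"
    #include <cstdint>
    #include <string>
    #include <vector>

    static bool tokenize_text(const llama_vocab * vocab, const std::string & text,
                              std::vector<llama_token> & out) {
        // first pass: ask for the required size (negative return = -(tokens needed))
        int32_t n = llama_tokenize(vocab, text.c_str(), (int32_t) text.size(),
                                   NULL, 0, /*add_special =*/ true, /*parse_special =*/ true);
        if (n == INT32_MIN) {
            return false; // the result would not fit in int32_t
        }
        out.resize((size_t) -n);

        // second pass: fill the buffer
        n = llama_tokenize(vocab, text.c_str(), (int32_t) text.size(),
                           out.data(), (int32_t) out.size(), true, true);
        if (n < 0) {
            return false;
        }
        out.resize((size_t) n);
        return true;
    }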
package/src/llama.cpp/src/CMakeLists.txt

@@ -14,13 +14,17 @@ add_library(llama
  llama-batch.cpp
  llama-chat.cpp
  llama-context.cpp
+ llama-cparams.cpp
  llama-grammar.cpp
  llama-graph.cpp
  llama-hparams.cpp
  llama-impl.cpp
  llama-io.cpp
- llama-kv-cache.cpp
+ llama-kv-cache-unified.cpp
+ llama-kv-cache-unified-iswa.cpp
  llama-memory.cpp
+ llama-memory-hybrid.cpp
+ llama-memory-recurrent.cpp
  llama-mmap.cpp
  llama-model-loader.cpp
  llama-model-saver.cpp
package/src/llama.cpp/src/llama-arch.cpp

@@ -20,6 +20,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
  { LLM_ARCH_BERT, "bert" },
  { LLM_ARCH_NOMIC_BERT, "nomic-bert" },
  { LLM_ARCH_NOMIC_BERT_MOE, "nomic-bert-moe" },
+ { LLM_ARCH_NEO_BERT, "neo-bert" },
  { LLM_ARCH_JINA_BERT_V2, "jina-bert-v2" },
  { LLM_ARCH_BLOOM, "bloom" },
  { LLM_ARCH_STABLELM, "stablelm" },
@@ -72,6 +73,8 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
  { LLM_ARCH_WAVTOKENIZER_DEC, "wavtokenizer-dec" },
  { LLM_ARCH_PLM, "plm" },
  { LLM_ARCH_BAILINGMOE, "bailingmoe" },
+ { LLM_ARCH_DOTS1, "dots1" },
+ { LLM_ARCH_ARCEE, "arcee" },
  { LLM_ARCH_UNKNOWN, "(unknown)" },
  };

@@ -144,6 +147,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
  { LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
  { LLM_KV_ATTENTION_KEY_LENGTH_MLA, "%s.attention.key_length_mla" },
  { LLM_KV_ATTENTION_VALUE_LENGTH_MLA, "%s.attention.value_length_mla" },
+ { LLM_KV_ATTENTION_LAYER_INDICES, "%s.attention.layer_indices" },

  { LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
  { LLM_KV_ROPE_DIMENSION_SECTIONS, "%s.rope.dimension_sections" },
@@ -174,6 +178,8 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
  { LLM_KV_CONVNEXT_EMBEDDING_LENGTH, "%s.convnext.embedding_length" },
  { LLM_KV_CONVNEXT_BLOCK_COUNT, "%s.convnext.block_count" },

+ { LLM_KV_CLASSIFIER_OUTPUT_LABELS, "%s.classifier.output_labels" },
+
  { LLM_KV_TOKENIZER_MODEL, "tokenizer.ggml.model" },
  { LLM_KV_TOKENIZER_PRE, "tokenizer.ggml.pre" },
  { LLM_KV_TOKENIZER_LIST, "tokenizer.ggml.tokens" },
@@ -192,13 +198,13 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
  { LLM_KV_TOKENIZER_MASK_ID, "tokenizer.ggml.mask_token_id" },
  { LLM_KV_TOKENIZER_ADD_BOS, "tokenizer.ggml.add_bos_token" },
  { LLM_KV_TOKENIZER_ADD_EOS, "tokenizer.ggml.add_eos_token" },
+ { LLM_KV_TOKENIZER_ADD_SEP, "tokenizer.ggml.add_sep_token" },
  { LLM_KV_TOKENIZER_ADD_PREFIX, "tokenizer.ggml.add_space_prefix" },
  { LLM_KV_TOKENIZER_REMOVE_EXTRA_WS, "tokenizer.ggml.remove_extra_whitespaces" },
  { LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP, "tokenizer.ggml.precompiled_charsmap" },
  { LLM_KV_TOKENIZER_HF_JSON, "tokenizer.huggingface.json" },
  { LLM_KV_TOKENIZER_RWKV, "tokenizer.rwkv.world" },
  { LLM_KV_TOKENIZER_CHAT_TEMPLATE, "tokenizer.chat_template" },
- { LLM_KV_TOKENIZER_CHAT_TEMPLATE_N, "tokenizer.chat_template.%s" },
  { LLM_KV_TOKENIZER_FIM_PRE_ID, "tokenizer.ggml.fim_pre_token_id" },
  { LLM_KV_TOKENIZER_FIM_SUF_ID, "tokenizer.ggml.fim_suf_token_id" },
  { LLM_KV_TOKENIZER_FIM_MID_ID, "tokenizer.ggml.fim_mid_token_id" },
@@ -242,6 +248,24 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
  { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
  },
  },
+ {
+ LLM_ARCH_ARCEE,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_OUTPUT, "output" },
+ { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ },
+ },
  {
  LLM_ARCH_LLAMA4,
  {
@@ -448,6 +472,7 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
  { LLM_TENSOR_TOKEN_TYPES, "token_types" },
  { LLM_TENSOR_POS_EMBD, "position_embd" },
  { LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" },
+ { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
  { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
  { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
  { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
@@ -492,6 +517,21 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
  { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
  },
  },
+ {
+ LLM_ARCH_NEO_BERT,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ { LLM_TENSOR_ENC_OUTPUT_NORM, "enc.output_norm" },
+ { LLM_TENSOR_CLS, "cls" },
+ { LLM_TENSOR_CLS_OUT, "cls.output" },
+ },
+ },
  {
  LLM_ARCH_JINA_BERT_V2,
  {
@@ -1553,6 +1593,34 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
  { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
  },
  },
+ {
+ LLM_ARCH_DOTS1,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_OUTPUT, "output" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+ { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+ { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+ { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
+ { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+ { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+ { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
+ { LLM_TENSOR_FFN_GATE_INP_SHEXP, "blk.%d.ffn_gate_inp_shexp" },
+ { LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" },
+ { LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" },
+ { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
+ { LLM_TENSOR_FFN_EXP_PROBS_B, "blk.%d.exp_probs_b" },
+ }
+ },
  {
  LLM_ARCH_UNKNOWN,
  {
@@ -1704,8 +1772,14 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
  LLM_KV::LLM_KV(llm_arch arch, const char * suffix) : arch(arch), suffix(suffix) {}

  std::string LLM_KV::operator()(llm_kv kv) const {
- return suffix ? ::format(LLM_KV_NAMES.at(kv), LLM_ARCH_NAMES.at(arch), suffix)
- : ::format(LLM_KV_NAMES.at(kv), LLM_ARCH_NAMES.at(arch));
+ std::string name = ::format(LLM_KV_NAMES.at(kv), LLM_ARCH_NAMES.at(arch));
+
+ if (suffix != nullptr) {
+ name += ".";
+ name += suffix;
+ }
+
+ return name;
  }

  std::string LLM_TN_IMPL::str() const {
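A note on the LLM_KV::operator() hunk above: the removed LLM_KV_TOKENIZER_CHAT_TEMPLATE_N entry ("tokenizer.chat_template.%s") is no longer needed because any suffix is now appended generically after a dot. Illustrative use (the suffix value is an assumption, not from the diff):

    // before: a suffixed key needed its own "%s" variant in LLM_KV_NAMES
    // after:  the suffix is appended to the base key
    const LLM_KV kv(LLM_ARCH_LLAMA, /*suffix =*/ "rag");
    const std::string name = kv(LLM_KV_TOKENIZER_CHAT_TEMPLATE); // "tokenizer.chat_template.rag"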
@@ -1744,3 +1818,25 @@ llm_arch llm_arch_from_string(const std::string & name) {
  const llm_tensor_info & llm_tensor_info_for(llm_tensor tensor) {
  return LLM_TENSOR_INFOS.at(tensor);
  }
+
+ bool llm_arch_is_recurrent(const llm_arch & arch) {
+ switch (arch) {
+ case LLM_ARCH_MAMBA:
+ case LLM_ARCH_RWKV6:
+ case LLM_ARCH_RWKV6QWEN2:
+ case LLM_ARCH_RWKV7:
+ case LLM_ARCH_ARWKV7:
+ return true;
+ default:
+ return false;
+ }
+ }
+
+ bool llm_arch_is_hybrid(const llm_arch & arch) {
+ // TODO: There are currently no hybrid models! Once there are, this will be
+ // the place to identify them
+ switch (arch) {
+ default:
+ return false;
+ }
+ }
package/src/llama.cpp/src/llama-arch.h

@@ -24,6 +24,7 @@ enum llm_arch {
  LLM_ARCH_BERT,
  LLM_ARCH_NOMIC_BERT,
  LLM_ARCH_NOMIC_BERT_MOE,
+ LLM_ARCH_NEO_BERT,
  LLM_ARCH_JINA_BERT_V2,
  LLM_ARCH_BLOOM,
  LLM_ARCH_STABLELM,
@@ -76,6 +77,8 @@ enum llm_arch {
  LLM_ARCH_WAVTOKENIZER_DEC,
  LLM_ARCH_PLM,
  LLM_ARCH_BAILINGMOE,
+ LLM_ARCH_DOTS1,
+ LLM_ARCH_ARCEE,
  LLM_ARCH_UNKNOWN,
  };

@@ -148,6 +151,7 @@ enum llm_kv {
  LLM_KV_ATTENTION_SCALE,
  LLM_KV_ATTENTION_KEY_LENGTH_MLA,
  LLM_KV_ATTENTION_VALUE_LENGTH_MLA,
+ LLM_KV_ATTENTION_LAYER_INDICES,

  LLM_KV_ROPE_DIMENSION_COUNT,
  LLM_KV_ROPE_DIMENSION_SECTIONS,
@@ -190,13 +194,13 @@ enum llm_kv {
  LLM_KV_TOKENIZER_MASK_ID,
  LLM_KV_TOKENIZER_ADD_BOS,
  LLM_KV_TOKENIZER_ADD_EOS,
+ LLM_KV_TOKENIZER_ADD_SEP,
  LLM_KV_TOKENIZER_ADD_PREFIX,
  LLM_KV_TOKENIZER_REMOVE_EXTRA_WS,
  LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP,
  LLM_KV_TOKENIZER_HF_JSON,
  LLM_KV_TOKENIZER_RWKV,
  LLM_KV_TOKENIZER_CHAT_TEMPLATE,
- LLM_KV_TOKENIZER_CHAT_TEMPLATE_N,
  LLM_KV_TOKENIZER_FIM_PRE_ID,
  LLM_KV_TOKENIZER_FIM_SUF_ID,
  LLM_KV_TOKENIZER_FIM_MID_ID,
@@ -213,6 +217,8 @@ enum llm_kv {
  LLM_KV_CONVNEXT_EMBEDDING_LENGTH,
  LLM_KV_CONVNEXT_BLOCK_COUNT,

+ LLM_KV_CLASSIFIER_OUTPUT_LABELS,
+
  // deprecated:
  LLM_KV_TOKENIZER_PREFIX_ID,
  LLM_KV_TOKENIZER_SUFFIX_ID,
@@ -435,3 +441,6 @@ const char * llm_arch_name(llm_arch arch);
  llm_arch llm_arch_from_string(const std::string & name);

  const llm_tensor_info & llm_tensor_info_for(llm_tensor tensor);
+
+ bool llm_arch_is_recurrent(const llm_arch & arch);
+ bool llm_arch_is_hybrid (const llm_arch & arch);