@fugood/llama.node 0.3.13 → 0.3.15

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (184)
  1. package/bin/darwin/arm64/llama-node.node +0 -0
  2. package/bin/darwin/x64/llama-node.node +0 -0
  3. package/bin/linux/arm64/llama-node.node +0 -0
  4. package/bin/linux/x64/llama-node.node +0 -0
  5. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  6. package/bin/linux-cuda/x64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  9. package/bin/win32/arm64/llama-node.node +0 -0
  10. package/bin/win32/arm64/node.lib +0 -0
  11. package/bin/win32/x64/llama-node.node +0 -0
  12. package/bin/win32/x64/node.lib +0 -0
  13. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  14. package/bin/win32-vulkan/arm64/node.lib +0 -0
  15. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  16. package/bin/win32-vulkan/x64/node.lib +0 -0
  17. package/lib/binding.ts +1 -1
  18. package/package.json +1 -1
  19. package/src/LlamaContext.cpp +98 -76
  20. package/src/LlamaContext.h +1 -1
  21. package/src/common.hpp +1 -2
  22. package/src/llama.cpp/.github/workflows/build.yml +89 -10
  23. package/src/llama.cpp/.github/workflows/server.yml +2 -0
  24. package/src/llama.cpp/CMakeLists.txt +9 -1
  25. package/src/llama.cpp/cmake/common.cmake +2 -0
  26. package/src/llama.cpp/common/CMakeLists.txt +3 -3
  27. package/src/llama.cpp/common/arg.cpp +132 -13
  28. package/src/llama.cpp/common/chat.cpp +960 -266
  29. package/src/llama.cpp/common/chat.h +135 -0
  30. package/src/llama.cpp/common/common.cpp +33 -174
  31. package/src/llama.cpp/common/common.h +27 -67
  32. package/src/llama.cpp/common/json-schema-to-grammar.cpp +4 -5
  33. package/src/llama.cpp/common/json-schema-to-grammar.h +0 -1
  34. package/src/llama.cpp/common/{minja.hpp → minja/minja.hpp} +37 -5
  35. package/src/llama.cpp/common/ngram-cache.cpp +1 -0
  36. package/src/llama.cpp/common/sampling.cpp +45 -7
  37. package/src/llama.cpp/common/speculative.cpp +10 -9
  38. package/src/llama.cpp/common/speculative.h +1 -1
  39. package/src/llama.cpp/docs/build.md +45 -7
  40. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +2 -2
  41. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +4 -2
  42. package/src/llama.cpp/examples/embedding/embedding.cpp +2 -1
  43. package/src/llama.cpp/examples/export-lora/export-lora.cpp +4 -2
  44. package/src/llama.cpp/examples/gritlm/gritlm.cpp +2 -2
  45. package/src/llama.cpp/examples/imatrix/imatrix.cpp +3 -4
  46. package/src/llama.cpp/examples/infill/infill.cpp +2 -2
  47. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +2 -2
  48. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +5 -5
  49. package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
  50. package/src/llama.cpp/examples/llava/clip.cpp +373 -107
  51. package/src/llama.cpp/examples/llava/clip.h +19 -3
  52. package/src/llama.cpp/examples/llava/gemma3-cli.cpp +341 -0
  53. package/src/llama.cpp/examples/llava/llava.cpp +4 -2
  54. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +30 -11
  55. package/src/llama.cpp/examples/lookahead/lookahead.cpp +7 -6
  56. package/src/llama.cpp/examples/lookup/lookup.cpp +1 -1
  57. package/src/llama.cpp/examples/main/main.cpp +79 -34
  58. package/src/llama.cpp/examples/parallel/parallel.cpp +6 -5
  59. package/src/llama.cpp/examples/passkey/passkey.cpp +15 -14
  60. package/src/llama.cpp/examples/perplexity/perplexity.cpp +6 -6
  61. package/src/llama.cpp/examples/quantize/quantize.cpp +1 -0
  62. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +2 -2
  63. package/src/llama.cpp/examples/retrieval/retrieval.cpp +1 -1
  64. package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.cpp +882 -237
  65. package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.h +35 -26
  66. package/src/llama.cpp/examples/run/run.cpp +196 -108
  67. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +2 -2
  68. package/src/llama.cpp/examples/server/server.cpp +113 -101
  69. package/src/llama.cpp/examples/server/utils.hpp +94 -105
  70. package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +2 -2
  71. package/src/llama.cpp/examples/speculative/speculative.cpp +14 -14
  72. package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +1 -1
  73. package/src/llama.cpp/examples/sycl/run-llama2.sh +2 -2
  74. package/src/llama.cpp/examples/tts/tts.cpp +263 -151
  75. package/src/llama.cpp/ggml/CMakeLists.txt +14 -1
  76. package/src/llama.cpp/ggml/cmake/common.cmake +26 -0
  77. package/src/llama.cpp/ggml/include/ggml-alloc.h +1 -1
  78. package/src/llama.cpp/ggml/include/ggml-backend.h +3 -3
  79. package/src/llama.cpp/ggml/include/ggml-cpu.h +3 -0
  80. package/src/llama.cpp/ggml/include/ggml.h +29 -1
  81. package/src/llama.cpp/ggml/src/CMakeLists.txt +15 -34
  82. package/src/llama.cpp/ggml/src/ggml-alloc.c +24 -15
  83. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +1 -1
  84. package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +58 -54
  85. package/src/llama.cpp/ggml/src/ggml-backend.cpp +10 -8
  86. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +6 -2
  87. package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +3 -7
  88. package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +3 -5
  89. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +139 -16
  90. package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +2 -1
  91. package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +4 -0
  92. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +2 -1
  93. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +151 -0
  94. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +1546 -387
  95. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1645 -113
  96. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +22 -0
  97. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +259 -0
  98. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +61 -0
  99. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +288 -0
  100. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.h +17 -0
  101. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +15 -2
  102. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +2 -1
  103. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +3 -1
  104. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +14 -0
  105. package/src/llama.cpp/ggml/src/ggml-impl.h +1 -1
  106. package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +4 -5
  107. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +242 -0
  108. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +6 -6
  109. package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +1 -0
  110. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +315 -138
  111. package/src/llama.cpp/ggml/src/ggml-quants.c +114 -114
  112. package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +2 -1
  113. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +5 -0
  114. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +2 -1
  115. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +17 -0
  116. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +117 -36
  117. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +33 -4
  118. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +2 -2
  119. package/src/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +701 -0
  120. package/src/llama.cpp/ggml/src/ggml-sycl/cpy.hpp +11 -0
  121. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +55 -0
  122. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +147 -16
  123. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +40 -40
  124. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +307 -0
  125. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +23 -0
  126. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +262 -746
  127. package/src/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +0 -1
  128. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +75 -78
  129. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +114 -6
  130. package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +6 -0
  131. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +4 -1
  132. package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.cpp +13 -0
  133. package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.hpp +23 -0
  134. package/src/llama.cpp/ggml/src/ggml-sycl/wkv.cpp +305 -0
  135. package/src/llama.cpp/ggml/src/ggml-sycl/wkv.hpp +10 -0
  136. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +498 -188
  137. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +0 -4
  138. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +16 -3
  139. package/src/llama.cpp/ggml/src/ggml.c +93 -5
  140. package/src/llama.cpp/include/llama.h +105 -27
  141. package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.inp +112 -0
  142. package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.out +46 -0
  143. package/src/llama.cpp/requirements/requirements-all.txt +1 -0
  144. package/src/llama.cpp/requirements/requirements-tool_bench.txt +12 -0
  145. package/src/llama.cpp/requirements.txt +1 -0
  146. package/src/llama.cpp/src/CMakeLists.txt +5 -2
  147. package/src/llama.cpp/src/llama-adapter.cpp +19 -20
  148. package/src/llama.cpp/src/llama-adapter.h +11 -9
  149. package/src/llama.cpp/src/llama-arch.cpp +123 -16
  150. package/src/llama.cpp/src/llama-arch.h +19 -0
  151. package/src/llama.cpp/src/llama-batch.h +2 -2
  152. package/src/llama.cpp/src/llama-chat.cpp +1 -0
  153. package/src/llama.cpp/src/llama-context.cpp +2253 -1222
  154. package/src/llama.cpp/src/llama-context.h +214 -77
  155. package/src/llama.cpp/src/llama-cparams.h +1 -0
  156. package/src/llama.cpp/src/llama-grammar.cpp +182 -182
  157. package/src/llama.cpp/src/llama-grammar.h +12 -3
  158. package/src/llama.cpp/src/llama-graph.cpp +1662 -0
  159. package/src/llama.cpp/src/llama-graph.h +574 -0
  160. package/src/llama.cpp/src/llama-hparams.cpp +8 -0
  161. package/src/llama.cpp/src/llama-hparams.h +9 -0
  162. package/src/llama.cpp/src/llama-io.cpp +15 -0
  163. package/src/llama.cpp/src/llama-io.h +35 -0
  164. package/src/llama.cpp/src/llama-kv-cache.cpp +1006 -291
  165. package/src/llama.cpp/src/llama-kv-cache.h +178 -109
  166. package/src/llama.cpp/src/llama-memory.cpp +1 -0
  167. package/src/llama.cpp/src/llama-memory.h +21 -0
  168. package/src/llama.cpp/src/llama-mmap.cpp +11 -1
  169. package/src/llama.cpp/src/llama-model.cpp +8230 -122
  170. package/src/llama.cpp/src/llama-model.h +34 -1
  171. package/src/llama.cpp/src/llama-quant.cpp +10 -1
  172. package/src/llama.cpp/src/llama-sampling.cpp +43 -10
  173. package/src/llama.cpp/src/llama-vocab.cpp +12 -0
  174. package/src/llama.cpp/src/llama.cpp +51 -9837
  175. package/src/llama.cpp/tests/test-backend-ops.cpp +247 -112
  176. package/src/llama.cpp/tests/test-chat-template.cpp +32 -22
  177. package/src/llama.cpp/tests/test-chat.cpp +593 -395
  178. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +63 -63
  179. package/src/llama.cpp/tests/test-quantize-fns.cpp +1 -9
  180. package/src/llama.cpp/Sources/llama/llama.h +0 -4
  181. package/src/llama.cpp/common/chat.hpp +0 -55
  182. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +0 -143
  183. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +0 -9
  184. package/src/llama.cpp/common/{chat-template.hpp → minja/chat-template.hpp} +0 -0
package/src/llama.cpp/src/llama-kv-cache.h

@@ -1,12 +1,30 @@
 #pragma once
 
 #include "llama.h"
+#include "llama-io.h"
+#include "llama-memory.h"
 
 #include "ggml-cpp.h"
 
+#include <functional>
 #include <set>
 #include <vector>
 
+struct llama_cparams;
+struct llama_hparams;
+struct llama_ubatch;
+
+struct llama_kv_cache : public llama_memory_i {
+    using llama_memory_i::llama_memory_i;
+
+    virtual int32_t  get_n_tokens()   const = 0;
+    virtual uint32_t get_used_cells() const = 0; // TODO: remove, this is too-specific to the unified cache
+
+    virtual bool get_can_shift() const = 0;
+
+    bool get_can_edit() const override { return get_can_shift(); }
+};
+
 struct llama_kv_cell {
     llama_pos pos = -1;
     llama_pos delta = 0;
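Note: the hunk above replaces the old concrete llama_kv_cache struct with an abstract interface derived from llama_memory_i. A minimal sketch of how code can now query a cache purely through that interface, assuming only the virtuals declared above (the helper function below is illustrative and not part of the package):

    #include <cstdio>
    #include "llama-kv-cache.h"

    // illustrative helper, not part of the diff: inspects a cache through the
    // abstract llama_kv_cache interface introduced above
    static void log_kv_cache_state(const llama_kv_cache & kv) {
        const int32_t  n_tokens  = kv.get_n_tokens();   // tokens currently stored
        const uint32_t used      = kv.get_used_cells(); // cells holding at least one seq_id
        const bool     can_shift = kv.get_can_shift();  // whether positions can be shifted in place

        printf("kv cache: %d tokens, %u used cells, can_shift = %d\n",
               n_tokens, used, can_shift ? 1 : 0);
    }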
@@ -28,55 +46,6 @@ struct llama_kv_cell {
     }
 };
 
-// ring-buffer of cached KV data
-struct llama_kv_cache {
-    bool has_shift = false;
-    bool do_defrag = false;
-    bool recurrent = false; // with recurrent state models, a cell can hold the state for more than one past token
-    bool v_trans = true; // the value tensor is transposed
-    bool can_shift = false;
-
-    // Note: The value of head isn't only used to optimize searching
-    // for a free KV slot. llama_decode_impl also uses it, so it
-    // cannot be freely changed after a slot has been allocated.
-    uint32_t head = 0;
-    uint32_t size = 0;
-    uint32_t used = 0; // used cells (i.e. at least one seq_id)
-
-    // computed before each graph build
-    uint32_t n = 0;
-
-    ggml_type type_k = GGML_TYPE_F16;
-    ggml_type type_v = GGML_TYPE_F16;
-
-    std::vector<llama_kv_cell> cells;
-
-    std::vector<struct ggml_tensor *> k_l; // per layer
-    std::vector<struct ggml_tensor *> v_l;
-
-    std::vector<ggml_context_ptr> ctxs;
-    std::vector<ggml_backend_buffer_ptr> bufs;
-
-    size_t total_size() const {
-        size_t size = 0;
-        for (const auto & buf : bufs) {
-            size += ggml_backend_buffer_get_size(buf.get());
-        }
-
-        return size;
-    }
-
-    // TODO: better data structures to reduce the cost of this operation
-    llama_pos max_pos() const {
-        llama_pos max_pos = -1;
-        for (const auto & cell : cells) {
-            max_pos = std::max(max_pos, cell.pos);
-        }
-
-        return max_pos;
-    }
-};
-
 // a structure holds information about the slot found in llama_kv_cache_find_slot
 struct llama_kv_cache_slot_info {
     std::pair<uint32_t, uint32_t> boundaries; // slot boundaries [begin, end)
@@ -88,82 +57,131 @@ struct llama_kv_cache_slot_info
     operator bool() const { return found; }
 };
 
-// TODO: maybe not needed
-uint32_t llama_kv_cache_get_padding(const struct llama_cparams & cparams);
-
-bool llama_kv_cache_init(
-        struct llama_kv_cache & cache,
-        const llama_model & model,
+// ring-buffer of cached KV data
+// TODO: pimpl
+// TODO: add notion of max sequences
+class llama_kv_cache_unified : public llama_kv_cache {
+public:
+    // can be used to query data from the model if needed
+    struct callbacks {
+        std::function<ggml_tensor * (uint32_t n_ctx_per_seq, int il)> get_rope_factors;
+    };
+
+    llama_kv_cache_unified(
+            const llama_hparams & hparams,
+            callbacks cbs);
+
+    virtual ~llama_kv_cache_unified() = default;
+
+    // TODO: become constructor
+    bool init(
+            const llama_model & model, // TODO: do not reference the model
         const llama_cparams & cparams,
         ggml_type type_k,
         ggml_type type_v,
         uint32_t kv_size,
         bool offload);
 
-// find an empty slot of size "n_tokens" in the cache
-// updates the cache head
-// returns a structure holding information about the slot found
-// Note: On success, it's important that cache.head points
-// to the first cell of the slot.
-struct llama_kv_cache_slot_info llama_kv_cache_find_slot(
-        struct llama_kv_cache & cache,
-        const struct llama_ubatch & batch);
+    int32_t  get_n_tokens()   const override;
+    uint32_t get_used_cells() const override;
 
-// find how many cells are currently in use
-uint32_t llama_kv_cache_cell_max(const struct llama_kv_cache & cache);
+    size_t total_size() const;
 
-void llama_kv_cache_clear(struct llama_kv_cache & cache);
+    // TODO: better data structures to reduce the cost of this operation
+    llama_pos pos_max() const;
 
-bool llama_kv_cache_seq_rm(
-        struct llama_kv_cache & cache,
-        llama_seq_id seq_id,
-        llama_pos p0,
-        llama_pos p1);
+    void clear() override;
+    void defrag() override;
 
-void llama_kv_cache_seq_cp(
-        struct llama_kv_cache & cache,
-        llama_seq_id seq_id_src,
-        llama_seq_id seq_id_dst,
-        llama_pos p0,
-        llama_pos p1);
+    bool seq_rm  (llama_seq_id seq_id, llama_pos p0, llama_pos p1) override;
+    void seq_cp  (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) override;
+    void seq_keep(llama_seq_id seq_id) override;
+    void seq_add (llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) override;
+    void seq_div (llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) override;
 
-void llama_kv_cache_seq_keep(
-        struct llama_kv_cache & cache,
-        llama_seq_id seq_id);
+    llama_pos seq_pos_max(llama_seq_id seq_id) override;
 
-void llama_kv_cache_seq_add(
-        struct llama_kv_cache & cache,
-        llama_seq_id seq_id,
-        llama_pos p0,
-        llama_pos p1,
-        llama_pos delta);
+    bool get_can_shift() const override;
 
-void llama_kv_cache_seq_div(
-        struct llama_kv_cache & cache,
-        llama_seq_id seq_id,
-        llama_pos p0,
-        llama_pos p1,
-        int d);
+    // find an empty slot of size "n_tokens" in the cache
+    // updates the cache head
+    // returns a structure holding information about the slot found
+    // Note: On success, it's important that cache.head points
+    // to the first cell of the slot.
+    llama_kv_cache_slot_info find_slot(const llama_ubatch & batch);
 
-llama_pos llama_kv_cache_seq_pos_max(
-        struct llama_kv_cache & cache,
-        llama_seq_id seq_id);
+    // TODO: maybe not needed
+    uint32_t get_padding(const llama_cparams & cparams) const;
 
-void llama_kv_cache_defrag(struct llama_kv_cache & cache);
+    // find how many cells are currently in use
+    uint32_t cell_max() const;
 
-int32_t llama_get_kv_cache_token_count(const struct llama_kv_cache & kv);
+    size_t size_k_bytes() const;
+    size_t size_v_bytes() const;
 
-int32_t llama_get_kv_cache_used_cells(const struct llama_kv_cache & kv);
+    // defrag
 
-bool llama_kv_cache_can_shift(const struct llama_kv_cache & kv);
+    struct {
+        std::vector<uint32_t> ids;
+    } defrag_info;
 
-//
-// kv cache view
-//
+    // return true if cells have been moved
+    bool defrag_prepare(int32_t n_max_nodes);
+
+    // state save/load
+
+    void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const;
+    void state_read (llama_io_read_i  & io, llama_seq_id seq_id = -1);
 
-struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_kv_cache & kv, int32_t n_seq_max);
+    // members
 
-void llama_kv_cache_view_update(struct llama_kv_cache_view * view, const struct llama_kv_cache & kv);
+    const llama_hparams & hparams;
+
+    callbacks cbs;
+
+    bool has_shift = false;
+    bool do_defrag = false;
+
+    // TODO: remove this and implement llama_kv_cache_recurrent instead
+    bool recurrent = false; // with recurrent state models, a cell can hold the state for more than one past token
+
+    bool v_trans = true; // the value tensor is transposed
+    bool can_shift = false;
+
+    // Note: The value of head isn't only used to optimize searching
+    // for a free KV slot. llama_decode_impl also uses it, so it
+    // cannot be freely changed after a slot has been allocated.
+    uint32_t head = 0;
+    uint32_t size = 0;
+    uint32_t used = 0; // used cells (i.e. at least one seq_id)
+
+    // computed before each graph build
+    uint32_t n = 0;
+
+    std::vector<llama_kv_cell> cells;
+
+    std::vector<ggml_tensor *> k_l; // per layer
+    std::vector<ggml_tensor *> v_l;
+
+private:
+    ggml_type type_k = GGML_TYPE_F16;
+    ggml_type type_v = GGML_TYPE_F16;
+
+    std::vector<ggml_context_ptr> ctxs;
+    std::vector<ggml_backend_buffer_ptr> bufs;
+
+    void state_write_meta(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges, llama_seq_id seq_id = -1) const;
+    void state_write_data(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges) const;
+
+    bool state_read_meta(llama_io_read_i & io, uint32_t cell_count, llama_seq_id dest_seq_id = -1);
+    bool state_read_data(llama_io_read_i & io, uint32_t cell_count);
+};
+
+// TODO: temporary reusing llama_kv_cache_unified -- implement recurrent cache and simplify llama_kv_cache_unified
+//class llama_kv_cache_recurrent : public llama_kv_cache_unified {
+//public:
+//    using llama_kv_cache_unified::llama_kv_cache_unified;
+//};
 
 //
 // kv cache restore
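Note: the hunk above folds the previous free functions (llama_kv_cache_clear, llama_kv_cache_seq_rm, llama_kv_cache_defrag, ...) into member functions of the new llama_kv_cache_unified class. A sketch of the resulting call-site migration, assuming only the member signatures declared above (the surrounding helper is illustrative, not taken from the package):

    #include "llama-kv-cache.h"

    // illustrative context-shift helper: drop positions [0, n_discard) of a
    // sequence and slide the remaining ones back, using the new member API
    static void shift_context(llama_kv_cache_unified & kv, llama_seq_id seq, llama_pos n_discard) {
        // before this change: llama_kv_cache_seq_rm(cache, seq, 0, n_discard);
        kv.seq_rm(seq, 0, n_discard);

        // before this change: llama_kv_cache_seq_add(cache, seq, n_discard, -1, -n_discard);
        kv.seq_add(seq, n_discard, -1, -n_discard);

        // before this change: llama_kv_cache_defrag(cache);
        kv.defrag();
    }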
@@ -183,13 +201,15 @@ struct llama_kv_slot_restorer {
 
     bool do_restore = false;
 
-    explicit llama_kv_slot_restorer(const struct llama_kv_cache & cache) {
+    llama_kv_cache_unified & cache;
+
+    explicit llama_kv_slot_restorer(llama_kv_cache_unified & cache) : cache(cache) {
         old_state.head = cache.head;
         old_state.n = cache.n;
     }
 
     // saves a slot information for future restoration
-    void save(const struct llama_kv_cache_slot_info & slot) {
+    void save(const llama_kv_cache_slot_info & slot) {
         if (slot) {
             do_restore = true;
             if (slot.boundaries.first != slot.boundaries.second) {
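Note: with this change llama_kv_slot_restorer stores a reference to the cache it guards, so restore() no longer takes an argument. A usage sketch under that assumption (batch preparation and error handling omitted; decode_ok stands in for the real decode result):

    #include "llama-kv-cache.h"

    // illustrative decode path showing the updated restorer API
    static void decode_with_rollback(llama_kv_cache_unified & kv, const llama_ubatch & ubatch, bool decode_ok) {
        llama_kv_slot_restorer restorer(kv);     // captures kv by reference, snapshots head/n

        const auto slot = kv.find_slot(ubatch);  // may advance kv.head
        restorer.save(slot);                     // remember the slot boundaries

        if (!decode_ok) {
            restorer.restore();                  // rolls back through the stored reference
        }
    }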
@@ -200,19 +220,68 @@
 
     // must be explicitly called to restore the kv_cache state
     // and rollback changes from all llama_kv_cache_find_slot calls
-    void restore(struct llama_kv_cache & cache) {
+    void restore() {
         if (do_restore) {
             cache.head = old_state.head;
             cache.n = old_state.n;
 
             if (cache.recurrent) { // recurrent models like Mamba or RWKV can't have a state partially erased
-                llama_kv_cache_seq_rm(cache, -1, -1, -1);
+                cache.seq_rm(-1, -1, -1);
             } else {
                 for (auto & slot : slot_boundaries) {
-                    llama_kv_cache_seq_rm(cache, -1, slot.first, slot.second);
+                    cache.seq_rm(-1, slot.first, slot.second);
                 }
             }
         }
     }
 };
 
+// TODO: maybe become part of the public llama_kv_cache in the future
+int32_t llama_kv_cache_n_tokens(const llama_kv_cache * kv);
+
+int32_t llama_kv_cache_used_cells(const llama_kv_cache * kv);
+
+void llama_kv_cache_clear(llama_kv_cache * kv);
+
+bool llama_kv_cache_seq_rm(
+        llama_kv_cache * kv,
+        llama_seq_id seq_id,
+        llama_pos p0,
+        llama_pos p1);
+
+void llama_kv_cache_seq_cp(
+        llama_kv_cache * kv,
+        llama_seq_id seq_id_src,
+        llama_seq_id seq_id_dst,
+        llama_pos p0,
+        llama_pos p1);
+
+void llama_kv_cache_seq_keep(llama_kv_cache * kv, llama_seq_id seq_id);
+
+void llama_kv_cache_seq_add(
+        llama_kv_cache * kv,
+        llama_seq_id seq_id,
+        llama_pos p0,
+        llama_pos p1,
+        llama_pos delta);
+
+void llama_kv_cache_seq_div(
+        llama_kv_cache * kv,
+        llama_seq_id seq_id,
+        llama_pos p0,
+        llama_pos p1,
+        int d);
+
+llama_pos llama_kv_cache_seq_pos_max(llama_kv_cache * kv, llama_seq_id seq_id);
+
+void llama_kv_cache_defrag(llama_kv_cache * kv);
+
+bool llama_kv_cache_can_shift(const llama_kv_cache * kv);
+
+//
+// kv cache view
+//
+
+llama_kv_cache_view llama_kv_cache_view_init(const llama_kv_cache & kv, int32_t n_seq_max);
+
+void llama_kv_cache_view_update(llama_kv_cache_view * view, const llama_kv_cache * kv);
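Note: the header now only declares these C-style wrappers over the polymorphic interface; their definitions live in llama-kv-cache.cpp, which is not part of this excerpt. A plausible sketch of the forwarding they imply (the null checks and fallback values are an assumption, not taken from the package):

    #include "llama-kv-cache.h"

    // plausible forwarding implementations (illustrative; not the package's code)
    int32_t llama_kv_cache_n_tokens(const llama_kv_cache * kv) {
        return kv ? kv->get_n_tokens() : 0;
    }

    void llama_kv_cache_clear(llama_kv_cache * kv) {
        if (kv) {
            kv->clear();
        }
    }

    bool llama_kv_cache_seq_rm(llama_kv_cache * kv, llama_seq_id seq_id, llama_pos p0, llama_pos p1) {
        return kv ? kv->seq_rm(seq_id, p0, p1) : true;
    }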
package/src/llama.cpp/src/llama-memory.cpp (new file)

@@ -0,0 +1 @@
+#include "llama-memory.h"
package/src/llama.cpp/src/llama-memory.h (new file)

@@ -0,0 +1,21 @@
+#pragma once
+
+#include "llama.h"
+
+// general concept of LLM memory
+// the KV cache is a type of LLM memory, but there can be other types
+class llama_memory_i {
+public:
+    virtual void clear()  = 0;
+    virtual void defrag() = 0;
+
+    virtual bool seq_rm  (llama_seq_id seq_id, llama_pos p0, llama_pos p1) = 0;
+    virtual void seq_cp  (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) = 0;
+    virtual void seq_keep(llama_seq_id seq_id) = 0;
+    virtual void seq_add (llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) = 0;
+    virtual void seq_div (llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) = 0;
+
+    virtual llama_pos seq_pos_max(llama_seq_id seq_id) = 0;
+
+    virtual bool get_can_edit() const = 0;
+};
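Note: llama-memory.h introduces llama_memory_i as the common contract that the llama_kv_cache interface above implements. A toy implementation, only to make the contract concrete; it is illustrative, stores nothing, and accepts every operation:

    #include "llama-memory.h"

    // toy no-op memory: keeps no state (illustrative only, not part of the package)
    class null_memory : public llama_memory_i {
    public:
        void clear()  override {}
        void defrag() override {}

        bool seq_rm  (llama_seq_id, llama_pos, llama_pos)               override { return true; }
        void seq_cp  (llama_seq_id, llama_seq_id, llama_pos, llama_pos) override {}
        void seq_keep(llama_seq_id)                                     override {}
        void seq_add (llama_seq_id, llama_pos, llama_pos, llama_pos)    override {}
        void seq_div (llama_seq_id, llama_pos, llama_pos, int)          override {}

        llama_pos seq_pos_max(llama_seq_id) override { return -1; }

        bool get_can_edit() const override { return false; }
    };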
package/src/llama.cpp/src/llama-mmap.cpp

@@ -8,6 +8,7 @@
 #include <climits>
 #include <stdexcept>
 #include <cerrno>
+#include <algorithm>
 
 #ifdef __has_include
 #if __has_include(<unistd.h>)
@@ -34,6 +35,10 @@
 #include <io.h>
 #endif
 
+#if defined(__APPLE__)
+#include <TargetConditionals.h>
+#endif
+
 // TODO: consider moving to llama-impl.h if needed in more places
 #if defined(_WIN32)
 static std::string llama_format_win_err(DWORD err) {
@@ -471,7 +476,11 @@ struct llama_mlock::impl
 
         char* errmsg = std::strerror(errno);
         bool suggest = (errno == ENOMEM);
-
+#if defined(TARGET_OS_VISION) || defined(TARGET_OS_TV)
+        // visionOS/tvOS dont't support RLIMIT_MEMLOCK
+        // Skip resource limit checks on visionOS/tvOS
+        suggest = false;
+#else
         struct rlimit lock_limit;
         if (suggest && getrlimit(RLIMIT_MEMLOCK, &lock_limit)) {
             suggest = false;
@@ -479,6 +488,7 @@ struct llama_mlock::impl
         if (suggest && (lock_limit.rlim_max > lock_limit.rlim_cur + size)) {
             suggest = false;
         }
+#endif
 
         LLAMA_LOG_WARN("warning: failed to mlock %zu-byte buffer (after previously locking %zu bytes): %s\n%s",
                 size, this->size, errmsg, suggest ? MLOCK_SUGGESTION : "");
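Note: the last two hunks wrap the RLIMIT_MEMLOCK probe so the "raise your memlock limit" hint is skipped on visionOS/tvOS, which lack that resource limit. A standalone sketch of the same decision shape, simplified from the code inside llama_mlock::impl (the function name and parameter are illustrative, not from the package):

    #if defined(__APPLE__)
    #include <TargetConditionals.h>
    #endif

    #include <sys/resource.h>
    #include <cstddef>

    // should we hint the user to raise RLIMIT_MEMLOCK after a failed mlock?
    static bool suggest_raising_memlock_limit(size_t requested) {
    #if defined(TARGET_OS_VISION) || defined(TARGET_OS_TV)
        // visionOS/tvOS have no RLIMIT_MEMLOCK, so the hint never applies
        (void) requested;
        return false;
    #else
        struct rlimit lock_limit;
        if (getrlimit(RLIMIT_MEMLOCK, &lock_limit) != 0) {
            return false; // could not query the limit, stay silent
        }
        // mirror the condition in the diff: drop the hint when the hard limit
        // already leaves headroom above the current soft limit plus the request
        return lock_limit.rlim_max <= lock_limit.rlim_cur + requested;
    #endif
    }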