cui-llama.rn 1.7.3 → 1.7.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (276) hide show
  1. package/README.md +217 -17
  2. package/android/src/main/CMakeLists.txt +34 -15
  3. package/android/src/main/java/com/rnllama/LlamaContext.java +94 -8
  4. package/android/src/main/java/com/rnllama/RNLlama.java +247 -0
  5. package/android/src/main/jni.cpp +213 -14
  6. package/android/src/main/jniLibs/arm64-v8a/librnllama.so +0 -0
  7. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8.so +0 -0
  8. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2.so +0 -0
  9. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod.so +0 -0
  10. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod_i8mm.so +0 -0
  11. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_i8mm.so +0 -0
  12. package/android/src/main/jniLibs/x86_64/librnllama.so +0 -0
  13. package/android/src/main/jniLibs/x86_64/librnllama_x86_64.so +0 -0
  14. package/android/src/newarch/java/com/rnllama/RNLlamaModule.java +35 -0
  15. package/android/src/oldarch/java/com/rnllama/RNLlamaModule.java +34 -0
  16. package/cpp/README.md +1 -1
  17. package/cpp/chat-parser.cpp +385 -0
  18. package/cpp/chat-parser.h +120 -0
  19. package/cpp/chat.cpp +726 -596
  20. package/cpp/chat.h +71 -6
  21. package/cpp/common.cpp +56 -38
  22. package/cpp/common.h +9 -3
  23. package/cpp/ggml-backend-reg.cpp +5 -0
  24. package/cpp/ggml-backend.cpp +10 -2
  25. package/cpp/ggml-common.h +4 -0
  26. package/cpp/ggml-cpu/amx/amx.cpp +1 -1
  27. package/cpp/ggml-cpu/amx/mmq.cpp +11 -10
  28. package/cpp/ggml-cpu/arch/arm/cpu-feats.cpp +94 -0
  29. package/cpp/ggml-cpu/arch/arm/quants.c +4114 -0
  30. package/cpp/ggml-cpu/arch/arm/repack.cpp +2163 -0
  31. package/cpp/ggml-cpu/arch/x86/cpu-feats.cpp +327 -0
  32. package/cpp/ggml-cpu/arch/x86/quants.c +4311 -0
  33. package/cpp/ggml-cpu/{ggml-cpu-aarch64.cpp → arch/x86/repack.cpp} +79 -3225
  34. package/cpp/ggml-cpu/arch-fallback.h +184 -0
  35. package/cpp/ggml-cpu/common.h +4 -3
  36. package/cpp/ggml-cpu/ggml-cpu-impl.h +21 -16
  37. package/cpp/ggml-cpu/ggml-cpu.c +123 -104
  38. package/cpp/ggml-cpu/ggml-cpu.cpp +11 -8
  39. package/cpp/ggml-cpu/ops.cpp +330 -148
  40. package/cpp/ggml-cpu/ops.h +1 -0
  41. package/cpp/ggml-cpu/quants.c +1158 -0
  42. package/cpp/ggml-cpu/{ggml-cpu-quants.h → quants.h} +26 -0
  43. package/cpp/ggml-cpu/repack.cpp +1571 -0
  44. package/cpp/ggml-cpu/repack.h +98 -0
  45. package/cpp/ggml-cpu/simd-mappings.h +330 -38
  46. package/cpp/ggml-cpu/{ggml-cpu-traits.cpp → traits.cpp} +1 -1
  47. package/cpp/ggml-cpu/vec.cpp +87 -18
  48. package/cpp/ggml-cpu/vec.h +249 -94
  49. package/cpp/ggml-cpu.h +1 -0
  50. package/cpp/ggml-impl.h +63 -183
  51. package/cpp/ggml-llama-sim.metallib +0 -0
  52. package/cpp/ggml-llama.metallib +0 -0
  53. package/cpp/ggml-metal.m +152 -45
  54. package/cpp/ggml-quants.c +0 -2
  55. package/cpp/ggml.c +61 -21
  56. package/cpp/ggml.h +22 -3
  57. package/cpp/gguf.cpp +24 -3
  58. package/cpp/json-partial.cpp +256 -0
  59. package/cpp/json-partial.h +38 -0
  60. package/cpp/json-schema-to-grammar.cpp +5 -47
  61. package/cpp/json-schema-to-grammar.h +4 -4
  62. package/cpp/llama-arch.cpp +153 -3
  63. package/cpp/llama-arch.h +27 -1
  64. package/cpp/llama-batch.cpp +741 -272
  65. package/cpp/llama-batch.h +112 -54
  66. package/cpp/llama-chat.cpp +30 -8
  67. package/cpp/llama-chat.h +1 -0
  68. package/cpp/llama-context.cpp +524 -339
  69. package/cpp/llama-context.h +38 -17
  70. package/cpp/llama-cparams.cpp +4 -0
  71. package/cpp/llama-cparams.h +2 -0
  72. package/cpp/llama-grammar.cpp +12 -2
  73. package/cpp/llama-graph.cpp +431 -356
  74. package/cpp/llama-graph.h +126 -58
  75. package/cpp/llama-hparams.cpp +10 -2
  76. package/cpp/llama-hparams.h +19 -2
  77. package/cpp/llama-kv-cache-unified-iswa.cpp +279 -0
  78. package/cpp/llama-kv-cache-unified-iswa.h +128 -0
  79. package/cpp/llama-kv-cache-unified.cpp +1841 -0
  80. package/cpp/llama-kv-cache-unified.h +303 -0
  81. package/cpp/llama-kv-cells.h +439 -0
  82. package/cpp/llama-memory-hybrid.cpp +246 -0
  83. package/cpp/llama-memory-hybrid.h +138 -0
  84. package/cpp/llama-memory-recurrent.cpp +1112 -0
  85. package/cpp/llama-memory-recurrent.h +183 -0
  86. package/cpp/llama-memory.cpp +41 -0
  87. package/cpp/llama-memory.h +86 -5
  88. package/cpp/llama-mmap.cpp +1 -1
  89. package/cpp/llama-model-loader.cpp +42 -17
  90. package/cpp/llama-model-saver.cpp +1 -0
  91. package/cpp/llama-model.cpp +1639 -513
  92. package/cpp/llama-model.h +26 -0
  93. package/cpp/llama-sampling.cpp +2 -2
  94. package/cpp/llama-vocab.cpp +65 -28
  95. package/cpp/llama-vocab.h +1 -0
  96. package/cpp/llama.cpp +11 -7
  97. package/cpp/llama.h +150 -42
  98. package/cpp/minja/chat-template.hpp +1 -1
  99. package/cpp/minja/minja.hpp +1 -1
  100. package/cpp/{json.hpp → nlohmann/json.hpp} +3027 -2267
  101. package/cpp/nlohmann/json_fwd.hpp +187 -0
  102. package/cpp/regex-partial.cpp +204 -0
  103. package/cpp/regex-partial.h +56 -0
  104. package/cpp/rn-llama.cpp +646 -35
  105. package/cpp/rn-llama.h +32 -1
  106. package/cpp/rn-tts.h +39 -0
  107. package/cpp/sampling.cpp +7 -8
  108. package/cpp/tools/mtmd/clip-impl.h +5 -0
  109. package/cpp/tools/mtmd/clip.cpp +572 -436
  110. package/cpp/tools/mtmd/clip.h +14 -4
  111. package/cpp/tools/mtmd/mtmd-audio.cpp +0 -86
  112. package/cpp/tools/mtmd/mtmd-audio.h +2 -17
  113. package/cpp/tools/mtmd/mtmd-helper.cpp +175 -12
  114. package/cpp/tools/mtmd/mtmd-helper.h +91 -0
  115. package/cpp/tools/mtmd/mtmd.cpp +368 -248
  116. package/cpp/tools/mtmd/mtmd.h +6 -70
  117. package/cpp/unicode.cpp +5 -0
  118. package/ios/CMakeLists.txt +26 -6
  119. package/ios/RNLlama.h +1 -1
  120. package/ios/RNLlama.mm +153 -3
  121. package/ios/RNLlamaContext.h +9 -1
  122. package/ios/RNLlamaContext.mm +112 -9
  123. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/chat-parser.h +120 -0
  124. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/chat.h +71 -6
  125. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/common.h +9 -3
  126. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-common.h +4 -0
  127. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu.h +1 -0
  128. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-impl.h +63 -183
  129. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml.h +22 -3
  130. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/json-partial.h +38 -0
  131. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/json-schema-to-grammar.h +4 -4
  132. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-arch.h +27 -1
  133. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-batch.h +112 -54
  134. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-chat.h +1 -0
  135. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-context.h +38 -17
  136. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-cparams.h +2 -0
  137. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-graph.h +126 -58
  138. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-hparams.h +19 -2
  139. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-kv-cache-unified-iswa.h +128 -0
  140. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-kv-cache-unified.h +303 -0
  141. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-kv-cells.h +439 -0
  142. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-memory-hybrid.h +138 -0
  143. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-memory-recurrent.h +183 -0
  144. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-memory.h +86 -5
  145. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-model.h +26 -0
  146. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-vocab.h +1 -0
  147. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama.h +150 -42
  148. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/minja/chat-template.hpp +1 -1
  149. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/minja/minja.hpp +1 -1
  150. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/{json.hpp → nlohmann/json.hpp} +3027 -2267
  151. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/nlohmann/json_fwd.hpp +187 -0
  152. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/regex-partial.h +56 -0
  153. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/rn-llama.h +32 -1
  154. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/rn-tts.h +39 -0
  155. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/ggml-llama.metallib +0 -0
  156. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/rnllama +0 -0
  157. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/chat-parser.h +120 -0
  158. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/chat.h +71 -6
  159. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/common.h +9 -3
  160. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-common.h +4 -0
  161. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu.h +1 -0
  162. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-impl.h +63 -183
  163. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml.h +22 -3
  164. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/json-partial.h +38 -0
  165. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/json-schema-to-grammar.h +4 -4
  166. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-arch.h +27 -1
  167. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-batch.h +112 -54
  168. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-chat.h +1 -0
  169. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-context.h +38 -17
  170. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-cparams.h +2 -0
  171. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-graph.h +126 -58
  172. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-hparams.h +19 -2
  173. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache-unified-iswa.h +128 -0
  174. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache-unified.h +303 -0
  175. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cells.h +439 -0
  176. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory-hybrid.h +138 -0
  177. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory-recurrent.h +183 -0
  178. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory.h +86 -5
  179. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model.h +26 -0
  180. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-vocab.h +1 -0
  181. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama.h +150 -42
  182. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/minja/chat-template.hpp +1 -1
  183. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/minja/minja.hpp +1 -1
  184. package/ios/rnllama.xcframework/{tvos-arm64/rnllama.framework/Headers → ios-arm64_x86_64-simulator/rnllama.framework/Headers/nlohmann}/json.hpp +3027 -2267
  185. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/nlohmann/json_fwd.hpp +187 -0
  186. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/regex-partial.h +56 -0
  187. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/rn-llama.h +32 -1
  188. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/rn-tts.h +39 -0
  189. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/ggml-llama-sim.metallib +0 -0
  190. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/rnllama +0 -0
  191. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/chat-parser.h +120 -0
  192. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/chat.h +71 -6
  193. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/common.h +9 -3
  194. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-common.h +4 -0
  195. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu.h +1 -0
  196. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-impl.h +63 -183
  197. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml.h +22 -3
  198. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/json-partial.h +38 -0
  199. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/json-schema-to-grammar.h +4 -4
  200. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-arch.h +27 -1
  201. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-batch.h +112 -54
  202. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-chat.h +1 -0
  203. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-context.h +38 -17
  204. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-cparams.h +2 -0
  205. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-graph.h +126 -58
  206. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-hparams.h +19 -2
  207. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-kv-cache-unified-iswa.h +128 -0
  208. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-kv-cache-unified.h +303 -0
  209. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-kv-cells.h +439 -0
  210. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-memory-hybrid.h +138 -0
  211. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-memory-recurrent.h +183 -0
  212. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-memory.h +86 -5
  213. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-model.h +26 -0
  214. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-vocab.h +1 -0
  215. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama.h +150 -42
  216. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/minja/chat-template.hpp +1 -1
  217. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/minja/minja.hpp +1 -1
  218. package/ios/rnllama.xcframework/{ios-arm64_x86_64-simulator/rnllama.framework/Headers → tvos-arm64/rnllama.framework/Headers/nlohmann}/json.hpp +3027 -2267
  219. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/nlohmann/json_fwd.hpp +187 -0
  220. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/regex-partial.h +56 -0
  221. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/rn-llama.h +32 -1
  222. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/rn-tts.h +39 -0
  223. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/ggml-llama.metallib +0 -0
  224. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/rnllama +0 -0
  225. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/chat-parser.h +120 -0
  226. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/chat.h +71 -6
  227. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/common.h +9 -3
  228. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-common.h +4 -0
  229. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu.h +1 -0
  230. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-impl.h +63 -183
  231. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml.h +22 -3
  232. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/json-partial.h +38 -0
  233. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/json-schema-to-grammar.h +4 -4
  234. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-arch.h +27 -1
  235. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-batch.h +112 -54
  236. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-chat.h +1 -0
  237. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-context.h +38 -17
  238. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-cparams.h +2 -0
  239. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-graph.h +126 -58
  240. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-hparams.h +19 -2
  241. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache-unified-iswa.h +128 -0
  242. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache-unified.h +303 -0
  243. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cells.h +439 -0
  244. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory-hybrid.h +138 -0
  245. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory-recurrent.h +183 -0
  246. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory.h +86 -5
  247. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model.h +26 -0
  248. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-vocab.h +1 -0
  249. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama.h +150 -42
  250. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/minja/chat-template.hpp +1 -1
  251. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/minja/minja.hpp +1 -1
  252. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/nlohmann/json.hpp +25526 -0
  253. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/nlohmann/json_fwd.hpp +187 -0
  254. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/regex-partial.h +56 -0
  255. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/rn-llama.h +32 -1
  256. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/rn-tts.h +39 -0
  257. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/ggml-llama-sim.metallib +0 -0
  258. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/rnllama +0 -0
  259. package/jest/mock.js +24 -0
  260. package/package.json +1 -1
  261. package/src/NativeRNLlama.ts +46 -2
  262. package/src/index.ts +105 -1
  263. package/cpp/ggml-cpu/ggml-cpu-aarch64.h +0 -8
  264. package/cpp/ggml-cpu/ggml-cpu-quants.c +0 -13326
  265. package/cpp/ggml-cpu/sgemm.cpp +0 -3544
  266. package/cpp/ggml-cpu/sgemm.h +0 -14
  267. package/cpp/llama-kv-cache.cpp +0 -2827
  268. package/cpp/llama-kv-cache.h +0 -515
  269. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-kv-cache.h +0 -515
  270. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache.h +0 -515
  271. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-kv-cache.h +0 -515
  272. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/json.hpp +0 -24766
  273. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache.h +0 -515
  274. /package/cpp/ggml-cpu/{ggml-cpu-traits.h → traits.h} +0 -0
  275. /package/cpp/tools/mtmd/{miniaudio.h → miniaudio/miniaudio.h} +0 -0
  276. /package/cpp/tools/mtmd/{stb_image.h → stb/stb_image.h} +0 -0
@@ -0,0 +1,439 @@
1
+ #pragma once
2
+
3
+ #include "llama.h"
4
+ #include "llama-cparams.h"
5
+
6
+ #include <bitset>
7
+ #include <cassert>
8
+ #include <vector>
9
+ #include <set>
10
+ #include <map>
11
+
12
+ // meta information about KV cells that can be part of multiple sequences at the same time
13
+ // TODO: add unit tests
14
+ class llama_kv_cells_unified {
15
+ public:
16
+ void reset() {
17
+ for (uint32_t i = 0; i < pos.size(); ++i) {
18
+ pos[i] = -1;
19
+ shift[i] = 0;
20
+ seq[i].reset();
21
+ }
22
+
23
+ has_shift = false;
24
+
25
+ used.clear();
26
+
27
+ for (uint32_t s = 0; s < LLAMA_MAX_SEQ; ++s) {
28
+ seq_pos[s].clear();
29
+ }
30
+ }
31
+
32
+ void reset_shift() {
33
+ has_shift = false;
34
+
35
+ for (uint32_t i = 0; i < shift.size(); ++i) {
36
+ shift[i] = 0;
37
+ }
38
+ }
39
+
40
+ uint32_t size() const {
41
+ return pos.size();
42
+ }
43
+
44
+ void resize(uint32_t n) {
45
+ pos.resize(n);
46
+ shift.resize(n);
47
+ seq.resize(n);
48
+
49
+ reset();
50
+ }
51
+
52
+ bool is_empty(uint32_t i) const {
53
+ assert(i < pos.size());
54
+ assert((pos[i] < 0 && pos[i] == -1) || pos[i] >= 0);
55
+
56
+ return pos[i] == -1;
57
+ }
58
+
59
+ uint32_t get_used() const {
60
+ return used.size();
61
+ }
62
+
63
+ // the index of the first cell that is used
64
+ // return 0 if no cells are used
65
+ uint32_t used_min() const {
66
+ return used.empty() ? 0 : *used.begin();
67
+ }
68
+
69
+ // the index of the last cell that is used + 1
70
+ // return 0 if no cells are used
71
+ uint32_t used_max_p1() const {
72
+ return used.empty() ? 0 : *used.rbegin() + 1;
73
+ }
74
+
75
+ bool get_has_shift() const {
76
+ return has_shift;
77
+ }
78
+
79
+ // move cell isrc to idst (used during defrag)
80
+ void mv(uint32_t isrc, uint32_t idst) {
81
+ assert(isrc < pos.size());
82
+ assert(idst < pos.size());
83
+
84
+ assert(pos[idst] == -1);
85
+ assert(pos[isrc] != -1);
86
+
87
+ pos [idst] = pos [isrc];
88
+ shift[idst] = shift[isrc];
89
+ seq [idst] = seq [isrc];
90
+
91
+ pos [isrc] = -1;
92
+ shift[isrc] = 0;
93
+ seq [isrc].reset();
94
+
95
+ used.erase (isrc);
96
+ used.insert(idst);
97
+ }
98
+
99
+ // copy the state of cells [i, i + n) (used for save/restore the state of the cells)
100
+ llama_kv_cells_unified cp(uint32_t i, uint32_t n) const {
101
+ assert(i + n <= pos.size());
102
+
103
+ llama_kv_cells_unified res;
104
+
105
+ res.resize(n);
106
+
107
+ for (uint32_t j = 0; j < n; ++j) {
108
+ res.pos[j] = pos[i + j];
109
+ res.seq[j] = seq[i + j];
110
+
111
+ assert(shift[i + j] == 0);
112
+ }
113
+
114
+ return res;
115
+ }
116
+
117
+ // set the state of cells [i, i + other.pos.size()) (used for save/restore the state of the cells)
118
+ void set(uint32_t i, const llama_kv_cells_unified & other) {
119
+ assert(i + other.pos.size() <= pos.size());
120
+
121
+ for (uint32_t j = 0; j < other.pos.size(); ++j) {
122
+ if (pos[i + j] == -1 && other.pos[j] != -1) {
123
+ used.insert(i + j);
124
+ }
125
+
126
+ if (pos[i + j] != -1 && other.pos[j] == -1) {
127
+ used.erase(i + j);
128
+ }
129
+
130
+ if (pos[i + j] != -1) {
131
+ seq_pos_rm(i + j);
132
+ }
133
+
134
+ pos[i + j] = other.pos[j];
135
+ seq[i + j] = other.seq[j];
136
+
137
+ if (pos[i + j] != -1) {
138
+ seq_pos_add(i + j);
139
+ }
140
+
141
+ assert(shift[i + j] == 0);
142
+ }
143
+ }
144
+
145
+ // clear a non-empty cell
146
+ void rm(uint32_t i) {
147
+ assert(i < pos.size());
148
+ assert(pos[i] != -1);
149
+
150
+ seq_pos_rm(i);
151
+ seq[i].reset();
152
+
153
+ pos[i] = -1;
154
+ shift[i] = 0;
155
+
156
+ used.erase(i);
157
+ }
158
+
159
+ // note: call only if the cell has seq_id
160
+ // return true if the cell becomes empty
161
+ bool seq_rm(uint32_t i, llama_seq_id seq_id) {
162
+ assert(i < pos.size());
163
+ assert(seq[i].test(seq_id));
164
+ assert(pos[i] != -1);
165
+ assert(seq_id >= 0);
166
+
167
+ seq[i].reset(seq_id);
168
+ seq_pos_dec(seq_id, pos[i]);
169
+
170
+ if (seq[i].none()) {
171
+ pos[i] = -1;
172
+ shift[i] = 0;
173
+
174
+ used.erase(i);
175
+
176
+ return true;
177
+ }
178
+
179
+ return false;
180
+ }
181
+
182
+ // return true if the cell becomes empty (i.e. it did not contain seq_id before the call)
183
+ bool seq_keep(uint32_t i, llama_seq_id seq_id) {
184
+ assert(i < pos.size());
185
+
186
+ if (seq[i].test(seq_id)) {
187
+ seq_pos_rm(i);
188
+ seq[i].reset();
189
+
190
+ seq[i].set(seq_id);
191
+ seq_pos_inc(seq_id, pos[i]);
192
+
193
+ return false;
194
+ }
195
+
196
+ if (seq[i].any()) {
197
+ seq_pos_rm(i);
198
+ seq[i].reset();
199
+
200
+ pos[i] = -1;
201
+ shift[i] = 0;
202
+
203
+ used.erase(i);
204
+
205
+ return true;
206
+ }
207
+
208
+ assert(pos[i] == -1);
209
+
210
+ return false;
211
+ }
212
+
213
+ // number of different sequences in the cell
214
+ int seq_count(uint32_t i) const {
215
+ assert(i < pos.size());
216
+ assert(pos[i] != -1);
217
+
218
+ return seq[i].count();
219
+ }
220
+
221
+ // check if the cell contains seq_id
222
+ bool seq_has(uint32_t i, llama_seq_id seq_id) const {
223
+ assert(i < pos.size());
224
+ assert(seq_id >= 0);
225
+
226
+ return seq[i].test(seq_id);
227
+ }
228
+
229
+ // note: call only if the cell is not empty and the seq_id is not in the cell
230
+ void seq_add(uint32_t i, llama_seq_id seq_id) {
231
+ assert(i < pos.size());
232
+ assert(pos[i] != -1);
233
+ assert(!seq[i].test(seq_id));
234
+
235
+ seq[i].set(seq_id);
236
+ seq_pos_inc(seq_id, pos[i]);
237
+ }
238
+
239
+ // return the sequence id of this cell
240
+ // note: call only for cells with exactly one sequence
241
+ llama_seq_id seq_get(uint32_t i) const {
242
+ assert(seq[i].count() == 1);
243
+
244
+ for (int s = 0; s < LLAMA_MAX_SEQ; ++s) {
245
+ if (seq[i].test(s)) {
246
+ return s;
247
+ }
248
+ }
249
+
250
+ return -1;
251
+ }
252
+
253
+ // the minimum position of sequence seq_id currently present in any of the cells
254
+ // return -1 if the sequence is not present
255
+ llama_pos seq_pos_min(llama_seq_id seq_id) const {
256
+ assert(seq_id >= 0);
257
+ assert(seq_id < LLAMA_MAX_SEQ);
258
+
259
+ if (seq_pos[seq_id].empty()) {
260
+ return -1;
261
+ }
262
+
263
+ assert(seq_pos[seq_id].begin()->second > 0);
264
+
265
+ return seq_pos[seq_id].begin()->first;
266
+ }
267
+
268
+ // the maximum position of sequence seq_id currently present in any of the cells
269
+ // return -1 if the sequence is not present
270
+ llama_pos seq_pos_max(llama_seq_id seq_id) const {
271
+ assert(seq_id >= 0);
272
+ assert(seq_id < LLAMA_MAX_SEQ);
273
+
274
+ if (seq_pos[seq_id].empty()) {
275
+ return -1;
276
+ }
277
+
278
+ assert(seq_pos[seq_id].rbegin()->second > 0);
279
+
280
+ return seq_pos[seq_id].rbegin()->first;
281
+ }
282
+
283
+ // note: call only if the cell is not empty
284
+ llama_pos pos_get(uint32_t i) const {
285
+ assert(i < pos.size());
286
+ assert(pos[i] != -1);
287
+
288
+ return pos[i];
289
+ }
290
+
291
+ // note: call only if the cell is not empty
292
+ llama_pos get_shift(uint32_t i) const {
293
+ assert(i < pos.size());
294
+ assert(pos[i] != -1);
295
+
296
+ return shift[i];
297
+ }
298
+
299
+ // check if a cell is not empty and its position is within [p0, p1)
300
+ bool pos_in(uint32_t i, llama_pos p0, llama_pos p1) const {
301
+ assert(i < pos.size());
302
+
303
+ return pos[i] >= p0 && pos[i] < p1;
304
+ }
305
+
306
+ // set the position of an empty cell
307
+ // does not modify "has_shift"
308
+ // note: call only if the cell is empty
309
+ void pos_set(uint32_t i, llama_pos p) {
310
+ assert(i < pos.size());
311
+ assert(pos[i] == -1);
312
+ assert(seq[i].none());
313
+
314
+ pos[i] = p;
315
+
316
+ used.insert(i);
317
+ }
318
+
319
+ // pos[i] = pos[i] + d
320
+ // sets "has_shift" to true
321
+ // note: call only if the cell is not empty
322
+ bool pos_add(uint32_t i, llama_pos d) {
323
+ assert(i < pos.size());
324
+ assert(pos[i] != -1);
325
+
326
+ seq_pos_rm(i);
327
+
328
+ pos[i] += d;
329
+ shift[i] += d;
330
+
331
+ has_shift = true;
332
+
333
+ if (pos[i] < 0) {
334
+ seq[i].reset();
335
+ pos[i] = -1;
336
+ shift[i] = 0;
337
+
338
+ used.erase(i);
339
+
340
+ return true;
341
+ }
342
+
343
+ seq_pos_add(i);
344
+
345
+ return false;
346
+ }
347
+
348
+ // pos[i] = pos[i] / d
349
+ // sets "has_shift" to true
350
+ // note: call only if the cell is not empty
351
+ void pos_div(uint32_t i, int d) {
352
+ assert(i < pos.size());
353
+ assert(pos[i] != -1);
354
+
355
+ const llama_pos p_old = pos[i];
356
+
357
+ seq_pos_rm(i);
358
+
359
+ pos[i] /= d;
360
+ shift[i] += p_old - pos[i];
361
+
362
+ seq_pos_add(i);
363
+
364
+ has_shift = true;
365
+ }
366
+
367
+ private:
368
+ bool has_shift = false;
369
+
370
+ // set of indices of used cells (i.e. pos[i] != -1, allowed to not have any seq_id)
371
+ std::set<uint32_t> used;
372
+
373
+ std::vector<llama_pos> pos;
374
+
375
+ // this array accumulates any applied shifts to the pos array since the last reset_shift() call
376
+ // this is used to queue multiple updates to the pos array, which in the end can be applied in one go:
377
+ //
378
+ // cells.pos_add(x, shift_x);
379
+ // cells.pos_div(y, shift_y);
380
+ // ...
381
+ //
382
+ // if (cells.has_shift()) {
383
+ // for (int i = 0; i < n; ++i) {
384
+ // auto shift_i = cells.get_shift(i);
385
+ // ...
386
+ // }
387
+ // cells.reset_shift();
388
+ // }
389
+ //
390
+ std::vector<llama_pos> shift;
391
+
392
+ using seq_set_t = std::bitset<LLAMA_MAX_SEQ>;
393
+
394
+ // the bitset seq[i] tells us which sequences are currently occupying the i-th cell
395
+ std::vector<seq_set_t> seq;
396
+
397
+ // the set seq_pos[s][p] tells us how many times the position p is currently present for sequence s
398
+ // if the position p is not present, seq_pos[s][p] is not set
399
+ // this way seq_pos[s].begin() and seq_pos[s].rbegin() give us the min/max positions currently in the cache
400
+ //
401
+ // note that we cannot a use an std::set because in some cases a position can occur more than once for the same seq:
402
+ // - during performing a cache reuse via (rm + add)
403
+ // - some vision models have input embeddings with repeating positions
404
+ //
405
+ std::map<llama_pos, int> seq_pos[LLAMA_MAX_SEQ];
406
+
407
+ // helper functions for updating `seq_pos`, once cell at a time:
408
+
409
+ void seq_pos_dec(llama_seq_id s, llama_pos p) {
410
+ auto it = seq_pos[s].find(p);
411
+ assert(it != seq_pos[s].end());
412
+
413
+ if (--it->second == 0) {
414
+ seq_pos[s].erase(it);
415
+ }
416
+ }
417
+
418
+ void seq_pos_inc(llama_seq_id s, llama_pos p) {
419
+ seq_pos[s][p]++;
420
+ }
421
+
422
+ // remove cell i
423
+ void seq_pos_rm(uint32_t i) {
424
+ for (int s = 0; s < LLAMA_MAX_SEQ; ++s) {
425
+ if (seq[i].test(s)) {
426
+ seq_pos_dec(s, pos[i]);
427
+ }
428
+ }
429
+ }
430
+
431
+ // add cell i
432
+ void seq_pos_add(uint32_t i) {
433
+ for (int s = 0; s < LLAMA_MAX_SEQ; ++s) {
434
+ if (seq[i].test(s)) {
435
+ seq_pos_inc(s, pos[i]);
436
+ }
437
+ }
438
+ }
439
+ };
@@ -0,0 +1,246 @@
1
+ #include "llama-memory-hybrid.h"
2
+
3
+ #include "llama-impl.h"
4
+ #include "llama-model.h"
5
+ #include "llama-context.h"
6
+
7
+ //
8
+ // llama_memory_hybrid
9
+ //
10
+
11
+ llama_memory_hybrid::llama_memory_hybrid(
12
+ const llama_model & model,
13
+ /* attn */
14
+ lm_ggml_type type_k,
15
+ lm_ggml_type type_v,
16
+ bool v_trans,
17
+ uint32_t kv_size,
18
+ uint32_t n_pad,
19
+ uint32_t n_swa,
20
+ llama_swa_type swa_type,
21
+ /* recurrent */
22
+ lm_ggml_type type_r,
23
+ lm_ggml_type type_s,
24
+ uint32_t rs_size,
25
+ /* common */
26
+ uint32_t n_seq_max,
27
+ bool offload,
28
+ /* layer filters */
29
+ layer_filter_cb && filter_attn,
30
+ layer_filter_cb && filter_recr) :
31
+ hparams(model.hparams),
32
+ mem_attn(new llama_kv_cache_unified(
33
+ model,
34
+ filter_attn == nullptr ?
35
+ [&](int32_t il) { return !hparams.is_recurrent(il); }
36
+ : filter_attn,
37
+ type_k,
38
+ type_v,
39
+ v_trans,
40
+ offload,
41
+ kv_size,
42
+ n_seq_max,
43
+ n_pad,
44
+ n_swa,
45
+ swa_type
46
+ )),
47
+ mem_recr(new llama_memory_recurrent(
48
+ model,
49
+ filter_recr == nullptr ?
50
+ [&](int32_t il) { return hparams.is_recurrent(il); }
51
+ : filter_recr,
52
+ type_r,
53
+ type_s,
54
+ offload,
55
+ rs_size,
56
+ n_seq_max
57
+ )) {}
58
+
59
+ llama_memory_context_ptr llama_memory_hybrid::init_batch(llama_batch_allocr & balloc, uint32_t n_ubatch, bool embd_all) {
60
+ do {
61
+ balloc.split_reset();
62
+
63
+ // follow the recurrent pattern for creating the ubatch splits
64
+ std::vector<llama_ubatch> ubatches;
65
+
66
+ while (true) {
67
+ llama_ubatch ubatch;
68
+
69
+ if (embd_all) {
70
+ // if all tokens are output, split by sequence
71
+ ubatch = balloc.split_seq(n_ubatch);
72
+ } else {
73
+ ubatch = balloc.split_equal(n_ubatch);
74
+ }
75
+
76
+ if (ubatch.n_tokens == 0) {
77
+ break;
78
+ }
79
+
80
+ ubatches.push_back(std::move(ubatch)); // NOLINT
81
+ }
82
+
83
+ // prepare the recurrent batches first
84
+ if (!mem_recr->prepare(ubatches)) {
85
+ // TODO: will the recurrent cache be in an undefined context at this point?
86
+ LLAMA_LOG_ERROR("%s: failed to prepare recurrent ubatches\n", __func__);
87
+ return std::make_unique<llama_memory_hybrid_context>(LLAMA_MEMORY_STATUS_FAILED_PREPARE);
88
+ }
89
+
90
+ // prepare the attention cache
91
+ auto heads_attn = mem_attn->prepare(ubatches);
92
+ if (heads_attn.empty()) {
93
+ LLAMA_LOG_ERROR("%s: failed to prepare attention ubatches\n", __func__);
94
+ return std::make_unique<llama_memory_hybrid_context>(LLAMA_MEMORY_STATUS_FAILED_PREPARE);
95
+ }
96
+
97
+ return std::make_unique<llama_memory_hybrid_context>(
98
+ this, std::move(heads_attn), std::move(ubatches));
99
+ } while(false);
100
+
101
+ return std::make_unique<llama_memory_hybrid_context>(LLAMA_MEMORY_STATUS_FAILED_PREPARE);
102
+ }
103
+
104
+ llama_memory_context_ptr llama_memory_hybrid::init_full() {
105
+ return std::make_unique<llama_memory_hybrid_context>(this);
106
+ }
107
+
108
+ llama_memory_context_ptr llama_memory_hybrid::init_update(llama_context * lctx, bool optimize) {
109
+ return std::make_unique<llama_memory_hybrid_context>(this, lctx, optimize);
110
+ }
111
+
112
+ bool llama_memory_hybrid::get_can_shift() const {
113
+ // Shifting is trivially supported for recurrent
114
+ return mem_attn->get_can_shift();
115
+ }
116
+
117
+ void llama_memory_hybrid::clear(bool data) {
118
+ mem_attn->clear(data);
119
+ mem_recr->clear(data);
120
+ }
121
+
122
+ bool llama_memory_hybrid::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1) {
123
+ // Try removing from the recurrent cache first since it may fail. If it does
124
+ // fail, the cache will not have been mutated.
125
+ if (!mem_recr->seq_rm(seq_id, p0, p1)) {
126
+ return false;
127
+ }
128
+ return mem_attn->seq_rm(seq_id, p0, p1);
129
+ }
130
+
131
+ void llama_memory_hybrid::seq_cp(llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) {
132
+ mem_attn->seq_cp(seq_id_src, seq_id_dst, p0, p1);
133
+ mem_recr->seq_cp(seq_id_src, seq_id_dst, p0, p1);
134
+ }
135
+
136
+ void llama_memory_hybrid::seq_keep(llama_seq_id seq_id) {
137
+ mem_attn->seq_keep(seq_id);
138
+ mem_recr->seq_keep(seq_id);
139
+ }
140
+
141
+ void llama_memory_hybrid::seq_add(llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos shift) {
142
+ mem_attn->seq_add(seq_id, p0, p1, shift);
143
+ mem_recr->seq_add(seq_id, p0, p1, shift);
144
+ }
145
+
146
+ void llama_memory_hybrid::seq_div(llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) {
147
+ mem_attn->seq_div(seq_id, p0, p1, d);
148
+ mem_recr->seq_div(seq_id, p0, p1, d);
149
+ }
150
+
151
+ llama_pos llama_memory_hybrid::seq_pos_min(llama_seq_id seq_id) const {
152
+ // the min of the total cache is the max of the two caches' min values
153
+ return std::max(mem_attn->seq_pos_min(seq_id), mem_recr->seq_pos_min(seq_id));
154
+ }
155
+
156
+ llama_pos llama_memory_hybrid::seq_pos_max(llama_seq_id seq_id) const {
157
+ // the max of the total cache is the min of the two caches' max values
158
+ return std::min(mem_attn->seq_pos_max(seq_id), mem_recr->seq_pos_max(seq_id));
159
+ }
160
+
161
+ void llama_memory_hybrid::state_write(llama_io_write_i & io, llama_seq_id seq_id) const {
162
+ mem_attn->state_write(io, seq_id);
163
+ mem_recr->state_write(io, seq_id);
164
+ }
165
+
166
+ void llama_memory_hybrid::state_read(llama_io_read_i & io, llama_seq_id seq_id) {
167
+ mem_attn->state_read(io, seq_id);
168
+ mem_recr->state_read(io, seq_id);
169
+ }
170
+
171
+ llama_kv_cache_unified * llama_memory_hybrid::get_mem_attn() const {
172
+ return mem_attn.get();
173
+ }
174
+
175
+ llama_memory_recurrent * llama_memory_hybrid::get_mem_recr() const {
176
+ return mem_recr.get();
177
+ }
178
+
179
+ llama_memory_hybrid_context::llama_memory_hybrid_context(llama_memory_status status) : status(status) {}
180
+
181
+ llama_memory_hybrid_context::llama_memory_hybrid_context(llama_memory_hybrid * mem) :
182
+ ctx_attn(mem->get_mem_attn()->init_full()),
183
+ ctx_recr(mem->get_mem_recr()->init_full()),
184
+ status(llama_memory_status_combine(ctx_attn->get_status(), ctx_recr->get_status())) {
185
+ }
186
+
187
+ llama_memory_hybrid_context::llama_memory_hybrid_context(
188
+ llama_memory_hybrid * mem,
189
+ llama_context * lctx,
190
+ bool optimize) :
191
+ ctx_attn(mem->get_mem_attn()->init_update(lctx, optimize)),
192
+ ctx_recr(mem->get_mem_recr()->init_update(lctx, optimize)),
193
+ status(llama_memory_status_combine(ctx_attn->get_status(), ctx_recr->get_status())) {
194
+ }
195
+
196
+ llama_memory_hybrid_context::llama_memory_hybrid_context(
197
+ llama_memory_hybrid * mem,
198
+ std::vector<uint32_t> heads_attn,
199
+ std::vector<llama_ubatch> ubatches) :
200
+ ubatches(std::move(ubatches)),
201
+ // note: here we copy the ubatches. not sure if this is ideal
202
+ ctx_attn(new llama_kv_cache_unified_context(mem->get_mem_attn(), std::move(heads_attn), this->ubatches)),
203
+ ctx_recr(new llama_memory_recurrent_context(mem->get_mem_recr(), this->ubatches)),
204
+ status(llama_memory_status_combine(ctx_attn->get_status(), ctx_recr->get_status())) {
205
+ }
206
+
207
+ bool llama_memory_hybrid_context::next() {
208
+ assert(status == LLAMA_MEMORY_STATUS_SUCCESS);
209
+
210
+ ctx_attn->next();
211
+ ctx_recr->next();
212
+
213
+ if (++i_next >= ubatches.size()) {
214
+ return false;
215
+ }
216
+
217
+ return true;
218
+ }
219
+
220
+ bool llama_memory_hybrid_context::apply() {
221
+ assert(status == LLAMA_MEMORY_STATUS_SUCCESS);
222
+
223
+ bool res = true;
224
+
225
+ res = res & ctx_attn->apply();
226
+ res = res & ctx_recr->apply();
227
+
228
+ return res;
229
+ }
230
+
231
+ llama_memory_status llama_memory_hybrid_context::get_status() const {
232
+ return status;
233
+ }
234
+
235
+ const llama_ubatch & llama_memory_hybrid_context::get_ubatch() const {
236
+ assert(status == LLAMA_MEMORY_STATUS_SUCCESS);
237
+ return ubatches[i_next];
238
+ }
239
+
240
+ const llama_kv_cache_unified_context * llama_memory_hybrid_context::get_attn() const {
241
+ return static_cast<const llama_kv_cache_unified_context *>(ctx_attn.get());
242
+ }
243
+
244
+ const llama_memory_recurrent_context * llama_memory_hybrid_context::get_recr() const {
245
+ return static_cast<const llama_memory_recurrent_context *>(ctx_recr.get());
246
+ }