cui-llama.rn 1.6.0 → 1.7.0

This diff shows the changes between publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (285)
  1. package/README.md +35 -7
  2. package/android/src/main/CMakeLists.txt +22 -11
  3. package/android/src/main/java/com/rnllama/LlamaContext.java +42 -6
  4. package/android/src/main/java/com/rnllama/RNLlama.java +139 -4
  5. package/android/src/main/jni.cpp +173 -18
  6. package/android/src/main/jniLibs/arm64-v8a/librnllama.so +0 -0
  7. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8.so +0 -0
  8. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2.so +0 -0
  9. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod.so +0 -0
  10. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod_i8mm.so +0 -0
  11. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_i8mm.so +0 -0
  12. package/android/src/main/jniLibs/x86_64/librnllama.so +0 -0
  13. package/android/src/main/jniLibs/x86_64/librnllama_x86_64.so +0 -0
  14. package/android/src/newarch/java/com/rnllama/RNLlamaModule.java +24 -4
  15. package/android/src/oldarch/java/com/rnllama/RNLlamaModule.java +22 -2
  16. package/cpp/LICENSE +21 -0
  17. package/cpp/chat.cpp +129 -107
  18. package/cpp/chat.h +2 -0
  19. package/cpp/common.cpp +58 -78
  20. package/cpp/common.h +29 -21
  21. package/cpp/ggml-alloc.c +4 -1
  22. package/cpp/ggml-backend.cpp +9 -5
  23. package/cpp/ggml-backend.h +4 -4
  24. package/cpp/ggml-cpp.h +1 -1
  25. package/cpp/ggml-cpu/amx/amx.cpp +221 -0
  26. package/cpp/ggml-cpu/amx/amx.h +8 -0
  27. package/cpp/ggml-cpu/amx/common.h +91 -0
  28. package/cpp/ggml-cpu/amx/mmq.cpp +2511 -0
  29. package/cpp/ggml-cpu/amx/mmq.h +10 -0
  30. package/{ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers → cpp/ggml-cpu}/binary-ops.h +1 -1
  31. package/cpp/ggml-cpu/common.h +72 -0
  32. package/cpp/{ggml-cpu-aarch64.cpp → ggml-cpu/ggml-cpu-aarch64.cpp} +809 -103
  33. package/cpp/{ggml-cpu-quants.c → ggml-cpu/ggml-cpu-quants.c} +306 -6
  34. package/cpp/{ggml-cpu.c → ggml-cpu/ggml-cpu.c} +114 -55
  35. package/cpp/{ggml-cpu.cpp → ggml-cpu/ggml-cpu.cpp} +32 -16
  36. package/cpp/{ops.cpp → ggml-cpu/ops.cpp} +353 -173
  37. package/{ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers → cpp/ggml-cpu}/ops.h +2 -20
  38. package/cpp/{sgemm.cpp → ggml-cpu/sgemm.cpp} +501 -0
  39. package/{ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers → cpp/ggml-cpu}/simd-mappings.h +7 -3
  40. package/{ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers → cpp/ggml-cpu}/unary-ops.h +1 -1
  41. package/cpp/{vec.cpp → ggml-cpu/vec.cpp} +0 -6
  42. package/{ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers → cpp/ggml-cpu}/vec.h +16 -0
  43. package/cpp/ggml-cpu.h +5 -0
  44. package/cpp/ggml-impl.h +16 -9
  45. package/cpp/ggml-llama-sim.metallib +0 -0
  46. package/cpp/ggml-llama.metallib +0 -0
  47. package/cpp/ggml-metal-impl.h +36 -11
  48. package/cpp/ggml-metal.m +810 -176
  49. package/cpp/ggml-opt.cpp +373 -190
  50. package/cpp/ggml-opt.h +49 -28
  51. package/cpp/ggml-quants.c +0 -6
  52. package/cpp/ggml.c +227 -282
  53. package/cpp/ggml.h +82 -101
  54. package/cpp/gguf.cpp +33 -33
  55. package/cpp/json-schema-to-grammar.cpp +3 -0
  56. package/cpp/llama-adapter.cpp +6 -0
  57. package/cpp/llama-arch.cpp +49 -17
  58. package/cpp/llama-arch.h +9 -0
  59. package/cpp/llama-batch.cpp +8 -2
  60. package/cpp/llama-batch.h +2 -1
  61. package/cpp/llama-chat.cpp +39 -16
  62. package/cpp/llama-chat.h +4 -2
  63. package/cpp/llama-context.cpp +440 -611
  64. package/cpp/llama-context.h +44 -33
  65. package/cpp/llama-cparams.h +1 -0
  66. package/cpp/llama-graph.cpp +214 -291
  67. package/cpp/llama-graph.h +69 -21
  68. package/cpp/llama-hparams.cpp +17 -1
  69. package/cpp/llama-hparams.h +39 -5
  70. package/cpp/llama-kv-cache.cpp +2067 -620
  71. package/cpp/llama-kv-cache.h +410 -108
  72. package/cpp/llama-memory.h +12 -1
  73. package/cpp/llama-model-loader.cpp +24 -15
  74. package/cpp/llama-model-saver.cpp +281 -0
  75. package/cpp/llama-model-saver.h +37 -0
  76. package/cpp/llama-model.cpp +1089 -359
  77. package/cpp/llama-model.h +19 -3
  78. package/cpp/llama-sampling.cpp +20 -7
  79. package/cpp/llama-vocab.cpp +54 -9
  80. package/cpp/llama-vocab.h +6 -0
  81. package/cpp/llama.cpp +14 -0
  82. package/cpp/llama.h +86 -142
  83. package/cpp/minja/chat-template.hpp +9 -5
  84. package/cpp/minja/minja.hpp +69 -36
  85. package/cpp/rn-llama.cpp +602 -190
  86. package/cpp/rn-llama.h +34 -8
  87. package/cpp/sampling.cpp +57 -50
  88. package/cpp/tools/mtmd/clip-impl.h +462 -0
  89. package/cpp/tools/mtmd/clip.cpp +4024 -0
  90. package/cpp/tools/mtmd/clip.h +101 -0
  91. package/cpp/tools/mtmd/miniaudio.h +93468 -0
  92. package/cpp/tools/mtmd/mtmd-audio.cpp +855 -0
  93. package/cpp/tools/mtmd/mtmd-audio.h +62 -0
  94. package/cpp/tools/mtmd/mtmd-helper.cpp +297 -0
  95. package/cpp/tools/mtmd/mtmd.cpp +942 -0
  96. package/cpp/tools/mtmd/mtmd.h +362 -0
  97. package/cpp/tools/mtmd/stb_image.h +7988 -0
  98. package/ios/CMakeLists.txt +20 -10
  99. package/ios/RNLlama.h +6 -0
  100. package/ios/RNLlama.mm +82 -3
  101. package/ios/RNLlamaContext.h +5 -1
  102. package/ios/RNLlamaContext.mm +131 -38
  103. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/chat.h +2 -0
  104. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/common.h +29 -21
  105. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-backend.h +4 -4
  106. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpp.h +1 -1
  107. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu.h +5 -0
  108. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-impl.h +16 -9
  109. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-metal-impl.h +36 -11
  110. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-opt.h +49 -28
  111. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml.h +82 -101
  112. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-arch.h +9 -0
  113. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-batch.h +2 -1
  114. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-chat.h +4 -2
  115. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-context.h +44 -33
  116. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-cparams.h +1 -0
  117. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-graph.h +69 -21
  118. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-hparams.h +39 -5
  119. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-kv-cache.h +410 -108
  120. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-memory.h +12 -1
  121. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-model-saver.h +37 -0
  122. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-model.h +19 -3
  123. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-vocab.h +6 -0
  124. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama.h +86 -142
  125. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/minja/chat-template.hpp +9 -5
  126. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/minja/minja.hpp +69 -36
  127. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/rn-llama.h +34 -8
  128. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Info.plist +0 -0
  129. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/ggml-llama.metallib +0 -0
  130. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/rnllama +0 -0
  131. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/chat.h +2 -0
  132. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/common.h +29 -21
  133. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-backend.h +4 -4
  134. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpp.h +1 -1
  135. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu.h +5 -0
  136. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-impl.h +16 -9
  137. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-metal-impl.h +36 -11
  138. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-opt.h +49 -28
  139. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml.h +82 -101
  140. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-arch.h +9 -0
  141. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-batch.h +2 -1
  142. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-chat.h +4 -2
  143. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-context.h +44 -33
  144. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-cparams.h +1 -0
  145. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-graph.h +69 -21
  146. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-hparams.h +39 -5
  147. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache.h +410 -108
  148. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory.h +12 -1
  149. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model-saver.h +37 -0
  150. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model.h +19 -3
  151. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-vocab.h +6 -0
  152. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama.h +86 -142
  153. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/minja/chat-template.hpp +9 -5
  154. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/minja/minja.hpp +69 -36
  155. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/rn-llama.h +34 -8
  156. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Info.plist +0 -0
  157. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/_CodeSignature/CodeResources +1 -1
  158. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/ggml-llama-sim.metallib +0 -0
  159. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/rnllama +0 -0
  160. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/chat.h +2 -0
  161. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/common.h +29 -21
  162. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-backend.h +4 -4
  163. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpp.h +1 -1
  164. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu.h +5 -0
  165. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-impl.h +16 -9
  166. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-metal-impl.h +36 -11
  167. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-opt.h +49 -28
  168. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml.h +82 -101
  169. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-arch.h +9 -0
  170. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-batch.h +2 -1
  171. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-chat.h +4 -2
  172. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-context.h +44 -33
  173. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-cparams.h +1 -0
  174. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-graph.h +69 -21
  175. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-hparams.h +39 -5
  176. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-kv-cache.h +410 -108
  177. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-memory.h +12 -1
  178. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-model-saver.h +37 -0
  179. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-model.h +19 -3
  180. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-vocab.h +6 -0
  181. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama.h +86 -142
  182. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/minja/chat-template.hpp +9 -5
  183. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/minja/minja.hpp +69 -36
  184. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/rn-llama.h +34 -8
  185. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Info.plist +0 -0
  186. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/ggml-llama.metallib +0 -0
  187. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/rnllama +0 -0
  188. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/chat.h +2 -0
  189. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/common.h +29 -21
  190. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-backend.h +4 -4
  191. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpp.h +1 -1
  192. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu.h +5 -0
  193. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-impl.h +16 -9
  194. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-metal-impl.h +36 -11
  195. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-opt.h +49 -28
  196. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml.h +82 -101
  197. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-arch.h +9 -0
  198. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-batch.h +2 -1
  199. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-chat.h +4 -2
  200. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-context.h +44 -33
  201. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-cparams.h +1 -0
  202. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-graph.h +69 -21
  203. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-hparams.h +39 -5
  204. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache.h +410 -108
  205. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory.h +12 -1
  206. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model-saver.h +37 -0
  207. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model.h +19 -3
  208. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-vocab.h +6 -0
  209. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama.h +86 -142
  210. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/minja/chat-template.hpp +9 -5
  211. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/minja/minja.hpp +69 -36
  212. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/rn-llama.h +34 -8
  213. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Info.plist +0 -0
  214. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/_CodeSignature/CodeResources +1 -1
  215. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/ggml-llama-sim.metallib +0 -0
  216. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/rnllama +0 -0
  217. package/jest/mock.js +33 -7
  218. package/lib/commonjs/NativeRNLlama.js.map +1 -1
  219. package/lib/commonjs/index.js +153 -21
  220. package/lib/commonjs/index.js.map +1 -1
  221. package/lib/module/NativeRNLlama.js.map +1 -1
  222. package/lib/module/index.js +152 -20
  223. package/lib/module/index.js.map +1 -1
  224. package/lib/typescript/NativeRNLlama.d.ts +54 -4
  225. package/lib/typescript/NativeRNLlama.d.ts.map +1 -1
  226. package/lib/typescript/index.d.ts +72 -6
  227. package/lib/typescript/index.d.ts.map +1 -1
  228. package/package.json +1 -1
  229. package/src/NativeRNLlama.ts +72 -4
  230. package/src/index.ts +212 -38
  231. package/cpp/binary-ops.h +0 -16
  232. package/cpp/ops.h +0 -128
  233. package/cpp/simd-mappings.h +0 -888
  234. package/cpp/unary-ops.h +0 -28
  235. package/cpp/vec.h +0 -802
  236. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/binary-ops.h +0 -16
  237. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu-aarch64.h +0 -8
  238. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu-impl.h +0 -512
  239. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu-quants.h +0 -63
  240. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu-traits.h +0 -38
  241. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ops.h +0 -128
  242. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/sgemm.h +0 -14
  243. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/simd-mappings.h +0 -888
  244. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/vec.h +0 -802
  245. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-aarch64.h +0 -8
  246. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-impl.h +0 -512
  247. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-quants.h +0 -63
  248. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-traits.h +0 -38
  249. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/sgemm.h +0 -14
  250. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/unary-ops.h +0 -28
  251. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/vec.h +0 -802
  252. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/binary-ops.h +0 -16
  253. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu-aarch64.h +0 -8
  254. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu-impl.h +0 -512
  255. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu-quants.h +0 -63
  256. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu-traits.h +0 -38
  257. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ops.h +0 -128
  258. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/sgemm.h +0 -14
  259. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/simd-mappings.h +0 -888
  260. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/unary-ops.h +0 -28
  261. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/binary-ops.h +0 -16
  262. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-aarch64.h +0 -8
  263. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-impl.h +0 -512
  264. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-quants.h +0 -63
  265. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-traits.h +0 -38
  266. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ops.h +0 -128
  267. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/sgemm.h +0 -14
  268. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/simd-mappings.h +0 -888
  269. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/unary-ops.h +0 -28
  270. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/vec.h +0 -802
  271. package/lib/commonjs/chat.js +0 -37
  272. package/lib/commonjs/chat.js.map +0 -1
  273. package/lib/module/chat.js +0 -33
  274. package/lib/module/chat.js.map +0 -1
  275. package/lib/typescript/chat.d.ts +0 -10
  276. package/lib/typescript/chat.d.ts.map +0 -1
  277. package/src/chat.ts +0 -44
  278. /package/cpp/{binary-ops.cpp → ggml-cpu/binary-ops.cpp} +0 -0
  279. /package/cpp/{ggml-cpu-aarch64.h → ggml-cpu/ggml-cpu-aarch64.h} +0 -0
  280. /package/cpp/{ggml-cpu-impl.h → ggml-cpu/ggml-cpu-impl.h} +0 -0
  281. /package/cpp/{ggml-cpu-quants.h → ggml-cpu/ggml-cpu-quants.h} +0 -0
  282. /package/cpp/{ggml-cpu-traits.cpp → ggml-cpu/ggml-cpu-traits.cpp} +0 -0
  283. /package/cpp/{ggml-cpu-traits.h → ggml-cpu/ggml-cpu-traits.h} +0 -0
  284. /package/cpp/{sgemm.h → ggml-cpu/sgemm.h} +0 -0
  285. /package/cpp/{unary-ops.cpp → ggml-cpu/unary-ops.cpp} +0 -0
package/cpp/rn-llama.cpp CHANGED
@@ -1,8 +1,86 @@
  #include "rn-llama.h"

+ // Include multimodal support
+ #include "tools/mtmd/mtmd.h"
+ #include "tools/mtmd/clip.h"
+
  namespace rnllama {

- const std::vector<lm_ggml_type> kv_cache_types = {
+ // Computes FNV-1a hash of the data
+ static std::string fnv_hash(const uint8_t * data, size_t len) {
+ const uint64_t fnv_prime = 0x100000001b3ULL;
+ uint64_t hash = 0xcbf29ce484222325ULL;
+
+ for (size_t i = 0; i < len; ++i) {
+ hash ^= data[i];
+ hash *= fnv_prime;
+ }
+ return std::to_string(hash);
+ }
+
+ static const std::string base64_chars =
+ "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+ "abcdefghijklmnopqrstuvwxyz"
+ "0123456789+/";
+
+ // Base64 decoding function
+ static std::vector<uint8_t> base64_decode(const std::string &encoded_string) {
+ std::vector<uint8_t> decoded;
+ int in_len = encoded_string.size();
+ int i = 0;
+ int j = 0;
+ int in_ = 0;
+ unsigned char char_array_4[4], char_array_3[3];
+
+ while (in_len-- && (encoded_string[in_] != '=')) {
+ if (isspace(encoded_string[in_])) {
+ in_++;
+ continue;
+ }
+
+ if (encoded_string[in_] == '=' || base64_chars.find(encoded_string[in_]) == std::string::npos) {
+ break;
+ }
+
+ char_array_4[i++] = encoded_string[in_]; in_++;
+ if (i == 4) {
+ for (i = 0; i < 4; i++) {
+ char_array_4[i] = base64_chars.find(char_array_4[i]);
+ }
+
+ char_array_3[0] = (char_array_4[0] << 2) + ((char_array_4[1] & 0x30) >> 4);
+ char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2);
+ char_array_3[2] = ((char_array_4[2] & 0x3) << 6) + char_array_4[3];
+
+ for (i = 0; i < 3; i++) {
+ decoded.push_back(char_array_3[i]);
+ }
+ i = 0;
+ }
+ }
+
+ if (i) {
+ for (j = i; j < 4; j++) {
+ char_array_4[j] = 0;
+ }
+
+ for (j = 0; j < 4; j++) {
+ char_array_4[j] = base64_chars.find(char_array_4[j]);
+ }
+
+ char_array_3[0] = (char_array_4[0] << 2) + ((char_array_4[1] & 0x30) >> 4);
+ char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2);
+ char_array_3[2] = ((char_array_4[2] & 0x3) << 6) + char_array_4[3];
+
+ for (j = 0; j < i - 1; j++) {
+ decoded.push_back(char_array_3[j]);
+ }
+ }
+
+ return decoded;
+ }
+
+ static const std::vector<lm_ggml_type> kv_cache_types = {
  LM_GGML_TYPE_F32,
  LM_GGML_TYPE_F16,
  LM_GGML_TYPE_BF16,
@@ -149,10 +227,16 @@ std::string tokens_to_str(llama_context *ctx, const std::vector<llama_token>::co
  return ret;
  }

+ struct llama_rn_context_mtmd {
+ mtmd_context *mtmd_ctx = nullptr;
+ };
+
  llama_rn_context::~llama_rn_context() {
  if (ctx_sampling != nullptr) {
  common_sampler_free(ctx_sampling);
  }
+
+ releaseMultimodal();
  }

@@ -165,6 +249,7 @@ void llama_rn_context::rewind() {
  generated_text.reserve(params.n_ctx);
  generated_token_probs.clear();
  truncated = false;
+ context_full = false;
  stopped_eos = false;
  stopped_word = false;
  stopped_limit = false;
@@ -197,6 +282,9 @@ bool llama_rn_context::loadModel(common_params &params_)
  templates = common_chat_templates_init(model, params.chat_template);
  n_ctx = llama_n_ctx(ctx);

+ // Initialize context shift flag
+ LOG_INFO("ctx_shift: %s", params.ctx_shift ? "enabled" : "disabled");
+
  // We can uncomment for debugging or after this fix: https://github.com/ggerganov/llama.cpp/pull/11101
  // LOG_INFO("%s\n", common_params_get_system_info(params).c_str());

@@ -271,11 +359,11 @@ void llama_rn_context::truncatePrompt(std::vector<llama_token> &prompt_tokens) {

  new_tokens.insert(new_tokens.end(), prompt_tokens.begin() + params.n_keep + erased_blocks * n_block_size, prompt_tokens.end());

- LOG_VERBOSE("input truncated, n_ctx: %d, n_keep: %d, n_left: %d, new_tokens: %s, num_prompt_tokens: %d",
+ LOG_INFO("input truncated, n_ctx: %d, n_keep: %d, n_left: %d, old_size: %d, new_size: %d",
  n_ctx,
  params.n_keep,
  n_left,
- tokens_to_str(ctx, new_tokens.cbegin(), new_tokens.cend()).c_str(),
+ prompt_tokens.size(),
  new_tokens.size()
  );

@@ -283,65 +371,71 @@ void llama_rn_context::truncatePrompt(std::vector<llama_token> &prompt_tokens) {
  prompt_tokens = new_tokens;
  }

- void llama_rn_context::loadPrompt() {
- std::vector<llama_token> prompt_tokens = ::common_tokenize(ctx, params.prompt, true, true);
- num_prompt_tokens = prompt_tokens.size();
+ void llama_rn_context::loadPrompt(const std::vector<std::string> &media_paths) {
+ bool has_media = !media_paths.empty();

- // LOG tokens
- std::stringstream ss;
- ss << "\n" << __func__ << ": prompt_tokens = ";
- for (auto& token : prompt_tokens) {
- ss << token << " ";
- }
- LOG_INFO("%s\n", ss.str().c_str());
+ if (!has_media) {
+ std::vector<llama_token> text_tokens;
+ // Text-only path
+ text_tokens = ::common_tokenize(ctx, params.prompt, true, true);
+ num_prompt_tokens = text_tokens.size();

- if (params.n_keep < 0)
- {
- params.n_keep = (int)num_prompt_tokens;
- }
- params.n_keep = std::min(n_ctx - 4, params.n_keep);
+ // LOG tokens
+ std::stringstream ss;
+ ss << "\n" << __func__ << ": prompt_tokens = ";
+ for (auto& token : text_tokens) {
+ ss << token << " ";
+ }
+ LOG_INFO("%s\n", ss.str().c_str());

- // if input prompt is too big, truncate like normal
- if (num_prompt_tokens >= (size_t) n_ctx)
- {
- truncatePrompt(prompt_tokens);
- num_prompt_tokens = prompt_tokens.size();
+ if (params.n_keep < 0) {
+ params.n_keep = (int)num_prompt_tokens;
+ }
+ params.n_keep = std::min(n_ctx - 4, params.n_keep);

- LM_GGML_ASSERT(num_prompt_tokens < (size_t) n_ctx);
- }
+ // Handle truncation if needed
+ if (num_prompt_tokens >= (size_t)n_ctx) {
+ if (!params.ctx_shift) {
+ context_full = true;
+ return;
+ }
+ truncatePrompt(text_tokens);
+ num_prompt_tokens = text_tokens.size();
+ LM_GGML_ASSERT(num_prompt_tokens < (size_t)n_ctx);
+ }

- // do context shifitng
- if(!params.embedding){
- purge_missing_tokens(ctx, embd, prompt_tokens, params.n_predict, params.n_ctx);
- }
+ // Update sampling context
+ for (auto & token : text_tokens) {
+ common_sampler_accept(ctx_sampling, token, false);
+ }

+ // compare the evaluated prompt with the new prompt
+ n_past = common_part(embd, text_tokens);

- // push the prompt into the sampling context (do not apply grammar)
- for (auto & token : prompt_tokens)
- {
- common_sampler_accept(ctx_sampling, token, false);
- }
+ embd = text_tokens;
+ if (n_past == num_prompt_tokens) {
+ // we have to evaluate at least 1 token to generate logits.
+ n_past--;
+ }

- // compare the evaluated prompt with the new prompt
- n_past = common_part(embd, prompt_tokens);
+ // Manage KV cache
+ llama_kv_self_seq_rm(ctx, 0, n_past, -1);

- embd = prompt_tokens;
- if (n_past == num_prompt_tokens)
- {
- // we have to evaluate at least 1 token to generate logits.
- n_past--;
+ LOG_VERBOSE("prompt ingested, n_past: %d, cached: %s, to_eval: %s",
+ n_past,
+ tokens_to_str(ctx, embd.cbegin(), embd.cbegin() + n_past).c_str(),
+ tokens_to_str(ctx, embd.cbegin() + n_past, embd.cend()).c_str()
+ );
+ } else {
+ // Multimodal path - process all media paths
+ processMedia(params.prompt, media_paths);
+ num_prompt_tokens = embd.size();
  }

- // since #3228 we now have to manually manage the KV cache
- llama_kv_self_seq_rm(ctx, 0, n_past, -1);
-
- LOG_VERBOSE("prompt ingested, n_past: %d, cached: %s, to_eval: %s",
- n_past,
- tokens_to_str(ctx, embd.cbegin(), embd.cbegin() + n_past).c_str(),
- tokens_to_str(ctx, embd.cbegin() + n_past, embd.cend()).c_str()
- );
-
  has_next_token = true;
+
+ LOG_INFO("[DEBUG] Input processed: n_past=%d, embd.size=%zu, num_prompt_tokens=%zu, has_media=%d",
+ n_past, embd.size(), num_prompt_tokens, has_media ? 1 : 0);
  }

@@ -351,6 +445,10 @@ void llama_rn_context::beginCompletion() {
  is_predicting = true;
  }

+ void llama_rn_context::endCompletion() {
+ is_predicting = false;
+ }
+
  completion_token_output llama_rn_context::nextToken()
  {
  completion_token_output result;
@@ -358,6 +456,14 @@ completion_token_output llama_rn_context::nextToken()

  if (embd.size() >= (size_t)params.n_ctx)
  {
+ if (!params.ctx_shift) {
+ // If context shifting is disabled, stop generation
+ LOG_WARNING("context full, n_ctx: %d, tokens: %d", params.n_ctx, embd.size());
+ has_next_token = false;
+ context_full = true;
+ return result;
+ }
+
  // Shift context

  const int n_left = n_past - params.n_keep - 1;
@@ -373,12 +479,9 @@ completion_token_output llama_rn_context::nextToken()
  embd.resize(embd.size() - n_discard);

  n_past -= n_discard;
+ truncated = true;

- LOG_VERBOSE("input truncated, n_ctx: %d, n_keep: %d, n_left: %d, new_tokens: %s",
- params.n_ctx,
- params.n_keep,
- n_left
- );
+ LOG_VERBOSE("context shifted, new n_past: %d, new size: %d", n_past, embd.size());
  }

  bool tg = true;
@@ -677,7 +780,7 @@ std::string llama_rn_context::bench(int pp, int tg, int pl, int nr)
  }

  if (is_interrupted) llama_kv_self_clear(ctx);
- is_predicting = false;
+ endCompletion();

  char model_desc[128];
  llama_model_desc(model, model_desc, sizeof(model_desc));
@@ -712,162 +815,471 @@ void llama_rn_context::removeLoraAdapters() {
  std::vector<common_adapter_lora_info> llama_rn_context::getLoadedLoraAdapters() {
  return this->lora;
  }
- std::vector<int> llama_rn_context::longest_common_subseq(const std::vector<int> x, const std::vector<int> y){
- int m = x.size(), n = y.size();
-
- //int LCSuff[m+1][n+1];
- std::vector<std::vector<int>> LCSuff(m+1, std::vector<int>(n+1));
-
- for (int j = 0; j <= n; j++)
- LCSuff[0][j] = 0;
- for (int i = 0; i <= m; i++)
- LCSuff[i][0] = 0;
-
- for (int i = 1; i <= m; i++)
- {
- for (int j = 1; j <= n; j++)
- {
- if (x[i - 1] == y[j - 1])
- LCSuff[i][j] = LCSuff[i - 1][j - 1] + 1;
- else
- LCSuff[i][j] = 0;
- }
- }
-
- std::vector<int> longest;
- for (int i = 1; i <= m; i++)
- {
- for (int j = 1; j <= n; j++)
- {
- if (LCSuff[i][j] > longest.size())
- {
- auto off1 = ((i - LCSuff[i][j] + 1) - 1);
- auto off2 = off1 + LCSuff[i][j];
- longest.clear();
- // std::vector<int>().swap(longest);
- longest = std::vector<int>(x.begin() + off1, x.begin() + off2);
- // x.substr((i - LCSuff[i][j] + 1) - 1, LCSuff[i][j]);
- }
- }
- }
- return longest;
- }
-
- bool llama_rn_context::arr_start_with(const std::vector<int> targetArray, const std::vector<int> searchSeq)
- {
- int ss = searchSeq.size();
- if(targetArray.size()<ss)
- {
- return false;
- }
- for(int i=0;i<ss;++i)
- {
- if(targetArray[i]!=searchSeq[i])
- {
- return false;
- }
- }
- return true;
- }
-
- int llama_rn_context::arr_find_index_of(const std::vector<int> targetArray, const std::vector<int> searchSeq)
- {
- int ss = searchSeq.size();
- int tas = targetArray.size();
- if(tas<ss)
- {
- return -1;
- }
- for(int i=0;i<tas;++i)
- {
- int srch = 0;
- bool fail = false;
- for(int srch=0;srch<ss;++srch)
- {
- if ((i + srch) >= tas || targetArray[i + srch] != searchSeq[srch])
- {
- fail = true;
- break;
- }
- }
- if(!fail)
- {
- return i;
- }
- }
- return -1;
- }
-
- void llama_rn_context::purge_missing_tokens(llama_context * ctx, std::vector<int> &current_context_tokens, std::vector<int> &new_context_tokens, const int genamt, const int nctx)
- {
- //scan from start old and new ctx, until first mismatch found, save as p0
- //check remaining old and new ctx for longest common subseq, which needs to be at 256 tokens
- //test: longest common subseq (LCQ) MUST start within 0 tokens from end of memory, otherwise purge fails
- //if passed, save beginning of LCQ from old ctx as p1
- //remove all tokens from old ctx between p0 and p1, updating both arrays and kv, then continue as normal

- const int short_fall_threshold = 200 + (nctx/30); //dont trigger shifting if the distance between trimstart and currhead < this
- const int stack_allowance = 60 + (nctx/50); //in case the end text is slightly modified, be forgiving
+ bool llama_rn_context::initMultimodal(const std::string &mmproj_path, bool use_gpu) {
+ LOG_INFO("[DEBUG] Initializing multimodal with mmproj path: %s", mmproj_path.c_str());

- int trimstart = 0;
- int new_tokens_len = new_context_tokens.size();
- bool purge_needed = true;
+ if (model == nullptr) {
+ LOG_ERROR("[DEBUG] Model not loaded, cannot initialize multimodal", "");
+ return false;
+ }

- for (int i = 0; i < current_context_tokens.size(); ++i)
- {
- if (current_context_tokens[i] == new_context_tokens[i])
- {
- trimstart += 1;
+ LOG_INFO("[DEBUG] Model info: n_ctx=%d, n_embd=%d",
+ llama_n_ctx(ctx),
+ llama_model_n_embd(model));
+
+ // Initialize mtmd context
+ mtmd_context_params mtmd_params = mtmd_context_params_default();
+ mtmd_params.use_gpu = use_gpu;
+ mtmd_params.print_timings = false;
+ mtmd_params.n_threads = params.cpuparams.n_threads;
+ mtmd_params.verbosity = (lm_ggml_log_level)LM_GGML_LOG_LEVEL_INFO;
+
+ LOG_INFO("[DEBUG] Initializing mtmd context with threads=%d", mtmd_params.n_threads);
+
+ auto mtmd_ctx = mtmd_init_from_file(mmproj_path.c_str(), model, mtmd_params);
+ if (mtmd_ctx == nullptr) {
+ LOG_ERROR("[DEBUG] Failed to initialize multimodal context with mmproj: %s", mmproj_path.c_str());
+ return false;
+ }
+ mtmd_wrapper = new llama_rn_context_mtmd();
+ mtmd_wrapper->mtmd_ctx = mtmd_ctx;
+
+ has_multimodal = true;
+
+ // Check if the model uses M-RoPE or non-causal attention
+ bool uses_mrope = mtmd_decode_use_mrope(mtmd_ctx);
+ bool uses_non_causal = mtmd_decode_use_non_causal(mtmd_ctx);
+ LOG_INFO("[DEBUG] Model multimodal properties: uses_mrope=%d, uses_non_causal=%d",
+ uses_mrope ? 1 : 0,
+ uses_non_causal ? 1 : 0);
+
+ // Disable context shifting when multimodal is enabled
+ // This is because an media chunk may contain multiple tokens
+ // and context shifting could break the media representation
+ params.ctx_shift = false;
+
+ // params.n_cache_reuse = 0;
+
+ LOG_INFO("Multimodal context initialized successfully with mmproj: %s", mmproj_path.c_str());
+ LOG_INFO("Context shifting disabled for multimodal support");
+ return true;
+ }
+
+ struct mtmd_tokenize_result {
+ std::vector<std::string> bitmap_hashes;
+ std::vector<llama_token> tokens;
+ std::vector<size_t> chunk_pos; // both text and media
+ std::vector<size_t> chunk_pos_media; // media only
+ mtmd_input_chunks* chunks = nullptr;
+ };
+
+ mtmd_tokenize_result tokenizeWithMedia(llama_rn_context_mtmd *mtmd_wrapper, const std::string &prompt, const std::vector<std::string> &media_paths) {
+ mtmd_tokenize_result result;
+ mtmd::bitmaps bitmaps;
+
+ // Load all media paths
+ for (const auto& media_path : media_paths) {
+ LOG_INFO("[DEBUG] Loading media: %s",
+ media_path.substr(0, 50).c_str()); // Only log part of path for base64
+
+ // Check if it's a base64 media
+ if (media_path.compare(0, 11, "data:image/") == 0 || media_path.compare(0, 11, "data:audio/") == 0) {
+ LOG_INFO("[DEBUG] Detected base64 encoded media");
+
+ // Parse base64 data
+ std::vector<std::string> parts;
+ size_t comma_pos = media_path.find(',');
+ if (comma_pos == std::string::npos) {
+ throw std::runtime_error("Invalid base64 media format, missing comma separator");
+ }
+
+ std::string header = media_path.substr(0, comma_pos);
+ std::string base64_data = media_path.substr(comma_pos + 1);
+
+ if (header.find("base64") == std::string::npos) {
+ bitmaps.entries.clear();
+ throw std::runtime_error("Image must be base64 encoded");
+ }
+
+ // Decode base64
+ std::vector<uint8_t> media_data = base64_decode(base64_data);
+ LOG_INFO("[DEBUG] Base64 decoded, size: %zu bytes", media_data.size());
+
+ // Load bitmap from memory buffer using direct initialization
+ mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_buf(media_data.data(), media_data.size()));
+ if (!bmp.ptr) {
+ bitmaps.entries.clear();
+ throw std::runtime_error("Failed to load base64 media");
+ }
+
+ // Calculate bitmap hash (for KV caching)
+ std::string hash = fnv_hash(bmp.data(), bmp.n_bytes());
+ bmp.set_id(hash.c_str());
+ LOG_INFO("[DEBUG] Bitmap hash: %s", hash.c_str());
+ bitmaps.entries.push_back(std::move(bmp));
+ result.bitmap_hashes.push_back(hash.c_str());
+ } else if (media_path.compare(0, 7, "http://") == 0 || media_path.compare(0, 8, "https://") == 0) {
+ // HTTP URLs are not supported yet
+ LOG_ERROR("[DEBUG] HTTP/HTTPS URLs are not supported yet: %s", media_path.c_str());
+ throw std::runtime_error("HTTP/HTTPS URLs are not supported yet");
+ } else {
+ // Regular file path
+ LOG_INFO("[DEBUG] Loading media from file");
+
+ // Check if file exists
+ FILE* file = fopen(media_path.c_str(), "rb");
+ if (file == nullptr) {
+ bitmaps.entries.clear();
+ throw std::runtime_error("File does not exist or cannot be opened");
+ }
+
+ // Get file size
+ fseek(file, 0, SEEK_END);
+ long file_size = ftell(file);
+ fseek(file, 0, SEEK_SET);
+ LOG_INFO("[DEBUG] File exists and size is %ld bytes", file_size);
+ fclose(file);
+
+ // Create bitmap directly
+ mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_file(media_path.c_str()));
+ if (!bmp.ptr) {
+ bitmaps.entries.clear();
+ throw std::runtime_error("Failed to load media");
+ }
+
+ // Calculate bitmap hash (for KV caching)
+ std::string hash = fnv_hash(bmp.data(), bmp.nx()*bmp.ny()*3);
+ bmp.set_id(hash.c_str());
+ LOG_INFO("[DEBUG] Bitmap hash: %s", hash.c_str());
+ bitmaps.entries.push_back(std::move(bmp));
+ result.bitmap_hashes.push_back(hash.c_str());
  }
- else
- {
+ }
+
+ // Create input chunks
+ LOG_INFO("[DEBUG] Initializing input chunks");
+ result.chunks = mtmd_input_chunks_init();
+ if (result.chunks == nullptr) {
+ bitmaps.entries.clear();
+ throw std::runtime_error("Failed to initialize input chunks");
+ }
+
+ mtmd_input_text input_text;
+ input_text.text = prompt.c_str(); // Use the full prompt with image marker
+ input_text.add_special = true; // Add BOS token if this is the first message
+ input_text.parse_special = true; // Parse special tokens like <__media__>
+
+ /**
+ * Tokenize the text and media together.
+ *
+ * Example of tokenization for "foo bar <__media__> baz <__media__>":
+ *
+ * 1. Input text with media markers:
+ *
+ * "foo bar <__media__> baz <__media__>"
+ *
+ * 2. Model-specific markers are added.
+ *
+ * 3. Text is split and tokenized into chunks:
+ *
+ * ┌─────────────┐ ┌─────────────────────────┐ ┌─────────┐ ┌─────────────────────────┐
+ * │ TEXT CHUNK │ │ IMAGE CHUNK │ │ TEXT │ │ IMAGE CHUNK │
+ * │ "foo bar " │ │ │ │ " baz " │ │ │
+ * └─────────────┘ └─────────────────────────┘ └─────────┘ └─────────────────────────┘
+ * │ │ │ │
+ * ▼ ▼ ▼ ▼
+ * ┌─────────────┐ ┌─────────────────────────┐ ┌─────────┐ ┌─────────────────────────┐
+ * │ [1234,5678] │ │ Image Data Structure │ │ [9012] │ │ Image Data Structure │
+ * └─────────────┘ └─────────────────────────┘ └─────────┘ └─────────────────────────┘
+ *
+ * 4. Image token structure differences:
+ *
+ * For Qwen2VL (uses M-RoPE with 2D positions):
+ * ┌─────────────────────────────────────────┐
+ * │ MEDIA_CHUNK │
+ * │ ┌───────────────────────────────────┐ │
+ * │ │ mtmd_image_tokens: │ │
+ * │ │ nx = 16, ny = 16 │ │ ← 2D grid (16×16 = 256 tokens)
+ * │ │ use_mrope_pos = true │ │ ← Uses M-RoPE positioning
+ * │ │ batch_f32 = [image_embeddings] │ │
+ * │ └───────────────────────────────────┘ │
+ * └─────────────────────────────────────────┘
+ *
+ * For other models (uses 1D positions):
+ * ┌─────────────────────────────────────────┐
+ * │ MEDIA_CHUNK │
+ * │ ┌───────────────────────────────────┐ │
+ * │ │ mtmd_image_tokens: │ │
+ * │ │ nx = 256, ny = 1 │ │ ← 1D sequence (256 tokens)
+ * │ │ use_mrope_pos = false │ │ ← Uses standard positioning
+ * │ │ batch_f32 = [image_embeddings] │ │
+ * │ └───────────────────────────────────┘ │
+ * └─────────────────────────────────────────┘
+ *
+ * 5. Final chunks array:
+ * chunks[0] = TEXT_CHUNK([1234, 5678])
+ * chunks[1] = MEDIA_CHUNK(first_image)
+ * chunks[2] = TEXT_CHUNK([9012])
+ * chunks[3] = MEDIA_CHUNK(second_image)
+ */
+ LOG_INFO("[DEBUG] Tokenizing text and %zu media", bitmaps.entries.size());
+ auto bitmaps_c_ptr = bitmaps.c_ptr();
+ int32_t res = mtmd_tokenize(mtmd_wrapper->mtmd_ctx, result.chunks, &input_text, bitmaps_c_ptr.data(), bitmaps_c_ptr.size());
+ if (res != 0) {
+ mtmd_input_chunks_free(result.chunks);
+ bitmaps.entries.clear();
+ throw std::runtime_error("Failed to tokenize text and media");
+ }
+
+ // Log chunk information
+ size_t num_chunks = mtmd_input_chunks_size(result.chunks);
+ LOG_INFO("[DEBUG] Tokenization successful: num_chunks=%zu", num_chunks);
+
+ // Track the total number of tokens (both text and image)
+ size_t total_token_count = 0;
+
+ /**
+ * Evaluate the chunks.
+ *
+ * For our example "foo bar <__media__> baz <__media__>":
+ *
+ * Token organization in memory:
+ *
+ * all_tokens: [t0][t1][NULL][NULL]...[NULL][t2][NULL][NULL]...[NULL]
+ * positions: 0 1 2 3 ... 257 258 259 260 ... 514
+ * chunk_pos: 0 2 258 259
+ *
+ * Where:
+ * - [t0][t1] are text tokens for "foo bar " (positions 0-1)
+ * - [NULL]x256 are placeholder tokens for the first image (positions 2-257)
+ * - [t2] is the text token for " baz " (position 258)
+ * - [NULL]x256 are placeholder tokens for the second image (positions 259-514)
+ */
+ for (size_t i = 0; i < num_chunks; i++) {
+ result.chunk_pos.push_back(total_token_count);
+
+ const mtmd_input_chunk* chunk = mtmd_input_chunks_get(result.chunks, i);
+ mtmd_input_chunk_type chunk_type = mtmd_input_chunk_get_type(chunk);
+
+ if (chunk_type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
+ size_t n_tokens;
+ const llama_token* tokens = mtmd_input_chunk_get_tokens_text(chunk, &n_tokens);
+ LOG_INFO("[DEBUG] Chunk %zu: type=TEXT, n_tokens=%zu", i, n_tokens);
+
+ // Add text tokens
+ result.tokens.insert(result.tokens.end(), tokens, tokens + n_tokens);
+ total_token_count += n_tokens;
+ } else if (chunk_type == MTMD_INPUT_CHUNK_TYPE_IMAGE || chunk_type == MTMD_INPUT_CHUNK_TYPE_AUDIO) {
+ result.chunk_pos_media.push_back(total_token_count);
+
+ size_t n_tokens = mtmd_input_chunk_get_n_tokens(chunk);
+ size_t n_pos = mtmd_input_chunk_get_n_pos(chunk);
+ LOG_INFO("[DEBUG] Chunk %zu: type=%s, n_tokens=%zu, n_pos=%zu",
+ i, chunk_type == MTMD_INPUT_CHUNK_TYPE_IMAGE ? "IMAGE" : "AUDIO", n_tokens, n_pos);
+
+ for (size_t j = 0; j < n_pos; j++) {
+ result.tokens.push_back(LLAMA_TOKEN_NULL); // Placeholder token
+ }
+ total_token_count += n_pos;
+ }
+ }
+
+ bitmaps.entries.clear();
+
+ return result;
+ }
+ void llama_rn_context::processMedia(
+ const std::string &prompt,
+ const std::vector<std::string> &media_paths
+ ) {
+ if (!isMultimodalEnabled()) {
+ throw std::runtime_error("Multimodal is not enabled but image paths are provided");
+ }
+
+ // Multimodal path
+ std::string full_prompt = prompt;
+ auto default_media_marker = mtmd_default_marker();
+ // Add media marker if it doesn't already exist
+ if (full_prompt.find(default_media_marker) == std::string::npos) {
+ full_prompt += " ";
+ full_prompt += default_media_marker;
+ }
+
+ LOG_INFO("[DEBUG] Processing message with role=user, content=%s", full_prompt.c_str());
+ LOG_INFO("[DEBUG] Processing %zu media with prompt: %s", media_paths.size(), prompt.c_str());
+ LOG_INFO("[DEBUG] Current context state: n_past=%d, n_ctx=%d", n_past, n_ctx);
+
+ auto result = tokenizeWithMedia(mtmd_wrapper, full_prompt, media_paths);
+
+ auto all_tokens = result.tokens;
+ auto chunks = result.chunks;
+ auto chunk_pos = result.chunk_pos;
+ auto chunk_pos_media = result.chunk_pos_media;
+ auto bitmap_hashes = result.bitmap_hashes;
+
+ // Check if we have enough context space for all tokens
+ if (all_tokens.size() >= (size_t)n_ctx) {
+ mtmd_input_chunks_free(chunks);
+ context_full = true;
+ throw std::runtime_error("Not enough context space");
+ }
+
+ n_past = common_part(embd, all_tokens);
+
+ llama_pos new_n_past = n_past;
+
+ // Adjust n_past to position of the text chunk
+ // TODO: Edit the text chunk to remove the tokens before n_past to speed up
+ // need to update the mtmd api
+ auto adjusted_n_past = -1;
+ for (size_t i = 0; i < chunk_pos.size(); i++) {
+ if (n_past < chunk_pos[i]) {
  break;
  }
- else
- {
- purge_needed = false;
- break; //no surgery required
+ bool is_end = i + 1 == chunk_pos.size();
+ if (
+ chunk_pos[i] < n_past &&
+ (!is_end && chunk_pos[i + 1] > n_past)
+ // is_end & n_past < total_token_count:
+ // don't need to adjust and it will skip eval_chunk_single, let nextToken() to finish the job
+ ) {
+ adjusted_n_past = chunk_pos[i];
  }
  }
+ if (adjusted_n_past != -1) {
+ n_past = adjusted_n_past;
+ new_n_past = n_past;
+ LOG_INFO("[DEBUG] Adjusted n_past to %d", n_past);
+ }

-
-
- if(!purge_needed || new_tokens_len < 6 || current_context_tokens.size() < 6 || new_tokens_len - trimstart < short_fall_threshold)
- {
- LOG_INFO("Fall Threshold: %d out of %d\n", new_tokens_len - trimstart, short_fall_threshold);
- return; //no purge is needed
+ // Compare bitmap hashes, if they are not the same, backtrack n_past to the position of the first mismatch
+ if (mtmd_bitmap_past_hashes.size() > 0) {
+ for (size_t i = 0; i < bitmap_hashes.size(); i++) {
+ auto pos = chunk_pos_media[i];
+ if (n_past < pos) {
+ break;
+ }
+ if (i >= mtmd_bitmap_past_hashes.size()) {
+ break;
+ }
+ if (bitmap_hashes[i] != mtmd_bitmap_past_hashes[i]) {
+ LOG_INFO(
+ "[DEBUG] Bitmap hash mismatch at position %zu, %s != %s",
+ i, bitmap_hashes[i].c_str(), mtmd_bitmap_past_hashes[i].c_str()
+ );
+ n_past = chunk_pos_media[i];
+ new_n_past = n_past;
+ break;
+ }
+ }
  }

- //at least this many tokens need to match, otherwise don't bother trimming
- const int lc_tok_threshold = std::max(std::min((new_tokens_len - trimstart) - (genamt+stack_allowance), (int)(nctx*0.45)), short_fall_threshold - stack_allowance);
+ // Clear all KV cache entries after position n_past
+ llama_kv_self_seq_rm(ctx, 0, n_past, -1);

- auto curr_ctx_without_memory = std::vector<int>(current_context_tokens.begin() + trimstart, current_context_tokens.end());
- auto new_ctx_without_memory = std::vector<int>(new_context_tokens.begin() + trimstart, new_context_tokens.end());
+ LOG_INFO("[DEBUG] Evaluating chunks: n_past=%d, n_batch=%d", n_past, params.n_batch);

- auto shared = longest_common_subseq(curr_ctx_without_memory, new_ctx_without_memory);
+ size_t num_chunks = mtmd_input_chunks_size(chunks);

- if (shared.size() > lc_tok_threshold && arr_start_with(new_ctx_without_memory, shared)) // enough tokens in common
- {
- int found = arr_find_index_of(current_context_tokens,shared);
- if(found>=0 && found > trimstart)
- {
+ for (size_t i = 0; i < chunk_pos.size(); i++) {

- //extract the unwanted tokens out from context and KV
- int diff = found - trimstart;
- llama_kv_self_seq_rm(ctx, 0, trimstart, trimstart + diff);
- llama_kv_self_seq_add(ctx, 0, trimstart + diff, -1, -diff);
+ LOG_INFO("[DEBUG] Evaluating chunk %zu: n_past=%d, chunk_pos=%zu", i, n_past, chunk_pos[i]);

- for (size_t i = trimstart + diff; i < current_context_tokens.size() - 1; i++)
- {
- current_context_tokens[i - diff] = current_context_tokens[i];
+ // Process chunk only if it's after the current n_past
+ if (chunk_pos[i] >= n_past) {
+ bool chunk_logits_last = (i == num_chunks - 1);
+ auto chunk = mtmd_input_chunks_get(chunks, i);
+
+ int32_t res = mtmd_helper_eval_chunk_single(
+ mtmd_wrapper->mtmd_ctx,
+ ctx,
+ chunk,
+ n_past,
+ 0,
+ params.n_batch,
+ chunk_logits_last,
+ &new_n_past
+ );
+ if (res != 0) {
+ mtmd_input_chunks_free(chunks);
+ throw std::runtime_error("Failed to evaluate chunks");
  }
+ n_past = new_n_past;
+ }
+ }

- LOG_INFO("\n[Context Shifting: Erased %d tokens at position %d]", diff, trimstart + 1);
+ if (n_past == all_tokens.size() && n_past > 0 && all_tokens[n_past - 1] != LLAMA_TOKEN_NULL) {
+ // we have to evaluate at least 1 token to generate logits.
+ n_past--;
+ }
+
+ // Update embd with all tokens (both text and media)
+ embd = all_tokens;
+
+ mtmd_bitmap_past_hashes = bitmap_hashes;
+
+ // Update sampling context with text tokens only
+ for (auto & token : all_tokens) {
+ if (token == LLAMA_TOKEN_NULL) {
+ continue;
+ }
+ common_sampler_accept(ctx_sampling, token, false);
+ }

- current_context_tokens.resize(current_context_tokens.size() - diff);
+ // Clean up media resources
+ LOG_INFO("[DEBUG] Cleaning up resources");
+ mtmd_input_chunks_free(chunks);
+ }
+
+ llama_rn_tokenize_result llama_rn_context::tokenize(const std::string &text, const std::vector<std::string> &media_paths) {
+ if (media_paths.size() > 0) {
+ if (!isMultimodalEnabled()) {
+ throw std::runtime_error("Multimodal is not enabled but media paths are provided");
  }
+ auto result = tokenizeWithMedia(mtmd_wrapper, text, media_paths);
+ mtmd_input_chunks_free(result.chunks);
+ llama_rn_tokenize_result tokenize_result = {
+ .tokens = result.tokens,
+ .has_media = true,
+ .bitmap_hashes = result.bitmap_hashes,
+ .chunk_pos = result.chunk_pos,
+ .chunk_pos_media = result.chunk_pos_media,
+ };
+ return tokenize_result;
  }
+ std::vector<llama_token> text_tokens;
+ text_tokens = common_tokenize(ctx, text, false);
+ llama_rn_tokenize_result tokenize_result = {
+ .tokens = text_tokens,
+ .has_media = false,
+ .bitmap_hashes = {},
+ .chunk_pos = {},
+ .chunk_pos_media = {},
+ };
+ return tokenize_result;
+ }
+
+ bool llama_rn_context::isMultimodalEnabled() const {
+ return has_multimodal && mtmd_wrapper != nullptr;
+ }

+ bool llama_rn_context::isMultimodalSupportVision() const {
+ return isMultimodalEnabled() && mtmd_support_vision(mtmd_wrapper->mtmd_ctx);
+ }
+
+ bool llama_rn_context::isMultimodalSupportAudio() const {
+ return isMultimodalEnabled() && mtmd_support_audio(mtmd_wrapper->mtmd_ctx);
+ }
+
+ void llama_rn_context::releaseMultimodal() {
+ if (mtmd_wrapper && mtmd_wrapper->mtmd_ctx != nullptr) {
+ mtmd_free(mtmd_wrapper->mtmd_ctx);
+ mtmd_wrapper->mtmd_ctx = nullptr;
+ delete mtmd_wrapper;
+ mtmd_wrapper = nullptr;
+ has_multimodal = false;
+ }
  }

  }
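
The diff above removes the old longest_common_subseq / purge_missing_tokens context-purging helpers and adds mtmd-based multimodal input handling (initMultimodal, loadPrompt with media paths, processMedia, tokenize, endCompletion, releaseMultimodal). As a rough orientation for readers, the sketch below shows one plausible way a host layer (for example the jni.cpp or RNLlamaContext.mm glue also touched in this release) could drive the new API; it is not code from the package, and the completion_token_output field access (out.tok) and the file paths are assumptions based only on the signatures visible in this diff.

// Hedged sketch, assuming a loaded rnllama::llama_rn_context and the members shown above.
#include "rn-llama.h"
#include <string>
#include <vector>

static void run_image_completion(rnllama::llama_rn_context &rn_ctx) {
    // Attach the multimodal projector (hypothetical path).
    if (!rn_ctx.initMultimodal("/path/to/mmproj.gguf", /* use_gpu */ true)) {
        return; // mmproj failed to load
    }

    // Prompt plus media; loadPrompt() falls back to the text-only path when
    // media_paths is empty and appends the default media marker otherwise.
    rn_ctx.params.prompt = "Describe this picture.";
    std::vector<std::string> media_paths = { "/path/to/picture.jpg" };

    rn_ctx.rewind();
    rn_ctx.loadPrompt(media_paths);
    if (rn_ctx.context_full) {
        return; // prompt does not fit; ctx_shift is disabled for multimodal
    }

    // Token loop; endCompletion() replaces the old direct is_predicting = false.
    rn_ctx.beginCompletion();
    while (rn_ctx.has_next_token && !rn_ctx.is_interrupted) {
        const auto out = rn_ctx.nextToken();
        if (out.tok < 0) break;
        // ... detokenize out.tok and stream the text to the JS side ...
    }
    rn_ctx.endCompletion();

    // Free the mtmd context (also invoked from the destructor).
    rn_ctx.releaseMultimodal();
}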