cui-llama.rn 1.7.4 → 1.7.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (276)
  1. package/README.md +217 -17
  2. package/android/src/main/CMakeLists.txt +34 -15
  3. package/android/src/main/java/com/rnllama/LlamaContext.java +79 -5
  4. package/android/src/main/java/com/rnllama/RNLlama.java +237 -0
  5. package/android/src/main/jni.cpp +213 -14
  6. package/android/src/main/jniLibs/arm64-v8a/librnllama.so +0 -0
  7. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8.so +0 -0
  8. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2.so +0 -0
  9. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod.so +0 -0
  10. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod_i8mm.so +0 -0
  11. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_i8mm.so +0 -0
  12. package/android/src/main/jniLibs/x86_64/librnllama.so +0 -0
  13. package/android/src/main/jniLibs/x86_64/librnllama_x86_64.so +0 -0
  14. package/android/src/newarch/java/com/rnllama/RNLlamaModule.java +35 -0
  15. package/android/src/oldarch/java/com/rnllama/RNLlamaModule.java +34 -0
  16. package/cpp/README.md +1 -1
  17. package/cpp/chat-parser.cpp +385 -0
  18. package/cpp/chat-parser.h +120 -0
  19. package/cpp/chat.cpp +726 -596
  20. package/cpp/chat.h +71 -6
  21. package/cpp/common.cpp +56 -38
  22. package/cpp/common.h +9 -3
  23. package/cpp/ggml-backend-reg.cpp +5 -0
  24. package/cpp/ggml-backend.cpp +10 -2
  25. package/cpp/ggml-common.h +4 -0
  26. package/cpp/ggml-cpu/amx/amx.cpp +1 -1
  27. package/cpp/ggml-cpu/amx/mmq.cpp +11 -10
  28. package/cpp/ggml-cpu/arch/arm/cpu-feats.cpp +94 -0
  29. package/cpp/ggml-cpu/arch/arm/quants.c +4114 -0
  30. package/cpp/ggml-cpu/arch/arm/repack.cpp +2163 -0
  31. package/cpp/ggml-cpu/arch/x86/cpu-feats.cpp +327 -0
  32. package/cpp/ggml-cpu/arch/x86/quants.c +4311 -0
  33. package/cpp/ggml-cpu/{ggml-cpu-aarch64.cpp → arch/x86/repack.cpp} +79 -3225
  34. package/cpp/ggml-cpu/arch-fallback.h +184 -0
  35. package/cpp/ggml-cpu/common.h +4 -3
  36. package/cpp/ggml-cpu/ggml-cpu-impl.h +21 -16
  37. package/cpp/ggml-cpu/ggml-cpu.c +123 -104
  38. package/cpp/ggml-cpu/ggml-cpu.cpp +11 -8
  39. package/cpp/ggml-cpu/ops.cpp +330 -148
  40. package/cpp/ggml-cpu/ops.h +1 -0
  41. package/cpp/ggml-cpu/quants.c +1158 -0
  42. package/cpp/ggml-cpu/{ggml-cpu-quants.h → quants.h} +26 -0
  43. package/cpp/ggml-cpu/repack.cpp +1571 -0
  44. package/cpp/ggml-cpu/repack.h +98 -0
  45. package/cpp/ggml-cpu/simd-mappings.h +330 -38
  46. package/cpp/ggml-cpu/{ggml-cpu-traits.cpp → traits.cpp} +1 -1
  47. package/cpp/ggml-cpu/vec.cpp +87 -18
  48. package/cpp/ggml-cpu/vec.h +249 -94
  49. package/cpp/ggml-cpu.h +1 -0
  50. package/cpp/ggml-impl.h +63 -183
  51. package/cpp/ggml-llama-sim.metallib +0 -0
  52. package/cpp/ggml-llama.metallib +0 -0
  53. package/cpp/ggml-metal.m +152 -45
  54. package/cpp/ggml-quants.c +0 -2
  55. package/cpp/ggml.c +61 -21
  56. package/cpp/ggml.h +22 -3
  57. package/cpp/gguf.cpp +24 -3
  58. package/cpp/json-partial.cpp +256 -0
  59. package/cpp/json-partial.h +38 -0
  60. package/cpp/json-schema-to-grammar.cpp +5 -47
  61. package/cpp/json-schema-to-grammar.h +4 -4
  62. package/cpp/llama-arch.cpp +153 -3
  63. package/cpp/llama-arch.h +27 -1
  64. package/cpp/llama-batch.cpp +741 -272
  65. package/cpp/llama-batch.h +112 -54
  66. package/cpp/llama-chat.cpp +30 -8
  67. package/cpp/llama-chat.h +1 -0
  68. package/cpp/llama-context.cpp +524 -339
  69. package/cpp/llama-context.h +38 -17
  70. package/cpp/llama-cparams.cpp +4 -0
  71. package/cpp/llama-cparams.h +2 -0
  72. package/cpp/llama-grammar.cpp +12 -2
  73. package/cpp/llama-graph.cpp +431 -356
  74. package/cpp/llama-graph.h +126 -58
  75. package/cpp/llama-hparams.cpp +10 -2
  76. package/cpp/llama-hparams.h +19 -2
  77. package/cpp/llama-kv-cache-unified-iswa.cpp +279 -0
  78. package/cpp/llama-kv-cache-unified-iswa.h +128 -0
  79. package/cpp/llama-kv-cache-unified.cpp +1841 -0
  80. package/cpp/llama-kv-cache-unified.h +303 -0
  81. package/cpp/llama-kv-cells.h +439 -0
  82. package/cpp/llama-memory-hybrid.cpp +246 -0
  83. package/cpp/llama-memory-hybrid.h +138 -0
  84. package/cpp/llama-memory-recurrent.cpp +1112 -0
  85. package/cpp/llama-memory-recurrent.h +183 -0
  86. package/cpp/llama-memory.cpp +41 -0
  87. package/cpp/llama-memory.h +86 -5
  88. package/cpp/llama-mmap.cpp +1 -1
  89. package/cpp/llama-model-loader.cpp +42 -17
  90. package/cpp/llama-model-saver.cpp +1 -0
  91. package/cpp/llama-model.cpp +1639 -513
  92. package/cpp/llama-model.h +26 -0
  93. package/cpp/llama-sampling.cpp +2 -2
  94. package/cpp/llama-vocab.cpp +65 -28
  95. package/cpp/llama-vocab.h +1 -0
  96. package/cpp/llama.cpp +11 -7
  97. package/cpp/llama.h +150 -42
  98. package/cpp/minja/chat-template.hpp +1 -1
  99. package/cpp/minja/minja.hpp +1 -1
  100. package/cpp/{json.hpp → nlohmann/json.hpp} +3027 -2267
  101. package/cpp/nlohmann/json_fwd.hpp +187 -0
  102. package/cpp/regex-partial.cpp +204 -0
  103. package/cpp/regex-partial.h +56 -0
  104. package/cpp/rn-llama.cpp +646 -35
  105. package/cpp/rn-llama.h +32 -1
  106. package/cpp/rn-tts.h +39 -0
  107. package/cpp/sampling.cpp +7 -8
  108. package/cpp/tools/mtmd/clip-impl.h +5 -0
  109. package/cpp/tools/mtmd/clip.cpp +572 -436
  110. package/cpp/tools/mtmd/clip.h +14 -4
  111. package/cpp/tools/mtmd/mtmd-audio.cpp +0 -86
  112. package/cpp/tools/mtmd/mtmd-audio.h +2 -17
  113. package/cpp/tools/mtmd/mtmd-helper.cpp +175 -12
  114. package/cpp/tools/mtmd/mtmd-helper.h +91 -0
  115. package/cpp/tools/mtmd/mtmd.cpp +368 -248
  116. package/cpp/tools/mtmd/mtmd.h +6 -70
  117. package/cpp/unicode.cpp +5 -0
  118. package/ios/CMakeLists.txt +26 -6
  119. package/ios/RNLlama.h +1 -1
  120. package/ios/RNLlama.mm +153 -3
  121. package/ios/RNLlamaContext.h +9 -1
  122. package/ios/RNLlamaContext.mm +112 -9
  123. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/chat-parser.h +120 -0
  124. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/chat.h +71 -6
  125. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/common.h +9 -3
  126. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-common.h +4 -0
  127. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu.h +1 -0
  128. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-impl.h +63 -183
  129. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml.h +22 -3
  130. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/json-partial.h +38 -0
  131. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/json-schema-to-grammar.h +4 -4
  132. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-arch.h +27 -1
  133. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-batch.h +112 -54
  134. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-chat.h +1 -0
  135. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-context.h +38 -17
  136. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-cparams.h +2 -0
  137. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-graph.h +126 -58
  138. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-hparams.h +19 -2
  139. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-kv-cache-unified-iswa.h +128 -0
  140. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-kv-cache-unified.h +303 -0
  141. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-kv-cells.h +439 -0
  142. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-memory-hybrid.h +138 -0
  143. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-memory-recurrent.h +183 -0
  144. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-memory.h +86 -5
  145. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-model.h +26 -0
  146. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-vocab.h +1 -0
  147. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama.h +150 -42
  148. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/minja/chat-template.hpp +1 -1
  149. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/minja/minja.hpp +1 -1
  150. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/{json.hpp → nlohmann/json.hpp} +3027 -2267
  151. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/nlohmann/json_fwd.hpp +187 -0
  152. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/regex-partial.h +56 -0
  153. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/rn-llama.h +32 -1
  154. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/rn-tts.h +39 -0
  155. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/ggml-llama.metallib +0 -0
  156. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/rnllama +0 -0
  157. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/chat-parser.h +120 -0
  158. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/chat.h +71 -6
  159. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/common.h +9 -3
  160. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-common.h +4 -0
  161. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu.h +1 -0
  162. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-impl.h +63 -183
  163. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml.h +22 -3
  164. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/json-partial.h +38 -0
  165. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/json-schema-to-grammar.h +4 -4
  166. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-arch.h +27 -1
  167. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-batch.h +112 -54
  168. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-chat.h +1 -0
  169. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-context.h +38 -17
  170. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-cparams.h +2 -0
  171. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-graph.h +126 -58
  172. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-hparams.h +19 -2
  173. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache-unified-iswa.h +128 -0
  174. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache-unified.h +303 -0
  175. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cells.h +439 -0
  176. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory-hybrid.h +138 -0
  177. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory-recurrent.h +183 -0
  178. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory.h +86 -5
  179. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model.h +26 -0
  180. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-vocab.h +1 -0
  181. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama.h +150 -42
  182. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/minja/chat-template.hpp +1 -1
  183. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/minja/minja.hpp +1 -1
  184. package/ios/rnllama.xcframework/{tvos-arm64/rnllama.framework/Headers → ios-arm64_x86_64-simulator/rnllama.framework/Headers/nlohmann}/json.hpp +3027 -2267
  185. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/nlohmann/json_fwd.hpp +187 -0
  186. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/regex-partial.h +56 -0
  187. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/rn-llama.h +32 -1
  188. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/rn-tts.h +39 -0
  189. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/ggml-llama-sim.metallib +0 -0
  190. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/rnllama +0 -0
  191. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/chat-parser.h +120 -0
  192. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/chat.h +71 -6
  193. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/common.h +9 -3
  194. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-common.h +4 -0
  195. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu.h +1 -0
  196. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-impl.h +63 -183
  197. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml.h +22 -3
  198. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/json-partial.h +38 -0
  199. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/json-schema-to-grammar.h +4 -4
  200. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-arch.h +27 -1
  201. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-batch.h +112 -54
  202. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-chat.h +1 -0
  203. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-context.h +38 -17
  204. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-cparams.h +2 -0
  205. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-graph.h +126 -58
  206. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-hparams.h +19 -2
  207. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-kv-cache-unified-iswa.h +128 -0
  208. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-kv-cache-unified.h +303 -0
  209. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-kv-cells.h +439 -0
  210. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-memory-hybrid.h +138 -0
  211. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-memory-recurrent.h +183 -0
  212. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-memory.h +86 -5
  213. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-model.h +26 -0
  214. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-vocab.h +1 -0
  215. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama.h +150 -42
  216. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/minja/chat-template.hpp +1 -1
  217. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/minja/minja.hpp +1 -1
  218. package/ios/rnllama.xcframework/{ios-arm64_x86_64-simulator/rnllama.framework/Headers → tvos-arm64/rnllama.framework/Headers/nlohmann}/json.hpp +3027 -2267
  219. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/nlohmann/json_fwd.hpp +187 -0
  220. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/regex-partial.h +56 -0
  221. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/rn-llama.h +32 -1
  222. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/rn-tts.h +39 -0
  223. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/ggml-llama.metallib +0 -0
  224. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/rnllama +0 -0
  225. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/chat-parser.h +120 -0
  226. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/chat.h +71 -6
  227. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/common.h +9 -3
  228. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-common.h +4 -0
  229. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu.h +1 -0
  230. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-impl.h +63 -183
  231. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml.h +22 -3
  232. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/json-partial.h +38 -0
  233. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/json-schema-to-grammar.h +4 -4
  234. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-arch.h +27 -1
  235. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-batch.h +112 -54
  236. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-chat.h +1 -0
  237. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-context.h +38 -17
  238. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-cparams.h +2 -0
  239. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-graph.h +126 -58
  240. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-hparams.h +19 -2
  241. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache-unified-iswa.h +128 -0
  242. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache-unified.h +303 -0
  243. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cells.h +439 -0
  244. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory-hybrid.h +138 -0
  245. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory-recurrent.h +183 -0
  246. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory.h +86 -5
  247. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model.h +26 -0
  248. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-vocab.h +1 -0
  249. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama.h +150 -42
  250. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/minja/chat-template.hpp +1 -1
  251. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/minja/minja.hpp +1 -1
  252. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/nlohmann/json.hpp +25526 -0
  253. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/nlohmann/json_fwd.hpp +187 -0
  254. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/regex-partial.h +56 -0
  255. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/rn-llama.h +32 -1
  256. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/rn-tts.h +39 -0
  257. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/ggml-llama-sim.metallib +0 -0
  258. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/rnllama +0 -0
  259. package/jest/mock.js +24 -0
  260. package/package.json +1 -1
  261. package/src/NativeRNLlama.ts +46 -2
  262. package/src/index.ts +105 -1
  263. package/cpp/ggml-cpu/ggml-cpu-aarch64.h +0 -8
  264. package/cpp/ggml-cpu/ggml-cpu-quants.c +0 -13326
  265. package/cpp/ggml-cpu/sgemm.cpp +0 -3544
  266. package/cpp/ggml-cpu/sgemm.h +0 -14
  267. package/cpp/llama-kv-cache.cpp +0 -2827
  268. package/cpp/llama-kv-cache.h +0 -515
  269. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-kv-cache.h +0 -515
  270. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache.h +0 -515
  271. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-kv-cache.h +0 -515
  272. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/json.hpp +0 -24766
  273. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache.h +0 -515
  274. /package/cpp/ggml-cpu/{ggml-cpu-traits.h → traits.h} +0 -0
  275. /package/cpp/tools/mtmd/{miniaudio.h → miniaudio/miniaudio.h} +0 -0
  276. /package/cpp/tools/mtmd/{stb_image.h → stb/stb_image.h} +0 -0
package/cpp/tools/mtmd/mtmd.cpp +368 -248

@@ -95,15 +95,21 @@ mtmd_context_params mtmd_context_params_default() {
 }
 
 struct mtmd_context {
-    struct clip_ctx * ctx_clip;
+    struct clip_ctx * ctx_v; // vision
+    struct clip_ctx * ctx_a; // audio
     const struct llama_model * text_model;
     std::vector<float> image_embd_v; // image embedding vector
 
     bool print_timings;
     int n_threads;
     std::string media_marker;
-    bool has_vision;
-    bool has_audio;
+    const int n_embd_text;
+
+    // these are not token, but strings used to mark the beginning and end of image/audio embeddings
+    std::string img_beg;
+    std::string img_end;
+    std::string aud_beg;
+    std::string aud_end;
 
     // for llava-uhd style models, we need special tokens in-between slices
     // minicpmv calls them "slices", llama 4 calls them "tiles"
@@ -132,26 +138,61 @@ struct mtmd_context {
         text_model   (text_model),
         print_timings(ctx_params.print_timings),
         n_threads    (ctx_params.n_threads),
-        media_marker (ctx_params.media_marker)
+        media_marker (ctx_params.media_marker),
+        n_embd_text  (llama_model_n_embd(text_model))
     {
         if (std::string(ctx_params.image_marker) != MTMD_DEFAULT_IMAGE_MARKER) {
             throw std::runtime_error("custom image_marker is not supported anymore, use media_marker instead");
         }
 
+        if (media_marker.empty()) {
+            throw std::runtime_error("media_marker must not be empty");
+        }
+
         clip_context_params ctx_clip_params;
         ctx_clip_params.use_gpu   = ctx_params.use_gpu;
         ctx_clip_params.verbosity = ctx_params.verbosity;
-        ctx_clip = clip_init(mmproj_fname, ctx_clip_params);
-        if (!ctx_clip) {
+        auto res = clip_init(mmproj_fname, ctx_clip_params);
+        ctx_v = res.ctx_v;
+        ctx_a = res.ctx_a;
+        if (!ctx_v && !ctx_a) {
             throw std::runtime_error(string_format("Failed to load CLIP model from %s\n", mmproj_fname));
         }
 
-        has_vision = clip_has_vision_encoder(ctx_clip);
-        has_audio  = clip_has_audio_encoder(ctx_clip);
-        use_mrope  = clip_is_qwen2vl(ctx_clip);
+        // if both vision and audio mmproj are present, we need to validate their n_embd
+        if (ctx_v && ctx_a) {
+            int n_embd_v = clip_n_mmproj_embd(ctx_v);
+            int n_embd_a = clip_n_mmproj_embd(ctx_a);
+            if (n_embd_v != n_embd_a) {
+                throw std::runtime_error(string_format(
+                    "mismatch between vision and audio mmproj (n_embd_v = %d, n_embd_a = %d)\n",
+                    n_embd_v, n_embd_a));
+            }
+        }
+
+        // since we already validate n_embd of vision and audio mmproj,
+        // we can safely assume that they are the same
+        int n_embd_clip = clip_n_mmproj_embd(ctx_v ? ctx_v : ctx_a);
+        if (n_embd_text != n_embd_clip) {
+            throw std::runtime_error(string_format(
+                "mismatch between text model (n_embd = %d) and mmproj (n_embd = %d)\n"
+                "hint: you may be using wrong mmproj\n",
+                n_embd_text, n_embd_clip));
+        }
+        if (ctx_v) {
+            init_vision();
+        }
+        if (ctx_a) {
+            init_audio();
+        }
+    }
 
-        projector_type proj = clip_get_projector_type(ctx_clip);
-        int minicpmv_version = clip_is_minicpmv(ctx_clip);
+    void init_vision() {
+        LM_GGML_ASSERT(ctx_v != nullptr);
+        use_mrope = clip_is_qwen2vl(ctx_v);
+
+        projector_type proj = clip_get_projector_type(ctx_v);
+        int minicpmv_version = clip_is_minicpmv(ctx_v);
         if (minicpmv_version == 2) {
             // minicpmv 2.5 format:
             // <image> (overview) </image><slice><image> (slice) </image><image> (slice) </image>\n ... </slice>
@@ -196,24 +237,82 @@ struct mtmd_context {
            ov_img_first = false; // overview image is last
         }
 
-        if (proj == PROJECTOR_TYPE_ULTRAVOX) {
+        // set boi/eoi
+        if (proj == PROJECTOR_TYPE_GEMMA3) {
+            // <start_of_image> ... (image embeddings) ... <end_of_image>
+            img_beg = "<start_of_image>";
+            img_end = "<end_of_image>";
+
+        } else if (proj == PROJECTOR_TYPE_IDEFICS3) {
+            // https://github.com/huggingface/transformers/blob/a42ba80fa520c784c8f11a973ca9034e5f859b79/src/transformers/models/idefics3/processing_idefics3.py#L192-L215
+            img_beg = "<fake_token_around_image><global-img>";
+            img_end = "<fake_token_around_image>";
+
+        } else if (proj == PROJECTOR_TYPE_PIXTRAL) {
+            // https://github.com/huggingface/transformers/blob/1cd110c6cb6a6237614130c470e9a902dbc1a4bd/docs/source/en/model_doc/pixtral.md
+            img_end = "[IMG_END]";
+
+        } else if (proj == PROJECTOR_TYPE_QWEN2VL || proj == PROJECTOR_TYPE_QWEN25VL) {
+            // <|vision_start|> ... (image embeddings) ... <|vision_end|>
+            img_beg = "<|vision_start|>";
+            img_end = "<|vision_end|>";
+
+        } else if (proj == PROJECTOR_TYPE_LLAMA4) {
+            // (more details in mtmd_context constructor)
+            img_beg = "<|image_start|>";
+            img_end = "<|image_end|>";
+            LOG_WRN("%s: llama 4 vision is known to have degraded quality:\n"
+                    "    https://github.com/ggml-org/llama.cpp/pull/13282\n", __func__);
+
+        } else if (proj == PROJECTOR_TYPE_INTERNVL) {
+            // <img> ... (image embeddings) ... </img>
+            img_beg = "<img>";
+            img_end = "</img>";
+
+        }
+    }
+
+    void init_audio() {
+        LM_GGML_ASSERT(ctx_a != nullptr);
+        projector_type proj = clip_get_projector_type(ctx_a);
+
+        if (clip_has_whisper_encoder(ctx_a)) {
            // TODO @ngxson : check if model n_mel is 128 or 80
            w_filters = whisper_precalc_filters::get_128_bins();
         }
 
-        // warning messages
-        if (proj == PROJECTOR_TYPE_LLAMA4) {
-            LOG_WRN("%s: llama 4 vision is known to have degraded quality:\n"
-                    "    https://github.com/ggml-org/llama.cpp/pull/13282\n", __func__);
+        LOG_WRN("%s: audio input is in experimental stage and may have reduced quality:\n"
+                "    https://github.com/ggml-org/llama.cpp/discussions/13759\n", __func__);
+
+        if (proj == PROJECTOR_TYPE_QWEN2A) {
+            // <|audio_bos|> ... (embeddings) ... <|audio_eos|>
+            aud_beg = "<|audio_bos|>";
+            aud_end = "<|audio_eos|>";
+
         }
-        if (has_audio) {
-            LOG_WRN("%s: audio input is in experimental stage and may have reduced quality:\n"
-                    "    https://github.com/ggml-org/llama.cpp/pull/13623\n", __func__);
+    }
+
+    // get clip ctx based on chunk type
+    clip_ctx * get_clip_ctx(const mtmd_input_chunk * chunk) const {
+        if (chunk->type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
+            return ctx_v;
+        } else if (chunk->type == MTMD_INPUT_CHUNK_TYPE_AUDIO) {
+            return ctx_a;
         }
+        LM_GGML_ABORT("unknown chunk type");
+    }
+
+    projector_type proj_type_v() const {
+        return ctx_v ? clip_get_projector_type(ctx_v) : PROJECTOR_TYPE_UNKNOWN;
+    }
+
+    projector_type proj_type_a() const {
+        return ctx_a ? clip_get_projector_type(ctx_a) : PROJECTOR_TYPE_UNKNOWN;
     }
 
     ~mtmd_context() {
-        clip_free(ctx_clip);
+        clip_free(ctx_a);
+        clip_free(ctx_v);
     }
 
 private:
@@ -260,162 +359,137 @@ void mtmd_free(mtmd_context * ctx) {
     }
 }
 
-// copied from common_tokenize
-static std::vector<llama_token> mtmd_tokenize_text_internal(
-        const struct llama_vocab * vocab,
-        const std::string & text,
-        bool add_special,
-        bool parse_special) {
-    // upper limit for the number of tokens
-    int n_tokens = text.length() + 2 * add_special;
-    std::vector<llama_token> result(n_tokens);
-    n_tokens = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
-    if (n_tokens < 0) {
-        result.resize(-n_tokens);
-        int check = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
-        LM_GGML_ASSERT(check == -n_tokens);
-    } else {
-        result.resize(n_tokens);
-    }
-    return result;
-}
+struct mtmd_tokenizer {
+    mtmd_context * ctx;
+    std::vector<const mtmd_bitmap *> bitmaps;
 
-int32_t mtmd_tokenize(mtmd_context * ctx,
-                      mtmd_input_chunks * output,
+    std::string input_text;
+    bool add_special;
+    bool parse_special;
+    const llama_vocab * vocab;
+
+    mtmd_input_chunks cur;
+
+    mtmd_tokenizer(mtmd_context * ctx,
                       const mtmd_input_text * text,
                       const mtmd_bitmap ** bitmaps,
-                      size_t n_bitmaps) {
-    auto vocab = llama_model_get_vocab(ctx->text_model);
-
-    std::string prompt_modified(text->text);
-    std::string marker_modified(ctx->media_marker);
-    projector_type proj_type = clip_get_projector_type(ctx->ctx_clip);
-
-    // for compatibility, we convert image marker to media marker
-    string_replace_all(prompt_modified, MTMD_DEFAULT_IMAGE_MARKER, ctx->media_marker);
-
-    // a bit hacky here, but works for now
-    // for some models, we need to add prefix and suffix to the image embeddings
-    if (clip_is_gemma3(ctx->ctx_clip)) {
-        // gemma 3
-        // <start_of_image> ... (image embeddings) ... <end_of_image>
-        marker_modified = "<start_of_image>" + ctx->media_marker + "<end_of_image>";
-        string_replace_all(prompt_modified, ctx->media_marker, marker_modified);
-
-    } else if (proj_type == PROJECTOR_TYPE_IDEFICS3) {
-        // https://github.com/huggingface/transformers/blob/a42ba80fa520c784c8f11a973ca9034e5f859b79/src/transformers/models/idefics3/processing_idefics3.py#L192-L215
-        marker_modified = "<fake_token_around_image><global-img>" + ctx->media_marker + "<fake_token_around_image>";
-        string_replace_all(prompt_modified, ctx->media_marker, marker_modified);
-
-    } else if (proj_type == PROJECTOR_TYPE_PIXTRAL) {
-        // https://github.com/huggingface/transformers/blob/1cd110c6cb6a6237614130c470e9a902dbc1a4bd/docs/source/en/model_doc/pixtral.md
-        marker_modified = ctx->media_marker + "[IMG_END]";
-        string_replace_all(prompt_modified, ctx->media_marker, marker_modified);
-
-    } else if (proj_type == PROJECTOR_TYPE_QWEN2VL || proj_type == PROJECTOR_TYPE_QWEN25VL) {
-        // <|vision_start|> ... (image embeddings) ... <|vision_end|>
-        marker_modified = "<|vision_start|>" + ctx->media_marker + "<|vision_end|>";
-        string_replace_all(prompt_modified, ctx->media_marker, marker_modified);
-
-    } else if (proj_type == PROJECTOR_TYPE_LLAMA4) {
-        // (more details in mtmd_context constructor)
-        marker_modified = "<|image_start|>" + ctx->media_marker + "<|image_end|>";
-        string_replace_all(prompt_modified, ctx->media_marker, marker_modified);
-
-    } else if (proj_type == PROJECTOR_TYPE_INTERNVL) {
-        // <img> ... (image embeddings) ... </img>
-        marker_modified = "<img>" + ctx->media_marker + "</img>";
-        string_replace_all(prompt_modified, ctx->media_marker, marker_modified);
-
-    }
-
-    // llava-1.5, llava-1.6, Yi-VL, Yi-34B, granite: don't need to add prefix and suffix
-    // for glm-edge, BOI and EOI token's embeddings are not present in the text model
-
-    std::vector<std::string> parts = string_split_str(prompt_modified, ctx->media_marker);
-    output->entries.clear();
-    output->entries.reserve(parts.size());
-
-    size_t i_bm = 0;
-
-    // utility for adding raw tokens
-    auto add_text_chunk = [&output](std::vector<llama_token> && tokens) {
-        mtmd_input_chunk chunk{
-            MTMD_INPUT_CHUNK_TYPE_TEXT,
-            std::move(tokens),
-            nullptr, // image tokens
-            nullptr, // audio tokens
-        };
-        output->entries.emplace_back(std::move(chunk));
-    };
+                      size_t n_bitmaps) : ctx(ctx), bitmaps(bitmaps, bitmaps + n_bitmaps) {
+        add_special   = text->add_special;
+        parse_special = text->parse_special;
+        input_text    = text->text;
+        vocab         = llama_model_get_vocab(ctx->text_model);
+
+        // for compatibility, we convert image marker to media marker
+        string_replace_all(input_text, MTMD_DEFAULT_IMAGE_MARKER, ctx->media_marker);
+    }
 
-    // utility for splitting batch of multiple images into chunks of batch having single images
-    auto split_batch_to_chunk = [&ctx](clip_image_f32_batch && batch_f32, const std::string & id) {
-        std::vector<mtmd_input_chunk> chunks;
+    int32_t tokenize(mtmd_input_chunks * output) {
+        cur.entries.clear();
+        std::vector<std::string> parts = split_text(input_text, ctx->media_marker);
+        size_t i_bm = 0; // index of the current bitmap
+        for (auto & part : parts) {
+            if (part == ctx->media_marker) {
+                // this is a marker, we should add the next bitmap
+                if (i_bm >= bitmaps.size()) {
+                    LOG_ERR("%s: error: number of bitmaps (%zu) does not match number of markers (%zu)\n",
+                            __func__, bitmaps.size(), parts.size() - 1);
+                    return 1;
+                }
+                const mtmd_bitmap * bitmap = bitmaps[i_bm++];
+                int32_t res = add_media(bitmap);
+                if (res != 0) {
+                    return res;
+                }
+            } else {
+                // this is a text part, we should add it as text
+                add_text(part, parse_special);
+            }
+        }
 
-        for (auto & entry : batch_f32.entries) {
-            mtmd_image_tokens_ptr image_tokens(new mtmd_image_tokens);
-            image_tokens->nx = clip_n_output_tokens(ctx->ctx_clip, entry.get());
-            image_tokens->ny = 1;
-            image_tokens->batch_f32.entries.push_back(std::move(entry));
-            image_tokens->id = id;
+        if (add_special && llama_vocab_get_add_bos(vocab)) {
+            // if first chunk is text, we add BOS token to first text chunk
+            // otherwise, create a new text chunk with BOS token
+            if (!cur.entries.empty() && cur.entries[0].type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
+                // add BOS token to the beginning of first text chunk
+                cur.entries[0].tokens_text.insert(cur.entries[0].tokens_text.begin(), llama_vocab_bos(vocab));
+            } else {
+                // create a new text chunk with BOS token at the beginning
+                mtmd_input_chunk bos_chunk{
+                    MTMD_INPUT_CHUNK_TYPE_TEXT,
+                    {llama_vocab_bos(vocab)},
+                    nullptr, // image tokens
+                    nullptr, // audio tokens
+                };
+                cur.entries.insert(cur.entries.begin(), std::move(bos_chunk));
+            }
+        }
 
-            mtmd_input_chunk chunk{
-                MTMD_INPUT_CHUNK_TYPE_IMAGE,
-                {}, // text tokens
-                std::move(image_tokens),
-                nullptr, // audio tokens
-            };
-            chunks.emplace_back(std::move(chunk));
+        if (add_special && llama_vocab_get_add_eos(vocab)) {
+            // if last chunk is text, we add EOS token to it
+            add_text({llama_vocab_eos(vocab)});
         }
 
-        return chunks;
-    };
+        if (i_bm != bitmaps.size()) {
+            LOG_ERR("%s: error: number of bitmaps (%zu) does not match number of markers (%zu)\n",
+                    __func__, bitmaps.size(), parts.size() - 1);
+            return 1;
+        }
+
+        *output = std::move(cur);
+
+        return 0;
+    }
+
+    void add_text(const std::string & txt, bool parse_special) {
+        LOG_DBG("%s: %s\n", __func__, txt.c_str());
+        auto tokens = mtmd_tokenize_text_internal(vocab, txt, /* add_special */ false, parse_special);
+        add_text(tokens);
+    }
 
-    for (const auto & part : parts) {
-        // printf("tokenizing part: %s\n", part.c_str());
-        bool add_bos = &parts.front() == &part;
-        auto tokens = mtmd_tokenize_text_internal(vocab, part, text->add_special && add_bos, text->parse_special);
+    void add_text(const std::vector<llama_token> & tokens) {
        if (tokens.empty()) {
-            continue;
+            return;
        }
-        mtmd_input_chunk chunk{
-            MTMD_INPUT_CHUNK_TYPE_TEXT,
-            std::move(tokens),
-            nullptr, // image tokens
-            nullptr, // audio tokens
-        };
-        output->entries.emplace_back(std::move(chunk));
-
-        // only add image/audio tokens to middle of 2 parts
-        // therefore, we skip handling image/audio if this is the last part
-        if (&parts.back() == &part) {
-            continue;
+        // if last entry is also a text chunk, add tokens to it instead of creating new chunk
+        if (!cur.entries.empty() && cur.entries.back().type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
+            cur.entries.back().tokens_text.insert(
+                cur.entries.back().tokens_text.end(),
+                tokens.begin(),
+                tokens.end());
+        } else {
+            mtmd_input_chunk chunk{
+                MTMD_INPUT_CHUNK_TYPE_TEXT,
+                tokens,
+                nullptr, // image tokens
+                nullptr, // audio tokens
+            };
+            cur.entries.emplace_back(std::move(chunk));
        }
+    }
 
-        if (!bitmaps[i_bm]->is_audio) {
+    int32_t add_media(const mtmd_bitmap * bitmap) {
+        if (!bitmap->is_audio) {
            // handle image
 
-            if (i_bm >= n_bitmaps) {
-                LOG_ERR("%s: error: not enough images for %d parts\n", __func__, (int)parts.size());
-                return 1;
-            }
-
-            if (!ctx->has_vision) {
+            if (!ctx->ctx_v) {
                LOG_ERR("%s: error: model does not support vision input\n", __func__);
                return 2;
            }
 
+            if (!ctx->img_beg.empty()) {
+                add_text(ctx->img_beg, true); // add image begin token
+            }
+
            // convert mtmd_bitmap to clip_image_u8
            clip_image_u8_ptr img_u8(clip_image_u8_init());
-            img_u8->nx = bitmaps[i_bm]->nx;
-            img_u8->ny = bitmaps[i_bm]->ny;
-            img_u8->buf.resize(bitmaps[i_bm]->data.size());
-            std::memcpy(img_u8->buf.data(), bitmaps[i_bm]->data.data(), img_u8->nx * img_u8->ny * 3);
+            img_u8->nx = bitmap->nx;
+            img_u8->ny = bitmap->ny;
+            img_u8->buf.resize(bitmap->data.size());
+            std::memcpy(img_u8->buf.data(), bitmap->data.data(), img_u8->nx * img_u8->ny * 3);
 
            // preprocess image
            clip_image_f32_batch batch_f32;
-            bool ok = clip_image_preprocess(ctx->ctx_clip, img_u8.get(), &batch_f32);
+            bool ok = clip_image_preprocess(ctx->ctx_v, img_u8.get(), &batch_f32);
            if (!ok) {
                LOG_ERR("Unable to preprocess image\n");
                return 2;
@@ -427,8 +501,11 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
                || ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_6
                || ctx->slice_tmpl == MTMD_SLICE_TMPL_LLAMA4
                ) {
+                const int n_col = batch_f32.grid_x;
+                const int n_row = batch_f32.grid_y;
                // split batch into chunks of single images
-                auto chunks = split_batch_to_chunk(std::move(batch_f32), bitmaps[i_bm]->id);
+                // NOTE: batch_f32 will be invalidated after this call
+                auto chunks = split_batch_to_chunk(std::move(batch_f32), bitmap->id);
                LM_GGML_ASSERT(chunks.size() > 0);
 
                auto ov_chunk = std::move(chunks.front());
@@ -437,66 +514,65 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
                // add overview image (first)
                if (ctx->ov_img_first) {
                    if (ctx->tok_ov_img_start != LLAMA_TOKEN_NULL) {
-                        add_text_chunk({ctx->tok_ov_img_start});
+                        add_text({ctx->tok_ov_img_start});
                    }
-                    output->entries.emplace_back(std::move(ov_chunk));
+                    cur.entries.emplace_back(std::move(ov_chunk));
                    if (ctx->tok_ov_img_end != LLAMA_TOKEN_NULL) {
-                        add_text_chunk({ctx->tok_ov_img_end});
+                        add_text({ctx->tok_ov_img_end});
                    }
                }
 
                // add slices (or tiles)
                if (!chunks.empty()) {
-                    const int n_col = batch_f32.grid_x;
-                    const int n_row = batch_f32.grid_y;
+                    LM_GGML_ASSERT((int)chunks.size() == n_row * n_col);
                    if (ctx->tok_slices_start != LLAMA_TOKEN_NULL) {
-                        add_text_chunk({ctx->tok_slices_start});
+                        add_text({ctx->tok_slices_start});
                    }
                    for (int y = 0; y < n_row; y++) {
                        for (int x = 0; x < n_col; x++) {
                            const bool is_last_in_row = (x == n_col - 1);
                            if (ctx->tok_sli_img_start != LLAMA_TOKEN_NULL) {
-                                add_text_chunk({ctx->tok_sli_img_start});
+                                add_text({ctx->tok_sli_img_start});
                            }
-                            output->entries.emplace_back(std::move(chunks[y * n_col + x]));
+                            cur.entries.emplace_back(std::move(chunks[y * n_col + x]));
                            if (ctx->tok_sli_img_end != LLAMA_TOKEN_NULL) {
-                                add_text_chunk({ctx->tok_sli_img_end});
+                                add_text({ctx->tok_sli_img_end});
                            }
                            if (!is_last_in_row && ctx->tok_sli_img_mid != LLAMA_TOKEN_NULL) {
-                                add_text_chunk({ctx->tok_sli_img_mid});
+                                add_text({ctx->tok_sli_img_mid});
                            }
                        }
                        if ((y != n_row - 1 || ctx->tok_row_end_trail) && ctx->tok_row_end != LLAMA_TOKEN_NULL) {
-                            add_text_chunk({ctx->tok_row_end});
+                            add_text({ctx->tok_row_end});
                        }
                    }
                    if (ctx->tok_slices_end != LLAMA_TOKEN_NULL) {
-                        add_text_chunk({ctx->tok_slices_end});
+                        add_text({ctx->tok_slices_end});
                    }
                }
 
                // add overview image (last)
                if (!ctx->ov_img_first) {
                    if (ctx->tok_ov_img_start != LLAMA_TOKEN_NULL) {
-                        add_text_chunk({ctx->tok_ov_img_start});
+                        add_text({ctx->tok_ov_img_start});
                    }
-                    output->entries.emplace_back(std::move(ov_chunk));
+                    cur.entries.emplace_back(std::move(ov_chunk));
                    if (ctx->tok_ov_img_end != LLAMA_TOKEN_NULL) {
-                        add_text_chunk({ctx->tok_ov_img_end});
+                        add_text({ctx->tok_ov_img_end});
                    }
                }
 
            } else {
                size_t n_tokens = 0;
                for (const auto & entry : batch_f32.entries) {
-                    n_tokens += clip_n_output_tokens(ctx->ctx_clip, entry.get());
+                    n_tokens += clip_n_output_tokens(ctx->ctx_v, entry.get());
                }
 
                mtmd_image_tokens_ptr image_tokens(new mtmd_image_tokens);
                if (ctx->use_mrope) {
                    // for Qwen2VL, we need this information for M-RoPE decoding positions
-                    image_tokens->nx = clip_n_output_tokens_x(ctx->ctx_clip, batch_f32.entries[0].get());
-                    image_tokens->ny = clip_n_output_tokens_y(ctx->ctx_clip, batch_f32.entries[0].get());
+                    image_tokens->nx = clip_n_output_tokens_x(ctx->ctx_v, batch_f32.entries[0].get());
+                    image_tokens->ny = clip_n_output_tokens_y(ctx->ctx_v, batch_f32.entries[0].get());
                    image_tokens->use_mrope_pos = true;
                } else {
                    // other models, we only need the total number of tokens
@@ -504,7 +580,7 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
                    image_tokens->ny = 1;
                }
                image_tokens->batch_f32 = std::move(batch_f32);
-                image_tokens->id = bitmaps[i_bm]->id; // optional
+                image_tokens->id = bitmap->id; // optional
 
                LOG_DBG("image_tokens->nx = %d\n", image_tokens->nx);
                LOG_DBG("image_tokens->ny = %d\n", image_tokens->ny);
@@ -516,35 +592,35 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
                    std::move(image_tokens),
                    nullptr, // audio tokens
                };
-                output->entries.emplace_back(std::move(chunk));
+                cur.entries.emplace_back(std::move(chunk));
            }
 
-            i_bm++; // move to next image
-            continue;
+            if (!ctx->img_end.empty()) {
+                add_text(ctx->img_end, true); // add image end token
+            }
 
        } else {
            // handle audio
 
-            if (i_bm >= n_bitmaps) {
-                LOG_ERR("%s: error: not enough images for %d parts\n", __func__, (int)parts.size());
-                return 1;
-            }
-
-            if (!ctx->has_audio) {
+            if (!ctx->ctx_a) {
                LOG_ERR("%s: error: model does not support audio input\n", __func__);
                return 2;
            }
 
-            if (bitmaps[i_bm]->data.size() == 0) {
+            if (bitmap->data.size() == 0) {
                LOG_ERR("%s: error: empty audio data\n", __func__);
                return 2;
            }
 
+            if (!ctx->aud_beg.empty()) {
+                add_text(ctx->aud_beg, true); // add audio begin token
+            }
+
            // preprocess audio
            LM_GGML_ASSERT(ctx->w_filters.n_mel); // make sure we have filter preloaded
            std::vector<whisper_preprocessor::whisper_mel> mel_spec_chunks;
-            const float * samples = (const float *)bitmaps[i_bm]->data.data();
-            size_t n_samples = bitmaps[i_bm]->data.size() / sizeof(float);
+            const float * samples = (const float *)bitmap->data.data();
+            size_t n_samples = bitmap->data.size() / sizeof(float);
            bool ok = whisper_preprocessor::preprocess_audio(samples, n_samples, ctx->w_filters, mel_spec_chunks);
            if (!ok) {
                LOG_ERR("Unable to preprocess audio\n");
@@ -558,7 +634,7 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
                mel_f32->nx = mel_spec.n_len;
                mel_f32->ny = mel_spec.n_mel;
                mel_f32->buf = std::move(mel_spec.data);
-                size_t n_tokens = clip_n_output_tokens(ctx->ctx_clip, mel_f32.get());
+                size_t n_tokens = clip_n_output_tokens(ctx->ctx_a, mel_f32.get());
 
                clip_image_f32_batch batch_f32;
                batch_f32.is_audio = true;
@@ -567,7 +643,7 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
                mtmd_audio_tokens_ptr audio_tokens(new mtmd_audio_tokens);
                audio_tokens->n_tokens = n_tokens;
                audio_tokens->batch_f32 = std::move(batch_f32);
-                audio_tokens->id = bitmaps[i_bm]->id; // optional
+                audio_tokens->id = bitmap->id; // optional
 
                LOG_DBG("audio_tokens->n_tokens = %d\n", audio_tokens->n_tokens);
 
@@ -577,15 +653,88 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
                    nullptr, // image tokens
                    std::move(audio_tokens),
                };
-                output->entries.emplace_back(std::move(chunk));
+                cur.entries.emplace_back(std::move(chunk));
            }
 
-            i_bm++;
-            continue;
+            if (!ctx->aud_end.empty()) {
+                add_text(ctx->aud_end, true); // add audio end token
+            }
        }
+
+        return 0;
    }
 
-    return 0;
+    std::vector<mtmd_input_chunk> split_batch_to_chunk(clip_image_f32_batch && batch_f32, const std::string & id) {
+        std::vector<mtmd_input_chunk> chunks;
+
+        for (auto & entry : batch_f32.entries) {
+            mtmd_image_tokens_ptr image_tokens(new mtmd_image_tokens);
+            image_tokens->nx = clip_n_output_tokens(ctx->ctx_v, entry.get());
+            image_tokens->ny = 1;
+            image_tokens->batch_f32.entries.push_back(std::move(entry));
+            image_tokens->id = id;
+
+            mtmd_input_chunk chunk{
+                MTMD_INPUT_CHUNK_TYPE_IMAGE,
+                {}, // text tokens
+                std::move(image_tokens),
+                nullptr, // audio tokens
+            };
+            chunks.emplace_back(std::move(chunk));
+        }
+
+        return chunks;
+    }
+
+    // for example: "a <__media__> b <__media__> c" --> "a", "<__media__>", "b", "<__media__>", "c"
+    static std::vector<std::string> split_text(const std::string & input, const std::string & delimiter) {
+        std::vector<std::string> result;
+        if (input.empty()) {
+            return result;
+        }
+        size_t start = 0;
+        size_t pos = 0;
+        while ((pos = input.find(delimiter, start)) != std::string::npos) {
+            if (pos > start) {
+                result.push_back(input.substr(start, pos - start));
+            }
+            result.push_back(delimiter);
+            start = pos + delimiter.length();
+        }
+        if (start < input.length()) {
+            result.push_back(input.substr(start));
+        }
+        return result;
+    }
+
+    // copied from common_tokenize
+    static std::vector<llama_token> mtmd_tokenize_text_internal(
+            const struct llama_vocab * vocab,
+            const std::string & text,
+            bool add_special,
+            bool parse_special) {
+        // upper limit for the number of tokens
+        int n_tokens = text.length() + 2 * add_special;
+        std::vector<llama_token> result(n_tokens);
+        n_tokens = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
+        if (n_tokens < 0) {
+            result.resize(-n_tokens);
+            int check = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
+            LM_GGML_ASSERT(check == -n_tokens);
+        } else {
+            result.resize(n_tokens);
+        }
+        return result;
+    }
+};
+
+int32_t mtmd_tokenize(mtmd_context * ctx,
+                      mtmd_input_chunks * output,
+                      const mtmd_input_text * text,
+                      const mtmd_bitmap ** bitmaps,
+                      size_t n_bitmaps) {
+    mtmd_tokenizer tokenizer(ctx, text, bitmaps, n_bitmaps);
+    return tokenizer.tokenize(output);
 }
 
 int32_t mtmd_encode_chunk(mtmd_context * ctx, const mtmd_input_chunk * chunk) {
@@ -593,41 +742,54 @@ int32_t mtmd_encode_chunk(mtmd_context * ctx, const mtmd_input_chunk * chunk) {
        LOG_WRN("mtmd_encode_chunk has no effect for text chunks\n");
        return 0;
    } else if (chunk->type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
+        if (!ctx->ctx_v) {
+            LOG_ERR("%s: model does not support vision input\n", __func__);
+            return 1;
+        }
        return mtmd_encode(ctx, chunk->tokens_image.get());
    } else if (chunk->type == MTMD_INPUT_CHUNK_TYPE_AUDIO) {
-        int n_mmproj_embd = clip_n_mmproj_embd(ctx->ctx_clip);
+        if (!ctx->ctx_a) {
+            LOG_ERR("%s: model does not support audio input\n", __func__);
+            return 1;
+        }
+        int n_mmproj_embd = ctx->n_embd_text;
        ctx->image_embd_v.resize(chunk->tokens_audio->n_tokens * n_mmproj_embd);
        bool ok = clip_image_batch_encode(
-            ctx->ctx_clip,
+            ctx->ctx_a,
            ctx->n_threads,
            &chunk->tokens_audio->batch_f32,
            ctx->image_embd_v.data());
        return ok ? 0 : 1;
    }
 
-    LOG_ERR("mtmd_encode_chunk: unknown chunk type %d\n", (int)chunk->type);
+    LOG_ERR("%s: unknown chunk type %d\n", __func__, (int)chunk->type);
    return 1;
 }
 
 int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens) {
-    int n_mmproj_embd = clip_n_mmproj_embd(ctx->ctx_clip);
+    clip_ctx * ctx_clip = ctx->ctx_v;
+    if (!ctx_clip) {
+        LOG_ERR("%s: this API does not support non-vision input, please use mtmd_encode_chunk instead\n", __func__);
+        return 1;
+    }
+    int n_mmproj_embd = clip_n_mmproj_embd(ctx_clip);
    ctx->image_embd_v.resize(image_tokens->n_tokens() * n_mmproj_embd);
    bool ok = false;
 
-    if (clip_is_llava(ctx->ctx_clip) || clip_is_minicpmv(ctx->ctx_clip) || clip_is_glm(ctx->ctx_clip)) {
+    if (clip_is_llava(ctx_clip) || clip_is_minicpmv(ctx_clip) || clip_is_glm(ctx_clip)) {
        // TODO @ngxson : llava does not support batched encoding ; this should be fixed inside clip_image_batch_encode()
        const auto & entries = image_tokens->batch_f32.entries;
        for (size_t i = 0; i < entries.size(); i++) {
-            int n_tokens_per_image = clip_n_output_tokens(ctx->ctx_clip, entries[i].get());
+            int n_tokens_per_image = clip_n_output_tokens(ctx_clip, entries[i].get());
            ok = clip_image_encode(
-                ctx->ctx_clip,
+                ctx_clip,
                ctx->n_threads,
                entries[i].get(),
                ctx->image_embd_v.data() + i*n_mmproj_embd*n_tokens_per_image);
        }
    } else {
        ok = clip_image_batch_encode(
-            ctx->ctx_clip,
+            ctx_clip,
            ctx->n_threads,
            &image_tokens->batch_f32,
            ctx->image_embd_v.data());
@@ -641,8 +803,7 @@ float * mtmd_get_output_embd(mtmd_context * ctx) {
 }
 
 bool mtmd_decode_use_non_causal(mtmd_context * ctx) {
-    projector_type proj_type = clip_get_projector_type(ctx->ctx_clip);
-    if (proj_type == PROJECTOR_TYPE_GEMMA3) {
+    if (ctx->ctx_v && clip_get_projector_type(ctx->ctx_v) == PROJECTOR_TYPE_GEMMA3) {
        return true;
    }
    return false;
@@ -653,60 +814,19 @@ bool mtmd_decode_use_mrope(mtmd_context * ctx) {
 }
 
 bool mtmd_support_vision(mtmd_context * ctx) {
-    return ctx->has_vision;
+    return ctx->ctx_v != nullptr;
 }
 
 bool mtmd_support_audio(mtmd_context * ctx) {
-    return ctx->has_audio;
-}
-
-// these 2 helpers below use internal clip_image_u8_ptr,
-// so unfortunately they cannot moved to mtmd-helper.h
-// however, in theory, user can decode image file to bitmap using
-// whichever library they want, and then use mtmd_bitmap_init() to create bitmap
-
-mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(const unsigned char * buf, size_t len) {
-    if (audio_helpers::is_audio_file((const char *)buf, len)) {
-        std::vector<float> pcmf32;
-        if (!audio_helpers::decode_audio_from_buf(buf, len, COMMON_SAMPLE_RATE, pcmf32)) {
-            LOG_ERR("Unable to read WAV audio file from buffer\n");
-            return nullptr;
-        }
-        return mtmd_bitmap_init_from_audio(pcmf32.size(), pcmf32.data());
-    }
-
-    clip_image_u8_ptr img_u8(clip_image_u8_init());
-    bool ok = clip_image_load_from_bytes(buf, len, img_u8.get());
-    if (!ok) {
-        LOG_ERR("Unable to load image from buffer\n");
-        return nullptr;
-    }
-    uint32_t nx, ny;
-    unsigned char * data = clip_image_u8_get_data(img_u8.get(), &nx, &ny);
-    return mtmd_bitmap_init(nx, ny, data);
+    return ctx->ctx_a != nullptr;
 }
 
-mtmd_bitmap * mtmd_helper_bitmap_init_from_file(const char * fname) {
-    std::vector<unsigned char> buf;
-    FILE * f = fopen(fname, "rb");
-    if (!f) {
-        LOG_ERR("Unable to open file %s: %s\n", fname, strerror(errno));
-        return nullptr;
+int mtmd_get_audio_bitrate(mtmd_context * ctx) {
+    if (!ctx->ctx_a) {
+        return -1;
    }
-
-    fseek(f, 0, SEEK_END);
-    long file_size = ftell(f);
-    fseek(f, 0, SEEK_SET);
-    buf.resize(file_size);
-
-    size_t n_read = fread(buf.data(), 1, file_size, f);
-    fclose(f);
-    if (n_read != (size_t)file_size) {
-        LOG_ERR("Failed to read entire file %s", fname);
-        return nullptr;
-    }
-
-    return mtmd_helper_bitmap_init_from_buf(buf.data(), buf.size());
+    // for now, we assume that all audio models have the same bitrate
+    return 16000; // 16kHz
 }
 
 //
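
Note on the rewritten tokenizer above: the new mtmd_tokenizer splits the prompt on the media marker, keeps each marker occurrence as its own part, and consumes one bitmap (image or audio) per marker, rather than the old approach of rewriting the prompt with string_replace_all. Below is a minimal, self-contained C++ sketch of that splitting step; it mirrors the contract of split_text() in the diff (default marker "<__media__>") but is an illustration, not code shipped in the package. The name split_keep_delim is our own.

#include <cstdio>
#include <string>
#include <vector>

// Split `input` on `delimiter`, keeping the delimiter itself as a part, e.g.
// "a <__media__> b" -> {"a ", "<__media__>", " b"} (same contract as split_text above).
static std::vector<std::string> split_keep_delim(const std::string & input, const std::string & delimiter) {
    std::vector<std::string> result;
    size_t start = 0;
    size_t pos   = 0;
    while ((pos = input.find(delimiter, start)) != std::string::npos) {
        if (pos > start) {
            result.push_back(input.substr(start, pos - start)); // text before the marker
        }
        result.push_back(delimiter); // the marker becomes its own part
        start = pos + delimiter.length();
    }
    if (start < input.length()) {
        result.push_back(input.substr(start)); // trailing text
    }
    return result;
}

int main() {
    const std::string marker = "<__media__>";
    // each marker part is where the tokenizer consumes the next bitmap;
    // every other part is tokenized as plain text
    for (const auto & part : split_keep_delim("describe <__media__> versus <__media__>", marker)) {
        std::printf("%s | %s\n", part == marker ? "media" : "text ", part.c_str());
    }
    return 0;
}

Because each marker part consumes exactly one bitmap in order, tokenize() can report an error whenever the number of bitmaps passed in does not match the number of markers found in the prompt.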