cui-llama.rn 1.6.0 → 1.7.0

Files changed (285)
  1. package/README.md +35 -7
  2. package/android/src/main/CMakeLists.txt +22 -11
  3. package/android/src/main/java/com/rnllama/LlamaContext.java +42 -6
  4. package/android/src/main/java/com/rnllama/RNLlama.java +139 -4
  5. package/android/src/main/jni.cpp +173 -18
  6. package/android/src/main/jniLibs/arm64-v8a/librnllama.so +0 -0
  7. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8.so +0 -0
  8. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2.so +0 -0
  9. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod.so +0 -0
  10. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod_i8mm.so +0 -0
  11. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_i8mm.so +0 -0
  12. package/android/src/main/jniLibs/x86_64/librnllama.so +0 -0
  13. package/android/src/main/jniLibs/x86_64/librnllama_x86_64.so +0 -0
  14. package/android/src/newarch/java/com/rnllama/RNLlamaModule.java +24 -4
  15. package/android/src/oldarch/java/com/rnllama/RNLlamaModule.java +22 -2
  16. package/cpp/LICENSE +21 -0
  17. package/cpp/chat.cpp +129 -107
  18. package/cpp/chat.h +2 -0
  19. package/cpp/common.cpp +58 -78
  20. package/cpp/common.h +29 -21
  21. package/cpp/ggml-alloc.c +4 -1
  22. package/cpp/ggml-backend.cpp +9 -5
  23. package/cpp/ggml-backend.h +4 -4
  24. package/cpp/ggml-cpp.h +1 -1
  25. package/cpp/ggml-cpu/amx/amx.cpp +221 -0
  26. package/cpp/ggml-cpu/amx/amx.h +8 -0
  27. package/cpp/ggml-cpu/amx/common.h +91 -0
  28. package/cpp/ggml-cpu/amx/mmq.cpp +2511 -0
  29. package/cpp/ggml-cpu/amx/mmq.h +10 -0
  30. package/{ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers → cpp/ggml-cpu}/binary-ops.h +1 -1
  31. package/cpp/ggml-cpu/common.h +72 -0
  32. package/cpp/{ggml-cpu-aarch64.cpp → ggml-cpu/ggml-cpu-aarch64.cpp} +809 -103
  33. package/cpp/{ggml-cpu-quants.c → ggml-cpu/ggml-cpu-quants.c} +306 -6
  34. package/cpp/{ggml-cpu.c → ggml-cpu/ggml-cpu.c} +114 -55
  35. package/cpp/{ggml-cpu.cpp → ggml-cpu/ggml-cpu.cpp} +32 -16
  36. package/cpp/{ops.cpp → ggml-cpu/ops.cpp} +353 -173
  37. package/{ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers → cpp/ggml-cpu}/ops.h +2 -20
  38. package/cpp/{sgemm.cpp → ggml-cpu/sgemm.cpp} +501 -0
  39. package/{ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers → cpp/ggml-cpu}/simd-mappings.h +7 -3
  40. package/{ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers → cpp/ggml-cpu}/unary-ops.h +1 -1
  41. package/cpp/{vec.cpp → ggml-cpu/vec.cpp} +0 -6
  42. package/{ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers → cpp/ggml-cpu}/vec.h +16 -0
  43. package/cpp/ggml-cpu.h +5 -0
  44. package/cpp/ggml-impl.h +16 -9
  45. package/cpp/ggml-llama-sim.metallib +0 -0
  46. package/cpp/ggml-llama.metallib +0 -0
  47. package/cpp/ggml-metal-impl.h +36 -11
  48. package/cpp/ggml-metal.m +810 -176
  49. package/cpp/ggml-opt.cpp +373 -190
  50. package/cpp/ggml-opt.h +49 -28
  51. package/cpp/ggml-quants.c +0 -6
  52. package/cpp/ggml.c +227 -282
  53. package/cpp/ggml.h +82 -101
  54. package/cpp/gguf.cpp +33 -33
  55. package/cpp/json-schema-to-grammar.cpp +3 -0
  56. package/cpp/llama-adapter.cpp +6 -0
  57. package/cpp/llama-arch.cpp +49 -17
  58. package/cpp/llama-arch.h +9 -0
  59. package/cpp/llama-batch.cpp +8 -2
  60. package/cpp/llama-batch.h +2 -1
  61. package/cpp/llama-chat.cpp +39 -16
  62. package/cpp/llama-chat.h +4 -2
  63. package/cpp/llama-context.cpp +440 -611
  64. package/cpp/llama-context.h +44 -33
  65. package/cpp/llama-cparams.h +1 -0
  66. package/cpp/llama-graph.cpp +214 -291
  67. package/cpp/llama-graph.h +69 -21
  68. package/cpp/llama-hparams.cpp +17 -1
  69. package/cpp/llama-hparams.h +39 -5
  70. package/cpp/llama-kv-cache.cpp +2067 -620
  71. package/cpp/llama-kv-cache.h +410 -108
  72. package/cpp/llama-memory.h +12 -1
  73. package/cpp/llama-model-loader.cpp +24 -15
  74. package/cpp/llama-model-saver.cpp +281 -0
  75. package/cpp/llama-model-saver.h +37 -0
  76. package/cpp/llama-model.cpp +1089 -359
  77. package/cpp/llama-model.h +19 -3
  78. package/cpp/llama-sampling.cpp +20 -7
  79. package/cpp/llama-vocab.cpp +54 -9
  80. package/cpp/llama-vocab.h +6 -0
  81. package/cpp/llama.cpp +14 -0
  82. package/cpp/llama.h +86 -142
  83. package/cpp/minja/chat-template.hpp +9 -5
  84. package/cpp/minja/minja.hpp +69 -36
  85. package/cpp/rn-llama.cpp +602 -190
  86. package/cpp/rn-llama.h +34 -8
  87. package/cpp/sampling.cpp +57 -50
  88. package/cpp/tools/mtmd/clip-impl.h +462 -0
  89. package/cpp/tools/mtmd/clip.cpp +4024 -0
  90. package/cpp/tools/mtmd/clip.h +101 -0
  91. package/cpp/tools/mtmd/miniaudio.h +93468 -0
  92. package/cpp/tools/mtmd/mtmd-audio.cpp +855 -0
  93. package/cpp/tools/mtmd/mtmd-audio.h +62 -0
  94. package/cpp/tools/mtmd/mtmd-helper.cpp +297 -0
  95. package/cpp/tools/mtmd/mtmd.cpp +942 -0
  96. package/cpp/tools/mtmd/mtmd.h +362 -0
  97. package/cpp/tools/mtmd/stb_image.h +7988 -0
  98. package/ios/CMakeLists.txt +20 -10
  99. package/ios/RNLlama.h +6 -0
  100. package/ios/RNLlama.mm +82 -3
  101. package/ios/RNLlamaContext.h +5 -1
  102. package/ios/RNLlamaContext.mm +131 -38
  103. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/chat.h +2 -0
  104. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/common.h +29 -21
  105. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-backend.h +4 -4
  106. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpp.h +1 -1
  107. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu.h +5 -0
  108. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-impl.h +16 -9
  109. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-metal-impl.h +36 -11
  110. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-opt.h +49 -28
  111. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml.h +82 -101
  112. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-arch.h +9 -0
  113. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-batch.h +2 -1
  114. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-chat.h +4 -2
  115. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-context.h +44 -33
  116. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-cparams.h +1 -0
  117. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-graph.h +69 -21
  118. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-hparams.h +39 -5
  119. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-kv-cache.h +410 -108
  120. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-memory.h +12 -1
  121. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-model-saver.h +37 -0
  122. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-model.h +19 -3
  123. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-vocab.h +6 -0
  124. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama.h +86 -142
  125. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/minja/chat-template.hpp +9 -5
  126. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/minja/minja.hpp +69 -36
  127. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/rn-llama.h +34 -8
  128. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Info.plist +0 -0
  129. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/ggml-llama.metallib +0 -0
  130. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/rnllama +0 -0
  131. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/chat.h +2 -0
  132. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/common.h +29 -21
  133. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-backend.h +4 -4
  134. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpp.h +1 -1
  135. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu.h +5 -0
  136. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-impl.h +16 -9
  137. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-metal-impl.h +36 -11
  138. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-opt.h +49 -28
  139. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml.h +82 -101
  140. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-arch.h +9 -0
  141. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-batch.h +2 -1
  142. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-chat.h +4 -2
  143. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-context.h +44 -33
  144. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-cparams.h +1 -0
  145. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-graph.h +69 -21
  146. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-hparams.h +39 -5
  147. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache.h +410 -108
  148. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory.h +12 -1
  149. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model-saver.h +37 -0
  150. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model.h +19 -3
  151. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-vocab.h +6 -0
  152. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama.h +86 -142
  153. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/minja/chat-template.hpp +9 -5
  154. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/minja/minja.hpp +69 -36
  155. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/rn-llama.h +34 -8
  156. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Info.plist +0 -0
  157. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/_CodeSignature/CodeResources +1 -1
  158. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/ggml-llama-sim.metallib +0 -0
  159. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/rnllama +0 -0
  160. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/chat.h +2 -0
  161. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/common.h +29 -21
  162. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-backend.h +4 -4
  163. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpp.h +1 -1
  164. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu.h +5 -0
  165. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-impl.h +16 -9
  166. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-metal-impl.h +36 -11
  167. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-opt.h +49 -28
  168. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml.h +82 -101
  169. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-arch.h +9 -0
  170. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-batch.h +2 -1
  171. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-chat.h +4 -2
  172. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-context.h +44 -33
  173. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-cparams.h +1 -0
  174. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-graph.h +69 -21
  175. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-hparams.h +39 -5
  176. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-kv-cache.h +410 -108
  177. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-memory.h +12 -1
  178. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-model-saver.h +37 -0
  179. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-model.h +19 -3
  180. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-vocab.h +6 -0
  181. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama.h +86 -142
  182. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/minja/chat-template.hpp +9 -5
  183. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/minja/minja.hpp +69 -36
  184. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/rn-llama.h +34 -8
  185. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Info.plist +0 -0
  186. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/ggml-llama.metallib +0 -0
  187. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/rnllama +0 -0
  188. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/chat.h +2 -0
  189. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/common.h +29 -21
  190. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-backend.h +4 -4
  191. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpp.h +1 -1
  192. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu.h +5 -0
  193. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-impl.h +16 -9
  194. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-metal-impl.h +36 -11
  195. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-opt.h +49 -28
  196. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml.h +82 -101
  197. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-arch.h +9 -0
  198. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-batch.h +2 -1
  199. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-chat.h +4 -2
  200. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-context.h +44 -33
  201. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-cparams.h +1 -0
  202. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-graph.h +69 -21
  203. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-hparams.h +39 -5
  204. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache.h +410 -108
  205. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory.h +12 -1
  206. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model-saver.h +37 -0
  207. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model.h +19 -3
  208. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-vocab.h +6 -0
  209. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama.h +86 -142
  210. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/minja/chat-template.hpp +9 -5
  211. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/minja/minja.hpp +69 -36
  212. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/rn-llama.h +34 -8
  213. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Info.plist +0 -0
  214. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/_CodeSignature/CodeResources +1 -1
  215. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/ggml-llama-sim.metallib +0 -0
  216. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/rnllama +0 -0
  217. package/jest/mock.js +33 -7
  218. package/lib/commonjs/NativeRNLlama.js.map +1 -1
  219. package/lib/commonjs/index.js +153 -21
  220. package/lib/commonjs/index.js.map +1 -1
  221. package/lib/module/NativeRNLlama.js.map +1 -1
  222. package/lib/module/index.js +152 -20
  223. package/lib/module/index.js.map +1 -1
  224. package/lib/typescript/NativeRNLlama.d.ts +54 -4
  225. package/lib/typescript/NativeRNLlama.d.ts.map +1 -1
  226. package/lib/typescript/index.d.ts +72 -6
  227. package/lib/typescript/index.d.ts.map +1 -1
  228. package/package.json +1 -1
  229. package/src/NativeRNLlama.ts +72 -4
  230. package/src/index.ts +212 -38
  231. package/cpp/binary-ops.h +0 -16
  232. package/cpp/ops.h +0 -128
  233. package/cpp/simd-mappings.h +0 -888
  234. package/cpp/unary-ops.h +0 -28
  235. package/cpp/vec.h +0 -802
  236. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/binary-ops.h +0 -16
  237. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu-aarch64.h +0 -8
  238. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu-impl.h +0 -512
  239. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu-quants.h +0 -63
  240. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu-traits.h +0 -38
  241. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ops.h +0 -128
  242. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/sgemm.h +0 -14
  243. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/simd-mappings.h +0 -888
  244. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/vec.h +0 -802
  245. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-aarch64.h +0 -8
  246. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-impl.h +0 -512
  247. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-quants.h +0 -63
  248. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-traits.h +0 -38
  249. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/sgemm.h +0 -14
  250. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/unary-ops.h +0 -28
  251. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/vec.h +0 -802
  252. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/binary-ops.h +0 -16
  253. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu-aarch64.h +0 -8
  254. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu-impl.h +0 -512
  255. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu-quants.h +0 -63
  256. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu-traits.h +0 -38
  257. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ops.h +0 -128
  258. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/sgemm.h +0 -14
  259. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/simd-mappings.h +0 -888
  260. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/unary-ops.h +0 -28
  261. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/binary-ops.h +0 -16
  262. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-aarch64.h +0 -8
  263. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-impl.h +0 -512
  264. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-quants.h +0 -63
  265. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-traits.h +0 -38
  266. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ops.h +0 -128
  267. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/sgemm.h +0 -14
  268. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/simd-mappings.h +0 -888
  269. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/unary-ops.h +0 -28
  270. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/vec.h +0 -802
  271. package/lib/commonjs/chat.js +0 -37
  272. package/lib/commonjs/chat.js.map +0 -1
  273. package/lib/module/chat.js +0 -33
  274. package/lib/module/chat.js.map +0 -1
  275. package/lib/typescript/chat.d.ts +0 -10
  276. package/lib/typescript/chat.d.ts.map +0 -1
  277. package/src/chat.ts +0 -44
  278. /package/cpp/{binary-ops.cpp → ggml-cpu/binary-ops.cpp} +0 -0
  279. /package/cpp/{ggml-cpu-aarch64.h → ggml-cpu/ggml-cpu-aarch64.h} +0 -0
  280. /package/cpp/{ggml-cpu-impl.h → ggml-cpu/ggml-cpu-impl.h} +0 -0
  281. /package/cpp/{ggml-cpu-quants.h → ggml-cpu/ggml-cpu-quants.h} +0 -0
  282. /package/cpp/{ggml-cpu-traits.cpp → ggml-cpu/ggml-cpu-traits.cpp} +0 -0
  283. /package/cpp/{ggml-cpu-traits.h → ggml-cpu/ggml-cpu-traits.h} +0 -0
  284. /package/cpp/{sgemm.h → ggml-cpu/sgemm.h} +0 -0
  285. /package/cpp/{unary-ops.cpp → ggml-cpu/unary-ops.cpp} +0 -0
package/cpp/tools/mtmd/mtmd.cpp
@@ -0,0 +1,942 @@
+#include "clip.h"
+#include "clip-impl.h"
+#include "mtmd.h"
+#include "mtmd-audio.h"
+
+#include "llama.h"
+
+#include <algorithm>
+#include <cerrno>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <limits>
+#include <vector>
+
+// represents raw image data, layout is RGBRGBRGB...
+// length of data must be nx * ny * 3
+struct mtmd_bitmap {
+    uint32_t nx;
+    uint32_t ny;
+    std::vector<unsigned char> data;
+    std::string id; // optional user-defined id, for ex: can be set to image hash, useful for KV cache tracking
+    bool is_audio = false; // true if the bitmap is audio
+};
+
+struct mtmd_image_tokens {
+    uint32_t nx; // number of tokens in x direction
+    uint32_t ny; // number of tokens in y direction
+    bool use_mrope_pos = false; // use M-RoPE position counting (the whole image is 1 temporal position)
+    uint32_t n_tokens() const { return nx * ny; }
+    clip_image_f32_batch batch_f32; // preprocessed image patches
+    std::string id; // optional user-defined ID, useful for KV cache tracking
+
+    mtmd_image_tokens clone() {
+        return mtmd_image_tokens{
+            nx,
+            ny,
+            use_mrope_pos,
+            batch_f32.clone(),
+            id
+        };
+    }
+};
+using mtmd_image_tokens_ptr = std::unique_ptr<mtmd_image_tokens>;
+
+struct mtmd_audio_tokens {
+    uint32_t n_tokens; // number of tokens
+    clip_image_f32_batch batch_f32; // preprocessed image patches
+    std::string id; // optional user-defined ID, useful for KV cache tracking
+
+    mtmd_audio_tokens clone() {
+        return mtmd_audio_tokens{
+            n_tokens,
+            batch_f32.clone(),
+            id
+        };
+    }
+};
+using mtmd_audio_tokens_ptr = std::unique_ptr<mtmd_audio_tokens>;
+
+struct mtmd_input_chunk {
+    mtmd_input_chunk_type type;
+    std::vector<llama_token> tokens_text;
+    mtmd_image_tokens_ptr tokens_image;
+    mtmd_audio_tokens_ptr tokens_audio;
+};
+
+struct mtmd_input_chunks {
+    std::vector<mtmd_input_chunk> entries;
+};
+
+// slice template, used by some llava-uhd models to correctly place the special tokens around image embeddings
+// models not having it (llava-1.6) will process embeddings without any special tokens in-between
+enum mtmd_slice_tmpl {
+    MTMD_SLICE_TMPL_NONE,
+    MTMD_SLICE_TMPL_MINICPMV_2_5,
+    MTMD_SLICE_TMPL_MINICPMV_2_6,
+    MTMD_SLICE_TMPL_LLAMA4,
+    // TODO @ngxson : add support for idefics (SmolVLM)
+};
+
+const char * mtmd_default_marker() {
+    return "<__media__>";
+}
+
+mtmd_context_params mtmd_context_params_default() {
+    mtmd_context_params params;
+    params.use_gpu = true;
+    params.print_timings = true;
+    params.n_threads = 4;
+    params.verbosity = LM_GGML_LOG_LEVEL_INFO;
+    params.image_marker = MTMD_DEFAULT_IMAGE_MARKER;
+    params.media_marker = mtmd_default_marker();
+    return params;
+}
+
+struct mtmd_context {
+    struct clip_ctx * ctx_clip;
+    const struct llama_model * text_model;
+    std::vector<float> image_embd_v; // image embedding vector
+
+    bool print_timings;
+    int n_threads;
+    std::string media_marker;
+    bool has_vision;
+    bool has_audio;
+
+    // for llava-uhd style models, we need special tokens in-between slices
+    // minicpmv calls them "slices", llama 4 calls them "tiles"
+    mtmd_slice_tmpl slice_tmpl    = MTMD_SLICE_TMPL_NONE;
+    llama_token tok_ov_img_start  = LLAMA_TOKEN_NULL; // overview image
+    llama_token tok_ov_img_end    = LLAMA_TOKEN_NULL; // overview image
+    llama_token tok_slices_start  = LLAMA_TOKEN_NULL; // start of all slices
+    llama_token tok_slices_end    = LLAMA_TOKEN_NULL; // end of all slices
+    llama_token tok_sli_img_start = LLAMA_TOKEN_NULL; // single slice start
+    llama_token tok_sli_img_end   = LLAMA_TOKEN_NULL; // single slice end
+    llama_token tok_sli_img_mid   = LLAMA_TOKEN_NULL; // between 2 slices
+    llama_token tok_row_end       = LLAMA_TOKEN_NULL; // end of row
+    bool tok_row_end_trail = false;
+    bool ov_img_first      = false;
+
+    bool use_mrope = false; // for Qwen2VL, we need to use M-RoPE
+
+    // for whisper, we pre-calculate the mel filter bank
+    whisper_preprocessor::whisper_filters w_filters;
+
+    // TODO @ngxson : add timings
+
+    mtmd_context(const char * mmproj_fname,
+                 const llama_model * text_model,
+                 const mtmd_context_params & ctx_params) :
+        text_model   (text_model),
+        print_timings(ctx_params.print_timings),
+        n_threads    (ctx_params.n_threads),
+        media_marker (ctx_params.media_marker)
+    {
+        if (std::string(ctx_params.image_marker) != MTMD_DEFAULT_IMAGE_MARKER) {
+            throw std::runtime_error("custom image_marker is not supported anymore, use media_marker instead");
+        }
+
+        clip_context_params ctx_clip_params;
+        ctx_clip_params.use_gpu   = ctx_params.use_gpu;
+        ctx_clip_params.verbosity = ctx_params.verbosity;
+        ctx_clip = clip_init(mmproj_fname, ctx_clip_params);
+        if (!ctx_clip) {
+            throw std::runtime_error(string_format("Failed to load CLIP model from %s\n", mmproj_fname));
+        }
+
+        has_vision = clip_has_vision_encoder(ctx_clip);
+        has_audio  = clip_has_audio_encoder(ctx_clip);
+        use_mrope  = clip_is_qwen2vl(ctx_clip);
+
+        projector_type proj = clip_get_projector_type(ctx_clip);
+        int minicpmv_version = clip_is_minicpmv(ctx_clip);
+        if (minicpmv_version == 2) {
+            // minicpmv 2.5 format:
+            // <image> (overview) </image><slice><image> (slice) </image><image> (slice) </image>\n ... </slice>
+            slice_tmpl        = MTMD_SLICE_TMPL_MINICPMV_2_5;
+            tok_ov_img_start  = lookup_token("<image>");
+            tok_ov_img_end    = lookup_token("</image>");
+            tok_slices_start  = lookup_token("<slice>");
+            tok_slices_end    = lookup_token("</slice>");
+            tok_sli_img_start = tok_ov_img_start;
+            tok_sli_img_end   = tok_ov_img_end;
+            tok_row_end       = lookup_token("\n");
+            tok_row_end_trail = false; // no trailing end-of-row token
+            ov_img_first      = true;
+
+        } else if (minicpmv_version == 3 || minicpmv_version == 4) {
+            // minicpmv 2.6 format:
+            // <image> (overview) </image><slice> (slice) </slice><slice> (slice) </slice>\n ...
+            slice_tmpl        = MTMD_SLICE_TMPL_MINICPMV_2_6;
+            tok_ov_img_start  = lookup_token("<image>");
+            tok_ov_img_end    = lookup_token("</image>");
+            tok_sli_img_start = lookup_token("<slice>");
+            tok_sli_img_end   = lookup_token("</slice>");
+            tok_row_end       = lookup_token("\n");
+            tok_row_end_trail = false; // no trailing end-of-row token
+            ov_img_first      = true;
+
+        } else if (minicpmv_version != 0) {
+            LM_GGML_ASSERT(false && "unsupported minicpmv version");
+        } else if (proj == PROJECTOR_TYPE_LLAMA4) {
+            // llama 4 format:
+            // <|image_start|>
+            //     (slice) <|tile_x_separator|> (slice) <|tile_x_separator|> ... <|tile_y_separator|>
+            //     (slice) <|tile_x_separator|> (slice) <|tile_x_separator|> ... <|tile_y_separator|>
+            //     ... <|tile_y_separator|>   <-- trailing end-of-row token
+            //     <|image|> (overview)       <-- overview image is last
+            // <|image_end|>
+            slice_tmpl        = MTMD_SLICE_TMPL_LLAMA4;
+            tok_ov_img_start  = lookup_token("<|image|>");
+            tok_sli_img_mid   = lookup_token("<|tile_x_separator|>");
+            tok_row_end       = lookup_token("<|tile_y_separator|>");
+            tok_row_end_trail = true; // add trailing end-of-row token
+            ov_img_first      = false; // overview image is last
+        }
+
+        if (proj == PROJECTOR_TYPE_ULTRAVOX) {
+            // TODO @ngxson : check if model n_mel is 128 or 80
+            w_filters = whisper_precalc_filters::get_128_bins();
+        }
+
+        // warning messages
+        if (proj == PROJECTOR_TYPE_LLAMA4) {
+            LOG_WRN("%s: llama 4 vision is known to have degraded quality:\n"
+                    "    https://github.com/ggml-org/llama.cpp/pull/13282\n", __func__);
+        }
+        if (has_audio) {
+            LOG_WRN("%s: audio input is in experimental stage and may have reduced quality:\n"
+                    "    https://github.com/ggml-org/llama.cpp/pull/13623\n", __func__);
+        }
+    }
+
+    ~mtmd_context() {
+        clip_free(ctx_clip);
+    }
+
+private:
+    llama_token lookup_token(const std::string & token_text) {
+        const llama_vocab * vocab = llama_model_get_vocab(text_model);
+        const int n_vocab = llama_vocab_n_tokens(vocab);
+        for (int i = 0; i < n_vocab; i++) {
+            if (token_to_piece(vocab, i, true) == token_text) {
+                return i;
+            }
+        }
+        return LLAMA_TOKEN_NULL;
+    }
+
+    std::string token_to_piece(const llama_vocab * vocab, llama_token token, bool special) {
+        std::string piece;
+        piece.resize(piece.capacity()); // using string internal cache, 15 bytes + '\0'
+        const int n_chars = llama_token_to_piece(vocab, token, &piece[0], piece.size(), 0, special);
+        if (n_chars < 0) {
+            piece.resize(-n_chars);
+            int check = llama_token_to_piece(vocab, token, &piece[0], piece.size(), 0, special);
+            LM_GGML_ASSERT(check == -n_chars);
+        } else {
+            piece.resize(n_chars);
+        }
+        return piece;
+    }
+};
+
+mtmd_context * mtmd_init_from_file(const char * mmproj_fname,
+        const struct llama_model * text_model,
+        const struct mtmd_context_params ctx_params) {
+    try {
+        return new mtmd_context(mmproj_fname, text_model, ctx_params);
+    } catch (const std::exception & e) {
+        LOG_ERR("%s: error: %s\n", __func__, e.what());
+        return nullptr;
+    }
+}
+
+void mtmd_free(mtmd_context * ctx) {
+    if (ctx) {
+        delete ctx;
+    }
+}
+
+// copied from common_tokenize
+static std::vector<llama_token> mtmd_tokenize_text_internal(
+    const struct llama_vocab * vocab,
+           const std::string & text,
+                         bool   add_special,
+                         bool   parse_special) {
+    // upper limit for the number of tokens
+    int n_tokens = text.length() + 2 * add_special;
+    std::vector<llama_token> result(n_tokens);
+    n_tokens = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
+    if (n_tokens < 0) {
+        result.resize(-n_tokens);
+        int check = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
+        LM_GGML_ASSERT(check == -n_tokens);
+    } else {
+        result.resize(n_tokens);
+    }
+    return result;
+}
+
+int32_t mtmd_tokenize(mtmd_context * ctx,
+                      mtmd_input_chunks * output,
+                      const mtmd_input_text * text,
+                      const mtmd_bitmap ** bitmaps,
+                      size_t n_bitmaps) {
+    auto vocab = llama_model_get_vocab(ctx->text_model);
+
+    std::string prompt_modified(text->text);
+    std::string marker_modified(ctx->media_marker);
+    projector_type proj_type = clip_get_projector_type(ctx->ctx_clip);
+
+    // for compatibility, we convert image marker to media marker
+    string_replace_all(prompt_modified, MTMD_DEFAULT_IMAGE_MARKER, ctx->media_marker);
+
+    // a bit hacky here, but works for now
+    // for some models, we need to add prefix and suffix to the image embeddings
+    if (clip_is_gemma3(ctx->ctx_clip)) {
+        // gemma 3
+        // <start_of_image> ... (image embeddings) ... <end_of_image>
+        marker_modified = "<start_of_image>" + ctx->media_marker + "<end_of_image>";
+        string_replace_all(prompt_modified, ctx->media_marker, marker_modified);
+
+    } else if (proj_type == PROJECTOR_TYPE_IDEFICS3) {
+        // https://github.com/huggingface/transformers/blob/a42ba80fa520c784c8f11a973ca9034e5f859b79/src/transformers/models/idefics3/processing_idefics3.py#L192-L215
+        marker_modified = "<fake_token_around_image><global-img>" + ctx->media_marker + "<fake_token_around_image>";
+        string_replace_all(prompt_modified, ctx->media_marker, marker_modified);
+
+    } else if (proj_type == PROJECTOR_TYPE_PIXTRAL) {
+        // https://github.com/huggingface/transformers/blob/1cd110c6cb6a6237614130c470e9a902dbc1a4bd/docs/source/en/model_doc/pixtral.md
+        marker_modified = ctx->media_marker + "[IMG_END]";
+        string_replace_all(prompt_modified, ctx->media_marker, marker_modified);
+
+    } else if (proj_type == PROJECTOR_TYPE_QWEN2VL || proj_type == PROJECTOR_TYPE_QWEN25VL) {
+        // <|vision_start|> ... (image embeddings) ... <|vision_end|>
+        marker_modified = "<|vision_start|>" + ctx->media_marker + "<|vision_end|>";
+        string_replace_all(prompt_modified, ctx->media_marker, marker_modified);
+
+    } else if (proj_type == PROJECTOR_TYPE_LLAMA4) {
+        // (more details in mtmd_context constructor)
+        marker_modified = "<|image_start|>" + ctx->media_marker + "<|image_end|>";
+        string_replace_all(prompt_modified, ctx->media_marker, marker_modified);
+
+    } else if (proj_type == PROJECTOR_TYPE_INTERNVL) {
+        // <img> ... (image embeddings) ... </img>
+        marker_modified = "<img>" + ctx->media_marker + "</img>";
+        string_replace_all(prompt_modified, ctx->media_marker, marker_modified);
+
+    }
+
+    // llava-1.5, llava-1.6, Yi-VL, Yi-34B, granite: don't need to add prefix and suffix
+    // for glm-edge, BOI and EOI token's embeddings are not present in the text model
+
+    std::vector<std::string> parts = string_split_str(prompt_modified, ctx->media_marker);
+    output->entries.clear();
+    output->entries.reserve(parts.size());
+
+    size_t i_bm = 0;
+
+    // utility for adding raw tokens
+    auto add_text_chunk = [&output](std::vector<llama_token> && tokens) {
+        mtmd_input_chunk chunk{
+            MTMD_INPUT_CHUNK_TYPE_TEXT,
+            std::move(tokens),
+            nullptr, // image tokens
+            nullptr, // audio tokens
+        };
+        output->entries.emplace_back(std::move(chunk));
+    };
+
+    // utility for splitting batch of multiple images into chunks of batch having single images
+    auto split_batch_to_chunk = [&ctx](clip_image_f32_batch && batch_f32, const std::string & id) {
+        std::vector<mtmd_input_chunk> chunks;
+
+        for (auto & entry : batch_f32.entries) {
+            mtmd_image_tokens_ptr image_tokens(new mtmd_image_tokens);
+            image_tokens->nx = clip_n_output_tokens(ctx->ctx_clip, entry.get());
+            image_tokens->ny = 1;
+            image_tokens->batch_f32.entries.push_back(std::move(entry));
+            image_tokens->id = id;
+
+            mtmd_input_chunk chunk{
+                MTMD_INPUT_CHUNK_TYPE_IMAGE,
+                {}, // text tokens
+                std::move(image_tokens),
+                nullptr, // audio tokens
+            };
+            chunks.emplace_back(std::move(chunk));
+        }
+
+        return chunks;
+    };
+
+    for (const auto & part : parts) {
+        // printf("tokenizing part: %s\n", part.c_str());
+        bool add_bos = &parts.front() == &part;
+        auto tokens = mtmd_tokenize_text_internal(vocab, part, text->add_special && add_bos, text->parse_special);
+        if (tokens.empty()) {
+            continue;
+        }
+        mtmd_input_chunk chunk{
+            MTMD_INPUT_CHUNK_TYPE_TEXT,
+            std::move(tokens),
+            nullptr, // image tokens
+            nullptr, // audio tokens
+        };
+        output->entries.emplace_back(std::move(chunk));
+
+        // only add image/audio tokens to middle of 2 parts
+        // therefore, we skip handling image/audio if this is the last part
+        if (&parts.back() == &part) {
+            continue;
+        }
+
+        if (!bitmaps[i_bm]->is_audio) {
+            // handle image
+
+            if (i_bm >= n_bitmaps) {
+                LOG_ERR("%s: error: not enough images for %d parts\n", __func__, (int)parts.size());
+                return 1;
+            }
+
+            if (!ctx->has_vision) {
+                LOG_ERR("%s: error: model does not support vision input\n", __func__);
+                return 2;
+            }
+
+            // convert mtmd_bitmap to clip_image_u8
+            clip_image_u8_ptr img_u8(clip_image_u8_init());
+            img_u8->nx = bitmaps[i_bm]->nx;
+            img_u8->ny = bitmaps[i_bm]->ny;
+            img_u8->buf.resize(bitmaps[i_bm]->data.size());
+            std::memcpy(img_u8->buf.data(), bitmaps[i_bm]->data.data(), img_u8->nx * img_u8->ny * 3);
+
+            // preprocess image
+            clip_image_f32_batch batch_f32;
+            bool ok = clip_image_preprocess(ctx->ctx_clip, img_u8.get(), &batch_f32);
+            if (!ok) {
+                LOG_ERR("Unable to preprocess image\n");
+                return 2;
+            }
+
+            // handle llava-uhd style preprocessing
+            if (
+                ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_5
+                || ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_6
+                || ctx->slice_tmpl == MTMD_SLICE_TMPL_LLAMA4
+            ) {
+                // split batch into chunks of single images
+                auto chunks = split_batch_to_chunk(std::move(batch_f32), bitmaps[i_bm]->id);
+                LM_GGML_ASSERT(chunks.size() > 0);
+
+                auto ov_chunk = std::move(chunks.front());
+                chunks.erase(chunks.begin());
+
+                // add overview image (first)
+                if (ctx->ov_img_first) {
+                    if (ctx->tok_ov_img_start != LLAMA_TOKEN_NULL) {
+                        add_text_chunk({ctx->tok_ov_img_start});
+                    }
+                    output->entries.emplace_back(std::move(ov_chunk));
+                    if (ctx->tok_ov_img_end != LLAMA_TOKEN_NULL) {
+                        add_text_chunk({ctx->tok_ov_img_end});
+                    }
+                }
+
+                // add slices (or tiles)
+                if (!chunks.empty()) {
+                    const int n_col = batch_f32.grid_x;
+                    const int n_row = batch_f32.grid_y;
+                    if (ctx->tok_slices_start != LLAMA_TOKEN_NULL) {
+                        add_text_chunk({ctx->tok_slices_start});
+                    }
+                    for (int y = 0; y < n_row; y++) {
+                        for (int x = 0; x < n_col; x++) {
+                            const bool is_last_in_row = (x == n_col - 1);
+                            if (ctx->tok_sli_img_start != LLAMA_TOKEN_NULL) {
+                                add_text_chunk({ctx->tok_sli_img_start});
+                            }
+                            output->entries.emplace_back(std::move(chunks[y * n_col + x]));
+                            if (ctx->tok_sli_img_end != LLAMA_TOKEN_NULL) {
+                                add_text_chunk({ctx->tok_sli_img_end});
+                            }
+                            if (!is_last_in_row && ctx->tok_sli_img_mid != LLAMA_TOKEN_NULL) {
+                                add_text_chunk({ctx->tok_sli_img_mid});
+                            }
+                        }
+                        if ((y != n_row - 1 || ctx->tok_row_end_trail) && ctx->tok_row_end != LLAMA_TOKEN_NULL) {
+                            add_text_chunk({ctx->tok_row_end});
+                        }
+                    }
+                    if (ctx->tok_slices_end != LLAMA_TOKEN_NULL) {
+                        add_text_chunk({ctx->tok_slices_end});
+                    }
+                }
+
+                // add overview image (last)
+                if (!ctx->ov_img_first) {
+                    if (ctx->tok_ov_img_start != LLAMA_TOKEN_NULL) {
+                        add_text_chunk({ctx->tok_ov_img_start});
+                    }
+                    output->entries.emplace_back(std::move(ov_chunk));
+                    if (ctx->tok_ov_img_end != LLAMA_TOKEN_NULL) {
+                        add_text_chunk({ctx->tok_ov_img_end});
+                    }
+                }
+
+            } else {
+                size_t n_tokens = 0;
+                for (const auto & entry : batch_f32.entries) {
+                    n_tokens += clip_n_output_tokens(ctx->ctx_clip, entry.get());
+                }
+
+                mtmd_image_tokens_ptr image_tokens(new mtmd_image_tokens);
+                if (ctx->use_mrope) {
+                    // for Qwen2VL, we need this information for M-RoPE decoding positions
+                    image_tokens->nx = clip_n_output_tokens_x(ctx->ctx_clip, batch_f32.entries[0].get());
+                    image_tokens->ny = clip_n_output_tokens_y(ctx->ctx_clip, batch_f32.entries[0].get());
+                    image_tokens->use_mrope_pos = true;
+                } else {
+                    // other models, we only need the total number of tokens
+                    image_tokens->nx = n_tokens;
+                    image_tokens->ny = 1;
+                }
+                image_tokens->batch_f32 = std::move(batch_f32);
+                image_tokens->id = bitmaps[i_bm]->id; // optional
+
+                LOG_DBG("image_tokens->nx = %d\n", image_tokens->nx);
+                LOG_DBG("image_tokens->ny = %d\n", image_tokens->ny);
+                LOG_DBG("batch_f32 size = %d\n", (int)image_tokens->batch_f32.entries.size());
+
+                mtmd_input_chunk chunk{
+                    MTMD_INPUT_CHUNK_TYPE_IMAGE,
+                    {}, // text tokens
+                    std::move(image_tokens),
+                    nullptr, // audio tokens
+                };
+                output->entries.emplace_back(std::move(chunk));
+            }
+
+            i_bm++; // move to next image
+            continue;
+
+        } else {
+            // handle audio
+
+            if (i_bm >= n_bitmaps) {
+                LOG_ERR("%s: error: not enough images for %d parts\n", __func__, (int)parts.size());
+                return 1;
+            }
+
+            if (!ctx->has_audio) {
+                LOG_ERR("%s: error: model does not support audio input\n", __func__);
+                return 2;
+            }
+
+            if (bitmaps[i_bm]->data.size() == 0) {
+                LOG_ERR("%s: error: empty audio data\n", __func__);
+                return 2;
+            }
+
+            // preprocess audio
+            LM_GGML_ASSERT(ctx->w_filters.n_mel); // make sure we have filter preloaded
+            std::vector<whisper_preprocessor::whisper_mel> mel_spec_chunks;
+            const float * samples = (const float *)bitmaps[i_bm]->data.data();
+            size_t n_samples = bitmaps[i_bm]->data.size() / sizeof(float);
+            bool ok = whisper_preprocessor::preprocess_audio(samples, n_samples, ctx->w_filters, mel_spec_chunks);
+            if (!ok) {
+                LOG_ERR("Unable to preprocess audio\n");
+                return 2;
+            }
+
+            // consider each mel_spec as a separate audio chunk
+            // TODO: maybe support batching, but this may come with memory cost
+            for (auto & mel_spec : mel_spec_chunks) {
+                clip_image_f32_ptr mel_f32(clip_image_f32_init());
+                mel_f32->nx  = mel_spec.n_len;
+                mel_f32->ny  = mel_spec.n_mel;
+                mel_f32->buf = std::move(mel_spec.data);
+                size_t n_tokens = clip_n_output_tokens(ctx->ctx_clip, mel_f32.get());
+
+                clip_image_f32_batch batch_f32;
+                batch_f32.is_audio = true;
+                batch_f32.entries.push_back(std::move(mel_f32));
+
+                mtmd_audio_tokens_ptr audio_tokens(new mtmd_audio_tokens);
+                audio_tokens->n_tokens  = n_tokens;
+                audio_tokens->batch_f32 = std::move(batch_f32);
+                audio_tokens->id        = bitmaps[i_bm]->id; // optional
+
+                LOG_DBG("audio_tokens->n_tokens = %d\n", audio_tokens->n_tokens);
+
+                mtmd_input_chunk chunk{
+                    MTMD_INPUT_CHUNK_TYPE_AUDIO,
+                    {}, // text tokens
+                    nullptr, // image tokens
+                    std::move(audio_tokens),
+                };
+                output->entries.emplace_back(std::move(chunk));
+            }
+
+            i_bm++;
+            continue;
+        }
+    }
+
+    return 0;
+}
+
+int32_t mtmd_encode_chunk(mtmd_context * ctx, const mtmd_input_chunk * chunk) {
+    if (chunk->type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
+        LOG_WRN("mtmd_encode_chunk has no effect for text chunks\n");
+        return 0;
+    } else if (chunk->type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
+        return mtmd_encode(ctx, chunk->tokens_image.get());
+    } else if (chunk->type == MTMD_INPUT_CHUNK_TYPE_AUDIO) {
+        int n_mmproj_embd = clip_n_mmproj_embd(ctx->ctx_clip);
+        ctx->image_embd_v.resize(chunk->tokens_audio->n_tokens * n_mmproj_embd);
+        bool ok = clip_image_batch_encode(
+            ctx->ctx_clip,
+            ctx->n_threads,
+            &chunk->tokens_audio->batch_f32,
+            ctx->image_embd_v.data());
+        return ok ? 0 : 1;
+    }
+
+    LOG_ERR("mtmd_encode_chunk: unknown chunk type %d\n", (int)chunk->type);
+    return 1;
+}
+
+int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens) {
+    int n_mmproj_embd = clip_n_mmproj_embd(ctx->ctx_clip);
+    ctx->image_embd_v.resize(image_tokens->n_tokens() * n_mmproj_embd);
+    bool ok = false;
+
+    if (clip_is_llava(ctx->ctx_clip) || clip_is_minicpmv(ctx->ctx_clip) || clip_is_glm(ctx->ctx_clip)) {
+        // TODO @ngxson : llava does not support batched encoding ; this should be fixed inside clip_image_batch_encode()
+        const auto & entries = image_tokens->batch_f32.entries;
+        for (size_t i = 0; i < entries.size(); i++) {
+            int n_tokens_per_image = clip_n_output_tokens(ctx->ctx_clip, entries[i].get());
+            ok = clip_image_encode(
+                ctx->ctx_clip,
+                ctx->n_threads,
+                entries[i].get(),
+                ctx->image_embd_v.data() + i*n_mmproj_embd*n_tokens_per_image);
+        }
+    } else {
+        ok = clip_image_batch_encode(
+            ctx->ctx_clip,
+            ctx->n_threads,
+            &image_tokens->batch_f32,
+            ctx->image_embd_v.data());
+    }
+
+    return ok ? 0 : 1;
+}
+
+float * mtmd_get_output_embd(mtmd_context * ctx) {
+    return ctx->image_embd_v.data();
+}
+
+bool mtmd_decode_use_non_causal(mtmd_context * ctx) {
+    projector_type proj_type = clip_get_projector_type(ctx->ctx_clip);
+    if (proj_type == PROJECTOR_TYPE_GEMMA3) {
+        return true;
+    }
+    return false;
+}
+
+bool mtmd_decode_use_mrope(mtmd_context * ctx) {
+    return ctx->use_mrope;
+}
+
+bool mtmd_support_vision(mtmd_context * ctx) {
+    return ctx->has_vision;
+}
+
+bool mtmd_support_audio(mtmd_context * ctx) {
+    return ctx->has_audio;
+}
+
+// these 2 helpers below use internal clip_image_u8_ptr,
+// so unfortunately they cannot be moved to mtmd-helper.h
+// however, in theory, user can decode image file to bitmap using
+// whichever library they want, and then use mtmd_bitmap_init() to create bitmap
+
+mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(const unsigned char * buf, size_t len) {
+    if (audio_helpers::is_audio_file((const char *)buf, len)) {
+        std::vector<float> pcmf32;
+        if (!audio_helpers::decode_audio_from_buf(buf, len, COMMON_SAMPLE_RATE, pcmf32)) {
+            LOG_ERR("Unable to read WAV audio file from buffer\n");
+            return nullptr;
+        }
+        return mtmd_bitmap_init_from_audio(pcmf32.size(), pcmf32.data());
+    }
+
+    clip_image_u8_ptr img_u8(clip_image_u8_init());
+    bool ok = clip_image_load_from_bytes(buf, len, img_u8.get());
+    if (!ok) {
+        LOG_ERR("Unable to load image from buffer\n");
+        return nullptr;
+    }
+    uint32_t nx, ny;
+    unsigned char * data = clip_image_u8_get_data(img_u8.get(), &nx, &ny);
+    return mtmd_bitmap_init(nx, ny, data);
+}
+
+mtmd_bitmap * mtmd_helper_bitmap_init_from_file(const char * fname) {
+    std::vector<unsigned char> buf;
+    FILE * f = fopen(fname, "rb");
+    if (!f) {
+        LOG_ERR("Unable to open file %s: %s\n", fname, strerror(errno));
+        return nullptr;
+    }
+
+    fseek(f, 0, SEEK_END);
+    long file_size = ftell(f);
+    fseek(f, 0, SEEK_SET);
+    buf.resize(file_size);
+
+    size_t n_read = fread(buf.data(), 1, file_size, f);
+    fclose(f);
+    if (n_read != (size_t)file_size) {
+        LOG_ERR("Failed to read entire file %s", fname);
+        return nullptr;
+    }
+
+    return mtmd_helper_bitmap_init_from_buf(buf.data(), buf.size());
+}
+
+//
+// public API functions
+//
+
+// mtmd_bitmap
+
+mtmd_bitmap * mtmd_bitmap_init(uint32_t nx,
+                               uint32_t ny,
+                               const unsigned char * data) {
+    mtmd_bitmap * bitmap = new mtmd_bitmap;
+    bitmap->nx = nx;
+    bitmap->ny = ny;
+    size_t data_size = (size_t)nx * ny * 3;
+    bitmap->data.resize(data_size);
+    std::memcpy(bitmap->data.data(), data, data_size);
+    return bitmap;
+}
+
+mtmd_bitmap * mtmd_bitmap_init_from_audio(size_t n_samples,
+                                          const float * data) {
+    mtmd_bitmap * bitmap = new mtmd_bitmap;
+    bitmap->nx = n_samples;
+    bitmap->ny = 1;
+    bitmap->is_audio = true;
+    size_t data_size = n_samples * sizeof(float);
+    bitmap->data.resize(data_size);
+    std::memcpy(bitmap->data.data(), data, data_size);
+    return bitmap;
+}
+
+uint32_t mtmd_bitmap_get_nx(const mtmd_bitmap * bitmap) {
+    return bitmap->nx;
+}
+
+uint32_t mtmd_bitmap_get_ny(const mtmd_bitmap * bitmap) {
+    return bitmap->ny;
+}
+
+const unsigned char * mtmd_bitmap_get_data(const mtmd_bitmap * bitmap) {
+    return bitmap->data.data();
+}
+
+size_t mtmd_bitmap_get_n_bytes(const mtmd_bitmap * bitmap) {
+    return bitmap->data.size();
+}
+
+bool mtmd_bitmap_is_audio(const mtmd_bitmap * bitmap) {
+    return bitmap->is_audio;
+}
+
+const char * mtmd_bitmap_get_id(const mtmd_bitmap * bitmap) {
+    return bitmap->id.c_str();
+}
+
+void mtmd_bitmap_set_id(mtmd_bitmap * bitmap, const char * id) {
+    if (id) {
+        bitmap->id = std::string(id);
+    } else {
+        bitmap->id.clear();
+    }
+}
+
+void mtmd_bitmap_free(mtmd_bitmap * bitmap) {
+    if (bitmap) {
+        delete bitmap;
+    }
+}
+
+// mtmd_input_chunks
+
+mtmd_input_chunks * mtmd_input_chunks_init() {
+    return new mtmd_input_chunks;
+}
+
+size_t mtmd_input_chunks_size(const mtmd_input_chunks * chunks) {
+    return chunks->entries.size();
+}
+
+const mtmd_input_chunk * mtmd_input_chunks_get(const mtmd_input_chunks * chunks, size_t idx) {
+    if (idx >= chunks->entries.size()) {
+        return nullptr;
+    }
+    return &chunks->entries[idx];
+}
+
+void mtmd_input_chunks_free(mtmd_input_chunks * chunks) {
+    if (chunks) {
+        delete chunks;
+    }
+}
+
+// mtmd_input_chunk
+
+enum mtmd_input_chunk_type mtmd_input_chunk_get_type(const mtmd_input_chunk * chunk) {
+    return chunk->type;
+}
+
+const llama_token * mtmd_input_chunk_get_tokens_text(const mtmd_input_chunk * chunk, size_t * n_tokens_output) {
+    if (chunk->type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
+        *n_tokens_output = chunk->tokens_text.size();
+        return chunk->tokens_text.data();
+    }
+    *n_tokens_output = 0;
+    return nullptr;
+}
+
+const mtmd_image_tokens * mtmd_input_chunk_get_tokens_image(const mtmd_input_chunk * chunk) {
+    if (chunk->type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
+        return chunk->tokens_image.get();
+    }
+    return nullptr;
+}
+
+size_t mtmd_input_chunk_get_n_tokens(const mtmd_input_chunk * chunk) {
+    if (chunk->type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
+        return chunk->tokens_text.size();
+    } else if (chunk->type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
+        return mtmd_image_tokens_get_n_tokens(chunk->tokens_image.get());
+    } else if (chunk->type == MTMD_INPUT_CHUNK_TYPE_AUDIO) {
+        return chunk->tokens_audio->n_tokens;
+    } else {
+        LM_GGML_ABORT("invalid chunk type");
+    }
+}
+
+llama_pos mtmd_input_chunk_get_n_pos(const mtmd_input_chunk * chunk) {
+    if (chunk->type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
+        return chunk->tokens_text.size();
+    } else if (chunk->type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
+        return mtmd_image_tokens_get_n_pos(chunk->tokens_image.get());
+    } else if (chunk->type == MTMD_INPUT_CHUNK_TYPE_AUDIO) {
+        return chunk->tokens_audio->n_tokens;
+    } else {
+        LM_GGML_ABORT("invalid chunk type");
+    }
+}
+
+const char * mtmd_input_chunk_get_id(const mtmd_input_chunk * chunk) {
+    if (chunk->type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
+        return chunk->tokens_image->id.c_str();
+    } else if (chunk->type == MTMD_INPUT_CHUNK_TYPE_AUDIO) {
+        return chunk->tokens_audio->id.c_str();
+    }
+    return nullptr;
+}
+
+mtmd_input_chunk * mtmd_input_chunk_copy(const mtmd_input_chunk * chunk) {
+    mtmd_input_chunk * copy = new mtmd_input_chunk{
+        chunk->type,
+        chunk->tokens_text,
+        nullptr,
+        nullptr,
+    };
+    if (chunk->tokens_image) {
+        // copy the image tokens
+        copy->tokens_image = mtmd_image_tokens_ptr(new mtmd_image_tokens());
+        *copy->tokens_image = chunk->tokens_image->clone();
+    }
+    if (chunk->tokens_audio) {
+        // copy the audio tokens
+        copy->tokens_audio = mtmd_audio_tokens_ptr(new mtmd_audio_tokens());
+        *copy->tokens_audio = chunk->tokens_audio->clone();
+    }
+    return copy;
+}
+
+void mtmd_input_chunk_free(mtmd_input_chunk * chunk) {
+    if (chunk) {
+        delete chunk;
+    }
+}
+
+// mtmd_image_tokens
+
+size_t mtmd_image_tokens_get_n_tokens(const mtmd_image_tokens * image_tokens) {
+    return image_tokens->n_tokens();
+}
+
+size_t mtmd_image_tokens_get_nx(const mtmd_image_tokens * image_tokens) {
+    return image_tokens->nx;
+}
+
+size_t mtmd_image_tokens_get_ny(const mtmd_image_tokens * image_tokens) {
+    return image_tokens->ny;
+}
+
+const char * mtmd_image_tokens_get_id(const mtmd_image_tokens * image_tokens) {
+    return image_tokens->id.c_str();
+}
+
+llama_pos mtmd_image_tokens_get_n_pos(const mtmd_image_tokens * image_tokens) {
+    if (image_tokens->use_mrope_pos) {
+        return 1; // for M-RoPE, the whole image is 1 in temporal dimension
+    }
+    return image_tokens->n_tokens();
+}
+
+// test function
+
+mtmd_input_chunks * mtmd_test_create_input_chunks() {
+    mtmd_input_chunks * chunks = mtmd_input_chunks_init();
+    if (!chunks) {
+        return nullptr;
+    }
+
+    // create a text chunk
+    std::vector<llama_token> tokens_text = { 1, 2, 3, 4, 5 };
+    mtmd_input_chunk chunk_text{
+        MTMD_INPUT_CHUNK_TYPE_TEXT,
+        std::move(tokens_text),
+        nullptr, // image tokens
+        nullptr, // audio tokens
+    };
+    chunks->entries.emplace_back(std::move(chunk_text));
+
+    // create an image chunk
+    mtmd_image_tokens_ptr image_tokens(new mtmd_image_tokens);
+    image_tokens->nx = 4;
+    image_tokens->ny = 4;
+    image_tokens->batch_f32.entries.resize(16);
+    image_tokens->id = "image_1";
+    mtmd_input_chunk chunk_image{
+        MTMD_INPUT_CHUNK_TYPE_IMAGE,
+        {}, // text tokens
+        std::move(image_tokens),
+        nullptr, // audio tokens
+    };
+    chunks->entries.emplace_back(std::move(chunk_image));
+
+    return chunks;
+}
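
For orientation, here is a minimal sketch of how a caller might drive the API introduced in this file, end to end: load a text model, create an mtmd context, tokenize a prompt containing the default media marker, then encode each non-text chunk. It uses only functions defined above, plus the mtmd_input_text struct and the model-loading calls from llama.h; the file paths and the final printf are placeholder assumptions for illustration, not part of the package.

#include "mtmd.h"
#include "llama.h"
#include <cstdio>

int main() {
    // load the text model (llama.h API; "model.gguf" is a placeholder path)
    llama_model_params mparams = llama_model_default_params();
    llama_model * model = llama_model_load_from_file("model.gguf", mparams);
    if (!model) return 1;

    // create the multimodal context from a projector file (placeholder path)
    mtmd_context_params params = mtmd_context_params_default();
    mtmd_context * ctx = mtmd_init_from_file("mmproj.gguf", model, params);
    if (!ctx) return 1;

    // decode an image (or audio) file into a bitmap (placeholder path)
    mtmd_bitmap * bmp = mtmd_helper_bitmap_init_from_file("input.jpg");
    if (!bmp) return 1;

    // the prompt marks where the media goes with the default marker
    mtmd_input_text text;
    text.text          = "Describe this image: <__media__>";
    text.add_special   = true;
    text.parse_special = true;

    // split the prompt into text/image/audio chunks
    mtmd_input_chunks * chunks = mtmd_input_chunks_init();
    const mtmd_bitmap * bitmaps[] = { bmp };
    if (mtmd_tokenize(ctx, chunks, &text, bitmaps, 1) != 0) return 1;

    // encode each non-text chunk; the embeddings of the most recently
    // encoded chunk are exposed via mtmd_get_output_embd()
    for (size_t i = 0; i < mtmd_input_chunks_size(chunks); i++) {
        const mtmd_input_chunk * chunk = mtmd_input_chunks_get(chunks, i);
        if (mtmd_input_chunk_get_type(chunk) != MTMD_INPUT_CHUNK_TYPE_TEXT) {
            if (mtmd_encode_chunk(ctx, chunk) != 0) return 1;
            const float * embd = mtmd_get_output_embd(ctx);
            printf("chunk %zu: %zu tokens, embd=%p\n",
                   i, mtmd_input_chunk_get_n_tokens(chunk), (const void *) embd);
        }
    }

    mtmd_input_chunks_free(chunks);
    mtmd_bitmap_free(bmp);
    mtmd_free(ctx);
    llama_model_free(model);
    return 0;
}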