cui-llama.rn 1.7.3 → 1.7.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (276)
  1. package/README.md +217 -17
  2. package/android/src/main/CMakeLists.txt +34 -15
  3. package/android/src/main/java/com/rnllama/LlamaContext.java +94 -8
  4. package/android/src/main/java/com/rnllama/RNLlama.java +247 -0
  5. package/android/src/main/jni.cpp +213 -14
  6. package/android/src/main/jniLibs/arm64-v8a/librnllama.so +0 -0
  7. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8.so +0 -0
  8. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2.so +0 -0
  9. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod.so +0 -0
  10. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod_i8mm.so +0 -0
  11. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_i8mm.so +0 -0
  12. package/android/src/main/jniLibs/x86_64/librnllama.so +0 -0
  13. package/android/src/main/jniLibs/x86_64/librnllama_x86_64.so +0 -0
  14. package/android/src/newarch/java/com/rnllama/RNLlamaModule.java +35 -0
  15. package/android/src/oldarch/java/com/rnllama/RNLlamaModule.java +34 -0
  16. package/cpp/README.md +1 -1
  17. package/cpp/chat-parser.cpp +385 -0
  18. package/cpp/chat-parser.h +120 -0
  19. package/cpp/chat.cpp +726 -596
  20. package/cpp/chat.h +71 -6
  21. package/cpp/common.cpp +56 -38
  22. package/cpp/common.h +9 -3
  23. package/cpp/ggml-backend-reg.cpp +5 -0
  24. package/cpp/ggml-backend.cpp +10 -2
  25. package/cpp/ggml-common.h +4 -0
  26. package/cpp/ggml-cpu/amx/amx.cpp +1 -1
  27. package/cpp/ggml-cpu/amx/mmq.cpp +11 -10
  28. package/cpp/ggml-cpu/arch/arm/cpu-feats.cpp +94 -0
  29. package/cpp/ggml-cpu/arch/arm/quants.c +4114 -0
  30. package/cpp/ggml-cpu/arch/arm/repack.cpp +2163 -0
  31. package/cpp/ggml-cpu/arch/x86/cpu-feats.cpp +327 -0
  32. package/cpp/ggml-cpu/arch/x86/quants.c +4311 -0
  33. package/cpp/ggml-cpu/{ggml-cpu-aarch64.cpp → arch/x86/repack.cpp} +79 -3225
  34. package/cpp/ggml-cpu/arch-fallback.h +184 -0
  35. package/cpp/ggml-cpu/common.h +4 -3
  36. package/cpp/ggml-cpu/ggml-cpu-impl.h +21 -16
  37. package/cpp/ggml-cpu/ggml-cpu.c +123 -104
  38. package/cpp/ggml-cpu/ggml-cpu.cpp +11 -8
  39. package/cpp/ggml-cpu/ops.cpp +330 -148
  40. package/cpp/ggml-cpu/ops.h +1 -0
  41. package/cpp/ggml-cpu/quants.c +1158 -0
  42. package/cpp/ggml-cpu/{ggml-cpu-quants.h → quants.h} +26 -0
  43. package/cpp/ggml-cpu/repack.cpp +1571 -0
  44. package/cpp/ggml-cpu/repack.h +98 -0
  45. package/cpp/ggml-cpu/simd-mappings.h +330 -38
  46. package/cpp/ggml-cpu/{ggml-cpu-traits.cpp → traits.cpp} +1 -1
  47. package/cpp/ggml-cpu/vec.cpp +87 -18
  48. package/cpp/ggml-cpu/vec.h +249 -94
  49. package/cpp/ggml-cpu.h +1 -0
  50. package/cpp/ggml-impl.h +63 -183
  51. package/cpp/ggml-llama-sim.metallib +0 -0
  52. package/cpp/ggml-llama.metallib +0 -0
  53. package/cpp/ggml-metal.m +152 -45
  54. package/cpp/ggml-quants.c +0 -2
  55. package/cpp/ggml.c +61 -21
  56. package/cpp/ggml.h +22 -3
  57. package/cpp/gguf.cpp +24 -3
  58. package/cpp/json-partial.cpp +256 -0
  59. package/cpp/json-partial.h +38 -0
  60. package/cpp/json-schema-to-grammar.cpp +5 -47
  61. package/cpp/json-schema-to-grammar.h +4 -4
  62. package/cpp/llama-arch.cpp +153 -3
  63. package/cpp/llama-arch.h +27 -1
  64. package/cpp/llama-batch.cpp +741 -272
  65. package/cpp/llama-batch.h +112 -54
  66. package/cpp/llama-chat.cpp +30 -8
  67. package/cpp/llama-chat.h +1 -0
  68. package/cpp/llama-context.cpp +524 -339
  69. package/cpp/llama-context.h +38 -17
  70. package/cpp/llama-cparams.cpp +4 -0
  71. package/cpp/llama-cparams.h +2 -0
  72. package/cpp/llama-grammar.cpp +12 -2
  73. package/cpp/llama-graph.cpp +431 -356
  74. package/cpp/llama-graph.h +126 -58
  75. package/cpp/llama-hparams.cpp +10 -2
  76. package/cpp/llama-hparams.h +19 -2
  77. package/cpp/llama-kv-cache-unified-iswa.cpp +279 -0
  78. package/cpp/llama-kv-cache-unified-iswa.h +128 -0
  79. package/cpp/llama-kv-cache-unified.cpp +1841 -0
  80. package/cpp/llama-kv-cache-unified.h +303 -0
  81. package/cpp/llama-kv-cells.h +439 -0
  82. package/cpp/llama-memory-hybrid.cpp +246 -0
  83. package/cpp/llama-memory-hybrid.h +138 -0
  84. package/cpp/llama-memory-recurrent.cpp +1112 -0
  85. package/cpp/llama-memory-recurrent.h +183 -0
  86. package/cpp/llama-memory.cpp +41 -0
  87. package/cpp/llama-memory.h +86 -5
  88. package/cpp/llama-mmap.cpp +1 -1
  89. package/cpp/llama-model-loader.cpp +42 -17
  90. package/cpp/llama-model-saver.cpp +1 -0
  91. package/cpp/llama-model.cpp +1639 -513
  92. package/cpp/llama-model.h +26 -0
  93. package/cpp/llama-sampling.cpp +2 -2
  94. package/cpp/llama-vocab.cpp +65 -28
  95. package/cpp/llama-vocab.h +1 -0
  96. package/cpp/llama.cpp +11 -7
  97. package/cpp/llama.h +150 -42
  98. package/cpp/minja/chat-template.hpp +1 -1
  99. package/cpp/minja/minja.hpp +1 -1
  100. package/cpp/{json.hpp → nlohmann/json.hpp} +3027 -2267
  101. package/cpp/nlohmann/json_fwd.hpp +187 -0
  102. package/cpp/regex-partial.cpp +204 -0
  103. package/cpp/regex-partial.h +56 -0
  104. package/cpp/rn-llama.cpp +646 -35
  105. package/cpp/rn-llama.h +32 -1
  106. package/cpp/rn-tts.h +39 -0
  107. package/cpp/sampling.cpp +7 -8
  108. package/cpp/tools/mtmd/clip-impl.h +5 -0
  109. package/cpp/tools/mtmd/clip.cpp +572 -436
  110. package/cpp/tools/mtmd/clip.h +14 -4
  111. package/cpp/tools/mtmd/mtmd-audio.cpp +0 -86
  112. package/cpp/tools/mtmd/mtmd-audio.h +2 -17
  113. package/cpp/tools/mtmd/mtmd-helper.cpp +175 -12
  114. package/cpp/tools/mtmd/mtmd-helper.h +91 -0
  115. package/cpp/tools/mtmd/mtmd.cpp +368 -248
  116. package/cpp/tools/mtmd/mtmd.h +6 -70
  117. package/cpp/unicode.cpp +5 -0
  118. package/ios/CMakeLists.txt +26 -6
  119. package/ios/RNLlama.h +1 -1
  120. package/ios/RNLlama.mm +153 -3
  121. package/ios/RNLlamaContext.h +9 -1
  122. package/ios/RNLlamaContext.mm +112 -9
  123. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/chat-parser.h +120 -0
  124. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/chat.h +71 -6
  125. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/common.h +9 -3
  126. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-common.h +4 -0
  127. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu.h +1 -0
  128. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-impl.h +63 -183
  129. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml.h +22 -3
  130. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/json-partial.h +38 -0
  131. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/json-schema-to-grammar.h +4 -4
  132. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-arch.h +27 -1
  133. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-batch.h +112 -54
  134. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-chat.h +1 -0
  135. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-context.h +38 -17
  136. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-cparams.h +2 -0
  137. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-graph.h +126 -58
  138. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-hparams.h +19 -2
  139. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-kv-cache-unified-iswa.h +128 -0
  140. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-kv-cache-unified.h +303 -0
  141. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-kv-cells.h +439 -0
  142. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-memory-hybrid.h +138 -0
  143. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-memory-recurrent.h +183 -0
  144. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-memory.h +86 -5
  145. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-model.h +26 -0
  146. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-vocab.h +1 -0
  147. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama.h +150 -42
  148. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/minja/chat-template.hpp +1 -1
  149. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/minja/minja.hpp +1 -1
  150. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/{json.hpp → nlohmann/json.hpp} +3027 -2267
  151. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/nlohmann/json_fwd.hpp +187 -0
  152. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/regex-partial.h +56 -0
  153. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/rn-llama.h +32 -1
  154. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/rn-tts.h +39 -0
  155. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/ggml-llama.metallib +0 -0
  156. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/rnllama +0 -0
  157. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/chat-parser.h +120 -0
  158. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/chat.h +71 -6
  159. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/common.h +9 -3
  160. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-common.h +4 -0
  161. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu.h +1 -0
  162. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-impl.h +63 -183
  163. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml.h +22 -3
  164. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/json-partial.h +38 -0
  165. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/json-schema-to-grammar.h +4 -4
  166. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-arch.h +27 -1
  167. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-batch.h +112 -54
  168. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-chat.h +1 -0
  169. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-context.h +38 -17
  170. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-cparams.h +2 -0
  171. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-graph.h +126 -58
  172. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-hparams.h +19 -2
  173. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache-unified-iswa.h +128 -0
  174. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache-unified.h +303 -0
  175. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cells.h +439 -0
  176. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory-hybrid.h +138 -0
  177. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory-recurrent.h +183 -0
  178. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory.h +86 -5
  179. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model.h +26 -0
  180. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-vocab.h +1 -0
  181. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama.h +150 -42
  182. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/minja/chat-template.hpp +1 -1
  183. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/minja/minja.hpp +1 -1
  184. package/ios/rnllama.xcframework/{tvos-arm64/rnllama.framework/Headers → ios-arm64_x86_64-simulator/rnllama.framework/Headers/nlohmann}/json.hpp +3027 -2267
  185. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/nlohmann/json_fwd.hpp +187 -0
  186. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/regex-partial.h +56 -0
  187. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/rn-llama.h +32 -1
  188. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/rn-tts.h +39 -0
  189. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/ggml-llama-sim.metallib +0 -0
  190. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/rnllama +0 -0
  191. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/chat-parser.h +120 -0
  192. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/chat.h +71 -6
  193. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/common.h +9 -3
  194. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-common.h +4 -0
  195. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu.h +1 -0
  196. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-impl.h +63 -183
  197. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml.h +22 -3
  198. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/json-partial.h +38 -0
  199. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/json-schema-to-grammar.h +4 -4
  200. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-arch.h +27 -1
  201. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-batch.h +112 -54
  202. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-chat.h +1 -0
  203. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-context.h +38 -17
  204. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-cparams.h +2 -0
  205. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-graph.h +126 -58
  206. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-hparams.h +19 -2
  207. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-kv-cache-unified-iswa.h +128 -0
  208. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-kv-cache-unified.h +303 -0
  209. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-kv-cells.h +439 -0
  210. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-memory-hybrid.h +138 -0
  211. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-memory-recurrent.h +183 -0
  212. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-memory.h +86 -5
  213. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-model.h +26 -0
  214. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-vocab.h +1 -0
  215. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama.h +150 -42
  216. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/minja/chat-template.hpp +1 -1
  217. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/minja/minja.hpp +1 -1
  218. package/ios/rnllama.xcframework/{ios-arm64_x86_64-simulator/rnllama.framework/Headers → tvos-arm64/rnllama.framework/Headers/nlohmann}/json.hpp +3027 -2267
  219. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/nlohmann/json_fwd.hpp +187 -0
  220. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/regex-partial.h +56 -0
  221. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/rn-llama.h +32 -1
  222. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/rn-tts.h +39 -0
  223. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/ggml-llama.metallib +0 -0
  224. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/rnllama +0 -0
  225. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/chat-parser.h +120 -0
  226. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/chat.h +71 -6
  227. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/common.h +9 -3
  228. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-common.h +4 -0
  229. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu.h +1 -0
  230. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-impl.h +63 -183
  231. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml.h +22 -3
  232. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/json-partial.h +38 -0
  233. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/json-schema-to-grammar.h +4 -4
  234. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-arch.h +27 -1
  235. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-batch.h +112 -54
  236. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-chat.h +1 -0
  237. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-context.h +38 -17
  238. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-cparams.h +2 -0
  239. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-graph.h +126 -58
  240. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-hparams.h +19 -2
  241. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache-unified-iswa.h +128 -0
  242. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache-unified.h +303 -0
  243. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cells.h +439 -0
  244. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory-hybrid.h +138 -0
  245. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory-recurrent.h +183 -0
  246. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory.h +86 -5
  247. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model.h +26 -0
  248. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-vocab.h +1 -0
  249. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama.h +150 -42
  250. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/minja/chat-template.hpp +1 -1
  251. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/minja/minja.hpp +1 -1
  252. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/nlohmann/json.hpp +25526 -0
  253. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/nlohmann/json_fwd.hpp +187 -0
  254. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/regex-partial.h +56 -0
  255. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/rn-llama.h +32 -1
  256. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/rn-tts.h +39 -0
  257. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/ggml-llama-sim.metallib +0 -0
  258. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/rnllama +0 -0
  259. package/jest/mock.js +24 -0
  260. package/package.json +1 -1
  261. package/src/NativeRNLlama.ts +46 -2
  262. package/src/index.ts +105 -1
  263. package/cpp/ggml-cpu/ggml-cpu-aarch64.h +0 -8
  264. package/cpp/ggml-cpu/ggml-cpu-quants.c +0 -13326
  265. package/cpp/ggml-cpu/sgemm.cpp +0 -3544
  266. package/cpp/ggml-cpu/sgemm.h +0 -14
  267. package/cpp/llama-kv-cache.cpp +0 -2827
  268. package/cpp/llama-kv-cache.h +0 -515
  269. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-kv-cache.h +0 -515
  270. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache.h +0 -515
  271. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-kv-cache.h +0 -515
  272. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/json.hpp +0 -24766
  273. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache.h +0 -515
  274. /package/cpp/ggml-cpu/{ggml-cpu-traits.h → traits.h} +0 -0
  275. /package/cpp/tools/mtmd/{miniaudio.h → miniaudio/miniaudio.h} +0 -0
  276. /package/cpp/tools/mtmd/{stb_image.h → stb/stb_image.h} +0 -0
@@ -11,9 +11,6 @@
11
11
  #include "ggml-backend.h"
12
12
  #include "gguf.h"
13
13
 
14
- #define STB_IMAGE_IMPLEMENTATION
15
- #include "stb_image.h"
16
-
17
14
  #include <cassert>
18
15
  #include <cmath>
19
16
  #include <cstdlib>
@@ -172,9 +169,6 @@ enum patch_merge_type {
172
169
  };
173
170
 
174
171
  struct clip_hparams {
175
- bool has_vision = false;
176
- bool has_audio = false;
177
-
178
172
  int32_t image_size;
179
173
  int32_t patch_size;
180
174
  int32_t n_embd;
@@ -184,9 +178,13 @@ struct clip_hparams {
184
178
  int32_t n_layer;
185
179
  int32_t proj_scale_factor = 0; // idefics3
186
180
 
181
+ float image_mean[3];
182
+ float image_std[3];
183
+
187
184
  // for models using dynamic image size, we need to have a smaller image size to warmup
188
185
  // otherwise, user will get OOM everytime they load the model
189
186
  int32_t warmup_image_size = 0;
187
+ int32_t warmup_audio_size = 3000;
190
188
 
191
189
  ffn_op_type ffn_op = FFN_GELU;
192
190
 
@@ -195,7 +193,7 @@ struct clip_hparams {
195
193
  float eps = 1e-6;
196
194
  float rope_theta = 0.0;
197
195
 
198
- std::vector<int32_t> image_grid_pinpoints;
196
+ std::vector<clip_image_size> image_res_candidates; // for llava-uhd style models
199
197
  int32_t image_crop_resolution;
200
198
  std::unordered_set<int32_t> vision_feature_layer;
201
199
  int32_t attn_window_size = 0;
@@ -205,6 +203,10 @@ struct clip_hparams {
205
203
  // audio
206
204
  int32_t n_mel_bins = 0; // whisper preprocessor
207
205
  int32_t proj_stack_factor = 0; // ultravox
206
+
207
+ // legacy
208
+ bool has_llava_projector = false;
209
+ int minicpmv_version = 0;
208
210
  };
209
211
 
210
212
  struct clip_layer {
@@ -242,8 +244,10 @@ struct clip_layer {
242
244
  lm_ggml_tensor * ls_2_w = nullptr;
243
245
  };
244
246
 
245
- struct clip_vision_model {
246
- struct clip_hparams hparams;
247
+ struct clip_model {
248
+ clip_modality modality = CLIP_MODALITY_VISION;
249
+ projector_type proj_type = PROJECTOR_TYPE_MLP;
250
+ clip_hparams hparams;
247
251
 
248
252
  // embeddings
249
253
  lm_ggml_tensor * class_embedding = nullptr;
@@ -260,7 +264,9 @@ struct clip_vision_model {
260
264
  lm_ggml_tensor * post_ln_w;
261
265
  lm_ggml_tensor * post_ln_b;
262
266
 
263
- lm_ggml_tensor * projection;
267
+ lm_ggml_tensor * projection; // TODO: rename it to fc (fully connected layer)
268
+ lm_ggml_tensor * mm_fc_w;
269
+ lm_ggml_tensor * mm_fc_b;
264
270
 
265
271
  // LLaVA projection
266
272
  lm_ggml_tensor * mm_input_norm_w = nullptr;
@@ -357,14 +363,7 @@ struct clip_vision_model {
357
363
  };
358
364
 
359
365
  struct clip_ctx {
360
- bool has_llava_projector = false;
361
- int minicpmv_version = 0;
362
-
363
- struct clip_vision_model vision_model;
364
- projector_type proj_type = PROJECTOR_TYPE_MLP;
365
-
366
- float image_mean[3];
367
- float image_std[3];
366
+ clip_model model;
368
367
 
369
368
  lm_gguf_context_ptr ctx_gguf;
370
369
  lm_ggml_context_ptr ctx_data;
@@ -418,11 +417,16 @@ struct clip_ctx {
418
417
  lm_ggml_backend_free(backend_cpu);
419
418
  }
420
419
  }
420
+
421
+ // this function is added so that we don't change too much of the existing code
422
+ projector_type proj_type() const {
423
+ return model.proj_type;
424
+ }
421
425
  };
422
426
 
423
427
  struct clip_graph {
424
428
  clip_ctx * ctx;
425
- const clip_vision_model & model;
429
+ const clip_model & model;
426
430
  const clip_hparams & hparams;
427
431
 
428
432
  // we only support single image per batch
@@ -445,7 +449,7 @@ struct clip_graph {
445
449
 
446
450
  clip_graph(clip_ctx * ctx, const clip_image_f32 & img) :
447
451
  ctx(ctx),
448
- model(ctx->vision_model),
452
+ model(ctx->model),
449
453
  hparams(model.hparams),
450
454
  img(img),
451
455
  patch_size(hparams.patch_size),
@@ -477,7 +481,7 @@ struct clip_graph {
477
481
  model.position_embeddings,
478
482
  nullptr);
479
483
 
480
- if (ctx->proj_type == PROJECTOR_TYPE_GEMMA3) {
484
+ if (ctx->proj_type() == PROJECTOR_TYPE_GEMMA3) {
481
485
  const int batch_size = 1;
482
486
  LM_GGML_ASSERT(n_patches_x == n_patches_y);
483
487
  const int patches_per_image = n_patches_x;
@@ -500,7 +504,7 @@ struct clip_graph {
500
504
  lm_ggml_cont(ctx0, lm_ggml_transpose(ctx0, model.mm_input_proj_w)),
501
505
  cur);
502
506
 
503
- } else if (ctx->proj_type == PROJECTOR_TYPE_IDEFICS3) {
507
+ } else if (ctx->proj_type() == PROJECTOR_TYPE_IDEFICS3) {
504
508
  // https://github.com/huggingface/transformers/blob/0a950e0bbe1ed58d5401a6b547af19f15f0c195e/src/transformers/models/idefics3/modeling_idefics3.py#L578
505
509
 
506
510
  const int scale_factor = model.hparams.proj_scale_factor;
@@ -634,7 +638,7 @@ struct clip_graph {
634
638
  const int n_pos = n_patches;
635
639
  const int num_position_ids = n_pos * 4; // m-rope requires 4 dim per position
636
640
 
637
- norm_type norm_t = ctx->proj_type == PROJECTOR_TYPE_QWEN25VL
641
+ norm_type norm_t = ctx->proj_type() == PROJECTOR_TYPE_QWEN25VL
638
642
  ? NORM_TYPE_RMS // qwen 2.5 vl
639
643
  : NORM_TYPE_NORMAL; // qwen 2 vl
640
644
 
@@ -850,11 +854,11 @@ struct clip_graph {
850
854
  const int d_head = 128;
851
855
  int n_head = n_embd/d_head;
852
856
  int num_query = 96;
853
- if (ctx->minicpmv_version == 2) {
857
+ if (ctx->model.hparams.minicpmv_version == 2) {
854
858
  num_query = 96;
855
- } else if (ctx->minicpmv_version == 3) {
859
+ } else if (ctx->model.hparams.minicpmv_version == 3) {
856
860
  num_query = 64;
857
- } else if (ctx->minicpmv_version == 4) {
861
+ } else if (ctx->model.hparams.minicpmv_version == 4) {
858
862
  num_query = 64;
859
863
  }
860
864
 
@@ -1071,7 +1075,7 @@ struct clip_graph {
1071
1075
  int il_last = hparams.n_layer - 1;
1072
1076
  int deepest_feature_layer = -1;
1073
1077
 
1074
- if (ctx->proj_type == PROJECTOR_TYPE_MINICPMV || ctx->proj_type == PROJECTOR_TYPE_GLM_EDGE) {
1078
+ if (ctx->proj_type() == PROJECTOR_TYPE_MINICPMV || ctx->proj_type() == PROJECTOR_TYPE_GLM_EDGE) {
1075
1079
  il_last += 1;
1076
1080
  }
1077
1081
 
@@ -1205,7 +1209,7 @@ struct clip_graph {
1205
1209
  }
1206
1210
 
1207
1211
  // llava projector (also used by granite)
1208
- if (ctx->has_llava_projector) {
1212
+ if (ctx->model.hparams.has_llava_projector) {
1209
1213
  embeddings = lm_ggml_reshape_2d(ctx0, embeddings, embeddings->ne[0], embeddings->ne[1]);
1210
1214
 
1211
1215
  lm_ggml_tensor * patches = lm_ggml_new_tensor_1d(ctx0, LM_GGML_TYPE_I32, n_patches);
@@ -1219,7 +1223,7 @@ struct clip_graph {
1219
1223
  // print_tensor_info(embeddings, "embeddings");
1220
1224
 
1221
1225
  // llava projector
1222
- if (ctx->proj_type == PROJECTOR_TYPE_MLP) {
1226
+ if (ctx->proj_type() == PROJECTOR_TYPE_MLP) {
1223
1227
  embeddings = lm_ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
1224
1228
  embeddings = lm_ggml_add(ctx0, embeddings, model.mm_0_b);
1225
1229
 
@@ -1229,7 +1233,7 @@ struct clip_graph {
1229
1233
  embeddings = lm_ggml_add(ctx0, embeddings, model.mm_2_b);
1230
1234
  }
1231
1235
  }
1232
- else if (ctx->proj_type == PROJECTOR_TYPE_MLP_NORM) {
1236
+ else if (ctx->proj_type() == PROJECTOR_TYPE_MLP_NORM) {
1233
1237
  embeddings = lm_ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
1234
1238
  embeddings = lm_ggml_add(ctx0, embeddings, model.mm_0_b);
1235
1239
  // lm_ggml_tensor_printf(embeddings, "mm_0_w",0,true,false);
@@ -1250,7 +1254,7 @@ struct clip_graph {
1250
1254
  embeddings = lm_ggml_add(ctx0, lm_ggml_mul(ctx0, embeddings, model.mm_4_w),
1251
1255
  model.mm_4_b);
1252
1256
  }
1253
- else if (ctx->proj_type == PROJECTOR_TYPE_LDP) {
1257
+ else if (ctx->proj_type() == PROJECTOR_TYPE_LDP) {
1254
1258
  // MobileVLM projector
1255
1259
  int n_patch = 24;
1256
1260
  lm_ggml_tensor * mlp_1 = lm_ggml_mul_mat(ctx0, model.mm_model_mlp_1_w, embeddings);
@@ -1360,7 +1364,7 @@ struct clip_graph {
1360
1364
  }
1361
1365
  embeddings = block_1;
1362
1366
  }
1363
- else if (ctx->proj_type == PROJECTOR_TYPE_LDPV2)
1367
+ else if (ctx->proj_type() == PROJECTOR_TYPE_LDPV2)
1364
1368
  {
1365
1369
  int n_patch = 24;
1366
1370
  lm_ggml_tensor * mlp_0 = lm_ggml_mul_mat(ctx0, model.mm_model_mlp_0_w, embeddings);
@@ -1390,7 +1394,7 @@ struct clip_graph {
1390
1394
  }
1391
1395
 
1392
1396
  // glm projector
1393
- else if (ctx->proj_type == PROJECTOR_TYPE_GLM_EDGE) {
1397
+ else if (ctx->proj_type() == PROJECTOR_TYPE_GLM_EDGE) {
1394
1398
  size_t gridsz = (size_t)sqrt(embeddings->ne[1]);
1395
1399
  embeddings = lm_ggml_cont(ctx0, lm_ggml_permute(ctx0,embeddings,1,0,2,3));
1396
1400
  embeddings = lm_ggml_reshape_3d(ctx0, embeddings, gridsz, gridsz, embeddings->ne[1]);
@@ -1477,48 +1481,58 @@ struct clip_graph {
1477
1481
 
1478
1482
  cb(cur, "after_transformer", -1);
1479
1483
 
1480
- // StackAudioFrames
1481
- // https://huggingface.co/fixie-ai/ultravox-v0_5-llama-3_2-1b/blob/main/ultravox_model.py
1482
- {
1483
- int64_t stride = n_embd * hparams.proj_stack_factor;
1484
- int64_t padded_len = LM_GGML_PAD(lm_ggml_nelements(cur), stride);
1485
- int64_t pad = padded_len - lm_ggml_nelements(cur);
1486
- if (pad > 0) {
1487
- cur = lm_ggml_view_1d(ctx0, cur, lm_ggml_nelements(cur), 0);
1488
- cur = lm_ggml_pad(ctx0, cur, pad, 0, 0, 0);
1484
+ if (ctx->proj_type() == PROJECTOR_TYPE_ULTRAVOX) {
1485
+ // StackAudioFrames
1486
+ // https://huggingface.co/fixie-ai/ultravox-v0_5-llama-3_2-1b/blob/main/ultravox_model.py
1487
+ {
1488
+ int64_t stride = n_embd * hparams.proj_stack_factor;
1489
+ int64_t padded_len = LM_GGML_PAD(lm_ggml_nelements(cur), stride);
1490
+ int64_t pad = padded_len - lm_ggml_nelements(cur);
1491
+ if (pad > 0) {
1492
+ cur = lm_ggml_view_1d(ctx0, cur, lm_ggml_nelements(cur), 0);
1493
+ cur = lm_ggml_pad(ctx0, cur, pad, 0, 0, 0);
1494
+ }
1495
+ cur = lm_ggml_view_2d(ctx0, cur, stride, padded_len / stride,
1496
+ lm_ggml_row_size(cur->type, stride), 0);
1489
1497
  }
1490
- cur = lm_ggml_view_2d(ctx0, cur, stride, padded_len / stride,
1491
- lm_ggml_row_size(cur->type, stride), 0);
1492
- }
1493
1498
 
1494
- cb(cur, "after_stacked", -1);
1499
+ cb(cur, "after_stacked", -1);
1495
1500
 
1496
- // UltravoxProjector
1497
- {
1498
- // pre-norm
1499
- cur = lm_ggml_rms_norm(ctx0, cur, 1e-6);
1500
- cur = lm_ggml_mul(ctx0, cur, model.mm_norm_pre_w);
1501
+ // UltravoxProjector
1502
+ {
1503
+ // pre-norm
1504
+ cur = lm_ggml_rms_norm(ctx0, cur, 1e-6);
1505
+ cur = lm_ggml_mul(ctx0, cur, model.mm_norm_pre_w);
1501
1506
 
1502
- // ffn in
1503
- cur = lm_ggml_mul_mat(ctx0, model.mm_1_w, cur);
1507
+ // ffn in
1508
+ cur = lm_ggml_mul_mat(ctx0, model.mm_1_w, cur);
1504
1509
 
1505
- // swiglu
1506
- {
1507
- int64_t split_point = cur->ne[0] / 2;
1508
- lm_ggml_tensor * x0 = lm_ggml_cont(ctx0, lm_ggml_view_2d(ctx0, cur, split_point, cur->ne[1], cur->nb[1], 0));
1509
- lm_ggml_tensor * x1 = lm_ggml_cont(ctx0, lm_ggml_view_2d(ctx0, cur, split_point, cur->ne[1], cur->nb[1], split_point * lm_ggml_element_size(cur)));
1510
+ // swiglu
1511
+ {
1512
+ int64_t split_point = cur->ne[0] / 2;
1513
+ lm_ggml_tensor * x0 = lm_ggml_cont(ctx0, lm_ggml_view_2d(ctx0, cur, split_point, cur->ne[1], cur->nb[1], 0));
1514
+ lm_ggml_tensor * x1 = lm_ggml_cont(ctx0, lm_ggml_view_2d(ctx0, cur, split_point, cur->ne[1], cur->nb[1], split_point * lm_ggml_element_size(cur)));
1510
1515
 
1511
- // see SwiGLU in ultravox_model.py, the second half passed through is silu, not the first half
1512
- x1 = lm_ggml_silu(ctx0, x1);
1513
- cur = lm_ggml_mul(ctx0, x0, x1);
1516
+ // see SwiGLU in ultravox_model.py, the second half passed through is silu, not the first half
1517
+ x1 = lm_ggml_silu(ctx0, x1);
1518
+ cur = lm_ggml_mul(ctx0, x0, x1);
1519
+ }
1520
+
1521
+ // mid-norm
1522
+ cur = lm_ggml_rms_norm(ctx0, cur, 1e-6);
1523
+ cur = lm_ggml_mul(ctx0, cur, model.mm_norm_mid_w);
1524
+
1525
+ // ffn out
1526
+ cur = lm_ggml_mul_mat(ctx0, model.mm_2_w, cur);
1514
1527
  }
1515
1528
 
1516
- // mid-norm
1517
- cur = lm_ggml_rms_norm(ctx0, cur, 1e-6);
1518
- cur = lm_ggml_mul(ctx0, cur, model.mm_norm_mid_w);
1529
+ } else if (ctx->proj_type() == PROJECTOR_TYPE_QWEN2A) {
1530
+ // projector
1531
+ cur = lm_ggml_mul_mat(ctx0, model.mm_fc_w, cur);
1532
+ cur = lm_ggml_add(ctx0, cur, model.mm_fc_b);
1519
1533
 
1520
- // ffn out
1521
- cur = lm_ggml_mul_mat(ctx0, model.mm_2_w, cur);
1534
+ } else {
1535
+ LM_GGML_ABORT("%s: unknown projector type", __func__);
1522
1536
  }
1523
1537
 
1524
1538
  cb(cur, "projected", -1);
@@ -1661,6 +1675,17 @@ private:
1661
1675
  inpL = cur;
1662
1676
  }
1663
1677
 
1678
+ // TODO @ngxson : find a way to move this outside
1679
+ if (ctx->proj_type() == PROJECTOR_TYPE_QWEN2A) {
1680
+ lm_ggml_tensor * cur = inpL;
1681
+ cur = lm_ggml_transpose(ctx0, cur);
1682
+ cur = lm_ggml_cont(ctx0, cur);
1683
+ cur = lm_ggml_pool_1d(ctx0, cur, LM_GGML_OP_POOL_AVG, 2, 2, 0);
1684
+ cur = lm_ggml_transpose(ctx0, cur);
1685
+ cur = lm_ggml_cont(ctx0, cur);
1686
+ inpL = cur;
1687
+ }
1688
+
1664
1689
  // post-layernorm
1665
1690
  if (model.post_ln_w) {
1666
1691
  inpL = build_norm(inpL, model.post_ln_w, model.post_ln_b, norm_t, eps, -1);
@@ -1930,7 +1955,7 @@ static lm_ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_
1930
1955
 
1931
1956
  lm_ggml_cgraph * res;
1932
1957
 
1933
- switch (ctx->proj_type) {
1958
+ switch (ctx->proj_type()) {
1934
1959
  case PROJECTOR_TYPE_GEMMA3:
1935
1960
  case PROJECTOR_TYPE_IDEFICS3:
1936
1961
  {
@@ -1958,6 +1983,7 @@ static lm_ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_
1958
1983
  res = graph.build_llama4();
1959
1984
  } break;
1960
1985
  case PROJECTOR_TYPE_ULTRAVOX:
1986
+ case PROJECTOR_TYPE_QWEN2A:
1961
1987
  {
1962
1988
  res = graph.build_whisper_enc();
1963
1989
  } break;
@@ -1973,13 +1999,15 @@ struct clip_model_loader {
1973
1999
  lm_ggml_context_ptr ctx_meta;
1974
2000
  lm_gguf_context_ptr ctx_gguf;
1975
2001
 
1976
- clip_ctx & ctx_clip;
1977
2002
  std::string fname;
1978
2003
 
1979
2004
  size_t model_size = 0; // in bytes
1980
2005
 
1981
- // TODO @ngxson : we should not pass clip_ctx here, it should be clip_vision_model
1982
- clip_model_loader(const char * fname, clip_ctx & ctx_clip) : ctx_clip(ctx_clip), fname(fname) {
2006
+ bool has_vision = false;
2007
+ bool has_audio = false;
2008
+
2009
+ // TODO @ngxson : we should not pass clip_ctx here, it should be clip_model
2010
+ clip_model_loader(const char * fname) : fname(fname) {
1983
2011
  struct lm_ggml_context * meta = nullptr;
1984
2012
 
1985
2013
  struct lm_gguf_init_params params = {
@@ -2011,6 +2039,19 @@ struct clip_model_loader {
2011
2039
  LOG_INF("\n");
2012
2040
  }
2013
2041
 
2042
+ // modalities
2043
+ {
2044
+ get_bool(KEY_HAS_VISION_ENC, has_vision, false);
2045
+ get_bool(KEY_HAS_AUDIO_ENC, has_audio, false);
2046
+
2047
+ if (has_vision) {
2048
+ LOG_INF("%s: has vision encoder\n", __func__);
2049
+ }
2050
+ if (has_audio) {
2051
+ LOG_INF("%s: has audio encoder\n", __func__);
2052
+ }
2053
+ }
2054
+
2014
2055
  // tensors
2015
2056
  {
2016
2057
  for (int i = 0; i < n_tensors; ++i) {
@@ -2026,28 +2067,44 @@ struct clip_model_loader {
2026
2067
  }
2027
2068
  }
2028
2069
 
2029
- void load_hparams() {
2030
- auto & hparams = ctx_clip.vision_model.hparams;
2070
+ void load_hparams(clip_model & model, clip_modality modality) {
2071
+ auto & hparams = model.hparams;
2031
2072
  std::string log_ffn_op; // for logging
2032
2073
 
2074
+ // sanity check
2075
+ if (modality == CLIP_MODALITY_VISION) {
2076
+ LM_GGML_ASSERT(has_vision);
2077
+ } else if (modality == CLIP_MODALITY_AUDIO) {
2078
+ LM_GGML_ASSERT(has_audio);
2079
+ }
2080
+ model.modality = modality;
2081
+
2082
+
2033
2083
  // projector type
2034
2084
  std::string proj_type;
2035
2085
  {
2036
2086
  get_string(KEY_PROJ_TYPE, proj_type, false);
2037
2087
  if (!proj_type.empty()) {
2038
- ctx_clip.proj_type = clip_projector_type_from_string(proj_type);
2088
+ model.proj_type = clip_projector_type_from_string(proj_type);
2039
2089
  }
2040
- if (ctx_clip.proj_type == PROJECTOR_TYPE_UNKNOWN) {
2090
+ if (model.proj_type == PROJECTOR_TYPE_UNKNOWN) {
2041
2091
  throw std::runtime_error(string_format("%s: unknown projector type: %s\n", __func__, proj_type.c_str()));
2042
2092
  }
2093
+
2094
+ // correct arch for multimodal models
2095
+ if (model.proj_type == PROJECTOR_TYPE_QWEN25O) {
2096
+ model.proj_type = modality == CLIP_MODALITY_VISION
2097
+ ? PROJECTOR_TYPE_QWEN25VL
2098
+ : PROJECTOR_TYPE_QWEN2A;
2099
+ }
2043
2100
  }
2044
2101
 
2102
+ const bool is_vision = model.modality == CLIP_MODALITY_VISION;
2103
+ const bool is_audio = model.modality == CLIP_MODALITY_AUDIO;
2104
+
2045
2105
  // other hparams
2046
2106
  {
2047
- get_bool(KEY_HAS_AUDIO_ENC, hparams.has_audio, false);
2048
- get_bool(KEY_HAS_VISION_ENC, hparams.has_vision, false);
2049
-
2050
- const char * prefix = hparams.has_vision ? "vision" : "audio";
2107
+ const char * prefix = is_vision ? "vision" : "audio";
2051
2108
  get_u32(string_format(KEY_N_EMBD, prefix), hparams.n_embd);
2052
2109
  get_u32(string_format(KEY_N_HEAD, prefix), hparams.n_head);
2053
2110
  get_u32(string_format(KEY_N_FF, prefix), hparams.n_ff);
@@ -2055,27 +2112,40 @@ struct clip_model_loader {
2055
2112
  get_u32(string_format(KEY_PROJ_DIM, prefix), hparams.projection_dim);
2056
2113
  get_f32(string_format(KEY_LAYER_NORM_EPS, prefix), hparams.eps);
2057
2114
 
2058
- if (hparams.has_vision) {
2115
+ if (is_vision) {
2059
2116
  get_u32(KEY_IMAGE_SIZE, hparams.image_size);
2060
2117
  get_u32(KEY_PATCH_SIZE, hparams.patch_size);
2061
- get_u32(KEY_IMAGE_CROP_RESOLUTION, hparams.image_crop_resolution, false);
2062
- get_arr_int(KEY_IMAGE_GRID_PINPOINTS, hparams.image_grid_pinpoints, false);
2063
- get_i32(KEY_MINICPMV_VERSION, ctx_clip.minicpmv_version, false); // legacy
2118
+ get_u32(KEY_IMAGE_CROP_RESOLUTION, hparams.image_crop_resolution, false);
2119
+ get_i32(KEY_MINICPMV_VERSION, hparams.minicpmv_version, false); // legacy
2064
2120
 
2065
- } else if (hparams.has_audio) {
2121
+ } else if (is_audio) {
2066
2122
  get_u32(KEY_A_NUM_MEL_BINS, hparams.n_mel_bins);
2067
2123
 
2068
2124
  } else {
2069
- throw std::runtime_error(string_format("%s: neither vision nor audio encoder is present\n", __func__));
2125
+ LM_GGML_ASSERT(false && "unknown modality");
2126
+ }
2127
+
2128
+ // for pinpoints, we need to convert it into a list of resolution candidates
2129
+ {
2130
+ std::vector<int> pinpoints;
2131
+ get_arr_int(KEY_IMAGE_GRID_PINPOINTS, pinpoints, false);
2132
+ if (!pinpoints.empty()) {
2133
+ for (size_t i = 0; i < pinpoints.size(); i += 2) {
2134
+ hparams.image_res_candidates.push_back({
2135
+ pinpoints[i],
2136
+ pinpoints[i+1],
2137
+ });
2138
+ }
2139
+ }
2070
2140
  }
2071
2141
 
2072
2142
  // default warmup value
2073
2143
  hparams.warmup_image_size = hparams.image_size;
2074
2144
 
2075
- ctx_clip.has_llava_projector = ctx_clip.proj_type == PROJECTOR_TYPE_MLP
2076
- || ctx_clip.proj_type == PROJECTOR_TYPE_MLP_NORM
2077
- || ctx_clip.proj_type == PROJECTOR_TYPE_LDP
2078
- || ctx_clip.proj_type == PROJECTOR_TYPE_LDPV2;
2145
+ hparams.has_llava_projector = model.proj_type == PROJECTOR_TYPE_MLP
2146
+ || model.proj_type == PROJECTOR_TYPE_MLP_NORM
2147
+ || model.proj_type == PROJECTOR_TYPE_LDP
2148
+ || model.proj_type == PROJECTOR_TYPE_LDPV2;
2079
2149
 
2080
2150
  {
2081
2151
  bool use_gelu = false;
@@ -2105,7 +2175,7 @@ struct clip_model_loader {
2105
2175
  }
2106
2176
  }
2107
2177
 
2108
- if (hparams.has_vision) {
2178
+ if (is_vision) {
2109
2179
  int idx_mean = lm_gguf_find_key(ctx_gguf.get(), KEY_IMAGE_MEAN);
2110
2180
  int idx_std = lm_gguf_find_key(ctx_gguf.get(), KEY_IMAGE_STD);
2111
2181
  LM_GGML_ASSERT(idx_mean >= 0 && "image_mean not found");
@@ -2113,8 +2183,8 @@ struct clip_model_loader {
2113
2183
  const float * mean_data = (const float *) lm_gguf_get_arr_data(ctx_gguf.get(), idx_mean);
2114
2184
  const float * std_data = (const float *) lm_gguf_get_arr_data(ctx_gguf.get(), idx_std);
2115
2185
  for (int i = 0; i < 3; ++i) {
2116
- ctx_clip.image_mean[i] = mean_data[i];
2117
- ctx_clip.image_std[i] = std_data[i];
2186
+ hparams.image_mean[i] = mean_data[i];
2187
+ hparams.image_std[i] = std_data[i];
2118
2188
  }
2119
2189
  }
2120
2190
 
@@ -2131,11 +2201,11 @@ struct clip_model_loader {
2131
2201
  }
2132
2202
 
2133
2203
  // model-specific params
2134
- switch (ctx_clip.proj_type) {
2204
+ switch (model.proj_type) {
2135
2205
  case PROJECTOR_TYPE_MINICPMV:
2136
2206
  {
2137
- if (ctx_clip.minicpmv_version == 0) {
2138
- ctx_clip.minicpmv_version = 2; // default to 2 if not set
2207
+ if (hparams.minicpmv_version == 0) {
2208
+ hparams.minicpmv_version = 2; // default to 2 if not set
2139
2209
  }
2140
2210
  } break;
2141
2211
  case PROJECTOR_TYPE_IDEFICS3:
@@ -2147,6 +2217,9 @@ struct clip_model_loader {
2147
2217
  {
2148
2218
  hparams.rope_theta = 10000.0f;
2149
2219
  hparams.warmup_image_size = hparams.patch_size * 8;
2220
+ // Mistral Small 2506 needs 1024x1024 image size cap to prevent OOM
2221
+ // ref: https://github.com/ggml-org/llama.cpp/issues/14310
2222
+ hparams.image_size = 1024;
2150
2223
  get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.spatial_merge_size, false);
2151
2224
  } break;
2152
2225
  case PROJECTOR_TYPE_GEMMA3:
@@ -2180,20 +2253,13 @@ struct clip_model_loader {
2180
2253
  {
2181
2254
  hparams.rope_theta = 10000.0f;
2182
2255
  get_u32(KEY_PROJ_SCALE_FACTOR, hparams.proj_scale_factor);
2183
-
2184
- // borrowed from llava-1.6
2185
- const int isize = hparams.image_size;
2186
- hparams.image_grid_pinpoints = {
2187
- isize, isize*2, // 336, 672
2188
- isize*2, isize, // 672, 336
2189
- isize*2, isize*2, // 672, 672
2190
- isize*3, isize, // 1008, 336
2191
- isize, isize*3, // 336, 1008
2192
- };
2256
+ set_llava_uhd_res_candidates(model, 3);
2193
2257
  } break;
2194
2258
  case PROJECTOR_TYPE_ULTRAVOX:
2259
+ case PROJECTOR_TYPE_QWEN2A:
2195
2260
  {
2196
- get_u32(KEY_A_PROJ_STACK_FACTOR, hparams.proj_stack_factor);
2261
+ bool require_stack = model.proj_type == PROJECTOR_TYPE_ULTRAVOX;
2262
+ get_u32(KEY_A_PROJ_STACK_FACTOR, hparams.proj_stack_factor, require_stack);
2197
2263
  if (hparams.n_mel_bins != 128) {
2198
2264
  throw std::runtime_error(string_format("%s: only 128 mel bins are supported for ultravox\n", __func__));
2199
2265
  }
@@ -2205,23 +2271,22 @@ struct clip_model_loader {
2205
2271
  }
2206
2272
 
2207
2273
  LOG_INF("%s: projector: %s\n", __func__, proj_type.c_str());
2208
- LOG_INF("%s: has_vision_encoder: %d\n", __func__, hparams.has_vision);
2209
- LOG_INF("%s: has_audio_encoder: %d\n", __func__, hparams.has_audio);
2210
2274
  LOG_INF("%s: n_embd: %d\n", __func__, hparams.n_embd);
2211
2275
  LOG_INF("%s: n_head: %d\n", __func__, hparams.n_head);
2212
2276
  LOG_INF("%s: n_ff: %d\n", __func__, hparams.n_ff);
2213
2277
  LOG_INF("%s: n_layer: %d\n", __func__, hparams.n_layer);
2214
2278
  LOG_INF("%s: ffn_op: %s\n", __func__, log_ffn_op.c_str());
2215
2279
  LOG_INF("%s: projection_dim: %d\n", __func__, hparams.projection_dim);
2216
- LOG_INF("\n");
2217
- if (hparams.has_vision) {
2280
+ if (is_vision) {
2281
+ LOG_INF("\n--- vision hparams ---\n");
2218
2282
  LOG_INF("%s: image_size: %d\n", __func__, hparams.image_size);
2219
2283
  LOG_INF("%s: patch_size: %d\n", __func__, hparams.patch_size);
2220
- LOG_INF("%s: has_llava_proj: %d\n", __func__, ctx_clip.has_llava_projector);
2221
- LOG_INF("%s: minicpmv_version: %d\n", __func__, ctx_clip.minicpmv_version);
2284
+ LOG_INF("%s: has_llava_proj: %d\n", __func__, hparams.has_llava_projector);
2285
+ LOG_INF("%s: minicpmv_version: %d\n", __func__, hparams.minicpmv_version);
2222
2286
  LOG_INF("%s: proj_scale_factor: %d\n", __func__, hparams.proj_scale_factor);
2223
2287
  LOG_INF("%s: n_wa_pattern: %d\n", __func__, hparams.n_wa_pattern);
2224
- } else if (hparams.has_audio) {
2288
+ } else if (is_audio) {
2289
+ LOG_INF("\n--- audio hparams ---\n");
2225
2290
  LOG_INF("%s: n_mel_bins: %d\n", __func__, hparams.n_mel_bins);
2226
2291
  LOG_INF("%s: proj_stack_factor: %d\n", __func__, hparams.proj_stack_factor);
2227
2292
  }
@@ -2231,13 +2296,14 @@ struct clip_model_loader {
2231
2296
  }
2232
2297
  }
2233
2298
 
2234
- void load_tensors() {
2235
- auto & hparams = ctx_clip.vision_model.hparams;
2299
+ void load_tensors(clip_ctx & ctx_clip) {
2300
+ auto & model = ctx_clip.model;
2301
+ auto & hparams = model.hparams;
2236
2302
  std::map<std::string, size_t> tensor_offset;
2237
2303
  std::vector<lm_ggml_tensor *> tensors_to_load;
2238
2304
 
2239
2305
  // TODO @ngxson : support both audio and video in the future
2240
- const char * prefix = hparams.has_audio ? "a" : "v";
2306
+ const char * prefix = model.modality == CLIP_MODALITY_AUDIO ? "a" : "v";
2241
2307
 
2242
2308
  // get offsets
2243
2309
  for (int64_t i = 0; i < lm_gguf_get_n_tensors(ctx_gguf.get()); ++i) {
@@ -2272,26 +2338,24 @@ struct clip_model_loader {
2272
2338
  return cur;
2273
2339
  };
2274
2340
 
2275
- auto & vision_model = ctx_clip.vision_model;
2341
+ model.class_embedding = get_tensor(TN_CLASS_EMBD, false);
2276
2342
 
2277
- vision_model.class_embedding = get_tensor(TN_CLASS_EMBD, false);
2343
+ model.pre_ln_w = get_tensor(string_format(TN_LN_PRE, prefix, "weight"), false);
2344
+ model.pre_ln_b = get_tensor(string_format(TN_LN_PRE, prefix, "bias"), false);
2278
2345
 
2279
- vision_model.pre_ln_w = get_tensor(string_format(TN_LN_PRE, prefix, "weight"), false);
2280
- vision_model.pre_ln_b = get_tensor(string_format(TN_LN_PRE, prefix, "bias"), false);
2346
+ model.post_ln_w = get_tensor(string_format(TN_LN_POST, prefix, "weight"), false);
2347
+ model.post_ln_b = get_tensor(string_format(TN_LN_POST, prefix, "bias"), false);
2281
2348
 
2282
- vision_model.post_ln_w = get_tensor(string_format(TN_LN_POST, prefix, "weight"), false);
2283
- vision_model.post_ln_b = get_tensor(string_format(TN_LN_POST, prefix, "bias"), false);
2349
+ model.patch_bias = get_tensor(TN_PATCH_BIAS, false);
2350
+ model.patch_embeddings_0 = get_tensor(TN_PATCH_EMBD, false);
2351
+ model.patch_embeddings_1 = get_tensor(TN_PATCH_EMBD_1, false);
2284
2352
 
2285
- vision_model.patch_bias = get_tensor(TN_PATCH_BIAS, false);
2286
- vision_model.patch_embeddings_0 = get_tensor(TN_PATCH_EMBD, false);
2287
- vision_model.patch_embeddings_1 = get_tensor(TN_PATCH_EMBD_1, false);
2288
-
2289
- vision_model.position_embeddings = get_tensor(string_format(TN_POS_EMBD, prefix), false);
2353
+ model.position_embeddings = get_tensor(string_format(TN_POS_EMBD, prefix), false);
2290
2354
 
2291
2355
  // layers
2292
- vision_model.layers.resize(hparams.n_layer);
2356
+ model.layers.resize(hparams.n_layer);
2293
2357
  for (int il = 0; il < hparams.n_layer; ++il) {
2294
- auto & layer = vision_model.layers[il];
2358
+ auto & layer = model.layers[il];
2295
2359
  layer.k_w = get_tensor(string_format(TN_ATTN_K, prefix, il, "weight"));
2296
2360
  layer.q_w = get_tensor(string_format(TN_ATTN_Q, prefix, il, "weight"));
2297
2361
  layer.v_w = get_tensor(string_format(TN_ATTN_V, prefix, il, "weight"));
@@ -2332,157 +2396,166 @@ struct clip_model_loader {
2332
2396
  }
2333
2397
  }
2334
2398
 
2335
- switch (ctx_clip.proj_type) {
2399
+ switch (model.proj_type) {
2336
2400
  case PROJECTOR_TYPE_MLP:
2337
2401
  case PROJECTOR_TYPE_MLP_NORM:
2338
2402
  {
2339
2403
  // LLaVA projection
2340
- vision_model.mm_0_w = get_tensor(string_format(TN_LLAVA_PROJ, 0, "weight"), false);
2341
- vision_model.mm_0_b = get_tensor(string_format(TN_LLAVA_PROJ, 0, "bias"), false);
2404
+ model.mm_0_w = get_tensor(string_format(TN_LLAVA_PROJ, 0, "weight"), false);
2405
+ model.mm_0_b = get_tensor(string_format(TN_LLAVA_PROJ, 0, "bias"), false);
2342
2406
  // Yi-type llava
2343
- vision_model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 1, "weight"), false);
2344
- vision_model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 1, "bias"), false);
2407
+ model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 1, "weight"), false);
2408
+ model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 1, "bias"), false);
2345
2409
  // missing in Yi-type llava
2346
- vision_model.mm_2_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight"), false);
2347
- vision_model.mm_2_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"), false);
2410
+ model.mm_2_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight"), false);
2411
+ model.mm_2_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"), false);
2348
2412
  // Yi-type llava
2349
- vision_model.mm_3_w = get_tensor(string_format(TN_LLAVA_PROJ, 3, "weight"), false);
2350
- vision_model.mm_3_b = get_tensor(string_format(TN_LLAVA_PROJ, 3, "bias"), false);
2351
- vision_model.mm_4_w = get_tensor(string_format(TN_LLAVA_PROJ, 4, "weight"), false);
2352
- vision_model.mm_4_b = get_tensor(string_format(TN_LLAVA_PROJ, 4, "bias"), false);
2353
- if (vision_model.mm_3_w) {
2413
+ model.mm_3_w = get_tensor(string_format(TN_LLAVA_PROJ, 3, "weight"), false);
2414
+ model.mm_3_b = get_tensor(string_format(TN_LLAVA_PROJ, 3, "bias"), false);
2415
+ model.mm_4_w = get_tensor(string_format(TN_LLAVA_PROJ, 4, "weight"), false);
2416
+ model.mm_4_b = get_tensor(string_format(TN_LLAVA_PROJ, 4, "bias"), false);
2417
+ if (model.mm_3_w) {
2354
2418
  // TODO: this is a hack to support Yi-type llava
2355
- ctx_clip.proj_type = PROJECTOR_TYPE_MLP_NORM;
2419
+ model.proj_type = PROJECTOR_TYPE_MLP_NORM;
2356
2420
  }
2357
- vision_model.image_newline = get_tensor(TN_IMAGE_NEWLINE, false);
2421
+ model.image_newline = get_tensor(TN_IMAGE_NEWLINE, false);
2358
2422
  } break;
2359
2423
  case PROJECTOR_TYPE_LDP:
2360
2424
  {
2361
2425
  // MobileVLM projection
2362
- vision_model.mm_model_mlp_1_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 1, "weight"));
2363
- vision_model.mm_model_mlp_1_b = get_tensor(string_format(TN_MVLM_PROJ_MLP, 1, "bias"));
2364
- vision_model.mm_model_mlp_3_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 3, "weight"));
2365
- vision_model.mm_model_mlp_3_b = get_tensor(string_format(TN_MVLM_PROJ_MLP, 3, "bias"));
2366
- vision_model.mm_model_block_1_block_0_0_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 0, "0.weight"));
2367
- vision_model.mm_model_block_1_block_0_1_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 0, "1.weight"));
2368
- vision_model.mm_model_block_1_block_0_1_b = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 0, "1.bias"));
2369
- vision_model.mm_model_block_1_block_1_fc1_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc1.weight"));
2370
- vision_model.mm_model_block_1_block_1_fc1_b = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc1.bias"));
2371
- vision_model.mm_model_block_1_block_1_fc2_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc2.weight"));
2372
- vision_model.mm_model_block_1_block_1_fc2_b = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc2.bias"));
2373
- vision_model.mm_model_block_1_block_2_0_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 2, "0.weight"));
2374
- vision_model.mm_model_block_1_block_2_1_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 2, "1.weight"));
2375
- vision_model.mm_model_block_1_block_2_1_b = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 2, "1.bias"));
2376
- vision_model.mm_model_block_2_block_0_0_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 0, "0.weight"));
2377
- vision_model.mm_model_block_2_block_0_1_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 0, "1.weight"));
2378
- vision_model.mm_model_block_2_block_0_1_b = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 0, "1.bias"));
2379
- vision_model.mm_model_block_2_block_1_fc1_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc1.weight"));
2380
- vision_model.mm_model_block_2_block_1_fc1_b = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc1.bias"));
2381
- vision_model.mm_model_block_2_block_1_fc2_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc2.weight"));
2382
- vision_model.mm_model_block_2_block_1_fc2_b = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc2.bias"));
2383
- vision_model.mm_model_block_2_block_2_0_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 2, "0.weight"));
2384
- vision_model.mm_model_block_2_block_2_1_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 2, "1.weight"));
2385
- vision_model.mm_model_block_2_block_2_1_b = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 2, "1.bias"));
2426
+ model.mm_model_mlp_1_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 1, "weight"));
2427
+ model.mm_model_mlp_1_b = get_tensor(string_format(TN_MVLM_PROJ_MLP, 1, "bias"));
2428
+ model.mm_model_mlp_3_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 3, "weight"));
2429
+ model.mm_model_mlp_3_b = get_tensor(string_format(TN_MVLM_PROJ_MLP, 3, "bias"));
2430
+ model.mm_model_block_1_block_0_0_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 0, "0.weight"));
2431
+ model.mm_model_block_1_block_0_1_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 0, "1.weight"));
2432
+ model.mm_model_block_1_block_0_1_b = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 0, "1.bias"));
2433
+ model.mm_model_block_1_block_1_fc1_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc1.weight"));
2434
+ model.mm_model_block_1_block_1_fc1_b = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc1.bias"));
2435
+ model.mm_model_block_1_block_1_fc2_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc2.weight"));
2436
+ model.mm_model_block_1_block_1_fc2_b = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc2.bias"));
2437
+ model.mm_model_block_1_block_2_0_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 2, "0.weight"));
2438
+ model.mm_model_block_1_block_2_1_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 2, "1.weight"));
2439
+ model.mm_model_block_1_block_2_1_b = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 2, "1.bias"));
2440
+ model.mm_model_block_2_block_0_0_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 0, "0.weight"));
2441
+ model.mm_model_block_2_block_0_1_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 0, "1.weight"));
2442
+ model.mm_model_block_2_block_0_1_b = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 0, "1.bias"));
2443
+ model.mm_model_block_2_block_1_fc1_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc1.weight"));
2444
+ model.mm_model_block_2_block_1_fc1_b = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc1.bias"));
2445
+ model.mm_model_block_2_block_1_fc2_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc2.weight"));
2446
+ model.mm_model_block_2_block_1_fc2_b = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc2.bias"));
2447
+ model.mm_model_block_2_block_2_0_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 2, "0.weight"));
2448
+ model.mm_model_block_2_block_2_1_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 2, "1.weight"));
2449
+ model.mm_model_block_2_block_2_1_b = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 2, "1.bias"));
2386
2450
  } break;
2387
2451
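
For reference, each get_tensor() call above builds a concrete GGUF tensor name from a printf-style template plus block indices. A minimal sketch of that expansion, assuming a template layout like the one below (the real TN_MVLM_PROJ_BLOCK constant is defined elsewhere in the clip sources and may differ):

// Sketch only: shows how string_format() turns a name template plus indices
// into a concrete tensor name; the template string here is an assumption.
#include <cstdio>

int main() {
    const char * tn_mvlm_proj_block = "mm.model.mb_block.%d.block.%d.%s"; // assumed layout
    char name[128];
    std::snprintf(name, sizeof(name), tn_mvlm_proj_block, 2, 1, "fc1.weight");
    std::printf("%s\n", name); // e.g. mm.model.mb_block.2.block.1.fc1.weight
    return 0;
}
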
  case PROJECTOR_TYPE_LDPV2:
2388
2452
  {
2389
2453
  // MobileVLM_V2 projection
2390
- vision_model.mm_model_mlp_0_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 0, "weight"));
2391
- vision_model.mm_model_mlp_0_b = get_tensor(string_format(TN_MVLM_PROJ_MLP, 0, "bias"));
2392
- vision_model.mm_model_mlp_2_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 2, "weight"));
2393
- vision_model.mm_model_mlp_2_b = get_tensor(string_format(TN_MVLM_PROJ_MLP, 2, "bias"));
2394
- vision_model.mm_model_peg_0_w = get_tensor(string_format(TN_MVLM_PROJ_PEG, 0, "weight"));
2395
- vision_model.mm_model_peg_0_b = get_tensor(string_format(TN_MVLM_PROJ_PEG, 0, "bias"));
2454
+ model.mm_model_mlp_0_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 0, "weight"));
2455
+ model.mm_model_mlp_0_b = get_tensor(string_format(TN_MVLM_PROJ_MLP, 0, "bias"));
2456
+ model.mm_model_mlp_2_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 2, "weight"));
2457
+ model.mm_model_mlp_2_b = get_tensor(string_format(TN_MVLM_PROJ_MLP, 2, "bias"));
2458
+ model.mm_model_peg_0_w = get_tensor(string_format(TN_MVLM_PROJ_PEG, 0, "weight"));
2459
+ model.mm_model_peg_0_b = get_tensor(string_format(TN_MVLM_PROJ_PEG, 0, "bias"));
2396
2460
  } break;
2397
2461
  case PROJECTOR_TYPE_MINICPMV:
2398
2462
  {
2399
- // vision_model.mm_model_pos_embed = get_tensor(new_clip->ctx_data, TN_MINICPMV_POS_EMBD);
2400
- vision_model.mm_model_pos_embed_k = get_tensor(TN_MINICPMV_POS_EMBD_K);
2401
- vision_model.mm_model_query = get_tensor(TN_MINICPMV_QUERY);
2402
- vision_model.mm_model_proj = get_tensor(TN_MINICPMV_PROJ);
2403
- vision_model.mm_model_kv_proj = get_tensor(TN_MINICPMV_KV_PROJ);
2404
- vision_model.mm_model_attn_q_w = get_tensor(string_format(TN_MINICPMV_ATTN, "q", "weight"));
2405
- vision_model.mm_model_attn_k_w = get_tensor(string_format(TN_MINICPMV_ATTN, "k", "weight"));
2406
- vision_model.mm_model_attn_v_w = get_tensor(string_format(TN_MINICPMV_ATTN, "v", "weight"));
2407
- vision_model.mm_model_attn_q_b = get_tensor(string_format(TN_MINICPMV_ATTN, "q", "bias"));
2408
- vision_model.mm_model_attn_k_b = get_tensor(string_format(TN_MINICPMV_ATTN, "k", "bias"));
2409
- vision_model.mm_model_attn_v_b = get_tensor(string_format(TN_MINICPMV_ATTN, "v", "bias"));
2410
- vision_model.mm_model_attn_o_w = get_tensor(string_format(TN_MINICPMV_ATTN, "out", "weight"));
2411
- vision_model.mm_model_attn_o_b = get_tensor(string_format(TN_MINICPMV_ATTN, "out", "bias"));
2412
- vision_model.mm_model_ln_q_w = get_tensor(string_format(TN_MINICPMV_LN, "q", "weight"));
2413
- vision_model.mm_model_ln_q_b = get_tensor(string_format(TN_MINICPMV_LN, "q", "bias"));
2414
- vision_model.mm_model_ln_kv_w = get_tensor(string_format(TN_MINICPMV_LN, "kv", "weight"));
2415
- vision_model.mm_model_ln_kv_b = get_tensor(string_format(TN_MINICPMV_LN, "kv", "bias"));
2416
- vision_model.mm_model_ln_post_w = get_tensor(string_format(TN_MINICPMV_LN, "post", "weight"));
2417
- vision_model.mm_model_ln_post_b = get_tensor(string_format(TN_MINICPMV_LN, "post", "bias"));
2463
+ // model.mm_model_pos_embed = get_tensor(new_clip->ctx_data, TN_MINICPMV_POS_EMBD);
2464
+ model.mm_model_pos_embed_k = get_tensor(TN_MINICPMV_POS_EMBD_K);
2465
+ model.mm_model_query = get_tensor(TN_MINICPMV_QUERY);
2466
+ model.mm_model_proj = get_tensor(TN_MINICPMV_PROJ);
2467
+ model.mm_model_kv_proj = get_tensor(TN_MINICPMV_KV_PROJ);
2468
+ model.mm_model_attn_q_w = get_tensor(string_format(TN_MINICPMV_ATTN, "q", "weight"));
2469
+ model.mm_model_attn_k_w = get_tensor(string_format(TN_MINICPMV_ATTN, "k", "weight"));
2470
+ model.mm_model_attn_v_w = get_tensor(string_format(TN_MINICPMV_ATTN, "v", "weight"));
2471
+ model.mm_model_attn_q_b = get_tensor(string_format(TN_MINICPMV_ATTN, "q", "bias"));
2472
+ model.mm_model_attn_k_b = get_tensor(string_format(TN_MINICPMV_ATTN, "k", "bias"));
2473
+ model.mm_model_attn_v_b = get_tensor(string_format(TN_MINICPMV_ATTN, "v", "bias"));
2474
+ model.mm_model_attn_o_w = get_tensor(string_format(TN_MINICPMV_ATTN, "out", "weight"));
2475
+ model.mm_model_attn_o_b = get_tensor(string_format(TN_MINICPMV_ATTN, "out", "bias"));
2476
+ model.mm_model_ln_q_w = get_tensor(string_format(TN_MINICPMV_LN, "q", "weight"));
2477
+ model.mm_model_ln_q_b = get_tensor(string_format(TN_MINICPMV_LN, "q", "bias"));
2478
+ model.mm_model_ln_kv_w = get_tensor(string_format(TN_MINICPMV_LN, "kv", "weight"));
2479
+ model.mm_model_ln_kv_b = get_tensor(string_format(TN_MINICPMV_LN, "kv", "bias"));
2480
+ model.mm_model_ln_post_w = get_tensor(string_format(TN_MINICPMV_LN, "post", "weight"));
2481
+ model.mm_model_ln_post_b = get_tensor(string_format(TN_MINICPMV_LN, "post", "bias"));
2418
2482
  } break;
2419
2483
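
These MiniCPM-V tensors feed a resampler: a fixed bank of learned queries (mm_model_query) cross-attends over the encoder patches, so the output length equals the query count regardless of image size. A shape-level sketch, where the patch count is an illustrative assumption and the query counts follow the per-version values used later in clip_n_output_tokens():

// Shape-level sketch of the MiniCPM-V resampler fed by the tensors above.
#include <cstdio>

int main() {
    const int n_patches   = 1024;              // encoder output length (assumed)
    const int n_query_v2  = 96;                // minicpmv_version == 2
    const int n_query_v34 = 64;                // minicpmv_version == 3 or 4
    std::printf("v2:  %d patches -> %d tokens\n", n_patches, n_query_v2);
    std::printf("v3+: %d patches -> %d tokens\n", n_patches, n_query_v34);
    return 0;
}
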
  case PROJECTOR_TYPE_GLM_EDGE:
2420
2484
  {
2421
- vision_model.mm_model_adapter_conv_w = get_tensor(string_format(TN_GLM_ADAPER_CONV, "weight"));
2422
- vision_model.mm_model_adapter_conv_b = get_tensor(string_format(TN_GLM_ADAPER_CONV, "bias"));
2423
- vision_model.mm_model_mlp_0_w = get_tensor(string_format(TN_GLM_ADAPTER_LINEAR, "weight"));
2424
- vision_model.mm_model_ln_q_w = get_tensor(string_format(TN_GLM_ADAPTER_NORM_1, "weight"));
2425
- vision_model.mm_model_ln_q_b = get_tensor(string_format(TN_GLM_ADAPTER_NORM_1, "bias"));
2426
- vision_model.mm_model_mlp_1_w = get_tensor(string_format(TN_GLM_ADAPTER_D_H_2_4H, "weight"));
2427
- vision_model.mm_model_mlp_2_w = get_tensor(string_format(TN_GLM_ADAPTER_GATE, "weight"));
2428
- vision_model.mm_model_mlp_3_w = get_tensor(string_format(TN_GLM_ADAPTER_D_4H_2_H, "weight"));
2429
- vision_model.mm_glm_tok_boi = get_tensor(string_format(TN_TOK_GLM_BOI, "weight"));
2430
- vision_model.mm_glm_tok_eoi = get_tensor(string_format(TN_TOK_GLM_EOI, "weight"));
2485
+ model.mm_model_adapter_conv_w = get_tensor(string_format(TN_GLM_ADAPER_CONV, "weight"));
2486
+ model.mm_model_adapter_conv_b = get_tensor(string_format(TN_GLM_ADAPER_CONV, "bias"));
2487
+ model.mm_model_mlp_0_w = get_tensor(string_format(TN_GLM_ADAPTER_LINEAR, "weight"));
2488
+ model.mm_model_ln_q_w = get_tensor(string_format(TN_GLM_ADAPTER_NORM_1, "weight"));
2489
+ model.mm_model_ln_q_b = get_tensor(string_format(TN_GLM_ADAPTER_NORM_1, "bias"));
2490
+ model.mm_model_mlp_1_w = get_tensor(string_format(TN_GLM_ADAPTER_D_H_2_4H, "weight"));
2491
+ model.mm_model_mlp_2_w = get_tensor(string_format(TN_GLM_ADAPTER_GATE, "weight"));
2492
+ model.mm_model_mlp_3_w = get_tensor(string_format(TN_GLM_ADAPTER_D_4H_2_H, "weight"));
2493
+ model.mm_glm_tok_boi = get_tensor(string_format(TN_TOK_GLM_BOI, "weight"));
2494
+ model.mm_glm_tok_eoi = get_tensor(string_format(TN_TOK_GLM_EOI, "weight"));
2431
2495
  } break;
2432
2496
  case PROJECTOR_TYPE_QWEN2VL:
2433
2497
  case PROJECTOR_TYPE_QWEN25VL:
2434
2498
  {
2435
- vision_model.mm_0_w = get_tensor(string_format(TN_LLAVA_PROJ, 0, "weight"));
2436
- vision_model.mm_0_b = get_tensor(string_format(TN_LLAVA_PROJ, 0, "bias"));
2437
- vision_model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight"));
2438
- vision_model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"));
2499
+ model.mm_0_w = get_tensor(string_format(TN_LLAVA_PROJ, 0, "weight"));
2500
+ model.mm_0_b = get_tensor(string_format(TN_LLAVA_PROJ, 0, "bias"));
2501
+ model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight"));
2502
+ model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"));
2439
2503
  } break;
2440
2504
  case PROJECTOR_TYPE_GEMMA3:
2441
2505
  {
2442
- vision_model.mm_input_proj_w = get_tensor(TN_MM_INP_PROJ);
2443
- vision_model.mm_soft_emb_norm_w = get_tensor(TN_MM_SOFT_EMB_N);
2506
+ model.mm_input_proj_w = get_tensor(TN_MM_INP_PROJ);
2507
+ model.mm_soft_emb_norm_w = get_tensor(TN_MM_SOFT_EMB_N);
2444
2508
  } break;
2445
2509
  case PROJECTOR_TYPE_IDEFICS3:
2446
2510
  {
2447
- vision_model.projection = get_tensor(TN_MM_PROJECTOR);
2511
+ model.projection = get_tensor(TN_MM_PROJECTOR);
2448
2512
  } break;
2449
2513
  case PROJECTOR_TYPE_PIXTRAL:
2450
2514
  {
2451
- vision_model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 1, "weight"));
2452
- vision_model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 1, "bias"), false);
2453
- vision_model.mm_2_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight"));
2454
- vision_model.mm_2_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"), false);
2515
+ model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 1, "weight"));
2516
+ model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 1, "bias"), false);
2517
+ model.mm_2_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight"));
2518
+ model.mm_2_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"), false);
2455
2519
  // [IMG_BREAK] token embedding
2456
- vision_model.token_embd_img_break = get_tensor(TN_TOK_IMG_BREAK);
2520
+ model.token_embd_img_break = get_tensor(TN_TOK_IMG_BREAK);
2457
2521
  // for mistral small 3.1
2458
- vision_model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM, false);
2459
- vision_model.mm_patch_merger_w = get_tensor(TN_MM_PATCH_MERGER, false);
2522
+ model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM, false);
2523
+ model.mm_patch_merger_w = get_tensor(TN_MM_PATCH_MERGER, false);
2460
2524
  } break;
2461
2525
  case PROJECTOR_TYPE_ULTRAVOX:
2462
2526
  {
2463
- vision_model.conv1d_1_w = get_tensor(string_format(TN_CONV1D, 1, "weight"));
2464
- vision_model.conv1d_1_b = get_tensor(string_format(TN_CONV1D, 1, "bias"));
2465
- vision_model.conv1d_2_w = get_tensor(string_format(TN_CONV1D, 2, "weight"));
2466
- vision_model.conv1d_2_b = get_tensor(string_format(TN_CONV1D, 2, "bias"));
2467
- vision_model.mm_1_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "weight"));
2468
- vision_model.mm_2_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 2, "weight"));
2469
- vision_model.mm_norm_pre_w = get_tensor(string_format(TN_MM_NORM_PRE, "weight"));
2470
- vision_model.mm_norm_mid_w = get_tensor(string_format(TN_MM_NORM_MID, "weight"));
2527
+ model.conv1d_1_w = get_tensor(string_format(TN_CONV1D, 1, "weight"));
2528
+ model.conv1d_1_b = get_tensor(string_format(TN_CONV1D, 1, "bias"));
2529
+ model.conv1d_2_w = get_tensor(string_format(TN_CONV1D, 2, "weight"));
2530
+ model.conv1d_2_b = get_tensor(string_format(TN_CONV1D, 2, "bias"));
2531
+ model.mm_1_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "weight"));
2532
+ model.mm_2_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 2, "weight"));
2533
+ model.mm_norm_pre_w = get_tensor(string_format(TN_MM_NORM_PRE, "weight"));
2534
+ model.mm_norm_mid_w = get_tensor(string_format(TN_MM_NORM_MID, "weight"));
2535
+ } break;
2536
+ case PROJECTOR_TYPE_QWEN2A:
2537
+ {
2538
+ model.conv1d_1_w = get_tensor(string_format(TN_CONV1D, 1, "weight"));
2539
+ model.conv1d_1_b = get_tensor(string_format(TN_CONV1D, 1, "bias"));
2540
+ model.conv1d_2_w = get_tensor(string_format(TN_CONV1D, 2, "weight"));
2541
+ model.conv1d_2_b = get_tensor(string_format(TN_CONV1D, 2, "bias"));
2542
+ model.mm_fc_w = get_tensor(string_format(TN_MM_AUDIO_FC, "weight"));
2543
+ model.mm_fc_b = get_tensor(string_format(TN_MM_AUDIO_FC, "bias"));
2471
2544
  } break;
2472
2545
  case PROJECTOR_TYPE_INTERNVL:
2473
2546
  {
2474
- vision_model.mm_0_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 0, "weight"));
2475
- vision_model.mm_0_b = get_tensor(string_format(TN_MVLM_PROJ_MLP, 0, "bias"));
2476
- vision_model.mm_1_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 1, "weight"));
2477
- vision_model.mm_1_b = get_tensor(string_format(TN_MVLM_PROJ_MLP, 1, "bias"));
2478
- vision_model.mm_3_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 3, "weight"));
2479
- vision_model.mm_3_b = get_tensor(string_format(TN_MVLM_PROJ_MLP, 3, "bias"));
2547
+ model.mm_0_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 0, "weight"));
2548
+ model.mm_0_b = get_tensor(string_format(TN_MVLM_PROJ_MLP, 0, "bias"));
2549
+ model.mm_1_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 1, "weight"));
2550
+ model.mm_1_b = get_tensor(string_format(TN_MVLM_PROJ_MLP, 1, "bias"));
2551
+ model.mm_3_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 3, "weight"));
2552
+ model.mm_3_b = get_tensor(string_format(TN_MVLM_PROJ_MLP, 3, "bias"));
2480
2553
  } break;
2481
2554
  case PROJECTOR_TYPE_LLAMA4:
2482
2555
  {
2483
- vision_model.mm_model_proj = get_tensor(TN_MM_PROJECTOR);
2484
- vision_model.mm_model_mlp_1_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 1, "weight"));
2485
- vision_model.mm_model_mlp_2_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 2, "weight"));
2556
+ model.mm_model_proj = get_tensor(TN_MM_PROJECTOR);
2557
+ model.mm_model_mlp_1_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 1, "weight"));
2558
+ model.mm_model_mlp_2_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 2, "weight"));
2486
2559
  } break;
2487
2560
  default:
2488
2561
  LM_GGML_ASSERT(false && "unknown projector type");
@@ -2575,21 +2648,20 @@ struct clip_model_loader {
2575
2648
  }
2576
2649
  }
2577
2650
 
2578
- void alloc_compute_meta() {
2579
- const auto & hparams = ctx_clip.vision_model.hparams;
2651
+ void alloc_compute_meta(clip_ctx & ctx_clip) {
2652
+ const auto & hparams = ctx_clip.model.hparams;
2580
2653
  ctx_clip.buf_compute_meta.resize(ctx_clip.max_nodes * lm_ggml_tensor_overhead() + lm_ggml_graph_overhead());
2581
2654
 
2582
2655
  // create a fake batch
2583
2656
  clip_image_f32_batch batch;
2584
2657
  clip_image_f32_ptr img(clip_image_f32_init());
2585
- if (hparams.has_vision) {
2658
+ if (ctx_clip.model.modality == CLIP_MODALITY_VISION) {
2586
2659
  img->nx = hparams.warmup_image_size;
2587
2660
  img->ny = hparams.warmup_image_size;
2588
2661
  } else {
2589
- img->nx = 1024; // TODO @ngxson : use a better default
2662
+ img->nx = hparams.warmup_audio_size;
2590
2663
  img->ny = hparams.n_mel_bins;
2591
2664
  }
2592
- img->buf.resize(img->nx * img->ny * 3);
2593
2665
  batch.entries.push_back(std::move(img));
2594
2666
 
2595
2667
  lm_ggml_cgraph * gf = clip_image_build_graph(&ctx_clip, batch);
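
The fake batch above exists only to size buf_compute_meta against a representative maximum-size input before any real image or audio arrives. A small sketch of the per-modality warm-up shapes, with all hparams values as illustrative assumptions:

// Sketch of the warm-up input dimensions chosen per modality above.
#include <cstdio>

int main() {
    const bool is_vision        = true;  // CLIP_MODALITY_VISION vs CLIP_MODALITY_AUDIO
    const int warmup_image_size = 1024;  // assumed hparams.warmup_image_size
    const int warmup_audio_size = 3000;  // assumed hparams.warmup_audio_size (mel frames)
    const int n_mel_bins        = 128;   // assumed hparams.n_mel_bins

    const int nx = is_vision ? warmup_image_size : warmup_audio_size;
    const int ny = is_vision ? warmup_image_size : n_mel_bins;
    std::printf("warm-up input: %d x %d\n", nx, ny);
    return 0;
}
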
@@ -2665,25 +2737,57 @@ struct clip_model_loader {
2665
2737
  output[i] = values[i];
2666
2738
  }
2667
2739
  }
2740
+
2741
+ void set_llava_uhd_res_candidates(clip_model & model, const int max_patches_per_side) {
2742
+ auto & hparams = model.hparams;
2743
+ for (int x = 1; x <= max_patches_per_side; x++) {
2744
+ for (int y = 1; y <= max_patches_per_side; y++) {
2745
+ if (x == 1 && y == 1) {
2746
+ continue; // skip the first point
2747
+ }
2748
+ hparams.image_res_candidates.push_back(clip_image_size{
2749
+ x*hparams.image_size,
2750
+ y*hparams.image_size,
2751
+ });
2752
+ }
2753
+ }
2754
+ }
2668
2755
  };
2669
2756
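
A worked run of set_llava_uhd_res_candidates() above, assuming image_size = 336 and max_patches_per_side = 3 (both values are illustrative, not read from a real model):

// Reproduces the candidate grid loop from the loader above.
#include <cstdio>

int main() {
    const int image_size = 336;
    const int max_patches_per_side = 3;
    for (int x = 1; x <= max_patches_per_side; x++) {
        for (int y = 1; y <= max_patches_per_side; y++) {
            if (x == 1 && y == 1) continue; // the 1x1 grid is skipped, as in the loader
            std::printf("candidate: %d x %d\n", x * image_size, y * image_size);
        }
    }
    return 0; // prints 8 candidates, from 336x672 up to 1008x1008
}
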
 
2670
- struct clip_ctx * clip_init(const char * fname, struct clip_context_params ctx_params) {
2757
+ struct clip_init_result clip_init(const char * fname, struct clip_context_params ctx_params) {
2671
2758
  g_logger_state.verbosity_thold = ctx_params.verbosity;
2672
- clip_ctx * ctx_clip = nullptr;
2759
+ clip_ctx * ctx_vision = nullptr;
2760
+ clip_ctx * ctx_audio = nullptr;
2673
2761
 
2674
2762
  try {
2675
- ctx_clip = new clip_ctx(ctx_params);
2676
- clip_model_loader loader(fname, *ctx_clip);
2677
- loader.load_hparams();
2678
- loader.load_tensors();
2679
- loader.alloc_compute_meta();
2763
+ clip_model_loader loader(fname);
2764
+
2765
+ if (loader.has_vision) {
2766
+ ctx_vision = new clip_ctx(ctx_params);
2767
+ loader.load_hparams(ctx_vision->model, CLIP_MODALITY_VISION);
2768
+ loader.load_tensors(*ctx_vision);
2769
+ loader.alloc_compute_meta(*ctx_vision);
2770
+ }
2771
+
2772
+ if (loader.has_audio) {
2773
+ ctx_audio = new clip_ctx(ctx_params);
2774
+ loader.load_hparams(ctx_audio->model, CLIP_MODALITY_AUDIO);
2775
+ loader.load_tensors(*ctx_audio);
2776
+ loader.alloc_compute_meta(*ctx_audio);
2777
+ }
2778
+
2680
2779
  } catch (const std::exception & e) {
2681
2780
  LOG_ERR("%s: failed to load model '%s': %s\n", __func__, fname, e.what());
2682
- delete ctx_clip;
2683
- return nullptr;
2781
+ if (ctx_vision) {
2782
+ delete ctx_vision;
2783
+ }
2784
+ if (ctx_audio) {
2785
+ delete ctx_audio;
2786
+ }
2787
+ return {nullptr, nullptr};
2684
2788
  }
2685
2789
 
2686
- return ctx_clip;
2790
+ return {ctx_vision, ctx_audio};
2687
2791
  }
2688
2792
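
Caller-side sketch for the new clip_init() return shape above: one projector file can now yield a vision context, an audio context, or both, and failure returns a pair of null pointers. The field names ctx_v / ctx_a and the stubbed types are assumptions made so the sketch stands alone; the real declarations live in the clip header:

// Self-contained sketch; clip_init_stub() stands in for the real clip_init().
#include <cstdio>

struct clip_ctx { int dummy; };                               // stand-in for the opaque type
struct clip_init_result { clip_ctx * ctx_v; clip_ctx * ctx_a; };
struct clip_context_params { bool use_gpu; int verbosity; };  // simplified

static clip_init_result clip_init_stub(const char * fname, clip_context_params /*params*/) {
    std::printf("loading %s\n", fname);
    return { new clip_ctx{}, nullptr };                       // pretend: vision only
}

int main() {
    clip_init_result res = clip_init_stub("mmproj.gguf", { /*use_gpu*/ true, /*verbosity*/ 1 });
    if (!res.ctx_v && !res.ctx_a) {
        std::fprintf(stderr, "failed to load projector\n");
        return 1;
    }
    if (res.ctx_v) { std::printf("got a vision encoder\n"); delete res.ctx_v; }
    if (res.ctx_a) { std::printf("got an audio encoder\n");  delete res.ctx_a; }
    return 0;
}
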
 
2689
2793
  struct clip_image_size * clip_image_size_init() {
@@ -2757,30 +2861,6 @@ void clip_build_img_from_pixels(const unsigned char * rgb_pixels, int nx, int ny
2757
2861
  memcpy(img->buf.data(), rgb_pixels, img->buf.size());
2758
2862
  }
2759
2863
 
2760
- bool clip_image_load_from_file(const char * fname, clip_image_u8 * img) {
2761
- int nx, ny, nc;
2762
- auto * data = stbi_load(fname, &nx, &ny, &nc, 3);
2763
- if (!data) {
2764
- LOG_ERR("%s: failed to load image '%s'\n", __func__, fname);
2765
- return false;
2766
- }
2767
- clip_build_img_from_pixels(data, nx, ny, img);
2768
- stbi_image_free(data);
2769
- return true;
2770
- }
2771
-
2772
- bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img) {
2773
- int nx, ny, nc;
2774
- auto * data = stbi_load_from_memory(bytes, bytes_length, &nx, &ny, &nc, 3);
2775
- if (!data) {
2776
- LOG_ERR("%s: failed to decode image bytes\n", __func__);
2777
- return false;
2778
- }
2779
- clip_build_img_from_pixels(data, nx, ny, img);
2780
- stbi_image_free(data);
2781
- return true;
2782
- }
2783
-
2784
2864
  // Normalize image to float32 - careful with pytorch .to(model.device, dtype=torch.float16) - this sometimes reduces precision (32>16>32), sometimes not
2785
2865
  static void normalize_image_u8_to_f32(const clip_image_u8 & src, clip_image_f32 & dst, const float mean[3], const float std[3]) {
2786
2866
  dst.nx = src.nx;
@@ -3026,36 +3106,41 @@ struct llava_uhd {
3026
3106
  bool padding_refined = false; // if true, refine image will be padded to the grid size (e.g. llava-1.6)
3027
3107
  };
3028
3108
 
3029
- static int get_max_slices(struct clip_ctx * ctx) {
3030
- if (clip_is_minicpmv(ctx)) {
3031
- return 9;
3032
- }
3033
- return 0;
3034
- }
3035
-
3036
3109
  static slice_instructions get_slice_instructions(struct clip_ctx * ctx, const clip_image_size & original_size) {
3037
3110
  slice_instructions res;
3038
3111
  const int patch_size = clip_get_patch_size(ctx);
3039
3112
  const int slice_size = clip_get_image_size(ctx);
3040
- const int max_slice_nums = get_max_slices(ctx);
3041
3113
  const int original_width = original_size.width;
3042
3114
  const int original_height = original_size.height;
3043
- const float log_ratio = log((float)original_width / original_height);
3044
- const float ratio = (float)original_width * original_height / (slice_size * slice_size);
3045
- const int multiple = fmin(ceil(ratio), max_slice_nums);
3046
- const bool has_slices = (multiple > 1);
3047
- const bool has_pinpoints = !ctx->vision_model.hparams.image_grid_pinpoints.empty();
3115
+
3116
+ const bool has_slices = original_size.width > slice_size || original_size.height > slice_size;
3117
+ const bool has_pinpoints = !ctx->model.hparams.image_res_candidates.empty();
3118
+
3119
+ if (!has_slices) {
3120
+ // skip slicing logic
3121
+ res.overview_size = clip_image_size{slice_size, slice_size};
3122
+ res.refined_size = clip_image_size{0, 0};
3123
+ res.grid_size = clip_image_size{0, 0};
3124
+
3125
+ return res;
3126
+ }
3048
3127
 
3049
3128
  if (has_pinpoints) {
3050
3129
  // has pinpoints, use them to calculate the grid size (e.g. llava-1.6)
3051
3130
  auto refine_size = llava_uhd::select_best_resolution(
3052
- ctx->vision_model.hparams.image_grid_pinpoints,
3053
- original_size);
3131
+ original_size,
3132
+ ctx->model.hparams.image_res_candidates);
3054
3133
  res.overview_size = clip_image_size{slice_size, slice_size};
3055
3134
  res.refined_size = refine_size;
3056
3135
  res.grid_size = clip_image_size{0, 0};
3057
3136
  res.padding_refined = true;
3058
3137
 
3138
+ LOG_DBG("%s: using pinpoints for slicing\n", __func__);
3139
+ LOG_DBG("%s: original size: %d x %d, overview size: %d x %d, refined size: %d x %d\n",
3140
+ __func__, original_width, original_height,
3141
+ res.overview_size.width, res.overview_size.height,
3142
+ res.refined_size.width, res.refined_size.height);
3143
+
3059
3144
  for (int y = 0; y < refine_size.height; y += slice_size) {
3060
3145
  for (int x = 0; x < refine_size.width; x += slice_size) {
3061
3146
  slice_coordinates slice;
@@ -3064,13 +3149,16 @@ struct llava_uhd {
3064
3149
  slice.size.width = std::min(slice_size, refine_size.width - x);
3065
3150
  slice.size.height = std::min(slice_size, refine_size.height - y);
3066
3151
  res.slices.push_back(slice);
3067
- if (x == 0) {
3068
- res.grid_size.width++;
3069
- }
3152
+ LOG_DBG("%s: slice %d: x=%d, y=%d, size=%dx%d\n",
3153
+ __func__, (int)res.slices.size() - 1,
3154
+ slice.x, slice.y, slice.size.width, slice.size.height);
3070
3155
  }
3071
- res.grid_size.height++;
3072
3156
  }
3073
3157
 
3158
+ res.grid_size.height = refine_size.height / slice_size;
3159
+ res.grid_size.width = refine_size.width / slice_size;
3160
+ LOG_DBG("%s: grid size: %d x %d\n", __func__, res.grid_size.width, res.grid_size.height);
3161
+
3074
3162
  return res;
3075
3163
  }
3076
3164
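
A worked run of the slicing loop above for the pinpoint path, assuming slice_size = 336 and a refined size of 672x1008 (illustrative values): the loop emits 2x3 = 6 slices and the grid ends up 2 wide by 3 tall:

// Mirrors the slice/grid computation shown above.
#include <algorithm>
#include <cstdio>

int main() {
    const int slice_size = 336;
    const int refined_w = 672, refined_h = 1008;
    int n_slices = 0;
    for (int y = 0; y < refined_h; y += slice_size) {
        for (int x = 0; x < refined_w; x += slice_size) {
            int w = std::min(slice_size, refined_w - x);
            int h = std::min(slice_size, refined_h - y);
            std::printf("slice %d: x=%d y=%d size=%dx%d\n", n_slices++, x, y, w, h);
        }
    }
    std::printf("grid: %d x %d\n", refined_w / slice_size, refined_h / slice_size); // 2 x 3
    return 0;
}
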
 
@@ -3079,17 +3167,23 @@ struct llava_uhd {
3079
3167
  auto best_size = get_best_resize(original_size, slice_size, patch_size, !has_slices);
3080
3168
  res.overview_size = best_size;
3081
3169
 
3082
- if (!has_slices) {
3083
- // skip slicing logic
3084
- res.refined_size = clip_image_size{0, 0};
3085
- res.grid_size = clip_image_size{0, 0};
3170
+ {
3171
+ const int max_slice_nums = 9; // TODO: this is only used by minicpmv, maybe remove it
3172
+ const float log_ratio = log((float)original_width / original_height);
3173
+ const float ratio = (float)original_width * original_height / (slice_size * slice_size);
3174
+ const int multiple = fmin(ceil(ratio), max_slice_nums);
3086
3175
 
3087
- } else {
3088
3176
  auto best_grid = get_best_grid(max_slice_nums, multiple, log_ratio);
3089
3177
  auto refine_size = get_refine_size(original_size, best_grid, slice_size, patch_size, true);
3090
3178
  res.grid_size = best_grid;
3091
3179
  res.refined_size = refine_size;
3092
3180
 
3181
+ LOG_DBG("%s: original size: %d x %d, overview size: %d x %d, refined size: %d x %d, grid size: %d x %d\n",
3182
+ __func__, original_width, original_height,
3183
+ res.overview_size.width, res.overview_size.height,
3184
+ res.refined_size.width, res.refined_size.height,
3185
+ res.grid_size.width, res.grid_size.height);
3186
+
3093
3187
  int width = refine_size.width;
3094
3188
  int height = refine_size.height;
3095
3189
  int grid_x = int(width / best_grid.width);
@@ -3106,7 +3200,9 @@ struct llava_uhd {
3106
3200
  slice.size.width = grid_x;
3107
3201
  slice.size.height = grid_y;
3108
3202
  res.slices.push_back(slice);
3109
- // LOG_INF("slice %d: %d %d %d %d\n", ic, patches_i, patches_j, grid_x, grid_y);
3203
+ LOG_DBG("%s: slice %d: x=%d, y=%d, size=%dx%d\n",
3204
+ __func__, (int)res.slices.size() - 1,
3205
+ slice.x, slice.y, slice.size.width, slice.size.height);
3110
3206
  }
3111
3207
  }
3112
3208
  }
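
The arithmetic behind the minicpmv-style branch above, with the input dimensions as illustrative assumptions: the candidate slice count is driven by how much larger the image area is than one slice, capped at 9:

// Reproduces the ratio / multiple / log_ratio computation shown above.
#include <cmath>
#include <cstdio>

int main() {
    const int slice_size = 448;                  // assumed hparams.image_size
    const int orig_w = 1344, orig_h = 1008;      // assumed input image
    const int max_slice_nums = 9;
    const float ratio     = (float)orig_w * orig_h / (slice_size * slice_size);  // 6.75
    const int   multiple  = (int)std::fmin(std::ceil(ratio), max_slice_nums);    // 7
    const float log_ratio = std::log((float)orig_w / orig_h);                    // > 0: wider than tall
    std::printf("ratio=%.2f multiple=%d log_ratio=%.2f\n", ratio, multiple, log_ratio);
    return 0; // get_best_grid() then chooses a grid close to `multiple` slices
}
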
@@ -3164,48 +3260,55 @@ private:
3164
3260
  return res;
3165
3261
  }
3166
3262
 
3263
+ static clip_image_size resize_maintain_aspect_ratio(const clip_image_size & orig, const clip_image_size & target_max) {
3264
+ float scale_width = static_cast<float>(target_max.width) / orig.width;
3265
+ float scale_height = static_cast<float>(target_max.height) / orig.height;
3266
+ float scale = std::min(scale_width, scale_height);
3267
+ return clip_image_size{
3268
+ static_cast<int>(orig.width * scale),
3269
+ static_cast<int>(orig.height * scale),
3270
+ };
3271
+ }
3272
+
3167
3273
  /**
3168
3274
  * Selects the best resolution from a list of possible resolutions based on the original size.
3169
3275
  *
3276
+ * For example, when given a list of resolutions:
3277
+ * - 100x100
3278
+ * - 200x100
3279
+ * - 100x200
3280
+ * - 200x200
3281
+ *
3282
+ * And an input image of size 111x200, then 100x200 is the best fit (least wasted resolution).
3283
+ *
3170
3284
  * @param original_size The original size of the image
3171
3285
  * @param possible_resolutions A list of possible resolutions
3172
3286
  * @return The best fit resolution
3173
3287
  */
3174
3288
  static clip_image_size select_best_resolution(const clip_image_size & original_size, const std::vector<clip_image_size> & possible_resolutions) {
3175
- int original_width = original_size.width;
3176
- int original_height = original_size.height;
3177
3289
  clip_image_size best_fit;
3290
+ int min_wasted_area = std::numeric_limits<int>::max();
3178
3291
  int max_effective_resolution = 0;
3179
- int min_wasted_resolution = std::numeric_limits<int>::max();
3180
-
3181
- for (const auto & resolution : possible_resolutions) {
3182
- int width = resolution.width;
3183
- int height = resolution.height;
3184
- float scale = std::min(static_cast<float>(width) / original_width, static_cast<float>(height) / original_height);
3185
- int downscaled_width = static_cast<int>(original_width * scale);
3186
- int downscaled_height = static_cast<int>(original_height * scale);
3187
- int effective_resolution = std::min(downscaled_width * downscaled_height, original_width * original_height);
3188
- int wasted_resolution = (width * height) - effective_resolution;
3189
- // LOG_INF("resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution);
3190
- if (effective_resolution > max_effective_resolution || (effective_resolution == max_effective_resolution && wasted_resolution < min_wasted_resolution)) {
3292
+
3293
+ for (const clip_image_size & candidate : possible_resolutions) {
3294
+ auto target_size = resize_maintain_aspect_ratio(original_size, candidate);
3295
+ int effective_resolution = std::min(
3296
+ target_size.width * target_size.height,
3297
+ original_size.width * original_size.height);
3298
+ int wasted_area = (candidate.width * candidate.height) - effective_resolution;
3299
+
3300
+ if (effective_resolution > max_effective_resolution || (effective_resolution == max_effective_resolution && wasted_area < min_wasted_area)) {
3191
3301
  max_effective_resolution = effective_resolution;
3192
- min_wasted_resolution = wasted_resolution;
3193
- best_fit = resolution;
3302
+ min_wasted_area = wasted_area;
3303
+ best_fit = candidate;
3194
3304
  }
3305
+
3306
+ LOG_DBG("%s: candidate: %d x %d, target: %d x %d, wasted: %d, effective: %d\n", __func__, candidate.width, candidate.height, target_size.width, target_size.height, wasted_area, effective_resolution);
3195
3307
  }
3196
3308
 
3197
3309
  return best_fit;
3198
3310
  }
3199
3311
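
A standalone rerun of the selection logic above with numbers chosen so both criteria come into play: for an original of 150x300, the 100x200 and 200x200 candidates tie on effective resolution, and the wasted-area tiebreak picks 100x200. The candidate list and input size are illustrative assumptions:

// Mirrors resize_maintain_aspect_ratio() + select_best_resolution() shown above.
#include <algorithm>
#include <cstdio>
#include <limits>
#include <vector>

struct size2d { int width, height; };

static size2d fit_within(const size2d & orig, const size2d & target_max) {
    float scale = std::min((float)target_max.width  / orig.width,
                           (float)target_max.height / orig.height);
    return { (int)(orig.width * scale), (int)(orig.height * scale) };
}

int main() {
    const size2d original {150, 300};
    const std::vector<size2d> candidates { {100,100}, {200,100}, {100,200}, {200,200} };

    size2d best {0, 0};
    int best_effective = 0;
    int min_wasted = std::numeric_limits<int>::max();

    for (const auto & c : candidates) {
        size2d scaled  = fit_within(original, c);
        int effective  = std::min(scaled.width * scaled.height, original.width * original.height);
        int wasted     = c.width * c.height - effective;
        if (effective > best_effective || (effective == best_effective && wasted < min_wasted)) {
            best_effective = effective;
            min_wasted     = wasted;
            best           = c;
        }
    }
    std::printf("best fit: %d x %d\n", best.width, best.height); // 100 x 200 (least wasted area among ties)
    return 0;
}
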
 
3200
- // used by llava 1.6 with custom list of pinpoints
3201
- static clip_image_size select_best_resolution(const std::vector<int32_t> & pinpoints, const clip_image_size & original_size) {
3202
- std::vector<clip_image_size> possible_resolutions; // TODO @ngxson : construct this inside hparams, not here
3203
- for (size_t i = 0; i < pinpoints.size(); i += 2) {
3204
- possible_resolutions.push_back(clip_image_size{pinpoints[i], pinpoints[i+1]});
3205
- }
3206
- return select_best_resolution(original_size, possible_resolutions);
3207
- }
3208
-
3209
3312
  static int ensure_divide(int length, int patch_size) {
3210
3313
  return std::max(static_cast<int>(std::round(static_cast<float>(length) / patch_size) * patch_size), patch_size);
3211
3314
  }
@@ -3271,7 +3374,7 @@ private:
3271
3374
  bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, struct clip_image_f32_batch * res_imgs) {
3272
3375
  clip_image_size original_size{img->nx, img->ny};
3273
3376
  bool pad_to_square = true;
3274
- auto & params = ctx->vision_model.hparams;
3377
+ auto & params = ctx->model.hparams;
3275
3378
  // The model config actually contains all we need to decide on how to preprocess, here we automatically switch to the new llava-1.6 preprocessing
3276
3379
  if (params.mm_patch_merge_type == PATCH_MERGE_SPATIAL_UNPAD) {
3277
3380
  pad_to_square = false;
@@ -3284,7 +3387,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
3284
3387
  for (size_t i = 0; i < imgs.size(); ++i) {
3285
3388
  // clip_image_save_to_bmp(*imgs[i], "slice_" + std::to_string(i) + ".bmp");
3286
3389
  clip_image_f32_ptr res(clip_image_f32_init());
3287
- normalize_image_u8_to_f32(*imgs[i], *res, ctx->image_mean, ctx->image_std);
3390
+ normalize_image_u8_to_f32(*imgs[i], *res, params.image_mean, params.image_std);
3288
3391
  res_imgs->entries.push_back(std::move(res));
3289
3392
  }
3290
3393
 
@@ -3292,7 +3395,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
3292
3395
  res_imgs->grid_y = inst.grid_size.height;
3293
3396
  return true;
3294
3397
 
3295
- } else if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type == PROJECTOR_TYPE_QWEN25VL) {
3398
+ } else if (ctx->proj_type() == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type() == PROJECTOR_TYPE_QWEN25VL) {
3296
3399
  clip_image_u8 resized;
3297
3400
  auto patch_size = params.patch_size * 2;
3298
3401
  auto new_size = image_manipulation::calc_size_preserved_ratio(original_size, patch_size, params.image_size);
@@ -3300,42 +3403,42 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
3300
3403
 
3301
3404
  clip_image_f32_ptr img_f32(clip_image_f32_init());
3302
3405
  // clip_image_f32_ptr res(clip_image_f32_init());
3303
- normalize_image_u8_to_f32(resized, *img_f32, ctx->image_mean, ctx->image_std);
3406
+ normalize_image_u8_to_f32(resized, *img_f32, params.image_mean, params.image_std);
3304
3407
  // res_imgs->data[0] = *res;
3305
3408
  res_imgs->entries.push_back(std::move(img_f32));
3306
3409
  return true;
3307
3410
  }
3308
- else if (ctx->proj_type == PROJECTOR_TYPE_GLM_EDGE
3309
- || ctx->proj_type == PROJECTOR_TYPE_GEMMA3
3310
- || ctx->proj_type == PROJECTOR_TYPE_IDEFICS3
3311
- || ctx->proj_type == PROJECTOR_TYPE_INTERNVL // TODO @ngxson : support dynamic resolution
3411
+ else if (ctx->proj_type() == PROJECTOR_TYPE_GLM_EDGE
3412
+ || ctx->proj_type() == PROJECTOR_TYPE_GEMMA3
3413
+ || ctx->proj_type() == PROJECTOR_TYPE_IDEFICS3
3414
+ || ctx->proj_type() == PROJECTOR_TYPE_INTERNVL // TODO @ngxson : support dynamic resolution
3312
3415
  ) {
3313
3416
  clip_image_u8 resized_image;
3314
3417
  int sz = params.image_size;
3315
3418
  image_manipulation::resize_and_pad_image(*img, resized_image, {sz, sz});
3316
3419
  clip_image_f32_ptr img_f32(clip_image_f32_init());
3317
3420
  //clip_image_save_to_bmp(resized_image, "resized.bmp");
3318
- normalize_image_u8_to_f32(resized_image, *img_f32, ctx->image_mean, ctx->image_std);
3421
+ normalize_image_u8_to_f32(resized_image, *img_f32, params.image_mean, params.image_std);
3319
3422
  res_imgs->entries.push_back(std::move(img_f32));
3320
3423
  return true;
3321
3424
 
3322
- } else if (ctx->proj_type == PROJECTOR_TYPE_PIXTRAL) {
3425
+ } else if (ctx->proj_type() == PROJECTOR_TYPE_PIXTRAL) {
3323
3426
  clip_image_u8 resized_image;
3324
3427
  auto new_size = image_manipulation::calc_size_preserved_ratio(original_size, params.patch_size, params.image_size);
3325
3428
  image_manipulation::bilinear_resize(*img, resized_image, new_size.width, new_size.height);
3326
3429
  clip_image_f32_ptr img_f32(clip_image_f32_init());
3327
- normalize_image_u8_to_f32(resized_image, *img_f32, ctx->image_mean, ctx->image_std);
3430
+ normalize_image_u8_to_f32(resized_image, *img_f32, params.image_mean, params.image_std);
3328
3431
  res_imgs->entries.push_back(std::move(img_f32));
3329
3432
  return true;
3330
3433
 
3331
- } else if (ctx->proj_type == PROJECTOR_TYPE_LLAMA4) {
3332
- LM_GGML_ASSERT(!params.image_grid_pinpoints.empty());
3434
+ } else if (ctx->proj_type() == PROJECTOR_TYPE_LLAMA4) {
3435
+ LM_GGML_ASSERT(!params.image_res_candidates.empty());
3333
3436
  auto const inst = llava_uhd::get_slice_instructions(ctx, original_size);
3334
3437
  std::vector<clip_image_u8_ptr> imgs = llava_uhd::slice_image(img, inst);
3335
3438
 
3336
3439
  for (size_t i = 0; i < imgs.size(); ++i) {
3337
3440
  clip_image_f32_ptr res(clip_image_f32_init());
3338
- normalize_image_u8_to_f32(*imgs[i], *res, ctx->image_mean, ctx->image_std);
3441
+ normalize_image_u8_to_f32(*imgs[i], *res, params.image_mean, params.image_std);
3339
3442
  res_imgs->entries.push_back(std::move(res));
3340
3443
  }
3341
3444
 
@@ -3365,11 +3468,11 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
3365
3468
  image_manipulation::resize_and_pad_image(*img, *temp, clip_image_size{params.image_size, params.image_size}, pad_color);
3366
3469
 
3367
3470
  clip_image_f32_ptr res(clip_image_f32_init());
3368
- normalize_image_u8_to_f32(*temp, *res, ctx->image_mean, ctx->image_std);
3471
+ normalize_image_u8_to_f32(*temp, *res, params.image_mean, params.image_std);
3369
3472
  res_imgs->entries.push_back(std::move(res));
3370
3473
  return true;
3371
3474
 
3372
- } else if (!params.image_grid_pinpoints.empty()) {
3475
+ } else if (!params.image_res_candidates.empty()) {
3373
3476
  // "spatial_unpad" with "anyres" processing for llava-1.6
3374
3477
  auto const inst = llava_uhd::get_slice_instructions(ctx, original_size);
3375
3478
  std::vector<clip_image_u8_ptr> imgs = llava_uhd::slice_image(img, inst);
@@ -3377,7 +3480,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
3377
3480
  for (size_t i = 0; i < imgs.size(); ++i) {
3378
3481
  // clip_image_save_to_bmp(*imgs[i], "slice_" + std::to_string(i) + ".bmp");
3379
3482
  clip_image_f32_ptr res(clip_image_f32_init());
3380
- normalize_image_u8_to_f32(*imgs[i], *res, ctx->image_mean, ctx->image_std);
3483
+ normalize_image_u8_to_f32(*imgs[i], *res, params.image_mean, params.image_std);
3381
3484
  res_imgs->entries.push_back(std::move(res));
3382
3485
  }
3383
3486
 
@@ -3389,7 +3492,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
3389
3492
  }
3390
3493
 
3391
3494
  lm_ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx) {
3392
- return ctx->vision_model.image_newline;
3495
+ return ctx->model.image_newline;
3393
3496
  }
3394
3497
 
3395
3498
  void clip_free(clip_ctx * ctx) {
@@ -3401,8 +3504,8 @@ void clip_free(clip_ctx * ctx) {
3401
3504
 
3402
3505
  // deprecated
3403
3506
  size_t clip_embd_nbytes(const struct clip_ctx * ctx) {
3404
- const int32_t nx = ctx->vision_model.hparams.image_size;
3405
- const int32_t ny = ctx->vision_model.hparams.image_size;
3507
+ const int32_t nx = ctx->model.hparams.image_size;
3508
+ const int32_t ny = ctx->model.hparams.image_size;
3406
3509
  return clip_embd_nbytes_by_img(ctx, nx, ny);
3407
3510
  }
3408
3511
 
@@ -3414,101 +3517,124 @@ size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_w, int img_h
3414
3517
  }
3415
3518
 
3416
3519
  int32_t clip_get_image_size(const struct clip_ctx * ctx) {
3417
- return ctx->vision_model.hparams.image_size;
3520
+ return ctx->model.hparams.image_size;
3418
3521
  }
3419
3522
 
3420
3523
  int32_t clip_get_patch_size(const struct clip_ctx * ctx) {
3421
- return ctx->vision_model.hparams.patch_size;
3524
+ return ctx->model.hparams.patch_size;
3422
3525
  }
3423
3526
 
3424
3527
  int32_t clip_get_hidden_size(const struct clip_ctx * ctx) {
3425
- return ctx->vision_model.hparams.n_embd;
3528
+ return ctx->model.hparams.n_embd;
3426
3529
  }
3427
3530
 
3428
3531
  const char * clip_patch_merge_type(const struct clip_ctx * ctx) {
3429
- return ctx->vision_model.hparams.mm_patch_merge_type == PATCH_MERGE_SPATIAL_UNPAD ? "spatial_unpad" : "flat";
3430
- }
3431
-
3432
- const int32_t * clip_image_grid(const struct clip_ctx * ctx) {
3433
- if (ctx->vision_model.hparams.image_grid_pinpoints.size()) {
3434
- return &ctx->vision_model.hparams.image_grid_pinpoints.front();
3435
- }
3436
- return nullptr;
3437
- }
3438
-
3439
- size_t get_clip_image_grid_size(const struct clip_ctx * ctx) {
3440
- return ctx->vision_model.hparams.image_grid_pinpoints.size();
3532
+ return ctx->model.hparams.mm_patch_merge_type == PATCH_MERGE_SPATIAL_UNPAD ? "spatial_unpad" : "flat";
3441
3533
  }
3442
3534
 
3443
3535
  int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 * img) {
3444
- const auto & params = ctx->vision_model.hparams;
3536
+ const auto & params = ctx->model.hparams;
3445
3537
  const int n_total = clip_n_output_tokens(ctx, img);
3446
- if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type == PROJECTOR_TYPE_QWEN25VL) {
3538
+ if (ctx->proj_type() == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type() == PROJECTOR_TYPE_QWEN25VL) {
3447
3539
  return img->nx / (params.patch_size * 2) + (int)(img->nx % params.patch_size > 0);
3448
3540
  }
3449
3541
  return n_total;
3450
3542
  }
3451
3543
 
3452
3544
  int clip_n_output_tokens_y(const struct clip_ctx * ctx, struct clip_image_f32 * img) {
3453
- const auto & params = ctx->vision_model.hparams;
3454
- if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type == PROJECTOR_TYPE_QWEN25VL) {
3545
+ const auto & params = ctx->model.hparams;
3546
+ if (ctx->proj_type() == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type() == PROJECTOR_TYPE_QWEN25VL) {
3455
3547
  return img->ny / (params.patch_size * 2) + (int)(img->ny % params.patch_size > 0);
3456
3548
  }
3457
3549
  return 1;
3458
3550
  }
3459
3551
 
3460
3552
  int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * img) {
3461
- const auto & params = ctx->vision_model.hparams;
3553
+ const auto & params = ctx->model.hparams;
3462
3554
 
3463
- int n_patches = (params.image_size / params.patch_size) * (params.image_size / params.patch_size);
3464
- int scale_factor = ctx->vision_model.hparams.proj_scale_factor;
3555
+ // only for models using fixed size square images
3556
+ int n_patches_sq = (params.image_size / params.patch_size) * (params.image_size / params.patch_size);
3465
3557
 
3466
- if (ctx->proj_type == PROJECTOR_TYPE_LDP
3467
- || ctx->proj_type == PROJECTOR_TYPE_LDPV2
3468
- || ctx->proj_type == PROJECTOR_TYPE_GLM_EDGE) {
3469
- n_patches /= 4;
3470
- if (ctx->vision_model.mm_glm_tok_boi) {
3471
- n_patches += 2; // for BOI and EOI token embeddings
3472
- }
3473
- } else if (ctx->proj_type == PROJECTOR_TYPE_MINICPMV) {
3474
- if (ctx->minicpmv_version == 2) {
3475
- n_patches = 96;
3476
- }
3477
- else if (ctx->minicpmv_version == 3) {
3478
- n_patches = 64;
3479
- }
3480
- else if (ctx->minicpmv_version == 4) {
3481
- n_patches = 64;
3482
- }
3483
- else {
3484
- LM_GGML_ABORT("Unknown minicpmv version");
3485
- }
3486
- } else if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type == PROJECTOR_TYPE_QWEN25VL) {
3487
- int patch_size = params.patch_size * 2;
3488
- int x_patch = img->nx / patch_size + (int)(img->nx % patch_size > 0);
3489
- int y_patch = img->ny / patch_size + (int)(img->ny % patch_size > 0);
3490
- n_patches = x_patch * y_patch;
3491
- } else if (ctx->proj_type == PROJECTOR_TYPE_GEMMA3) {
3492
- int n_per_side = params.image_size / params.patch_size;
3493
- int n_per_side_2d_pool = n_per_side / params.proj_scale_factor;
3494
- n_patches = n_per_side_2d_pool * n_per_side_2d_pool;
3495
- } else if (ctx->proj_type == PROJECTOR_TYPE_IDEFICS3 || ctx->proj_type == PROJECTOR_TYPE_INTERNVL) {
3496
- // both W and H are divided by proj_scale_factor
3497
- n_patches /= (params.proj_scale_factor * params.proj_scale_factor);
3498
- } else if (ctx->proj_type == PROJECTOR_TYPE_PIXTRAL) {
3499
- int n_merge = params.spatial_merge_size;
3500
- int n_patches_x = img->nx / params.patch_size / (n_merge > 0 ? n_merge : 1);
3501
- int n_patches_y = img->ny / params.patch_size / (n_merge > 0 ? n_merge : 1);
3502
- n_patches = n_patches_y*n_patches_x + n_patches_y - 1; // + one [IMG_BREAK] per row, except the last row
3503
- } else if (ctx->proj_type == PROJECTOR_TYPE_LLAMA4) {
3504
- n_patches /= (scale_factor * scale_factor);
3505
- } else if (ctx->proj_type == PROJECTOR_TYPE_ULTRAVOX) {
3506
- const int proj_stack_factor = ctx->vision_model.hparams.proj_stack_factor;
3507
- const int n_len = CLIP_ALIGN(img->nx, proj_stack_factor);
3508
- n_patches = n_len / proj_stack_factor / 2;
3509
- }
3510
-
3511
- return n_patches;
3558
+ projector_type proj = ctx->proj_type();
3559
+
3560
+ switch (proj) {
3561
+ case PROJECTOR_TYPE_MLP:
3562
+ case PROJECTOR_TYPE_MLP_NORM:
3563
+ {
3564
+ // do nothing
3565
+ } break;
3566
+ case PROJECTOR_TYPE_LDP:
3567
+ case PROJECTOR_TYPE_LDPV2:
3568
+ case PROJECTOR_TYPE_GLM_EDGE:
3569
+ {
3570
+ n_patches_sq /= 4;
3571
+ if (ctx->model.mm_glm_tok_boi) {
3572
+ n_patches_sq += 2; // for BOI and EOI token embeddings
3573
+ }
3574
+ } break;
3575
+ case PROJECTOR_TYPE_MINICPMV:
3576
+ {
3577
+ if (params.minicpmv_version == 2) {
3578
+ n_patches_sq = 96;
3579
+ } else if (params.minicpmv_version == 3) {
3580
+ n_patches_sq = 64;
3581
+ } else if (params.minicpmv_version == 4) {
3582
+ n_patches_sq = 64;
3583
+ } else {
3584
+ LM_GGML_ABORT("Unknown minicpmv version");
3585
+ }
3586
+ } break;
3587
+ case PROJECTOR_TYPE_QWEN2VL:
3588
+ case PROJECTOR_TYPE_QWEN25VL:
3589
+ {
3590
+ // dynamic size
3591
+ int patch_size = params.patch_size * 2;
3592
+ int x_patch = img->nx / patch_size + (int)(img->nx % patch_size > 0);
3593
+ int y_patch = img->ny / patch_size + (int)(img->ny % patch_size > 0);
3594
+ n_patches_sq = x_patch * y_patch;
3595
+ } break;
3596
+ case PROJECTOR_TYPE_GEMMA3:
3597
+ {
3598
+ int n_per_side = params.image_size / params.patch_size;
3599
+ int n_per_side_2d_pool = n_per_side / params.proj_scale_factor;
3600
+ n_patches_sq = n_per_side_2d_pool * n_per_side_2d_pool;
3601
+ } break;
3602
+ case PROJECTOR_TYPE_IDEFICS3:
3603
+ case PROJECTOR_TYPE_INTERNVL:
3604
+ {
3605
+ // both W and H are divided by proj_scale_factor
3606
+ n_patches_sq /= (params.proj_scale_factor * params.proj_scale_factor);
3607
+ } break;
3608
+ case PROJECTOR_TYPE_PIXTRAL:
3609
+ {
3610
+ // dynamic size
3611
+ int n_merge = params.spatial_merge_size;
3612
+ int n_patches_x = img->nx / params.patch_size / (n_merge > 0 ? n_merge : 1);
3613
+ int n_patches_y = img->ny / params.patch_size / (n_merge > 0 ? n_merge : 1);
3614
+ n_patches_sq = n_patches_y * n_patches_x + n_patches_y - 1; // + one [IMG_BREAK] per row, except the last row
3615
+ } break;
3616
+ case PROJECTOR_TYPE_LLAMA4:
3617
+ {
3618
+ int scale_factor = ctx->model.hparams.proj_scale_factor;
3619
+ n_patches_sq /= (scale_factor * scale_factor);
3620
+ } break;
3621
+ case PROJECTOR_TYPE_ULTRAVOX:
3622
+ {
3623
+ const int proj_stack_factor = ctx->model.hparams.proj_stack_factor;
3624
+ const int n_len = CLIP_ALIGN(img->nx, proj_stack_factor);
3625
+ n_patches_sq = n_len / proj_stack_factor / 2;
3626
+ } break;
3627
+ case PROJECTOR_TYPE_QWEN2A:
3628
+ {
3629
+ // divide by 2 because of whisper
3630
+ // another divide by 2 because of nn.AvgPool1d(2, stride=2)
3631
+ n_patches_sq = img->nx / 4;
3632
+ } break;
3633
+ default:
3634
+ LM_GGML_ABORT("unsupported projector type");
3635
+ }
3636
+
3637
+ return n_patches_sq;
3512
3638
  }
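
Back-of-the-envelope evaluations of the token-count switch above for a few projector families. All hparams and input sizes are illustrative assumptions; real values come from the loaded GGUF:

// Each block re-derives one case of the switch above with assumed parameters.
#include <cstdio>

int main() {
    // Gemma3-style: fixed square image, 2D-pooled by proj_scale_factor
    { const int image_size = 896, patch_size = 14, proj_scale_factor = 4;
      const int per_side = image_size / patch_size;          // 64
      const int pooled   = per_side / proj_scale_factor;     // 16
      std::printf("gemma3-like:  %d tokens\n", pooled * pooled); }          // 256

    // Qwen2-VL-style: dynamic size, cells of 2*patch_size, rounded up per axis
    { const int cell = 14 * 2, nx = 1024, ny = 768;
      const int x = nx / cell + (nx % cell > 0);              // 37
      const int y = ny / cell + (ny % cell > 0);              // 28
      std::printf("qwen2vl-like: %d tokens\n", x * y); }                    // 1036

    // Pixtral-style: merged patches plus one [IMG_BREAK] per row except the last
    { const int patch_size = 16, n_merge = 1, nx = 512, ny = 256;
      const int px = nx / patch_size / n_merge;               // 32
      const int py = ny / patch_size / n_merge;               // 16
      std::printf("pixtral-like: %d tokens\n", py * px + py - 1); }         // 527

    // Qwen2-Audio-style: mel frames / 4 (whisper stride 2, then AvgPool1d(2))
    { const int n_frames = 3000;
      std::printf("qwen2a-like:  %d tokens\n", n_frames / 4); }             // 750
    return 0;
}
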
3513
3639
 
3514
3640
  static std::vector<std::vector<std::vector<float>>> get_1d_sincos_pos_embed_from_grid_new(int embed_dim, const std::vector<std::vector<float>> & pos) {
@@ -3623,7 +3749,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
3623
3749
  lm_ggml_backend_sched_alloc_graph(ctx->sched.get(), gf);
3624
3750
 
3625
3751
  // set inputs
3626
- const auto & model = ctx->vision_model;
3752
+ const auto & model = ctx->model;
3627
3753
  const auto & hparams = model.hparams;
3628
3754
 
3629
3755
  const int image_size_width = imgs.entries[0]->nx;
@@ -3713,7 +3839,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
3713
3839
  }
3714
3840
 
3715
3841
  // set input per projector
3716
- switch (ctx->proj_type) {
3842
+ switch (ctx->model.proj_type) {
3717
3843
  case PROJECTOR_TYPE_MINICPMV:
3718
3844
  {
3719
3845
  // inspired from siglip:
@@ -3906,6 +4032,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
3906
4032
  case PROJECTOR_TYPE_GEMMA3:
3907
4033
  case PROJECTOR_TYPE_IDEFICS3:
3908
4034
  case PROJECTOR_TYPE_INTERNVL:
4035
+ case PROJECTOR_TYPE_QWEN2A:
3909
4036
  case PROJECTOR_TYPE_ULTRAVOX:
3910
4037
  {
3911
4038
  // do nothing
@@ -3966,7 +4093,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
3966
4093
  const int n_tokens_out = embeddings->ne[1];
3967
4094
  const int expected_n_tokens_out = clip_n_output_tokens(ctx, imgs.entries[0].get());
3968
4095
  if (n_tokens_out != expected_n_tokens_out) {
3969
- LOG_ERR("%s: expected %d tokens, got %d\n", __func__, expected_n_tokens_out, n_tokens_out);
4096
+ LOG_ERR("%s: expected output %d tokens, got %d\n", __func__, expected_n_tokens_out, n_tokens_out);
3970
4097
  LM_GGML_ABORT("Invalid number of output tokens");
3971
4098
  }
3972
4099
 
@@ -3977,74 +4104,83 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
3977
4104
  }
3978
4105
 
3979
4106
  int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
3980
- switch (ctx->proj_type) {
4107
+ const auto & hparams = ctx->model.hparams;
4108
+ switch (ctx->model.proj_type) {
3981
4109
  case PROJECTOR_TYPE_LDP:
3982
- return ctx->vision_model.mm_model_block_1_block_2_1_b->ne[0];
4110
+ return ctx->model.mm_model_block_1_block_2_1_b->ne[0];
3983
4111
  case PROJECTOR_TYPE_LDPV2:
3984
- return ctx->vision_model.mm_model_peg_0_b->ne[0];
4112
+ return ctx->model.mm_model_peg_0_b->ne[0];
3985
4113
  case PROJECTOR_TYPE_MLP:
3986
4114
  case PROJECTOR_TYPE_PIXTRAL:
3987
- return ctx->vision_model.mm_2_w->ne[1];
4115
+ return ctx->model.mm_2_w->ne[1];
3988
4116
  case PROJECTOR_TYPE_MLP_NORM:
3989
- return ctx->vision_model.mm_3_b->ne[0];
4117
+ return ctx->model.mm_3_b->ne[0];
3990
4118
  case PROJECTOR_TYPE_MINICPMV:
3991
- if (ctx->minicpmv_version == 2) {
4119
+ if (hparams.minicpmv_version == 2) {
3992
4120
  return 4096;
3993
- } else if (ctx->minicpmv_version == 3) {
4121
+ } else if (hparams.minicpmv_version == 3) {
3994
4122
  return 3584;
3995
- } else if (ctx->minicpmv_version == 4) {
4123
+ } else if (hparams.minicpmv_version == 4) {
3996
4124
  return 3584;
3997
4125
  }
3998
4126
  LM_GGML_ABORT("Unknown minicpmv version");
3999
4127
  case PROJECTOR_TYPE_GLM_EDGE:
4000
- return ctx->vision_model.mm_model_mlp_3_w->ne[1];
4128
+ return ctx->model.mm_model_mlp_3_w->ne[1];
4001
4129
  case PROJECTOR_TYPE_QWEN2VL:
4002
4130
  case PROJECTOR_TYPE_QWEN25VL:
4003
- return ctx->vision_model.mm_1_b->ne[0];
4131
+ return ctx->model.mm_1_b->ne[0];
4004
4132
  case PROJECTOR_TYPE_GEMMA3:
4005
- return ctx->vision_model.mm_input_proj_w->ne[0];
4133
+ return ctx->model.mm_input_proj_w->ne[0];
4006
4134
  case PROJECTOR_TYPE_IDEFICS3:
4007
- return ctx->vision_model.projection->ne[1];
4135
+ return ctx->model.projection->ne[1];
4008
4136
  case PROJECTOR_TYPE_ULTRAVOX:
4009
- return ctx->vision_model.mm_2_w->ne[1];
4137
+ return ctx->model.mm_2_w->ne[1];
4010
4138
  case PROJECTOR_TYPE_INTERNVL:
4011
- return ctx->vision_model.mm_3_w->ne[1];
4139
+ return ctx->model.mm_3_w->ne[1];
4012
4140
  case PROJECTOR_TYPE_LLAMA4:
4013
- return ctx->vision_model.mm_model_proj->ne[1];
4141
+ return ctx->model.mm_model_proj->ne[1];
4142
+ case PROJECTOR_TYPE_QWEN2A:
4143
+ return ctx->model.mm_fc_w->ne[1];
4014
4144
  default:
4015
4145
  LM_GGML_ABORT("Unknown projector type");
4016
4146
  }
4017
4147
  }
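
A note on the ne[] lookups above: for a ggml matmul weight the output dimension sits in ne[1], and a bias vector's length is ne[0], so each case reads off the LLM-side width of the projector's last layer. The concrete shape below (1152 -> 4096) is an illustrative assumption:

// fake_tensor stands in for lm_ggml_tensor just to show which ne[] index is read.
#include <cstdio>

struct fake_tensor { long ne[4]; };

int main() {
    fake_tensor mm_2_w { {1152, 4096, 1, 1} };   // clip hidden -> LLM embedding
    fake_tensor mm_1_b { {4096, 1, 1, 1} };      // bias of the same output size
    std::printf("n_mmproj_embd via weight: %ld\n", mm_2_w.ne[1]); // 4096
    std::printf("n_mmproj_embd via bias:   %ld\n", mm_1_b.ne[0]); // 4096
    return 0;
}
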
4018
4148
 
4019
4149
  int clip_is_minicpmv(const struct clip_ctx * ctx) {
4020
- if (ctx->proj_type == PROJECTOR_TYPE_MINICPMV) {
4021
- return ctx->minicpmv_version;
4150
+ if (ctx->proj_type() == PROJECTOR_TYPE_MINICPMV) {
4151
+ return ctx->model.hparams.minicpmv_version;
4022
4152
  }
4023
4153
  return 0;
4024
4154
  }
4025
4155
 
4026
4156
  bool clip_is_glm(const struct clip_ctx * ctx) {
4027
- return ctx->proj_type == PROJECTOR_TYPE_GLM_EDGE;
4157
+ return ctx->proj_type() == PROJECTOR_TYPE_GLM_EDGE;
4028
4158
  }
4029
4159
 
4030
4160
  bool clip_is_qwen2vl(const struct clip_ctx * ctx) {
4031
- return ctx->proj_type == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type == PROJECTOR_TYPE_QWEN25VL;
4161
+ return ctx->proj_type() == PROJECTOR_TYPE_QWEN2VL
4162
+ || ctx->proj_type() == PROJECTOR_TYPE_QWEN25VL;
4032
4163
  }
4033
4164
 
4034
4165
  bool clip_is_llava(const struct clip_ctx * ctx) {
4035
- return ctx->has_llava_projector;
4166
+ return ctx->model.hparams.has_llava_projector;
4036
4167
  }
4037
4168
 
4038
4169
  bool clip_is_gemma3(const struct clip_ctx * ctx) {
4039
- return ctx->proj_type == PROJECTOR_TYPE_GEMMA3;
4170
+ return ctx->proj_type() == PROJECTOR_TYPE_GEMMA3;
4040
4171
  }
4041
4172
 
4042
4173
  bool clip_has_vision_encoder(const struct clip_ctx * ctx) {
4043
- return ctx->vision_model.hparams.has_vision;
4174
+ return ctx->model.modality == CLIP_MODALITY_VISION;
4044
4175
  }
4045
4176
 
4046
4177
  bool clip_has_audio_encoder(const struct clip_ctx * ctx) {
4047
- return ctx->vision_model.hparams.has_audio;
4178
+ return ctx->model.modality == CLIP_MODALITY_AUDIO;
4179
+ }
4180
+
4181
+ bool clip_has_whisper_encoder(const struct clip_ctx * ctx) {
4182
+ return ctx->proj_type() == PROJECTOR_TYPE_ULTRAVOX
4183
+ || ctx->proj_type() == PROJECTOR_TYPE_QWEN2A;
4048
4184
  }
4049
4185
 
4050
4186
  bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec) {
@@ -4065,7 +4201,7 @@ bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img,
4065
4201
  //
4066
4202
 
4067
4203
  projector_type clip_get_projector_type(const struct clip_ctx * ctx) {
4068
- return ctx->proj_type;
4204
+ return ctx->proj_type();
4069
4205
  }
4070
4206
 
4071
4207
  void clip_image_f32_batch_add_mel(struct clip_image_f32_batch * batch, int n_mel, int n_frames, float * mel) {