cui-llama.rn 1.6.0 → 1.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (285)
  1. package/README.md +35 -7
  2. package/android/src/main/CMakeLists.txt +22 -11
  3. package/android/src/main/java/com/rnllama/LlamaContext.java +42 -6
  4. package/android/src/main/java/com/rnllama/RNLlama.java +139 -4
  5. package/android/src/main/jni.cpp +173 -18
  6. package/android/src/main/jniLibs/arm64-v8a/librnllama.so +0 -0
  7. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8.so +0 -0
  8. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2.so +0 -0
  9. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod.so +0 -0
  10. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod_i8mm.so +0 -0
  11. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_i8mm.so +0 -0
  12. package/android/src/main/jniLibs/x86_64/librnllama.so +0 -0
  13. package/android/src/main/jniLibs/x86_64/librnllama_x86_64.so +0 -0
  14. package/android/src/newarch/java/com/rnllama/RNLlamaModule.java +24 -4
  15. package/android/src/oldarch/java/com/rnllama/RNLlamaModule.java +22 -2
  16. package/cpp/LICENSE +21 -0
  17. package/cpp/chat.cpp +129 -107
  18. package/cpp/chat.h +2 -0
  19. package/cpp/common.cpp +58 -78
  20. package/cpp/common.h +29 -21
  21. package/cpp/ggml-alloc.c +4 -1
  22. package/cpp/ggml-backend.cpp +9 -5
  23. package/cpp/ggml-backend.h +4 -4
  24. package/cpp/ggml-cpp.h +1 -1
  25. package/cpp/ggml-cpu/amx/amx.cpp +221 -0
  26. package/cpp/ggml-cpu/amx/amx.h +8 -0
  27. package/cpp/ggml-cpu/amx/common.h +91 -0
  28. package/cpp/ggml-cpu/amx/mmq.cpp +2511 -0
  29. package/cpp/ggml-cpu/amx/mmq.h +10 -0
  30. package/{ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers → cpp/ggml-cpu}/binary-ops.h +1 -1
  31. package/cpp/ggml-cpu/common.h +72 -0
  32. package/cpp/{ggml-cpu-aarch64.cpp → ggml-cpu/ggml-cpu-aarch64.cpp} +809 -103
  33. package/cpp/{ggml-cpu-quants.c → ggml-cpu/ggml-cpu-quants.c} +306 -6
  34. package/cpp/{ggml-cpu.c → ggml-cpu/ggml-cpu.c} +114 -55
  35. package/cpp/{ggml-cpu.cpp → ggml-cpu/ggml-cpu.cpp} +32 -16
  36. package/cpp/{ops.cpp → ggml-cpu/ops.cpp} +353 -173
  37. package/{ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers → cpp/ggml-cpu}/ops.h +2 -20
  38. package/cpp/{sgemm.cpp → ggml-cpu/sgemm.cpp} +501 -0
  39. package/{ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers → cpp/ggml-cpu}/simd-mappings.h +7 -3
  40. package/{ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers → cpp/ggml-cpu}/unary-ops.h +1 -1
  41. package/cpp/{vec.cpp → ggml-cpu/vec.cpp} +0 -6
  42. package/{ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers → cpp/ggml-cpu}/vec.h +16 -0
  43. package/cpp/ggml-cpu.h +5 -0
  44. package/cpp/ggml-impl.h +16 -9
  45. package/cpp/ggml-llama-sim.metallib +0 -0
  46. package/cpp/ggml-llama.metallib +0 -0
  47. package/cpp/ggml-metal-impl.h +36 -11
  48. package/cpp/ggml-metal.m +810 -176
  49. package/cpp/ggml-opt.cpp +373 -190
  50. package/cpp/ggml-opt.h +49 -28
  51. package/cpp/ggml-quants.c +0 -6
  52. package/cpp/ggml.c +227 -282
  53. package/cpp/ggml.h +82 -101
  54. package/cpp/gguf.cpp +33 -33
  55. package/cpp/json-schema-to-grammar.cpp +3 -0
  56. package/cpp/llama-adapter.cpp +6 -0
  57. package/cpp/llama-arch.cpp +49 -17
  58. package/cpp/llama-arch.h +9 -0
  59. package/cpp/llama-batch.cpp +8 -2
  60. package/cpp/llama-batch.h +2 -1
  61. package/cpp/llama-chat.cpp +39 -16
  62. package/cpp/llama-chat.h +4 -2
  63. package/cpp/llama-context.cpp +440 -611
  64. package/cpp/llama-context.h +44 -33
  65. package/cpp/llama-cparams.h +1 -0
  66. package/cpp/llama-graph.cpp +214 -291
  67. package/cpp/llama-graph.h +69 -21
  68. package/cpp/llama-hparams.cpp +17 -1
  69. package/cpp/llama-hparams.h +39 -5
  70. package/cpp/llama-kv-cache.cpp +2067 -620
  71. package/cpp/llama-kv-cache.h +410 -108
  72. package/cpp/llama-memory.h +12 -1
  73. package/cpp/llama-model-loader.cpp +24 -15
  74. package/cpp/llama-model-saver.cpp +281 -0
  75. package/cpp/llama-model-saver.h +37 -0
  76. package/cpp/llama-model.cpp +1089 -359
  77. package/cpp/llama-model.h +19 -3
  78. package/cpp/llama-sampling.cpp +20 -7
  79. package/cpp/llama-vocab.cpp +54 -9
  80. package/cpp/llama-vocab.h +6 -0
  81. package/cpp/llama.cpp +14 -0
  82. package/cpp/llama.h +86 -142
  83. package/cpp/minja/chat-template.hpp +9 -5
  84. package/cpp/minja/minja.hpp +69 -36
  85. package/cpp/rn-llama.cpp +602 -190
  86. package/cpp/rn-llama.h +34 -8
  87. package/cpp/sampling.cpp +57 -50
  88. package/cpp/tools/mtmd/clip-impl.h +462 -0
  89. package/cpp/tools/mtmd/clip.cpp +4024 -0
  90. package/cpp/tools/mtmd/clip.h +101 -0
  91. package/cpp/tools/mtmd/miniaudio.h +93468 -0
  92. package/cpp/tools/mtmd/mtmd-audio.cpp +855 -0
  93. package/cpp/tools/mtmd/mtmd-audio.h +62 -0
  94. package/cpp/tools/mtmd/mtmd-helper.cpp +297 -0
  95. package/cpp/tools/mtmd/mtmd.cpp +942 -0
  96. package/cpp/tools/mtmd/mtmd.h +362 -0
  97. package/cpp/tools/mtmd/stb_image.h +7988 -0
  98. package/ios/CMakeLists.txt +20 -10
  99. package/ios/RNLlama.h +6 -0
  100. package/ios/RNLlama.mm +82 -3
  101. package/ios/RNLlamaContext.h +5 -1
  102. package/ios/RNLlamaContext.mm +131 -38
  103. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/chat.h +2 -0
  104. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/common.h +29 -21
  105. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-backend.h +4 -4
  106. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpp.h +1 -1
  107. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu.h +5 -0
  108. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-impl.h +16 -9
  109. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-metal-impl.h +36 -11
  110. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-opt.h +49 -28
  111. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml.h +82 -101
  112. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-arch.h +9 -0
  113. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-batch.h +2 -1
  114. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-chat.h +4 -2
  115. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-context.h +44 -33
  116. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-cparams.h +1 -0
  117. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-graph.h +69 -21
  118. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-hparams.h +39 -5
  119. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-kv-cache.h +410 -108
  120. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-memory.h +12 -1
  121. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-model-saver.h +37 -0
  122. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-model.h +19 -3
  123. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-vocab.h +6 -0
  124. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama.h +86 -142
  125. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/minja/chat-template.hpp +9 -5
  126. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/minja/minja.hpp +69 -36
  127. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/rn-llama.h +34 -8
  128. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Info.plist +0 -0
  129. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/ggml-llama.metallib +0 -0
  130. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/rnllama +0 -0
  131. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/chat.h +2 -0
  132. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/common.h +29 -21
  133. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-backend.h +4 -4
  134. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpp.h +1 -1
  135. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu.h +5 -0
  136. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-impl.h +16 -9
  137. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-metal-impl.h +36 -11
  138. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-opt.h +49 -28
  139. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml.h +82 -101
  140. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-arch.h +9 -0
  141. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-batch.h +2 -1
  142. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-chat.h +4 -2
  143. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-context.h +44 -33
  144. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-cparams.h +1 -0
  145. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-graph.h +69 -21
  146. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-hparams.h +39 -5
  147. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache.h +410 -108
  148. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory.h +12 -1
  149. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model-saver.h +37 -0
  150. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model.h +19 -3
  151. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-vocab.h +6 -0
  152. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama.h +86 -142
  153. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/minja/chat-template.hpp +9 -5
  154. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/minja/minja.hpp +69 -36
  155. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/rn-llama.h +34 -8
  156. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Info.plist +0 -0
  157. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/_CodeSignature/CodeResources +1 -1
  158. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/ggml-llama-sim.metallib +0 -0
  159. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/rnllama +0 -0
  160. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/chat.h +2 -0
  161. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/common.h +29 -21
  162. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-backend.h +4 -4
  163. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpp.h +1 -1
  164. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu.h +5 -0
  165. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-impl.h +16 -9
  166. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-metal-impl.h +36 -11
  167. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-opt.h +49 -28
  168. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml.h +82 -101
  169. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-arch.h +9 -0
  170. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-batch.h +2 -1
  171. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-chat.h +4 -2
  172. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-context.h +44 -33
  173. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-cparams.h +1 -0
  174. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-graph.h +69 -21
  175. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-hparams.h +39 -5
  176. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-kv-cache.h +410 -108
  177. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-memory.h +12 -1
  178. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-model-saver.h +37 -0
  179. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-model.h +19 -3
  180. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-vocab.h +6 -0
  181. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama.h +86 -142
  182. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/minja/chat-template.hpp +9 -5
  183. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/minja/minja.hpp +69 -36
  184. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/rn-llama.h +34 -8
  185. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Info.plist +0 -0
  186. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/ggml-llama.metallib +0 -0
  187. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/rnllama +0 -0
  188. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/chat.h +2 -0
  189. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/common.h +29 -21
  190. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-backend.h +4 -4
  191. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpp.h +1 -1
  192. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu.h +5 -0
  193. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-impl.h +16 -9
  194. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-metal-impl.h +36 -11
  195. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-opt.h +49 -28
  196. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml.h +82 -101
  197. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-arch.h +9 -0
  198. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-batch.h +2 -1
  199. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-chat.h +4 -2
  200. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-context.h +44 -33
  201. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-cparams.h +1 -0
  202. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-graph.h +69 -21
  203. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-hparams.h +39 -5
  204. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache.h +410 -108
  205. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory.h +12 -1
  206. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model-saver.h +37 -0
  207. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model.h +19 -3
  208. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-vocab.h +6 -0
  209. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama.h +86 -142
  210. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/minja/chat-template.hpp +9 -5
  211. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/minja/minja.hpp +69 -36
  212. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/rn-llama.h +34 -8
  213. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Info.plist +0 -0
  214. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/_CodeSignature/CodeResources +1 -1
  215. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/ggml-llama-sim.metallib +0 -0
  216. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/rnllama +0 -0
  217. package/jest/mock.js +33 -7
  218. package/lib/commonjs/NativeRNLlama.js.map +1 -1
  219. package/lib/commonjs/index.js +153 -21
  220. package/lib/commonjs/index.js.map +1 -1
  221. package/lib/module/NativeRNLlama.js.map +1 -1
  222. package/lib/module/index.js +152 -20
  223. package/lib/module/index.js.map +1 -1
  224. package/lib/typescript/NativeRNLlama.d.ts +54 -4
  225. package/lib/typescript/NativeRNLlama.d.ts.map +1 -1
  226. package/lib/typescript/index.d.ts +72 -6
  227. package/lib/typescript/index.d.ts.map +1 -1
  228. package/package.json +1 -1
  229. package/src/NativeRNLlama.ts +72 -4
  230. package/src/index.ts +212 -38
  231. package/cpp/binary-ops.h +0 -16
  232. package/cpp/ops.h +0 -128
  233. package/cpp/simd-mappings.h +0 -888
  234. package/cpp/unary-ops.h +0 -28
  235. package/cpp/vec.h +0 -802
  236. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/binary-ops.h +0 -16
  237. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu-aarch64.h +0 -8
  238. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu-impl.h +0 -512
  239. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu-quants.h +0 -63
  240. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu-traits.h +0 -38
  241. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ops.h +0 -128
  242. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/sgemm.h +0 -14
  243. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/simd-mappings.h +0 -888
  244. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/vec.h +0 -802
  245. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-aarch64.h +0 -8
  246. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-impl.h +0 -512
  247. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-quants.h +0 -63
  248. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-traits.h +0 -38
  249. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/sgemm.h +0 -14
  250. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/unary-ops.h +0 -28
  251. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/vec.h +0 -802
  252. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/binary-ops.h +0 -16
  253. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu-aarch64.h +0 -8
  254. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu-impl.h +0 -512
  255. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu-quants.h +0 -63
  256. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu-traits.h +0 -38
  257. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ops.h +0 -128
  258. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/sgemm.h +0 -14
  259. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/simd-mappings.h +0 -888
  260. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/unary-ops.h +0 -28
  261. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/binary-ops.h +0 -16
  262. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-aarch64.h +0 -8
  263. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-impl.h +0 -512
  264. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-quants.h +0 -63
  265. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-traits.h +0 -38
  266. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ops.h +0 -128
  267. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/sgemm.h +0 -14
  268. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/simd-mappings.h +0 -888
  269. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/unary-ops.h +0 -28
  270. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/vec.h +0 -802
  271. package/lib/commonjs/chat.js +0 -37
  272. package/lib/commonjs/chat.js.map +0 -1
  273. package/lib/module/chat.js +0 -33
  274. package/lib/module/chat.js.map +0 -1
  275. package/lib/typescript/chat.d.ts +0 -10
  276. package/lib/typescript/chat.d.ts.map +0 -1
  277. package/src/chat.ts +0 -44
  278. /package/cpp/{binary-ops.cpp → ggml-cpu/binary-ops.cpp} +0 -0
  279. /package/cpp/{ggml-cpu-aarch64.h → ggml-cpu/ggml-cpu-aarch64.h} +0 -0
  280. /package/cpp/{ggml-cpu-impl.h → ggml-cpu/ggml-cpu-impl.h} +0 -0
  281. /package/cpp/{ggml-cpu-quants.h → ggml-cpu/ggml-cpu-quants.h} +0 -0
  282. /package/cpp/{ggml-cpu-traits.cpp → ggml-cpu/ggml-cpu-traits.cpp} +0 -0
  283. /package/cpp/{ggml-cpu-traits.h → ggml-cpu/ggml-cpu-traits.h} +0 -0
  284. /package/cpp/{sgemm.h → ggml-cpu/sgemm.h} +0 -0
  285. /package/cpp/{unary-ops.cpp → ggml-cpu/unary-ops.cpp} +0 -0
package/cpp/tools/mtmd/clip.cpp
@@ -0,0 +1,4024 @@
1
+ // NOTE: This is modified from clip.cpp only for LLaVA,
2
+ // so there might be still unnecessary artifacts hanging around
3
+ // I'll gradually clean and extend it
4
+ // Note: Even when using identical normalized image inputs (see normalize_image_u8_to_f32()) we have a significant difference in resulting embeddings compared to pytorch
5
+ #include "clip.h"
6
+ #include "clip-impl.h"
7
+ #include "ggml.h"
8
+ #include "ggml-cpp.h"
9
+ #include "ggml-cpu.h"
10
+ #include "ggml-alloc.h"
11
+ #include "ggml-backend.h"
12
+ #include "gguf.h"
13
+
14
+ #define STB_IMAGE_IMPLEMENTATION
15
+ #include "stb_image.h"
16
+
17
+ #include <cassert>
18
+ #include <cmath>
19
+ #include <cstdlib>
20
+ #include <cstring>
21
+ #include <fstream>
22
+ #include <map>
23
+ #include <regex>
24
+ #include <stdexcept>
25
+ #include <unordered_set>
26
+ #include <vector>
27
+ #include <sstream>
28
+ #include <cinttypes>
29
+ #include <limits>
30
+ #include <array>
31
+ #include <numeric>
32
+ #include <functional>
33
+
34
+ struct clip_logger_state g_logger_state = {LM_GGML_LOG_LEVEL_CONT, clip_log_callback_default, NULL};
35
+
36
+ enum ffn_op_type {
37
+ FFN_GELU,
38
+ FFN_GELU_ERF,
39
+ FFN_SILU,
40
+ FFN_GELU_QUICK,
41
+ };
42
+
43
+ enum norm_type {
44
+ NORM_TYPE_NORMAL,
45
+ NORM_TYPE_RMS,
46
+ };
47
+
48
+ //#define CLIP_DEBUG_FUNCTIONS
49
+
50
+ #ifdef CLIP_DEBUG_FUNCTIONS
51
+ static void clip_image_write_image_to_ppm(const clip_image_u8& img, const std::string& filename) {
52
+ std::ofstream file(filename, std::ios::binary);
53
+ if (!file.is_open()) {
54
+ LOG_ERR("Failed to open file for writing: %s\n", filename.c_str());
55
+ return;
56
+ }
57
+
58
+ // PPM header: P6 format, width, height, and max color value
59
+ file << "P6\n" << img.nx << " " << img.ny << "\n255\n";
60
+
61
+ // Write pixel data
62
+ for (size_t i = 0; i < img.buf.size(); i += 3) {
63
+ // PPM expects binary data in RGB format, which matches our image buffer
64
+ file.write(reinterpret_cast<const char*>(&img.buf[i]), 3);
65
+ }
66
+
67
+ file.close();
68
+ }
69
+
70
+ static void clip_image_save_to_bmp(const clip_image_u8& img, const std::string& filename) {
71
+ std::ofstream file(filename, std::ios::binary);
72
+ if (!file.is_open()) {
73
+ LOG_ERR("Failed to open file for writing: %s\n", filename.c_str());
74
+ return;
75
+ }
76
+
77
+ int fileSize = 54 + 3 * img.nx * img.ny; // File header + info header + pixel data
78
+ int bytesPerPixel = 3;
79
+ int widthInBytes = img.nx * bytesPerPixel;
80
+ int paddingAmount = (4 - (widthInBytes % 4)) % 4;
81
+ int stride = widthInBytes + paddingAmount;
82
+
83
+ // Bitmap file header
84
+ unsigned char fileHeader[14] = {
85
+ 'B','M', // Signature
86
+ 0,0,0,0, // Image file size in bytes
87
+ 0,0,0,0, // Reserved
88
+ 54,0,0,0 // Start of pixel array
89
+ };
90
+
91
+ // Total file size
92
+ fileSize = 54 + (stride * img.ny);
93
+ fileHeader[2] = (unsigned char)(fileSize);
94
+ fileHeader[3] = (unsigned char)(fileSize >> 8);
95
+ fileHeader[4] = (unsigned char)(fileSize >> 16);
96
+ fileHeader[5] = (unsigned char)(fileSize >> 24);
97
+
98
+ // Bitmap information header (BITMAPINFOHEADER)
99
+ unsigned char infoHeader[40] = {
100
+ 40,0,0,0, // Size of this header (40 bytes)
101
+ 0,0,0,0, // Image width
102
+ 0,0,0,0, // Image height
103
+ 1,0, // Number of color planes
104
+ 24,0, // Bits per pixel
105
+ 0,0,0,0, // No compression
106
+ 0,0,0,0, // Image size (can be 0 for no compression)
107
+ 0,0,0,0, // X pixels per meter (not specified)
108
+ 0,0,0,0, // Y pixels per meter (not specified)
109
+ 0,0,0,0, // Total colors (color table not used)
110
+ 0,0,0,0 // Important colors (all are important)
111
+ };
112
+
113
+ // Width and height in the information header
114
+ infoHeader[4] = (unsigned char)(img.nx);
115
+ infoHeader[5] = (unsigned char)(img.nx >> 8);
116
+ infoHeader[6] = (unsigned char)(img.nx >> 16);
117
+ infoHeader[7] = (unsigned char)(img.nx >> 24);
118
+ infoHeader[8] = (unsigned char)(img.ny);
119
+ infoHeader[9] = (unsigned char)(img.ny >> 8);
120
+ infoHeader[10] = (unsigned char)(img.ny >> 16);
121
+ infoHeader[11] = (unsigned char)(img.ny >> 24);
122
+
123
+ // Write file headers
124
+ file.write(reinterpret_cast<char*>(fileHeader), sizeof(fileHeader));
125
+ file.write(reinterpret_cast<char*>(infoHeader), sizeof(infoHeader));
126
+
127
+ // Pixel data
128
+ std::vector<unsigned char> padding(3, 0); // Max padding size to be added to each row
129
+ for (int y = img.ny - 1; y >= 0; --y) { // BMP files are stored bottom-to-top
130
+ for (int x = 0; x < img.nx; ++x) {
131
+ // Each pixel
132
+ size_t pixelIndex = (y * img.nx + x) * 3;
133
+ unsigned char pixel[3] = {
134
+ img.buf[pixelIndex + 2], // BMP stores pixels in BGR format
135
+ img.buf[pixelIndex + 1],
136
+ img.buf[pixelIndex]
137
+ };
138
+ file.write(reinterpret_cast<char*>(pixel), 3);
139
+ }
140
+ // Write padding for the row
141
+ file.write(reinterpret_cast<char*>(padding.data()), paddingAmount);
142
+ }
143
+
144
+ file.close();
145
+ }
146
+
147
+ // debug function to convert f32 to u8
148
+ static void clip_image_convert_f32_to_u8(const clip_image_f32& src, clip_image_u8& dst) {
149
+ dst.nx = src.nx;
150
+ dst.ny = src.ny;
151
+ dst.buf.resize(3 * src.nx * src.ny);
152
+ for (size_t i = 0; i < src.buf.size(); ++i) {
153
+ dst.buf[i] = static_cast<uint8_t>(std::min(std::max(int(src.buf[i] * 255.0f), 0), 255));
154
+ }
155
+ }
156
+ #endif
157
+
158
+
159
+ //
160
+ // clip layers
161
+ //
162
+
163
+ enum patch_merge_type {
164
+ PATCH_MERGE_FLAT,
165
+ PATCH_MERGE_SPATIAL_UNPAD,
166
+ };
167
+
168
+ struct clip_hparams {
169
+ bool has_vision = false;
170
+ bool has_audio = false;
171
+
172
+ int32_t image_size;
173
+ int32_t patch_size;
174
+ int32_t n_embd;
175
+ int32_t n_ff;
176
+ int32_t projection_dim;
177
+ int32_t n_head;
178
+ int32_t n_layer;
179
+ int32_t proj_scale_factor = 0; // idefics3
180
+
181
+ // for models using dynamic image size, we need to have a smaller image size to warmup
182
+ // otherwise, user will get OOM everytime they load the model
183
+ int32_t warmup_image_size = 0;
184
+
185
+ ffn_op_type ffn_op = FFN_GELU;
186
+
187
+ patch_merge_type mm_patch_merge_type = PATCH_MERGE_FLAT;
188
+
189
+ float eps = 1e-6;
190
+ float rope_theta = 0.0;
191
+
192
+ std::vector<int32_t> image_grid_pinpoints;
193
+ int32_t image_crop_resolution;
194
+ std::unordered_set<int32_t> vision_feature_layer;
195
+ int32_t attn_window_size = 0;
196
+ int32_t n_wa_pattern = 0;
197
+ int32_t spatial_merge_size = 0;
198
+
199
+ // audio
200
+ int32_t n_mel_bins = 0; // whisper preprocessor
201
+ int32_t proj_stack_factor = 0; // ultravox
202
+ };
203
+
204
+ struct clip_layer {
205
+ // attention
206
+ lm_ggml_tensor * k_w = nullptr;
207
+ lm_ggml_tensor * k_b = nullptr;
208
+ lm_ggml_tensor * q_w = nullptr;
209
+ lm_ggml_tensor * q_b = nullptr;
210
+ lm_ggml_tensor * v_w = nullptr;
211
+ lm_ggml_tensor * v_b = nullptr;
212
+
213
+ lm_ggml_tensor * o_w = nullptr;
214
+ lm_ggml_tensor * o_b = nullptr;
215
+
216
+ lm_ggml_tensor * k_norm = nullptr;
217
+ lm_ggml_tensor * q_norm = nullptr;
218
+
219
+ // layernorm 1
220
+ lm_ggml_tensor * ln_1_w = nullptr;
221
+ lm_ggml_tensor * ln_1_b = nullptr;
222
+
223
+ lm_ggml_tensor * ff_up_w = nullptr;
224
+ lm_ggml_tensor * ff_up_b = nullptr;
225
+ lm_ggml_tensor * ff_gate_w = nullptr;
226
+ lm_ggml_tensor * ff_gate_b = nullptr;
227
+ lm_ggml_tensor * ff_down_w = nullptr;
228
+ lm_ggml_tensor * ff_down_b = nullptr;
229
+
230
+ // layernorm 2
231
+ lm_ggml_tensor * ln_2_w = nullptr;
232
+ lm_ggml_tensor * ln_2_b = nullptr;
233
+
234
+ // layer scale (no bias)
235
+ lm_ggml_tensor * ls_1_w = nullptr;
236
+ lm_ggml_tensor * ls_2_w = nullptr;
237
+ };
238
+
239
+ struct clip_vision_model {
240
+ struct clip_hparams hparams;
241
+
242
+ // embeddings
243
+ lm_ggml_tensor * class_embedding = nullptr;
244
+ lm_ggml_tensor * patch_embeddings_0 = nullptr;
245
+ lm_ggml_tensor * patch_embeddings_1 = nullptr; // second Conv2D kernel when we decouple Conv3D along temproal dimension (Qwen2VL)
246
+ lm_ggml_tensor * patch_bias = nullptr;
247
+ lm_ggml_tensor * position_embeddings = nullptr;
248
+
249
+ lm_ggml_tensor * pre_ln_w = nullptr;
250
+ lm_ggml_tensor * pre_ln_b = nullptr;
251
+
252
+ std::vector<clip_layer> layers;
253
+
254
+ lm_ggml_tensor * post_ln_w;
255
+ lm_ggml_tensor * post_ln_b;
256
+
257
+ lm_ggml_tensor * projection;
258
+
259
+ // LLaVA projection
260
+ lm_ggml_tensor * mm_input_norm_w = nullptr;
261
+ lm_ggml_tensor * mm_0_w = nullptr;
262
+ lm_ggml_tensor * mm_0_b = nullptr;
263
+ lm_ggml_tensor * mm_2_w = nullptr;
264
+ lm_ggml_tensor * mm_2_b = nullptr;
265
+
266
+ lm_ggml_tensor * image_newline = nullptr;
267
+
268
+ // Yi type models with mlp+normalization projection
269
+ lm_ggml_tensor * mm_1_w = nullptr; // Yi type models have 0, 1, 3, 4
270
+ lm_ggml_tensor * mm_1_b = nullptr;
271
+ lm_ggml_tensor * mm_3_w = nullptr;
272
+ lm_ggml_tensor * mm_3_b = nullptr;
273
+ lm_ggml_tensor * mm_4_w = nullptr;
274
+ lm_ggml_tensor * mm_4_b = nullptr;
275
+
276
+ // GLMV-Edge projection
277
+ lm_ggml_tensor * mm_model_adapter_conv_w = nullptr;
278
+ lm_ggml_tensor * mm_model_adapter_conv_b = nullptr;
279
+ lm_ggml_tensor * mm_glm_tok_boi = nullptr;
280
+ lm_ggml_tensor * mm_glm_tok_eoi = nullptr;
281
+
282
+ // MobileVLM projection
283
+ lm_ggml_tensor * mm_model_mlp_1_w = nullptr;
284
+ lm_ggml_tensor * mm_model_mlp_1_b = nullptr;
285
+ lm_ggml_tensor * mm_model_mlp_3_w = nullptr;
286
+ lm_ggml_tensor * mm_model_mlp_3_b = nullptr;
287
+ lm_ggml_tensor * mm_model_block_1_block_0_0_w = nullptr;
288
+ lm_ggml_tensor * mm_model_block_1_block_0_1_w = nullptr;
289
+ lm_ggml_tensor * mm_model_block_1_block_0_1_b = nullptr;
290
+ lm_ggml_tensor * mm_model_block_1_block_1_fc1_w = nullptr;
291
+ lm_ggml_tensor * mm_model_block_1_block_1_fc1_b = nullptr;
292
+ lm_ggml_tensor * mm_model_block_1_block_1_fc2_w = nullptr;
293
+ lm_ggml_tensor * mm_model_block_1_block_1_fc2_b = nullptr;
294
+ lm_ggml_tensor * mm_model_block_1_block_2_0_w = nullptr;
295
+ lm_ggml_tensor * mm_model_block_1_block_2_1_w = nullptr;
296
+ lm_ggml_tensor * mm_model_block_1_block_2_1_b = nullptr;
297
+ lm_ggml_tensor * mm_model_block_2_block_0_0_w = nullptr;
298
+ lm_ggml_tensor * mm_model_block_2_block_0_1_w = nullptr;
299
+ lm_ggml_tensor * mm_model_block_2_block_0_1_b = nullptr;
300
+ lm_ggml_tensor * mm_model_block_2_block_1_fc1_w = nullptr;
301
+ lm_ggml_tensor * mm_model_block_2_block_1_fc1_b = nullptr;
302
+ lm_ggml_tensor * mm_model_block_2_block_1_fc2_w = nullptr;
303
+ lm_ggml_tensor * mm_model_block_2_block_1_fc2_b = nullptr;
304
+ lm_ggml_tensor * mm_model_block_2_block_2_0_w = nullptr;
305
+ lm_ggml_tensor * mm_model_block_2_block_2_1_w = nullptr;
306
+ lm_ggml_tensor * mm_model_block_2_block_2_1_b = nullptr;
307
+
308
+ // MobileVLM_V2 projection
309
+ lm_ggml_tensor * mm_model_mlp_0_w = nullptr;
310
+ lm_ggml_tensor * mm_model_mlp_0_b = nullptr;
311
+ lm_ggml_tensor * mm_model_mlp_2_w = nullptr;
312
+ lm_ggml_tensor * mm_model_mlp_2_b = nullptr;
313
+ lm_ggml_tensor * mm_model_peg_0_w = nullptr;
314
+ lm_ggml_tensor * mm_model_peg_0_b = nullptr;
315
+
316
+ // MINICPMV projection
317
+ lm_ggml_tensor * mm_model_pos_embed_k = nullptr;
318
+ lm_ggml_tensor * mm_model_query = nullptr;
319
+ lm_ggml_tensor * mm_model_proj = nullptr;
320
+ lm_ggml_tensor * mm_model_kv_proj = nullptr;
321
+ lm_ggml_tensor * mm_model_attn_q_w = nullptr;
322
+ lm_ggml_tensor * mm_model_attn_q_b = nullptr;
323
+ lm_ggml_tensor * mm_model_attn_k_w = nullptr;
324
+ lm_ggml_tensor * mm_model_attn_k_b = nullptr;
325
+ lm_ggml_tensor * mm_model_attn_v_w = nullptr;
326
+ lm_ggml_tensor * mm_model_attn_v_b = nullptr;
327
+ lm_ggml_tensor * mm_model_attn_o_w = nullptr;
328
+ lm_ggml_tensor * mm_model_attn_o_b = nullptr;
329
+ lm_ggml_tensor * mm_model_ln_q_w = nullptr;
330
+ lm_ggml_tensor * mm_model_ln_q_b = nullptr;
331
+ lm_ggml_tensor * mm_model_ln_kv_w = nullptr;
332
+ lm_ggml_tensor * mm_model_ln_kv_b = nullptr;
333
+ lm_ggml_tensor * mm_model_ln_post_w = nullptr;
334
+ lm_ggml_tensor * mm_model_ln_post_b = nullptr;
335
+
336
+ // gemma3
337
+ lm_ggml_tensor * mm_input_proj_w = nullptr;
338
+ lm_ggml_tensor * mm_soft_emb_norm_w = nullptr;
339
+
340
+ // pixtral
341
+ lm_ggml_tensor * token_embd_img_break = nullptr;
342
+ lm_ggml_tensor * mm_patch_merger_w = nullptr;
343
+
344
+ // ultravox / whisper encoder
345
+ lm_ggml_tensor * conv1d_1_w = nullptr;
346
+ lm_ggml_tensor * conv1d_1_b = nullptr;
347
+ lm_ggml_tensor * conv1d_2_w = nullptr;
348
+ lm_ggml_tensor * conv1d_2_b = nullptr;
349
+ lm_ggml_tensor * mm_norm_pre_w = nullptr;
350
+ lm_ggml_tensor * mm_norm_mid_w = nullptr;
351
+ };
352
+
353
+ struct clip_ctx {
354
+ bool has_llava_projector = false;
355
+ int minicpmv_version = 0;
356
+
357
+ struct clip_vision_model vision_model;
358
+ projector_type proj_type = PROJECTOR_TYPE_MLP;
359
+
360
+ float image_mean[3];
361
+ float image_std[3];
362
+
363
+ lm_gguf_context_ptr ctx_gguf;
364
+ lm_ggml_context_ptr ctx_data;
365
+
366
+ std::vector<uint8_t> buf_compute_meta;
367
+
368
+ std::vector<lm_ggml_backend_t> backend_ptrs;
369
+ std::vector<lm_ggml_backend_buffer_type_t> backend_buft;
370
+
371
+ lm_ggml_backend_t backend;
372
+ lm_ggml_backend_t backend_cpu;
373
+ lm_ggml_backend_buffer_ptr buf;
374
+
375
+ int max_nodes = 8192;
376
+ lm_ggml_backend_sched_ptr sched;
377
+
378
+ // for debugging
379
+ bool debug_graph = false;
380
+ std::vector<lm_ggml_tensor *> debug_print_tensors;
381
+
382
+ clip_ctx(clip_context_params & ctx_params) {
383
+ debug_graph = std::getenv("MTMD_DEBUG_GRAPH") != nullptr;
384
+ backend_cpu = lm_ggml_backend_init_by_type(LM_GGML_BACKEND_DEVICE_TYPE_CPU, nullptr);
385
+ if (!backend_cpu) {
386
+ throw std::runtime_error("failed to initialize CPU backend");
387
+ }
388
+ backend = ctx_params.use_gpu
389
+ ? lm_ggml_backend_init_by_type(LM_GGML_BACKEND_DEVICE_TYPE_GPU, nullptr)
390
+ : nullptr;
391
+
392
+ if (backend) {
393
+ LOG_INF("%s: CLIP using %s backend\n", __func__, lm_ggml_backend_name(backend));
394
+ backend_ptrs.push_back(backend);
395
+ backend_buft.push_back(lm_ggml_backend_get_default_buffer_type(backend));
396
+ } else {
397
+ backend = backend_cpu;
398
+ LOG_INF("%s: CLIP using CPU backend\n", __func__);
399
+ }
400
+
401
+ backend_ptrs.push_back(backend_cpu);
402
+ backend_buft.push_back(lm_ggml_backend_get_default_buffer_type(backend_cpu));
403
+
404
+ sched.reset(
405
+ lm_ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), 8192, false, true)
406
+ );
407
+ }
408
+
409
+ ~clip_ctx() {
410
+ lm_ggml_backend_free(backend);
411
+ if (backend != backend_cpu) {
412
+ lm_ggml_backend_free(backend_cpu);
413
+ }
414
+ }
415
+ };
416
+
417
+ struct clip_graph {
418
+ clip_ctx * ctx;
419
+ const clip_vision_model & model;
420
+ const clip_hparams & hparams;
421
+
422
+ // we only support single image per batch
423
+ const clip_image_f32 & img;
424
+
425
+ const int patch_size;
426
+ const int n_patches_x;
427
+ const int n_patches_y;
428
+ const int n_patches;
429
+ const int n_embd;
430
+ const int n_head;
431
+ const int d_head;
432
+ const int n_layer;
433
+ const float eps;
434
+ const float kq_scale;
435
+
436
+ lm_ggml_context_ptr ctx0_ptr;
437
+ lm_ggml_context * ctx0;
438
+ lm_ggml_cgraph * gf;
439
+
440
+ clip_graph(clip_ctx * ctx, const clip_image_f32 & img) :
441
+ ctx(ctx),
442
+ model(ctx->vision_model),
443
+ hparams(model.hparams),
444
+ img(img),
445
+ patch_size(hparams.patch_size),
446
+ n_patches_x(img.nx / patch_size),
447
+ n_patches_y(img.ny / patch_size),
448
+ n_patches(n_patches_x * n_patches_y),
449
+ n_embd(hparams.n_embd),
450
+ n_head(hparams.n_head),
451
+ d_head(n_embd / n_head),
452
+ n_layer(hparams.n_layer),
453
+ eps(hparams.eps),
454
+ kq_scale(1.0f / sqrtf((float)d_head)) {
455
+ struct lm_ggml_init_params params = {
456
+ /*.mem_size =*/ ctx->buf_compute_meta.size(),
457
+ /*.mem_buffer =*/ ctx->buf_compute_meta.data(),
458
+ /*.no_alloc =*/ true,
459
+ };
460
+ ctx0_ptr.reset(lm_ggml_init(params));
461
+ ctx0 = ctx0_ptr.get();
462
+ gf = lm_ggml_new_graph_custom(ctx0, ctx->max_nodes, false);
463
+ }
464
+
465
+ lm_ggml_cgraph * build_siglip() {
466
+ lm_ggml_tensor * inp = build_inp();
467
+ lm_ggml_tensor * cur = build_vit(
468
+ inp, n_patches,
469
+ NORM_TYPE_NORMAL,
470
+ hparams.ffn_op,
471
+ model.position_embeddings,
472
+ nullptr);
473
+
474
+ if (ctx->proj_type == PROJECTOR_TYPE_GEMMA3) {
475
+ const int batch_size = 1;
476
+ LM_GGML_ASSERT(n_patches_x == n_patches_y);
477
+ const int patches_per_image = n_patches_x;
478
+ const int kernel_size = hparams.proj_scale_factor;
479
+
480
+ cur = lm_ggml_cont(ctx0, lm_ggml_transpose(ctx0, cur));
481
+ cur = lm_ggml_reshape_4d(ctx0, cur, patches_per_image, patches_per_image, n_embd, batch_size);
482
+
483
+ // doing a pool2d to reduce the number of output tokens
484
+ cur = lm_ggml_pool_2d(ctx0, cur, LM_GGML_OP_POOL_AVG, kernel_size, kernel_size, kernel_size, kernel_size, 0, 0);
485
+ cur = lm_ggml_reshape_3d(ctx0, cur, cur->ne[0] * cur->ne[0], n_embd, batch_size);
486
+ cur = lm_ggml_cont(ctx0, lm_ggml_transpose(ctx0, cur));
487
+
488
+ // apply norm before projection
489
+ cur = lm_ggml_rms_norm(ctx0, cur, eps);
490
+ cur = lm_ggml_mul(ctx0, cur, model.mm_soft_emb_norm_w);
491
+
492
+ // apply projection
493
+ cur = lm_ggml_mul_mat(ctx0,
494
+ lm_ggml_cont(ctx0, lm_ggml_transpose(ctx0, model.mm_input_proj_w)),
495
+ cur);
496
+
497
+ } else if (ctx->proj_type == PROJECTOR_TYPE_IDEFICS3) {
498
+ // https://github.com/huggingface/transformers/blob/0a950e0bbe1ed58d5401a6b547af19f15f0c195e/src/transformers/models/idefics3/modeling_idefics3.py#L578
499
+
500
+ const int scale_factor = model.hparams.proj_scale_factor;
501
+ const int n_embd = cur->ne[0];
502
+ const int seq = cur->ne[1];
503
+ const int bsz = 1; // batch size, always 1 for now since we don't support batching
504
+ const int height = std::sqrt(seq);
505
+ const int width = std::sqrt(seq);
506
+ LM_GGML_ASSERT(scale_factor != 0);
507
+ cur = lm_ggml_reshape_4d(ctx0, cur, n_embd * scale_factor, width / scale_factor, height, bsz);
508
+ cur = lm_ggml_permute(ctx0, cur, 0, 2, 1, 3);
509
+ cur = lm_ggml_reshape_4d(ctx0, lm_ggml_cont(ctx0, cur),
510
+ n_embd * scale_factor * scale_factor,
511
+ height / scale_factor,
512
+ width / scale_factor,
513
+ bsz);
514
+ cur = lm_ggml_permute(ctx0, cur, 0, 2, 1, 3);
515
+ cur = lm_ggml_reshape_3d(ctx0, lm_ggml_cont(ctx0, cur),
516
+ n_embd * scale_factor * scale_factor,
517
+ seq / (scale_factor * scale_factor),
518
+ bsz);
519
+
520
+ cur = lm_ggml_mul_mat(ctx0, model.projection, cur);
521
+ } else {
522
+ LM_GGML_ABORT("SigLIP: Unsupported projector type");
523
+ }
524
+
525
+ // build the graph
526
+ lm_ggml_build_forward_expand(gf, cur);
527
+
528
+ return gf;
529
+ }
530
+
531
+ lm_ggml_cgraph * build_pixtral() {
532
+ const int n_merge = hparams.spatial_merge_size;
533
+
534
+ // 2D input positions
535
+ lm_ggml_tensor * pos_h = lm_ggml_new_tensor_1d(ctx0, LM_GGML_TYPE_I32, n_patches);
536
+ lm_ggml_set_name(pos_h, "pos_h");
537
+ lm_ggml_set_input(pos_h);
538
+
539
+ lm_ggml_tensor * pos_w = lm_ggml_new_tensor_1d(ctx0, LM_GGML_TYPE_I32, n_patches);
540
+ lm_ggml_set_name(pos_w, "pos_w");
541
+ lm_ggml_set_input(pos_w);
542
+
543
+ auto add_pos = [&](lm_ggml_tensor * cur, const clip_layer &) {
544
+ return build_rope_2d(ctx0, cur, pos_h, pos_w, hparams.rope_theta, true);
545
+ };
546
+
547
+ lm_ggml_tensor * inp = build_inp();
548
+ lm_ggml_tensor * cur = build_vit(
549
+ inp, n_patches,
550
+ NORM_TYPE_RMS,
551
+ hparams.ffn_op,
552
+ nullptr, // no learned pos embd
553
+ add_pos);
554
+
555
+ // mistral small 3.1 patch merger
556
+ // ref: https://github.com/huggingface/transformers/blob/7a3e208892c06a5e278144eaf38c8599a42f53e7/src/transformers/models/mistral3/modeling_mistral3.py#L67
557
+ if (model.mm_patch_merger_w) {
558
+ LM_GGML_ASSERT(hparams.spatial_merge_size > 0);
559
+
560
+ cur = lm_ggml_mul(ctx0, lm_ggml_rms_norm(ctx0, cur, eps), model.mm_input_norm_w);
561
+
562
+ // reshape image tokens to 2D grid
563
+ cur = lm_ggml_reshape_3d(ctx0, cur, n_embd, n_patches_x, n_patches_y);
564
+ cur = lm_ggml_permute(ctx0, cur, 2, 0, 1, 3); // [x, y, n_embd]
565
+ cur = lm_ggml_cont(ctx0, cur);
566
+
567
+ // torch.nn.functional.unfold is just an im2col under the hood
568
+ // we just need a dummy kernel to make it work
569
+ lm_ggml_tensor * kernel = lm_ggml_view_3d(ctx0, cur, n_merge, n_merge, cur->ne[2], 0, 0, 0);
570
+ cur = lm_ggml_im2col(ctx0, kernel, cur, n_merge, n_merge, 0, 0, 1, 1, true, inp->type);
571
+
572
+ // project to n_embd
573
+ cur = lm_ggml_reshape_2d(ctx0, cur, cur->ne[0], cur->ne[1] * cur->ne[2]);
574
+ cur = lm_ggml_mul_mat(ctx0, model.mm_patch_merger_w, cur);
575
+ }
576
+
577
+ // LlavaMultiModalProjector (always using GELU activation)
578
+ {
579
+ cur = lm_ggml_mul_mat(ctx0, model.mm_1_w, cur);
580
+ if (model.mm_1_b) {
581
+ cur = lm_ggml_add(ctx0, cur, model.mm_1_b);
582
+ }
583
+
584
+ cur = lm_ggml_gelu(ctx0, cur);
585
+ cur = lm_ggml_mul_mat(ctx0, model.mm_2_w, cur);
586
+ if (model.mm_2_b) {
587
+ cur = lm_ggml_add(ctx0, cur, model.mm_2_b);
588
+ }
589
+ }
590
+
591
+ // arrangement of the [IMG_BREAK] token
592
+ {
593
+ // not efficient, but works
594
+ // the trick is to view the embeddings as a 3D tensor with shape [n_embd, n_patches_per_row, n_rows]
595
+ // and then concatenate the [IMG_BREAK] token to the end of each row, aka n_patches_per_row dimension
596
+ // after the concatenation, we have a tensor with shape [n_embd, n_patches_per_row + 1, n_rows]
597
+
598
+ const int p_y = n_merge > 0 ? n_patches_y / n_merge : n_patches_y;
599
+ const int p_x = n_merge > 0 ? n_patches_x / n_merge : n_patches_x;
600
+ const int p_total = p_x * p_y;
601
+ const int n_embd_text = cur->ne[0];
602
+ const int n_tokens_output = p_total + p_y - 1; // one [IMG_BREAK] per row, except the last row
603
+
604
+ lm_ggml_tensor * tmp = lm_ggml_reshape_3d(ctx0, cur, n_embd_text, p_x, p_y);
605
+ lm_ggml_tensor * tok = lm_ggml_new_tensor_3d(ctx0, tmp->type, n_embd_text, 1, p_y);
606
+ tok = lm_ggml_scale(ctx0, tok, 0.0); // clear the tensor
607
+ tok = lm_ggml_add(ctx0, tok, model.token_embd_img_break);
608
+ tmp = lm_ggml_concat(ctx0, tmp, tok, 1);
609
+ cur = lm_ggml_view_2d(ctx0, tmp,
610
+ n_embd_text, n_tokens_output,
611
+ lm_ggml_row_size(tmp->type, n_embd_text), 0);
612
+ }
613
+
614
+ // build the graph
615
+ lm_ggml_build_forward_expand(gf, cur);
616
+
617
+ return gf;
618
+ }
619
+
620
+ // Qwen2VL and Qwen2.5VL use M-RoPE
621
+ lm_ggml_cgraph * build_qwen2vl() {
622
+ LM_GGML_ASSERT(model.patch_bias == nullptr);
623
+ LM_GGML_ASSERT(model.class_embedding == nullptr);
624
+
625
+ const int batch_size = 1;
626
+ const bool use_window_attn = hparams.n_wa_pattern > 0;
627
+ const int n_wa_pattern = hparams.n_wa_pattern;
628
+ const int n_pos = n_patches;
629
+ const int num_position_ids = n_pos * 4; // m-rope requires 4 dim per position
630
+
631
+ norm_type norm_t = ctx->proj_type == PROJECTOR_TYPE_QWEN25VL
632
+ ? NORM_TYPE_RMS // qwen 2.5 vl
633
+ : NORM_TYPE_NORMAL; // qwen 2 vl
634
+
635
+ int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4};
636
+
637
+ lm_ggml_tensor * inp_raw = build_inp_raw();
638
+ lm_ggml_tensor * inp = lm_ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
639
+
640
+ LM_GGML_ASSERT(img.nx % (patch_size * 2) == 0);
641
+ LM_GGML_ASSERT(img.ny % (patch_size * 2) == 0);
642
+
643
+ // second conv dimension
644
+ {
645
+ auto inp_1 = lm_ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
646
+ inp = lm_ggml_add(ctx0, inp, inp_1);
647
+
648
+ inp = lm_ggml_cont(ctx0, lm_ggml_permute(ctx0, inp, 1, 2, 0, 3)); // [w, h, c, b] -> [c, w, h, b]
649
+ inp = lm_ggml_reshape_4d(
650
+ ctx0, inp,
651
+ n_embd * 2, n_patches_x / 2, n_patches_y, batch_size);
652
+ inp = lm_ggml_reshape_4d(
653
+ ctx0, inp,
654
+ n_embd * 2, n_patches_x / 2, 2, batch_size * (n_patches_y / 2));
655
+ inp = lm_ggml_cont(ctx0, lm_ggml_permute(ctx0, inp, 0, 2, 1, 3));
656
+ inp = lm_ggml_reshape_3d(
657
+ ctx0, inp,
658
+ n_embd, n_patches_x * n_patches_y, batch_size);
659
+ }
660
+
661
+ lm_ggml_tensor * inpL = inp;
662
+ lm_ggml_tensor * window_mask = nullptr;
663
+ lm_ggml_tensor * window_idx = nullptr;
664
+ lm_ggml_tensor * inv_window_idx = nullptr;
665
+
666
+ lm_ggml_tensor * positions = lm_ggml_new_tensor_1d(ctx0, LM_GGML_TYPE_I32, num_position_ids);
667
+ lm_ggml_set_name(positions, "positions");
668
+ lm_ggml_set_input(positions);
669
+
670
+ // pre-layernorm
671
+ if (model.pre_ln_w) {
672
+ inpL = build_norm(inpL, model.pre_ln_w, model.pre_ln_b, norm_t, eps, -1);
673
+ }
674
+
675
+ if (use_window_attn) {
676
+ // handle window attention inputs
677
+ inv_window_idx = lm_ggml_new_tensor_1d(ctx0, LM_GGML_TYPE_I32, n_pos / 4);
678
+ lm_ggml_set_name(inv_window_idx, "inv_window_idx");
679
+ lm_ggml_set_input(inv_window_idx);
680
+ // mask for window attention
681
+ window_mask = lm_ggml_new_tensor_2d(ctx0, LM_GGML_TYPE_F32, n_pos, n_pos);
682
+ lm_ggml_set_name(window_mask, "window_mask");
683
+ lm_ggml_set_input(window_mask);
684
+
685
+ // inpL shape: [n_embd, n_patches_x * n_patches_y, batch_size]
686
+ LM_GGML_ASSERT(batch_size == 1);
687
+ inpL = lm_ggml_reshape_2d(ctx0, inpL, n_embd * 4, n_patches_x * n_patches_y * batch_size / 4);
688
+ inpL = lm_ggml_get_rows(ctx0, inpL, inv_window_idx);
689
+ inpL = lm_ggml_reshape_3d(ctx0, inpL, n_embd, n_patches_x * n_patches_y, batch_size);
690
+ }
691
+
692
+ // loop over layers
693
+ for (int il = 0; il < n_layer; il++) {
694
+ auto & layer = model.layers[il];
695
+ const bool full_attn = use_window_attn ? (il + 1) % n_wa_pattern == 0 : true;
696
+
697
+ lm_ggml_tensor * cur = inpL; // inpL = residual, cur = hidden_states
698
+
699
+ // layernorm1
700
+ cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, norm_t, eps, il);
701
+ cb(cur, "ln1", il);
702
+
703
+ // self-attention
704
+ {
705
+ lm_ggml_tensor * Qcur = lm_ggml_add(ctx0,
706
+ lm_ggml_mul_mat(ctx0, layer.q_w, cur), layer.q_b);
707
+ lm_ggml_tensor * Kcur = lm_ggml_add(ctx0,
708
+ lm_ggml_mul_mat(ctx0, layer.k_w, cur), layer.k_b);
709
+ lm_ggml_tensor * Vcur = lm_ggml_add(ctx0,
710
+ lm_ggml_mul_mat(ctx0, layer.v_w, cur), layer.v_b);
711
+
712
+ Qcur = lm_ggml_reshape_3d(ctx0, Qcur, d_head, n_head, n_patches);
713
+ Kcur = lm_ggml_reshape_3d(ctx0, Kcur, d_head, n_head, n_patches);
714
+ Vcur = lm_ggml_reshape_3d(ctx0, Vcur, d_head, n_head, n_patches);
715
+
716
+ cb(Qcur, "Qcur", il);
717
+ cb(Kcur, "Kcur", il);
718
+ cb(Vcur, "Vcur", il);
719
+
720
+ // apply M-RoPE
721
+ Qcur = lm_ggml_rope_multi(
722
+ ctx0, Qcur, positions, nullptr,
723
+ d_head/2, mrope_sections, LM_GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1);
724
+ Kcur = lm_ggml_rope_multi(
725
+ ctx0, Kcur, positions, nullptr,
726
+ d_head/2, mrope_sections, LM_GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1);
727
+
728
+ cb(Qcur, "Qcur_rope", il);
729
+ cb(Kcur, "Kcur_rope", il);
730
+
731
+ lm_ggml_tensor * attn_mask = full_attn ? nullptr : window_mask;
732
+
733
+ cur = build_attn(layer.o_w, layer.o_b,
734
+ Qcur, Kcur, Vcur, attn_mask, kq_scale, il);
735
+ cb(cur, "attn_out", il);
736
+ }
737
+
738
+ // re-add the layer input, e.g., residual
739
+ cur = lm_ggml_add(ctx0, cur, inpL);
740
+
741
+ inpL = cur; // inpL = residual, cur = hidden_states
742
+
743
+ cb(cur, "ffn_inp", il);
744
+
745
+ // layernorm2
746
+ cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, norm_t, eps, il);
747
+ cb(cur, "ffn_inp_normed", il);
748
+
749
+ // ffn
750
+ cur = build_ffn(cur,
751
+ layer.ff_up_w, layer.ff_up_b,
752
+ layer.ff_gate_w, layer.ff_gate_b,
753
+ layer.ff_down_w, layer.ff_down_b,
754
+ hparams.ffn_op, il);
755
+
756
+ cb(cur, "ffn_out", il);
757
+
758
+ // residual 2
759
+ cur = lm_ggml_add(ctx0, inpL, cur);
760
+ cb(cur, "layer_out", il);
761
+
762
+ inpL = cur;
763
+ }
764
+
765
+ // post-layernorm
766
+ if (model.post_ln_w) {
767
+ inpL = build_norm(inpL, model.post_ln_w, model.post_ln_b, norm_t, eps, n_layer);
768
+ }
769
+
770
+ // multimodal projection
771
+ lm_ggml_tensor * embeddings = inpL;
772
+ embeddings = lm_ggml_reshape_3d(ctx0, embeddings, n_embd * 4, n_pos / 4, batch_size);
773
+
774
+ embeddings = lm_ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
775
+ embeddings = lm_ggml_add(ctx0, embeddings, model.mm_0_b);
776
+
777
+ // GELU activation
778
+ embeddings = lm_ggml_gelu(ctx0, embeddings);
779
+
780
+ // Second linear layer
781
+ embeddings = lm_ggml_mul_mat(ctx0, model.mm_1_w, embeddings);
782
+ embeddings = lm_ggml_add(ctx0, embeddings, model.mm_1_b);
783
+
784
+ if (use_window_attn) {
785
+ window_idx = lm_ggml_new_tensor_1d(ctx0, LM_GGML_TYPE_I32, n_pos / 4);
786
+ lm_ggml_set_name(window_idx, "window_idx");
787
+ lm_ggml_set_input(window_idx);
788
+
789
+ // embeddings shape: [n_embd, n_patches_x * n_patches_y, batch_size]
790
+ LM_GGML_ASSERT(batch_size == 1);
791
+ embeddings = lm_ggml_reshape_2d(ctx0, embeddings, hparams.projection_dim, n_patches_x * n_patches_y / 4);
792
+ embeddings = lm_ggml_get_rows(ctx0, embeddings, window_idx);
793
+ embeddings = lm_ggml_reshape_3d(ctx0, embeddings, hparams.projection_dim, n_patches_x * n_patches_y / 4, batch_size);
794
+ }
795
+
796
+ // build the graph
797
+ lm_ggml_build_forward_expand(gf, embeddings);
798
+
799
+ return gf;
800
+ }
801
+
802
+ lm_ggml_cgraph * build_minicpmv() {
803
+ const int batch_size = 1;
804
+
805
+ LM_GGML_ASSERT(model.class_embedding == nullptr);
806
+ const int n_pos = n_patches;
807
+
808
+ // position embeddings for the projector (not for ViT)
809
+ int n_output_dim = clip_n_mmproj_embd(ctx);
810
+ lm_ggml_tensor * pos_embed = lm_ggml_new_tensor_3d(ctx0, LM_GGML_TYPE_F32, n_output_dim, n_pos, batch_size);
811
+ lm_ggml_set_name(pos_embed, "pos_embed");
812
+ lm_ggml_set_input(pos_embed);
813
+
814
+ // for selecting learned pos embd, used by ViT
815
+ struct lm_ggml_tensor * positions = lm_ggml_new_tensor_1d(ctx0, LM_GGML_TYPE_I32, n_pos);
816
+ lm_ggml_set_name(positions, "positions");
817
+ lm_ggml_set_input(positions);
818
+
819
+ lm_ggml_tensor * learned_pos_embd = lm_ggml_get_rows(ctx0, model.position_embeddings, positions);
820
+
821
+ lm_ggml_tensor * inp = build_inp();
822
+ lm_ggml_tensor * embeddings = build_vit(
823
+ inp, n_patches,
824
+ NORM_TYPE_NORMAL,
825
+ hparams.ffn_op,
826
+ learned_pos_embd,
827
+ nullptr);
828
+
829
+ // resampler projector (it is just another transformer)
830
+
831
+ lm_ggml_tensor * q = model.mm_model_query;
832
+ lm_ggml_tensor * v = lm_ggml_mul_mat(ctx0, model.mm_model_kv_proj, embeddings);
833
+
834
+ // norm
835
+ q = build_norm(q, model.mm_model_ln_q_w, model.mm_model_ln_q_b, NORM_TYPE_NORMAL, eps, -1);
836
+ v = build_norm(v, model.mm_model_ln_kv_w, model.mm_model_ln_kv_b, NORM_TYPE_NORMAL, eps, -1);
837
+
838
+ // k = v + pos_embed
839
+ lm_ggml_tensor * k = lm_ggml_add(ctx0, v, pos_embed);
840
+
841
+ // attention
842
+ {
843
+ int n_embd = clip_n_mmproj_embd(ctx);
844
+ const int d_head = 128;
845
+ int n_head = n_embd/d_head;
846
+ int num_query = 96;
847
+ if (ctx->minicpmv_version == 2) {
848
+ num_query = 96;
849
+ } else if (ctx->minicpmv_version == 3) {
850
+ num_query = 64;
851
+ } else if (ctx->minicpmv_version == 4) {
852
+ num_query = 64;
853
+ }
854
+
855
+ lm_ggml_tensor * Q = lm_ggml_add(ctx0,
856
+ lm_ggml_mul_mat(ctx0, model.mm_model_attn_q_w, q),
857
+ model.mm_model_attn_q_b);
858
+ lm_ggml_tensor * K = lm_ggml_add(ctx0,
859
+ lm_ggml_mul_mat(ctx0, model.mm_model_attn_k_w, k),
860
+ model.mm_model_attn_k_b);
861
+ lm_ggml_tensor * V = lm_ggml_add(ctx0,
862
+ lm_ggml_mul_mat(ctx0, model.mm_model_attn_v_w, v),
863
+ model.mm_model_attn_v_b);
864
+
865
+ Q = lm_ggml_reshape_3d(ctx0, Q, d_head, n_head, num_query);
866
+ K = lm_ggml_reshape_3d(ctx0, K, d_head, n_head, n_pos);
867
+ V = lm_ggml_reshape_3d(ctx0, V, d_head, n_head, n_pos);
868
+
869
+ cb(Q, "resampler_Q", -1);
870
+ cb(K, "resampler_K", -1);
871
+ cb(V, "resampler_V", -1);
872
+
873
+ embeddings = build_attn(
874
+ model.mm_model_attn_o_w,
875
+ model.mm_model_attn_o_b,
876
+ Q, K, V, nullptr, kq_scale, -1);
877
+ cb(embeddings, "resampler_attn_out", -1);
878
+ }
879
+ // layernorm
880
+ embeddings = build_norm(embeddings, model.mm_model_ln_post_w, model.mm_model_ln_post_b, NORM_TYPE_NORMAL, eps, -1);
881
+
882
+ // projection
883
+ embeddings = lm_ggml_mul_mat(ctx0, model.mm_model_proj, embeddings);
884
+
885
+ // build the graph
886
+ lm_ggml_build_forward_expand(gf, embeddings);
887
+
888
+ return gf;
889
+ }
890
+
891
+ lm_ggml_cgraph * build_internvl() {
892
+ LM_GGML_ASSERT(model.class_embedding != nullptr);
893
+ LM_GGML_ASSERT(model.position_embeddings != nullptr);
894
+
895
+ const int n_pos = n_patches + 1;
896
+ lm_ggml_tensor * inp = build_inp();
897
+
898
+ // add CLS token
899
+ inp = lm_ggml_concat(ctx0, inp, model.class_embedding, 1);
900
+
901
+ // The larger models use a different ViT, which uses RMS norm instead of layer norm
902
+ // ref: https://github.com/ggml-org/llama.cpp/pull/13443#issuecomment-2869786188
903
+ norm_type norm_t = (hparams.n_embd == 3200 && hparams.n_layer == 45)
904
+ ? NORM_TYPE_RMS // 6B ViT (Used by InternVL 2.5/3 - 26B, 38B, 78B)
905
+ : NORM_TYPE_NORMAL; // 300M ViT (Used by all smaller InternVL models)
906
+
907
+ lm_ggml_tensor * cur = build_vit(
908
+ inp, n_pos,
909
+ norm_t,
910
+ hparams.ffn_op,
911
+ model.position_embeddings,
912
+ nullptr);
913
+
914
+ // remove CLS token
915
+ cur = lm_ggml_view_2d(ctx0, cur,
916
+ n_embd, n_patches,
917
+ lm_ggml_row_size(cur->type, n_embd), 0);
918
+
919
+ // pixel shuffle
920
+ {
921
+ const int scale_factor = model.hparams.proj_scale_factor;
922
+ const int bsz = 1; // batch size, always 1 for now since we don't support batching
923
+ const int height = n_patches_y;
924
+ const int width = n_patches_x;
925
+ LM_GGML_ASSERT(scale_factor > 0);
926
+ cur = lm_ggml_reshape_4d(ctx0, cur, n_embd * scale_factor, height / scale_factor, width, bsz);
927
+ cur = lm_ggml_permute(ctx0, cur, 0, 2, 1, 3);
928
+ cur = lm_ggml_reshape_4d(ctx0, lm_ggml_cont(ctx0, cur),
929
+ n_embd * scale_factor * scale_factor,
930
+ height / scale_factor,
931
+ width / scale_factor,
932
+ bsz);
933
+ cur = lm_ggml_permute(ctx0, cur, 0, 2, 1, 3);
934
+ // flatten to 2D
935
+ cur = lm_ggml_reshape_2d(ctx0, lm_ggml_cont(ctx0, cur),
936
+ n_embd * scale_factor * scale_factor,
937
+ cur->ne[1] * cur->ne[2]);
938
+ }
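+        // illustrative shapes (hypothetical 32x32 patch grid, scale_factor = 2): the
+        // [n_embd, 1024] input is regrouped so that every 2x2 block of neighbouring patches
+        // is concatenated along the channel dimension, giving [n_embd*4, 256] -- 4x fewer
+        // tokens, each 4x wider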
939
+
940
+ // projector (always using GELU activation)
941
+ {
942
+ // projector LayerNorm uses pytorch's default eps = 1e-5
943
+ // ref: https://huggingface.co/OpenGVLab/InternVL3-8B-Instruct/blob/a34d3e4e129a5856abfd6aa6de79776484caa14e/modeling_internvl_chat.py#L79
944
+ cur = build_norm(cur, model.mm_0_w, model.mm_0_b, NORM_TYPE_NORMAL, 1e-5, -1);
945
+ cur = lm_ggml_mul_mat(ctx0, model.mm_1_w, cur);
946
+ cur = lm_ggml_add(ctx0, cur, model.mm_1_b);
947
+ cur = lm_ggml_gelu(ctx0, cur);
948
+ cur = lm_ggml_mul_mat(ctx0, model.mm_3_w, cur);
949
+ cur = lm_ggml_add(ctx0, cur, model.mm_3_b);
950
+ }
951
+
952
+ // build the graph
953
+ lm_ggml_build_forward_expand(gf, cur);
954
+
955
+ return gf;
956
+ }
957
+
958
+ lm_ggml_cgraph * build_llama4() {
959
+ LM_GGML_ASSERT(model.class_embedding != nullptr);
960
+ LM_GGML_ASSERT(model.position_embeddings != nullptr);
961
+
962
+ const int n_pos = n_patches + 1; // +1 for [CLS]
963
+
964
+ // 2D input positions
965
+ lm_ggml_tensor * pos_h = lm_ggml_new_tensor_1d(ctx0, LM_GGML_TYPE_I32, n_pos);
966
+ lm_ggml_set_name(pos_h, "pos_h");
967
+ lm_ggml_set_input(pos_h);
968
+
969
+ lm_ggml_tensor * pos_w = lm_ggml_new_tensor_1d(ctx0, LM_GGML_TYPE_I32, n_pos);
970
+ lm_ggml_set_name(pos_w, "pos_w");
971
+ lm_ggml_set_input(pos_w);
972
+
973
+ lm_ggml_tensor * inp = build_inp_raw();
974
+
975
+ // Llama4UnfoldConvolution
976
+ {
977
+ lm_ggml_tensor * kernel = lm_ggml_reshape_4d(ctx0, model.patch_embeddings_0,
978
+ patch_size, patch_size, 3, n_embd);
979
+ inp = lm_ggml_im2col(ctx0, kernel, inp, patch_size, patch_size, 0, 0, 1, 1, true, inp->type);
980
+ inp = lm_ggml_mul_mat(ctx0, model.patch_embeddings_0, inp);
981
+ inp = lm_ggml_reshape_2d(ctx0, inp, n_embd, n_patches);
982
+ cb(inp, "patch_conv", -1);
983
+ }
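+        // the im2col + mul_mat pair above is the unfold formulation of a strided conv2d patch
+        // embedding: im2col with stride == patch_size extracts non-overlapping
+        // patch_size x patch_size x 3 columns, and the mul_mat with the flattened kernel
+        // projects each column down to n_embd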
984
+
985
+ // add CLS token
986
+ inp = lm_ggml_concat(ctx0, inp, model.class_embedding, 1);
987
+
988
+ // build ViT with 2D position embeddings
989
+ auto add_pos = [&](lm_ggml_tensor * cur, const clip_layer &) {
990
+ // first half is X axis and second half is Y axis
991
+ // ref: https://github.com/huggingface/transformers/blob/40a493c7ed4f19f08eadb0639cf26d49bfa5e180/src/transformers/models/llama4/modeling_llama4.py#L1312
992
+ // ref: https://github.com/Blaizzy/mlx-vlm/blob/a57156aa87b33cca6e5ee6cfc14dd4ef8f611be6/mlx_vlm/models/llama4/vision.py#L441
993
+ return build_rope_2d(ctx0, cur, pos_w, pos_h, hparams.rope_theta, false);
994
+ };
995
+ lm_ggml_tensor * cur = build_vit(
996
+ inp, n_pos,
997
+ NORM_TYPE_NORMAL,
998
+ hparams.ffn_op,
999
+ model.position_embeddings,
1000
+ add_pos);
1001
+
1002
+ // remove CLS token
1003
+ cur = lm_ggml_view_2d(ctx0, cur,
1004
+ n_embd, n_patches,
1005
+ lm_ggml_row_size(cur->type, n_embd), 0);
1006
+
1007
+ // pixel shuffle
1008
+ // based on Llama4VisionPixelShuffleMLP
1009
+ // https://github.com/huggingface/transformers/blob/2932f318a20d9e54cc7aea052e040164d85de7d6/src/transformers/models/llama4/modeling_llama4.py#L1151
1010
+ {
1011
+ const int scale_factor = model.hparams.proj_scale_factor;
1012
+ const int bsz = 1; // batch size, always 1 for now since we don't support batching
1013
+ LM_GGML_ASSERT(scale_factor > 0);
1014
+ LM_GGML_ASSERT(n_patches_x == n_patches_y); // llama4 only supports square images
1015
+ cur = lm_ggml_reshape_4d(ctx0, cur,
1016
+ n_embd * scale_factor,
1017
+ n_patches_x / scale_factor,
1018
+ n_patches_y,
1019
+ bsz);
1020
+ cur = lm_ggml_permute(ctx0, cur, 0, 2, 1, 3);
1021
+ cur = lm_ggml_reshape_4d(ctx0, lm_ggml_cont(ctx0, cur),
1022
+ n_embd * scale_factor * scale_factor,
1023
+ n_patches_x / scale_factor,
1024
+ n_patches_y / scale_factor,
1025
+ bsz);
1026
+ cur = lm_ggml_permute(ctx0, cur, 0, 2, 1, 3);
1027
+ // flatten to 2D
1028
+ cur = lm_ggml_reshape_2d(ctx0, lm_ggml_cont(ctx0, cur),
1029
+ n_embd * scale_factor * scale_factor,
1030
+ n_patches / scale_factor / scale_factor);
1031
+ cb(cur, "pixel_shuffle", -1);
1032
+ }
1033
+
1034
+ // based on Llama4VisionMLP2 (always uses GELU activation, no bias)
1035
+ {
1036
+ cur = lm_ggml_mul_mat(ctx0, model.mm_model_mlp_1_w, cur);
1037
+ cur = lm_ggml_gelu(ctx0, cur);
1038
+ cur = lm_ggml_mul_mat(ctx0, model.mm_model_mlp_2_w, cur);
1039
+ cur = lm_ggml_gelu(ctx0, cur);
1040
+ cb(cur, "adapter_mlp", -1);
1041
+ }
1042
+
1043
+ // Llama4MultiModalProjector
1044
+ cur = lm_ggml_mul_mat(ctx0, model.mm_model_proj, cur);
1045
+ cb(cur, "projected", -1);
1046
+
1047
+ // build the graph
1048
+ lm_ggml_build_forward_expand(gf, cur);
1049
+
1050
+ return gf;
1051
+ }
1052
+
1053
+ // this graph is used by llava, granite and glm
1054
+ // due to having embedding_stack (used by granite), we cannot reuse build_vit
1055
+ lm_ggml_cgraph * build_llava() {
1056
+ const int batch_size = 1;
1057
+ const int n_pos = n_patches + (model.class_embedding ? 1 : 0);
1058
+
1059
+ LM_GGML_ASSERT(n_patches_x == n_patches_y && "only square images supported");
1060
+
1061
+ // Calculate the deepest feature layer based on hparams and projector type
1062
+ int max_feature_layer = n_layer;
1063
+ {
1064
+ // Get the index of the second to last layer; this is the default for models that have a llava projector
1065
+ int il_last = hparams.n_layer - 1;
1066
+ int deepest_feature_layer = -1;
1067
+
1068
+ if (ctx->proj_type == PROJECTOR_TYPE_MINICPMV || ctx->proj_type == PROJECTOR_TYPE_GLM_EDGE) {
1069
+ il_last += 1;
1070
+ }
1071
+
1072
+ // If we set explicit vision feature layers, only go up to the deepest one
1073
+ // NOTE: only used by granite-vision models for now
1074
+ for (const auto & feature_layer : hparams.vision_feature_layer) {
1075
+ if (feature_layer > deepest_feature_layer) {
1076
+ deepest_feature_layer = feature_layer;
1077
+ }
1078
+ }
1079
+ max_feature_layer = deepest_feature_layer < 0 ? il_last : deepest_feature_layer;
1080
+ }
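+        // for illustration (hypothetical values): with hparams.vision_feature_layer = {3, 7, 15, 26},
+        // deepest_feature_layer = 26, so the loop below only builds layers 0..25 and the
+        // embedding for index 26 is picked up by the post-loop "final layer" check further down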
1081
+
1082
+ lm_ggml_tensor * inp = build_inp();
1083
+
1084
+ // concat class_embeddings and patch_embeddings
1085
+ if (model.class_embedding) {
1086
+ inp = lm_ggml_concat(ctx0, inp, model.class_embedding, 1);
1087
+ }
1088
+
1089
+ lm_ggml_tensor * positions = lm_ggml_new_tensor_1d(ctx0, LM_GGML_TYPE_I32, n_pos);
1090
+ lm_ggml_set_name(positions, "positions");
1091
+ lm_ggml_set_input(positions);
1092
+
1093
+ inp = lm_ggml_add(ctx0, inp, lm_ggml_get_rows(ctx0, model.position_embeddings, positions));
1094
+
1095
+ lm_ggml_tensor * inpL = inp;
1096
+
1097
+ // pre-layernorm
1098
+ if (model.pre_ln_w) {
1099
+ inpL = build_norm(inpL, model.pre_ln_w, model.pre_ln_b, NORM_TYPE_NORMAL, eps, -1);
1100
+ cb(inpL, "pre_ln", -1);
1101
+ }
1102
+
1103
+ std::vector<lm_ggml_tensor *> embedding_stack;
1104
+ const auto & vision_feature_layer = hparams.vision_feature_layer;
1105
+
1106
+ // loop over layers
1107
+ for (int il = 0; il < max_feature_layer; il++) {
1108
+ auto & layer = model.layers[il];
1109
+ lm_ggml_tensor * cur = inpL; // inpL = residual, cur = hidden_states
1110
+
1111
+ // If this is an embedding feature layer, save the output.
1112
+ // NOTE: 0 index here refers to the input to the encoder.
1113
+ if (vision_feature_layer.find(il) != vision_feature_layer.end()) {
1114
+ embedding_stack.push_back(cur);
1115
+ }
1116
+
1117
+ // layernorm1
1118
+ cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, NORM_TYPE_NORMAL, eps, il);
1119
+ cb(cur, "layer_inp_normed", il);
1120
+
1121
+ // self-attention
1122
+ {
1123
+ lm_ggml_tensor * Qcur = lm_ggml_mul_mat(ctx0, layer.q_w, cur);
1124
+ if (layer.q_b) {
1125
+ Qcur = lm_ggml_add(ctx0, Qcur, layer.q_b);
1126
+ }
1127
+
1128
+ lm_ggml_tensor * Kcur = lm_ggml_mul_mat(ctx0, layer.k_w, cur);
1129
+ if (layer.k_b) {
1130
+ Kcur = lm_ggml_add(ctx0, Kcur, layer.k_b);
1131
+ }
1132
+
1133
+ lm_ggml_tensor * Vcur = lm_ggml_mul_mat(ctx0, layer.v_w, cur);
1134
+ if (layer.v_b) {
1135
+ Vcur = lm_ggml_add(ctx0, Vcur, layer.v_b);
1136
+ }
1137
+
1138
+ Qcur = lm_ggml_reshape_3d(ctx0, Qcur, d_head, n_head, n_pos);
1139
+ Kcur = lm_ggml_reshape_3d(ctx0, Kcur, d_head, n_head, n_pos);
1140
+ Vcur = lm_ggml_reshape_3d(ctx0, Vcur, d_head, n_head, n_pos);
1141
+
1142
+ cb(Qcur, "Qcur", il);
1143
+ cb(Kcur, "Kcur", il);
1144
+ cb(Vcur, "Vcur", il);
1145
+
1146
+ cur = build_attn(layer.o_w, layer.o_b,
1147
+ Qcur, Kcur, Vcur, nullptr, kq_scale, il);
1148
+ cb(cur, "attn_out", il);
1149
+ }
1150
+
1151
+            // re-add the layer input, i.e., the residual
1152
+ cur = lm_ggml_add(ctx0, cur, inpL);
1153
+
1154
+ inpL = cur; // inpL = residual, cur = hidden_states
1155
+
1156
+ cb(cur, "ffn_inp", il);
1157
+
1158
+ // layernorm2
1159
+ cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, NORM_TYPE_NORMAL, eps, il);
1160
+ cb(cur, "ffn_inp_normed", il);
1161
+
1162
+ // ffn
1163
+ cur = build_ffn(cur,
1164
+ layer.ff_up_w, layer.ff_up_b,
1165
+ layer.ff_gate_w, layer.ff_gate_b,
1166
+ layer.ff_down_w, layer.ff_down_b,
1167
+ hparams.ffn_op, il);
1168
+
1169
+ cb(cur, "ffn_out", il);
1170
+
1171
+ // residual 2
1172
+ cur = lm_ggml_add(ctx0, inpL, cur);
1173
+ cb(cur, "layer_out", il);
1174
+
1175
+ inpL = cur;
1176
+ }
1177
+
1178
+ // post-layernorm
1179
+ if (model.post_ln_w) {
1180
+ inpL = build_norm(inpL, model.post_ln_w, model.post_ln_b, NORM_TYPE_NORMAL, eps, -1);
1181
+ }
1182
+
1183
+ lm_ggml_tensor * embeddings = inpL;
1184
+
1185
+ // process vision feature layers (used by granite)
1186
+ {
1187
+ // final layer is a vision feature layer
1188
+ if (vision_feature_layer.find(max_feature_layer) != vision_feature_layer.end()) {
1189
+ embedding_stack.push_back(inpL);
1190
+ }
1191
+
1192
+ // If feature layers are explicitly set, stack them (if we have multiple)
1193
+ if (!embedding_stack.empty()) {
1194
+ embeddings = embedding_stack[0];
1195
+ for (size_t i = 1; i < embedding_stack.size(); i++) {
1196
+ embeddings = lm_ggml_concat(ctx0, embeddings, embedding_stack[i], 0);
1197
+ }
1198
+ }
1199
+ }
1200
+
1201
+ // llava projector (also used by granite)
1202
+ if (ctx->has_llava_projector) {
1203
+ embeddings = lm_ggml_reshape_2d(ctx0, embeddings, embeddings->ne[0], embeddings->ne[1]);
1204
+
1205
+ lm_ggml_tensor * patches = lm_ggml_new_tensor_1d(ctx0, LM_GGML_TYPE_I32, n_patches);
1206
+ lm_ggml_set_name(patches, "patches");
1207
+ lm_ggml_set_input(patches);
1208
+
1209
+ // shape [1, 576, 1024]
1210
+ // ne is whcn, ne = [1024, 576, 1, 1]
1211
+ embeddings = lm_ggml_get_rows(ctx0, embeddings, patches);
1212
+
1213
+ // print_tensor_info(embeddings, "embeddings");
1214
+
1215
+ // llava projector
1216
+ if (ctx->proj_type == PROJECTOR_TYPE_MLP) {
1217
+ embeddings = lm_ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
1218
+ embeddings = lm_ggml_add(ctx0, embeddings, model.mm_0_b);
1219
+
1220
+ embeddings = lm_ggml_gelu(ctx0, embeddings);
1221
+ if (model.mm_2_w) {
1222
+ embeddings = lm_ggml_mul_mat(ctx0, model.mm_2_w, embeddings);
1223
+ embeddings = lm_ggml_add(ctx0, embeddings, model.mm_2_b);
1224
+ }
1225
+ }
1226
+ else if (ctx->proj_type == PROJECTOR_TYPE_MLP_NORM) {
1227
+ embeddings = lm_ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
1228
+ embeddings = lm_ggml_add(ctx0, embeddings, model.mm_0_b);
1229
+ // lm_ggml_tensor_printf(embeddings, "mm_0_w",0,true,false);
1230
+ // First LayerNorm
1231
+ embeddings = lm_ggml_norm(ctx0, embeddings, eps);
1232
+ embeddings = lm_ggml_add(ctx0, lm_ggml_mul(ctx0, embeddings, model.mm_1_w),
1233
+ model.mm_1_b);
1234
+
1235
+ // GELU activation
1236
+ embeddings = lm_ggml_gelu(ctx0, embeddings);
1237
+
1238
+ // Second linear layer
1239
+ embeddings = lm_ggml_mul_mat(ctx0, model.mm_3_w, embeddings);
1240
+ embeddings = lm_ggml_add(ctx0, embeddings, model.mm_3_b);
1241
+
1242
+ // Second LayerNorm
1243
+ embeddings = lm_ggml_norm(ctx0, embeddings, eps);
1244
+ embeddings = lm_ggml_add(ctx0, lm_ggml_mul(ctx0, embeddings, model.mm_4_w),
1245
+ model.mm_4_b);
1246
+ }
1247
+ else if (ctx->proj_type == PROJECTOR_TYPE_LDP) {
1248
+ // MobileVLM projector
1249
+ int n_patch = 24;
1250
+ lm_ggml_tensor * mlp_1 = lm_ggml_mul_mat(ctx0, model.mm_model_mlp_1_w, embeddings);
1251
+ mlp_1 = lm_ggml_add(ctx0, mlp_1, model.mm_model_mlp_1_b);
1252
+ mlp_1 = lm_ggml_gelu(ctx0, mlp_1);
1253
+ lm_ggml_tensor * mlp_3 = lm_ggml_mul_mat(ctx0, model.mm_model_mlp_3_w, mlp_1);
1254
+ mlp_3 = lm_ggml_add(ctx0, mlp_3, model.mm_model_mlp_3_b);
1255
+ // mlp_3 shape = [1, 576, 2048], ne = [2048, 576, 1, 1]
1256
+
1257
+ // block 1
1258
+ lm_ggml_tensor * block_1 = nullptr;
1259
+ {
1260
+ // transpose from [1, 576, 2048] --> [1, 2048, 576] --> [1, 2048, 24, 24]
1261
+ mlp_3 = lm_ggml_cont(ctx0, lm_ggml_permute(ctx0, mlp_3, 1, 0, 2, 3));
1262
+ mlp_3 = lm_ggml_reshape_4d(ctx0, mlp_3, n_patch, n_patch, mlp_3->ne[1], mlp_3->ne[2]);
1263
+ // stride = 1, padding = 1, bias is nullptr
1264
+ block_1 = lm_ggml_conv_2d_dw(ctx0, model.mm_model_block_1_block_0_0_w, mlp_3, 1, 1, 1, 1, 1, 1);
1265
+
1266
+ // layer norm
1267
+                    // block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1]
1268
+ block_1 = lm_ggml_cont(ctx0, lm_ggml_permute(ctx0, block_1, 1, 2, 0, 3));
1269
+ // block_1 shape = [1, 24, 24, 2048], ne = [2048, 24, 24, 1]
1270
+ block_1 = lm_ggml_norm(ctx0, block_1, eps);
1271
+ block_1 = lm_ggml_add(ctx0, lm_ggml_mul(ctx0, block_1, model.mm_model_block_1_block_0_1_w), model.mm_model_block_1_block_0_1_b);
1272
+ block_1 = lm_ggml_cont(ctx0, lm_ggml_permute(ctx0, block_1, 2, 0, 1, 3));
1273
+
1274
+ // block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1]
1275
+ // hardswish
1276
+ lm_ggml_tensor * block_1_hw = lm_ggml_hardswish(ctx0, block_1);
1277
+
1278
+ block_1 = lm_ggml_pool_2d(ctx0, block_1_hw, LM_GGML_OP_POOL_AVG, block_1_hw->ne[0], block_1_hw->ne[1], block_1_hw->ne[0], block_1_hw->ne[1], 0, 0);
1279
+ // block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1]
1280
+ // pointwise conv
1281
+ block_1 = lm_ggml_reshape_2d(ctx0, block_1, block_1->ne[0]*block_1->ne[1]*block_1->ne[2], block_1->ne[3]);
1282
+ block_1 = lm_ggml_mul_mat(ctx0, model.mm_model_block_1_block_1_fc1_w, block_1);
1283
+ block_1 = lm_ggml_add(ctx0, block_1, model.mm_model_block_1_block_1_fc1_b);
1284
+ block_1 = lm_ggml_relu(ctx0, block_1);
1285
+ block_1 = lm_ggml_mul_mat(ctx0, model.mm_model_block_1_block_1_fc2_w, block_1);
1286
+ block_1 = lm_ggml_add(ctx0, block_1, model.mm_model_block_1_block_1_fc2_b);
1287
+ block_1 = lm_ggml_hardsigmoid(ctx0, block_1);
1288
+ // block_1_hw shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1], block_1 shape = [1, 2048], ne = [2048, 1, 1, 1]
1289
+ block_1 = lm_ggml_reshape_4d(ctx0, block_1, 1, 1, block_1->ne[0], block_1->ne[1]);
1290
+ block_1 = lm_ggml_mul(ctx0, block_1_hw, block_1);
1291
+
1292
+ int w = block_1->ne[0], h = block_1->ne[1];
1293
+ block_1 = lm_ggml_reshape_3d(ctx0, block_1, w*h, block_1->ne[2], block_1->ne[3]);
1294
+ block_1 = lm_ggml_cont(ctx0, lm_ggml_permute(ctx0, block_1, 1, 0, 2, 3));
1295
+
1296
+ // block_1 shape = [1, 24*24, 2048], ne = [24*24, 2048, 1]
1297
+ block_1 = lm_ggml_mul_mat(ctx0, model.mm_model_block_1_block_2_0_w, block_1);
1298
+ block_1 = lm_ggml_reshape_4d(ctx0, block_1, block_1->ne[0], w, h, block_1->ne[3]);
1299
+
1300
+ // block_1 shape = [1, 24, 24, 2048], ne = [2048, 24, 24, 1]
1301
+ block_1 = lm_ggml_norm(ctx0, block_1, eps);
1302
+ block_1 = lm_ggml_add(ctx0, lm_ggml_mul(ctx0, block_1, model.mm_model_block_1_block_2_1_w), model.mm_model_block_1_block_2_1_b);
1303
+ block_1 = lm_ggml_cont(ctx0, lm_ggml_permute(ctx0, block_1, 2, 0, 1, 3));
1304
+ // block1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1]
1305
+ // residual
1306
+ block_1 = lm_ggml_add(ctx0, mlp_3, block_1);
1307
+ }
1308
+
1309
+ // block_2
1310
+ {
1311
+ // stride = 2
1312
+ block_1 = lm_ggml_conv_2d_dw(ctx0, model.mm_model_block_2_block_0_0_w, block_1, 2, 2, 1, 1, 1, 1);
1313
+
1314
+ // block_1 shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1]
1315
+ // layer norm
1316
+ block_1 = lm_ggml_cont(ctx0, lm_ggml_permute(ctx0, block_1, 1, 2, 0, 3));
1317
+ // block_1 shape = [1, 12, 12, 2048], ne = [2048, 12, 12, 1]
1318
+ block_1 = lm_ggml_norm(ctx0, block_1, eps);
1319
+ block_1 = lm_ggml_add(ctx0, lm_ggml_mul(ctx0, block_1, model.mm_model_block_2_block_0_1_w), model.mm_model_block_2_block_0_1_b);
1320
+ block_1 = lm_ggml_cont(ctx0, lm_ggml_permute(ctx0, block_1, 2, 0, 1, 3));
1321
+ // block_1 shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1]
1322
+ // hardswish
1323
+ lm_ggml_tensor * block_1_hw = lm_ggml_hardswish(ctx0, block_1);
1324
+
1325
+                    // not sure the parameters are right for globalAvgPooling
1326
+ block_1 = lm_ggml_pool_2d(ctx0, block_1_hw, LM_GGML_OP_POOL_AVG, block_1_hw->ne[0], block_1_hw->ne[1], block_1_hw->ne[0], block_1_hw->ne[1], 0, 0);
1327
+ // block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1]
1328
+ // pointwise conv
1329
+ block_1 = lm_ggml_reshape_2d(ctx0, block_1, block_1->ne[0]*block_1->ne[1]*block_1->ne[2], block_1->ne[3]);
1330
+ block_1 = lm_ggml_mul_mat(ctx0, model.mm_model_block_2_block_1_fc1_w, block_1);
1331
+ block_1 = lm_ggml_add(ctx0, block_1, model.mm_model_block_2_block_1_fc1_b);
1332
+ block_1 = lm_ggml_relu(ctx0, block_1);
1333
+ block_1 = lm_ggml_mul_mat(ctx0, model.mm_model_block_2_block_1_fc2_w, block_1);
1334
+ block_1 = lm_ggml_add(ctx0, block_1, model.mm_model_block_2_block_1_fc2_b);
1335
+ block_1 = lm_ggml_hardsigmoid(ctx0, block_1);
1336
+
1337
+ // block_1_hw shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1], block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1]
1338
+ block_1 = lm_ggml_reshape_4d(ctx0, block_1, 1, 1, block_1->ne[0], block_1->ne[1]);
1339
+ block_1 = lm_ggml_mul(ctx0, block_1_hw, block_1);
1340
+
1341
+ int w = block_1->ne[0], h = block_1->ne[1];
1342
+ block_1 = lm_ggml_reshape_3d(ctx0, block_1, w*h, block_1->ne[2], block_1->ne[3]);
1343
+ block_1 = lm_ggml_cont(ctx0, lm_ggml_permute(ctx0, block_1, 1, 0, 2, 3));
1344
+                    // block_1 shape = [1, 12*12, 2048], ne = [12*12, 2048, 1]
1345
+ block_1 = lm_ggml_mul_mat(ctx0, model.mm_model_block_2_block_2_0_w, block_1);
1346
+ block_1 = lm_ggml_reshape_4d(ctx0, block_1, block_1->ne[0], w, h, block_1->ne[3]);
1347
+
1348
+
1349
+ // block_1 shape = [1, 12, 12, 2048], ne = [2048, 12, 12, 1]
1350
+ block_1 = lm_ggml_norm(ctx0, block_1, eps);
1351
+ block_1 = lm_ggml_add(ctx0, lm_ggml_mul(ctx0, block_1, model.mm_model_block_2_block_2_1_w), model.mm_model_block_2_block_2_1_b);
1352
+ block_1 = lm_ggml_reshape_3d(ctx0, block_1, block_1->ne[0], block_1->ne[1] * block_1->ne[2], block_1->ne[3]);
1353
+ // block_1 shape = [1, 144, 2048], ne = [2048, 144, 1]
1354
+ }
1355
+ embeddings = block_1;
1356
+ }
1357
+ else if (ctx->proj_type == PROJECTOR_TYPE_LDPV2)
1358
+ {
1359
+ int n_patch = 24;
1360
+ lm_ggml_tensor * mlp_0 = lm_ggml_mul_mat(ctx0, model.mm_model_mlp_0_w, embeddings);
1361
+ mlp_0 = lm_ggml_add(ctx0, mlp_0, model.mm_model_mlp_0_b);
1362
+ mlp_0 = lm_ggml_gelu(ctx0, mlp_0);
1363
+ lm_ggml_tensor * mlp_2 = lm_ggml_mul_mat(ctx0, model.mm_model_mlp_2_w, mlp_0);
1364
+ mlp_2 = lm_ggml_add(ctx0, mlp_2, model.mm_model_mlp_2_b);
1365
+ // mlp_2 ne = [2048, 576, 1, 1]
1366
+                // AVG Pool Layer 2*2, strides = 2
1367
+ mlp_2 = lm_ggml_cont(ctx0, lm_ggml_permute(ctx0, mlp_2, 1, 0, 2, 3));
1368
+ // mlp_2 ne = [576, 2048, 1, 1]
1369
+ mlp_2 = lm_ggml_reshape_4d(ctx0, mlp_2, n_patch, n_patch, mlp_2->ne[1], mlp_2->ne[2]);
1370
+ // mlp_2 ne [24, 24, 2048, 1]
1371
+ mlp_2 = lm_ggml_pool_2d(ctx0, mlp_2, LM_GGML_OP_POOL_AVG, 2, 2, 2, 2, 0, 0);
1372
+ // weight ne = [3, 3, 2048, 1]
1373
+ lm_ggml_tensor * peg_0 = lm_ggml_conv_2d_dw(ctx0, model.mm_model_peg_0_w, mlp_2, 1, 1, 1, 1, 1, 1);
1374
+ peg_0 = lm_ggml_cont(ctx0, lm_ggml_permute(ctx0, peg_0, 1, 2, 0, 3));
1375
+ peg_0 = lm_ggml_add(ctx0, peg_0, model.mm_model_peg_0_b);
1376
+ mlp_2 = lm_ggml_cont(ctx0, lm_ggml_permute(ctx0, mlp_2, 1, 2, 0, 3));
1377
+ peg_0 = lm_ggml_add(ctx0, peg_0, mlp_2);
1378
+ peg_0 = lm_ggml_reshape_3d(ctx0, peg_0, peg_0->ne[0], peg_0->ne[1] * peg_0->ne[2], peg_0->ne[3]);
1379
+ embeddings = peg_0;
1380
+ }
1381
+ else {
1382
+ LM_GGML_ABORT("fatal error");
1383
+ }
1384
+ }
1385
+
1386
+ // glm projector
1387
+ else if (ctx->proj_type == PROJECTOR_TYPE_GLM_EDGE) {
1388
+ size_t gridsz = (size_t)sqrt(embeddings->ne[1]);
1389
+ embeddings = lm_ggml_cont(ctx0, lm_ggml_permute(ctx0,embeddings,1,0,2,3));
1390
+ embeddings = lm_ggml_reshape_3d(ctx0, embeddings, gridsz, gridsz, embeddings->ne[1]);
1391
+ embeddings = lm_ggml_conv_2d(ctx0, model.mm_model_adapter_conv_w, embeddings, 2, 2, 0, 0, 1, 1);
1392
+ embeddings = lm_ggml_reshape_3d(ctx0, embeddings,embeddings->ne[0]*embeddings->ne[1] , embeddings->ne[2], batch_size);
1393
+ embeddings = lm_ggml_cont(ctx0, lm_ggml_permute(ctx0,embeddings, 1, 0, 2, 3));
1394
+ embeddings = lm_ggml_add(ctx0, embeddings, model.mm_model_adapter_conv_b);
1395
+ // GLU
1396
+ {
1397
+ embeddings = lm_ggml_mul_mat(ctx0, model.mm_model_mlp_0_w, embeddings);
1398
+ embeddings = lm_ggml_norm(ctx0, embeddings, eps);
1399
+ embeddings = lm_ggml_add(ctx0, lm_ggml_mul(ctx0, embeddings, model.mm_model_ln_q_w), model.mm_model_ln_q_b);
1400
+ embeddings = lm_ggml_gelu_inplace(ctx0, embeddings);
1401
+ lm_ggml_tensor * x = embeddings;
1402
+ embeddings = lm_ggml_mul_mat(ctx0, model.mm_model_mlp_2_w, embeddings);
1403
+ x = lm_ggml_mul_mat(ctx0, model.mm_model_mlp_1_w,x);
1404
+ embeddings = lm_ggml_silu_inplace(ctx0, embeddings);
1405
+ embeddings = lm_ggml_mul(ctx0, embeddings,x);
1406
+ embeddings = lm_ggml_mul_mat(ctx0, model.mm_model_mlp_3_w, embeddings);
1407
+ }
1408
+ // arrangement of BOI/EOI token embeddings
1409
+            // note: these embeddings are not present in the text model, hence we cannot process them as text tokens
1410
+ // see: https://huggingface.co/THUDM/glm-edge-v-2b/blob/main/siglip.py#L53
1411
+ {
1412
+ embeddings = lm_ggml_concat(ctx0, model.mm_glm_tok_boi, embeddings, 1); // BOI
1413
+ embeddings = lm_ggml_concat(ctx0, embeddings, model.mm_glm_tok_eoi, 1); // EOI
1414
+ }
1415
+ }
1416
+
1417
+ else {
1418
+ LM_GGML_ABORT("llava: unknown projector type");
1419
+ }
1420
+
1421
+ // build the graph
1422
+ lm_ggml_build_forward_expand(gf, embeddings);
1423
+
1424
+ return gf;
1425
+ }
1426
+
1427
+ // whisper encoder with custom projector
1428
+ lm_ggml_cgraph * build_whisper_enc() {
1429
+ const int n_frames = img.nx;
1430
+ const int n_pos = n_frames / 2;
1431
+ LM_GGML_ASSERT(model.position_embeddings->ne[1] >= n_pos);
1432
+
1433
+ lm_ggml_tensor * inp = build_inp_raw(1);
1434
+
1435
+ // conv1d block
1436
+ {
1437
+ // convolution + gelu
1438
+ lm_ggml_tensor * cur = lm_ggml_conv_1d_ph(ctx0, model.conv1d_1_w, inp, 1, 1);
1439
+ cur = lm_ggml_add(ctx0, cur, model.conv1d_1_b);
1440
+
1441
+ cur = lm_ggml_gelu_erf(ctx0, cur);
1442
+
1443
+ cur = lm_ggml_conv_1d_ph(ctx0, model.conv1d_2_w, cur, 2, 1);
1444
+ cur = lm_ggml_add(ctx0, cur, model.conv1d_2_b);
1445
+
1446
+ cur = lm_ggml_gelu_erf(ctx0, cur);
1447
+ // transpose
1448
+ inp = lm_ggml_cont(ctx0, lm_ggml_transpose(ctx0, cur));
1449
+ cb(inp, "after_conv1d", -1);
1450
+ }
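+        // note on n_pos above: the first conv1d uses stride 1 and the second uses stride 2,
+        // so the number of frames is halved by this block -- hence n_pos = n_frames / 2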
1451
+
1452
+ // sanity check (only check one layer, but it should be the same for all)
1453
+ LM_GGML_ASSERT(model.layers[0].ln_1_w && model.layers[0].ln_1_b);
1454
+ LM_GGML_ASSERT(model.layers[0].ln_2_w && model.layers[0].ln_2_b);
1455
+ LM_GGML_ASSERT(model.layers[0].q_b);
1456
+ LM_GGML_ASSERT(model.layers[0].v_b);
1457
+ LM_GGML_ASSERT(!model.layers[0].k_b); // no bias for k
1458
+ LM_GGML_ASSERT(model.post_ln_w && model.post_ln_b);
1459
+
1460
+ lm_ggml_tensor * pos_embd_selected = lm_ggml_view_2d(
1461
+ ctx0, model.position_embeddings,
1462
+ model.position_embeddings->ne[0], n_pos,
1463
+ model.position_embeddings->nb[1], 0
1464
+ );
1465
+ lm_ggml_tensor * cur = build_vit(
1466
+ inp, n_pos,
1467
+ NORM_TYPE_NORMAL,
1468
+ hparams.ffn_op,
1469
+ pos_embd_selected,
1470
+ nullptr);
1471
+
1472
+ cb(cur, "after_transformer", -1);
1473
+
1474
+ // StackAudioFrames
1475
+ // https://huggingface.co/fixie-ai/ultravox-v0_5-llama-3_2-1b/blob/main/ultravox_model.py
1476
+ {
1477
+ int64_t stride = n_embd * hparams.proj_stack_factor;
1478
+ int64_t padded_len = LM_GGML_PAD(lm_ggml_nelements(cur), stride);
1479
+ int64_t pad = padded_len - lm_ggml_nelements(cur);
1480
+ if (pad > 0) {
1481
+ cur = lm_ggml_view_1d(ctx0, cur, lm_ggml_nelements(cur), 0);
1482
+ cur = lm_ggml_pad(ctx0, cur, pad, 0, 0, 0);
1483
+ }
1484
+ cur = lm_ggml_view_2d(ctx0, cur, stride, padded_len / stride,
1485
+ lm_ggml_row_size(cur->type, stride), 0);
1486
+ }
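+        // worked example (hypothetical sizes): with n_embd = 1280, proj_stack_factor = 8 and
+        // 100 positions, stride = 10240 and lm_ggml_nelements(cur) = 128000; padded_len = 133120,
+        // so the final view is [10240, 13], i.e. 8 consecutive audio frames stacked into each row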
1487
+
1488
+ cb(cur, "after_stacked", -1);
1489
+
1490
+ // UltravoxProjector
1491
+ {
1492
+ // pre-norm
1493
+ cur = lm_ggml_rms_norm(ctx0, cur, 1e-6);
1494
+ cur = lm_ggml_mul(ctx0, cur, model.mm_norm_pre_w);
1495
+
1496
+ // ffn in
1497
+ cur = lm_ggml_mul_mat(ctx0, model.mm_1_w, cur);
1498
+
1499
+ // swiglu
1500
+ {
1501
+ int64_t split_point = cur->ne[0] / 2;
1502
+ lm_ggml_tensor * x0 = lm_ggml_cont(ctx0, lm_ggml_view_2d(ctx0, cur, split_point, cur->ne[1], cur->nb[1], 0));
1503
+ lm_ggml_tensor * x1 = lm_ggml_cont(ctx0, lm_ggml_view_2d(ctx0, cur, split_point, cur->ne[1], cur->nb[1], split_point * lm_ggml_element_size(cur)));
1504
+
1505
+            // see SwiGLU in ultravox_model.py: it is the second half that is passed through silu, not the first half
1506
+ x1 = lm_ggml_silu(ctx0, x1);
1507
+ cur = lm_ggml_mul(ctx0, x0, x1);
1508
+ }
1509
+
1510
+ // mid-norm
1511
+ cur = lm_ggml_rms_norm(ctx0, cur, 1e-6);
1512
+ cur = lm_ggml_mul(ctx0, cur, model.mm_norm_mid_w);
1513
+
1514
+ // ffn out
1515
+ cur = lm_ggml_mul_mat(ctx0, model.mm_2_w, cur);
1516
+ }
1517
+
1518
+ cb(cur, "projected", -1);
1519
+
1520
+ lm_ggml_build_forward_expand(gf, cur);
1521
+
1522
+ return gf;
1523
+ }
1524
+
1525
+ private:
1526
+ //
1527
+ // utility functions
1528
+ //
1529
+
1530
+ void cb(lm_ggml_tensor * cur0, const char * name, int il) const {
1531
+ if (ctx->debug_graph) {
1532
+ lm_ggml_tensor * cur = lm_ggml_cpy(ctx0, cur0, lm_ggml_dup_tensor(ctx0, cur0));
1533
+ std::string cur_name = il >= 0 ? std::string(name) + "_" + std::to_string(il) : name;
1534
+ lm_ggml_set_name(cur, cur_name.c_str());
1535
+ lm_ggml_set_output(cur);
1536
+ lm_ggml_build_forward_expand(gf, cur);
1537
+ ctx->debug_print_tensors.push_back(cur);
1538
+ }
1539
+ }
1540
+
1541
+ // build vision transformer (ViT) cgraph
1542
+ // this function should cover most of the models
1543
+ // if your model has specific features, you should probably duplicate this function
1544
+ lm_ggml_tensor * build_vit(
1545
+ lm_ggml_tensor * inp,
1546
+ int64_t n_pos,
1547
+ norm_type norm_t,
1548
+ ffn_op_type ffn_t,
1549
+ lm_ggml_tensor * learned_pos_embd,
1550
+ std::function<lm_ggml_tensor *(lm_ggml_tensor *, const clip_layer &)> add_pos
1551
+ ) {
1552
+ if (learned_pos_embd) {
1553
+ inp = lm_ggml_add(ctx0, inp, learned_pos_embd);
1554
+ cb(inp, "pos_embed", -1);
1555
+ }
1556
+
1557
+ lm_ggml_tensor * inpL = inp;
1558
+
1559
+ // pre-layernorm
1560
+ if (model.pre_ln_w) {
1561
+ inpL = build_norm(inpL, model.pre_ln_w, model.pre_ln_b, norm_t, eps, -1);
1562
+ cb(inpL, "pre_ln", -1);
1563
+ }
1564
+
1565
+ // loop over layers
1566
+ for (int il = 0; il < n_layer; il++) {
1567
+ auto & layer = model.layers[il];
1568
+ lm_ggml_tensor * cur = inpL; // inpL = residual, cur = hidden_states
1569
+
1570
+ // layernorm1
1571
+ cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, norm_t, eps, il);
1572
+ cb(cur, "layer_inp_normed", il);
1573
+
1574
+ // self-attention
1575
+ {
1576
+ lm_ggml_tensor * Qcur = lm_ggml_mul_mat(ctx0, layer.q_w, cur);
1577
+ if (layer.q_b) {
1578
+ Qcur = lm_ggml_add(ctx0, Qcur, layer.q_b);
1579
+ }
1580
+
1581
+ lm_ggml_tensor * Kcur = lm_ggml_mul_mat(ctx0, layer.k_w, cur);
1582
+ if (layer.k_b) {
1583
+ Kcur = lm_ggml_add(ctx0, Kcur, layer.k_b);
1584
+ }
1585
+
1586
+ lm_ggml_tensor * Vcur = lm_ggml_mul_mat(ctx0, layer.v_w, cur);
1587
+ if (layer.v_b) {
1588
+ Vcur = lm_ggml_add(ctx0, Vcur, layer.v_b);
1589
+ }
1590
+
1591
+ if (layer.q_norm) {
1592
+ Qcur = build_norm(Qcur, layer.q_norm, NULL, norm_t, eps, il);
1593
+ cb(Qcur, "Qcur_norm", il);
1594
+ }
1595
+
1596
+ if (layer.k_norm) {
1597
+ Kcur = build_norm(Kcur, layer.k_norm, NULL, norm_t, eps, il);
1598
+ cb(Kcur, "Kcur_norm", il);
1599
+ }
1600
+
1601
+ Qcur = lm_ggml_reshape_3d(ctx0, Qcur, d_head, n_head, n_pos);
1602
+ Kcur = lm_ggml_reshape_3d(ctx0, Kcur, d_head, n_head, n_pos);
1603
+ Vcur = lm_ggml_reshape_3d(ctx0, Vcur, d_head, n_head, n_pos);
1604
+
1605
+ cb(Qcur, "Qcur", il);
1606
+ cb(Kcur, "Kcur", il);
1607
+ cb(Vcur, "Vcur", il);
1608
+
1609
+ if (add_pos) {
1610
+ Qcur = add_pos(Qcur, layer);
1611
+ Kcur = add_pos(Kcur, layer);
1612
+ cb(Qcur, "Qcur_pos", il);
1613
+ cb(Kcur, "Kcur_pos", il);
1614
+ }
1615
+
1616
+ cur = build_attn(layer.o_w, layer.o_b,
1617
+ Qcur, Kcur, Vcur, nullptr, kq_scale, il);
1618
+ cb(cur, "attn_out", il);
1619
+ }
1620
+
1621
+ if (layer.ls_1_w) {
1622
+ cur = lm_ggml_mul(ctx0, cur, layer.ls_1_w);
1623
+ cb(cur, "attn_out_scaled", il);
1624
+ }
1625
+
1626
+            // re-add the layer input, i.e., the residual
1627
+ cur = lm_ggml_add(ctx0, cur, inpL);
1628
+
1629
+ inpL = cur; // inpL = residual, cur = hidden_states
1630
+
1631
+ cb(cur, "ffn_inp", il);
1632
+
1633
+ // layernorm2
1634
+ cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, norm_t, eps, il);
1635
+ cb(cur, "ffn_inp_normed", il);
1636
+
1637
+ // ffn
1638
+ cur = build_ffn(cur,
1639
+ layer.ff_up_w, layer.ff_up_b,
1640
+ layer.ff_gate_w, layer.ff_gate_b,
1641
+ layer.ff_down_w, layer.ff_down_b,
1642
+ ffn_t, il);
1643
+
1644
+ cb(cur, "ffn_out", il);
1645
+
1646
+ if (layer.ls_2_w) {
1647
+ cur = lm_ggml_mul(ctx0, cur, layer.ls_2_w);
1648
+ cb(cur, "ffn_out_scaled", il);
1649
+ }
1650
+
1651
+ // residual 2
1652
+ cur = lm_ggml_add(ctx0, inpL, cur);
1653
+ cb(cur, "layer_out", il);
1654
+
1655
+ inpL = cur;
1656
+ }
1657
+
1658
+ // post-layernorm
1659
+ if (model.post_ln_w) {
1660
+ inpL = build_norm(inpL, model.post_ln_w, model.post_ln_b, norm_t, eps, -1);
1661
+ }
1662
+ return inpL;
1663
+ }
1664
+
1665
+ // build the input after conv2d (inp_raw --> patches)
1666
+ // returns tensor with shape [n_embd, n_patches]
1667
+ lm_ggml_tensor * build_inp() {
1668
+ lm_ggml_tensor * inp_raw = build_inp_raw();
1669
+ lm_ggml_tensor * inp = lm_ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
1670
+ inp = lm_ggml_reshape_2d(ctx0, inp, n_patches, n_embd);
1671
+ inp = lm_ggml_cont(ctx0, lm_ggml_transpose(ctx0, inp));
1672
+ if (model.patch_bias) {
1673
+ inp = lm_ggml_add(ctx0, inp, model.patch_bias);
1674
+ cb(inp, "patch_bias", -1);
1675
+ }
1676
+ return inp;
1677
+ }
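+    // shape walk-through (hypothetical 336x336 input, patch_size = 14): the conv2d produces a
+    // 24x24 grid of n_embd-dim patch vectors (n_patches = 576), which the reshape + transpose
+    // above turn into the [n_embd, n_patches] layout expected by the rest of the graph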
1678
+
1679
+ lm_ggml_tensor * build_inp_raw(int channels = 3) {
1680
+ lm_ggml_tensor * inp_raw = lm_ggml_new_tensor_3d(ctx0, LM_GGML_TYPE_F32, img.nx, img.ny, channels);
1681
+ lm_ggml_set_name(inp_raw, "inp_raw");
1682
+ lm_ggml_set_input(inp_raw);
1683
+ return inp_raw;
1684
+ }
1685
+
1686
+ lm_ggml_tensor * build_norm(
1687
+ lm_ggml_tensor * cur,
1688
+ lm_ggml_tensor * mw,
1689
+ lm_ggml_tensor * mb,
1690
+ norm_type type,
1691
+ float norm_eps,
1692
+ int il) const {
1693
+
1694
+ cur = type == NORM_TYPE_RMS
1695
+ ? lm_ggml_rms_norm(ctx0, cur, norm_eps)
1696
+ : lm_ggml_norm(ctx0, cur, norm_eps);
1697
+
1698
+ if (mw || mb) {
1699
+ cb(cur, "norm", il);
1700
+ }
1701
+
1702
+ if (mw) {
1703
+ cur = lm_ggml_mul(ctx0, cur, mw);
1704
+ if (mb) {
1705
+ cb(cur, "norm_w", il);
1706
+ }
1707
+ }
1708
+
1709
+ if (mb) {
1710
+ cur = lm_ggml_add(ctx0, cur, mb);
1711
+ }
1712
+
1713
+ return cur;
1714
+ }
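+    // for reference, the two normalizations selected above:
+    //   NORM_TYPE_NORMAL (LayerNorm): y = (x - mean(x)) / sqrt(var(x) + norm_eps)
+    //   NORM_TYPE_RMS    (RMSNorm)  : y = x / sqrt(mean(x^2) + norm_eps)
+    // both optionally followed by an elementwise scale (mw) and shift (mb)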
1715
+
1716
+ lm_ggml_tensor * build_ffn(
1717
+ lm_ggml_tensor * cur,
1718
+ lm_ggml_tensor * up,
1719
+ lm_ggml_tensor * up_b,
1720
+ lm_ggml_tensor * gate,
1721
+ lm_ggml_tensor * gate_b,
1722
+ lm_ggml_tensor * down,
1723
+ lm_ggml_tensor * down_b,
1724
+ ffn_op_type type_op,
1725
+ int il) const {
1726
+
1727
+ lm_ggml_tensor * tmp = up ? lm_ggml_mul_mat(ctx0, up, cur) : cur;
1728
+ cb(tmp, "ffn_up", il);
1729
+
1730
+ if (up_b) {
1731
+ tmp = lm_ggml_add(ctx0, tmp, up_b);
1732
+ cb(tmp, "ffn_up_b", il);
1733
+ }
1734
+
1735
+ if (gate) {
1736
+ cur = lm_ggml_mul_mat(ctx0, gate, cur);
1737
+ cb(cur, "ffn_gate", il);
1738
+
1739
+ if (gate_b) {
1740
+ cur = lm_ggml_add(ctx0, cur, gate_b);
1741
+ cb(cur, "ffn_gate_b", il);
1742
+ }
1743
+ } else {
1744
+ cur = tmp;
1745
+ }
1746
+
1747
+ switch (type_op) {
1748
+ case FFN_SILU:
1749
+ {
1750
+ cur = lm_ggml_silu(ctx0, cur);
1751
+ cb(cur, "ffn_silu", il);
1752
+ } break;
1753
+ case FFN_GELU:
1754
+ {
1755
+ cur = lm_ggml_gelu(ctx0, cur);
1756
+ cb(cur, "ffn_gelu", il);
1757
+ } break;
1758
+ case FFN_GELU_ERF:
1759
+ {
1760
+ cur = lm_ggml_gelu_erf(ctx0, cur);
1761
+ cb(cur, "lm_ggml_gelu_erf", il);
1762
+ } break;
1763
+ case FFN_GELU_QUICK:
1764
+ {
1765
+ cur = lm_ggml_gelu_quick(ctx0, cur);
1766
+ cb(cur, "ffn_relu", il);
1767
+ } break;
1768
+ }
1769
+
1770
+ // we only support parallel ffn for now
1771
+ if (gate) {
1772
+ cur = lm_ggml_mul(ctx0, cur, tmp);
1773
+ cb(cur, "ffn_gate_par", il);
1774
+ }
1775
+
1776
+ if (down) {
1777
+ cur = lm_ggml_mul_mat(ctx0, down, cur);
1778
+ }
1779
+
1780
+ if (down_b) {
1781
+ cb(cur, "ffn_down", il);
1782
+ }
1783
+
1784
+ if (down_b) {
1785
+ cur = lm_ggml_add(ctx0, cur, down_b);
1786
+ }
1787
+
1788
+ return cur;
1789
+ }
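+    // summarizing the data flow above: with a gate tensor this is the parallel (SwiGLU-style)
+    // form down(act(gate(x)) * up(x)); without a gate it reduces to the plain down(act(up(x)))
+    // MLP, with each bias added right after its corresponding projection when present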
1790
+
1791
+ lm_ggml_tensor * build_attn(
1792
+ lm_ggml_tensor * wo,
1793
+ lm_ggml_tensor * wo_b,
1794
+ lm_ggml_tensor * q_cur,
1795
+ lm_ggml_tensor * k_cur,
1796
+ lm_ggml_tensor * v_cur,
1797
+ lm_ggml_tensor * kq_mask,
1798
+ float kq_scale,
1799
+ int il) const {
1800
+ // these nodes are added to the graph together so that they are not reordered
1801
+ // by doing so, the number of splits in the graph is reduced
1802
+ lm_ggml_build_forward_expand(gf, q_cur);
1803
+ lm_ggml_build_forward_expand(gf, k_cur);
1804
+ lm_ggml_build_forward_expand(gf, v_cur);
1805
+
1806
+ lm_ggml_tensor * q = lm_ggml_permute(ctx0, q_cur, 0, 2, 1, 3);
1807
+ //cb(q, "q", il);
1808
+
1809
+ lm_ggml_tensor * k = lm_ggml_permute(ctx0, k_cur, 0, 2, 1, 3);
1810
+ //cb(k, "k", il);
1811
+
1812
+ lm_ggml_tensor * v = lm_ggml_permute(ctx0, v_cur, 1, 2, 0, 3);
1813
+ v = lm_ggml_cont(ctx0, v);
1814
+ //cb(k, "v", il);
1815
+
1816
+ lm_ggml_tensor * cur;
1817
+
1818
+ // TODO @ngxson : support flash attention
1819
+ {
1820
+ const auto n_tokens = q->ne[1];
1821
+ const auto n_head = q->ne[2];
1822
+ // const auto n_kv = k->ne[1]; // for flash attention
1823
+
1824
+ lm_ggml_tensor * kq = lm_ggml_mul_mat(ctx0, k, q);
1825
+            // F32 may not be needed for vision encoders?
1826
+ // lm_ggml_mul_mat_set_prec(kq, LM_GGML_PREC_F32);
1827
+
1828
+ kq = lm_ggml_soft_max_ext(ctx0, kq, kq_mask, kq_scale, 0.0f);
1829
+
1830
+ lm_ggml_tensor * kqv = lm_ggml_mul_mat(ctx0, v, kq);
1831
+ cur = lm_ggml_permute(ctx0, kqv, 0, 2, 1, 3);
1832
+ cur = lm_ggml_cont_2d(ctx0, cur, cur->ne[0]*n_head, n_tokens);
1833
+ }
1834
+
1835
+ cb(cur, "kqv_out", il);
1836
+
1837
+ if (wo) {
1838
+ cur = lm_ggml_mul_mat(ctx0, wo, cur);
1839
+ }
1840
+
1841
+ if (wo_b) {
1842
+ cur = lm_ggml_add(ctx0, cur, wo_b);
1843
+ }
1844
+
1845
+ return cur;
1846
+ }
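+    // in conventional notation this is plain per-head scaled dot-product attention assembled
+    // from primitive ops: out = wo * (softmax(Q K^T * kq_scale + kq_mask) V) + wo_b, with the
+    // permutes above arranging Q/K/V so that each head is processed as an independent 2-D matmul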
1847
+
1848
+ // implementation of the 2D RoPE without adding a new op in ggml
1849
+    // this is not efficient (it uses double the memory), but it works on all backends
1850
+    // TODO: there was a more efficient implementation which relied on lm_ggml_view and lm_ggml_rope_ext_inplace, but rope inplace does not work well with non-contiguous tensors; we should fix that and revert to the original implementation in https://github.com/ggml-org/llama.cpp/pull/13065
1851
+ static lm_ggml_tensor * build_rope_2d(
1852
+ lm_ggml_context * ctx0,
1853
+ lm_ggml_tensor * cur,
1854
+ lm_ggml_tensor * pos_a, // first half
1855
+ lm_ggml_tensor * pos_b, // second half
1856
+ const float freq_base,
1857
+ const bool interleave_freq
1858
+ ) {
1859
+ const int64_t n_dim = cur->ne[0];
1860
+ const int64_t n_head = cur->ne[1];
1861
+ const int64_t n_pos = cur->ne[2];
1862
+
1863
+ // for example, if we have cur tensor of shape (n_dim=8, n_head, n_pos)
1864
+ // we will have a list of 4 inv_freq: 1e-0, 1e-1, 1e-2, 1e-3
1865
+ // first half of cur will use 1e-0, 1e-2 (even)
1866
+ // second half of cur will use 1e-1, 1e-3 (odd)
1867
+ // the trick here is to rotate just half of n_dim, so inv_freq will automatically be even
1868
+ // ^ don't ask me why, it's math! -2(2i) / n_dim == -2i / (n_dim/2)
1869
+ // then for the second half, we use freq_scale to shift the inv_freq
1870
+ // ^ why? replace (2i) with (2i+1) in the above equation
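+        // worked example for the n_dim = 8 case above with freq_base = 10000: the full rope would
+        // use inv_freq = 10000^(-2i/8) = {1, 1e-1, 1e-2, 1e-3}; rotating only n_dim/2 dims gives
+        // 10000^(-2j/4) = {1, 1e-2} for the first half, and scaling by freq_scale_odd = 10000^(-2/8)
+        // = 1e-1 shifts the second half onto {1e-1, 1e-3}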
1871
+ const float freq_scale_odd = interleave_freq
1872
+ ? std::pow(freq_base, (float)-2/n_dim)
1873
+ : 1.0;
1874
+
1875
+ // first half
1876
+ lm_ggml_tensor * first;
1877
+ {
1878
+ first = lm_ggml_view_3d(ctx0, cur,
1879
+ n_dim/2, n_head, n_pos,
1880
+ lm_ggml_row_size(cur->type, n_dim),
1881
+ lm_ggml_row_size(cur->type, n_dim*n_head),
1882
+ 0);
1883
+ first = lm_ggml_rope_ext(
1884
+ ctx0,
1885
+ first,
1886
+ pos_a, // positions
1887
+ nullptr, // freq factors
1888
+ n_dim/2, // n_dims
1889
+ 0, 0, freq_base,
1890
+ 1.0f, 0.0f, 1.0f, 0.0f, 0.0f
1891
+ );
1892
+ }
1893
+
1894
+ // second half
1895
+ lm_ggml_tensor * second;
1896
+ {
1897
+ second = lm_ggml_view_3d(ctx0, cur,
1898
+ n_dim/2, n_head, n_pos,
1899
+ lm_ggml_row_size(cur->type, n_dim),
1900
+ lm_ggml_row_size(cur->type, n_dim*n_head),
1901
+ n_dim/2 * lm_ggml_element_size(cur));
1902
+            second = lm_ggml_cont(ctx0, second); // copy, because lm_ggml_rope doesn't play well with non-contiguous tensors
1903
+ second = lm_ggml_rope_ext(
1904
+ ctx0,
1905
+ second,
1906
+ pos_b, // positions
1907
+ nullptr, // freq factors
1908
+ n_dim/2, // n_dims
1909
+ 0, 0, freq_base,
1910
+ freq_scale_odd,
1911
+ 0.0f, 1.0f, 0.0f, 0.0f
1912
+ );
1913
+ }
1914
+
1915
+ cur = lm_ggml_concat(ctx0, first, second, 0);
1916
+ return cur;
1917
+ }
1918
+
1919
+ };
1920
+
1921
+ static lm_ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch & imgs) {
1922
+ LM_GGML_ASSERT(imgs.entries.size() == 1 && "n_batch > 1 is not supported");
1923
+ clip_graph graph(ctx, *imgs.entries[0]);
1924
+
1925
+ lm_ggml_cgraph * res;
1926
+
1927
+ switch (ctx->proj_type) {
1928
+ case PROJECTOR_TYPE_GEMMA3:
1929
+ case PROJECTOR_TYPE_IDEFICS3:
1930
+ {
1931
+ res = graph.build_siglip();
1932
+ } break;
1933
+ case PROJECTOR_TYPE_PIXTRAL:
1934
+ {
1935
+ res = graph.build_pixtral();
1936
+ } break;
1937
+ case PROJECTOR_TYPE_QWEN2VL:
1938
+ case PROJECTOR_TYPE_QWEN25VL:
1939
+ {
1940
+ res = graph.build_qwen2vl();
1941
+ } break;
1942
+ case PROJECTOR_TYPE_MINICPMV:
1943
+ {
1944
+ res = graph.build_minicpmv();
1945
+ } break;
1946
+ case PROJECTOR_TYPE_INTERNVL:
1947
+ {
1948
+ res = graph.build_internvl();
1949
+ } break;
1950
+ case PROJECTOR_TYPE_LLAMA4:
1951
+ {
1952
+ res = graph.build_llama4();
1953
+ } break;
1954
+ case PROJECTOR_TYPE_ULTRAVOX:
1955
+ {
1956
+ res = graph.build_whisper_enc();
1957
+ } break;
1958
+ default:
1959
+ {
1960
+ res = graph.build_llava();
1961
+ } break;
1962
+ }
1963
+ return res;
1964
+ }
1965
+
1966
+ struct clip_model_loader {
1967
+ lm_ggml_context_ptr ctx_meta;
1968
+ lm_gguf_context_ptr ctx_gguf;
1969
+
1970
+ clip_ctx & ctx_clip;
1971
+ std::string fname;
1972
+
1973
+ size_t model_size = 0; // in bytes
1974
+
1975
+ // TODO @ngxson : we should not pass clip_ctx here, it should be clip_vision_model
1976
+ clip_model_loader(const char * fname, clip_ctx & ctx_clip) : ctx_clip(ctx_clip), fname(fname) {
1977
+ struct lm_ggml_context * meta = nullptr;
1978
+
1979
+ struct lm_gguf_init_params params = {
1980
+ /*.no_alloc = */ true,
1981
+ /*.ctx = */ &meta,
1982
+ };
1983
+
1984
+ ctx_gguf = lm_gguf_context_ptr(lm_gguf_init_from_file(fname, params));
1985
+ if (!ctx_gguf.get()) {
1986
+ throw std::runtime_error(string_format("%s: failed to load CLIP model from %s. Does this file exist?\n", __func__, fname));
1987
+ }
1988
+
1989
+ ctx_meta.reset(meta);
1990
+
1991
+ const int n_tensors = lm_gguf_get_n_tensors(ctx_gguf.get());
1992
+
1993
+ // print gguf info
1994
+ {
1995
+ std::string name;
1996
+ get_string(KEY_NAME, name, false);
1997
+ std::string description;
1998
+ get_string(KEY_DESCRIPTION, description, false);
1999
+ LOG_INF("%s: model name: %s\n", __func__, name.c_str());
2000
+ LOG_INF("%s: description: %s\n", __func__, description.c_str());
2001
+ LOG_INF("%s: GGUF version: %d\n", __func__, lm_gguf_get_version(ctx_gguf.get()));
2002
+ LOG_INF("%s: alignment: %zu\n", __func__, lm_gguf_get_alignment(ctx_gguf.get()));
2003
+ LOG_INF("%s: n_tensors: %d\n", __func__, n_tensors);
2004
+ LOG_INF("%s: n_kv: %d\n", __func__, (int)lm_gguf_get_n_kv(ctx_gguf.get()));
2005
+ LOG_INF("\n");
2006
+ }
2007
+
2008
+ // tensors
2009
+ {
2010
+ for (int i = 0; i < n_tensors; ++i) {
2011
+ const char * name = lm_gguf_get_tensor_name(ctx_gguf.get(), i);
2012
+ const size_t offset = lm_gguf_get_tensor_offset(ctx_gguf.get(), i);
2013
+ enum lm_ggml_type type = lm_gguf_get_tensor_type(ctx_gguf.get(), i);
2014
+ lm_ggml_tensor * cur = lm_ggml_get_tensor(meta, name);
2015
+ size_t tensor_size = lm_ggml_nbytes(cur);
2016
+ model_size += tensor_size;
2017
+ LOG_DBG("%s: tensor[%d]: n_dims = %d, name = %s, tensor_size=%zu, offset=%zu, shape:[%" PRIu64 ", %" PRIu64 ", %" PRIu64 ", %" PRIu64 "], type = %s\n",
2018
+ __func__, i, lm_ggml_n_dims(cur), cur->name, tensor_size, offset, cur->ne[0], cur->ne[1], cur->ne[2], cur->ne[3], lm_ggml_type_name(type));
2019
+ }
2020
+ }
2021
+ }
2022
+
2023
+ void load_hparams() {
2024
+ auto & hparams = ctx_clip.vision_model.hparams;
2025
+ std::string log_ffn_op; // for logging
2026
+
2027
+ // projector type
2028
+ std::string proj_type;
2029
+ {
2030
+ get_string(KEY_PROJ_TYPE, proj_type, false);
2031
+ if (!proj_type.empty()) {
2032
+ ctx_clip.proj_type = clip_projector_type_from_string(proj_type);
2033
+ }
2034
+ if (ctx_clip.proj_type == PROJECTOR_TYPE_UNKNOWN) {
2035
+ throw std::runtime_error(string_format("%s: unknown projector type: %s\n", __func__, proj_type.c_str()));
2036
+ }
2037
+ }
2038
+
2039
+ // other hparams
2040
+ {
2041
+ get_bool(KEY_HAS_AUDIO_ENC, hparams.has_audio, false);
2042
+ get_bool(KEY_HAS_VISION_ENC, hparams.has_vision, false);
2043
+
2044
+ const char * prefix = hparams.has_vision ? "vision" : "audio";
2045
+ get_u32(string_format(KEY_N_EMBD, prefix), hparams.n_embd);
2046
+ get_u32(string_format(KEY_N_HEAD, prefix), hparams.n_head);
2047
+ get_u32(string_format(KEY_N_FF, prefix), hparams.n_ff);
2048
+ get_u32(string_format(KEY_N_BLOCK, prefix), hparams.n_layer);
2049
+ get_u32(string_format(KEY_PROJ_DIM, prefix), hparams.projection_dim);
2050
+ get_f32(string_format(KEY_LAYER_NORM_EPS, prefix), hparams.eps);
2051
+
2052
+ if (hparams.has_vision) {
2053
+ get_u32(KEY_IMAGE_SIZE, hparams.image_size);
2054
+ get_u32(KEY_PATCH_SIZE, hparams.patch_size);
2055
+ get_u32(KEY_IMAGE_CROP_RESOLUTION, hparams.image_crop_resolution, false);
2056
+ get_arr_int(KEY_IMAGE_GRID_PINPOINTS, hparams.image_grid_pinpoints, false);
2057
+ get_i32(KEY_MINICPMV_VERSION, ctx_clip.minicpmv_version, false); // legacy
2058
+
2059
+ } else if (hparams.has_audio) {
2060
+ get_u32(KEY_A_NUM_MEL_BINS, hparams.n_mel_bins);
2061
+
2062
+ } else {
2063
+ throw std::runtime_error(string_format("%s: neither vision nor audio encoder is present\n", __func__));
2064
+ }
2065
+
2066
+ // default warmup value
2067
+ hparams.warmup_image_size = hparams.image_size;
2068
+
2069
+ ctx_clip.has_llava_projector = ctx_clip.proj_type == PROJECTOR_TYPE_MLP
2070
+ || ctx_clip.proj_type == PROJECTOR_TYPE_MLP_NORM
2071
+ || ctx_clip.proj_type == PROJECTOR_TYPE_LDP
2072
+ || ctx_clip.proj_type == PROJECTOR_TYPE_LDPV2;
2073
+
2074
+ {
2075
+ bool use_gelu = false;
2076
+ bool use_silu = false;
2077
+ get_bool(KEY_USE_GELU, use_gelu, false);
2078
+ get_bool(KEY_USE_SILU, use_silu, false);
2079
+ if (use_gelu && use_silu) {
2080
+ throw std::runtime_error(string_format("%s: both use_gelu and use_silu are set to true\n", __func__));
2081
+ }
2082
+ if (use_gelu) {
2083
+ hparams.ffn_op = FFN_GELU;
2084
+ log_ffn_op = "gelu";
2085
+ } else if (use_silu) {
2086
+ hparams.ffn_op = FFN_SILU;
2087
+ log_ffn_op = "silu";
2088
+ } else {
2089
+ hparams.ffn_op = FFN_GELU_QUICK;
2090
+ log_ffn_op = "gelu_quick";
2091
+ }
2092
+ }
2093
+
2094
+ {
2095
+ std::string mm_patch_merge_type;
2096
+ get_string(KEY_MM_PATCH_MERGE_TYPE, mm_patch_merge_type, false);
2097
+ if (mm_patch_merge_type == "spatial_unpad") {
2098
+ hparams.mm_patch_merge_type = PATCH_MERGE_SPATIAL_UNPAD;
2099
+ }
2100
+ }
2101
+
2102
+ if (hparams.has_vision) {
2103
+ int idx_mean = lm_gguf_find_key(ctx_gguf.get(), KEY_IMAGE_MEAN);
2104
+ int idx_std = lm_gguf_find_key(ctx_gguf.get(), KEY_IMAGE_STD);
2105
+ LM_GGML_ASSERT(idx_mean >= 0 && "image_mean not found");
2106
+ LM_GGML_ASSERT(idx_std >= 0 && "image_std not found");
2107
+ const float * mean_data = (const float *) lm_gguf_get_arr_data(ctx_gguf.get(), idx_mean);
2108
+ const float * std_data = (const float *) lm_gguf_get_arr_data(ctx_gguf.get(), idx_std);
2109
+ for (int i = 0; i < 3; ++i) {
2110
+ ctx_clip.image_mean[i] = mean_data[i];
2111
+ ctx_clip.image_std[i] = std_data[i];
2112
+ }
2113
+ }
2114
+
2115
+ // Load the vision feature layer indices if they are explicitly provided;
2116
+ // if multiple vision feature layers are present, the values will be concatenated
2117
+ // to form the final visual features.
2118
+ // NOTE: gguf conversions should standardize the values of the vision feature layer to
2119
+ // be non-negative, since we use -1 to mark values as unset here.
2120
+ std::vector<int> vision_feature_layer;
2121
+ get_arr_int(KEY_FEATURE_LAYER, vision_feature_layer, false);
2122
+ // convert std::vector to std::unordered_set
2123
+ for (auto & layer : vision_feature_layer) {
2124
+ hparams.vision_feature_layer.insert(layer);
2125
+ }
2126
+
2127
+ // model-specific params
2128
+ switch (ctx_clip.proj_type) {
2129
+ case PROJECTOR_TYPE_MINICPMV:
2130
+ {
2131
+ if (ctx_clip.minicpmv_version == 0) {
2132
+ ctx_clip.minicpmv_version = 2; // default to 2 if not set
2133
+ }
2134
+ } break;
2135
+ case PROJECTOR_TYPE_IDEFICS3:
2136
+ case PROJECTOR_TYPE_INTERNVL:
2137
+ {
2138
+ get_u32(KEY_PROJ_SCALE_FACTOR, hparams.proj_scale_factor, false);
2139
+ } break;
2140
+ case PROJECTOR_TYPE_PIXTRAL:
2141
+ {
2142
+ hparams.rope_theta = 10000.0f;
2143
+ hparams.warmup_image_size = hparams.patch_size * 8;
2144
+ get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.spatial_merge_size, false);
2145
+ } break;
2146
+ case PROJECTOR_TYPE_GEMMA3:
2147
+ {
2148
+ // default value (used by all model sizes in gemma 3 family)
2149
+ // number of patches for each **side** is reduced by a factor of 4
2150
+ hparams.proj_scale_factor = 4;
2151
+ // test model (tinygemma3) has a different value, we optionally read it
2152
+ get_u32(KEY_PROJ_SCALE_FACTOR, hparams.proj_scale_factor, false);
2153
+ } break;
2154
+ case PROJECTOR_TYPE_QWEN2VL:
2155
+ {
2156
+ // max image size = sqrt(max_pixels) = 3584
2157
+ // ref: https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct/blob/main/preprocessor_config.json
2158
+                        // however, the model uses an unreasonable amount of memory past size 1024, so we force it to 1024, otherwise it's unusable
2159
+ // ref: https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct/discussions/10
2160
+ hparams.image_size = 1024;
2161
+ hparams.warmup_image_size = hparams.patch_size * 8;
2162
+ } break;
2163
+ case PROJECTOR_TYPE_QWEN25VL:
2164
+ {
2165
+ // max image size = sqrt(max_pixels)
2166
+ // https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct/blob/main/preprocessor_config.json
2167
+                        // however, the model uses an unreasonable amount of memory past size 1024, so we force it to 1024, otherwise it's unusable
2168
+ // ref: https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct/discussions/10
2169
+ hparams.image_size = 1024;
2170
+ hparams.warmup_image_size = hparams.patch_size * 8;
2171
+ get_u32(KEY_WIN_ATTN_PATTERN, hparams.n_wa_pattern);
2172
+ } break;
2173
+ case PROJECTOR_TYPE_LLAMA4:
2174
+ {
2175
+ hparams.rope_theta = 10000.0f;
2176
+ get_u32(KEY_PROJ_SCALE_FACTOR, hparams.proj_scale_factor);
2177
+
2178
+ // borrowed from llava-1.6
2179
+ const int isize = hparams.image_size;
2180
+ hparams.image_grid_pinpoints = {
2181
+ isize, isize*2, // 336, 672
2182
+ isize*2, isize, // 672, 336
2183
+ isize*2, isize*2, // 672, 672
2184
+ isize*3, isize, // 1008, 336
2185
+ isize, isize*3, // 336, 1008
2186
+ };
2187
+ } break;
2188
+ case PROJECTOR_TYPE_ULTRAVOX:
2189
+ {
2190
+ get_u32(KEY_A_PROJ_STACK_FACTOR, hparams.proj_stack_factor);
2191
+ if (hparams.n_mel_bins != 128) {
2192
+ throw std::runtime_error(string_format("%s: only 128 mel bins are supported for ultravox\n", __func__));
2193
+ }
2194
+ hparams.ffn_op = FFN_GELU_ERF;
2195
+ log_ffn_op = "gelu_erf"; // temporary solution for logging
2196
+ } break;
2197
+ default:
2198
+ break;
2199
+ }
2200
+
2201
+ LOG_INF("%s: projector: %s\n", __func__, proj_type.c_str());
2202
+ LOG_INF("%s: has_vision_encoder: %d\n", __func__, hparams.has_vision);
2203
+ LOG_INF("%s: has_audio_encoder: %d\n", __func__, hparams.has_audio);
2204
+ LOG_INF("%s: n_embd: %d\n", __func__, hparams.n_embd);
2205
+ LOG_INF("%s: n_head: %d\n", __func__, hparams.n_head);
2206
+ LOG_INF("%s: n_ff: %d\n", __func__, hparams.n_ff);
2207
+ LOG_INF("%s: n_layer: %d\n", __func__, hparams.n_layer);
2208
+ LOG_INF("%s: ffn_op: %s\n", __func__, log_ffn_op.c_str());
2209
+ LOG_INF("%s: projection_dim: %d\n", __func__, hparams.projection_dim);
2210
+ LOG_INF("\n");
2211
+ if (hparams.has_vision) {
2212
+ LOG_INF("%s: image_size: %d\n", __func__, hparams.image_size);
2213
+ LOG_INF("%s: patch_size: %d\n", __func__, hparams.patch_size);
2214
+ LOG_INF("%s: has_llava_proj: %d\n", __func__, ctx_clip.has_llava_projector);
2215
+ LOG_INF("%s: minicpmv_version: %d\n", __func__, ctx_clip.minicpmv_version);
2216
+ LOG_INF("%s: proj_scale_factor: %d\n", __func__, hparams.proj_scale_factor);
2217
+ LOG_INF("%s: n_wa_pattern: %d\n", __func__, hparams.n_wa_pattern);
2218
+ } else if (hparams.has_audio) {
2219
+ LOG_INF("%s: n_mel_bins: %d\n", __func__, hparams.n_mel_bins);
2220
+ LOG_INF("%s: proj_stack_factor: %d\n", __func__, hparams.proj_stack_factor);
2221
+ }
2222
+ LOG_INF("\n");
2223
+ LOG_INF("%s: model size: %.2f MiB\n", __func__, model_size / 1024.0 / 1024.0);
2224
+ LOG_INF("%s: metadata size: %.2f MiB\n", __func__, lm_ggml_get_mem_size(ctx_meta.get()) / 1024.0 / 1024.0);
2225
+ }
2226
+ }
2227
+
2228
+ void load_tensors() {
2229
+ auto & hparams = ctx_clip.vision_model.hparams;
2230
+ std::map<std::string, size_t> tensor_offset;
2231
+ std::vector<lm_ggml_tensor *> tensors_to_load;
2232
+
2233
+ // TODO @ngxson : support both audio and video in the future
2234
+ const char * prefix = hparams.has_audio ? "a" : "v";
2235
+
2236
+ // get offsets
2237
+ for (int64_t i = 0; i < lm_gguf_get_n_tensors(ctx_gguf.get()); ++i) {
2238
+ const char * name = lm_gguf_get_tensor_name(ctx_gguf.get(), i);
2239
+ tensor_offset[name] = lm_gguf_get_data_offset(ctx_gguf.get()) + lm_gguf_get_tensor_offset(ctx_gguf.get(), i);
2240
+ }
2241
+
2242
+ // create data context
2243
+ struct lm_ggml_init_params params = {
2244
+ /*.mem_size =*/ (lm_gguf_get_n_tensors(ctx_gguf.get()) + 1) * lm_ggml_tensor_overhead(),
2245
+ /*.mem_buffer =*/ NULL,
2246
+ /*.no_alloc =*/ true,
2247
+ };
2248
+ ctx_clip.ctx_data.reset(lm_ggml_init(params));
2249
+ if (!ctx_clip.ctx_data) {
2250
+ throw std::runtime_error(string_format("%s: failed to init ggml context\n", __func__));
2251
+ }
2252
+
2253
+ // helper function
2254
+ auto get_tensor = [&](const std::string & name, bool required = true) {
2255
+ lm_ggml_tensor * cur = lm_ggml_get_tensor(ctx_meta.get(), name.c_str());
2256
+ if (!cur && required) {
2257
+ throw std::runtime_error(string_format("%s: unable to find tensor %s\n", __func__, name.c_str()));
2258
+ }
2259
+ if (cur) {
2260
+ tensors_to_load.push_back(cur);
2261
+ // add tensors to context
2262
+ lm_ggml_tensor * data_tensor = lm_ggml_dup_tensor(ctx_clip.ctx_data.get(), cur);
2263
+ lm_ggml_set_name(data_tensor, cur->name);
2264
+ cur = data_tensor;
2265
+ }
2266
+ return cur;
2267
+ };
2268
+
2269
+ auto & vision_model = ctx_clip.vision_model;
2270
+
2271
+ vision_model.class_embedding = get_tensor(TN_CLASS_EMBD, false);
2272
+
2273
+ vision_model.pre_ln_w = get_tensor(string_format(TN_LN_PRE, prefix, "weight"), false);
2274
+ vision_model.pre_ln_b = get_tensor(string_format(TN_LN_PRE, prefix, "bias"), false);
2275
+
2276
+ vision_model.post_ln_w = get_tensor(string_format(TN_LN_POST, prefix, "weight"), false);
2277
+ vision_model.post_ln_b = get_tensor(string_format(TN_LN_POST, prefix, "bias"), false);
2278
+
2279
+ vision_model.patch_bias = get_tensor(TN_PATCH_BIAS, false);
2280
+ vision_model.patch_embeddings_0 = get_tensor(TN_PATCH_EMBD, false);
2281
+ vision_model.patch_embeddings_1 = get_tensor(TN_PATCH_EMBD_1, false);
2282
+
2283
+ vision_model.position_embeddings = get_tensor(string_format(TN_POS_EMBD, prefix), false);
2284
+
2285
+ // layers
2286
+ vision_model.layers.resize(hparams.n_layer);
2287
+ for (int il = 0; il < hparams.n_layer; ++il) {
2288
+ auto & layer = vision_model.layers[il];
2289
+ layer.k_w = get_tensor(string_format(TN_ATTN_K, prefix, il, "weight"));
2290
+ layer.q_w = get_tensor(string_format(TN_ATTN_Q, prefix, il, "weight"));
2291
+ layer.v_w = get_tensor(string_format(TN_ATTN_V, prefix, il, "weight"));
2292
+ layer.o_w = get_tensor(string_format(TN_ATTN_OUTPUT, prefix, il, "weight"));
2293
+ layer.k_norm = get_tensor(string_format(TN_ATTN_K_NORM, prefix, il, "weight"), false);
2294
+ layer.q_norm = get_tensor(string_format(TN_ATTN_Q_NORM, prefix, il, "weight"), false);
2295
+ layer.ln_1_w = get_tensor(string_format(TN_LN_1, prefix, il, "weight"), false);
2296
+ layer.ln_2_w = get_tensor(string_format(TN_LN_2, prefix, il, "weight"), false);
2297
+ layer.ls_1_w = get_tensor(string_format(TN_LS_1, prefix, il, "weight"), false); // no bias
2298
+ layer.ls_2_w = get_tensor(string_format(TN_LS_2, prefix, il, "weight"), false); // no bias
2299
+
2300
+ layer.k_b = get_tensor(string_format(TN_ATTN_K, prefix, il, "bias"), false);
2301
+ layer.q_b = get_tensor(string_format(TN_ATTN_Q, prefix, il, "bias"), false);
2302
+ layer.v_b = get_tensor(string_format(TN_ATTN_V, prefix, il, "bias"), false);
2303
+ layer.o_b = get_tensor(string_format(TN_ATTN_OUTPUT, prefix, il, "bias"), false);
2304
+ layer.ln_1_b = get_tensor(string_format(TN_LN_1, prefix, il, "bias"), false);
2305
+ layer.ln_2_b = get_tensor(string_format(TN_LN_2, prefix, il, "bias"), false);
2306
+
2307
+ // ffn
2308
+ layer.ff_up_w = get_tensor(string_format(TN_FFN_UP, prefix, il, "weight"));
2309
+ layer.ff_up_b = get_tensor(string_format(TN_FFN_UP, prefix, il, "bias"), false);
2310
+ layer.ff_gate_w = get_tensor(string_format(TN_FFN_GATE, prefix, il, "weight"), false);
2311
+ layer.ff_gate_b = get_tensor(string_format(TN_FFN_GATE, prefix, il, "bias"), false);
2312
+ layer.ff_down_w = get_tensor(string_format(TN_FFN_DOWN, prefix, il, "weight"));
2313
+ layer.ff_down_b = get_tensor(string_format(TN_FFN_DOWN, prefix, il, "bias"), false);
2314
+
2315
+            // some models were already exported with legacy (incorrect) naming, which is quite messy, so let's fix it here
2316
+ // note: Qwen model converted from the old surgery script has n_ff = 0, so we cannot use n_ff to check!
2317
+ if (layer.ff_up_w && layer.ff_down_w && layer.ff_down_w->ne[0] == hparams.n_embd) {
2318
+ // swap up and down weights
2319
+ lm_ggml_tensor * tmp = layer.ff_up_w;
2320
+ layer.ff_up_w = layer.ff_down_w;
2321
+ layer.ff_down_w = tmp;
2322
+ // swap up and down biases
2323
+ tmp = layer.ff_up_b;
2324
+ layer.ff_up_b = layer.ff_down_b;
2325
+ layer.ff_down_b = tmp;
2326
+ }
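+            // sanity example (hypothetical dims): with n_embd = 1024 and n_ff = 4096, a correctly
+            // exported ff_down_w has ne[0] = 4096 (it maps n_ff -> n_embd); ne[0] == n_embd means
+            // the tensor actually maps n_embd -> n_ff, i.e. the legacy export stored the up
+            // projection under "down"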
2327
+ }
2328
+
2329
+ switch (ctx_clip.proj_type) {
2330
+ case PROJECTOR_TYPE_MLP:
2331
+ case PROJECTOR_TYPE_MLP_NORM:
2332
+ {
2333
+ // LLaVA projection
2334
+ vision_model.mm_0_w = get_tensor(string_format(TN_LLAVA_PROJ, 0, "weight"), false);
2335
+ vision_model.mm_0_b = get_tensor(string_format(TN_LLAVA_PROJ, 0, "bias"), false);
2336
+ // Yi-type llava
2337
+ vision_model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 1, "weight"), false);
2338
+ vision_model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 1, "bias"), false);
2339
+ // missing in Yi-type llava
2340
+ vision_model.mm_2_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight"), false);
2341
+ vision_model.mm_2_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"), false);
2342
+ // Yi-type llava
2343
+ vision_model.mm_3_w = get_tensor(string_format(TN_LLAVA_PROJ, 3, "weight"), false);
2344
+ vision_model.mm_3_b = get_tensor(string_format(TN_LLAVA_PROJ, 3, "bias"), false);
2345
+ vision_model.mm_4_w = get_tensor(string_format(TN_LLAVA_PROJ, 4, "weight"), false);
2346
+ vision_model.mm_4_b = get_tensor(string_format(TN_LLAVA_PROJ, 4, "bias"), false);
2347
+ if (vision_model.mm_3_w) {
2348
+ // TODO: this is a hack to support Yi-type llava
2349
+ ctx_clip.proj_type = PROJECTOR_TYPE_MLP_NORM;
2350
+ }
2351
+ vision_model.image_newline = get_tensor(TN_IMAGE_NEWLINE, false);
2352
+ } break;
2353
+ case PROJECTOR_TYPE_LDP:
2354
+ {
2355
+ // MobileVLM projection
2356
+ vision_model.mm_model_mlp_1_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 1, "weight"));
2357
+ vision_model.mm_model_mlp_1_b = get_tensor(string_format(TN_MVLM_PROJ_MLP, 1, "bias"));
2358
+ vision_model.mm_model_mlp_3_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 3, "weight"));
2359
+ vision_model.mm_model_mlp_3_b = get_tensor(string_format(TN_MVLM_PROJ_MLP, 3, "bias"));
2360
+ vision_model.mm_model_block_1_block_0_0_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 0, "0.weight"));
2361
+ vision_model.mm_model_block_1_block_0_1_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 0, "1.weight"));
2362
+ vision_model.mm_model_block_1_block_0_1_b = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 0, "1.bias"));
2363
+ vision_model.mm_model_block_1_block_1_fc1_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc1.weight"));
2364
+ vision_model.mm_model_block_1_block_1_fc1_b = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc1.bias"));
2365
+ vision_model.mm_model_block_1_block_1_fc2_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc2.weight"));
2366
+ vision_model.mm_model_block_1_block_1_fc2_b = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc2.bias"));
2367
+ vision_model.mm_model_block_1_block_2_0_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 2, "0.weight"));
2368
+ vision_model.mm_model_block_1_block_2_1_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 2, "1.weight"));
2369
+ vision_model.mm_model_block_1_block_2_1_b = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 2, "1.bias"));
2370
+ vision_model.mm_model_block_2_block_0_0_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 0, "0.weight"));
2371
+ vision_model.mm_model_block_2_block_0_1_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 0, "1.weight"));
2372
+ vision_model.mm_model_block_2_block_0_1_b = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 0, "1.bias"));
2373
+ vision_model.mm_model_block_2_block_1_fc1_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc1.weight"));
2374
+ vision_model.mm_model_block_2_block_1_fc1_b = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc1.bias"));
2375
+ vision_model.mm_model_block_2_block_1_fc2_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc2.weight"));
2376
+ vision_model.mm_model_block_2_block_1_fc2_b = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc2.bias"));
2377
+ vision_model.mm_model_block_2_block_2_0_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 2, "0.weight"));
2378
+ vision_model.mm_model_block_2_block_2_1_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 2, "1.weight"));
2379
+ vision_model.mm_model_block_2_block_2_1_b = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 2, "1.bias"));
2380
+ } break;
2381
+ case PROJECTOR_TYPE_LDPV2:
2382
+ {
2383
+ // MobileVLM_V2 projection
2384
+ vision_model.mm_model_mlp_0_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 0, "weight"));
2385
+ vision_model.mm_model_mlp_0_b = get_tensor(string_format(TN_MVLM_PROJ_MLP, 0, "bias"));
2386
+ vision_model.mm_model_mlp_2_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 2, "weight"));
2387
+ vision_model.mm_model_mlp_2_b = get_tensor(string_format(TN_MVLM_PROJ_MLP, 2, "bias"));
2388
+ vision_model.mm_model_peg_0_w = get_tensor(string_format(TN_MVLM_PROJ_PEG, 0, "weight"));
2389
+ vision_model.mm_model_peg_0_b = get_tensor(string_format(TN_MVLM_PROJ_PEG, 0, "bias"));
2390
+ } break;
2391
+ case PROJECTOR_TYPE_MINICPMV:
2392
+ {
2393
+ // vision_model.mm_model_pos_embed = get_tensor(new_clip->ctx_data, TN_MINICPMV_POS_EMBD);
2394
+ vision_model.mm_model_pos_embed_k = get_tensor(TN_MINICPMV_POS_EMBD_K);
2395
+ vision_model.mm_model_query = get_tensor(TN_MINICPMV_QUERY);
2396
+ vision_model.mm_model_proj = get_tensor(TN_MINICPMV_PROJ);
2397
+ vision_model.mm_model_kv_proj = get_tensor(TN_MINICPMV_KV_PROJ);
2398
+ vision_model.mm_model_attn_q_w = get_tensor(string_format(TN_MINICPMV_ATTN, "q", "weight"));
2399
+ vision_model.mm_model_attn_k_w = get_tensor(string_format(TN_MINICPMV_ATTN, "k", "weight"));
2400
+ vision_model.mm_model_attn_v_w = get_tensor(string_format(TN_MINICPMV_ATTN, "v", "weight"));
2401
+ vision_model.mm_model_attn_q_b = get_tensor(string_format(TN_MINICPMV_ATTN, "q", "bias"));
2402
+ vision_model.mm_model_attn_k_b = get_tensor(string_format(TN_MINICPMV_ATTN, "k", "bias"));
2403
+ vision_model.mm_model_attn_v_b = get_tensor(string_format(TN_MINICPMV_ATTN, "v", "bias"));
2404
+ vision_model.mm_model_attn_o_w = get_tensor(string_format(TN_MINICPMV_ATTN, "out", "weight"));
2405
+ vision_model.mm_model_attn_o_b = get_tensor(string_format(TN_MINICPMV_ATTN, "out", "bias"));
2406
+ vision_model.mm_model_ln_q_w = get_tensor(string_format(TN_MINICPMV_LN, "q", "weight"));
2407
+ vision_model.mm_model_ln_q_b = get_tensor(string_format(TN_MINICPMV_LN, "q", "bias"));
2408
+ vision_model.mm_model_ln_kv_w = get_tensor(string_format(TN_MINICPMV_LN, "kv", "weight"));
2409
+ vision_model.mm_model_ln_kv_b = get_tensor(string_format(TN_MINICPMV_LN, "kv", "bias"));
2410
+ vision_model.mm_model_ln_post_w = get_tensor(string_format(TN_MINICPMV_LN, "post", "weight"));
2411
+ vision_model.mm_model_ln_post_b = get_tensor(string_format(TN_MINICPMV_LN, "post", "bias"));
2412
+ } break;
2413
+ case PROJECTOR_TYPE_GLM_EDGE:
2414
+ {
2415
+ vision_model.mm_model_adapter_conv_w = get_tensor(string_format(TN_GLM_ADAPER_CONV, "weight"));
2416
+ vision_model.mm_model_adapter_conv_b = get_tensor(string_format(TN_GLM_ADAPER_CONV, "bias"));
2417
+ vision_model.mm_model_mlp_0_w = get_tensor(string_format(TN_GLM_ADAPTER_LINEAR, "weight"));
2418
+ vision_model.mm_model_ln_q_w = get_tensor(string_format(TN_GLM_ADAPTER_NORM_1, "weight"));
2419
+ vision_model.mm_model_ln_q_b = get_tensor(string_format(TN_GLM_ADAPTER_NORM_1, "bias"));
2420
+ vision_model.mm_model_mlp_1_w = get_tensor(string_format(TN_GLM_ADAPTER_D_H_2_4H, "weight"));
2421
+ vision_model.mm_model_mlp_2_w = get_tensor(string_format(TN_GLM_ADAPTER_GATE, "weight"));
2422
+ vision_model.mm_model_mlp_3_w = get_tensor(string_format(TN_GLM_ADAPTER_D_4H_2_H, "weight"));
2423
+ vision_model.mm_glm_tok_boi = get_tensor(string_format(TN_TOK_GLM_BOI, "weight"));
2424
+ vision_model.mm_glm_tok_eoi = get_tensor(string_format(TN_TOK_GLM_EOI, "weight"));
2425
+ } break;
2426
+ case PROJECTOR_TYPE_QWEN2VL:
2427
+ case PROJECTOR_TYPE_QWEN25VL:
2428
+ {
2429
+ vision_model.mm_0_w = get_tensor(string_format(TN_LLAVA_PROJ, 0, "weight"));
2430
+ vision_model.mm_0_b = get_tensor(string_format(TN_LLAVA_PROJ, 0, "bias"));
2431
+ vision_model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight"));
2432
+ vision_model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"));
2433
+ } break;
2434
+ case PROJECTOR_TYPE_GEMMA3:
2435
+ {
2436
+ vision_model.mm_input_proj_w = get_tensor(TN_MM_INP_PROJ);
2437
+ vision_model.mm_soft_emb_norm_w = get_tensor(TN_MM_SOFT_EMB_N);
2438
+ } break;
2439
+ case PROJECTOR_TYPE_IDEFICS3:
2440
+ {
2441
+ vision_model.projection = get_tensor(TN_MM_PROJECTOR);
2442
+ } break;
2443
+ case PROJECTOR_TYPE_PIXTRAL:
2444
+ {
2445
+ vision_model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 1, "weight"));
2446
+ vision_model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 1, "bias"), false);
2447
+ vision_model.mm_2_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight"));
2448
+ vision_model.mm_2_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"), false);
2449
+ // [IMG_BREAK] token embedding
2450
+ vision_model.token_embd_img_break = get_tensor(TN_TOK_IMG_BREAK);
2451
+ // for mistral small 3.1
2452
+ vision_model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM, false);
2453
+ vision_model.mm_patch_merger_w = get_tensor(TN_MM_PATCH_MERGER, false);
2454
+ } break;
2455
+ case PROJECTOR_TYPE_ULTRAVOX:
2456
+ {
2457
+ vision_model.conv1d_1_w = get_tensor(string_format(TN_CONV1D, 1, "weight"));
2458
+ vision_model.conv1d_1_b = get_tensor(string_format(TN_CONV1D, 1, "bias"));
2459
+ vision_model.conv1d_2_w = get_tensor(string_format(TN_CONV1D, 2, "weight"));
2460
+ vision_model.conv1d_2_b = get_tensor(string_format(TN_CONV1D, 2, "bias"));
2461
+ vision_model.mm_1_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "weight"));
2462
+ vision_model.mm_2_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 2, "weight"));
2463
+ vision_model.mm_norm_pre_w = get_tensor(string_format(TN_MM_NORM_PRE, "weight"));
2464
+ vision_model.mm_norm_mid_w = get_tensor(string_format(TN_MM_NORM_MID, "weight"));
2465
+ } break;
2466
+ case PROJECTOR_TYPE_INTERNVL:
2467
+ {
2468
+ vision_model.mm_0_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 0, "weight"));
2469
+ vision_model.mm_0_b = get_tensor(string_format(TN_MVLM_PROJ_MLP, 0, "bias"));
2470
+ vision_model.mm_1_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 1, "weight"));
2471
+ vision_model.mm_1_b = get_tensor(string_format(TN_MVLM_PROJ_MLP, 1, "bias"));
2472
+ vision_model.mm_3_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 3, "weight"));
2473
+ vision_model.mm_3_b = get_tensor(string_format(TN_MVLM_PROJ_MLP, 3, "bias"));
2474
+ } break;
2475
+ case PROJECTOR_TYPE_LLAMA4:
2476
+ {
2477
+ vision_model.mm_model_proj = get_tensor(TN_MM_PROJECTOR);
2478
+ vision_model.mm_model_mlp_1_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 1, "weight"));
2479
+ vision_model.mm_model_mlp_2_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 2, "weight"));
2480
+ } break;
2481
+ default:
2482
+ LM_GGML_ASSERT(false && "unknown projector type");
2483
+ }
2484
+
2485
+ // load data
2486
+ {
2487
+ std::vector<uint8_t> read_buf;
2488
+
2489
+ auto fin = std::ifstream(fname, std::ios::binary);
2490
+ if (!fin) {
2491
+ throw std::runtime_error(string_format("%s: failed to open %s\n", __func__, fname.c_str()));
2492
+ }
2493
+
2494
+ // alloc memory and offload data
2495
+ lm_ggml_backend_buffer_type_t buft = lm_ggml_backend_get_default_buffer_type(ctx_clip.backend);
2496
+ ctx_clip.buf.reset(lm_ggml_backend_alloc_ctx_tensors_from_buft(ctx_clip.ctx_data.get(), buft));
2497
+ lm_ggml_backend_buffer_set_usage(ctx_clip.buf.get(), LM_GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
2498
+ for (auto & t : tensors_to_load) {
2499
+ lm_ggml_tensor * cur = lm_ggml_get_tensor(ctx_clip.ctx_data.get(), t->name);
2500
+ const size_t offset = tensor_offset[t->name];
2501
+ fin.seekg(offset, std::ios::beg);
2502
+ if (!fin) {
2503
+ throw std::runtime_error(string_format("%s: failed to seek for tensor %s\n", __func__, t->name));
2504
+ }
2505
+ size_t num_bytes = lm_ggml_nbytes(cur);
2506
+ if (lm_ggml_backend_buft_is_host(buft)) {
2507
+ // for the CPU and Metal backend, we can read directly into the tensor
2508
+ fin.read(reinterpret_cast<char *>(cur->data), num_bytes);
2509
+ } else {
2510
+ // read into a temporary buffer first, then copy to device memory
2511
+ read_buf.resize(num_bytes);
2512
+ fin.read(reinterpret_cast<char *>(read_buf.data()), num_bytes);
2513
+ lm_ggml_backend_tensor_set(cur, read_buf.data(), 0, num_bytes);
2514
+ }
2515
+ }
2516
+ fin.close();
2517
+
2518
+ LOG_DBG("%s: loaded %zu tensors from %s\n", __func__, tensors_to_load.size(), fname.c_str());
2519
+ }
2520
+ }
2521
+
2522
+ void alloc_compute_meta() {
2523
+ const auto & hparams = ctx_clip.vision_model.hparams;
2524
+ ctx_clip.buf_compute_meta.resize(ctx_clip.max_nodes * lm_ggml_tensor_overhead() + lm_ggml_graph_overhead());
2525
+
2526
+ // create a fake batch
2527
+ clip_image_f32_batch batch;
2528
+ clip_image_f32_ptr img(clip_image_f32_init());
2529
+ if (hparams.has_vision) {
2530
+ img->nx = hparams.warmup_image_size;
2531
+ img->ny = hparams.warmup_image_size;
2532
+ } else {
2533
+ img->nx = 1024; // TODO @ngxson : use a better default
2534
+ img->ny = hparams.n_mel_bins;
2535
+ }
2536
+ img->buf.resize(img->nx * img->ny * 3);
2537
+ batch.entries.push_back(std::move(img));
2538
+
2539
+ lm_ggml_cgraph * gf = clip_image_build_graph(&ctx_clip, batch);
2540
+ lm_ggml_backend_sched_reserve(ctx_clip.sched.get(), gf);
2541
+
2542
+ for (size_t i = 0; i < ctx_clip.backend_ptrs.size(); ++i) {
2543
+ lm_ggml_backend_t backend = ctx_clip.backend_ptrs[i];
2544
+ lm_ggml_backend_buffer_type_t buft = ctx_clip.backend_buft[i];
2545
+ size_t size = lm_ggml_backend_sched_get_buffer_size(ctx_clip.sched.get(), backend);
2546
+ if (size > 1) {
2547
+ LOG_INF("%s: %10s compute buffer size = %8.2f MiB\n", __func__,
2548
+ lm_ggml_backend_buft_name(buft),
2549
+ size / 1024.0 / 1024.0);
2550
+ }
2551
+ }
2552
+ }
2553
+
2554
+ void get_bool(const std::string & key, bool & output, bool required = true) {
2555
+ const int i = lm_gguf_find_key(ctx_gguf.get(), key.c_str());
2556
+ if (i < 0) {
2557
+ if (required) throw std::runtime_error("Key not found: " + key);
2558
+ return;
2559
+ }
2560
+ output = lm_gguf_get_val_bool(ctx_gguf.get(), i);
2561
+ }
2562
+
2563
+ void get_i32(const std::string & key, int & output, bool required = true) {
2564
+ const int i = lm_gguf_find_key(ctx_gguf.get(), key.c_str());
2565
+ if (i < 0) {
2566
+ if (required) throw std::runtime_error("Key not found: " + key);
2567
+ return;
2568
+ }
2569
+ output = lm_gguf_get_val_i32(ctx_gguf.get(), i);
2570
+ }
2571
+
2572
+ void get_u32(const std::string & key, int & output, bool required = true) {
2573
+ const int i = lm_gguf_find_key(ctx_gguf.get(), key.c_str());
2574
+ if (i < 0) {
2575
+ if (required) throw std::runtime_error("Key not found: " + key);
2576
+ return;
2577
+ }
2578
+ output = lm_gguf_get_val_u32(ctx_gguf.get(), i);
2579
+ }
2580
+
2581
+ void get_f32(const std::string & key, float & output, bool required = true) {
2582
+ const int i = lm_gguf_find_key(ctx_gguf.get(), key.c_str());
2583
+ if (i < 0) {
2584
+ if (required) throw std::runtime_error("Key not found: " + key);
2585
+ return;
2586
+ }
2587
+ output = lm_gguf_get_val_f32(ctx_gguf.get(), i);
2588
+ }
2589
+
2590
+ void get_string(const std::string & key, std::string & output, bool required = true) {
2591
+ const int i = lm_gguf_find_key(ctx_gguf.get(), key.c_str());
2592
+ if (i < 0) {
2593
+ if (required) throw std::runtime_error("Key not found: " + key);
2594
+ return;
2595
+ }
2596
+ output = std::string(lm_gguf_get_val_str(ctx_gguf.get(), i));
2597
+ }
2598
+
2599
+ void get_arr_int(const std::string & key, std::vector<int> & output, bool required = true) {
2600
+ const int i = lm_gguf_find_key(ctx_gguf.get(), key.c_str());
2601
+ if (i < 0) {
2602
+ if (required) throw std::runtime_error("Key not found: " + key);
2603
+ return;
2604
+ }
2605
+ int n = lm_gguf_get_arr_n(ctx_gguf.get(), i);
2606
+ output.resize(n);
2607
+ const int32_t * values = (const int32_t *)lm_gguf_get_arr_data(ctx_gguf.get(), i);
2608
+ for (int i = 0; i < n; ++i) {
2609
+ output[i] = values[i];
2610
+ }
2611
+ }
2612
+ };
2613
+
2614
+ struct clip_ctx * clip_init(const char * fname, struct clip_context_params ctx_params) {
2615
+ g_logger_state.verbosity_thold = ctx_params.verbosity;
2616
+ clip_ctx * ctx_clip = nullptr;
2617
+
2618
+ try {
2619
+ ctx_clip = new clip_ctx(ctx_params);
2620
+ clip_model_loader loader(fname, *ctx_clip);
2621
+ loader.load_hparams();
2622
+ loader.load_tensors();
2623
+ loader.alloc_compute_meta();
2624
+ } catch (const std::exception & e) {
2625
+ LOG_ERR("%s: failed to load model '%s': %s\n", __func__, fname, e.what());
2626
+ delete ctx_clip;
2627
+ return nullptr;
2628
+ }
2629
+
2630
+ return ctx_clip;
2631
+ }
2632
+
2633
+ struct clip_image_size * clip_image_size_init() {
2634
+ struct clip_image_size * load_image_size = new struct clip_image_size();
2635
+ load_image_size->width = 448;
2636
+ load_image_size->height = 448;
2637
+ return load_image_size;
2638
+ }
2639
+
2640
+ struct clip_image_u8 * clip_image_u8_init() {
2641
+ return new clip_image_u8();
2642
+ }
2643
+
2644
+ struct clip_image_f32 * clip_image_f32_init() {
2645
+ return new clip_image_f32();
2646
+ }
2647
+
2648
+ struct clip_image_f32_batch * clip_image_f32_batch_init() {
2649
+ return new clip_image_f32_batch();
2650
+ }
2651
+
2652
+ unsigned char * clip_image_u8_get_data(struct clip_image_u8 * img, uint32_t * nx, uint32_t * ny) {
2653
+ if (nx) *nx = img->nx;
2654
+ if (ny) *ny = img->ny;
2655
+ return img->buf.data();
2656
+ }
2657
+
2658
+ void clip_image_size_free(struct clip_image_size * load_image_size) {
2659
+ if (load_image_size == nullptr) {
2660
+ return;
2661
+ }
2662
+ delete load_image_size;
2663
+ }
2664
+ void clip_image_u8_free(struct clip_image_u8 * img) { if (img) delete img; }
2665
+ void clip_image_f32_free(struct clip_image_f32 * img) { if (img) delete img; }
2666
+ void clip_image_u8_batch_free(struct clip_image_u8_batch * batch) { if (batch) delete batch; }
2667
+ void clip_image_f32_batch_free(struct clip_image_f32_batch * batch) { if (batch) delete batch; }
2668
+
2669
+ size_t clip_image_f32_batch_n_images(const struct clip_image_f32_batch * batch) {
2670
+ return batch->entries.size();
2671
+ }
2672
+
2673
+ size_t clip_image_f32_batch_nx(const struct clip_image_f32_batch * batch, int idx) {
2674
+ if (idx < 0 || idx >= (int)batch->entries.size()) {
2675
+ LOG_ERR("%s: invalid index %d\n", __func__, idx);
2676
+ return 0;
2677
+ }
2678
+ return batch->entries[idx]->nx;
2679
+ }
2680
+
2681
+ size_t clip_image_f32_batch_ny(const struct clip_image_f32_batch * batch, int idx) {
2682
+ if (idx < 0 || idx >= (int)batch->entries.size()) {
2683
+ LOG_ERR("%s: invalid index %d\n", __func__, idx);
2684
+ return 0;
2685
+ }
2686
+ return batch->entries[idx]->ny;
2687
+ }
2688
+
2689
+ clip_image_f32 * clip_image_f32_get_img(const struct clip_image_f32_batch * batch, int idx) {
2690
+ if (idx < 0 || idx >= (int)batch->entries.size()) {
2691
+ LOG_ERR("%s: invalid index %d\n", __func__, idx);
2692
+ return nullptr;
2693
+ }
2694
+ return batch->entries[idx].get();
2695
+ }
2696
+
2697
+ void clip_build_img_from_pixels(const unsigned char * rgb_pixels, int nx, int ny, clip_image_u8 * img) {
2698
+ img->nx = nx;
2699
+ img->ny = ny;
2700
+ img->buf.resize(3 * nx * ny);
2701
+ memcpy(img->buf.data(), rgb_pixels, img->buf.size());
2702
+ }
2703
+
2704
+ bool clip_image_load_from_file(const char * fname, clip_image_u8 * img) {
2705
+ int nx, ny, nc;
2706
+ auto * data = stbi_load(fname, &nx, &ny, &nc, 3);
2707
+ if (!data) {
2708
+ LOG_ERR("%s: failed to load image '%s'\n", __func__, fname);
2709
+ return false;
2710
+ }
2711
+ clip_build_img_from_pixels(data, nx, ny, img);
2712
+ stbi_image_free(data);
2713
+ return true;
2714
+ }
2715
+
2716
+ bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img) {
2717
+ int nx, ny, nc;
2718
+ auto * data = stbi_load_from_memory(bytes, bytes_length, &nx, &ny, &nc, 3);
2719
+ if (!data) {
2720
+ LOG_ERR("%s: failed to decode image bytes\n", __func__);
2721
+ return false;
2722
+ }
2723
+ clip_build_img_from_pixels(data, nx, ny, img);
2724
+ stbi_image_free(data);
2725
+ return true;
2726
+ }
2727
+
2728
+ // Normalize image to float32 - careful with pytorch .to(model.device, dtype=torch.float16): that conversion sometimes reduces precision (32->16->32), sometimes not
2729
+ static void normalize_image_u8_to_f32(const clip_image_u8 & src, clip_image_f32 & dst, const float mean[3], const float std[3]) {
2730
+ dst.nx = src.nx;
2731
+ dst.ny = src.ny;
2732
+ dst.buf.resize(src.buf.size());
2733
+
2734
+ // TODO @ngxson : seems like this could be done more efficiently on cgraph
2735
+ for (size_t i = 0; i < src.buf.size(); ++i) {
2736
+ int c = i % 3; // rgb
2737
+ dst.buf[i] = (static_cast<float>(src.buf[i]) / 255.0f - mean[c]) / std[c];
2738
+ }
2739
+ }
2740
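+
+ // for instance, with mean = std = 0.5 per channel, a raw byte value of 255 maps to (1.0 - 0.5) / 0.5 = 1.0
+ // and 0 maps to -1.0, i.e. the normalized output lies in [-1, 1]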
+
2741
+ // set of tools to manipulate images
2742
+ // in the future, we can add HW acceleration by allowing this struct to access third-party libs such as ImageMagick or OpenCV
2743
+ struct image_manipulation {
2744
+ // Bilinear resize function
2745
+ static void bilinear_resize(const clip_image_u8& src, clip_image_u8& dst, int target_width, int target_height) {
2746
+ dst.nx = target_width;
2747
+ dst.ny = target_height;
2748
+ dst.buf.resize(3 * target_width * target_height);
2749
+
2750
+ float x_ratio = static_cast<float>(src.nx - 1) / target_width;
2751
+ float y_ratio = static_cast<float>(src.ny - 1) / target_height;
2752
+
2753
+ for (int y = 0; y < target_height; y++) {
2754
+ for (int x = 0; x < target_width; x++) {
2755
+ float px = x_ratio * x;
2756
+ float py = y_ratio * y;
2757
+ int x_floor = static_cast<int>(px);
2758
+ int y_floor = static_cast<int>(py);
2759
+ float x_lerp = px - x_floor;
2760
+ float y_lerp = py - y_floor;
2761
+
2762
+ for (int c = 0; c < 3; c++) {
2763
+ float top = lerp(
2764
+ static_cast<float>(src.buf[3 * (y_floor * src.nx + x_floor) + c]),
2765
+ static_cast<float>(src.buf[3 * (y_floor * src.nx + (x_floor + 1)) + c]),
2766
+ x_lerp
2767
+ );
2768
+ float bottom = lerp(
2769
+ static_cast<float>(src.buf[3 * ((y_floor + 1) * src.nx + x_floor) + c]),
2770
+ static_cast<float>(src.buf[3 * ((y_floor + 1) * src.nx + (x_floor + 1)) + c]),
2771
+ x_lerp
2772
+ );
2773
+ dst.buf[3 * (y * target_width + x) + c] = static_cast<uint8_t>(lerp(top, bottom, y_lerp));
2774
+ }
2775
+ }
2776
+ }
2777
+ }
2778
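+
+ // for instance, resizing a 100-pixel-wide row down to 50 pixels gives x_ratio = 99/50 = 1.98,
+ // so destination x = 10 samples px = 19.8 and blends source columns 19 and 20 with weights 0.2 and 0.8 respectively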
+
2779
+ // Bicubic resize function
2780
+ // part of image will be cropped if the aspect ratio is different
2781
+ static bool bicubic_resize(const clip_image_u8 & img, clip_image_u8 & dst, int target_width, int target_height) {
2782
+ const int nx = img.nx;
2783
+ const int ny = img.ny;
2784
+
2785
+ dst.nx = target_width;
2786
+ dst.ny = target_height;
2787
+ dst.buf.resize(3 * target_width * target_height);
2788
+
2789
+ float Cc;
2790
+ float C[5];
2791
+ float d0, d2, d3, a0, a1, a2, a3;
2792
+ int i, j, k, jj;
2793
+ int x, y;
2794
+ float dx, dy;
2795
+ float tx, ty;
2796
+
2797
+ tx = (float)nx / (float)target_width;
2798
+ ty = (float)ny / (float)target_height;
2799
+
2800
+ // Bicubic interpolation; adapted from ViT.cpp, inspired from :
2801
+ // -> https://github.com/yglukhov/bicubic-interpolation-image-processing/blob/master/libimage.c#L36
2802
+ // -> https://en.wikipedia.org/wiki/Bicubic_interpolation
2803
+
2804
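+ // for each output pixel, a 1-D cubic is first evaluated along x for the four neighbouring source rows (C[0..3]),
+ // and the same cubic form is then applied along y to combine them into the final value Cc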
+ for (i = 0; i < target_height; i++) {
2805
+ for (j = 0; j < target_width; j++) {
2806
+ x = (int)(tx * j);
2807
+ y = (int)(ty * i);
2808
+
2809
+ dx = tx * j - x;
2810
+ dy = ty * i - y;
2811
+
2812
+ for (k = 0; k < 3; k++) {
2813
+ for (jj = 0; jj <= 3; jj++) {
2814
+ d0 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x - 1, 0, nx - 1)) * 3 + k] - img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k];
2815
+ d2 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x + 1, 0, nx - 1)) * 3 + k] - img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k];
2816
+ d3 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x + 2, 0, nx - 1)) * 3 + k] - img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k];
2817
+ a0 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k];
2818
+
2819
+ a1 = -1.0 / 3 * d0 + d2 - 1.0 / 6 * d3;
2820
+ a2 = 1.0 / 2 * d0 + 1.0 / 2 * d2;
2821
+ a3 = -1.0 / 6 * d0 - 1.0 / 2 * d2 + 1.0 / 6 * d3;
2822
+
2823
+ C[jj] = a0 + a1 * dx + a2 * dx * dx + a3 * dx * dx * dx;
2824
+
2825
+ d0 = C[0] - C[1];
2826
+ d2 = C[2] - C[1];
2827
+ d3 = C[3] - C[1];
2828
+ a0 = C[1];
2829
+ a1 = -1.0 / 3 * d0 + d2 - 1.0 / 6 * d3;
2830
+ a2 = 1.0 / 2 * d0 + 1.0 / 2 * d2;
2831
+ a3 = -1.0 / 6 * d0 - 1.0 / 2 * d2 + 1.0 / 6 * d3;
2832
+ Cc = a0 + a1 * dy + a2 * dy * dy + a3 * dy * dy * dy;
2833
+
2834
+ const uint8_t Cc2 = std::min(std::max(std::round(Cc), 0.0f), 255.0f);
2835
+ dst.buf[(i * target_width + j) * 3 + k] = float(Cc2);
2836
+ }
2837
+ }
2838
+ }
2839
+ }
2840
+
2841
+ return true;
2842
+ }
2843
+
2844
+ // llava-1.6 type of resize_and_pad
2845
+ // if the ratio is not 1:1, padding with pad_color will be applied
2846
+ // pad_color is an RGB triplet; default is {0, 0, 0} (black)
2847
+ static void resize_and_pad_image(const clip_image_u8 & image, clip_image_u8 & dst, const clip_image_size & target_resolution, std::array<uint8_t, 3> pad_color = {0, 0, 0}) {
2848
+ int target_width = target_resolution.width;
2849
+ int target_height = target_resolution.height;
2850
+
2851
+ float scale_w = static_cast<float>(target_width) / image.nx;
2852
+ float scale_h = static_cast<float>(target_height) / image.ny;
2853
+
2854
+ int new_width, new_height;
2855
+
2856
+ if (scale_w < scale_h) {
2857
+ new_width = target_width;
2858
+ new_height = std::min(static_cast<int>(std::ceil(image.ny * scale_w)), target_height);
2859
+ } else {
2860
+ new_height = target_height;
2861
+ new_width = std::min(static_cast<int>(std::ceil(image.nx * scale_h)), target_width);
2862
+ }
2863
+
2864
+ clip_image_u8 resized_image;
2865
+ bicubic_resize(image, resized_image, new_width, new_height);
2866
+
2867
+ clip_image_u8 padded_image;
2868
+ padded_image.nx = target_width;
2869
+ padded_image.ny = target_height;
2870
+ padded_image.buf.resize(3 * target_width * target_height);
2871
+
2872
+ // Fill the padded image with the fill color
2873
+ for (size_t i = 0; i < padded_image.buf.size(); i += 3) {
2874
+ padded_image.buf[i] = pad_color[0];
2875
+ padded_image.buf[i + 1] = pad_color[1];
2876
+ padded_image.buf[i + 2] = pad_color[2];
2877
+ }
2878
+
2879
+ // Calculate padding offsets
2880
+ int pad_x = (target_width - new_width) / 2;
2881
+ int pad_y = (target_height - new_height) / 2;
2882
+
2883
+ // Copy the resized image into the center of the padded buffer
2884
+ for (int y = 0; y < new_height; ++y) {
2885
+ for (int x = 0; x < new_width; ++x) {
2886
+ for (int c = 0; c < 3; ++c) {
2887
+ padded_image.buf[3 * ((y + pad_y) * target_width + (x + pad_x)) + c] = resized_image.buf[3 * (y * new_width + x) + c];
2888
+ }
2889
+ }
2890
+ }
2891
+ dst = std::move(padded_image);
2892
+ }
2893
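+
+ // for instance, fitting an 800x600 image into a 336x336 target: scale_w = 0.42 < scale_h = 0.56,
+ // so the image is resized to 336x252 and centered with 42-pixel bands of pad_color above and below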
+
2894
+ static void crop_image(const clip_image_u8 & image, clip_image_u8 & dst, int x, int y, int w, int h) {
2895
+ dst.nx = w;
2896
+ dst.ny = h;
2897
+ dst.buf.resize(3 * w * h);
2898
+
2899
+ for (int i = 0; i < h; ++i) {
2900
+ for (int j = 0; j < w; ++j) {
2901
+ int src_idx = 3 * ((y + i)*image.nx + (x + j));
2902
+ int dst_idx = 3 * (i*w + j);
2903
+ dst.buf[dst_idx] = image.buf[src_idx];
2904
+ dst.buf[dst_idx + 1] = image.buf[src_idx + 1];
2905
+ dst.buf[dst_idx + 2] = image.buf[src_idx + 2];
2906
+ }
2907
+ }
2908
+ }
2909
+
2910
+ // calculate the size of the **resized** image, while preserving the aspect ratio
2911
+ // the calculated size will be aligned to the nearest multiple of align_size
2912
+ // if H or W size is larger than max_dimension, it will be resized to max_dimension
2913
+ static clip_image_size calc_size_preserved_ratio(const clip_image_size & inp_size, const int align_size, const int max_dimension) {
2914
+ if (inp_size.width <= 0 || inp_size.height <= 0 || align_size <= 0 || max_dimension <= 0) {
2915
+ return {0, 0};
2916
+ }
2917
+
2918
+ float scale = std::min(1.0f, std::min(static_cast<float>(max_dimension) / inp_size.width,
2919
+ static_cast<float>(max_dimension) / inp_size.height));
2920
+
2921
+ float target_width_f = static_cast<float>(inp_size.width) * scale;
2922
+ float target_height_f = static_cast<float>(inp_size.height) * scale;
2923
+
2924
+ int aligned_width = CLIP_ALIGN((int)target_width_f, align_size);
2925
+ int aligned_height = CLIP_ALIGN((int)target_height_f, align_size);
2926
+
2927
+ return {aligned_width, aligned_height};
2928
+ }
2929
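+
+ // for instance, a 1000x750 input with max_dimension = 512 gives scale = min(1, 512/1000, 512/750) = 0.512,
+ // i.e. a 512x384 target which is then aligned to a multiple of align_size via CLIP_ALIGN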
+
2930
+ private:
2931
+ static inline int clip(int x, int lower, int upper) {
2932
+ return std::max(lower, std::min(x, upper));
2933
+ }
2934
+
2935
+ // Linear interpolation between two points
2936
+ static inline float lerp(float s, float e, float t) {
2937
+ return s + (e - s) * t;
2938
+ }
2939
+ };
2940
+
2941
+ /**
2942
+ * implementation of LLaVA-UHD:
2943
+ * - https://arxiv.org/pdf/2403.11703
2944
+ * - https://github.com/thunlp/LLaVA-UHD
2945
+ * - https://github.com/thunlp/LLaVA-UHD/blob/302301bc2175f7e717fb8548516188e89f649753/llava_uhd/train/llava-uhd/slice_logic.py#L118
2946
+ *
2947
+ * overview:
2948
+ * - an image always has a single overview (downscaled image)
2949
+ * - an image can have 0 or multiple slices, depending on the image size
2950
+ * - each slice can then be considered as a separate image
2951
+ *
2952
+ * for example:
2953
+ *
2954
+ * [overview] --> [slice 1] --> [slice 2]
2955
+ * | |
2956
+ * +--> [slice 3] --> [slice 4]
2957
+ */
2958
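+ // for instance, assuming a 448x448 slice size: a 448x448 input yields just the overview (1 image),
+ // while a 1344x896 input yields the overview plus 6 slices (a 3x2 grid), i.e. 7 images fed to the encoder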
+ struct llava_uhd {
2959
+ struct slice_coordinates {
2960
+ int x;
2961
+ int y;
2962
+ clip_image_size size;
2963
+ };
2964
+
2965
+ struct slice_instructions {
2966
+ clip_image_size overview_size; // size of downscaled image
2967
+ clip_image_size refined_size; // size of image right before slicing (must be multiple of slice size)
2968
+ clip_image_size grid_size; // grid_size.width * grid_size.height = number of slices
2969
+ std::vector<slice_coordinates> slices;
2970
+ bool padding_refined = false; // if true, the refined image will be padded to the grid size (e.g. llava-1.6)
2971
+ };
2972
+
2973
+ static int get_max_slices(struct clip_ctx * ctx) {
2974
+ if (clip_is_minicpmv(ctx)) {
2975
+ return 9;
2976
+ }
2977
+ return 0;
2978
+ }
2979
+
2980
+ static slice_instructions get_slice_instructions(struct clip_ctx * ctx, const clip_image_size & original_size) {
2981
+ slice_instructions res;
2982
+ const int patch_size = clip_get_patch_size(ctx);
2983
+ const int slice_size = clip_get_image_size(ctx);
2984
+ const int max_slice_nums = get_max_slices(ctx);
2985
+ const int original_width = original_size.width;
2986
+ const int original_height = original_size.height;
2987
+ const float log_ratio = log((float)original_width / original_height);
2988
+ const float ratio = (float)original_width * original_height / (slice_size * slice_size);
2989
+ const int multiple = fmin(ceil(ratio), max_slice_nums);
2990
+ const bool has_slices = (multiple > 1);
2991
+ const bool has_pinpoints = !ctx->vision_model.hparams.image_grid_pinpoints.empty();
2992
+
2993
+ if (has_pinpoints) {
2994
+ // has pinpoints, use them to calculate the grid size (e.g. llava-1.6)
2995
+ auto refine_size = llava_uhd::select_best_resolution(
2996
+ ctx->vision_model.hparams.image_grid_pinpoints,
2997
+ original_size);
2998
+ res.overview_size = clip_image_size{slice_size, slice_size};
2999
+ res.refined_size = refine_size;
3000
+ res.grid_size = clip_image_size{0, 0};
3001
+ res.padding_refined = true;
3002
+
3003
+ for (int y = 0; y < refine_size.height; y += slice_size) {
3004
+ for (int x = 0; x < refine_size.width; x += slice_size) {
3005
+ slice_coordinates slice;
3006
+ slice.x = x;
3007
+ slice.y = y;
3008
+ slice.size.width = std::min(slice_size, refine_size.width - x);
3009
+ slice.size.height = std::min(slice_size, refine_size.height - y);
3010
+ res.slices.push_back(slice);
3011
+ if (x == 0) {
3012
+ res.grid_size.width++;
3013
+ }
3014
+ }
3015
+ res.grid_size.height++;
3016
+ }
3017
+
3018
+ return res;
3019
+ }
3020
+
3021
+ // no pinpoints, dynamically calculate the grid size (e.g. minicpmv)
3022
+
3023
+ auto best_size = get_best_resize(original_size, slice_size, patch_size, !has_slices);
3024
+ res.overview_size = best_size;
3025
+
3026
+ if (!has_slices) {
3027
+ // skip slicing logic
3028
+ res.refined_size = clip_image_size{0, 0};
3029
+ res.grid_size = clip_image_size{0, 0};
3030
+
3031
+ } else {
3032
+ auto best_grid = get_best_grid(max_slice_nums, multiple, log_ratio);
3033
+ auto refine_size = get_refine_size(original_size, best_grid, slice_size, patch_size, true);
3034
+ res.grid_size = best_grid;
3035
+ res.refined_size = refine_size;
3036
+
3037
+ int width = refine_size.width;
3038
+ int height = refine_size.height;
3039
+ int grid_x = int(width / best_grid.width);
3040
+ int grid_y = int(height / best_grid.height);
3041
+ for (int patches_y = 0, ic = 0;
3042
+ patches_y < refine_size.height && ic < best_grid.height;
3043
+ patches_y += grid_y, ic += 1) {
3044
+ for (int patches_x = 0, jc = 0;
3045
+ patches_x < refine_size.width && jc < best_grid.width;
3046
+ patches_x += grid_x, jc += 1) {
3047
+ slice_coordinates slice;
3048
+ slice.x = patches_x;
3049
+ slice.y = patches_y;
3050
+ slice.size.width = grid_x;
3051
+ slice.size.height = grid_y;
3052
+ res.slices.push_back(slice);
3053
+ // LOG_INF("slice %d: %d %d %d %d\n", ic, patches_i, patches_j, grid_x, grid_y);
3054
+ }
3055
+ }
3056
+ }
3057
+
3058
+ return res;
3059
+ }
3060
+
3061
+ static std::vector<clip_image_u8_ptr> slice_image(const clip_image_u8 * img, const slice_instructions & inst) {
3062
+ std::vector<clip_image_u8_ptr> output;
3063
+
3064
+ // resize to overview size
3065
+ clip_image_u8_ptr resized_img(clip_image_u8_init());
3066
+ image_manipulation::bicubic_resize(*img, *resized_img, inst.overview_size.width, inst.overview_size.height);
3067
+ output.push_back(std::move(resized_img));
3068
+ if (inst.slices.empty()) {
3069
+ // no slices, just return the resized image
3070
+ return output;
3071
+ }
3072
+
3073
+ // resize to refined size
3074
+ clip_image_u8_ptr refined_img(clip_image_u8_init());
3075
+ if (inst.padding_refined) {
3076
+ image_manipulation::resize_and_pad_image(*img, *refined_img, inst.refined_size);
3077
+ } else {
3078
+ image_manipulation::bilinear_resize(*img, *refined_img, inst.refined_size.width, inst.refined_size.height);
3079
+ }
3080
+
3081
+ // create slices
3082
+ for (const auto & slice : inst.slices) {
3083
+ int x = slice.x;
3084
+ int y = slice.y;
3085
+ int w = slice.size.width;
3086
+ int h = slice.size.height;
3087
+
3088
+ clip_image_u8_ptr img_slice(clip_image_u8_init());
3089
+ image_manipulation::crop_image(*refined_img, *img_slice, x, y, w, h);
3090
+ output.push_back(std::move(img_slice));
3091
+ }
3092
+
3093
+ return output;
3094
+ }
3095
+
3096
+ private:
3097
+ static clip_image_size get_best_resize(const clip_image_size & original_size, int scale_resolution, int patch_size, bool allow_upscale = false) {
3098
+ int width = original_size.width;
3099
+ int height = original_size.height;
3100
+ if ((width * height > scale_resolution * scale_resolution) || allow_upscale) {
3101
+ float r = static_cast<float>(width) / height;
3102
+ height = static_cast<int>(scale_resolution / std::sqrt(r));
3103
+ width = static_cast<int>(height * r);
3104
+ }
3105
+ clip_image_size res;
3106
+ res.width = ensure_divide(width, patch_size);
3107
+ res.height = ensure_divide(height, patch_size);
3108
+ return res;
3109
+ }
3110
+
3111
+ /**
3112
+ * Selects the best resolution from a list of possible resolutions based on the original size.
3113
+ *
3114
+ * @param original_size The original size of the image
3115
+ * @param possible_resolutions A list of possible resolutions
3116
+ * @return The best fit resolution
3117
+ */
3118
+ static clip_image_size select_best_resolution(const clip_image_size & original_size, const std::vector<clip_image_size> & possible_resolutions) {
3119
+ int original_width = original_size.width;
3120
+ int original_height = original_size.height;
3121
+ clip_image_size best_fit;
3122
+ int max_effective_resolution = 0;
3123
+ int min_wasted_resolution = std::numeric_limits<int>::max();
3124
+
3125
+ for (const auto & resolution : possible_resolutions) {
3126
+ int width = resolution.width;
3127
+ int height = resolution.height;
3128
+ float scale = std::min(static_cast<float>(width) / original_width, static_cast<float>(height) / original_height);
3129
+ int downscaled_width = static_cast<int>(original_width * scale);
3130
+ int downscaled_height = static_cast<int>(original_height * scale);
3131
+ int effective_resolution = std::min(downscaled_width * downscaled_height, original_width * original_height);
3132
+ int wasted_resolution = (width * height) - effective_resolution;
3133
+ // LOG_INF("resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution);
3134
+ if (effective_resolution > max_effective_resolution || (effective_resolution == max_effective_resolution && wasted_resolution < min_wasted_resolution)) {
3135
+ max_effective_resolution = effective_resolution;
3136
+ min_wasted_resolution = wasted_resolution;
3137
+ best_fit = resolution;
3138
+ }
3139
+ }
3140
+
3141
+ return best_fit;
3142
+ }
3143
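+
+ // for instance, with a 640x480 original and hypothetical candidates 672x672 and 336x1344:
+ // 672x672 keeps the full 640x480 (effective 307200, wasted 144384) while 336x1344 downscales
+ // the image to 336x252 (effective 84672), so 672x672 is selected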
+
3144
+ // used by llava 1.6 with custom list of pinpoints
3145
+ static clip_image_size select_best_resolution(const std::vector<int32_t> & pinpoints, const clip_image_size & original_size) {
3146
+ std::vector<clip_image_size> possible_resolutions; // TODO @ngxson : construct this inside hparams, not here
3147
+ for (size_t i = 0; i < pinpoints.size(); i += 2) {
3148
+ possible_resolutions.push_back(clip_image_size{pinpoints[i], pinpoints[i+1]});
3149
+ }
3150
+ return select_best_resolution(original_size, possible_resolutions);
3151
+ }
3152
+
3153
+ static int ensure_divide(int length, int patch_size) {
3154
+ return std::max(static_cast<int>(std::round(static_cast<float>(length) / patch_size) * patch_size), patch_size);
3155
+ }
3156
+
3157
+ static clip_image_size get_refine_size(const clip_image_size & original_size, const clip_image_size & grid, int scale_resolution, int patch_size, bool allow_upscale = false) {
3158
+ int width = original_size.width;
3159
+ int height = original_size.height;
3160
+ int grid_x = grid.width;
3161
+ int grid_y = grid.height;
3162
+
3163
+ int refine_width = ensure_divide(width, grid_x);
3164
+ int refine_height = ensure_divide(height, grid_y);
3165
+
3166
+ clip_image_size grid_size;
3167
+ grid_size.width = refine_width / grid_x;
3168
+ grid_size.height = refine_height / grid_y;
3169
+
3170
+ auto best_grid_size = get_best_resize(grid_size, scale_resolution, patch_size, allow_upscale);
3171
+ int best_grid_width = best_grid_size.width;
3172
+ int best_grid_height = best_grid_size.height;
3173
+
3174
+ clip_image_size refine_size;
3175
+ refine_size.width = best_grid_width * grid_x;
3176
+ refine_size.height = best_grid_height * grid_y;
3177
+ return refine_size;
3178
+ }
3179
+
3180
+ static clip_image_size get_best_grid(const int max_slice_nums, const int multiple, const float log_ratio) {
3181
+ std::vector<int> candidate_split_grids_nums;
3182
+ for (int i : {multiple - 1, multiple, multiple + 1}) {
3183
+ if (i == 1 || i > max_slice_nums) {
3184
+ continue;
3185
+ }
3186
+ candidate_split_grids_nums.push_back(i);
3187
+ }
3188
+
3189
+ std::vector<clip_image_size> candidate_grids;
3190
+ for (int split_grids_nums : candidate_split_grids_nums) {
3191
+ int m = 1;
3192
+ while (m <= split_grids_nums) {
3193
+ if (split_grids_nums % m == 0) {
3194
+ candidate_grids.push_back(clip_image_size{m, split_grids_nums / m});
3195
+ }
3196
+ ++m;
3197
+ }
3198
+ }
3199
+
3200
+ clip_image_size best_grid{1, 1};
3201
+ float min_error = std::numeric_limits<float>::infinity();
3202
+ for (const auto& grid : candidate_grids) {
3203
+ float error = std::abs(log_ratio - std::log(1.0 * grid.width / grid.height));
3204
+ if (error < min_error) {
3205
+ best_grid = grid;
3206
+ min_error = error;
3207
+ }
3208
+ }
3209
+ return best_grid;
3210
+ }
3211
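+
+ // for instance, with multiple = 6 and log_ratio = log(1344/896) = log(1.5), the candidate grids include
+ // 2x3 and 3x2; the 3x2 grid has log(3/2) = log(1.5), giving zero error, so it is chosen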
+ };
3212
+
3213
+ // returns the normalized float tensor for llava-1.5; for spatial_unpad with anyres processing (llava-1.6), it returns the normalized image patch tensors as a vector
3214
+ // res_imgs memory is being allocated here, previous allocations will be freed if found
3215
+ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, struct clip_image_f32_batch * res_imgs) {
3216
+ clip_image_size original_size{img->nx, img->ny};
3217
+ bool pad_to_square = true;
3218
+ auto & params = ctx->vision_model.hparams;
3219
+ // The model config contains all we need to decide how to preprocess; here we automatically switch to the new llava-1.6 preprocessing
3220
+ if (params.mm_patch_merge_type == PATCH_MERGE_SPATIAL_UNPAD) {
3221
+ pad_to_square = false;
3222
+ }
3223
+
3224
+ if (clip_is_minicpmv(ctx)) {
3225
+ auto const inst = llava_uhd::get_slice_instructions(ctx, original_size);
3226
+ std::vector<clip_image_u8_ptr> imgs = llava_uhd::slice_image(img, inst);
3227
+
3228
+ for (size_t i = 0; i < imgs.size(); ++i) {
3229
+ // clip_image_save_to_bmp(*imgs[i], "slice_" + std::to_string(i) + ".bmp");
3230
+ clip_image_f32_ptr res(clip_image_f32_init());
3231
+ normalize_image_u8_to_f32(*imgs[i], *res, ctx->image_mean, ctx->image_std);
3232
+ res_imgs->entries.push_back(std::move(res));
3233
+ }
3234
+
3235
+ res_imgs->grid_x = inst.grid_size.width;
3236
+ res_imgs->grid_y = inst.grid_size.height;
3237
+ return true;
3238
+
3239
+ } else if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type == PROJECTOR_TYPE_QWEN25VL) {
3240
+ clip_image_u8 resized;
3241
+ auto patch_size = params.patch_size * 2;
3242
+ auto new_size = image_manipulation::calc_size_preserved_ratio(original_size, patch_size, params.image_size);
3243
+ image_manipulation::bicubic_resize(*img, resized, new_size.width, new_size.height);
3244
+
3245
+ clip_image_f32_ptr img_f32(clip_image_f32_init());
3246
+ // clip_image_f32_ptr res(clip_image_f32_init());
3247
+ normalize_image_u8_to_f32(resized, *img_f32, ctx->image_mean, ctx->image_std);
3248
+ // res_imgs->data[0] = *res;
3249
+ res_imgs->entries.push_back(std::move(img_f32));
3250
+ return true;
3251
+ }
3252
+ else if (ctx->proj_type == PROJECTOR_TYPE_GLM_EDGE
3253
+ || ctx->proj_type == PROJECTOR_TYPE_GEMMA3
3254
+ || ctx->proj_type == PROJECTOR_TYPE_IDEFICS3
3255
+ || ctx->proj_type == PROJECTOR_TYPE_INTERNVL // TODO @ngxson : support dynamic resolution
3256
+ ) {
3257
+ clip_image_u8 resized_image;
3258
+ int sz = params.image_size;
3259
+ image_manipulation::resize_and_pad_image(*img, resized_image, {sz, sz});
3260
+ clip_image_f32_ptr img_f32(clip_image_f32_init());
3261
+ //clip_image_save_to_bmp(resized_image, "resized.bmp");
3262
+ normalize_image_u8_to_f32(resized_image, *img_f32, ctx->image_mean, ctx->image_std);
3263
+ res_imgs->entries.push_back(std::move(img_f32));
3264
+ return true;
3265
+
3266
+ } else if (ctx->proj_type == PROJECTOR_TYPE_PIXTRAL) {
3267
+ clip_image_u8 resized_image;
3268
+ auto new_size = image_manipulation::calc_size_preserved_ratio(original_size, params.patch_size, params.image_size);
3269
+ image_manipulation::bilinear_resize(*img, resized_image, new_size.width, new_size.height);
3270
+ clip_image_f32_ptr img_f32(clip_image_f32_init());
3271
+ normalize_image_u8_to_f32(resized_image, *img_f32, ctx->image_mean, ctx->image_std);
3272
+ res_imgs->entries.push_back(std::move(img_f32));
3273
+ return true;
3274
+
3275
+ } else if (ctx->proj_type == PROJECTOR_TYPE_LLAMA4) {
3276
+ LM_GGML_ASSERT(!params.image_grid_pinpoints.empty());
3277
+ auto const inst = llava_uhd::get_slice_instructions(ctx, original_size);
3278
+ std::vector<clip_image_u8_ptr> imgs = llava_uhd::slice_image(img, inst);
3279
+
3280
+ for (size_t i = 0; i < imgs.size(); ++i) {
3281
+ clip_image_f32_ptr res(clip_image_f32_init());
3282
+ normalize_image_u8_to_f32(*imgs[i], *res, ctx->image_mean, ctx->image_std);
3283
+ res_imgs->entries.push_back(std::move(res));
3284
+ }
3285
+
3286
+ res_imgs->grid_x = inst.grid_size.width;
3287
+ res_imgs->grid_y = inst.grid_size.height;
3288
+ return true;
3289
+
3290
+ }
3291
+
3292
+ // the logic below is to pad the shorter side to the longer side with a background color: rgb(122, 116, 104)
3293
+ // see https://github.com/haotian-liu/LLaVA/blob/e854a2bf85118c504f6f16bf5c3c7c92f8fa8c6b/llava/conversation.py#L113-L156
3294
+
3295
+ clip_image_u8_ptr temp(clip_image_u8_init()); // we will keep the input image data here temporarily
3296
+
3297
+ if (pad_to_square) {
3298
+ // for llava-1.5, we resize the image to a square and pad the shorter side with a background color
3299
+ // see https://github.com/haotian-liu/LLaVA/blob/e854a2bf85118c504f6f16bf5c3c7c92f8fa8c6b/llava/conversation.py#L113-L156
3300
+ const int longer_side = std::max(img->nx, img->ny);
3301
+ temp->nx = longer_side;
3302
+ temp->ny = longer_side;
3303
+ temp->buf.resize(3 * longer_side * longer_side);
3304
+
3305
+ // background color in RGB from LLaVA (this is the mean rgb color * 255)
3306
+ const std::array<uint8_t, 3> pad_color = {122, 116, 104};
3307
+
3308
+ // resize the image to the target_size
3309
+ image_manipulation::resize_and_pad_image(*img, *temp, clip_image_size{params.image_size, params.image_size}, pad_color);
3310
+
3311
+ clip_image_f32_ptr res(clip_image_f32_init());
3312
+ normalize_image_u8_to_f32(*temp, *res, ctx->image_mean, ctx->image_std);
3313
+ res_imgs->entries.push_back(std::move(res));
3314
+ return true;
3315
+
3316
+ } else if (!params.image_grid_pinpoints.empty()) {
3317
+ // "spatial_unpad" with "anyres" processing for llava-1.6
3318
+ auto const inst = llava_uhd::get_slice_instructions(ctx, original_size);
3319
+ std::vector<clip_image_u8_ptr> imgs = llava_uhd::slice_image(img, inst);
3320
+
3321
+ for (size_t i = 0; i < imgs.size(); ++i) {
3322
+ // clip_image_save_to_bmp(*imgs[i], "slice_" + std::to_string(i) + ".bmp");
3323
+ clip_image_f32_ptr res(clip_image_f32_init());
3324
+ normalize_image_u8_to_f32(*imgs[i], *res, ctx->image_mean, ctx->image_std);
3325
+ res_imgs->entries.push_back(std::move(res));
3326
+ }
3327
+
3328
+ return true;
3329
+
3330
+ }
3331
+
3332
+ LM_GGML_ASSERT(false && "Unknown image preprocessing type");
3333
+ }
3334
+
3335
+ lm_ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx) {
3336
+ return ctx->vision_model.image_newline;
3337
+ }
3338
+
3339
+ void clip_free(clip_ctx * ctx) {
3340
+ if (ctx == nullptr) {
3341
+ return;
3342
+ }
3343
+ delete ctx;
3344
+ }
3345
+
3346
+ // deprecated
3347
+ size_t clip_embd_nbytes(const struct clip_ctx * ctx) {
3348
+ const int32_t nx = ctx->vision_model.hparams.image_size;
3349
+ const int32_t ny = ctx->vision_model.hparams.image_size;
3350
+ return clip_embd_nbytes_by_img(ctx, nx, ny);
3351
+ }
3352
+
3353
+ size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_w, int img_h) {
3354
+ clip_image_f32 img;
3355
+ img.nx = img_w;
3356
+ img.ny = img_h;
3357
+ return clip_n_output_tokens(ctx, &img) * clip_n_mmproj_embd(ctx) * sizeof(float);
3358
+ }
3359
+
3360
+ int32_t clip_get_image_size(const struct clip_ctx * ctx) {
3361
+ return ctx->vision_model.hparams.image_size;
3362
+ }
3363
+
3364
+ int32_t clip_get_patch_size(const struct clip_ctx * ctx) {
3365
+ return ctx->vision_model.hparams.patch_size;
3366
+ }
3367
+
3368
+ int32_t clip_get_hidden_size(const struct clip_ctx * ctx) {
3369
+ return ctx->vision_model.hparams.n_embd;
3370
+ }
3371
+
3372
+ const char * clip_patch_merge_type(const struct clip_ctx * ctx) {
3373
+ return ctx->vision_model.hparams.mm_patch_merge_type == PATCH_MERGE_SPATIAL_UNPAD ? "spatial_unpad" : "flat";
3374
+ }
3375
+
3376
+ const int32_t * clip_image_grid(const struct clip_ctx * ctx) {
3377
+ if (ctx->vision_model.hparams.image_grid_pinpoints.size()) {
3378
+ return &ctx->vision_model.hparams.image_grid_pinpoints.front();
3379
+ }
3380
+ return nullptr;
3381
+ }
3382
+
3383
+ size_t get_clip_image_grid_size(const struct clip_ctx * ctx) {
3384
+ return ctx->vision_model.hparams.image_grid_pinpoints.size();
3385
+ }
3386
+
3387
+ int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 * img) {
3388
+ const auto & params = ctx->vision_model.hparams;
3389
+ const int n_total = clip_n_output_tokens(ctx, img);
3390
+ if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type == PROJECTOR_TYPE_QWEN25VL) {
3391
+ return img->nx / (params.patch_size * 2) + (int)(img->nx % params.patch_size > 0);
3392
+ }
3393
+ return n_total;
3394
+ }
3395
+
3396
+ int clip_n_output_tokens_y(const struct clip_ctx * ctx, struct clip_image_f32 * img) {
3397
+ const auto & params = ctx->vision_model.hparams;
3398
+ if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type == PROJECTOR_TYPE_QWEN25VL) {
3399
+ return img->ny / (params.patch_size * 2) + (int)(img->ny % params.patch_size > 0);
3400
+ }
3401
+ return 1;
3402
+ }
3403
+
3404
+ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * img) {
3405
+ const auto & params = ctx->vision_model.hparams;
3406
+
3407
+ int n_patches = (params.image_size / params.patch_size) * (params.image_size / params.patch_size);
3408
+ int scale_factor = ctx->vision_model.hparams.proj_scale_factor;
3409
+
3410
+ if (ctx->proj_type == PROJECTOR_TYPE_LDP
3411
+ || ctx->proj_type == PROJECTOR_TYPE_LDPV2
3412
+ || ctx->proj_type == PROJECTOR_TYPE_GLM_EDGE) {
3413
+ n_patches /= 4;
3414
+ if (ctx->vision_model.mm_glm_tok_boi) {
3415
+ n_patches += 2; // for BOI and EOI token embeddings
3416
+ }
3417
+ } else if (ctx->proj_type == PROJECTOR_TYPE_MINICPMV) {
3418
+ if (ctx->minicpmv_version == 2) {
3419
+ n_patches = 96;
3420
+ }
3421
+ else if (ctx->minicpmv_version == 3) {
3422
+ n_patches = 64;
3423
+ }
3424
+ else if (ctx->minicpmv_version == 4) {
3425
+ n_patches = 64;
3426
+ }
3427
+ else {
3428
+ LM_GGML_ABORT("Unknown minicpmv version");
3429
+ }
3430
+ } else if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type == PROJECTOR_TYPE_QWEN25VL) {
3431
+ int patch_size = params.patch_size * 2;
3432
+ int x_patch = img->nx / patch_size + (int)(img->nx % patch_size > 0);
3433
+ int y_patch = img->ny / patch_size + (int)(img->ny % patch_size > 0);
3434
+ n_patches = x_patch * y_patch;
3435
+ } else if (ctx->proj_type == PROJECTOR_TYPE_GEMMA3) {
3436
+ int n_per_side = params.image_size / params.patch_size;
3437
+ int n_per_side_2d_pool = n_per_side / params.proj_scale_factor;
3438
+ n_patches = n_per_side_2d_pool * n_per_side_2d_pool;
3439
+ } else if (ctx->proj_type == PROJECTOR_TYPE_IDEFICS3 || ctx->proj_type == PROJECTOR_TYPE_INTERNVL) {
3440
+ // both W and H are divided by proj_scale_factor
3441
+ n_patches /= (params.proj_scale_factor * params.proj_scale_factor);
3442
+ } else if (ctx->proj_type == PROJECTOR_TYPE_PIXTRAL) {
3443
+ int n_merge = params.spatial_merge_size;
3444
+ int n_patches_x = img->nx / params.patch_size / (n_merge > 0 ? n_merge : 1);
3445
+ int n_patches_y = img->ny / params.patch_size / (n_merge > 0 ? n_merge : 1);
3446
+ n_patches = n_patches_y*n_patches_x + n_patches_y - 1; // + one [IMG_BREAK] per row, except the last row
3447
+ } else if (ctx->proj_type == PROJECTOR_TYPE_LLAMA4) {
3448
+ n_patches /= (scale_factor * scale_factor);
3449
+ } else if (ctx->proj_type == PROJECTOR_TYPE_ULTRAVOX) {
3450
+ const int proj_stack_factor = ctx->vision_model.hparams.proj_stack_factor;
3451
+ const int n_len = CLIP_ALIGN(img->nx, proj_stack_factor);
3452
+ n_patches = n_len / proj_stack_factor / 2;
3453
+ }
3454
+
3455
+ return n_patches;
3456
+ }
3457
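+
+ // for instance, under PROJECTOR_TYPE_GEMMA3 with image_size = 896, patch_size = 14 and proj_scale_factor = 4:
+ // 896/14 = 64 patches per side are pooled down to 16 per side, giving 16*16 = 256 output tokens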
+
3458
+ static std::vector<std::vector<std::vector<float>>> get_1d_sincos_pos_embed_from_grid_new(int embed_dim, const std::vector<std::vector<float>> & pos) {
3459
+ assert(embed_dim % 2 == 0);
3460
+ int H = pos.size();
3461
+ int W = pos[0].size();
3462
+
3463
+ std::vector<float> omega(embed_dim / 2);
3464
+ for (int i = 0; i < embed_dim / 2; ++i) {
3465
+ omega[i] = 1.0 / pow(10000.0, static_cast<float>(i) / (embed_dim / 2));
3466
+ }
3467
+
3468
+ std::vector<std::vector<std::vector<float>>> emb(H, std::vector<std::vector<float>>(W, std::vector<float>(embed_dim)));
3469
+ for (int h = 0; h < H; ++h) {
3470
+ for (int w = 0; w < W; ++w) {
3471
+ for (int d = 0; d < embed_dim / 2; ++d) {
3472
+ float out_value = pos[h][w] * omega[d];
3473
+ emb[h][w][d] = sin(out_value);
3474
+ emb[h][w][d + embed_dim / 2] = cos(out_value);
3475
+ }
3476
+ }
3477
+ }
3478
+
3479
+ return emb;
3480
+ }
3481
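+
+ // i.e. for position p and dimension index d in [0, D/2): emb[..][d] = sin(p * omega_d) and
+ // emb[..][d + D/2] = cos(p * omega_d), with omega_d = 1 / 10000^(d / (D/2)) -- the standard sincos embedding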
+
3482
+ static std::vector<std::vector<std::vector<float>>> get_2d_sincos_pos_embed_from_grid(int embed_dim, const std::vector<std::vector<std::vector<float>>> & grid) {
3483
+ assert(embed_dim % 2 == 0);
3484
+ std::vector<std::vector<std::vector<float>>> emb_h = get_1d_sincos_pos_embed_from_grid_new(embed_dim / 2, grid[0]); // (H, W, D/2)
3485
+ std::vector<std::vector<std::vector<float>>> emb_w = get_1d_sincos_pos_embed_from_grid_new(embed_dim / 2, grid[1]); // (H, W, D/2)
3486
+
3487
+ int H = emb_h.size();
3488
+ int W = emb_h[0].size();
3489
+ std::vector<std::vector<std::vector<float>>> emb(H, std::vector<std::vector<float>>(W, std::vector<float>(embed_dim)));
3490
+
3491
+ for (int h = 0; h < H; ++h) {
3492
+ for (int w = 0; w < W; ++w) {
3493
+ for (int d = 0; d < embed_dim / 2; ++d) {
3494
+ emb[h][w][d] = emb_h[h][w][d];
3495
+ emb[h][w][d + embed_dim / 2] = emb_w[h][w][d];
3496
+ }
3497
+ }
3498
+ }
3499
+ return emb;
3500
+ }
3501
+
3502
+ static std::vector<std::vector<float>> get_2d_sincos_pos_embed(int embed_dim, const std::pair<int, int> image_size) {
3503
+ int grid_h_size = image_size.first;
3504
+ int grid_w_size = image_size.second;
3505
+
3506
+ std::vector<float> grid_h(grid_h_size);
3507
+ std::vector<float> grid_w(grid_w_size);
3508
+
3509
+ for (int i = 0; i < grid_h_size; ++i) {
3510
+ grid_h[i] = static_cast<float>(i);
3511
+ }
3512
+ for (int i = 0; i < grid_w_size; ++i) {
3513
+ grid_w[i] = static_cast<float>(i);
3514
+ }
3515
+
3516
+ std::vector<std::vector<float>> grid(grid_h_size, std::vector<float>(grid_w_size));
3517
+ for (int h = 0; h < grid_h_size; ++h) {
3518
+ for (int w = 0; w < grid_w_size; ++w) {
3519
+ grid[h][w] = grid_w[w];
3520
+ }
3521
+ }
3522
+ std::vector<std::vector<std::vector<float>>> grid_2d = {grid, grid};
3523
+ for (int h = 0; h < grid_h_size; ++h) {
3524
+ for (int w = 0; w < grid_w_size; ++w) {
3525
+ grid_2d[0][h][w] = grid_h[h];
3526
+ grid_2d[1][h][w] = grid_w[w];
3527
+ }
3528
+ }
3529
+
3530
+ std::vector<std::vector<std::vector<float>>> pos_embed_3d = get_2d_sincos_pos_embed_from_grid(embed_dim, grid_2d);
3531
+
3532
+ int H = image_size.first;
3533
+ int W = image_size.second;
3534
+ std::vector<std::vector<float>> pos_embed_2d(H * W, std::vector<float>(embed_dim));
3535
+ for (int h = 0; h < H; ++h) {
3536
+ for (int w = 0; w < W; ++w) {
3537
+ pos_embed_2d[w * H + h] = pos_embed_3d[h][w];
3538
+ }
3539
+ }
3540
+
3541
+ return pos_embed_2d;
3542
+ }
3543
+
3544
+ bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f32 * img, float * vec) {
3545
+ clip_image_f32_batch imgs;
3546
+ clip_image_f32_ptr img_copy(clip_image_f32_init());
3547
+ *img_copy = *img;
3548
+ imgs.entries.push_back(std::move(img_copy));
3549
+
3550
+ return clip_image_batch_encode(ctx, n_threads, &imgs, vec);
3551
+ }
3552
+
3553
+ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs_c_ptr, float * vec) {
3554
+ const clip_image_f32_batch & imgs = *imgs_c_ptr;
3555
+ int batch_size = imgs.entries.size();
3556
+
3557
+ // TODO @ngxson : implement batch size > 1 as a loop
3558
+ // we don't need true batching support because the cgraph will be big anyway
3559
+ if (batch_size != 1) {
3560
+ return false; // only support batch size of 1
3561
+ }
3562
+
3563
+ // build the inference graph
3564
+ ctx->debug_print_tensors.clear();
3565
+ lm_ggml_backend_sched_reset(ctx->sched.get());
3566
+ lm_ggml_cgraph * gf = clip_image_build_graph(ctx, imgs);
3567
+ lm_ggml_backend_sched_alloc_graph(ctx->sched.get(), gf);
3568
+
3569
+ // set inputs
3570
+ const auto & model = ctx->vision_model;
3571
+ const auto & hparams = model.hparams;
3572
+
3573
+ const int image_size_width = imgs.entries[0]->nx;
3574
+ const int image_size_height = imgs.entries[0]->ny;
3575
+
3576
+ const int patch_size = hparams.patch_size;
3577
+ const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size));
3578
+ const int n_pos = num_patches + (model.class_embedding ? 1 : 0);
3579
+ const int pos_w = image_size_width / patch_size;
3580
+ const int pos_h = image_size_height / patch_size;
3581
+
3582
+ const bool use_window_attn = hparams.n_wa_pattern > 0; // for qwen2.5vl
3583
+
3584
+ auto get_inp_tensor = [&gf](const char * name) {
3585
+ lm_ggml_tensor * inp = lm_ggml_graph_get_tensor(gf, name);
3586
+ if (inp == nullptr) {
3587
+ LM_GGML_ABORT("Failed to get tensor %s", name);
3588
+ }
3589
+ if (!(inp->flags & LM_GGML_TENSOR_FLAG_INPUT)) {
3590
+ LM_GGML_ABORT("Tensor %s is not an input tensor", name);
3591
+ }
3592
+ return inp;
3593
+ };
3594
+
3595
+ auto set_input_f32 = [&get_inp_tensor](const char * name, std::vector<float> & values) {
3596
+ lm_ggml_tensor * cur = get_inp_tensor(name);
3597
+ LM_GGML_ASSERT(cur->type == LM_GGML_TYPE_F32);
3598
+ LM_GGML_ASSERT(lm_ggml_nelements(cur) == (int64_t)values.size());
3599
+ lm_ggml_backend_tensor_set(cur, values.data(), 0, lm_ggml_nbytes(cur));
3600
+ };
3601
+
3602
+ auto set_input_i32 = [&get_inp_tensor](const char * name, std::vector<int32_t> & values) {
3603
+ lm_ggml_tensor * cur = get_inp_tensor(name);
3604
+ LM_GGML_ASSERT(cur->type == LM_GGML_TYPE_I32);
3605
+ LM_GGML_ASSERT(lm_ggml_nelements(cur) == (int64_t)values.size());
3606
+ lm_ggml_backend_tensor_set(cur, values.data(), 0, lm_ggml_nbytes(cur));
3607
+ };
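+    // note: the three helpers above look up named input tensors in the scheduled graph and copy
+    // host data into them; the element-count asserts mean every vector built below must exactly
+    // match the shape the graph was built with for this image.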
+
+    // set input pixel values
+    if (!imgs.is_audio) {
+        size_t nelem = 0;
+        for (const auto & img : imgs.entries) {
+            nelem += img->nx * img->ny * 3;
+        }
+        std::vector<float> inp_raw(nelem);
+
+        // layout of data (note: the channel dim is unrolled to better visualize the layout):
+        //
+        // ┌──W──┐
+        // │ H │ channel = R
+        // ├─────┤ │
+        // │ H │ channel = G
+        // ├─────┤ │
+        // │ H │ channel = B
+        // └─────┘ │
+        // ──────┘ x B
+
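+        // i.e. convert each image from the interleaved RGBRGB... layout of clip_image_f32::buf
+        // into three planar channels, one image after another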
+        for (size_t i = 0; i < imgs.entries.size(); i++) {
+            const int nx = imgs.entries[i]->nx;
+            const int ny = imgs.entries[i]->ny;
+            const int n = nx * ny;
+
+            for (int b = 0; b < batch_size; b++) {
+                float * batch_entry = inp_raw.data() + b * (3*n);
+                for (int y = 0; y < ny; y++) {
+                    for (int x = 0; x < nx; x++) {
+                        size_t base_src = 3*(y * nx + x); // idx of the first channel
+                        size_t base_dst = y * nx + x; // idx of the first channel
+                        batch_entry[ base_dst] = imgs.entries[b]->buf[base_src ];
+                        batch_entry[1*n + base_dst] = imgs.entries[b]->buf[base_src + 1];
+                        batch_entry[2*n + base_dst] = imgs.entries[b]->buf[base_src + 2];
+                    }
+                }
+            }
+        }
+        set_input_f32("inp_raw", inp_raw);
+
+    } else {
+        // audio input
+        LM_GGML_ASSERT(imgs.entries.size() == 1);
+        const auto & mel_inp = imgs.entries[0];
+        const int n_step = mel_inp->nx;
+        const int n_mel = mel_inp->ny;
+        std::vector<float> inp_raw(n_step * n_mel);
+        std::memcpy(inp_raw.data(), mel_inp->buf.data(), n_step * n_mel * sizeof(float));
+        set_input_f32("inp_raw", inp_raw);
+    }
+
+    // set input per projector
+    switch (ctx->proj_type) {
+        case PROJECTOR_TYPE_MINICPMV:
+            {
+                // inspired from siglip:
+                //    -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit
+                //    -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit/blob/d66538faeba44480d0bfaa42145eef26f9423199/modeling_siglip.py#L316
+                std::vector<int32_t> positions(pos_h * pos_w);
+                int bucket_coords_h[1024];
+                int bucket_coords_w[1024];
+                for (int i = 0; i < pos_h; i++){
+                    bucket_coords_h[i] = std::floor(70.0*i/pos_h);
+                }
+                for (int i = 0; i < pos_w; i++){
+                    bucket_coords_w[i] = std::floor(70.0*i/pos_w);
+                }
+                for (int i = 0, id = 0; i < pos_h; i++){
+                    for (int j = 0; j < pos_w; j++){
+                        positions[id++] = bucket_coords_h[i]*70 + bucket_coords_w[j];
+                    }
+                }
+                set_input_i32("positions", positions);
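+
+                // note: each patch coordinate is quantized into one of 70 buckets per axis, so
+                // "positions" indexes a fixed 70x70 grid regardless of the input resolution
+                // (presumably matching the learned position table of the MiniCPM-V resampler)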
+
+                // inspired from resampler of Qwen-VL:
+                //    -> https://huggingface.co/Qwen/Qwen-VL/tree/main
+                //    -> https://huggingface.co/Qwen/Qwen-VL/blob/0547ed36a86561e2e42fecec8fd0c4f6953e33c4/visual.py#L23
+                int embed_dim = clip_n_mmproj_embd(ctx);
+
+                // TODO @ngxson : this is very inefficient, can we do this using lm_ggml_sin and lm_ggml_cos?
+                auto pos_embed_t = get_2d_sincos_pos_embed(embed_dim, std::make_pair(pos_w, pos_h));
+
+                std::vector<float> pos_embed(embed_dim * pos_w * pos_h);
+                for(int i = 0; i < pos_w * pos_h; ++i){
+                    for(int j = 0; j < embed_dim; ++j){
+                        pos_embed[i * embed_dim + j] = pos_embed_t[i][j];
+                    }
+                }
+
+                set_input_f32("pos_embed", pos_embed);
+            } break;
+        case PROJECTOR_TYPE_QWEN2VL:
+            {
+                const int merge_ratio = 2;
+                const int pw = image_size_width / patch_size;
+                const int ph = image_size_height / patch_size;
+                std::vector<int> positions(n_pos * 4);
+                int ptr = 0;
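+                // the four num_patches-long slices of "positions" feed the multi-axis (M-RoPE)
+                // position encoding used by Qwen2-VL; for a single image the row/column indices are
+                // simply duplicated, written in 2x2 blocks, presumably to match the spatial-merge
+                // token order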
+                for (int y = 0; y < ph; y += merge_ratio) {
+                    for (int x = 0; x < pw; x += merge_ratio) {
+                        for (int dy = 0; dy < 2; dy++) {
+                            for (int dx = 0; dx < 2; dx++) {
+                                positions[ ptr] = y + dy;
+                                positions[ num_patches + ptr] = x + dx;
+                                positions[2 * num_patches + ptr] = y + dy;
+                                positions[3 * num_patches + ptr] = x + dx;
+                                ptr++;
+                            }
+                        }
+                    }
+                }
+
+                set_input_i32("positions", positions);
+            } break;
+        case PROJECTOR_TYPE_QWEN25VL:
+            {
+                // pw * ph   = number of tokens output by the ViT after applying the patch merger
+                // ipw * iph = number of vision tokens processed inside the ViT
+                const int merge_ratio = 2;
+                const int pw = image_size_width / patch_size / merge_ratio;
+                const int ph = image_size_height / patch_size / merge_ratio;
+                const int ipw = image_size_width / patch_size;
+                const int iph = image_size_height / patch_size;
+
+                std::vector<int> idx (ph * pw);
+                std::vector<int> inv_idx(ph * pw);
+
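+                // idx / inv_idx map between the natural row-major order of merged tokens and the
+                // order in which window attention groups them: idx[src] = dst, inv_idx[dst] = src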
+                if (use_window_attn) {
+                    const int attn_window_size = 112;
+                    const int grid_window = attn_window_size / patch_size / merge_ratio;
+                    int dst = 0;
+                    // [num_vision_tokens, num_vision_tokens] attention mask tensor
+                    std::vector<float> mask(pow(ipw * iph, 2), std::numeric_limits<float>::lowest());
+                    int mask_row = 0;
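+                    // the mask starts fully "masked" (lowest float); below, entries are set to 0
+                    // only for token pairs that fall inside the same window, confining attention
+                    // to each window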
+
+                    for (int y = 0; y < ph; y += grid_window) {
+                        for (int x = 0; x < pw; x += grid_window) {
+                            const int win_h = std::min(grid_window, ph - y);
+                            const int win_w = std::min(grid_window, pw - x);
+                            const int dst_0 = dst;
+                            // group all tokens belonging to the same window together (into a contiguous range)
+                            for (int dy = 0; dy < win_h; dy++) {
+                                for (int dx = 0; dx < win_w; dx++) {
+                                    const int src = (y + dy) * pw + (x + dx);
+                                    LM_GGML_ASSERT(src < (int)idx.size());
+                                    LM_GGML_ASSERT(dst < (int)inv_idx.size());
+                                    idx [src] = dst;
+                                    inv_idx[dst] = src;
+                                    dst++;
+                                }
+                            }
+
+                            for (int r=0; r < win_h * win_w * merge_ratio * merge_ratio; r++) {
+                                int row_offset = mask_row * (ipw * iph);
+                                std::fill(
+                                    mask.begin() + row_offset + (dst_0 * merge_ratio * merge_ratio),
+                                    mask.begin() + row_offset + (dst * merge_ratio * merge_ratio),
+                                    0.0);
+                                mask_row++;
+                            }
+                        }
+                    }
+
+                    set_input_i32("window_idx", idx);
+                    set_input_i32("inv_window_idx", inv_idx);
+                    set_input_f32("window_mask", mask);
+                } else {
+                    for (int i = 0; i < ph * pw; i++) {
+                        idx[i] = i;
+                    }
+                }
+
+                const int mpow = merge_ratio * merge_ratio;
+                std::vector<int> positions(n_pos * 4);
+
+                int ptr = 0;
+                for (int y = 0; y < iph; y += merge_ratio) {
+                    for (int x = 0; x < ipw; x += merge_ratio) {
+                        for (int dy = 0; dy < 2; dy++) {
+                            for (int dx = 0; dx < 2; dx++) {
+                                auto remap = idx[ptr / mpow];
+                                remap = (remap * mpow) + (ptr % mpow);
+
+                                positions[ remap] = y + dy;
+                                positions[ num_patches + remap] = x + dx;
+                                positions[2 * num_patches + remap] = y + dy;
+                                positions[3 * num_patches + remap] = x + dx;
+                                ptr++;
+                            }
+                        }
+                    }
+                }
+
+                set_input_i32("positions", positions);
+            } break;
+        case PROJECTOR_TYPE_PIXTRAL:
+            {
+                // set the 2D positions
+                int n_patches_per_col = image_size_width / patch_size;
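+                // note: despite the name, n_patches_per_col counts patches along the image width,
+                // so i / n_patches_per_col is the row (H) index and i % n_patches_per_col the
+                // column (W) index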
+                std::vector<int> pos_data(n_pos);
+                // dimension H
+                for (int i = 0; i < n_pos; i++) {
+                    pos_data[i] = i / n_patches_per_col;
+                }
+                set_input_i32("pos_h", pos_data);
+                // dimension W
+                for (int i = 0; i < n_pos; i++) {
+                    pos_data[i] = i % n_patches_per_col;
+                }
+                set_input_i32("pos_w", pos_data);
+            } break;
+        case PROJECTOR_TYPE_GLM_EDGE:
+            {
+                // llava and other models
+                std::vector<int32_t> positions(n_pos);
+                for (int i = 0; i < n_pos; i++) {
+                    positions[i] = i;
+                }
+                set_input_i32("positions", positions);
+            } break;
+        case PROJECTOR_TYPE_MLP:
+        case PROJECTOR_TYPE_MLP_NORM:
+        case PROJECTOR_TYPE_LDP:
+        case PROJECTOR_TYPE_LDPV2:
+            {
+                // llava and other models
+                std::vector<int32_t> positions(n_pos);
+                for (int i = 0; i < n_pos; i++) {
+                    positions[i] = i;
+                }
+                set_input_i32("positions", positions);
+
+                // The patches vector is used to get rows to index into the embeds with;
+                // we should skip dim 0 only if we have CLS to avoid going out of bounds
+                // when retrieving the rows.
+                int patch_offset = model.class_embedding ? 1 : 0;
+                std::vector<int32_t> patches(num_patches);
+                for (int i = 0; i < num_patches; i++) {
+                    patches[i] = i + patch_offset;
+                }
+                set_input_i32("patches", patches);
+            } break;
+        case PROJECTOR_TYPE_GEMMA3:
+        case PROJECTOR_TYPE_IDEFICS3:
+        case PROJECTOR_TYPE_INTERNVL:
+        case PROJECTOR_TYPE_ULTRAVOX:
+            {
+                // do nothing
+            } break;
+        case PROJECTOR_TYPE_LLAMA4:
+            {
+                // set the 2D positions
+                int n_patches_per_col = image_size_width / patch_size;
+                std::vector<int> pos_data(num_patches + 1, 0); // +1 for the [CLS] token
+                // last pos is always kept 0, it's for CLS
+                // dimension H
+                for (int i = 0; i < num_patches; i++) {
+                    pos_data[i] = (i / n_patches_per_col) + 1;
+                }
+                set_input_i32("pos_h", pos_data);
+                // dimension W
+                for (int i = 0; i < num_patches; i++) {
+                    pos_data[i] = (i % n_patches_per_col) + 1;
+                }
+                set_input_i32("pos_w", pos_data);
+            } break;
+        default:
+            LM_GGML_ABORT("Unknown projector type");
+    }
+
+    // lm_ggml_backend_cpu_set_n_threads(ctx->backend_cpu, n_threads);
+    lm_ggml_backend_dev_t dev = lm_ggml_backend_get_device(ctx->backend_cpu);
+    lm_ggml_backend_reg_t reg = dev ? lm_ggml_backend_dev_backend_reg(dev) : nullptr;
+    if (reg) {
+        auto lm_ggml_backend_set_n_threads_fn = (lm_ggml_backend_set_n_threads_t) lm_ggml_backend_reg_get_proc_address(reg, "lm_ggml_backend_set_n_threads");
+        if (lm_ggml_backend_set_n_threads_fn) {
+            lm_ggml_backend_set_n_threads_fn(ctx->backend_cpu, n_threads);
+        }
+    }
+
+    auto status = lm_ggml_backend_sched_graph_compute(ctx->sched.get(), gf);
+    if (status != LM_GGML_STATUS_SUCCESS) {
+        LOG_ERR("%s: lm_ggml_backend_sched_graph_compute failed with error %d\n", __func__, status);
+        return false;
+    }
+
+    // print debug nodes
+    if (ctx->debug_graph) {
+        LOG_INF("\n\n---\n\n");
+        LOG_INF("\n\nDebug graph:\n\n");
+        for (lm_ggml_tensor * t : ctx->debug_print_tensors) {
+            std::vector<uint8_t> data(lm_ggml_nbytes(t));
+            lm_ggml_backend_tensor_get(t, data.data(), 0, lm_ggml_nbytes(t));
+            print_tensor_shape(t);
+            print_tensor_data(t, data.data(), 3);
+        }
+    }
+
+    // the last node is the embedding tensor
+    lm_ggml_tensor * embeddings = lm_ggml_graph_node(gf, -1);
+
+    // sanity check (only support batch size of 1 for now)
+    const int n_tokens_out = embeddings->ne[1];
+    const int expected_n_tokens_out = clip_n_output_tokens(ctx, imgs.entries[0].get());
+    if (n_tokens_out != expected_n_tokens_out) {
+        LOG_ERR("%s: expected %d tokens, got %d\n", __func__, expected_n_tokens_out, n_tokens_out);
+        LM_GGML_ABORT("Invalid number of output tokens");
+    }
+
+    // copy the embeddings to the location passed by the user
+    lm_ggml_backend_tensor_get(embeddings, vec, 0, lm_ggml_nbytes(embeddings));
+
+    return true;
+}
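+
+// usage sketch (illustrative only): callers are expected to size the output buffer from the query
+// helpers before encoding, roughly:
+//
+//   const int n_tok  = clip_n_output_tokens(ctx, img);   // tokens produced for this image
+//   const int n_embd = clip_n_mmproj_embd(ctx);          // embedding width after the projector
+//   std::vector<float> vec((size_t) n_tok * n_embd);
+//   clip_image_encode(ctx, n_threads, img, vec.data());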
+
+int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
+    switch (ctx->proj_type) {
+        case PROJECTOR_TYPE_LDP:
+            return ctx->vision_model.mm_model_block_1_block_2_1_b->ne[0];
+        case PROJECTOR_TYPE_LDPV2:
+            return ctx->vision_model.mm_model_peg_0_b->ne[0];
+        case PROJECTOR_TYPE_MLP:
+        case PROJECTOR_TYPE_PIXTRAL:
+            return ctx->vision_model.mm_2_w->ne[1];
+        case PROJECTOR_TYPE_MLP_NORM:
+            return ctx->vision_model.mm_3_b->ne[0];
+        case PROJECTOR_TYPE_MINICPMV:
+            if (ctx->minicpmv_version == 2) {
+                return 4096;
+            } else if (ctx->minicpmv_version == 3) {
+                return 3584;
+            } else if (ctx->minicpmv_version == 4) {
+                return 3584;
+            }
+            LM_GGML_ABORT("Unknown minicpmv version");
+        case PROJECTOR_TYPE_GLM_EDGE:
+            return ctx->vision_model.mm_model_mlp_3_w->ne[1];
+        case PROJECTOR_TYPE_QWEN2VL:
+        case PROJECTOR_TYPE_QWEN25VL:
+            return ctx->vision_model.mm_1_b->ne[0];
+        case PROJECTOR_TYPE_GEMMA3:
+            return ctx->vision_model.mm_input_proj_w->ne[0];
+        case PROJECTOR_TYPE_IDEFICS3:
+            return ctx->vision_model.projection->ne[1];
+        case PROJECTOR_TYPE_ULTRAVOX:
+            return ctx->vision_model.mm_2_w->ne[1];
+        case PROJECTOR_TYPE_INTERNVL:
+            return ctx->vision_model.mm_3_w->ne[1];
+        case PROJECTOR_TYPE_LLAMA4:
+            return ctx->vision_model.mm_model_proj->ne[1];
+        default:
+            LM_GGML_ABORT("Unknown projector type");
+    }
+}
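+// note: clip_n_mmproj_embd() reports the embedding width after the multimodal projector, i.e. the
+// per-token size handed to the language model; it is read from projector weights where possible and
+// hard-coded per MiniCPM-V version above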
+
+int clip_is_minicpmv(const struct clip_ctx * ctx) {
+    if (ctx->proj_type == PROJECTOR_TYPE_MINICPMV) {
+        return ctx->minicpmv_version;
+    }
+    return 0;
+}
+
+bool clip_is_glm(const struct clip_ctx * ctx) {
+    return ctx->proj_type == PROJECTOR_TYPE_GLM_EDGE;
+}
+
+bool clip_is_qwen2vl(const struct clip_ctx * ctx) {
+    return ctx->proj_type == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type == PROJECTOR_TYPE_QWEN25VL;
+}
+
+bool clip_is_llava(const struct clip_ctx * ctx) {
+    return ctx->has_llava_projector;
+}
+
+bool clip_is_gemma3(const struct clip_ctx * ctx) {
+    return ctx->proj_type == PROJECTOR_TYPE_GEMMA3;
+}
+
+bool clip_has_vision_encoder(const struct clip_ctx * ctx) {
+    return ctx->vision_model.hparams.has_vision;
+}
+
+bool clip_has_audio_encoder(const struct clip_ctx * ctx) {
+    return ctx->vision_model.hparams.has_audio;
+}
+
+bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec) {
+    clip_image_f32 clip_img;
+    clip_img.buf.resize(h * w * 3);
+    for (int i = 0; i < h*w*3; i++)
+    {
+        clip_img.buf[i] = img[i];
+    }
+    clip_img.nx = w;
+    clip_img.ny = h;
+    clip_image_encode(ctx, n_threads, &clip_img, vec);
+    return true;
+}
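+// note: clip_encode_float_image() expects h*w*3 floats in the same interleaved per-pixel layout as
+// clip_image_f32::buf, and it always returns true; the result of clip_image_encode() is not
+// propagated to the caller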
+
+//
+// API used internally with mtmd
+//
+
+projector_type clip_get_projector_type(const struct clip_ctx * ctx) {
+    return ctx->proj_type;
+}
+
+void clip_image_f32_batch_add_mel(struct clip_image_f32_batch * batch, int n_mel, int n_frames, float * mel) {
+    clip_image_f32 * audio = new clip_image_f32;
+    audio->nx = n_frames;
+    audio->ny = n_mel;
+    audio->buf.resize(n_frames * n_mel);
+    std::memcpy(audio->buf.data(), mel, n_frames * n_mel * sizeof(float));
+
+    batch->entries.push_back(clip_image_f32_ptr(audio));
+    batch->is_audio = true;
+}