@fugood/llama.node 0.3.16 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
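For reference, a diff like this can be reproduced locally with npm's built-in diff subcommand (npm 7 or later); something along the following lines should work:

    npm diff --diff=@fugood/llama.node@0.3.16 --diff=@fugood/llama.node@0.4.0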
Files changed (281)
  1. package/CMakeLists.txt +6 -1
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  7. package/bin/linux-cuda/x64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  9. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  10. package/bin/win32/arm64/llama-node.node +0 -0
  11. package/bin/win32/arm64/node.lib +0 -0
  12. package/bin/win32/x64/llama-node.node +0 -0
  13. package/bin/win32/x64/node.lib +0 -0
  14. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/arm64/node.lib +0 -0
  16. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  17. package/bin/win32-vulkan/x64/node.lib +0 -0
  18. package/lib/binding.ts +44 -2
  19. package/lib/index.js +132 -1
  20. package/lib/index.ts +203 -3
  21. package/package.json +2 -1
  22. package/src/EmbeddingWorker.cpp +1 -1
  23. package/src/LlamaCompletionWorker.cpp +374 -19
  24. package/src/LlamaCompletionWorker.h +31 -10
  25. package/src/LlamaContext.cpp +216 -7
  26. package/src/LlamaContext.h +12 -0
  27. package/src/common.hpp +15 -0
  28. package/src/llama.cpp/.github/workflows/build-linux-cross.yml +233 -0
  29. package/src/llama.cpp/.github/workflows/build.yml +89 -767
  30. package/src/llama.cpp/.github/workflows/docker.yml +9 -6
  31. package/src/llama.cpp/.github/workflows/release.yml +716 -0
  32. package/src/llama.cpp/.github/workflows/server.yml +19 -23
  33. package/src/llama.cpp/CMakeLists.txt +11 -1
  34. package/src/llama.cpp/cmake/build-info.cmake +8 -2
  35. package/src/llama.cpp/cmake/x64-windows-llvm.cmake +0 -6
  36. package/src/llama.cpp/common/CMakeLists.txt +35 -4
  37. package/src/llama.cpp/common/arg.cpp +844 -121
  38. package/src/llama.cpp/common/arg.h +9 -0
  39. package/src/llama.cpp/common/chat.cpp +129 -107
  40. package/src/llama.cpp/common/chat.h +2 -0
  41. package/src/llama.cpp/common/common.cpp +64 -518
  42. package/src/llama.cpp/common/common.h +35 -45
  43. package/src/llama.cpp/common/json-schema-to-grammar.cpp +3 -0
  44. package/src/llama.cpp/common/llguidance.cpp +31 -47
  45. package/src/llama.cpp/common/minja/chat-template.hpp +23 -11
  46. package/src/llama.cpp/common/minja/minja.hpp +186 -127
  47. package/src/llama.cpp/common/regex-partial.cpp +204 -0
  48. package/src/llama.cpp/common/regex-partial.h +56 -0
  49. package/src/llama.cpp/common/sampling.cpp +60 -50
  50. package/src/llama.cpp/docs/build.md +122 -7
  51. package/src/llama.cpp/examples/CMakeLists.txt +2 -32
  52. package/src/llama.cpp/examples/batched/batched.cpp +1 -1
  53. package/src/llama.cpp/examples/embedding/embedding.cpp +9 -12
  54. package/src/llama.cpp/examples/gritlm/gritlm.cpp +1 -1
  55. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
  56. package/src/llama.cpp/examples/parallel/parallel.cpp +89 -15
  57. package/src/llama.cpp/examples/passkey/passkey.cpp +1 -1
  58. package/src/llama.cpp/examples/speculative/speculative.cpp +1 -1
  59. package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +1 -1
  60. package/src/llama.cpp/examples/sycl/build.sh +2 -2
  61. package/src/llama.cpp/examples/sycl/win-build-sycl.bat +2 -2
  62. package/src/llama.cpp/examples/training/CMakeLists.txt +5 -0
  63. package/src/llama.cpp/examples/training/finetune.cpp +96 -0
  64. package/src/llama.cpp/ggml/CMakeLists.txt +35 -2
  65. package/src/llama.cpp/ggml/cmake/GitVars.cmake +22 -0
  66. package/src/llama.cpp/ggml/include/ggml-backend.h +4 -4
  67. package/src/llama.cpp/ggml/include/ggml-cpp.h +1 -1
  68. package/src/llama.cpp/ggml/include/ggml-cpu.h +5 -0
  69. package/src/llama.cpp/ggml/include/ggml-opt.h +47 -28
  70. package/src/llama.cpp/ggml/include/ggml-rpc.h +6 -1
  71. package/src/llama.cpp/ggml/include/ggml.h +76 -106
  72. package/src/llama.cpp/ggml/src/CMakeLists.txt +11 -8
  73. package/src/llama.cpp/ggml/src/ggml-alloc.c +4 -1
  74. package/src/llama.cpp/ggml/src/ggml-backend.cpp +9 -5
  75. package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +0 -2
  76. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +8 -4
  77. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +5 -5
  78. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +692 -1534
  79. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +613 -122
  80. package/src/llama.cpp/ggml/src/ggml-cann/common.h +135 -1
  81. package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +507 -137
  82. package/src/llama.cpp/ggml/src/ggml-common.h +12 -6
  83. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +66 -33
  84. package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.cpp +158 -0
  85. package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.h +16 -0
  86. package/src/llama.cpp/ggml/src/ggml-cpu/common.h +72 -0
  87. package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +1 -1
  88. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +896 -194
  89. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +2 -21
  90. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +1060 -410
  91. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1008 -13533
  92. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +31 -16
  93. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +90 -12
  94. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +47 -13
  95. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +266 -72
  96. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +1034 -88
  97. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +8796 -0
  98. package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +110 -0
  99. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +892 -0
  100. package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.cpp +186 -0
  101. package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.h +28 -0
  102. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +252 -0
  103. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +802 -0
  104. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +23 -4
  105. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +7 -0
  106. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +1 -0
  107. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +0 -4
  108. package/src/llama.cpp/ggml/src/ggml-impl.h +52 -18
  109. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +106 -14
  110. package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +67 -119
  111. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +1023 -262
  112. package/src/llama.cpp/ggml/src/ggml-opt.cpp +368 -190
  113. package/src/llama.cpp/ggml/src/ggml-quants.c +0 -6
  114. package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +307 -40
  115. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +125 -45
  116. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +10 -8
  117. package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +239 -0
  118. package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.hpp +39 -0
  119. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +0 -35
  120. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +9 -307
  121. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +72 -25
  122. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +14 -7
  123. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +59 -21
  124. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +7 -1
  125. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +79 -90
  126. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +944 -438
  127. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +22 -23
  128. package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +37 -8
  129. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +24 -20
  130. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +1 -4
  131. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +507 -411
  132. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +84 -74
  133. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +1 -3
  134. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +185 -89
  135. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +37 -49
  136. package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +7 -22
  137. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +4 -14
  138. package/src/llama.cpp/ggml/src/ggml-sycl/quants.hpp +83 -0
  139. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +204 -118
  140. package/src/llama.cpp/ggml/src/ggml-sycl/rope.hpp +1 -3
  141. package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +128 -53
  142. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +83 -49
  143. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +1278 -282
  144. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +32 -0
  145. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +133 -30
  146. package/src/llama.cpp/ggml/src/ggml.c +170 -265
  147. package/src/llama.cpp/ggml/src/gguf.cpp +34 -33
  148. package/src/llama.cpp/include/llama.h +82 -22
  149. package/src/llama.cpp/models/ggml-vocab-llama4.gguf.inp +112 -0
  150. package/src/llama.cpp/models/ggml-vocab-llama4.gguf.out +46 -0
  151. package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.inp +112 -0
  152. package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.out +46 -0
  153. package/src/llama.cpp/requirements/requirements-all.txt +5 -3
  154. package/src/llama.cpp/requirements/requirements-gguf_editor_gui.txt +3 -0
  155. package/src/llama.cpp/scripts/xxd.cmake +1 -1
  156. package/src/llama.cpp/src/CMakeLists.txt +4 -2
  157. package/src/llama.cpp/src/llama-adapter.cpp +43 -1
  158. package/src/llama.cpp/src/llama-arch.cpp +163 -17
  159. package/src/llama.cpp/src/llama-arch.h +16 -0
  160. package/src/llama.cpp/src/llama-batch.cpp +5 -1
  161. package/src/llama.cpp/src/llama-batch.h +2 -1
  162. package/src/llama.cpp/src/llama-chat.cpp +91 -16
  163. package/src/llama.cpp/src/llama-chat.h +7 -2
  164. package/src/llama.cpp/src/llama-context.cpp +479 -575
  165. package/src/llama.cpp/src/llama-context.h +44 -33
  166. package/src/llama.cpp/src/llama-cparams.h +1 -0
  167. package/src/llama.cpp/src/llama-graph.cpp +209 -157
  168. package/src/llama.cpp/src/llama-graph.h +38 -14
  169. package/src/llama.cpp/src/llama-hparams.h +13 -0
  170. package/src/llama.cpp/src/llama-kv-cache.cpp +1604 -543
  171. package/src/llama.cpp/src/llama-kv-cache.h +283 -171
  172. package/src/llama.cpp/src/llama-memory.h +12 -2
  173. package/src/llama.cpp/src/llama-mmap.cpp +1 -1
  174. package/src/llama.cpp/src/llama-model-loader.cpp +34 -20
  175. package/src/llama.cpp/src/llama-model-loader.h +5 -3
  176. package/src/llama.cpp/src/llama-model-saver.cpp +281 -0
  177. package/src/llama.cpp/src/llama-model-saver.h +37 -0
  178. package/src/llama.cpp/src/llama-model.cpp +1803 -330
  179. package/src/llama.cpp/src/llama-model.h +21 -2
  180. package/src/llama.cpp/src/llama-quant.cpp +33 -10
  181. package/src/llama.cpp/src/llama-sampling.cpp +25 -7
  182. package/src/llama.cpp/src/llama-vocab.cpp +86 -10
  183. package/src/llama.cpp/src/llama-vocab.h +6 -0
  184. package/src/llama.cpp/src/llama.cpp +15 -1
  185. package/src/llama.cpp/tests/CMakeLists.txt +52 -31
  186. package/src/llama.cpp/tests/test-arg-parser.cpp +51 -4
  187. package/src/llama.cpp/tests/test-backend-ops.cpp +189 -90
  188. package/src/llama.cpp/tests/test-chat-template.cpp +26 -6
  189. package/src/llama.cpp/tests/test-chat.cpp +15 -3
  190. package/src/llama.cpp/{examples/gbnf-validator/gbnf-validator.cpp → tests/test-gbnf-validator.cpp} +2 -2
  191. package/src/llama.cpp/tests/test-grammar-integration.cpp +3 -2
  192. package/src/llama.cpp/tests/test-grammar-llguidance.cpp +63 -2
  193. package/src/llama.cpp/tests/test-grammar-parser.cpp +3 -1
  194. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +17 -1
  195. package/src/llama.cpp/tests/test-llama-grammar.cpp +2 -1
  196. package/src/llama.cpp/tests/test-mtmd-c-api.c +63 -0
  197. package/src/llama.cpp/tests/test-opt.cpp +33 -21
  198. package/src/llama.cpp/{examples/quantize-stats/quantize-stats.cpp → tests/test-quantize-stats.cpp} +3 -1
  199. package/src/llama.cpp/tests/test-regex-partial.cpp +288 -0
  200. package/src/llama.cpp/tests/test-sampling.cpp +1 -1
  201. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +2 -1
  202. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +2 -1
  203. package/src/llama.cpp/tools/CMakeLists.txt +39 -0
  204. package/src/llama.cpp/{examples → tools}/batched-bench/batched-bench.cpp +3 -3
  205. package/src/llama.cpp/{examples → tools}/export-lora/export-lora.cpp +1 -1
  206. package/src/llama.cpp/{examples → tools}/gguf-split/gguf-split.cpp +15 -16
  207. package/src/llama.cpp/{examples → tools}/imatrix/imatrix.cpp +11 -9
  208. package/src/llama.cpp/{examples → tools}/llama-bench/llama-bench.cpp +623 -274
  209. package/src/llama.cpp/{examples → tools}/main/main.cpp +22 -14
  210. package/src/llama.cpp/tools/mtmd/CMakeLists.txt +47 -0
  211. package/src/llama.cpp/tools/mtmd/clip-impl.h +365 -0
  212. package/src/llama.cpp/tools/mtmd/clip.cpp +3646 -0
  213. package/src/llama.cpp/tools/mtmd/clip.h +99 -0
  214. package/src/llama.cpp/tools/mtmd/deprecation-warning.cpp +22 -0
  215. package/src/llama.cpp/tools/mtmd/mtmd-cli.cpp +370 -0
  216. package/src/llama.cpp/tools/mtmd/mtmd-helper.cpp +310 -0
  217. package/src/llama.cpp/tools/mtmd/mtmd.cpp +678 -0
  218. package/src/llama.cpp/tools/mtmd/mtmd.h +331 -0
  219. package/src/llama.cpp/{examples → tools}/perplexity/perplexity.cpp +21 -5
  220. package/src/llama.cpp/{examples → tools}/quantize/quantize.cpp +53 -3
  221. package/src/llama.cpp/tools/rpc/CMakeLists.txt +4 -0
  222. package/src/llama.cpp/tools/rpc/rpc-server.cpp +322 -0
  223. package/src/llama.cpp/tools/run/CMakeLists.txt +16 -0
  224. package/src/llama.cpp/{examples → tools}/run/run.cpp +30 -30
  225. package/src/llama.cpp/{examples → tools}/server/CMakeLists.txt +2 -1
  226. package/src/llama.cpp/{examples → tools}/server/httplib.h +313 -247
  227. package/src/llama.cpp/{examples → tools}/server/server.cpp +529 -215
  228. package/src/llama.cpp/{examples → tools}/server/utils.hpp +427 -6
  229. package/src/llama.cpp/{examples → tools}/tts/tts.cpp +6 -9
  230. package/src/llama.cpp/cmake/arm64-windows-msvc.cmake +0 -6
  231. package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +0 -5
  232. package/src/llama.cpp/examples/infill/CMakeLists.txt +0 -5
  233. package/src/llama.cpp/examples/infill/infill.cpp +0 -590
  234. package/src/llama.cpp/examples/llava/CMakeLists.txt +0 -66
  235. package/src/llama.cpp/examples/llava/android/build_64.sh +0 -8
  236. package/src/llama.cpp/examples/llava/clip-quantize-cli.cpp +0 -59
  237. package/src/llama.cpp/examples/llava/clip.cpp +0 -3206
  238. package/src/llama.cpp/examples/llava/clip.h +0 -118
  239. package/src/llama.cpp/examples/llava/gemma3-cli.cpp +0 -341
  240. package/src/llama.cpp/examples/llava/llava-cli.cpp +0 -332
  241. package/src/llama.cpp/examples/llava/llava.cpp +0 -574
  242. package/src/llama.cpp/examples/llava/llava.h +0 -49
  243. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +0 -354
  244. package/src/llama.cpp/examples/llava/qwen2vl-cli.cpp +0 -584
  245. package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +0 -6
  246. package/src/llama.cpp/examples/rpc/CMakeLists.txt +0 -2
  247. package/src/llama.cpp/examples/rpc/rpc-server.cpp +0 -171
  248. package/src/llama.cpp/examples/run/CMakeLists.txt +0 -5
  249. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +0 -30
  250. package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +0 -19
  251. package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +0 -234
  252. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +0 -197
  253. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +0 -190
  254. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +0 -204
  255. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +0 -191
  256. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +0 -218
  257. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +0 -216
  258. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +0 -295
  259. /package/src/llama.cpp/{examples → tools}/batched-bench/CMakeLists.txt +0 -0
  260. /package/src/llama.cpp/{examples → tools}/cvector-generator/CMakeLists.txt +0 -0
  261. /package/src/llama.cpp/{examples → tools}/cvector-generator/completions.txt +0 -0
  262. /package/src/llama.cpp/{examples → tools}/cvector-generator/cvector-generator.cpp +0 -0
  263. /package/src/llama.cpp/{examples → tools}/cvector-generator/mean.hpp +0 -0
  264. /package/src/llama.cpp/{examples → tools}/cvector-generator/negative.txt +0 -0
  265. /package/src/llama.cpp/{examples → tools}/cvector-generator/pca.hpp +0 -0
  266. /package/src/llama.cpp/{examples → tools}/cvector-generator/positive.txt +0 -0
  267. /package/src/llama.cpp/{examples → tools}/export-lora/CMakeLists.txt +0 -0
  268. /package/src/llama.cpp/{examples → tools}/gguf-split/CMakeLists.txt +0 -0
  269. /package/src/llama.cpp/{examples → tools}/imatrix/CMakeLists.txt +0 -0
  270. /package/src/llama.cpp/{examples → tools}/llama-bench/CMakeLists.txt +0 -0
  271. /package/src/llama.cpp/{examples → tools}/main/CMakeLists.txt +0 -0
  272. /package/src/llama.cpp/{examples/llava → tools/mtmd}/requirements.txt +0 -0
  273. /package/src/llama.cpp/{examples → tools}/perplexity/CMakeLists.txt +0 -0
  274. /package/src/llama.cpp/{examples → tools}/quantize/CMakeLists.txt +0 -0
  275. /package/src/llama.cpp/{examples → tools}/run/linenoise.cpp/linenoise.cpp +0 -0
  276. /package/src/llama.cpp/{examples → tools}/run/linenoise.cpp/linenoise.h +0 -0
  277. /package/src/llama.cpp/{examples → tools}/server/bench/requirements.txt +0 -0
  278. /package/src/llama.cpp/{examples → tools}/server/tests/requirements.txt +0 -0
  279. /package/src/llama.cpp/{examples → tools}/tokenize/CMakeLists.txt +0 -0
  280. /package/src/llama.cpp/{examples → tools}/tokenize/tokenize.cpp +0 -0
  281. /package/src/llama.cpp/{examples → tools}/tts/CMakeLists.txt +0 -0
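The hunk below corresponds to entry 237 above, package/src/llama.cpp/examples/llava/clip.cpp (+0 -3206): the file is deleted outright in 0.4.0, apparently superseded by the new multimodal code under tools/mtmd (entries 210-218, including tools/mtmd/clip.cpp). Only the beginning of the 3206-line deletion is reproduced below.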
@@ -1,3206 +0,0 @@
- // NOTE: This is modified from clip.cpp only for LLaVA,
- // so there might be still unnecessary artifacts hanging around
- // I'll gradually clean and extend it
- // Note: Even when using identical normalized image inputs (see normalize_image_u8_to_f32()) we have a significant difference in resulting embeddings compared to pytorch
- #include "clip.h"
- #include "ggml.h"
- #include "ggml-cpp.h"
- #include "ggml-cpu.h"
- #include "ggml-alloc.h"
- #include "ggml-backend.h"
- #include "gguf.h"
-
- #define STB_IMAGE_IMPLEMENTATION
- #include "stb_image.h"
-
- #include <cassert>
- #include <cmath>
- #include <cstdlib>
- #include <cstring>
- #include <fstream>
- #include <map>
- #include <regex>
- #include <stdexcept>
- #include <unordered_set>
- #include <vector>
- #include <sstream>
- #include <cinttypes>
- #include <limits>
-
- #if defined(LLAVA_LOG_OFF)
- #   define LOG_INF(...)
- #   define LOG_WRN(...)
- #   define LOG_ERR(...)
- #   define LOG_DBG(...)
- #else // defined(LLAVA_LOG_OFF)
- #   define LOG_INF(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
- #   define LOG_WRN(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
- #   define LOG_ERR(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
- #   define LOG_DBG(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
- #endif // defined(LLAVA_LOG_OFF)
-
- //#define CLIP_DEBUG_FUNCTIONS
-
- // RGB uint8 image
- struct clip_image_u8 {
-     int nx;
-     int ny;
-
-     std::vector<uint8_t> buf;
- };
-
- // RGB float32 image (NHWC)
- // Memory layout: RGBRGBRGB...
- struct clip_image_f32 {
-     int nx;
-     int ny;
-
-     std::vector<float> buf;
- };
-
- static std::string format(const char * fmt, ...) {
-     va_list ap;
-     va_list ap2;
-     va_start(ap, fmt);
-     va_copy(ap2, ap);
-     int size = vsnprintf(NULL, 0, fmt, ap);
-     GGML_ASSERT(size >= 0 && size < INT_MAX); // NOLINT
-     std::vector<char> buf(size + 1);
-     int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
-     GGML_ASSERT(size2 == size);
-     va_end(ap2);
-     va_end(ap);
-     return std::string(buf.data(), buf.size());
- }
-
- //
- // key constants
- //
-
- #define KEY_FTYPE               "general.file_type"
- #define KEY_NAME                "general.name"
- #define KEY_DESCRIPTION         "general.description"
- #define KEY_HAS_TEXT_ENC        "clip.has_text_encoder"
- #define KEY_HAS_VIS_ENC         "clip.has_vision_encoder"
- #define KEY_HAS_LLAVA_PROJ      "clip.has_llava_projector"
- #define KEY_HAS_MINICPMV_PROJ   "clip.has_minicpmv_projector"
- #define KEY_HAS_GLM_PROJ        "clip.has_glm_projector"
- #define KEY_MINICPMV_VERSION    "clip.minicpmv_version"
- #define KEY_HAS_QWEN2VL_MERGER  "clip.has_qwen2vl_merger"
- #define KEY_USE_GELU            "clip.use_gelu"
- #define KEY_USE_SILU            "clip.use_silu"
- #define KEY_N_EMBD              "clip.%s.embedding_length"
- #define KEY_N_FF                "clip.%s.feed_forward_length"
- #define KEY_N_BLOCK             "clip.%s.block_count"
- #define KEY_N_HEAD              "clip.%s.attention.head_count"
- #define KEY_LAYER_NORM_EPS      "clip.%s.attention.layer_norm_epsilon"
- #define KEY_PROJ_DIM            "clip.%s.projection_dim"
- #define KEY_TOKENS              "tokenizer.ggml.tokens"
- #define KEY_N_POSITIONS         "clip.text.context_length"
- #define KEY_IMAGE_SIZE          "clip.vision.image_size"
- #define KEY_PATCH_SIZE          "clip.vision.patch_size"
- #define KEY_IMAGE_MEAN          "clip.vision.image_mean"
- #define KEY_IMAGE_STD           "clip.vision.image_std"
- #define KEY_PROJ_TYPE           "clip.projector_type"
- #define KEY_FEATURE_LAYER       "clip.vision.feature_layer"
-
- #define KEY_MM_PATCH_MERGE_TYPE   "clip.vision.mm_patch_merge_type"
- #define KEY_IMAGE_GRID_PINPOINTS  "clip.vision.image_grid_pinpoints"
- #define KEY_IMAGE_CROP_RESOLUTION "clip.vision.image_crop_resolution"
-
-
- //
- // tensor name constants
- //
-
- #define TN_TOKEN_EMBD      "%s.token_embd.weight"
- #define TN_POS_EMBD        "%s.position_embd.weight"
- #define TN_CLASS_EMBD      "v.class_embd"
- #define TN_PATCH_EMBD      "v.patch_embd.weight"  // tensor not renamed with ".0" postfix for backward compat
- #define TN_PATCH_EMBD_1    "v.patch_embd.weight.1"
- #define TN_PATCH_BIAS      "v.patch_embd.bias"
- #define TN_ATTN_K          "%s.blk.%d.attn_k.%s"
- #define TN_ATTN_Q          "%s.blk.%d.attn_q.%s"
- #define TN_ATTN_V          "%s.blk.%d.attn_v.%s"
- #define TN_ATTN_OUTPUT     "%s.blk.%d.attn_out.%s"
- #define TN_FFN_DOWN        "%s.blk.%d.ffn_down.%s"
- #define TN_FFN_UP          "%s.blk.%d.ffn_up.%s"
- #define TN_LN_1            "%s.blk.%d.ln1.%s"
- #define TN_LN_2            "%s.blk.%d.ln2.%s"
- #define TN_LN_PRE          "%s.pre_ln.%s"
- #define TN_LN_POST         "%s.post_ln.%s"
- #define TN_TEXT_PROJ       "text_projection.weight"
- #define TN_VIS_PROJ        "visual_projection.weight"
- #define TN_LLAVA_PROJ      "mm.%d.%s"
- #define TN_MVLM_PROJ_MLP   "mm.model.mlp.%d.%s"
- #define TN_MVLM_PROJ_BLOCK "mm.model.mb_block.%d.block.%d.%s"
- #define TN_MVLM_PROJ_PEG   "mm.model.peg.%d.%s"
- #define TN_IMAGE_NEWLINE   "model.image_newline"
- #define TN_MM_INP_PROJ     "mm.input_projection.weight" // gemma3
- #define TN_MM_SOFT_EMB_N   "mm.soft_emb_norm.weight"    // gemma3
-
- #define TN_MINICPMV_POS_EMBD_K "resampler.pos_embed_k"
- #define TN_MINICPMV_QUERY      "resampler.query"
- #define TN_MINICPMV_PROJ       "resampler.proj.weight"
- #define TN_MINICPMV_KV_PROJ    "resampler.kv.weight"
- #define TN_MINICPMV_ATTN       "resampler.attn.%s.%s"
- #define TN_MINICPMV_LN         "resampler.ln_%s.%s"
-
- #define TN_GLM_ADAPER_CONV      "adapter.conv.%s"
- #define TN_GLM_ADAPTER_LINEAR   "adapter.linear.linear.%s"
- #define TN_GLM_ADAPTER_NORM_1   "adapter.linear.norm1.%s"
- #define TN_GLM_ADAPTER_D_H_2_4H "adapter.linear.dense_h_to_4h.%s"
- #define TN_GLM_ADAPTER_GATE     "adapter.linear.gate.%s"
- #define TN_GLM_ADAPTER_D_4H_2_H "adapter.linear.dense_4h_to_h.%s"
- #define TN_GLM_BOI_W            "adapter.boi"
- #define TN_GLM_EOI_W            "adapter.eoi"
-
-
- enum projector_type {
-     PROJECTOR_TYPE_MLP,
-     PROJECTOR_TYPE_MLP_NORM,
-     PROJECTOR_TYPE_LDP,
-     PROJECTOR_TYPE_LDPV2,
-     PROJECTOR_TYPE_RESAMPLER,
-     PROJECTOR_TYPE_GLM_EDGE,
-     PROJECTOR_TYPE_MERGER,
-     PROJECTOR_TYPE_GEMMA3,
-     PROJECTOR_TYPE_UNKNOWN,
- };
-
- static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
-     { PROJECTOR_TYPE_MLP,       "mlp" },
-     { PROJECTOR_TYPE_LDP,       "ldp" },
-     { PROJECTOR_TYPE_LDPV2,     "ldpv2"},
-     { PROJECTOR_TYPE_RESAMPLER, "resampler"},
-     { PROJECTOR_TYPE_GLM_EDGE,  "adapter"},
-     { PROJECTOR_TYPE_MERGER,    "qwen2vl_merger"},
-     { PROJECTOR_TYPE_GEMMA3,    "gemma3"},
- };
-
-
- //
- // utilities to get data from a gguf file
- //
-
- static int get_key_idx(const gguf_context * ctx, const char * key) {
-     int i = gguf_find_key(ctx, key);
-     if (i == -1) {
-         LOG_ERR("key %s not found in file\n", key);
-         throw std::runtime_error(format("Missing required key: %s", key));
-     }
-
-     return i;
- }
-
- static uint32_t get_u32(const gguf_context * ctx, const std::string & key) {
-     const int i = get_key_idx(ctx, key.c_str());
-
-     return gguf_get_val_u32(ctx, i);
- }
-
- static float get_f32(const gguf_context * ctx, const std::string & key) {
-     const int i = get_key_idx(ctx, key.c_str());
-
-     return gguf_get_val_f32(ctx, i);
- }
-
- static struct ggml_tensor * get_tensor(struct ggml_context * ctx, const std::string & name) {
-     struct ggml_tensor * cur = ggml_get_tensor(ctx, name.c_str());
-     if (!cur) {
-         throw std::runtime_error(format("%s: unable to find tensor %s\n", __func__, name.c_str()));
-     }
-
-     return cur;
- }
-
- static std::string get_ftype(int ftype) {
-     return ggml_type_name(static_cast<ggml_type>(ftype));
- }
-
- static std::string gguf_data_to_str(enum gguf_type type, const void * data, int i) {
-     switch (type) {
-         case GGUF_TYPE_UINT8:   return std::to_string(((const uint8_t *)data)[i]);
-         case GGUF_TYPE_INT8:    return std::to_string(((const int8_t *)data)[i]);
-         case GGUF_TYPE_UINT16:  return std::to_string(((const uint16_t *)data)[i]);
-         case GGUF_TYPE_INT16:   return std::to_string(((const int16_t *)data)[i]);
-         case GGUF_TYPE_UINT32:  return std::to_string(((const uint32_t *)data)[i]);
-         case GGUF_TYPE_INT32:   return std::to_string(((const int32_t *)data)[i]);
-         case GGUF_TYPE_UINT64:  return std::to_string(((const uint64_t *)data)[i]);
-         case GGUF_TYPE_INT64:   return std::to_string(((const int64_t *)data)[i]);
-         case GGUF_TYPE_FLOAT32: return std::to_string(((const float *)data)[i]);
-         case GGUF_TYPE_FLOAT64: return std::to_string(((const double *)data)[i]);
-         case GGUF_TYPE_BOOL:    return ((const bool *)data)[i] ? "true" : "false";
-         default:                return format("unknown type %d", type);
-     }
- }
-
- static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
-     if (search.empty()) {
-         return;
-     }
-     std::string builder;
-     builder.reserve(s.length());
-     size_t pos = 0;
-     size_t last_pos = 0;
-     while ((pos = s.find(search, last_pos)) != std::string::npos) {
-         builder.append(s, last_pos, pos - last_pos);
-         builder.append(replace);
-         last_pos = pos + search.length();
-     }
-     builder.append(s, last_pos, std::string::npos);
-     s = std::move(builder);
- }
-
- static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) {
-     const enum gguf_type type = gguf_get_kv_type(ctx_gguf, i);
-
-     switch (type) {
-         case GGUF_TYPE_STRING:
-             return gguf_get_val_str(ctx_gguf, i);
-         case GGUF_TYPE_ARRAY:
-             {
-                 const enum gguf_type arr_type = gguf_get_arr_type(ctx_gguf, i);
-                 int arr_n = gguf_get_arr_n(ctx_gguf, i);
-                 const void * data = arr_type == GGUF_TYPE_STRING ? nullptr : gguf_get_arr_data(ctx_gguf, i);
-                 std::stringstream ss;
-                 ss << "[";
-                 for (int j = 0; j < arr_n; j++) {
-                     if (arr_type == GGUF_TYPE_STRING) {
-                         std::string val = gguf_get_arr_str(ctx_gguf, i, j);
-                         // escape quotes
-                         replace_all(val, "\\", "\\\\");
-                         replace_all(val, "\"", "\\\"");
-                         ss << '"' << val << '"';
-                     } else if (arr_type == GGUF_TYPE_ARRAY) {
-                         ss << "???";
-                     } else {
-                         ss << gguf_data_to_str(arr_type, data, j);
-                     }
-                     if (j < arr_n - 1) {
-                         ss << ", ";
-                     }
-                 }
-                 ss << "]";
-                 return ss.str();
-             }
-         default:
-             return gguf_data_to_str(type, gguf_get_val_data(ctx_gguf, i), 0);
-     }
- }
-
- static void print_tensor_info(const ggml_tensor * tensor, const char * prefix = "") {
-     size_t tensor_size = ggml_nbytes(tensor);
-     LOG_INF("%s: n_dims = %d, name = %s, tensor_size=%zu, shape:[%" PRId64 ", %" PRId64 ", %" PRId64 ", %" PRId64 "], type = %s\n",
-             prefix, ggml_n_dims(tensor), tensor->name, tensor_size,
-             tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3], ggml_type_name(tensor->type));
- }
-
- static projector_type clip_projector_type_from_string(const std::string & name) {
-     for (const auto & kv : PROJECTOR_TYPE_NAMES) { // NOLINT
-         if (kv.second == name) {
-             return kv.first;
-         }
-     }
-     throw std::runtime_error(format("Unknown projector type: %s", name.c_str()));
- }
-
- #ifdef CLIP_DEBUG_FUNCTIONS
- static void clip_image_write_image_to_ppm(const clip_image_u8& img, const std::string& filename) {
-     std::ofstream file(filename, std::ios::binary);
-     if (!file.is_open()) {
-         LOG_ERR("Failed to open file for writing: %s\n", filename.c_str());
-         return;
-     }
-
-     // PPM header: P6 format, width, height, and max color value
-     file << "P6\n" << img.nx << " " << img.ny << "\n255\n";
-
-     // Write pixel data
-     for (size_t i = 0; i < img.buf.size(); i += 3) {
-         // PPM expects binary data in RGB format, which matches our image buffer
-         file.write(reinterpret_cast<const char*>(&img.buf[i]), 3);
-     }
-
-     file.close();
- }
-
- static void clip_image_save_to_bmp(const clip_image_u8& img, const std::string& filename) {
-     std::ofstream file(filename, std::ios::binary);
-     if (!file.is_open()) {
-         LOG_ERR("Failed to open file for writing: %s\n", filename.c_str());
-         return;
-     }
-
-     int fileSize = 54 + 3 * img.nx * img.ny; // File header + info header + pixel data
-     int bytesPerPixel = 3;
-     int widthInBytes = img.nx * bytesPerPixel;
-     int paddingAmount = (4 - (widthInBytes % 4)) % 4;
-     int stride = widthInBytes + paddingAmount;
-
-     // Bitmap file header
-     unsigned char fileHeader[14] = {
-         'B','M',     // Signature
-         0,0,0,0,     // Image file size in bytes
-         0,0,0,0,     // Reserved
-         54,0,0,0     // Start of pixel array
-     };
-
-     // Total file size
-     fileSize = 54 + (stride * img.ny);
-     fileHeader[2] = (unsigned char)(fileSize);
-     fileHeader[3] = (unsigned char)(fileSize >> 8);
-     fileHeader[4] = (unsigned char)(fileSize >> 16);
-     fileHeader[5] = (unsigned char)(fileSize >> 24);
-
-     // Bitmap information header (BITMAPINFOHEADER)
-     unsigned char infoHeader[40] = {
-         40,0,0,0,    // Size of this header (40 bytes)
-         0,0,0,0,     // Image width
-         0,0,0,0,     // Image height
-         1,0,         // Number of color planes
-         24,0,        // Bits per pixel
-         0,0,0,0,     // No compression
-         0,0,0,0,     // Image size (can be 0 for no compression)
-         0,0,0,0,     // X pixels per meter (not specified)
-         0,0,0,0,     // Y pixels per meter (not specified)
-         0,0,0,0,     // Total colors (color table not used)
-         0,0,0,0      // Important colors (all are important)
-     };
-
-     // Width and height in the information header
-     infoHeader[4] = (unsigned char)(img.nx);
-     infoHeader[5] = (unsigned char)(img.nx >> 8);
-     infoHeader[6] = (unsigned char)(img.nx >> 16);
-     infoHeader[7] = (unsigned char)(img.nx >> 24);
-     infoHeader[8] = (unsigned char)(img.ny);
-     infoHeader[9] = (unsigned char)(img.ny >> 8);
-     infoHeader[10] = (unsigned char)(img.ny >> 16);
-     infoHeader[11] = (unsigned char)(img.ny >> 24);
-
-     // Write file headers
-     file.write(reinterpret_cast<char*>(fileHeader), sizeof(fileHeader));
-     file.write(reinterpret_cast<char*>(infoHeader), sizeof(infoHeader));
-
-     // Pixel data
-     std::vector<unsigned char> padding(3, 0); // Max padding size to be added to each row
-     for (int y = img.ny - 1; y >= 0; --y) { // BMP files are stored bottom-to-top
-         for (int x = 0; x < img.nx; ++x) {
-             // Each pixel
-             size_t pixelIndex = (y * img.nx + x) * 3;
-             unsigned char pixel[3] = {
-                 img.buf[pixelIndex + 2], // BMP stores pixels in BGR format
-                 img.buf[pixelIndex + 1],
-                 img.buf[pixelIndex]
-             };
-             file.write(reinterpret_cast<char*>(pixel), 3);
-         }
-         // Write padding for the row
-         file.write(reinterpret_cast<char*>(padding.data()), paddingAmount);
-     }
-
-     file.close();
- }
-
- // debug function to convert f32 to u8
- static void clip_image_convert_f32_to_u8(const clip_image_f32& src, clip_image_u8& dst) {
-     dst.nx = src.nx;
-     dst.ny = src.ny;
-     dst.buf.resize(3 * src.nx * src.ny);
-     for (size_t i = 0; i < src.buf.size(); ++i) {
-         dst.buf[i] = static_cast<uint8_t>(std::min(std::max(int(src.buf[i] * 255.0f), 0), 255));
-     }
- }
- #endif
-
-
- //
- // clip layers
- //
-
- struct clip_hparams {
-     int32_t image_size;
-     int32_t patch_size;
-     int32_t hidden_size;
-     int32_t n_intermediate;
-     int32_t projection_dim;
-     int32_t n_head;
-     int32_t n_layer;
-
-     float eps;
-
-     char mm_patch_merge_type[32] = "flat"; // spatial_unpad or flat (default)
-
-     std::vector<int32_t> image_grid_pinpoints;
-     int32_t image_crop_resolution;
-     std::unordered_set<int32_t> vision_feature_layer;
- };
-
- struct clip_layer {
-     // attention
-     struct ggml_tensor * k_w;
-     struct ggml_tensor * k_b;
-     struct ggml_tensor * q_w;
-     struct ggml_tensor * q_b;
-     struct ggml_tensor * v_w;
-     struct ggml_tensor * v_b;
-
-     struct ggml_tensor * o_w;
-     struct ggml_tensor * o_b;
-
-     // layernorm 1
-     struct ggml_tensor * ln_1_w;
-     struct ggml_tensor * ln_1_b;
-
-     // ff
-     struct ggml_tensor * ff_i_w;
-     struct ggml_tensor * ff_i_b;
-
-     struct ggml_tensor * ff_o_w;
-     struct ggml_tensor * ff_o_b;
-
-     // layernorm 2
-     struct ggml_tensor * ln_2_w;
-     struct ggml_tensor * ln_2_b;
- };
-
- struct clip_vision_model {
-     struct clip_hparams hparams;
-
-     // embeddings
-     struct ggml_tensor * class_embedding;
-     struct ggml_tensor * patch_embeddings_0;
-     struct ggml_tensor * patch_embeddings_1; // second Conv2D kernel when we decouple Conv3D along temporal dimension (Qwen2VL)
-     struct ggml_tensor * patch_bias;
-     struct ggml_tensor * position_embeddings;
-
-     struct ggml_tensor * pre_ln_w;
-     struct ggml_tensor * pre_ln_b;
-
-     std::vector<clip_layer> layers;
-
-     struct ggml_tensor * post_ln_w;
-     struct ggml_tensor * post_ln_b;
-
-     struct ggml_tensor * projection;
-
-     // LLaVA projection
-     struct ggml_tensor * mm_0_w = NULL;
-     struct ggml_tensor * mm_0_b = NULL;
-     struct ggml_tensor * mm_2_w = NULL;
-     struct ggml_tensor * mm_2_b = NULL;
-
-     struct ggml_tensor * image_newline = NULL;
-
-     // Yi type models with mlp+normalization projection
-     struct ggml_tensor * mm_1_w = NULL; // Yi type models have 0, 1, 3, 4
-     struct ggml_tensor * mm_1_b = NULL;
-     struct ggml_tensor * mm_3_w = NULL;
-     struct ggml_tensor * mm_3_b = NULL;
-     struct ggml_tensor * mm_4_w = NULL;
-     struct ggml_tensor * mm_4_b = NULL;
-
-     // GLMV-Edge projection
-     struct ggml_tensor * mm_model_adapter_conv_w;
-     struct ggml_tensor * mm_model_adapter_conv_b;
-     struct ggml_tensor * boi_w;
-     struct ggml_tensor * eoi_w;
-
-     // MobileVLM projection
-     struct ggml_tensor * mm_model_mlp_1_w;
-     struct ggml_tensor * mm_model_mlp_1_b;
-     struct ggml_tensor * mm_model_mlp_3_w;
-     struct ggml_tensor * mm_model_mlp_3_b;
-     struct ggml_tensor * mm_model_block_1_block_0_0_w;
-     struct ggml_tensor * mm_model_block_1_block_0_1_w;
-     struct ggml_tensor * mm_model_block_1_block_0_1_b;
-     struct ggml_tensor * mm_model_block_1_block_1_fc1_w;
-     struct ggml_tensor * mm_model_block_1_block_1_fc1_b;
-     struct ggml_tensor * mm_model_block_1_block_1_fc2_w;
-     struct ggml_tensor * mm_model_block_1_block_1_fc2_b;
-     struct ggml_tensor * mm_model_block_1_block_2_0_w;
-     struct ggml_tensor * mm_model_block_1_block_2_1_w;
-     struct ggml_tensor * mm_model_block_1_block_2_1_b;
-     struct ggml_tensor * mm_model_block_2_block_0_0_w;
-     struct ggml_tensor * mm_model_block_2_block_0_1_w;
-     struct ggml_tensor * mm_model_block_2_block_0_1_b;
-     struct ggml_tensor * mm_model_block_2_block_1_fc1_w;
-     struct ggml_tensor * mm_model_block_2_block_1_fc1_b;
-     struct ggml_tensor * mm_model_block_2_block_1_fc2_w;
-     struct ggml_tensor * mm_model_block_2_block_1_fc2_b;
-     struct ggml_tensor * mm_model_block_2_block_2_0_w;
-     struct ggml_tensor * mm_model_block_2_block_2_1_w;
-     struct ggml_tensor * mm_model_block_2_block_2_1_b;
-
-     // MobileVLM_V2 projection
-     struct ggml_tensor * mm_model_mlp_0_w;
-     struct ggml_tensor * mm_model_mlp_0_b;
-     struct ggml_tensor * mm_model_mlp_2_w;
-     struct ggml_tensor * mm_model_mlp_2_b;
-     struct ggml_tensor * mm_model_peg_0_w;
-     struct ggml_tensor * mm_model_peg_0_b;
-
-     // MINICPMV projection
-     struct ggml_tensor * mm_model_pos_embed_k;
-     struct ggml_tensor * mm_model_query;
-     struct ggml_tensor * mm_model_proj;
-     struct ggml_tensor * mm_model_kv_proj;
-     struct ggml_tensor * mm_model_attn_q_w;
-     struct ggml_tensor * mm_model_attn_q_b;
-     struct ggml_tensor * mm_model_attn_k_w;
-     struct ggml_tensor * mm_model_attn_k_b;
-     struct ggml_tensor * mm_model_attn_v_w;
-     struct ggml_tensor * mm_model_attn_v_b;
-     struct ggml_tensor * mm_model_attn_o_w;
-     struct ggml_tensor * mm_model_attn_o_b;
-     struct ggml_tensor * mm_model_ln_q_w;
-     struct ggml_tensor * mm_model_ln_q_b;
-     struct ggml_tensor * mm_model_ln_kv_w;
-     struct ggml_tensor * mm_model_ln_kv_b;
-     struct ggml_tensor * mm_model_ln_post_w;
-     struct ggml_tensor * mm_model_ln_post_b;
-
-     // gemma3
-     struct ggml_tensor * mm_input_proj_w;
-     struct ggml_tensor * mm_soft_emb_norm_w;
- };
-
- struct clip_ctx {
-     bool has_text_encoder = false;
-     bool has_vision_encoder = false;
-     bool has_llava_projector = false;
-     bool has_minicpmv_projector = false;
-     bool has_glm_projector = false;
-     bool has_qwen2vl_merger = false;
-     int minicpmv_version = 2;
-
-     struct clip_vision_model vision_model;
-     projector_type proj_type = PROJECTOR_TYPE_MLP;
-
-     int32_t max_feature_layer; // unused in newer models like gemma3
-     float image_mean[3];
-     float image_std[3];
-     bool use_gelu = false;
-     bool use_silu = false;
-     int32_t ftype = 1;
-
-     bool has_class_embedding = true;
-     bool has_pre_norm = true;
-     bool has_post_norm = false;
-     bool has_patch_bias = false;
-
-     struct gguf_context * ctx_gguf = nullptr;
-     struct ggml_context * ctx_data = nullptr;
-
-     std::vector<uint8_t> buf_compute_meta;
-
-     std::vector<ggml_backend_t> backend_ptrs;
-     std::vector<ggml_backend_buffer_type_t> backend_buft;
-
-     ggml_backend_t backend = nullptr;
-     ggml_backend_t backend_cpu = nullptr;
-     ggml_backend_buffer_t buf = nullptr;
-
-     ggml_backend_sched_ptr sched;
-
-     struct clip_image_size * load_image_size = nullptr;
-
-     clip_ctx(clip_context_params & ctx_params) {
-         backend_cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr);
-         backend = ctx_params.use_gpu
-                     ? ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_GPU, nullptr)
-                     : nullptr;
-
-         if (backend) {
-             LOG_INF("%s: CLIP using %s backend\n", __func__, ggml_backend_name(backend));
-             backend_ptrs.push_back(backend);
-             backend_buft.push_back(ggml_backend_get_default_buffer_type(backend));
-         } else {
-             backend = backend_cpu;
-             LOG_INF("%s: CLIP using CPU backend\n", __func__);
-         }
-
-         backend_ptrs.push_back(backend_cpu);
-         backend_buft.push_back(ggml_backend_get_default_buffer_type(backend_cpu));
-
-         sched.reset(
-             ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), 8192, false)
-         );
-     }
-
-     ~clip_ctx() {
-         ggml_free(ctx_data);
-         gguf_free(ctx_gguf);
-         ggml_backend_buffer_free(buf);
-         ggml_backend_free(backend);
-         if (backend_cpu != backend) {
-             ggml_backend_free(backend_cpu);
-         }
-     }
- };
-
- static ggml_cgraph * clip_image_build_graph_siglip(clip_ctx * ctx, const clip_image_f32_batch * imgs) {
-     const auto & model = ctx->vision_model;
-     const auto & hparams = model.hparams;
-
-     const int image_size = hparams.image_size;
-     int image_size_width = image_size;
-     int image_size_height = image_size;
-
-     const int patch_size = hparams.patch_size;
-     const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size));
-     const int hidden_size = hparams.hidden_size;
-     const int n_head = hparams.n_head;
-     const int d_head = hidden_size / n_head;
-     const int n_layer = hparams.n_layer;
-     const float eps = hparams.eps;
-
-     GGML_ASSERT(imgs->size == 1); // batch_size == 1
-
-     struct ggml_init_params params = {
-         /*.mem_size   =*/ ctx->buf_compute_meta.size(),
-         /*.mem_buffer =*/ ctx->buf_compute_meta.data(),
-         /*.no_alloc   =*/ true,
-     };
-
-     struct ggml_context * ctx0 = ggml_init(params);
-     struct ggml_cgraph * gf = ggml_new_graph(ctx0);
-
-     // input raw
-     struct ggml_tensor * inp_raw = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, image_size_width, image_size_height, 3);
-     ggml_set_name(inp_raw, "inp_raw");
-     ggml_set_input(inp_raw);
-
-     struct ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
-     inp = ggml_reshape_2d(ctx0, inp, num_patches, hidden_size);
-     inp = ggml_cont(ctx0, ggml_transpose(ctx0, inp));
-     inp = ggml_add(ctx0, inp, model.patch_bias);
-
-     // position embeddings
-     struct ggml_tensor * embeddings = ggml_add(ctx0, inp, model.position_embeddings);
-
-     // loop over layers
-     for (int il = 0; il < n_layer; il++) {
-         struct ggml_tensor * cur = embeddings; // embeddings = residual, cur = hidden_states
-
-         // layernorm1
-         {
-             cur = ggml_norm(ctx0, cur, eps);
-             cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].ln_1_w), model.layers[il].ln_1_b);
-         }
-
-         // self-attention
-         {
-
-             struct ggml_tensor * Q =
-                 ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].q_w, cur), model.layers[il].q_b);
-
-             Q = ggml_reshape_3d(ctx0, Q, d_head, n_head, num_patches);
-             Q = ggml_cont(ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3));
-
-             struct ggml_tensor * K =
-                 ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].k_w, cur), model.layers[il].k_b);
-
-             K = ggml_reshape_3d(ctx0, K, d_head, n_head, num_patches);
-             K = ggml_cont(ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3));
-
-             struct ggml_tensor * V =
-                 ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].v_w, cur), model.layers[il].v_b);
-
-             V = ggml_reshape_3d(ctx0, V, d_head, n_head, num_patches);
-             V = ggml_cont(ctx0, ggml_permute(ctx0, V, 1, 2, 0, 3));
-
-             struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
-             KQ = ggml_scale_inplace(ctx0, KQ, 1.0f / sqrtf((float)d_head));
-             KQ = ggml_soft_max_inplace(ctx0, KQ);
-
-             struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ);
-             KQV = ggml_reshape_3d(ctx0, KQV, d_head, num_patches, n_head);
-             KQV = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
-
-             cur = ggml_cont_2d(ctx0, KQV, hidden_size, num_patches);
-         }
-
-         // attention output
-         cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].o_w, cur), model.layers[il].o_b);
-
-         // re-add the layer input, e.g., residual
-         cur = ggml_add(ctx0, cur, embeddings);
-
-         embeddings = cur; // embeddings = residual, cur = hidden_states
-
-         // layernorm2
-         {
-             cur = ggml_norm(ctx0, cur, eps);
-             cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].ln_2_w), model.layers[il].ln_2_b);
-         }
-
-         cur = ggml_mul_mat(ctx0, model.layers[il].ff_i_w, cur);
-         cur = ggml_add(ctx0, cur, model.layers[il].ff_i_b);
-
-         // siglip uses gelu
-         cur = ggml_gelu(ctx0, cur);
-
-         cur = ggml_mul_mat(ctx0, model.layers[il].ff_o_w, cur);
-         cur = ggml_add(ctx0, cur, model.layers[il].ff_o_b);
-
-         // residual 2
-         cur = ggml_add(ctx0, embeddings, cur);
-
-         embeddings = cur;
-     }
-
-     // post-layernorm
-     if (ctx->has_post_norm) {
-         embeddings = ggml_norm(ctx0, embeddings, eps);
-         ggml_set_name(embeddings, "post_ln");
-
-         embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.post_ln_w), model.post_ln_b);
-     }
-
-     if (ctx->proj_type == PROJECTOR_TYPE_GEMMA3) {
-         const int batch_size = 1;
-         const int mm_tokens_per_image = 256; // default value for gemma3
-         const int tokens_per_side = sqrt(mm_tokens_per_image);
-         const int patches_per_image = sqrt(num_patches);
-         const int kernel_size = patches_per_image / tokens_per_side;
-
-         embeddings = ggml_cont(ctx0, ggml_transpose(ctx0, embeddings));
-         embeddings = ggml_reshape_4d(ctx0, embeddings, patches_per_image, patches_per_image, hidden_size, batch_size);
-
-         // doing a pool2d to reduce the number of output tokens to 256
-         embeddings = ggml_pool_2d(ctx0, embeddings, GGML_OP_POOL_AVG, kernel_size, kernel_size, kernel_size, kernel_size, 0, 0);
-         embeddings = ggml_reshape_3d(ctx0, embeddings, embeddings->ne[0] * embeddings->ne[0], hidden_size, batch_size);
-         embeddings = ggml_cont(ctx0, ggml_transpose(ctx0, embeddings));
-
-         // apply norm before projection
-         embeddings = ggml_rms_norm(ctx0, embeddings, eps);
-         embeddings = ggml_mul(ctx0, embeddings, model.mm_soft_emb_norm_w);
-
-         // apply projection
-         embeddings = ggml_mul_mat(ctx0,
-             ggml_cont(ctx0, ggml_transpose(ctx0, model.mm_input_proj_w)),
-             embeddings);
-     }
-
-     // build the graph
-     ggml_build_forward_expand(gf, embeddings);
-
-     ggml_free(ctx0);
-
-     return gf;
- }
-
- static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_image_f32_batch * imgs, struct clip_image_size * load_image_size, bool is_inf = false) {
-     if (!ctx->has_vision_encoder) {
-         LOG_ERR("This gguf file seems to have no vision encoder\n");
-         return nullptr;
-     }
-
-     const auto & model = ctx->vision_model;
-     const auto & hparams = model.hparams;
-
-     const int image_size = hparams.image_size;
-     int image_size_width = image_size;
-     int image_size_height = image_size;
-     if (ctx->has_minicpmv_projector) {
-         if (load_image_size == nullptr) {
-             load_image_size = clip_image_size_init();
-         }
-         LOG_DBG("%s: %d %d\n", __func__, load_image_size->width, load_image_size->height);
-         image_size_width = load_image_size->width;
-         image_size_height = load_image_size->height;
-         if (is_inf) {
-             image_size_width = imgs->data->nx;
-             image_size_height = imgs->data->ny;
-         }
-     }
-     else if (ctx->has_qwen2vl_merger) {
-         // use the image's native resolution when the image is available
-         if (is_inf) {
-             // if (imgs->data->nx && imgs->data->ny) {
-             image_size_width = imgs->data->nx;
-             image_size_height = imgs->data->ny;
-         }
-     }
-     const int patch_size = hparams.patch_size;
-     const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size));
-     const int patches_w = image_size_width / patch_size;
-     const int patches_h = image_size_height / patch_size;
-     const int num_positions = num_patches + (ctx->has_class_embedding ? 1 : 0);
-     const int num_position_ids = ctx->has_qwen2vl_merger ? num_positions * 4 : num_positions;
-     const int hidden_size = hparams.hidden_size;
-     const int n_head = hparams.n_head;
-     const int d_head = hidden_size / n_head;
-     const float eps = hparams.eps;
-     int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4};
-
-     const int batch_size = imgs->size;
-
-     if (ctx->has_llava_projector || ctx->has_minicpmv_projector || ctx->has_glm_projector) {
-         GGML_ASSERT(batch_size == 1);
-     }
-
-     struct ggml_init_params params = {
-         /*.mem_size   =*/ ctx->buf_compute_meta.size(),
-         /*.mem_buffer =*/ ctx->buf_compute_meta.data(),
-         /*.no_alloc   =*/ true,
-     };
-
-     struct ggml_context * ctx0 = ggml_init(params);
-     struct ggml_cgraph * gf = ggml_new_graph(ctx0);
-
-     struct ggml_tensor * inp_raw = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, image_size_width, image_size_height, 3, batch_size);
-     ggml_set_name(inp_raw, "inp_raw");
-     ggml_set_input(inp_raw);
-
-     struct ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
-
-     if (ctx->has_qwen2vl_merger) {
-         GGML_ASSERT(image_size_width % (patch_size * 2) == 0);
-         GGML_ASSERT(image_size_height % (patch_size * 2) == 0);
-
-         auto inp_1 = ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
-         inp = ggml_add(ctx0, inp, inp_1);
-         inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 2, 0, 3)); // [w, h, c, b] -> [c, w, h, b]
-         inp = ggml_reshape_4d(
-             ctx0, inp,
-             hidden_size * 2, patches_w / 2, patches_h, batch_size);
-         inp = ggml_reshape_4d(
-             ctx0, inp,
-             hidden_size * 2, patches_w / 2, 2, batch_size * (patches_h / 2));
-         inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 0, 2, 1, 3));
-         inp = ggml_reshape_3d(
-             ctx0, inp,
-             hidden_size, patches_w * patches_h, batch_size);
-     }
-     else {
-         inp = ggml_reshape_3d(ctx0, inp, num_patches, hidden_size, batch_size);
-         inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 0, 2, 3));
-     }
-
-     if (ctx->has_patch_bias) {
-         // inp = ggml_add(ctx0, inp, ggml_repeat(ctx0, model.patch_bias, inp));
-         inp = ggml_add(ctx0, inp, model.patch_bias);
-     }
-     struct ggml_tensor * embeddings = inp;
-     struct ggml_tensor * pos_embed = nullptr;
-
-     if (ctx->has_llava_projector) {
-         // concat class_embeddings and patch_embeddings
-         if (ctx->has_class_embedding) {
-             embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size);
-             ggml_set_name(embeddings, "embeddings");
-             ggml_set_input(embeddings);
-             embeddings = ggml_acc(ctx0, embeddings, model.class_embedding,
-                     embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0);
-             embeddings = ggml_acc(ctx0, embeddings, inp,
-                     embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]);
-         }
-     }
-
-     struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_position_ids);
-     ggml_set_name(positions, "positions");
-     ggml_set_input(positions);
-
-     if (!ctx->has_qwen2vl_merger) { // qwen2vl uses rope position embedding
-         embeddings =
-             ggml_add(ctx0, embeddings, ggml_get_rows(ctx0, model.position_embeddings, positions));
-     }
-
-     if (ctx->has_minicpmv_projector) {
-         int pos_w = image_size_width/patch_size;
-         int pos_h = image_size_height/patch_size;
-         if (ctx->minicpmv_version == 2) {
-             pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 4096, pos_w * pos_h, 1);
-         }
-         else if (ctx->minicpmv_version == 3) {
-             pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 3584, pos_w * pos_h, 1);
-         }
-         else if (ctx->minicpmv_version == 4) {
-             pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 3584, pos_w * pos_h, 1);
-         }
-         ggml_set_name(pos_embed, "pos_embed");
-         ggml_set_input(pos_embed);
-     }
-
-     // pre-layernorm
-     if (ctx->has_pre_norm) {
-         embeddings = ggml_norm(ctx0, embeddings, eps);
-         ggml_set_name(embeddings, "pre_ln");
-
-         embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.pre_ln_w), model.pre_ln_b);
-     }
-
-     std::vector<struct ggml_tensor *> embedding_stack;
-     const auto & vision_feature_layer = hparams.vision_feature_layer;
-
-     // loop over layers
-     for (int il = 0; il < ctx->max_feature_layer; il++) {
-         struct ggml_tensor * cur = embeddings; // embeddings = residual, cur = hidden_states
-
-         // If this is an embedding feature layer, save the output.
-         // NOTE: 0 index here refers to the input to the encoder.
-         if (vision_feature_layer.find(il) != vision_feature_layer.end()) {
-             embedding_stack.push_back(embeddings);
-         }
-
-         //const size_t nb_q_w = model.layers[il].q_w->nb[0];
-
-         // layernorm1
-         {
-             cur = ggml_norm(ctx0, cur, eps);
-
-             cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].ln_1_w),
-                     model.layers[il].ln_1_b);
-         }
-
-         // self-attention
-         {
-
-             struct ggml_tensor * Q =
-                 ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].q_w, cur), model.layers[il].q_b);
-
-             Q = ggml_reshape_4d(ctx0, Q, d_head, n_head, num_positions, batch_size);
-             if (ctx->has_qwen2vl_merger) {
-                 Q = ggml_rope_multi(
-                     ctx0, Q, positions, nullptr,
-                     d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1);
-             }
-             Q = ggml_scale_inplace(ctx0, Q, 1.0f / sqrt((float)d_head));
-             Q = ggml_cont(ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3));
-             Q = ggml_reshape_3d(ctx0, Q, d_head, num_positions, n_head * batch_size);
-
-             struct ggml_tensor * K =
-                 ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].k_w, cur), model.layers[il].k_b);
-
-             K = ggml_reshape_4d(ctx0, K, d_head, n_head, num_positions, batch_size);
-             if (ctx->has_qwen2vl_merger) {
-                 K = ggml_rope_multi(
-                     ctx0, K, positions, nullptr,
-                     d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1);
-             }
-             K = ggml_cont(ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3));
-             K = ggml_reshape_3d(ctx0, K, d_head, num_positions, n_head * batch_size);
-
-             struct ggml_tensor * V =
-                 ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].v_w, cur), model.layers[il].v_b);
-
-             V = ggml_reshape_4d(ctx0, V, d_head, n_head, num_positions, batch_size);
-             V = ggml_cont(ctx0, ggml_permute(ctx0, V, 1, 2, 0, 3));
-             V = ggml_reshape_3d(ctx0, V, num_positions, d_head, n_head * batch_size);
-
-             struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
-             KQ = ggml_soft_max_inplace(ctx0, KQ);
-             struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ);
-             KQV = ggml_reshape_4d(ctx0, KQV, d_head, num_positions, n_head, batch_size);
-             KQV = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
-
-             cur = ggml_cont_3d(ctx0, KQV, hidden_size, num_positions, batch_size);
-         }
1001
-
1002
- // attention output
1003
- cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].o_w, cur), model.layers[il].o_b);
1004
-
1005
- // re-add the layer input, e.g., residual
1006
- cur = ggml_add(ctx0, cur, embeddings);
1007
-
1008
- embeddings = cur; // embeddings = residual, cur = hidden_states
1009
-
1010
- // layernorm2
1011
- {
1012
- cur = ggml_norm(ctx0, cur, eps);
1013
-
1014
- cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].ln_2_w), model.layers[il].ln_2_b);
1015
- }
1016
-
1017
- cur = ggml_mul_mat(ctx0, model.layers[il].ff_i_w, cur);
1018
- cur = ggml_add(ctx0, cur, model.layers[il].ff_i_b);
1019
-
1020
- if (ctx->use_gelu) {
1021
- cur = ggml_gelu_inplace(ctx0, cur);
1022
- } else if (ctx->use_silu) {
1023
- cur = ggml_silu_inplace(ctx0, cur);
1024
- } else {
1025
- cur = ggml_gelu_quick_inplace(ctx0, cur);
1026
- }
1027
-
1028
- cur = ggml_mul_mat(ctx0, model.layers[il].ff_o_w, cur);
1029
- cur = ggml_add(ctx0, cur, model.layers[il].ff_o_b);
1030
-
1031
- // residual 2
1032
- cur = ggml_add(ctx0, embeddings, cur);
1033
-
1034
- embeddings = cur;
1035
- }
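The reshape/permute sequence in the attention block above is standard multi-head scaled dot-product attention; per head, with $N$ = num_positions and $d$ = d_head, it computes $\mathrm{softmax}(QK^{\top}/\sqrt{d})\,V$. Note that the $1/\sqrt{d}$ scale is folded into $Q$ via ggml_scale_inplace before the K·Q matmul, rather than being applied to the attention logits afterwards.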
-
- // post-layernorm
- if (ctx->has_post_norm) {
- embeddings = ggml_norm(ctx0, embeddings, eps);
- ggml_set_name(embeddings, "post_ln");
-
- embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.post_ln_w), model.post_ln_b);
- }
-
- // final layer is a vision feature layer
- if (vision_feature_layer.find(ctx->max_feature_layer) != vision_feature_layer.end()) {
- embedding_stack.push_back(embeddings);
- }
-
- // If feature layers are explicitly set, stack them (if we have multiple)
- if (!embedding_stack.empty()) {
- embeddings = embedding_stack[0];
- for (size_t i = 1; i < embedding_stack.size(); i++) {
- embeddings = ggml_concat(ctx0, embeddings, embedding_stack[i], 0);
- }
- }
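When several feature layers are selected, the stack is concatenated along ggml's innermost dimension (ne[0], the embedding axis), so with $k$ selected layers each patch embedding grows to $k \cdot \mathrm{hidden\_size}$ features: $h = [\,h_{l_1};\, h_{l_2};\, \dots;\, h_{l_k}\,]$.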
-
- // llava projector
- if (ctx->has_llava_projector) {
- embeddings = ggml_reshape_2d(ctx0, embeddings, embeddings->ne[0], embeddings->ne[1]);
-
- struct ggml_tensor * patches = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_patches);
- ggml_set_name(patches, "patches");
- ggml_set_input(patches);
-
- // shape [1, 576, 1024]
- // ne is whcn, ne = [1024, 576, 1, 1]
- embeddings = ggml_get_rows(ctx0, embeddings, patches);
-
- // print_tensor_info(embeddings, "embeddings");
-
- // llava projector
- if (ctx->proj_type == PROJECTOR_TYPE_MLP) {
- embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
- embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);
-
- embeddings = ggml_gelu(ctx0, embeddings);
- embeddings = ggml_mul_mat(ctx0, model.mm_2_w, embeddings);
- embeddings = ggml_add(ctx0, embeddings, model.mm_2_b);
- }
- else if (ctx->proj_type == PROJECTOR_TYPE_MLP_NORM) {
- embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
- embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);
- // ggml_tensor_printf(embeddings, "mm_0_w",0,true,false);
- // First LayerNorm
- embeddings = ggml_norm(ctx0, embeddings, eps);
- embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_1_w),
- model.mm_1_b);
-
- // GELU activation
- embeddings = ggml_gelu(ctx0, embeddings);
-
- // Second linear layer
- embeddings = ggml_mul_mat(ctx0, model.mm_3_w, embeddings);
- embeddings = ggml_add(ctx0, embeddings, model.mm_3_b);
-
- // Second LayerNorm
- embeddings = ggml_norm(ctx0, embeddings, eps);
- embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_4_w),
- model.mm_4_b);
- }
- else if (ctx->proj_type == PROJECTOR_TYPE_LDP) {
- // MobileVLM projector
- int n_patch = 24;
- struct ggml_tensor * mlp_1 = ggml_mul_mat(ctx0, model.mm_model_mlp_1_w, embeddings);
- mlp_1 = ggml_add(ctx0, mlp_1, model.mm_model_mlp_1_b);
- mlp_1 = ggml_gelu(ctx0, mlp_1);
- struct ggml_tensor * mlp_3 = ggml_mul_mat(ctx0, model.mm_model_mlp_3_w, mlp_1);
- mlp_3 = ggml_add(ctx0, mlp_3, model.mm_model_mlp_3_b);
- // mlp_3 shape = [1, 576, 2048], ne = [2048, 576, 1, 1]
-
- // block 1
- struct ggml_tensor * block_1 = nullptr;
- {
- // transpose from [1, 576, 2048] --> [1, 2048, 576] --> [1, 2048, 24, 24]
- mlp_3 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_3, 1, 0, 2, 3));
- mlp_3 = ggml_reshape_4d(ctx0, mlp_3, n_patch, n_patch, mlp_3->ne[1], mlp_3->ne[2]);
- // stride = 1, padding = 1, bias is nullptr
- block_1 = ggml_conv_2d_dw(ctx0, model.mm_model_block_1_block_0_0_w, mlp_3, 1, 1, 1, 1, 1, 1);
-
- // layer norm
- // block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1]
- block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 2, 0, 3));
- // block_1 shape = [1, 24, 24, 2048], ne = [2048, 24, 24, 1]
- block_1 = ggml_norm(ctx0, block_1, eps);
- block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_1_block_0_1_w), model.mm_model_block_1_block_0_1_b);
- block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3));
-
- // block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1]
- // hardswish
- struct ggml_tensor * block_1_hw = ggml_hardswish(ctx0, block_1);
-
- block_1 = ggml_pool_2d(ctx0, block_1_hw, GGML_OP_POOL_AVG, block_1_hw->ne[0], block_1_hw->ne[1], block_1_hw->ne[0], block_1_hw->ne[1], 0, 0);
- // block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1]
- // pointwise conv
- block_1 = ggml_reshape_2d(ctx0, block_1, block_1->ne[0]*block_1->ne[1]*block_1->ne[2], block_1->ne[3]);
- block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_1_fc1_w, block_1);
- block_1 = ggml_add(ctx0, block_1, model.mm_model_block_1_block_1_fc1_b);
- block_1 = ggml_relu(ctx0, block_1);
- block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_1_fc2_w, block_1);
- block_1 = ggml_add(ctx0, block_1, model.mm_model_block_1_block_1_fc2_b);
- block_1 = ggml_hardsigmoid(ctx0, block_1);
- // block_1_hw shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1], block_1 shape = [1, 2048], ne = [2048, 1, 1, 1]
- block_1 = ggml_reshape_4d(ctx0, block_1, 1, 1, block_1->ne[0], block_1->ne[1]);
- block_1 = ggml_mul(ctx0, block_1_hw, block_1);
-
- int w = block_1->ne[0], h = block_1->ne[1];
- block_1 = ggml_reshape_3d(ctx0, block_1, w*h, block_1->ne[2], block_1->ne[3]);
- block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 0, 2, 3));
-
- // block_1 shape = [1, 24*24, 2048], ne = [24*24, 2048, 1]
- block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_2_0_w, block_1);
- block_1 = ggml_reshape_4d(ctx0, block_1, block_1->ne[0], w, h, block_1->ne[3]);
-
- // block_1 shape = [1, 24, 24, 2048], ne = [2048, 24, 24, 1]
- block_1 = ggml_norm(ctx0, block_1, eps);
- block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_1_block_2_1_w), model.mm_model_block_1_block_2_1_b);
- block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3));
- // block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1]
- // residual
- block_1 = ggml_add(ctx0, mlp_3, block_1);
- }
-
- // block_2
- {
- // stride = 2
- block_1 = ggml_conv_2d_dw(ctx0, model.mm_model_block_2_block_0_0_w, block_1, 2, 2, 1, 1, 1, 1);
-
- // block_1 shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1]
- // layer norm
- block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 2, 0, 3));
- // block_1 shape = [1, 12, 12, 2048], ne = [2048, 12, 12, 1]
- block_1 = ggml_norm(ctx0, block_1, eps);
- block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_2_block_0_1_w), model.mm_model_block_2_block_0_1_b);
- block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3));
- // block_1 shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1]
- // hardswish
- struct ggml_tensor * block_1_hw = ggml_hardswish(ctx0, block_1);
-
- // not sure the parameters are right for globalAvgPooling
- block_1 = ggml_pool_2d(ctx0, block_1_hw, GGML_OP_POOL_AVG, block_1_hw->ne[0], block_1_hw->ne[1], block_1_hw->ne[0], block_1_hw->ne[1], 0, 0);
- // block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1]
- // pointwise conv
- block_1 = ggml_reshape_2d(ctx0, block_1, block_1->ne[0]*block_1->ne[1]*block_1->ne[2], block_1->ne[3]);
- block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_1_fc1_w, block_1);
- block_1 = ggml_add(ctx0, block_1, model.mm_model_block_2_block_1_fc1_b);
- block_1 = ggml_relu(ctx0, block_1);
- block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_1_fc2_w, block_1);
- block_1 = ggml_add(ctx0, block_1, model.mm_model_block_2_block_1_fc2_b);
- block_1 = ggml_hardsigmoid(ctx0, block_1);
-
- // block_1_hw shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1], block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1]
- block_1 = ggml_reshape_4d(ctx0, block_1, 1, 1, block_1->ne[0], block_1->ne[1]);
- block_1 = ggml_mul(ctx0, block_1_hw, block_1);
-
- int w = block_1->ne[0], h = block_1->ne[1];
- block_1 = ggml_reshape_3d(ctx0, block_1, w*h, block_1->ne[2], block_1->ne[3]);
- block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 0, 2, 3));
- // block_1 shape = [1, 12*12, 2048], ne = [12*12, 2048, 1]
- block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_2_0_w, block_1);
- block_1 = ggml_reshape_4d(ctx0, block_1, block_1->ne[0], w, h, block_1->ne[3]);
-
-
- // block_1 shape = [1, 12, 12, 2048], ne = [2048, 12, 12, 1]
- block_1 = ggml_norm(ctx0, block_1, eps);
- block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_2_block_2_1_w), model.mm_model_block_2_block_2_1_b);
- block_1 = ggml_reshape_3d(ctx0, block_1, block_1->ne[0], block_1->ne[1] * block_1->ne[2], block_1->ne[3]);
- // block_1 shape = [1, 144, 2048], ne = [2048, 144, 1]
- }
- embeddings = block_1;
- }
- else if (ctx->proj_type == PROJECTOR_TYPE_LDPV2)
- {
- int n_patch = 24;
- struct ggml_tensor * mlp_0 = ggml_mul_mat(ctx0, model.mm_model_mlp_0_w, embeddings);
- mlp_0 = ggml_add(ctx0, mlp_0, model.mm_model_mlp_0_b);
- mlp_0 = ggml_gelu(ctx0, mlp_0);
- struct ggml_tensor * mlp_2 = ggml_mul_mat(ctx0, model.mm_model_mlp_2_w, mlp_0);
- mlp_2 = ggml_add(ctx0, mlp_2, model.mm_model_mlp_2_b);
- // mlp_2 ne = [2048, 576, 1, 1]
- // AVG Pool Layer 2*2, strides = 2
- mlp_2 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_2, 1, 0, 2, 3));
- // mlp_2 ne = [576, 2048, 1, 1]
- mlp_2 = ggml_reshape_4d(ctx0, mlp_2, n_patch, n_patch, mlp_2->ne[1], mlp_2->ne[2]);
- // mlp_2 ne = [24, 24, 2048, 1]
- mlp_2 = ggml_pool_2d(ctx0, mlp_2, GGML_OP_POOL_AVG, 2, 2, 2, 2, 0, 0);
- // weight ne = [3, 3, 2048, 1]
- struct ggml_tensor * peg_0 = ggml_conv_2d_dw(ctx0, model.mm_model_peg_0_w, mlp_2, 1, 1, 1, 1, 1, 1);
- peg_0 = ggml_cont(ctx0, ggml_permute(ctx0, peg_0, 1, 2, 0, 3));
- peg_0 = ggml_add(ctx0, peg_0, model.mm_model_peg_0_b);
- mlp_2 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_2, 1, 2, 0, 3));
- peg_0 = ggml_add(ctx0, peg_0, mlp_2);
- peg_0 = ggml_reshape_3d(ctx0, peg_0, peg_0->ne[0], peg_0->ne[1] * peg_0->ne[2], peg_0->ne[3]);
- embeddings = peg_0;
- }
- else {
- GGML_ABORT("fatal error");
- }
- }
- // minicpmv projector
- else if (ctx->has_minicpmv_projector)
- {
- if (ctx->proj_type == PROJECTOR_TYPE_RESAMPLER) {
- struct ggml_tensor * q = model.mm_model_query;
- { // layernorm
- q = ggml_norm(ctx0, q, eps);
- q = ggml_add(ctx0, ggml_mul(ctx0, q, model.mm_model_ln_q_w), model.mm_model_ln_q_b);
- }
- struct ggml_tensor * v = ggml_mul_mat(ctx0, model.mm_model_kv_proj, embeddings);
- { // layernorm
- v = ggml_norm(ctx0, v, eps);
- v = ggml_add(ctx0, ggml_mul(ctx0, v, model.mm_model_ln_kv_w), model.mm_model_ln_kv_b);
- }
- struct ggml_tensor * k;
- { // position
- // q = ggml_add(ctx0, q, model.mm_model_pos_embed);
- k = ggml_add(ctx0, v, pos_embed);
- }
-
- { // attention
- int hidden_size = 4096;
- const int d_head = 128;
- int n_head = hidden_size/d_head;
- int num_query = 96;
- if (ctx->minicpmv_version == 2) {
- hidden_size = 4096;
- n_head = hidden_size/d_head;
- num_query = 96;
- }
- else if (ctx->minicpmv_version == 3) {
- hidden_size = 3584;
- n_head = hidden_size/d_head;
- num_query = 64;
- }
- else if (ctx->minicpmv_version == 4) {
- hidden_size = 3584;
- n_head = hidden_size/d_head;
- num_query = 64;
- }
-
- struct ggml_tensor * Q = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_q_w, q), model.mm_model_attn_q_b);
- Q = ggml_scale_inplace(ctx0, Q, 1.0f / sqrt((float)d_head));
- struct ggml_tensor * K = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_k_w, k), model.mm_model_attn_k_b);
- struct ggml_tensor * V = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_v_w, v), model.mm_model_attn_v_b);
- // permute
- Q = ggml_reshape_4d(ctx0, Q, d_head, n_head, num_query, batch_size);
- Q = ggml_cont(ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3));
- Q = ggml_reshape_3d(ctx0, Q, d_head, num_query, n_head * batch_size);
- K = ggml_reshape_4d(ctx0, K, d_head, n_head, num_positions, batch_size);
- K = ggml_cont(ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3));
- K = ggml_reshape_3d(ctx0, K, d_head, num_positions, n_head * batch_size);
- V = ggml_reshape_4d(ctx0, V, d_head, n_head, num_positions, batch_size);
- V = ggml_cont(ctx0, ggml_permute(ctx0, V, 1, 2, 0, 3));
- V = ggml_reshape_3d(ctx0, V, num_positions, d_head, n_head * batch_size);
- struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
- KQ = ggml_soft_max_inplace(ctx0, KQ);
- struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ);
- KQV = ggml_reshape_4d(ctx0, KQV, d_head, num_query, n_head, batch_size);
- KQV = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
- KQV = ggml_cont_3d(ctx0, KQV, hidden_size, num_query, batch_size);
-
- embeddings = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_o_w, KQV), model.mm_model_attn_o_b);
- }
- { // layernorm
- embeddings = ggml_norm(ctx0, embeddings, eps);
- embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_model_ln_post_w), model.mm_model_ln_post_b);
- }
- embeddings = ggml_mul_mat(ctx0, model.mm_model_proj, embeddings);
- }
- else {
- GGML_ASSERT(false);
- }
- }
- // glm projector
- else if (ctx->has_glm_projector) {
- if (ctx->proj_type == PROJECTOR_TYPE_GLM_EDGE) {
- size_t gridsz = (size_t)sqrt(embeddings->ne[1]);
- embeddings = ggml_cont(ctx0, ggml_permute(ctx0,embeddings,1,0,2,3));
- embeddings = ggml_reshape_3d(ctx0, embeddings, gridsz, gridsz, embeddings->ne[1]);
- embeddings = ggml_conv_2d(ctx0, model.mm_model_adapter_conv_w, embeddings, 2, 2, 0, 0, 1, 1);
- embeddings = ggml_reshape_3d(ctx0, embeddings,embeddings->ne[0]*embeddings->ne[1] , embeddings->ne[2], batch_size);
- embeddings = ggml_cont(ctx0, ggml_permute(ctx0,embeddings, 1, 0, 2, 3));
- embeddings = ggml_add(ctx0, embeddings, model.mm_model_adapter_conv_b);
- // GLU
- {
- embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_0_w, embeddings);
- embeddings = ggml_norm(ctx0, embeddings, eps);
- embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_model_ln_q_w), model.mm_model_ln_q_b);
- embeddings = ggml_gelu_inplace(ctx0, embeddings);
- struct ggml_tensor * x = embeddings;
- embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_2_w, embeddings);
- x = ggml_mul_mat(ctx0, model.mm_model_mlp_1_w, x);
- embeddings = ggml_silu_inplace(ctx0, embeddings);
- embeddings = ggml_mul(ctx0, embeddings, x);
- embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_3_w, embeddings);
- }
- } else {
- GGML_ABORT("fatal error");
- }
- }
- else if (ctx->proj_type == PROJECTOR_TYPE_MERGER) {
- embeddings = ggml_reshape_3d(ctx0, embeddings, hidden_size * 4, num_positions / 4, batch_size);
-
- embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
- embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);
-
- // GELU activation
- embeddings = ggml_gelu(ctx0, embeddings);
-
- // Second linear layer
- embeddings = ggml_mul_mat(ctx0, model.mm_1_w, embeddings);
- embeddings = ggml_add(ctx0, embeddings, model.mm_1_b);
- }
-
- // build the graph
- ggml_build_forward_expand(gf, embeddings);
-
- ggml_free(ctx0);
-
- return gf;
- }
-
- static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch * imgs, struct clip_image_size * load_image_size, bool is_inf = false) {
- if (ctx->proj_type == PROJECTOR_TYPE_GEMMA3) {
- return clip_image_build_graph_siglip(ctx, imgs);
- } else {
- // TODO: we should have one build_* function per model
- return clip_image_build_graph_legacy(ctx, imgs, load_image_size, is_inf);
- }
- }
-
- // read and create ggml_context containing the tensors and their data
- struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
- return clip_init(fname, clip_context_params{
- /* use_gpu */ true,
- /* verbosity */ verbosity,
- });
- }
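A minimal usage sketch of the two loaders above (the GGUF path is a placeholder, and the aggregate-initializer fields follow the inline /* use_gpu */ and /* verbosity */ comments in this file):

    // default load: GPU enabled, verbosity 1
    struct clip_ctx * ctx_default = clip_model_load("mmproj-model-f16.gguf", 1);

    // explicit params: CPU-only load with more verbose hparams logging
    struct clip_ctx * ctx_cpu = clip_init("mmproj-model-f16.gguf", clip_context_params{
        /* use_gpu */ false,
        /* verbosity */ 2,
    });

    clip_free(ctx_default);
    clip_free(ctx_cpu);

clip_free is the matching destructor; it is the same cleanup call clip_init itself uses on its error paths below.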
-
- struct clip_ctx * clip_init(const char * fname, struct clip_context_params ctx_params) {
- int verbosity = ctx_params.verbosity;
- struct ggml_context * meta = NULL;
-
- struct gguf_init_params params = {
- /*.no_alloc = */ true,
- /*.ctx = */ &meta,
- };
-
- struct gguf_context * ctx = gguf_init_from_file(fname, params);
- if (!ctx) {
- throw std::runtime_error(format("%s: failed to load CLIP model from %s. Does this file exist?\n", __func__, fname));
- }
-
- if (verbosity >= 1) {
- const int n_tensors = gguf_get_n_tensors(ctx);
- const int n_kv = gguf_get_n_kv(ctx);
- const int ftype = get_u32(ctx, KEY_FTYPE);
- const std::string ftype_str = get_ftype(ftype);
- const int idx_desc = get_key_idx(ctx, KEY_DESCRIPTION);
- const std::string description = gguf_get_val_str(ctx, idx_desc);
- const int idx_name = gguf_find_key(ctx, KEY_NAME);
- if (idx_name != -1) { // make name optional temporarily as some of the uploaded models are missing it due to a bug
- const std::string name = gguf_get_val_str(ctx, idx_name);
- LOG_INF("%s: model name: %s\n", __func__, name.c_str());
- }
- LOG_INF("%s: description: %s\n", __func__, description.c_str());
- LOG_INF("%s: GGUF version: %d\n", __func__, gguf_get_version(ctx));
- LOG_INF("%s: alignment: %zu\n", __func__, gguf_get_alignment(ctx));
- LOG_INF("%s: n_tensors: %d\n", __func__, n_tensors);
- LOG_INF("%s: n_kv: %d\n", __func__, n_kv);
- LOG_INF("%s: ftype: %s\n", __func__, ftype_str.c_str());
- LOG_INF("\n");
- }
- const int n_tensors = gguf_get_n_tensors(ctx);
-
- // kv
- const int n_kv = gguf_get_n_kv(ctx);
- LOG_INF("%s: loaded meta data with %d key-value pairs and %d tensors from %s\n",
- __func__, n_kv, n_tensors, fname);
- {
- std::map<enum ggml_type, uint32_t> n_type;
-
- for (int i = 0; i < n_tensors; i++) {
- enum ggml_type type = gguf_get_tensor_type(ctx, i);
-
- n_type[type]++;
- }
-
- LOG_INF("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__);
- for (int i = 0; i < n_kv; i++) {
- const char * name = gguf_get_key(ctx, i);
- const enum gguf_type type = gguf_get_kv_type(ctx, i);
- const std::string type_name =
- type == GGUF_TYPE_ARRAY
- ? format("%s[%s,%d]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(ctx, i)), gguf_get_arr_n(ctx, i))
- : gguf_type_name(type);
-
- std::string value = gguf_kv_to_str(ctx, i);
- const size_t MAX_VALUE_LEN = 40;
- if (value.size() > MAX_VALUE_LEN) {
- value = format("%s...", value.substr(0, MAX_VALUE_LEN - 3).c_str());
- }
- replace_all(value, "\n", "\\n");
-
- LOG_INF("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), value.c_str());
- }
-
- // print type counts
- for (auto & kv : n_type) {
- if (kv.second == 0) {
- continue;
- }
-
- LOG_INF("%s: - type %4s: %4d tensors\n", __func__, ggml_type_name(kv.first), kv.second);
- }
- }
-
- // data
- size_t model_size = 0;
- {
- for (int i = 0; i < n_tensors; ++i) {
- const char * name = gguf_get_tensor_name(ctx, i);
- const size_t offset = gguf_get_tensor_offset(ctx, i);
- enum ggml_type type = gguf_get_tensor_type(ctx, i);
- struct ggml_tensor * cur = ggml_get_tensor(meta, name);
- size_t tensor_size = ggml_nbytes(cur);
- model_size += tensor_size;
- if (verbosity >= 3) {
- LOG_INF("%s: tensor[%d]: n_dims = %d, name = %s, tensor_size=%zu, offset=%zu, shape:[%" PRIu64 ", %" PRIu64 ", %" PRIu64 ", %" PRIu64 "], type = %s\n",
- __func__, i, ggml_n_dims(cur), cur->name, tensor_size, offset, cur->ne[0], cur->ne[1], cur->ne[2], cur->ne[3], ggml_type_name(type));
- }
- }
- }
-
- clip_ctx * new_clip = new clip_ctx(ctx_params);
-
- // update projector type
- {
- int idx = gguf_find_key(ctx, KEY_PROJ_TYPE);
- if (idx != -1) {
- const std::string proj_type = gguf_get_val_str(ctx, idx);
- new_clip->proj_type = clip_projector_type_from_string(proj_type);
- } else {
- new_clip->proj_type = PROJECTOR_TYPE_MLP;
- }
-
- if (new_clip->proj_type == PROJECTOR_TYPE_MLP) {
- if (gguf_find_tensor(ctx, format(TN_LLAVA_PROJ, 3, "weight").c_str()) != -1) {
- new_clip->proj_type = PROJECTOR_TYPE_MLP_NORM;
- }
- }
- }
-
- // model size and capabilities
- {
- int idx = get_key_idx(ctx, KEY_HAS_TEXT_ENC);
- new_clip->has_text_encoder = gguf_get_val_bool(ctx, idx);
-
- idx = get_key_idx(ctx, KEY_HAS_VIS_ENC);
- new_clip->has_vision_encoder = gguf_get_val_bool(ctx, idx);
-
- idx = gguf_find_key(ctx, KEY_HAS_LLAVA_PROJ);
- if (idx != -1) {
- new_clip->has_llava_projector = gguf_get_val_bool(ctx, idx);
- }
-
- idx = gguf_find_key(ctx, KEY_HAS_MINICPMV_PROJ);
- if (idx != -1) {
- new_clip->has_minicpmv_projector = gguf_get_val_bool(ctx, idx);
- }
-
- idx = gguf_find_key(ctx, KEY_MINICPMV_VERSION);
- if (idx != -1) {
- new_clip->minicpmv_version = gguf_get_val_i32(ctx, idx);
- }
-
- idx = gguf_find_key(ctx, KEY_HAS_GLM_PROJ);
- if (idx != -1) {
- new_clip->has_glm_projector = gguf_get_val_bool(ctx, idx);
- }
-
- idx = gguf_find_key(ctx, KEY_HAS_QWEN2VL_MERGER);
- if (idx != -1) {
- new_clip->has_qwen2vl_merger = gguf_get_val_bool(ctx, idx);
- }
- // GGML_ASSERT(new_clip->has_llava_projector); // see monatis/clip.cpp for image and/or text encoding for semantic search
-
- GGML_ASSERT(new_clip->has_vision_encoder);
- GGML_ASSERT(!new_clip->has_text_encoder);
-
- try {
- idx = get_key_idx(ctx, KEY_USE_GELU);
- new_clip->use_gelu = gguf_get_val_bool(ctx, idx);
- } catch (std::runtime_error & /*e*/) {
- new_clip->use_gelu = false;
- }
-
- try {
- idx = get_key_idx(ctx, KEY_USE_SILU);
- new_clip->use_silu = gguf_get_val_bool(ctx, idx);
- } catch (std::runtime_error & /*e*/) {
- new_clip->use_silu = false;
- }
-
- if (verbosity >= 1) {
- LOG_INF("%s: text_encoder: %d\n", __func__, new_clip->has_text_encoder);
- LOG_INF("%s: vision_encoder: %d\n", __func__, new_clip->has_vision_encoder);
- LOG_INF("%s: llava_projector: %d\n", __func__, new_clip->has_llava_projector);
- LOG_INF("%s: minicpmv_projector: %d\n", __func__, new_clip->has_minicpmv_projector);
- LOG_INF("%s: minicpmv_version: %d\n", __func__, new_clip->minicpmv_version);
- LOG_INF("%s: glm_projector: %d\n", __func__, new_clip->has_glm_projector);
- LOG_INF("%s: model size: %.2f MB\n", __func__, model_size / 1024.0 / 1024.0);
- LOG_INF("%s: metadata size: %.2f MB\n", __func__, ggml_get_mem_size(meta) / 1024.0 / 1024.0);
- }
- }
-
- LOG_INF("%s: params backend buffer size = % 6.2f MB (%i tensors)\n", __func__, model_size / (1024.0 * 1024.0), n_tensors);
-
- // load tensors
- {
- std::vector<uint8_t> read_buf;
- struct ggml_init_params params = {
- /*.mem_size =*/ (n_tensors + 1) * ggml_tensor_overhead(),
- /*.mem_buffer =*/ NULL,
- /*.no_alloc =*/ true,
- };
-
- new_clip->ctx_data = ggml_init(params);
- if (!new_clip->ctx_data) {
- LOG_ERR("%s: ggml_init() failed\n", __func__);
- clip_free(new_clip);
- gguf_free(ctx);
- return nullptr;
- }
-
- auto fin = std::ifstream(fname, std::ios::binary);
- if (!fin) {
- LOG_ERR("cannot open model file for loading tensors\n");
- clip_free(new_clip);
- gguf_free(ctx);
- return nullptr;
- }
-
- // add tensors to context
- for (int i = 0; i < n_tensors; ++i) {
- const char * name = gguf_get_tensor_name(ctx, i);
- struct ggml_tensor * t = ggml_get_tensor(meta, name);
- struct ggml_tensor * cur = ggml_dup_tensor(new_clip->ctx_data, t);
- ggml_set_name(cur, name);
- }
-
- // alloc memory and offload data
- ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(new_clip->backend);
- new_clip->buf = ggml_backend_alloc_ctx_tensors_from_buft(new_clip->ctx_data, buft);
- ggml_backend_buffer_set_usage(new_clip->buf, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
- for (int i = 0; i < n_tensors; ++i) {
- const char * name = gguf_get_tensor_name(ctx, i);
- struct ggml_tensor * cur = ggml_get_tensor(new_clip->ctx_data, name);
- const size_t offset = gguf_get_data_offset(ctx) + gguf_get_tensor_offset(ctx, i);
- fin.seekg(offset, std::ios::beg);
- if (!fin) {
- LOG_ERR("%s: failed to seek for tensor %s\n", __func__, name);
- clip_free(new_clip);
- gguf_free(ctx);
- return nullptr;
- }
- int num_bytes = ggml_nbytes(cur);
- if (ggml_backend_buft_is_host(buft)) {
- // for the CPU and Metal backend, we can read directly into the tensor
- fin.read(reinterpret_cast<char *>(cur->data), num_bytes);
- } else {
- // read into a temporary buffer first, then copy to device memory
- read_buf.resize(num_bytes);
- fin.read(reinterpret_cast<char *>(read_buf.data()), num_bytes);
- ggml_backend_tensor_set(cur, read_buf.data(), 0, num_bytes);
- }
- }
- fin.close();
- }
-
- // vision model
- if (new_clip->has_vision_encoder) {
- // load vision model
- auto & vision_model = new_clip->vision_model;
- auto & hparams = vision_model.hparams;
- hparams.hidden_size = get_u32(ctx, format(KEY_N_EMBD, "vision"));
- hparams.n_head = get_u32(ctx, format(KEY_N_HEAD, "vision"));
- hparams.n_intermediate = get_u32(ctx, format(KEY_N_FF, "vision"));
- hparams.n_layer = get_u32(ctx, format(KEY_N_BLOCK, "vision"));
- hparams.image_size = get_u32(ctx, KEY_IMAGE_SIZE);
- hparams.patch_size = get_u32(ctx, KEY_PATCH_SIZE);
- hparams.projection_dim = get_u32(ctx, format(KEY_PROJ_DIM, "vision"));
- hparams.eps = get_f32(ctx, format(KEY_LAYER_NORM_EPS, "vision"));
-
- try {
- int idx = get_key_idx(ctx, KEY_IMAGE_GRID_PINPOINTS);
- int n = gguf_get_arr_n(ctx, idx);
- const int32_t * pinpoints = (const int32_t *)gguf_get_arr_data(ctx, idx);
- for (int i = 0; i < n; ++i) {
- hparams.image_grid_pinpoints.push_back(pinpoints[i]);
- }
- } catch (std::runtime_error & /*e*/) { }
-
- // Load the vision feature layer indices if they are explicitly provided;
- // if multiple vision feature layers are present, the values will be concatenated
- // to form the final visual features.
- // NOTE: gguf conversions should standardize the values of the vision feature layer to
- // be non-negative, since we use -1 to mark values as unset here.
- try {
- int idx = get_key_idx(ctx, KEY_FEATURE_LAYER);
- int n = gguf_get_arr_n(ctx, idx);
-
- const int32_t * vision_feature_layer = (const int32_t *)gguf_get_arr_data(ctx, idx);
-
- for (int i = 0; i < n; ++i) {
- hparams.vision_feature_layer.insert(vision_feature_layer[i]);
- }
- } catch (std::runtime_error & /*e*/) { }
-
- try {
- int idx = get_key_idx(ctx, KEY_MM_PATCH_MERGE_TYPE);
- strcpy(hparams.mm_patch_merge_type, gguf_get_val_str(ctx, idx));
- } catch (std::runtime_error & /*e*/) {
- strcpy(hparams.mm_patch_merge_type, "flat");
- }
-
- try {
- hparams.image_crop_resolution = get_u32(ctx, KEY_IMAGE_CROP_RESOLUTION); // llava-1.6
- } catch(const std::exception& /*e*/) {
- hparams.image_crop_resolution = hparams.image_size;
- }
-
- int idx_mean = get_key_idx(ctx, KEY_IMAGE_MEAN);
- int idx_std = get_key_idx(ctx, KEY_IMAGE_STD);
-
- const float * mean_data = (const float *)gguf_get_arr_data(ctx, idx_mean);
- const float * std_data = (const float *)gguf_get_arr_data(ctx, idx_std);
-
- for (int i = 0; i < 3; ++i) {
- new_clip->image_mean[i] = mean_data[i];
- new_clip->image_std[i] = std_data[i];
- }
-
- // Calculate the deepest feature layer based on hparams and projector type
- new_clip->max_feature_layer = get_deepest_feature_layer(new_clip);
-
- if (verbosity >= 2) {
- LOG_INF("\n%s: vision model hparams\n", __func__);
- LOG_INF("image_size %d\n", hparams.image_size);
- LOG_INF("patch_size %d\n", hparams.patch_size);
- LOG_INF("v_hidden_size %d\n", hparams.hidden_size);
- LOG_INF("v_n_intermediate %d\n", hparams.n_intermediate);
- LOG_INF("v_projection_dim %d\n", hparams.projection_dim);
- LOG_INF("v_n_head %d\n", hparams.n_head);
- LOG_INF("v_n_layer %d\n", hparams.n_layer);
- LOG_INF("v_eps %f\n", hparams.eps);
- LOG_INF("v_image_mean %f %f %f\n", new_clip->image_mean[0], new_clip->image_mean[1], new_clip->image_mean[2]);
- LOG_INF("v_image_std %f %f %f\n", new_clip->image_std[0], new_clip->image_std[1], new_clip->image_std[2]);
- LOG_INF("v_image_grid_pinpoints: ");
- for (const auto & pp : hparams.image_grid_pinpoints) {
- LOG_INF("%d ", pp);
- }
- LOG_INF("\n");
- LOG_INF("v_vision_feature_layer: ");
- for (const auto & feature_layer: hparams.vision_feature_layer) {
- LOG_INF("%d ", feature_layer);
- }
- LOG_INF("\n");
- LOG_INF("v_mm_patch_merge_type: %s\n", hparams.mm_patch_merge_type);
-
- }
-
- try {
- vision_model.class_embedding = get_tensor(new_clip->ctx_data, TN_CLASS_EMBD);
- new_clip->has_class_embedding = true;
- } catch (const std::exception& /*e*/) {
- new_clip->has_class_embedding = false;
- }
-
- try {
- vision_model.pre_ln_w = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "weight"));
- vision_model.pre_ln_b = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "bias"));
- new_clip->has_pre_norm = true;
- } catch (std::exception & /*e*/) {
- new_clip->has_pre_norm = false;
- }
-
- try {
- vision_model.post_ln_w = get_tensor(new_clip->ctx_data, format(TN_LN_POST, "v", "weight"));
- vision_model.post_ln_b = get_tensor(new_clip->ctx_data, format(TN_LN_POST, "v", "bias"));
- new_clip->has_post_norm = true;
- } catch (std::exception & /*e*/) {
- new_clip->has_post_norm = false;
- }
-
- try {
- vision_model.patch_bias = get_tensor(new_clip->ctx_data, TN_PATCH_BIAS);
- new_clip->has_patch_bias = true;
- } catch (std::exception & /*e*/) {
- new_clip->has_patch_bias = false;
- }
-
- try {
- vision_model.patch_embeddings_0 = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD);
- } catch(const std::exception& /*e*/) {
- vision_model.patch_embeddings_0 = nullptr;
- }
-
- try {
- vision_model.position_embeddings = get_tensor(new_clip->ctx_data, format(TN_POS_EMBD, "v"));
- } catch(const std::exception& /*e*/) {
- vision_model.position_embeddings = nullptr;
- }
-
- try {
- vision_model.patch_embeddings_1 = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD_1);
- } catch(const std::exception& /*e*/) {
- new_clip->has_qwen2vl_merger = false;
- }
-
- // LLaVA projection
- if (new_clip->proj_type == PROJECTOR_TYPE_MLP || new_clip->proj_type == PROJECTOR_TYPE_MLP_NORM) {
- vision_model.mm_0_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "weight"));
- vision_model.mm_0_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "bias"));
- try {
- // Yi-type llava
- vision_model.mm_1_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 1, "weight"));
- vision_model.mm_1_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 1, "bias"));
- } catch (std::runtime_error & /*e*/) { }
- try {
- // missing in Yi-type llava
- vision_model.mm_2_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "weight"));
- vision_model.mm_2_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "bias"));
- } catch (std::runtime_error & /*e*/) { }
- try {
- // Yi-type llava
- vision_model.mm_3_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 3, "weight"));
- vision_model.mm_3_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 3, "bias"));
- } catch (std::runtime_error & /*e*/) { }
- try {
- // Yi-type llava
- vision_model.mm_4_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 4, "weight"));
- vision_model.mm_4_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 4, "bias"));
- } catch (std::runtime_error & /*e*/) { }
- try {
- vision_model.image_newline = get_tensor(new_clip->ctx_data, TN_IMAGE_NEWLINE);
- // LOG_INF("%s: image_newline tensor (llava-1.6) found\n", __func__);
- } catch (std::runtime_error & /*e*/) { }
- } else if (new_clip->proj_type == PROJECTOR_TYPE_LDP) {
- // MobileVLM projection
- vision_model.mm_model_mlp_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 1, "weight"));
- vision_model.mm_model_mlp_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 1, "bias"));
- vision_model.mm_model_mlp_3_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 3, "weight"));
- vision_model.mm_model_mlp_3_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 3, "bias"));
- vision_model.mm_model_block_1_block_0_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 0, "0.weight"));
- vision_model.mm_model_block_1_block_0_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 0, "1.weight"));
- vision_model.mm_model_block_1_block_0_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 0, "1.bias"));
- vision_model.mm_model_block_1_block_1_fc1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc1.weight"));
- vision_model.mm_model_block_1_block_1_fc1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc1.bias"));
- vision_model.mm_model_block_1_block_1_fc2_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc2.weight"));
- vision_model.mm_model_block_1_block_1_fc2_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc2.bias"));
- vision_model.mm_model_block_1_block_2_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 2, "0.weight"));
- vision_model.mm_model_block_1_block_2_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 2, "1.weight"));
- vision_model.mm_model_block_1_block_2_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 2, "1.bias"));
- vision_model.mm_model_block_2_block_0_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 0, "0.weight"));
- vision_model.mm_model_block_2_block_0_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 0, "1.weight"));
- vision_model.mm_model_block_2_block_0_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 0, "1.bias"));
- vision_model.mm_model_block_2_block_1_fc1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc1.weight"));
- vision_model.mm_model_block_2_block_1_fc1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc1.bias"));
- vision_model.mm_model_block_2_block_1_fc2_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc2.weight"));
- vision_model.mm_model_block_2_block_1_fc2_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc2.bias"));
- vision_model.mm_model_block_2_block_2_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 2, "0.weight"));
- vision_model.mm_model_block_2_block_2_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 2, "1.weight"));
- vision_model.mm_model_block_2_block_2_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 2, "1.bias"));
- }
- else if (new_clip->proj_type == PROJECTOR_TYPE_LDPV2)
- {
- // MobileVLM_V2 projection
- vision_model.mm_model_mlp_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 0, "weight"));
- vision_model.mm_model_mlp_0_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 0, "bias"));
- vision_model.mm_model_mlp_2_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 2, "weight"));
- vision_model.mm_model_mlp_2_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 2, "bias"));
- vision_model.mm_model_peg_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_PEG, 0, "weight"));
- vision_model.mm_model_peg_0_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_PEG, 0, "bias"));
- }
- else if (new_clip->proj_type == PROJECTOR_TYPE_RESAMPLER) {
- // vision_model.mm_model_pos_embed = get_tensor(new_clip->ctx_data, TN_MINICPMV_POS_EMBD);
- vision_model.mm_model_pos_embed_k = get_tensor(new_clip->ctx_data, TN_MINICPMV_POS_EMBD_K);
- vision_model.mm_model_query = get_tensor(new_clip->ctx_data, TN_MINICPMV_QUERY);
- vision_model.mm_model_proj = get_tensor(new_clip->ctx_data, TN_MINICPMV_PROJ);
- vision_model.mm_model_kv_proj = get_tensor(new_clip->ctx_data, TN_MINICPMV_KV_PROJ);
- vision_model.mm_model_attn_q_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "q", "weight"));
- vision_model.mm_model_attn_k_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "k", "weight"));
- vision_model.mm_model_attn_v_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "v", "weight"));
- vision_model.mm_model_attn_q_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "q", "bias"));
- vision_model.mm_model_attn_k_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "k", "bias"));
- vision_model.mm_model_attn_v_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "v", "bias"));
- vision_model.mm_model_attn_o_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "out", "weight"));
- vision_model.mm_model_attn_o_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "out", "bias"));
- vision_model.mm_model_ln_q_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "q", "weight"));
- vision_model.mm_model_ln_q_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "q", "bias"));
- vision_model.mm_model_ln_kv_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "kv", "weight"));
- vision_model.mm_model_ln_kv_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "kv", "bias"));
- vision_model.mm_model_ln_post_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "post", "weight"));
- vision_model.mm_model_ln_post_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "post", "bias"));
- }
- else if (new_clip->proj_type == PROJECTOR_TYPE_GLM_EDGE) {
- vision_model.mm_model_adapter_conv_w = get_tensor(new_clip->ctx_data, format(TN_GLM_ADAPER_CONV, "weight"));
- vision_model.mm_model_adapter_conv_b = get_tensor(new_clip->ctx_data, format(TN_GLM_ADAPER_CONV, "bias"));
- vision_model.mm_model_mlp_0_w = get_tensor(new_clip->ctx_data, format(TN_GLM_ADAPTER_LINEAR,"weight"));
- vision_model.mm_model_ln_q_w = get_tensor(new_clip->ctx_data, format(TN_GLM_ADAPTER_NORM_1,"weight"));
- vision_model.mm_model_ln_q_b = get_tensor(new_clip->ctx_data, format(TN_GLM_ADAPTER_NORM_1,"bias"));
- vision_model.mm_model_mlp_1_w = get_tensor(new_clip->ctx_data, format(TN_GLM_ADAPTER_D_H_2_4H,"weight"));
- vision_model.mm_model_mlp_2_w = get_tensor(new_clip->ctx_data, format(TN_GLM_ADAPTER_GATE,"weight"));
- vision_model.mm_model_mlp_3_w = get_tensor(new_clip->ctx_data, format(TN_GLM_ADAPTER_D_4H_2_H,"weight"));
- vision_model.boi_w = get_tensor(new_clip->ctx_data, TN_GLM_BOI_W);
- vision_model.eoi_w = get_tensor(new_clip->ctx_data, TN_GLM_EOI_W);
- }
- else if (new_clip->proj_type == PROJECTOR_TYPE_MERGER) {
- vision_model.mm_0_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "weight"));
- vision_model.mm_0_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "bias"));
- vision_model.mm_1_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "weight"));
- vision_model.mm_1_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "bias"));
- }
- else if (new_clip->proj_type == PROJECTOR_TYPE_GEMMA3) {
- vision_model.mm_input_proj_w = get_tensor(new_clip->ctx_data, TN_MM_INP_PROJ);
- vision_model.mm_soft_emb_norm_w = get_tensor(new_clip->ctx_data, TN_MM_SOFT_EMB_N);
- }
- else {
- std::string proj_type = PROJECTOR_TYPE_NAMES[new_clip->proj_type];
- throw std::runtime_error(format("%s: unsupported projector type: %s\n", __func__, proj_type.c_str()));
- }
-
- vision_model.layers.resize(hparams.n_layer);
-
- for (int il = 0; il < hparams.n_layer; ++il) {
- auto & layer = vision_model.layers[il];
- layer.k_w = get_tensor(new_clip->ctx_data, format(TN_ATTN_K, "v", il, "weight"));
- layer.q_w = get_tensor(new_clip->ctx_data, format(TN_ATTN_Q, "v", il, "weight"));
- layer.v_w = get_tensor(new_clip->ctx_data, format(TN_ATTN_V, "v", il, "weight"));
- layer.o_w = get_tensor(new_clip->ctx_data, format(TN_ATTN_OUTPUT, "v", il, "weight"));
- layer.ln_1_w = get_tensor(new_clip->ctx_data, format(TN_LN_1, "v", il, "weight"));
- layer.ln_2_w = get_tensor(new_clip->ctx_data, format(TN_LN_2, "v", il, "weight"));
- layer.ff_i_w = get_tensor(new_clip->ctx_data, format(TN_FFN_DOWN, "v", il, "weight"));
- layer.ff_o_w = get_tensor(new_clip->ctx_data, format(TN_FFN_UP, "v", il, "weight"));
- layer.k_b = get_tensor(new_clip->ctx_data, format(TN_ATTN_K, "v", il, "bias"));
- layer.q_b = get_tensor(new_clip->ctx_data, format(TN_ATTN_Q, "v", il, "bias"));
- layer.v_b = get_tensor(new_clip->ctx_data, format(TN_ATTN_V, "v", il, "bias"));
- layer.o_b = get_tensor(new_clip->ctx_data, format(TN_ATTN_OUTPUT, "v", il, "bias"));
- layer.ln_1_b = get_tensor(new_clip->ctx_data, format(TN_LN_1, "v", il, "bias"));
- layer.ln_2_b = get_tensor(new_clip->ctx_data, format(TN_LN_2, "v", il, "bias"));
- layer.ff_i_b = get_tensor(new_clip->ctx_data, format(TN_FFN_DOWN, "v", il, "bias"));
- layer.ff_o_b = get_tensor(new_clip->ctx_data, format(TN_FFN_UP, "v", il, "bias"));
- }
- }
-
- ggml_free(meta);
-
- new_clip->ctx_gguf = ctx;
-
- // measure mem requirement and allocate
- {
- new_clip->buf_compute_meta.resize(GGML_DEFAULT_GRAPH_SIZE * ggml_tensor_overhead() + ggml_graph_overhead());
- clip_image_f32_batch batch;
- batch.size = 1;
- batch.data = nullptr;
- ggml_cgraph * gf = clip_image_build_graph(new_clip, &batch, nullptr, false);
- ggml_backend_sched_reserve(new_clip->sched.get(), gf);
- for (size_t i = 0; i < new_clip->backend_ptrs.size(); ++i) {
- ggml_backend_t backend = new_clip->backend_ptrs[i];
- ggml_backend_buffer_type_t buft = new_clip->backend_buft[i];
- size_t size = ggml_backend_sched_get_buffer_size(new_clip->sched.get(), backend);
- if (size > 1) {
- LOG_INF("%s: %10s compute buffer size = %8.2f MiB\n", __func__,
- ggml_backend_buft_name(buft),
- size / 1024.0 / 1024.0);
- }
- }
- }
-
- return new_clip;
- }
-
- void clip_add_load_image_size(struct clip_ctx * ctx_clip, struct clip_image_size * load_image_size) {
- ctx_clip->load_image_size = load_image_size;
- }
-
- struct clip_image_size * clip_get_load_image_size(struct clip_ctx * ctx_clip) {
- return ctx_clip->load_image_size;
- }
-
- struct clip_image_size * clip_image_size_init() {
- struct clip_image_size * load_image_size = new struct clip_image_size();
- load_image_size->width = 448;
- load_image_size->height = 448;
- return load_image_size;
- }
-
- struct clip_image_u8 * clip_image_u8_init() {
- return new clip_image_u8();
- }
-
- struct clip_image_f32 * clip_image_f32_init() {
- return new clip_image_f32();
- }
-
- void clip_image_u8_free(struct clip_image_u8 * img) { delete img; }
- void clip_image_f32_free(struct clip_image_f32 * img) { delete img; }
- void clip_image_u8_batch_free(struct clip_image_u8_batch * batch) {
- if (batch->size > 0) {
- delete[] batch->data;
- batch->size = 0;
- }
- }
- void clip_image_f32_batch_free(struct clip_image_f32_batch * batch) {
- if (batch->size > 0) {
- delete[] batch->data;
- batch->size = 0;
- }
- }
-
- void clip_build_img_from_pixels(const unsigned char * rgb_pixels, int nx, int ny, clip_image_u8 * img) {
- img->nx = nx;
- img->ny = ny;
- img->buf.resize(3 * nx * ny);
- memcpy(img->buf.data(), rgb_pixels, img->buf.size());
- }
-
- bool clip_image_load_from_file(const char * fname, clip_image_u8 * img) {
- int nx, ny, nc;
- auto * data = stbi_load(fname, &nx, &ny, &nc, 3);
- if (!data) {
- LOG_ERR("%s: failed to load image '%s'\n", __func__, fname);
- return false;
- }
- clip_build_img_from_pixels(data, nx, ny, img);
- stbi_image_free(data);
- return true;
- }
-
- bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img) {
- int nx, ny, nc;
- auto * data = stbi_load_from_memory(bytes, bytes_length, &nx, &ny, &nc, 3);
- if (!data) {
- LOG_ERR("%s: failed to decode image bytes\n", __func__);
- return false;
- }
- clip_build_img_from_pixels(data, nx, ny, img);
- stbi_image_free(data);
- return true;
- }
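A minimal sketch of driving the loaders above (load_rgb is a hypothetical helper name; both loaders force 3-channel RGB via the stbi request parameter):

    static bool load_rgb(const char * path) {
        clip_image_u8 * img = clip_image_u8_init();
        const bool ok = clip_image_load_from_file(path, img);
        if (ok) {
            // img->nx / img->ny hold the decoded size, img->buf the packed RGB bytes
        }
        clip_image_u8_free(img);
        return ok;
    }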
-
- // Linear interpolation between two points
- inline float clip_lerp(float s, float e, float t) {
- return s + (e - s) * t;
- }
- // Bilinear resize function
- static void bilinear_resize(const clip_image_u8& src, clip_image_u8& dst, int target_width, int target_height) {
- dst.nx = target_width;
- dst.ny = target_height;
- dst.buf.resize(3 * target_width * target_height);
-
- float x_ratio = static_cast<float>(src.nx - 1) / target_width;
- float y_ratio = static_cast<float>(src.ny - 1) / target_height;
-
- for (int y = 0; y < target_height; y++) {
- for (int x = 0; x < target_width; x++) {
- float px = x_ratio * x;
- float py = y_ratio * y;
- int x_floor = static_cast<int>(px);
- int y_floor = static_cast<int>(py);
- float x_lerp = px - x_floor;
- float y_lerp = py - y_floor;
-
- for (int c = 0; c < 3; c++) {
- float top = clip_lerp(
- static_cast<float>(src.buf[3 * (y_floor * src.nx + x_floor) + c]),
- static_cast<float>(src.buf[3 * (y_floor * src.nx + (x_floor + 1)) + c]),
- x_lerp
- );
- float bottom = clip_lerp(
- static_cast<float>(src.buf[3 * ((y_floor + 1) * src.nx + x_floor) + c]),
- static_cast<float>(src.buf[3 * ((y_floor + 1) * src.nx + (x_floor + 1)) + c]),
- x_lerp
- );
- dst.buf[3 * (y * target_width + x) + c] = static_cast<uint8_t>(clip_lerp(top, bottom, y_lerp));
- }
- }
- }
- }
-
- // Normalize image to float32 - careful with pytorch .to(model.device, dtype=torch.float16) - this sometimes reduces precision (32>16>32), sometimes not
- static void normalize_image_u8_to_f32(const clip_image_u8* src, clip_image_f32* dst, const float mean[3], const float std[3]) {
- dst->nx = src->nx;
- dst->ny = src->ny;
- dst->buf.resize(src->buf.size());
-
- for (size_t i = 0; i < src->buf.size(); ++i) {
- int c = i % 3; // rgb
- dst->buf[i] = (static_cast<float>(src->buf[i]) / 255.0f - mean[c]) / std[c];
- }
- }
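The per-channel normalization above is the usual $(x/255 - \mathrm{mean}_c)/\mathrm{std}_c$. As a worked example with hypothetical OpenAI-CLIP-style statistics (the real values come from the KEY_IMAGE_MEAN / KEY_IMAGE_STD arrays loaded in clip_init above): a red-channel byte of 128 with mean 0.481 and std 0.269 maps to $(128/255 - 0.481)/0.269 \approx 0.078$.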
-
- inline int clip(int x, int lower, int upper) {
- return std::max(lower, std::min(x, upper));
- }
-
- static bool bicubic_resize(const clip_image_u8 &img, clip_image_u8 &dst, int target_width, int target_height) {
- const int nx = img.nx;
- const int ny = img.ny;
-
- dst.nx = target_width;
- dst.ny = target_height;
- dst.buf.resize(3 * target_width * target_height);
-
- float Cc;
- float C[5];
- float d0, d2, d3, a0, a1, a2, a3;
- int i, j, k, jj;
- int x, y;
- float dx, dy;
- float tx, ty;
-
- tx = (float)nx / (float)target_width;
- ty = (float)ny / (float)target_height;
-
- // Bicubic interpolation; adapted from ViT.cpp, inspired by:
- // -> https://github.com/yglukhov/bicubic-interpolation-image-processing/blob/master/libimage.c#L36
- // -> https://en.wikipedia.org/wiki/Bicubic_interpolation
-
- for (i = 0; i < target_height; i++) {
- for (j = 0; j < target_width; j++) {
- x = (int)(tx * j);
- y = (int)(ty * i);
-
- dx = tx * j - x;
- dy = ty * i - y;
-
- for (k = 0; k < 3; k++) {
- for (jj = 0; jj <= 3; jj++) {
- d0 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x - 1, 0, nx - 1)) * 3 + k] - img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k];
- d2 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x + 1, 0, nx - 1)) * 3 + k] - img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k];
- d3 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x + 2, 0, nx - 1)) * 3 + k] - img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k];
- a0 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k];
-
- a1 = -1.0 / 3 * d0 + d2 - 1.0 / 6 * d3;
- a2 = 1.0 / 2 * d0 + 1.0 / 2 * d2;
- a3 = -1.0 / 6 * d0 - 1.0 / 2 * d2 + 1.0 / 6 * d3;
-
- C[jj] = a0 + a1 * dx + a2 * dx * dx + a3 * dx * dx * dx;
-
- d0 = C[0] - C[1];
- d2 = C[2] - C[1];
- d3 = C[3] - C[1];
- a0 = C[1];
- a1 = -1.0 / 3 * d0 + d2 - 1.0 / 6 * d3;
- a2 = 1.0 / 2 * d0 + 1.0 / 2 * d2;
- a3 = -1.0 / 6 * d0 - 1.0 / 2 * d2 + 1.0 / 6 * d3;
- Cc = a0 + a1 * dy + a2 * dy * dy + a3 * dy * dy * dy;
-
- const uint8_t Cc2 = std::min(std::max(std::round(Cc), 0.0f), 255.0f);
- dst.buf[(i * target_width + j) * 3 + k] = float(Cc2);
- }
- }
- }
- }
-
- return true;
- }
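The coefficient block above is a 1-D cubic-convolution kernel applied twice, first along x (filling C[jj]) and then along y. In terms of the four neighbours $p_{-1}, p_0, p_1, p_2$, with $d_0 = p_{-1} - p_0$, $d_2 = p_1 - p_0$ and $d_3 = p_2 - p_0$, each 1-D pass evaluates, for the fractional offset $t \in [0, 1)$:
$$p(t) = p_0 + \left(-\tfrac{1}{3}d_0 + d_2 - \tfrac{1}{6}d_3\right)t + \tfrac{1}{2}\left(d_0 + d_2\right)t^2 + \left(-\tfrac{1}{6}d_0 - \tfrac{1}{2}d_2 + \tfrac{1}{6}d_3\right)t^3$$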
-
- // llava-1.6 type of resize_and_pad (black)
- static void resize_and_pad_image(const clip_image_u8& image, clip_image_u8 &image_output, const std::pair<int, int>& target_resolution) {
- int target_width = target_resolution.first;
- int target_height = target_resolution.second;
-
- float scale_w = static_cast<float>(target_width) / image.nx;
- float scale_h = static_cast<float>(target_height) / image.ny;
-
- int new_width, new_height;
-
- if (scale_w < scale_h) {
- new_width = target_width;
- new_height = std::min(static_cast<int>(std::ceil(image.ny * scale_w)), target_height);
- } else {
- new_height = target_height;
- new_width = std::min(static_cast<int>(std::ceil(image.nx * scale_h)), target_width);
- }
-
- clip_image_u8 resized_image;
- // bilinear_resize(image, resized_image, new_width, new_height);
- bicubic_resize(image, resized_image, new_width, new_height);
-
- clip_image_u8 padded_image;
- padded_image.nx = target_width;
- padded_image.ny = target_height;
- padded_image.buf.resize(3 * target_width * target_height, 0); // Initialize with black
-
- // Calculate padding offsets
- int pad_x = (target_width - new_width) / 2;
- int pad_y = (target_height - new_height) / 2;
-
- // Copy the resized image into the center of the padded buffer
- for (int y = 0; y < new_height; ++y) {
- for (int x = 0; x < new_width; ++x) {
- for (int c = 0; c < 3; ++c) {
- padded_image.buf[3 * ((y + pad_y) * target_width + (x + pad_x)) + c] = resized_image.buf[3 * (y * new_width + x) + c];
- }
- }
- }
- image_output = std::move(padded_image);
- }
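As a worked example with hypothetical sizes: an 800x600 input and a 672x672 target give scale_w = 0.84 and scale_h = 1.12; scale_w is smaller, so the image is resized to 672x504 and centred with pad_x = 0 and pad_y = (672 - 504) / 2 = 84 rows of black padding above and below.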
-
- /**
- * Selects the best resolution from a list of possible resolutions based on the original size.
- *
- * @param original_size The original size of the image in the format (width, height).
- * @param possible_resolutions A list of possible resolutions in the format [(width1, height1), (width2, height2), ...].
- * @return The best fit resolution in the format (width, height).
- */
- static std::pair<int, int> select_best_resolution(const std::pair<int, int> & original_size, const std::vector<std::pair<int, int>> & possible_resolutions) {
- int original_width = original_size.first;
- int original_height = original_size.second;
- std::pair<int, int> best_fit;
- int max_effective_resolution = 0;
- int min_wasted_resolution = std::numeric_limits<int>::max();
-
- for (const auto& resolution : possible_resolutions) {
- int width = resolution.first;
- int height = resolution.second;
- float scale = std::min(static_cast<float>(width) / original_width, static_cast<float>(height) / original_height);
- int downscaled_width = static_cast<int>(original_width * scale);
- int downscaled_height = static_cast<int>(original_height * scale);
- int effective_resolution = std::min(downscaled_width * downscaled_height, original_width * original_height);
- int wasted_resolution = (width * height) - effective_resolution;
- // LOG_INF("resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution);
- if (effective_resolution > max_effective_resolution || (effective_resolution == max_effective_resolution && wasted_resolution < min_wasted_resolution)) {
- max_effective_resolution = effective_resolution;
- min_wasted_resolution = wasted_resolution;
- best_fit = resolution;
- }
- }
-
- return best_fit;
- }
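A worked example with hypothetical inputs: for an 800x600 image and candidates {336x672, 672x672}, the 672x672 grid scales by min(672/800, 672/600) = 0.84 to 672x504, giving effective = min(672*504, 800*600) = 338688 and wasted = 672*672 - 338688 = 112896, while 336x672 scales by 0.42 to 336x252 for effective = 84672 only; the higher effective resolution makes 672x672 the best fit.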
-
- static std::vector<clip_image_u8*> divide_to_patches_u8(const clip_image_u8 & image, int patch_size) {
- std::vector<clip_image_u8*> patches;
- int width = image.nx;
- int height = image.ny;
- for (int i = 0; i < height; i += patch_size) {
- for (int j = 0; j < width; j += patch_size) {
- clip_image_u8 *patch = clip_image_u8_init();
- patch->nx = std::min(patch_size, width - j);
- patch->ny = std::min(patch_size, height - i);
- patch->buf.resize(3 * patch->nx * patch->ny);
- for (int y = 0; y < patch->ny; ++y) {
- for (int x = 0; x < patch->nx; ++x) {
- for (int c = 0; c < 3; ++c) {
- patch->buf[3 * (y * patch->nx + x) + c] = image.buf[3 * ((i + y) * width + (j + x)) + c];
- }
- }
- }
- patches.push_back(patch);
- }
- }
- return patches;
- }
2208
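Note that the grid walk clamps the last column and row rather than padding: an illustrative 500×400 image with patch_size 336 yields four patches of 336×336, 164×336, 336×64, and 164×64. Every returned clip_image_u8 pointer is owned by the caller, who must release it with clip_image_u8_free.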
-
2209
- static int ensure_divide(int length, int patch_size) {
2210
- return std::max(static_cast<int>(std::round(static_cast<float>(length) / patch_size) * patch_size), patch_size);
2211
- }
2212
-
2213
- static std::pair<int, int> uhd_find_best_resize(std::pair<int, int> original_size, int scale_resolution, int patch_size, bool allow_upscale = false) {
2214
- int width = original_size.first;
2215
- int height = original_size.second;
2216
- if ((width * height > scale_resolution * scale_resolution) || allow_upscale) {
2217
- float r = static_cast<float>(width) / height;
2218
- height = static_cast<int>(scale_resolution / std::sqrt(r));
2219
- width = static_cast<int>(height * r);
2220
- }
2221
- int best_width = ensure_divide(width, patch_size);
2222
- int best_height = ensure_divide(height, patch_size);
2223
- return std::make_pair(best_width, best_height);
2224
- }
2225
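Tracing the two helpers with illustrative values: ensure_divide(597, 14) rounds 597/14 ≈ 42.6 up to 43 patches, i.e. 602 px. For a 1920×1080 input with scale_resolution 448 and patch_size 14, the area exceeds 448², so r ≈ 1.778, height = 448/√r = 336, width = 336 · r ≈ 597, and snapping both sides to the patch grid returns 602×336.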
-
2226
- static std::pair<int, int> uhd_get_refine_size(std::pair<int, int> original_size, std::pair<int, int> grid, int scale_resolution, int patch_size, bool allow_upscale = false) {
2227
- int width, height;
2228
- std::tie(width, height) = original_size;
2229
- int grid_x, grid_y;
2230
- std::tie(grid_x, grid_y) = grid;
2231
-
2232
- int refine_width = ensure_divide(width, grid_x);
2233
- int refine_height = ensure_divide(height, grid_y);
2234
-
2235
- int grid_width = refine_width / grid_x;
2236
- int grid_height = refine_height / grid_y;
2237
-
2238
- // auto best_grid_size = find_best_resize(std::make_tuple(grid_width, grid_height), scale_resolution, patch_size, allow_upscale); (old line)
2239
- auto best_grid_size = uhd_find_best_resize(std::make_pair(grid_width, grid_height), scale_resolution, patch_size, allow_upscale); // (new line) => fixes conversion from make_tuple to make_pair
2240
- int best_grid_width, best_grid_height;
2241
- std::tie(best_grid_width, best_grid_height) = best_grid_size;
2242
-
2243
- // std::pair<int, int> refine_size = std::make_tuple(best_grid_width * grid_x, best_grid_height * grid_y); (old line)
2244
- std::pair<int, int> refine_size = std::make_pair(best_grid_width * grid_x, best_grid_height * grid_y); // (new line)
2245
- return refine_size;
2246
- }
2247
-
2248
- static std::pair<int, int> uhd_best_grid(const int max_slice_nums, const int multiple, const float log_ratio) {
2249
- std::vector<int> candidate_split_grids_nums;
2250
- for (int i : {multiple - 1, multiple, multiple + 1}) {
2251
- if (i == 1 || i > max_slice_nums) {
2252
- continue;
2253
- }
2254
- candidate_split_grids_nums.push_back(i);
2255
- }
2256
-
2257
- std::vector<std::pair<int, int>> candidate_grids;
2258
- for (int split_grids_nums : candidate_split_grids_nums) {
2259
- int m = 1;
2260
- while (m <= split_grids_nums) {
2261
- if (split_grids_nums % m == 0) {
2262
- candidate_grids.emplace_back(m, split_grids_nums / m);
2263
- }
2264
- ++m;
2265
- }
2266
- }
2267
-
2268
- std::pair<int, int> best_grid{1, 1};
2269
- float min_error = std::numeric_limits<float>::infinity();
2270
- for (const auto& grid : candidate_grids) {
2271
- float error = std::abs(log_ratio - std::log(1.0 * grid.first / grid.second));
2272
- if (error < min_error) {
2273
- best_grid = grid;
2274
- min_error = error;
2275
- }
2276
- }
2277
- return best_grid;
2278
- }
2279
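Continuing with illustrative numbers: a 1344×896 image at scale_resolution 448 has ratio = 1344 · 896 / 448² = 6, so multiple = 6 and the candidate slice counts are {5, 6, 7}. Enumerating their factor pairs, the 3×2 grid gives |log(1344/896) − log(3/2)| = 0, an exact aspect match, so uhd_best_grid returns (3, 2).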
-
2280
- // inspired from LLaVA-UHD:
2281
- // -> https://arxiv.org/pdf/2403.11703
2282
- // -> https://github.com/thunlp/LLaVA-UHD
2283
- // -> https://github.com/thunlp/LLaVA-UHD/blob/302301bc2175f7e717fb8548516188e89f649753/llava_uhd/train/llava-uhd/slice_logic.py#L118
2284
- static std::vector<std::vector<clip_image_u8 *>> uhd_slice_image(const clip_image_u8 * img, const int max_slice_nums=9, const int scale_resolution=448, const int patch_size=14) {
2285
- const std::pair<int, int> original_size={img->nx,img->ny};
2286
- const int original_width = img->nx;
2287
- const int original_height = img->ny;
2288
- const float log_ratio = log(1.0*original_width/original_height);
2289
- const float ratio = 1.0 * original_width * original_height/ (scale_resolution * scale_resolution);
2290
- const int multiple = fmin(ceil(ratio), max_slice_nums);
2291
-
2292
- std::vector<std::vector<clip_image_u8 *>> images;
2293
- LOG_INF("%s: multiple %d\n", __func__, multiple);
2294
- images.push_back(std::vector<clip_image_u8 *>());
2295
-
2296
- if (multiple <= 1) {
2297
- auto best_size = uhd_find_best_resize(original_size, scale_resolution, patch_size, true);
2298
- clip_image_u8 * source_image = clip_image_u8_init();
2299
- bicubic_resize(*img, *source_image, best_size.first, best_size.second);
2300
- // source_image = image.resize(best_size, Image.Resampling.BICUBIC)
2301
- images[images.size()-1].push_back(source_image);
2302
- }
2303
- else if (multiple > 1) {
2304
- auto best_size = uhd_find_best_resize(original_size, scale_resolution, patch_size);
2305
- clip_image_u8 * source_image = clip_image_u8_init();
2306
- bicubic_resize(*img, *source_image, best_size.first, best_size.second);
2307
- // source_image = image.copy().resize(best_resize, Image.Resampling.BICUBIC)
2308
- LOG_INF("%s: image_size: %d %d; source_image size: %d %d\n", __func__, img->nx, img->ny, best_size.first, best_size.second);
2309
- images[images.size()-1].push_back(source_image);
2310
-
2311
- std::pair<int, int> best_grid = uhd_best_grid(max_slice_nums, multiple, log_ratio);
2312
- LOG_INF("%s: image_size: %d %d; best_grid: %d %d\n", __func__, img->nx, img->ny, best_grid.first, best_grid.second);
2313
-
2314
- auto refine_size = uhd_get_refine_size(original_size, best_grid, scale_resolution, patch_size, true);
2315
- clip_image_u8 * refine_image = clip_image_u8_init();
2316
- bicubic_resize(*img, *refine_image, refine_size.first, refine_size.second);
2317
-
2318
- LOG_INF("%s: refine_image_size: %d %d; refine_size: %d %d\n", __func__, refine_image->nx, refine_image->ny, refine_size.first, refine_size.second);
2319
-
2320
- // split_to_patches
2321
- int width = refine_image->nx;
2322
- int height = refine_image->ny;
2323
- int grid_x = int(width / best_grid.first);
2324
- int grid_y = int(height / best_grid.second);
2325
- for (int patches_i = 0, ic = 0; patches_i < height && ic < best_grid.second; patches_i += grid_y, ic += 1){
2326
- images.push_back(std::vector<clip_image_u8 *>());
2327
- for(int patches_j = 0, jc = 0; patches_j < width && jc < best_grid.first; patches_j += grid_x, jc += 1){
2328
- clip_image_u8 * patch = clip_image_u8_init();
2329
- patch->nx = grid_x;
2330
- patch->ny = grid_y;
2331
- patch->buf.resize(3 * patch->nx * patch->ny);
2332
- for (int y = patches_i; y < patches_i + grid_y; ++y) {
2333
- for (int x = patches_j; x < patches_j + grid_x; ++x) {
2334
- const int i = 3 * (y * refine_image->nx + x);
2335
- const int j = 3 * ((y-patches_i) * patch->nx + (x-patches_j));
2336
- patch->buf[j] = refine_image->buf[i];
2337
- patch->buf[j+1] = refine_image->buf[i+1];
2338
- patch->buf[j+2] = refine_image->buf[i+2];
2339
- }
2340
- }
2341
- images[images.size()-1].push_back(patch);
2342
- }
2343
- }
2344
- clip_image_u8_free(refine_image);
2345
- }
2346
- return images;
2347
- }
2348
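The nested return value is easy to misuse: images[0] holds the single overview image and each later inner vector is one grid row of refined slices, all heap-allocated. A minimal consumption sketch (it would have to live inside clip.cpp, since uhd_slice_image is static; the counting helper itself is hypothetical):

```cpp
// Hypothetical helper: count and release the slices produced for one image.
static size_t consume_uhd_slices(const clip_image_u8 * img) {
    std::vector<std::vector<clip_image_u8 *>> slices = uhd_slice_image(img);
    size_t n = 0;
    for (auto & row : slices) {           // row 0: the overview image
        for (clip_image_u8 * s : row) {   // later rows: one grid row each
            n++;                          // ... encode/preprocess *s here ...
            clip_image_u8_free(s);        // the caller owns every slice
        }
    }
    return n;
}
```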
-
2349
- int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip) {
2350
- const int max_slice_nums=9;
2351
- const int scale_resolution=448;
2352
- const int original_width = ctx_clip->load_image_size->width;
2353
- const int original_height = ctx_clip->load_image_size->height;
2354
- const float log_ratio = log(1.0*original_width/original_height);
2355
- const float ratio = 1.0 * original_width * original_height/ (scale_resolution * scale_resolution);
2356
- const int multiple = fmin(ceil(ratio), max_slice_nums);
2357
- std::pair<int, int> best_grid = uhd_best_grid(max_slice_nums, multiple, log_ratio);
2358
- return best_grid.first;
2359
- }
2360
-
2361
- // returns the normalized float tensor for llava-1.5; for spatial_unpad with anyres processing (llava-1.6) it returns the normalized image patch tensors as a vector
2362
- // res_imgs memory is being allocated here, previous allocations will be freed if found
2363
- bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32_batch * res_imgs) {
2364
-
2365
- if(clip_is_minicpmv(ctx)){
2366
- int max_slice_nums = 9;
2367
- std::vector<std::vector<clip_image_u8 *>> imgs = uhd_slice_image(img, max_slice_nums);
2368
- res_imgs->size = 0;
2369
- for (size_t i = 0; i < imgs.size(); ++i){
2370
- res_imgs->size += imgs[i].size();
2371
- }
2372
- res_imgs->data = new clip_image_f32[res_imgs->size];
2373
- int idx = 0;
2374
- for (size_t i = 0; i < imgs.size(); ++i) {
2375
- for (size_t j = 0; j < imgs[i].size(); ++j) {
2376
- LOG_DBG("%s: %d %d\n", __func__,imgs[i][j]->nx,imgs[i][j]->ny);
2377
- clip_image_f32 * res = clip_image_f32_init();
2378
- normalize_image_u8_to_f32(imgs[i][j], res, ctx->image_mean, ctx->image_std);
2379
- res_imgs->data[idx++] = *res;
2380
- clip_image_f32_free(res);
2381
- }
2382
- }
2383
- for (size_t i = 0; i < imgs.size(); ++i) {
2384
- for (size_t j = 0; j < imgs[i].size(); ++j) {
2385
- if (imgs[i][j] != nullptr) {
2386
- clip_image_u8_free(imgs[i][j]);
2387
- }
2388
- }
2389
- }
2390
- return true;
2391
- }
2392
- else if (ctx->has_qwen2vl_merger) {
2393
- clip_image_u8 * resized = clip_image_u8_init();
2394
- auto patch_size = clip_patch_size(ctx) * 2;
2395
- int nx = ceil((float)img->nx / patch_size) * patch_size;
2396
- int ny = ceil((float)img->ny / patch_size) * patch_size;
2397
- bicubic_resize(*img, *resized, nx, ny);
2398
-
2399
- res_imgs->data = new clip_image_f32[1];
2400
- // clip_image_f32 * res = clip_image_f32_init();
2401
- normalize_image_u8_to_f32(resized, res_imgs->data, ctx->image_mean, ctx->image_std);
2402
- // res_imgs->data[0] = *res;
2403
- res_imgs->size = 1;
2404
-
2405
- // clip_image_f32_free(res);
2406
- clip_image_u8_free(resized);
2407
- return true;
2408
- }
2409
-
2410
- if (ctx->has_glm_projector || ctx->proj_type == PROJECTOR_TYPE_GEMMA3) {
2411
- res_imgs->size = 1;
2412
- res_imgs->data = new clip_image_f32[res_imgs->size];
2413
- clip_image_u8 resized_image;
2414
- int32_t sz=ctx->vision_model.hparams.image_size;
2415
- bicubic_resize(*img, resized_image,sz,sz);
2416
- clip_image_f32 * res = clip_image_f32_init();
2417
- //clip_image_save_to_bmp(resized_image, "resized.bmp");
2418
- normalize_image_u8_to_f32(&resized_image, res, ctx->image_mean, ctx->image_std);
2419
- res_imgs->data[0] = *res;
2420
- clip_image_f32_free(res);
2421
- return true;
2422
- }
2423
-
2424
- bool pad_to_square = true;
2425
- if (!ctx->has_vision_encoder) {
2426
- LOG_ERR("This gguf file seems to have no vision encoder\n");
2427
- return false;
2428
- }
2429
- auto & params = ctx->vision_model.hparams;
2430
- // The model config actually contains all we need to decide on how to preprocess; here we automatically switch to the new llava-1.6 preprocessing
2431
- if (strcmp(params.mm_patch_merge_type, "spatial_unpad") == 0) {
2432
- pad_to_square = false;
2433
- }
2434
- // free the previous res_imgs if any set
2435
- if (res_imgs->size > 0) {
2436
- clip_image_f32_batch_free(res_imgs);
2437
- }
2438
- res_imgs->data = nullptr;
2439
- res_imgs->size = 0;
2440
-
2441
- // the logic below is to pad the shorter side to the longer side with a background color: rgb(122, 116, 104)
2442
- // see https://github.com/haotian-liu/LLaVA/blob/e854a2bf85118c504f6f16bf5c3c7c92f8fa8c6b/llava/conversation.py#L113-L156
2443
-
2444
- clip_image_u8 * temp = clip_image_u8_init(); // we will keep the input image data here temporarily
2445
- if (pad_to_square && img->nx != img->ny) {
2446
- int longer_side = std::max(img->nx, img->ny);
2447
- temp->nx = longer_side;
2448
- temp->ny = longer_side;
2449
- temp->buf.resize(3 * longer_side * longer_side);
2450
- const uint8_t bc[3] = {122, 116, 104}; // background color in RGB from LLaVA (this is the mean rgb color * 255)
2451
-
2452
- // fill with background color
2453
- for (size_t i = 0; i < temp->buf.size(); i++) {
2454
- temp->buf[i] = bc[i % 3];
2455
- }
2456
-
2457
- // copy from the input image
2458
- for (int y = 0; y < img->ny; y++) {
2459
- for (int x = 0; x < img->nx; x++) {
2460
- const int i = 3 * (y * img->nx + x);
2461
- const int j = 3 * (y * temp->nx + x);
2462
- temp->buf[j] = img->buf[i];
2463
- temp->buf[j+1] = img->buf[i+1];
2464
- temp->buf[j+2] = img->buf[i+2];
2465
- }
2466
- }
2467
- } else {
2468
- if (!params.image_grid_pinpoints.empty()) {
2469
- // "spatial_unpad" with "anyres" processing for llava-1.6
2470
- std::vector<std::pair<int, int>> possible_resolutions;
2471
- for (size_t i = 0; i < params.image_grid_pinpoints.size(); i+=2) {
2472
- possible_resolutions.push_back({params.image_grid_pinpoints[i], params.image_grid_pinpoints[i+1]});
2473
- }
2474
- std::pair<int, int> best_resolution = select_best_resolution({img->nx, img->ny}, possible_resolutions);
2475
- // clip_image_save_to_bmp(*img, "input.bmp");
2476
- resize_and_pad_image(*img, *temp, best_resolution); // we do not pad with mean-bg color anymore in llava-1.6
2477
- // clip_image_save_to_bmp(*temp, "resized.bmp");
2478
- // visually verify normalized image:
2479
- // normalize_image_u8_to_f32(*temp, *res, ctx->image_mean, ctx->image_std);
2480
- // {
2481
- // clip_image_u8 * temp2 = clip_image_u8_init();
2482
- // clip_image_convert_f32_to_u8(*res, *temp2);
2483
- // clip_image_save_to_bmp(*temp2, "resized_normalized_f32.bmp");
2484
- // clip_image_u8_free(temp2);
2485
- // }
2486
-
2487
- std::vector<clip_image_u8 *> patches = divide_to_patches_u8(*temp, params.image_size); // prepare spatial sorted main patches of image_size each (336 in llava-1.6)
2488
-
2489
- clip_image_u8 *image_original_resize = clip_image_u8_init();
2490
- // bilinear_resize(*img, *image_original_resize, params.image_size, params.image_size); // in python this is "shortest_edge", but all CLIP are square
2491
- bicubic_resize(*img, *image_original_resize, params.image_size, params.image_size); // in python this is "shortest_edge", but all CLIP are square
2492
- patches.insert(patches.begin(), image_original_resize);
2493
- // clip_image_f32_batch_init(patches.size());
2494
- res_imgs->size = patches.size();
2495
- res_imgs->data = new clip_image_f32[res_imgs->size];
2496
- int num=0;
2497
- for (auto& patch : patches) {
2498
- normalize_image_u8_to_f32(patch, &res_imgs->data[num], ctx->image_mean, ctx->image_std);
2499
- num++;
2500
- }
2501
-
2502
- for (size_t i = 0; i < patches.size(); i++) {
2503
- // LOG_DBG("patch %d: %d %d\n", i, patches[i]->nx, patches[i]->ny);
2504
- clip_image_u8_free(patches[i]);
2505
- }
2506
-
2507
- clip_image_u8_free(temp);
2508
-
2509
- return true;
2510
- } else {
2511
- temp->nx = img->nx;
2512
- temp->ny = img->ny;
2513
- temp->buf.resize(img->buf.size());
2514
- memcpy(temp->buf.data(), img->buf.data(), temp->buf.size());
2515
- }
2516
- }
2517
-
2518
- const int nx = temp->nx;
2519
- const int ny = temp->ny;
2520
- // clip_image_save_to_bmp(*temp, "resized_vanilla.bmp");
2521
-
2522
- const int nx2 = ctx->vision_model.hparams.image_size;
2523
- const int ny2 = ctx->vision_model.hparams.image_size;
2524
- clip_image_f32 * res = clip_image_f32_init();
2525
- res->nx = nx2;
2526
- res->ny = ny2;
2527
- res->buf.resize(3 * nx2 * ny2);
2528
-
2529
- const float scale = std::max(nx, ny) / (float)ctx->vision_model.hparams.image_size;
2530
-
2531
- const int nx3 = int(nx / scale + 0.5f);
2532
- const int ny3 = int(ny / scale + 0.5f);
2533
-
2534
- const auto & m3 = ctx->image_mean; // {0.48145466f, 0.4578275f, 0.40821073f};
2535
- const auto & s3 = ctx->image_std; // {0.26862954f, 0.26130258f, 0.27577711f};
2536
-
2537
- for (int y = 0; y < ny3; y++) {
2538
- for (int x = 0; x < nx3; x++) {
2539
- for (int c = 0; c < 3; c++) {
2540
- // linear interpolation
2541
- const float sx = (x + 0.5f) * scale - 0.5f;
2542
- const float sy = (y + 0.5f) * scale - 0.5f;
2543
-
2544
- const int x0 = std::max(0, (int)std::floor(sx));
2545
- const int y0 = std::max(0, (int)std::floor(sy));
2546
-
2547
- const int x1 = std::min(x0 + 1, nx - 1);
2548
- const int y1 = std::min(y0 + 1, ny - 1);
2549
-
2550
- const float dx = sx - x0;
2551
- const float dy = sy - y0;
2552
-
2553
- const int j00 = 3 * (y0 * nx + x0) + c;
2554
- const int j01 = 3 * (y0 * nx + x1) + c;
2555
- const int j10 = 3 * (y1 * nx + x0) + c;
2556
- const int j11 = 3 * (y1 * nx + x1) + c;
2557
-
2558
- const float v00 = temp->buf[j00];
2559
- const float v01 = temp->buf[j01];
2560
- const float v10 = temp->buf[j10];
2561
- const float v11 = temp->buf[j11];
2562
-
2563
- const float v0 = v00 * (1.0f - dx) + v01 * dx;
2564
- const float v1 = v10 * (1.0f - dx) + v11 * dx;
2565
-
2566
- const float v = v0 * (1.0f - dy) + v1 * dy;
2567
-
2568
- const uint8_t v2 = std::min(std::max(std::round(v), 0.0f), 255.0f);
2569
-
2570
- const int i = 3 * (y * nx3 + x) + c;
2571
-
2572
- res->buf[i] = ((float(v2) / 255.0f) - m3[c]) / s3[c];
2573
- }
2574
- }
2575
- }
2576
- clip_image_u8_free(temp);
2577
-
2578
- // {
2579
- // clip_image_u8 * temp2 = clip_image_u8_init();
2580
- // clip_image_convert_f32_to_u8(*res, *temp2);
2581
- // clip_image_save_to_bmp(*temp2, "resized_normalized_f32_vanilla.bmp");
2582
- // clip_image_u8_free(temp2);
2583
- // }
2584
- // res_imgs.push_back(res);
2585
-
2586
- res_imgs->size = 1;
2587
- res_imgs->data = new clip_image_f32[res_imgs->size];
2588
- res_imgs->data[0] = *res;
2589
- clip_image_f32_free(res);
2590
-
2591
- return true;
2592
- }
2593
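For orientation, a hedged usage sketch of the function above, assuming the clip.h entry points this file implements (clip_image_u8_init, clip_image_preprocess, clip_image_u8_free) and a caller-supplied RGB buffer; error handling is elided:

```cpp
// Sketch: turn one RGB8 image into the model-ready normalized float batch.
clip_image_u8 * img = clip_image_u8_init();
img->nx = width;                          // width/height/rgb are caller inputs
img->ny = height;
img->buf.assign(rgb, rgb + 3 * (size_t)width * height);

clip_image_f32_batch batch{};             // zero-initialized: size == 0, data == nullptr
if (clip_image_preprocess(ctx, img, &batch)) {
    for (size_t i = 0; i < batch.size; i++) {
        // batch.data[i] is one normalized nx*ny*3 tensor (a patch for anyres models)
    }
}
clip_image_u8_free(img);
```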
-
2594
- ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx) {
2595
- return ctx->vision_model.image_newline;
2596
- }
2597
-
2598
- void clip_free(clip_ctx * ctx) {
2599
- delete ctx;
2600
- }
2601
-
2602
- size_t clip_embd_nbytes(const struct clip_ctx * ctx) {
2603
- int extra_tokens = ctx->has_glm_projector ? 2 : 0;
2604
- return (clip_n_patches(ctx) + extra_tokens) * clip_n_mmproj_embd(ctx) * sizeof(float);
2605
- }
2606
-
2607
- size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_h, int img_w) {
2608
- clip_image_f32 img;
2609
- img.nx = img_w;
2610
- img.ny = img_h;
2611
- return clip_n_patches_by_img(ctx, &img) * clip_n_mmproj_embd(ctx) * sizeof(float);
2612
- }
2613
-
2614
- int32_t clip_image_size(const struct clip_ctx * ctx) {
2615
- return ctx->vision_model.hparams.image_size;
2616
- }
2617
-
2618
- int32_t clip_patch_size(const struct clip_ctx * ctx) {
2619
- return ctx->vision_model.hparams.patch_size;
2620
- }
2621
-
2622
- int32_t clip_hidden_size(const struct clip_ctx * ctx) {
2623
- return ctx->vision_model.hparams.hidden_size;
2624
- }
2625
-
2626
- const char * clip_patch_merge_type(const struct clip_ctx * ctx) {
2627
- return ctx->vision_model.hparams.mm_patch_merge_type;
2628
- }
2629
-
2630
- const int32_t * clip_image_grid(const struct clip_ctx * ctx) {
2631
- if (ctx->vision_model.hparams.image_grid_pinpoints.size()) {
2632
- return &ctx->vision_model.hparams.image_grid_pinpoints.front();
2633
- }
2634
- return nullptr;
2635
- }
2636
-
2637
- size_t get_clip_image_grid_size(const struct clip_ctx * ctx) {
2638
- return ctx->vision_model.hparams.image_grid_pinpoints.size();
2639
- }
2640
-
2641
- int clip_n_patches(const struct clip_ctx * ctx) {
2642
- clip_image_f32 img;
2643
- img.nx = ctx->vision_model.hparams.image_size;
2644
- img.ny = ctx->vision_model.hparams.image_size;
2645
- return clip_n_patches_by_img(ctx, &img);
2646
- }
2647
-
2648
- int clip_n_patches_by_img(const struct clip_ctx * ctx, struct clip_image_f32 * img) {
2649
- const auto & params = ctx->vision_model.hparams;
2650
-
2651
- int n_patches = (params.image_size / params.patch_size) * (params.image_size / params.patch_size);
2652
-
2653
- if (ctx->proj_type == PROJECTOR_TYPE_LDP || ctx->proj_type == PROJECTOR_TYPE_LDPV2 || ctx->proj_type == PROJECTOR_TYPE_GLM_EDGE) {
2654
- n_patches /= 4;
2655
- } else if (ctx->proj_type == PROJECTOR_TYPE_RESAMPLER) {
2656
- if (ctx->minicpmv_version == 2) {
2657
- n_patches = 96;
2658
- }
2659
- else if (ctx->minicpmv_version == 3) {
2660
- n_patches = 64;
2661
- }
2662
- else if (ctx->minicpmv_version == 4) {
2663
- n_patches = 64;
2664
- }
2665
- } else if (ctx->proj_type == PROJECTOR_TYPE_MERGER) {
2666
- int patch_size = params.patch_size * 2;
2667
- int x_patch = img->nx / patch_size + (int)(img->nx % patch_size > 0);
2668
- int y_patch = img->ny / patch_size + (int)(img->ny % patch_size > 0);
2669
- n_patches = x_patch * y_patch;
2670
- }
2671
-
2672
- return n_patches;
2673
- }
2674
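Putting illustrative numbers through the branches above: a 336 px image with 14 px patches gives (336/14)² = 576 patches for a plain ViT-style projector, 576/4 = 144 after the LDP/LDPV2/GLM_EDGE downsampling, a fixed 96 or 64 tokens for the MiniCPM-V resampler depending on version, and for the Qwen2-VL merger a 448×448 input with merged 28 px patches yields 16 × 16 = 256.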
-
2675
- static std::vector<std::vector<std::vector<float>>> get_1d_sincos_pos_embed_from_grid_new(int embed_dim, const std::vector<std::vector<float>> & pos) {
2676
- assert(embed_dim % 2 == 0);
2677
- int H = pos.size();
2678
- int W = pos[0].size();
2679
-
2680
- std::vector<float> omega(embed_dim / 2);
2681
- for (int i = 0; i < embed_dim / 2; ++i) {
2682
- omega[i] = 1.0 / pow(10000.0, static_cast<float>(i) / (embed_dim / 2));
2683
- }
2684
-
2685
- std::vector<std::vector<std::vector<float>>> emb(H, std::vector<std::vector<float>>(W, std::vector<float>(embed_dim)));
2686
- for (int h = 0; h < H; ++h) {
2687
- for (int w = 0; w < W; ++w) {
2688
- for (int d = 0; d < embed_dim / 2; ++d) {
2689
- float out_value = pos[h][w] * omega[d];
2690
- emb[h][w][d] = sin(out_value);
2691
- emb[h][w][d + embed_dim / 2] = cos(out_value);
2692
- }
2693
- }
2694
- }
2695
-
2696
- return emb;
2697
- }
2698
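In closed form, the loop above computes the standard fixed sinusoidal embedding: for a position value p and an even embedding width D,

$$\omega_d = 10000^{-d/(D/2)}, \qquad e_d = \sin(p\,\omega_d), \qquad e_{d+D/2} = \cos(p\,\omega_d), \qquad 0 \le d < D/2$$

The 2-D variant below then concatenates the height-grid and width-grid embeddings, each of width D/2.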
-
2699
- static std::vector<std::vector<std::vector<float>>> get_2d_sincos_pos_embed_from_grid(int embed_dim, const std::vector<std::vector<std::vector<float>>> & grid) {
2700
- assert(embed_dim % 2 == 0);
2701
- std::vector<std::vector<std::vector<float>>> emb_h = get_1d_sincos_pos_embed_from_grid_new(embed_dim / 2, grid[0]); // (H, W, D/2)
2702
- std::vector<std::vector<std::vector<float>>> emb_w = get_1d_sincos_pos_embed_from_grid_new(embed_dim / 2, grid[1]); // (H, W, D/2)
2703
-
2704
- int H = emb_h.size();
2705
- int W = emb_h[0].size();
2706
- std::vector<std::vector<std::vector<float>>> emb(H, std::vector<std::vector<float>>(W, std::vector<float>(embed_dim)));
2707
-
2708
- for (int h = 0; h < H; ++h) {
2709
- for (int w = 0; w < W; ++w) {
2710
- for (int d = 0; d < embed_dim / 2; ++d) {
2711
- emb[h][w][d] = emb_h[h][w][d];
2712
- emb[h][w][d + embed_dim / 2] = emb_w[h][w][d];
2713
- }
2714
- }
2715
- }
2716
- return emb;
2717
- }
2718
-
2719
- static std::vector<std::vector<float>> get_2d_sincos_pos_embed(int embed_dim, const std::pair<int, int> image_size) {
2720
- int grid_h_size = image_size.first;
2721
- int grid_w_size = image_size.second;
2722
-
2723
- std::vector<float> grid_h(grid_h_size);
2724
- std::vector<float> grid_w(grid_w_size);
2725
-
2726
- for (int i = 0; i < grid_h_size; ++i) {
2727
- grid_h[i] = static_cast<float>(i);
2728
- }
2729
- for (int i = 0; i < grid_w_size; ++i) {
2730
- grid_w[i] = static_cast<float>(i);
2731
- }
2732
-
2733
- std::vector<std::vector<float>> grid(grid_h_size, std::vector<float>(grid_w_size));
2734
- for (int h = 0; h < grid_h_size; ++h) {
2735
- for (int w = 0; w < grid_w_size; ++w) {
2736
- grid[h][w] = grid_w[w];
2737
- }
2738
- }
2739
- std::vector<std::vector<std::vector<float>>> grid_2d = {grid, grid};
2740
- for (int h = 0; h < grid_h_size; ++h) {
2741
- for (int w = 0; w < grid_w_size; ++w) {
2742
- grid_2d[0][h][w] = grid_h[h];
2743
- grid_2d[1][h][w] = grid_w[w];
2744
- }
2745
- }
2746
-
2747
- std::vector<std::vector<std::vector<float>>> pos_embed_3d = get_2d_sincos_pos_embed_from_grid(embed_dim, grid_2d);
2748
-
2749
- int H = image_size.first;
2750
- int W = image_size.second;
2751
- std::vector<std::vector<float>> pos_embed_2d(H * W, std::vector<float>(embed_dim));
2752
- for (int h = 0; h < H; ++h) {
2753
- for (int w = 0; w < W; ++w) {
2754
- pos_embed_2d[w * H + h] = pos_embed_3d[h][w];
2755
- }
2756
- }
2757
-
2758
- return pos_embed_2d;
2759
- }
2760
-
2761
- bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f32 * img, float * vec) {
2762
- if (!ctx->has_vision_encoder) {
2763
- LOG_ERR("This gguf file seems to have no vision encoder\n");
2764
- return false;
2765
- }
2766
-
2767
- clip_image_f32_batch imgs{};
2768
- imgs.size = 1;
2769
- imgs.data = img;
2770
- return clip_image_batch_encode(ctx, n_threads, &imgs, vec);
2771
- }
2772
-
2773
- bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs, float * vec) {
2774
- if (!ctx->has_vision_encoder) {
2775
- LOG_ERR("This gguf file seems to have no vision encoder\n");
2776
- return false;
2777
- }
2778
-
2779
- int batch_size = imgs->size;
2780
- if (ctx->has_llava_projector) {
2781
- GGML_ASSERT(batch_size == 1); // TODO: support multiple images
2782
- }
2783
- if (ctx->has_minicpmv_projector) {
2784
- GGML_ASSERT(batch_size == 1);
2785
- }
2786
- if (ctx->has_glm_projector) {
2787
- GGML_ASSERT(batch_size == 1);
2788
- ggml_tensor * boi = ctx->vision_model.boi_w;
2789
- ggml_backend_tensor_get(boi,vec,0,ggml_nbytes(boi));
2790
- vec = (float*)(vec+ggml_nelements(boi)); //offset for boi
2791
- }
2792
-
2793
- // build the inference graph
2794
- ggml_backend_sched_reset(ctx->sched.get());
2795
- ggml_cgraph * gf = clip_image_build_graph(ctx, imgs, ctx->load_image_size, true);
2796
- ggml_backend_sched_alloc_graph(ctx->sched.get(), gf);
2797
-
2798
- // set inputs
2799
- const auto & model = ctx->vision_model;
2800
- const auto & hparams = model.hparams;
2801
-
2802
- const int image_size = hparams.image_size;
2803
- int image_size_width = image_size;
2804
- int image_size_height = image_size;
2805
- if (ctx->has_minicpmv_projector | ctx->has_qwen2vl_merger) {
2806
- image_size_width = imgs->data[0].nx;
2807
- image_size_height = imgs->data[0].ny;
2808
- }
2809
- const int patch_size = hparams.patch_size;
2810
- const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size));
2811
- const int num_positions = num_patches + (ctx->has_class_embedding ? 1 : 0);
2812
- if(ctx->load_image_size==nullptr){
2813
- ctx->load_image_size= clip_image_size_init();
2814
- }
2815
- const int pos_w = ctx->load_image_size->width/patch_size;
2816
- const int pos_h = ctx->load_image_size->height/patch_size;
2817
-
2818
- {
2819
- struct ggml_tensor * inp_raw = ggml_graph_get_tensor(gf, "inp_raw");
2820
- float * data = (float *)malloc(ggml_nbytes(inp_raw));
2821
-
2822
- for (size_t i = 0; i < imgs->size; i++) {
2823
- const int nx = imgs->data[i].nx;
2824
- const int ny = imgs->data[i].ny;
2825
- if (!(ctx->has_minicpmv_projector | ctx->has_qwen2vl_merger)) {
2826
- GGML_ASSERT(nx == image_size && ny == image_size);
2827
- }
2828
-
2829
- const int n = nx * ny;
2830
-
2831
- for (int b = 0; b < batch_size; b++) {
2832
- for (int k = 0; k < 3; k++) {
2833
- for (int y = 0; y < ny; y++) {
2834
- for (int x = 0; x < nx; x++) {
2835
- data[(b * 3 * n) + k * n + y * nx + x] = imgs->data[b].buf[3 * (y * nx + x) + k];
2836
- }
2837
- }
2838
- }
2839
- }
2840
- }
2841
- ggml_backend_tensor_set(inp_raw, data, 0, ggml_nbytes(inp_raw));
2842
- free(data);
2843
- }
2844
- if (ctx->has_minicpmv_projector) {
2845
- {
2846
- // inspired from siglip:
2847
- // -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit
2848
- // -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit/blob/d66538faeba44480d0bfaa42145eef26f9423199/modeling_siglip.py#L316
2849
- struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions");
2850
- int* positions_data = (int*)malloc(ggml_nbytes(positions));
2851
- int bucket_coords_h[1024];
2852
- int bucket_coords_w[1024];
2853
- for (int i = 0; i < pos_h; i++){
2854
- bucket_coords_h[i] = std::floor(70.0*i/pos_h);
2855
- }
2856
- for (int i = 0; i < pos_w; i++){
2857
- bucket_coords_w[i] = std::floor(70.0*i/pos_w);
2858
- }
2859
- for (int i = 0, id = 0; i < pos_h; i++){
2860
- for (int j = 0; j < pos_w; j++){
2861
- positions_data[id++] = bucket_coords_h[i]*70 + bucket_coords_w[j];
2862
- }
2863
- }
2864
- ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions));
2865
- free(positions_data);
2866
- }
2867
-
2868
- {
2869
- // inspired from resampler of Qwen-VL:
2870
- // -> https://huggingface.co/Qwen/Qwen-VL/tree/main
2871
- // -> https://huggingface.co/Qwen/Qwen-VL/blob/0547ed36a86561e2e42fecec8fd0c4f6953e33c4/visual.py#L23
2872
- struct ggml_tensor * pos_embed = ggml_graph_get_tensor(gf, "pos_embed");
2873
- int embed_dim = 4096;
2874
- if (ctx->minicpmv_version == 2) {
2875
- embed_dim = 4096;
2876
- }
2877
- else if (ctx->minicpmv_version == 3) {
2878
- embed_dim = 3584;
2879
- }
2880
- else if (ctx->minicpmv_version == 4) {
2881
- embed_dim = 3584;
2882
- }
2883
- auto pos_embed_t = get_2d_sincos_pos_embed(embed_dim, std::make_pair(pos_w, pos_h));
2884
-
2885
- float * pos_embed_data = (float *)malloc(ggml_nbytes(pos_embed));
2886
- for(int i=0;i < pos_w * pos_h; ++i){
2887
- for(int j=0; j < embed_dim; ++j){
2888
- pos_embed_data[i * embed_dim + j] = pos_embed_t[i][j];
2889
- }
2890
- }
2891
-
2892
- ggml_backend_tensor_set(pos_embed, pos_embed_data, 0, ggml_nbytes(pos_embed));
2893
- free(pos_embed_data);
2894
- }
2895
- }
2896
- else{
2897
- {
2898
- if (ctx->has_class_embedding) {
2899
- struct ggml_tensor * embeddings = ggml_graph_get_tensor(gf, "embeddings");
2900
-
2901
- void* zero_mem = malloc(ggml_nbytes(embeddings));
2902
- memset(zero_mem, 0, ggml_nbytes(embeddings));
2903
- ggml_backend_tensor_set(embeddings, zero_mem, 0, ggml_nbytes(embeddings));
2904
- free(zero_mem);
2905
- }
2906
- }
2907
-
2908
- if (ctx->has_qwen2vl_merger) {
2909
- struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions");
2910
-
2911
- const int pw = image_size_width / patch_size;
2912
- const int ph = image_size_height / patch_size;
2913
- int* positions_data = (int*)malloc(ggml_nbytes(positions));
2914
-
2915
- int ptr = 0;
2916
- for (int y = 0; y < ph; y+=2)
2917
- {
2918
- for (int x = 0; x < pw; x+=2)
2919
- {
2920
- for (int dy = 0; dy < 2; dy++) {
2921
- for (int dx = 0; dx < 2; dx++) {
2922
- positions_data[ptr] = y + dy;
2923
- positions_data[num_patches + ptr] = x + dx;
2924
- positions_data[num_patches * 2 + ptr] = y + dy;
2925
- positions_data[num_patches * 3 + ptr] = x + dx;
2926
- ptr++;
2927
- }
2928
- }
2929
- }
2930
- }
2931
-
2932
- ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions));
2933
- free(positions_data);
2934
- }
2935
- else if (ctx->proj_type == PROJECTOR_TYPE_GEMMA3) {
2936
- // do nothing
2937
- }
2938
- else {
2939
- struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions");
2940
-
2941
- int* positions_data = (int*)malloc(ggml_nbytes(positions));
2942
- for (int i = 0; i < num_positions; i++) {
2943
- positions_data[i] = i;
2944
- }
2945
- ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions));
2946
- free(positions_data);
2947
-
2948
- if (!ctx->has_glm_projector) {
2949
- struct ggml_tensor * patches = ggml_graph_get_tensor(gf, "patches");
2950
- // The patches vector is used to get rows to index into the embeds with;
2951
- // we should skip dim 0 only if we have CLS to avoid going out of bounds
2952
- // when retrieving the rows.
2953
- int patch_offset = ctx->has_class_embedding ? 1 : 0;
2954
- int* patches_data = (int*)malloc(ggml_nbytes(patches));
2955
- for (int i = 0; i < num_patches; i++) {
2956
- patches_data[i] = i + patch_offset;
2957
- }
2958
- ggml_backend_tensor_set(patches, patches_data, 0, ggml_nbytes(patches));
2959
- free(patches_data);
2960
- }
2961
- }
2962
- }
2963
-
2964
- ggml_backend_cpu_set_n_threads(ctx->backend_cpu, n_threads);
2965
-
2966
- auto status = ggml_backend_sched_graph_compute(ctx->sched.get(), gf);
2967
- if (status != GGML_STATUS_SUCCESS) {
2968
- LOG_ERR("%s: ggml_backend_sched_graph_compute failed with error %d\n", __func__, status);
2969
- return false;
2970
- }
2971
-
2972
- // the last node is the embedding tensor
2973
- struct ggml_tensor * embeddings = ggml_graph_node(gf, -1);
2974
-
2975
- // copy the embeddings to the location passed by the user
2976
- ggml_backend_tensor_get(embeddings, vec, 0, ggml_nbytes(embeddings));
2977
-
2978
- if (ctx->has_glm_projector) {
2979
- //eoi
2980
- ggml_tensor * eoi = ctx->vision_model.eoi_w;
2981
- int offset = ggml_nelements(embeddings);
2982
- ggml_backend_tensor_get(eoi, vec+offset, 0, ggml_nbytes(eoi));
2983
- }
2984
-
2985
- return true;
2986
- }
2987
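A hedged end-to-end sketch of the encode path, using only entry points defined in this file and reusing the batch produced by the preprocessing sketch earlier; buffer sizing comes from clip_embd_nbytes, and n_threads is an arbitrary illustrative value:

```cpp
// Sketch: encode one preprocessed image into a caller-owned embedding buffer.
std::vector<float> embd(clip_embd_nbytes(ctx) / sizeof(float));
if (!clip_image_encode(ctx, /*n_threads=*/4, &batch.data[0], embd.data())) {
    LOG_ERR("clip_image_encode failed\n");
}
// embd now holds clip_n_patches(ctx) * clip_n_mmproj_embd(ctx) floats
// (plus two extra token slots when the GLM projector writes boi/eoi).
```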
-
2988
- bool clip_model_quantize(const char * fname_inp, const char * fname_out, const int itype) {
2989
- assert(itype < GGML_TYPE_COUNT);
2990
- ggml_type type = static_cast<ggml_type>(itype);
2991
-
2992
- auto * ctx_clip = clip_model_load(fname_inp, 2);
2993
-
2994
- const auto & ctx_src = ctx_clip->ctx_gguf;
2995
- const auto & ctx_data = ctx_clip->ctx_data;
2996
-
2997
- auto * ctx_out = gguf_init_empty();
2998
- gguf_set_kv(ctx_out, ctx_src);
2999
- gguf_set_val_u32(ctx_out, "general.quantization_version", GGML_QNT_VERSION);
3000
- gguf_set_val_u32(ctx_out, "general.file_type", itype);
3001
-
3002
- auto fout = std::ofstream(fname_out, std::ios::binary);
3003
-
3004
- const int n_tensors = gguf_get_n_tensors(ctx_src);
3005
-
3006
- for (int i = 0; i < n_tensors; ++i) {
3007
- const char * name = gguf_get_tensor_name(ctx_src, i);
3008
- struct ggml_tensor * cur = ggml_get_tensor(ctx_data, name);
3009
- gguf_add_tensor(ctx_out, cur);
3010
- }
3011
-
3012
- const size_t meta_size = gguf_get_meta_size(ctx_out);
3013
- for (size_t i = 0; i < meta_size; ++i) {
3014
- fout.put(0);
3015
- }
3016
-
3017
- // regexes of tensor names to be quantized
3018
- const std::vector<std::string> k_names = {
3019
- ".*weight",
3020
- };
3021
-
3022
- std::vector<uint8_t> work(512);
3023
- std::vector<float> conv_buf(512);
3024
- size_t total_size_org = 0;
3025
- size_t total_size_new = 0;
3026
-
3027
- for (int i = 0; i < n_tensors; ++i) {
3028
- const std::string name = gguf_get_tensor_name(ctx_src, i);
3029
- struct ggml_tensor * cur = ggml_get_tensor(ctx_data, name.c_str());
3030
-
3031
- enum ggml_type new_type;
3032
- void * new_data;
3033
- size_t new_size;
3034
-
3035
- bool quantize = false;
3036
- for (const auto & s : k_names) {
3037
- if (std::regex_match(name, std::regex(s))) {
3038
- quantize = true;
3039
- break;
3040
- }
3041
- }
3042
-
3043
- // quantize only 2D tensors and bigger than block size
3044
- quantize &= (ggml_n_dims(cur) == 2) && cur->ne[0] > ggml_blck_size(type);
3045
-
3046
- if (quantize) {
3047
- new_type = type;
3048
- if (new_type >= GGML_TYPE_Q2_K && name.find("embd") != std::string::npos) {
3049
- new_type = GGML_TYPE_Q8_0; // ggml_get_rows needs non K type
3050
- // LOG_ERR("%s: quantizing %s to %s\n", __func__, name.c_str(), ggml_type_name(new_type));
3051
- }
3052
- const size_t n_elms = ggml_nelements(cur);
3053
- float * f32_data;
3054
-
3055
- switch (cur->type) {
3056
- case GGML_TYPE_F32:
3057
- f32_data = (float *)cur->data;
3058
- break;
3059
- case GGML_TYPE_F16:
3060
- if (conv_buf.size() < n_elms) {
3061
- conv_buf.resize(n_elms);
3062
- }
3063
- for (size_t j = 0; j < n_elms; ++j) {
3064
- conv_buf[j] = ggml_fp16_to_fp32(((ggml_fp16_t *)cur->data)[j]);
3065
- }
3066
- f32_data = (float *)conv_buf.data();
3067
- break;
3068
- default:
3069
- LOG_ERR("Please use an input file in f32 or f16\n");
3070
- gguf_free(ctx_out);
3071
- return false;
3072
- }
3073
-
3074
- if (work.size() < n_elms * 4) {
3075
- work.resize(n_elms * 4);
3076
- }
3077
- new_data = work.data();
3078
-
3079
- new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, n_elms/cur->ne[0], cur->ne[0], nullptr);
3080
- } else {
3081
- new_type = cur->type;
3082
- new_data = cur->data;
3083
- new_size = ggml_nbytes(cur);
3084
- }
3085
- const size_t orig_size = ggml_nbytes(cur);
3086
- total_size_org += orig_size;
3087
- total_size_new += new_size;
3088
- gguf_set_tensor_type(ctx_out, name.c_str(), new_type);
3089
- GGML_ASSERT(gguf_get_tensor_size(ctx_out, gguf_find_tensor(ctx_out, name.c_str())) == new_size);
3090
- gguf_set_tensor_data(ctx_out, name.c_str(), new_data);
3091
- fout.write((const char *)new_data, new_size);
3092
- size_t pad = GGML_PAD(new_size, gguf_get_alignment(ctx_out)) - new_size;
3093
- for (size_t j = 0; j < pad; ++j) {
3094
- fout.put(0);
3095
- }
3096
-
3097
- LOG_INF("%s: n_dims = %d | quantize=%d | size = %f MB -> %f MB\n", name.c_str(), ggml_n_dims(cur), quantize,
3098
- orig_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
3099
- }
3100
-
3101
- // go back to beginning of file and write the updated metadata
3102
- fout.seekp(0, std::ios::beg);
3103
- std::vector<uint8_t> meta(meta_size);
3104
- gguf_get_meta_data(ctx_out, meta.data());
3105
- fout.write((const char *)meta.data(), meta_size);
3106
-
3107
- fout.close();
3108
-
3109
- clip_free(ctx_clip);
3110
- gguf_free(ctx_out);
3111
-
3112
- {
3113
- LOG_INF("%s: original size = %8.2f MB\n", __func__, total_size_org / 1024.0 / 1024.0);
3114
- LOG_INF("%s: quantized size = %8.2f MB\n", __func__, total_size_new / 1024.0 / 1024.0);
3115
- }
3116
-
3117
- return true;
3118
- }
3119
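For reference, a minimal driver for the quantizer above; itype is a raw ggml_type value, and the Q4_0 target and file names shown here are illustrative assumptions, not something fixed by this diff:

```cpp
// Sketch: quantize an f32/f16 CLIP gguf to Q4_0 weights (names are hypothetical).
if (!clip_model_quantize("mmproj-f16.gguf", "mmproj-q4_0.gguf", GGML_TYPE_Q4_0)) {
    LOG_ERR("clip_model_quantize failed\n");
}
```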
-
3120
- int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
3121
- if (ctx->proj_type == PROJECTOR_TYPE_LDP) {
3122
- return ctx->vision_model.mm_model_block_1_block_2_1_b->ne[0];
3123
- }
3124
- if (ctx->proj_type == PROJECTOR_TYPE_LDPV2) {
3125
- return ctx->vision_model.mm_model_peg_0_b->ne[0];
3126
- }
3127
- if (ctx->proj_type == PROJECTOR_TYPE_MLP) {
3128
- return ctx->vision_model.mm_2_b->ne[0];
3129
- }
3130
- if (ctx->proj_type == PROJECTOR_TYPE_MLP_NORM) {
3131
- return ctx->vision_model.mm_3_b->ne[0];
3132
- }
3133
- if (ctx->proj_type == PROJECTOR_TYPE_RESAMPLER) {
3134
- if (ctx->minicpmv_version == 2) {
3135
- return 4096;
3136
- }
3137
- else if (ctx->minicpmv_version == 3) {
3138
- return 3584;
3139
- }
3140
- else if (ctx->minicpmv_version == 4) {
3141
- return 3584;
3142
- }
3143
- }
3144
- if (ctx->proj_type == PROJECTOR_TYPE_GLM_EDGE){
3145
- return ctx->vision_model.mm_model_mlp_3_w->ne[1];
3146
- }
3147
- if (ctx->proj_type == PROJECTOR_TYPE_MERGER) {
3148
- return ctx->vision_model.mm_1_b->ne[0];
3149
- }
3150
- if (ctx->proj_type == PROJECTOR_TYPE_GEMMA3) {
3151
- return ctx->vision_model.mm_input_proj_w->ne[0];
3152
- }
3153
-
3154
- std::string proj_type = PROJECTOR_TYPE_NAMES[ctx->proj_type];
3155
- throw std::runtime_error(format("%s: unsupported projector type: %s\n", __func__, proj_type.c_str()));
3156
- }
3157
-
3158
- int clip_is_minicpmv(const struct clip_ctx * ctx) {
3159
- if (ctx->has_minicpmv_projector) {
3160
- return ctx->minicpmv_version;
3161
- }
3162
- return 0;
3163
- }
3164
-
3165
- bool clip_is_glm(const struct clip_ctx * ctx) {
3166
- return ctx->has_glm_projector;
3167
- }
3168
- bool clip_is_qwen2vl(const struct clip_ctx * ctx) {
3169
- return ctx->has_qwen2vl_merger;
3170
- }
3171
-
3172
- // Determine the number of encoder layers to iterate over
3173
- int get_deepest_feature_layer(const struct clip_ctx * ctx) {
3174
- // Get the index of the second to last layer; this is the
3175
- // default for models that have a llava projector
3176
- const auto & hparams = ctx->vision_model.hparams;
3177
- int n_layer = hparams.n_layer - 1;
3178
- int deepest_feature_layer = -1;
3179
-
3180
- // Handle other projectors; incrementing here indicates that we
3181
- // should use the last encoder layer for the vision features.
3182
- if (ctx->has_minicpmv_projector || ctx->has_glm_projector || ctx->has_qwen2vl_merger) {
3183
- n_layer += 1;
3184
- }
3185
-
3186
- // If we set explicit vision feature layers, only go up to the deepest one
3187
- for (const auto & feature_layer : hparams.vision_feature_layer) {
3188
- if (feature_layer > deepest_feature_layer) {
3189
- deepest_feature_layer = feature_layer;
3190
- }
3191
- }
3192
- return deepest_feature_layer < 0 ? n_layer : deepest_feature_layer;
3193
- }
3194
-
3195
- bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec) {
3196
- clip_image_f32 clip_img;
3197
- clip_img.buf.resize(h * w * 3);
3198
- for (int i = 0; i < h*w*3; i++)
3199
- {
3200
- clip_img.buf[i] = img[i];
3201
- }
3202
- clip_img.nx = w;
3203
- clip_img.ny = h;
3204
- clip_image_encode(ctx, n_threads, &clip_img, vec);
3205
- return true;
3206
- }