@fugood/llama.node 0.3.1 → 0.3.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (252) hide show
  1. package/CMakeLists.txt +1 -8
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  8. package/bin/win32/arm64/llama-node.node +0 -0
  9. package/bin/win32/arm64/node.lib +0 -0
  10. package/bin/win32/x64/llama-node.node +0 -0
  11. package/bin/win32/x64/node.lib +0 -0
  12. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  13. package/bin/win32-vulkan/arm64/node.lib +0 -0
  14. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/x64/node.lib +0 -0
  16. package/package.json +4 -2
  17. package/src/DetokenizeWorker.cpp +1 -1
  18. package/src/EmbeddingWorker.cpp +2 -2
  19. package/src/LlamaCompletionWorker.cpp +10 -10
  20. package/src/LlamaCompletionWorker.h +2 -2
  21. package/src/LlamaContext.cpp +14 -17
  22. package/src/TokenizeWorker.cpp +1 -1
  23. package/src/common.hpp +5 -4
  24. package/src/llama.cpp/.github/workflows/build.yml +137 -29
  25. package/src/llama.cpp/.github/workflows/close-issue.yml +5 -0
  26. package/src/llama.cpp/.github/workflows/docker.yml +46 -34
  27. package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +7 -0
  28. package/src/llama.cpp/.github/workflows/nix-ci.yml +7 -0
  29. package/src/llama.cpp/.github/workflows/python-check-requirements.yml +2 -4
  30. package/src/llama.cpp/.github/workflows/python-type-check.yml +3 -1
  31. package/src/llama.cpp/.github/workflows/server.yml +7 -0
  32. package/src/llama.cpp/CMakeLists.txt +26 -11
  33. package/src/llama.cpp/cmake/arm64-apple-clang.cmake +16 -0
  34. package/src/llama.cpp/common/CMakeLists.txt +10 -10
  35. package/src/llama.cpp/common/arg.cpp +2041 -0
  36. package/src/llama.cpp/common/arg.h +77 -0
  37. package/src/llama.cpp/common/common.cpp +523 -1861
  38. package/src/llama.cpp/common/common.h +234 -106
  39. package/src/llama.cpp/common/console.cpp +3 -0
  40. package/src/llama.cpp/common/json-schema-to-grammar.cpp +1 -1
  41. package/src/llama.cpp/common/log.cpp +401 -0
  42. package/src/llama.cpp/common/log.h +66 -698
  43. package/src/llama.cpp/common/ngram-cache.cpp +39 -36
  44. package/src/llama.cpp/common/ngram-cache.h +19 -19
  45. package/src/llama.cpp/common/sampling.cpp +356 -350
  46. package/src/llama.cpp/common/sampling.h +62 -139
  47. package/src/llama.cpp/common/stb_image.h +5990 -6398
  48. package/src/llama.cpp/docs/build.md +72 -17
  49. package/src/llama.cpp/examples/CMakeLists.txt +1 -2
  50. package/src/llama.cpp/examples/batched/batched.cpp +49 -65
  51. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +42 -53
  52. package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +55 -52
  53. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +22 -22
  54. package/src/llama.cpp/examples/cvector-generator/pca.hpp +3 -13
  55. package/src/llama.cpp/examples/embedding/embedding.cpp +147 -91
  56. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +37 -37
  57. package/src/llama.cpp/examples/export-lora/export-lora.cpp +39 -38
  58. package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +14 -39
  59. package/src/llama.cpp/examples/{baby-llama → gen-docs}/CMakeLists.txt +2 -2
  60. package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +83 -0
  61. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +58 -39
  62. package/src/llama.cpp/examples/gritlm/gritlm.cpp +46 -39
  63. package/src/llama.cpp/examples/imatrix/imatrix.cpp +75 -69
  64. package/src/llama.cpp/examples/infill/infill.cpp +131 -192
  65. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +276 -178
  66. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
  67. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +40 -36
  68. package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
  69. package/src/llama.cpp/examples/llava/clip.cpp +686 -150
  70. package/src/llama.cpp/examples/llava/clip.h +11 -2
  71. package/src/llama.cpp/examples/llava/llava-cli.cpp +60 -71
  72. package/src/llama.cpp/examples/llava/llava.cpp +146 -26
  73. package/src/llama.cpp/examples/llava/llava.h +2 -3
  74. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +323 -0
  75. package/src/llama.cpp/examples/llava/requirements.txt +1 -0
  76. package/src/llama.cpp/examples/lookahead/lookahead.cpp +55 -56
  77. package/src/llama.cpp/examples/lookup/lookup-create.cpp +15 -13
  78. package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
  79. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +34 -33
  80. package/src/llama.cpp/examples/lookup/lookup.cpp +60 -63
  81. package/src/llama.cpp/examples/main/main.cpp +216 -313
  82. package/src/llama.cpp/examples/parallel/parallel.cpp +58 -59
  83. package/src/llama.cpp/examples/passkey/passkey.cpp +53 -61
  84. package/src/llama.cpp/examples/perplexity/perplexity.cpp +277 -311
  85. package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
  86. package/src/llama.cpp/examples/quantize/quantize.cpp +27 -9
  87. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +12 -12
  88. package/src/llama.cpp/examples/retrieval/retrieval.cpp +57 -52
  89. package/src/llama.cpp/examples/rpc/rpc-server.cpp +27 -2
  90. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +60 -46
  91. package/src/llama.cpp/examples/server/CMakeLists.txt +7 -18
  92. package/src/llama.cpp/examples/server/server.cpp +1347 -1531
  93. package/src/llama.cpp/examples/server/tests/requirements.txt +2 -1
  94. package/src/llama.cpp/examples/server/utils.hpp +396 -107
  95. package/src/llama.cpp/examples/simple/CMakeLists.txt +1 -1
  96. package/src/llama.cpp/examples/simple/simple.cpp +132 -106
  97. package/src/llama.cpp/examples/simple-chat/CMakeLists.txt +5 -0
  98. package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +197 -0
  99. package/src/llama.cpp/examples/speculative/speculative.cpp +153 -124
  100. package/src/llama.cpp/examples/sycl/run-llama2.sh +10 -19
  101. package/src/llama.cpp/examples/sycl/win-run-llama2.bat +1 -1
  102. package/src/llama.cpp/examples/tokenize/tokenize.cpp +27 -29
  103. package/src/llama.cpp/ggml/CMakeLists.txt +29 -12
  104. package/src/llama.cpp/ggml/include/ggml-alloc.h +3 -3
  105. package/src/llama.cpp/ggml/include/ggml-amx.h +25 -0
  106. package/src/llama.cpp/ggml/include/ggml-backend.h +166 -68
  107. package/src/llama.cpp/ggml/include/ggml-blas.h +5 -3
  108. package/src/llama.cpp/ggml/include/ggml-cann.h +17 -19
  109. package/src/llama.cpp/ggml/include/ggml-cpp.h +38 -0
  110. package/src/llama.cpp/ggml/include/ggml-cpu.h +177 -0
  111. package/src/llama.cpp/ggml/include/ggml-cuda.h +17 -17
  112. package/src/llama.cpp/ggml/include/ggml-kompute.h +7 -3
  113. package/src/llama.cpp/ggml/include/ggml-metal.h +13 -12
  114. package/src/llama.cpp/ggml/include/ggml-opt.h +216 -0
  115. package/src/llama.cpp/ggml/include/ggml-rpc.h +9 -5
  116. package/src/llama.cpp/ggml/include/ggml-sycl.h +18 -11
  117. package/src/llama.cpp/ggml/include/ggml-vulkan.h +10 -8
  118. package/src/llama.cpp/ggml/include/ggml.h +272 -505
  119. package/src/llama.cpp/ggml/src/CMakeLists.txt +69 -1110
  120. package/src/llama.cpp/ggml/src/ggml-aarch64.c +52 -2116
  121. package/src/llama.cpp/ggml/src/ggml-aarch64.h +0 -20
  122. package/src/llama.cpp/ggml/src/ggml-alloc.c +29 -27
  123. package/src/llama.cpp/ggml/src/ggml-amx/CMakeLists.txt +107 -0
  124. package/src/llama.cpp/ggml/src/ggml-amx/common.h +94 -0
  125. package/src/llama.cpp/ggml/src/ggml-amx/ggml-amx.cpp +446 -0
  126. package/src/llama.cpp/ggml/src/ggml-amx/mmq.cpp +2510 -0
  127. package/src/llama.cpp/ggml/src/ggml-amx/mmq.h +17 -0
  128. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +144 -81
  129. package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +195 -0
  130. package/src/llama.cpp/ggml/src/{ggml-backend.c → ggml-backend.cpp} +394 -635
  131. package/src/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +91 -0
  132. package/src/llama.cpp/ggml/src/{ggml-blas.cpp → ggml-blas/ggml-blas.cpp} +217 -70
  133. package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +46 -0
  134. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +4 -27
  135. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +32 -4
  136. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +179 -41
  137. package/src/llama.cpp/ggml/src/ggml-cann/common.h +1 -0
  138. package/src/llama.cpp/ggml/src/{ggml-cann.cpp → ggml-cann/ggml-cann.cpp} +458 -353
  139. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -1
  140. package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +2 -0
  141. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +278 -0
  142. package/src/llama.cpp/ggml/src/ggml-common.h +20 -0
  143. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +261 -0
  144. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.c +3560 -0
  145. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +30 -0
  146. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +371 -0
  147. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +10822 -0
  148. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.h +63 -0
  149. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +13970 -0
  150. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +663 -0
  151. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +1885 -0
  152. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +155 -0
  153. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +14 -0
  154. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +178 -0
  155. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +134 -0
  156. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +106 -0
  157. package/src/llama.cpp/ggml/src/ggml-impl.h +380 -584
  158. package/src/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +162 -0
  159. package/src/llama.cpp/ggml/src/{ggml-kompute.cpp → ggml-kompute/ggml-kompute.cpp} +233 -87
  160. package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +108 -0
  161. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +249 -0
  162. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +100 -0
  163. package/src/llama.cpp/ggml/src/ggml-opt.cpp +867 -0
  164. package/src/llama.cpp/ggml/src/ggml-quants.c +369 -9994
  165. package/src/llama.cpp/ggml/src/ggml-quants.h +78 -110
  166. package/src/llama.cpp/ggml/src/ggml-rpc/CMakeLists.txt +11 -0
  167. package/src/llama.cpp/ggml/src/{ggml-rpc.cpp → ggml-rpc/ggml-rpc.cpp} +560 -335
  168. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +81 -0
  169. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +6 -0
  170. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +51 -0
  171. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +310 -0
  172. package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +1 -0
  173. package/src/llama.cpp/ggml/src/ggml-sycl/conv.cpp +99 -0
  174. package/src/llama.cpp/ggml/src/ggml-sycl/conv.hpp +21 -0
  175. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +57 -57
  176. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +1 -1
  177. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +106 -106
  178. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +4 -4
  179. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +18 -25
  180. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +1011 -0
  181. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +76 -0
  182. package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +101 -0
  183. package/src/llama.cpp/ggml/src/{ggml-sycl.cpp → ggml-sycl/ggml-sycl.cpp} +3350 -3980
  184. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +125 -0
  185. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +23 -0
  186. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +70 -68
  187. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +9 -6
  188. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +56 -0
  189. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.hpp +11 -0
  190. package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +8 -0
  191. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +1 -1
  192. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +71 -0
  193. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.hpp +21 -0
  194. package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +4 -4
  195. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +138 -0
  196. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +10 -0
  197. package/src/llama.cpp/ggml/src/ggml-threading.cpp +12 -0
  198. package/src/llama.cpp/ggml/src/ggml-threading.h +12 -0
  199. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +78 -0
  200. package/src/llama.cpp/ggml/src/{ggml-vulkan.cpp → ggml-vulkan/ggml-vulkan.cpp} +2034 -1718
  201. package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/CMakeLists.txt +2 -0
  202. package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/vulkan-shaders-gen.cpp +152 -185
  203. package/src/llama.cpp/ggml/src/ggml.c +2075 -16579
  204. package/src/llama.cpp/include/llama.h +296 -285
  205. package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.inp +112 -0
  206. package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.out +46 -0
  207. package/src/llama.cpp/pocs/vdot/q8dot.cpp +4 -3
  208. package/src/llama.cpp/pocs/vdot/vdot.cpp +8 -7
  209. package/src/llama.cpp/requirements/requirements-convert_legacy_llama.txt +1 -1
  210. package/src/llama.cpp/src/CMakeLists.txt +2 -1
  211. package/src/llama.cpp/src/llama-grammar.cpp +721 -122
  212. package/src/llama.cpp/src/llama-grammar.h +120 -15
  213. package/src/llama.cpp/src/llama-impl.h +156 -1
  214. package/src/llama.cpp/src/llama-sampling.cpp +2058 -346
  215. package/src/llama.cpp/src/llama-sampling.h +39 -47
  216. package/src/llama.cpp/src/llama-vocab.cpp +390 -127
  217. package/src/llama.cpp/src/llama-vocab.h +60 -20
  218. package/src/llama.cpp/src/llama.cpp +6215 -3263
  219. package/src/llama.cpp/src/unicode-data.cpp +6 -4
  220. package/src/llama.cpp/src/unicode-data.h +4 -4
  221. package/src/llama.cpp/src/unicode.cpp +15 -7
  222. package/src/llama.cpp/tests/CMakeLists.txt +4 -2
  223. package/src/llama.cpp/tests/test-arg-parser.cpp +131 -0
  224. package/src/llama.cpp/tests/test-backend-ops.cpp +1725 -297
  225. package/src/llama.cpp/tests/test-barrier.cpp +94 -0
  226. package/src/llama.cpp/tests/test-chat-template.cpp +9 -5
  227. package/src/llama.cpp/tests/test-grammar-integration.cpp +23 -38
  228. package/src/llama.cpp/tests/test-grammar-parser.cpp +6 -4
  229. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +23 -8
  230. package/src/llama.cpp/tests/test-llama-grammar.cpp +9 -8
  231. package/src/llama.cpp/tests/test-log.cpp +39 -0
  232. package/src/llama.cpp/tests/test-opt.cpp +853 -142
  233. package/src/llama.cpp/tests/test-quantize-fns.cpp +28 -19
  234. package/src/llama.cpp/tests/test-quantize-perf.cpp +16 -14
  235. package/src/llama.cpp/tests/test-rope.cpp +2 -1
  236. package/src/llama.cpp/tests/test-sampling.cpp +226 -142
  237. package/src/llama.cpp/tests/test-tokenizer-0.cpp +56 -36
  238. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +5 -5
  239. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +5 -5
  240. package/patches/llama.patch +0 -22
  241. package/src/llama.cpp/.github/workflows/bench.yml +0 -310
  242. package/src/llama.cpp/common/grammar-parser.cpp +0 -536
  243. package/src/llama.cpp/common/grammar-parser.h +0 -29
  244. package/src/llama.cpp/common/train.cpp +0 -1513
  245. package/src/llama.cpp/common/train.h +0 -233
  246. package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +0 -1640
  247. package/src/llama.cpp/examples/benchmark/CMakeLists.txt +0 -6
  248. package/src/llama.cpp/examples/benchmark/benchmark-matmult.cpp +0 -275
  249. package/src/llama.cpp/ggml/src/llamafile/sgemm.cpp +0 -1027
  250. package/src/llama.cpp/tests/test-grad0.cpp +0 -1566
  251. /package/src/llama.cpp/ggml/{cmake → src/ggml-cpu/cmake}/FindSIMD.cmake +0 -0
  252. /package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.h +0 -0
@@ -3,8 +3,8 @@
3
3
  // I'll gradually clean and extend it
4
4
  // Note: Even when using identical normalized image inputs (see normalize_image_u8_to_f32()) we have a significant difference in resulting embeddings compared to pytorch
5
5
  #include "clip.h"
6
- #include "log.h"
7
6
  #include "ggml.h"
7
+ #include "ggml-cpu.h"
8
8
  #include "ggml-alloc.h"
9
9
  #include "ggml-backend.h"
10
10
 
@@ -20,6 +20,10 @@
20
20
  #include "ggml-cann.h"
21
21
  #endif
22
22
 
23
+ #ifdef GGML_USE_VULKAN
24
+ #include "ggml-vulkan.h"
25
+ #endif
26
+
23
27
  #define STB_IMAGE_IMPLEMENTATION
24
28
  #include "stb_image.h"
25
29
 
@@ -36,6 +40,11 @@
36
40
  #include <cinttypes>
37
41
  #include <limits>
38
42
 
43
+ #define LOG_INF(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
44
+ #define LOG_WRN(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
45
+ #define LOG_ERR(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
46
+ #define LOG_DBG(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
47
+
39
48
  //#define CLIP_DEBUG_FUNCTIONS
40
49
 
41
50
  // RGB uint8 image
@@ -74,26 +83,28 @@ static std::string format(const char * fmt, ...) {
74
83
  // key constants
75
84
  //
76
85
 
77
- #define KEY_FTYPE "general.file_type"
78
- #define KEY_NAME "general.name"
79
- #define KEY_DESCRIPTION "general.description"
80
- #define KEY_HAS_TEXT_ENC "clip.has_text_encoder"
81
- #define KEY_HAS_VIS_ENC "clip.has_vision_encoder"
82
- #define KEY_HAS_LLAVA_PROJ "clip.has_llava_projector"
83
- #define KEY_USE_GELU "clip.use_gelu"
84
- #define KEY_N_EMBD "clip.%s.embedding_length"
85
- #define KEY_N_FF "clip.%s.feed_forward_length"
86
- #define KEY_N_BLOCK "clip.%s.block_count"
87
- #define KEY_N_HEAD "clip.%s.attention.head_count"
88
- #define KEY_LAYER_NORM_EPS "clip.%s.attention.layer_norm_epsilon"
89
- #define KEY_PROJ_DIM "clip.%s.projection_dim"
90
- #define KEY_TOKENS "tokenizer.ggml.tokens"
91
- #define KEY_N_POSITIONS "clip.text.context_length"
92
- #define KEY_IMAGE_SIZE "clip.vision.image_size"
93
- #define KEY_PATCH_SIZE "clip.vision.patch_size"
94
- #define KEY_IMAGE_MEAN "clip.vision.image_mean"
95
- #define KEY_IMAGE_STD "clip.vision.image_std"
96
- #define KEY_PROJ_TYPE "clip.projector_type"
86
+ #define KEY_FTYPE "general.file_type"
87
+ #define KEY_NAME "general.name"
88
+ #define KEY_DESCRIPTION "general.description"
89
+ #define KEY_HAS_TEXT_ENC "clip.has_text_encoder"
90
+ #define KEY_HAS_VIS_ENC "clip.has_vision_encoder"
91
+ #define KEY_HAS_LLAVA_PROJ "clip.has_llava_projector"
92
+ #define KEY_HAS_MINICPMV_PROJ "clip.has_minicpmv_projector"
93
+ #define KEY_MINICPMV_VERSION "clip.minicpmv_version"
94
+ #define KEY_USE_GELU "clip.use_gelu"
95
+ #define KEY_N_EMBD "clip.%s.embedding_length"
96
+ #define KEY_N_FF "clip.%s.feed_forward_length"
97
+ #define KEY_N_BLOCK "clip.%s.block_count"
98
+ #define KEY_N_HEAD "clip.%s.attention.head_count"
99
+ #define KEY_LAYER_NORM_EPS "clip.%s.attention.layer_norm_epsilon"
100
+ #define KEY_PROJ_DIM "clip.%s.projection_dim"
101
+ #define KEY_TOKENS "tokenizer.ggml.tokens"
102
+ #define KEY_N_POSITIONS "clip.text.context_length"
103
+ #define KEY_IMAGE_SIZE "clip.vision.image_size"
104
+ #define KEY_PATCH_SIZE "clip.vision.patch_size"
105
+ #define KEY_IMAGE_MEAN "clip.vision.image_mean"
106
+ #define KEY_IMAGE_STD "clip.vision.image_std"
107
+ #define KEY_PROJ_TYPE "clip.projector_type"
97
108
 
98
109
  #define KEY_MM_PATCH_MERGE_TYPE "clip.vision.mm_patch_merge_type"
99
110
  #define KEY_IMAGE_GRID_PINPOINTS "clip.vision.image_grid_pinpoints"
@@ -127,12 +138,20 @@ static std::string format(const char * fmt, ...) {
127
138
  #define TN_MVLM_PROJ_PEG "mm.model.peg.%d.%s"
128
139
  #define TN_IMAGE_NEWLINE "model.image_newline"
129
140
 
141
+ #define TN_MINICPMV_POS_EMBD_K "resampler.pos_embed_k"
142
+ #define TN_MINICPMV_QUERY "resampler.query"
143
+ #define TN_MINICPMV_PROJ "resampler.proj.weight"
144
+ #define TN_MINICPMV_KV_PROJ "resampler.kv.weight"
145
+ #define TN_MINICPMV_ATTN "resampler.attn.%s.%s"
146
+ #define TN_MINICPMV_LN "resampler.ln_%s.%s"
147
+
130
148
 
131
149
  enum projector_type {
132
150
  PROJECTOR_TYPE_MLP,
133
151
  PROJECTOR_TYPE_MLP_NORM,
134
152
  PROJECTOR_TYPE_LDP,
135
153
  PROJECTOR_TYPE_LDPV2,
154
+ PROJECTOR_TYPE_RESAMPLER,
136
155
  PROJECTOR_TYPE_UNKNOWN,
137
156
  };
138
157
 
@@ -140,6 +159,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
140
159
  { PROJECTOR_TYPE_MLP, "mlp" },
141
160
  { PROJECTOR_TYPE_LDP, "ldp" },
142
161
  { PROJECTOR_TYPE_LDPV2, "ldpv2"},
162
+ { PROJECTOR_TYPE_RESAMPLER, "resampler"},
143
163
  };
144
164
 
145
165
 
@@ -150,7 +170,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
150
170
  static int get_key_idx(const gguf_context * ctx, const char * key) {
151
171
  int i = gguf_find_key(ctx, key);
152
172
  if (i == -1) {
153
- LOG_TEE("key %s not found in file\n", key);
173
+ LOG_ERR("key %s not found in file\n", key);
154
174
  throw std::runtime_error(format("Missing required key: %s", key));
155
175
  }
156
176
 
@@ -200,17 +220,20 @@ static std::string gguf_data_to_str(enum gguf_type type, const void * data, int
200
220
  }
201
221
 
202
222
  static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
203
- std::string result;
204
- for (size_t pos = 0; ; pos += search.length()) {
205
- auto new_pos = s.find(search, pos);
206
- if (new_pos == std::string::npos) {
207
- result += s.substr(pos, s.size() - pos);
208
- break;
209
- }
210
- result += s.substr(pos, new_pos - pos) + replace;
211
- pos = new_pos;
223
+ if (search.empty()) {
224
+ return;
212
225
  }
213
- s = std::move(result);
226
+ std::string builder;
227
+ builder.reserve(s.length());
228
+ size_t pos = 0;
229
+ size_t last_pos = 0;
230
+ while ((pos = s.find(search, last_pos)) != std::string::npos) {
231
+ builder.append(s, last_pos, pos - last_pos);
232
+ builder.append(replace);
233
+ last_pos = pos + search.length();
234
+ }
235
+ builder.append(s, last_pos, std::string::npos);
236
+ s = std::move(builder);
214
237
  }
215
238
 
216
239
  static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) {
@@ -252,7 +275,7 @@ static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) {
252
275
 
253
276
  static void print_tensor_info(const ggml_tensor * tensor, const char * prefix = "") {
254
277
  size_t tensor_size = ggml_nbytes(tensor);
255
- LOG_TEE("%s: n_dims = %d, name = %s, tensor_size=%zu, shape:[%" PRId64 ", %" PRId64 ", %" PRId64 ", %" PRId64 "], type = %s\n",
278
+ LOG_INF("%s: n_dims = %d, name = %s, tensor_size=%zu, shape:[%" PRId64 ", %" PRId64 ", %" PRId64 ", %" PRId64 "], type = %s\n",
256
279
  prefix, ggml_n_dims(tensor), tensor->name, tensor_size,
257
280
  tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3], ggml_type_name(tensor->type));
258
281
  }
@@ -270,7 +293,7 @@ static projector_type clip_projector_type_from_string(const std::string & name)
270
293
  static void clip_image_write_image_to_ppm(const clip_image_u8& img, const std::string& filename) {
271
294
  std::ofstream file(filename, std::ios::binary);
272
295
  if (!file.is_open()) {
273
- LOG_TEE("Failed to open file for writing: %s\n", filename.c_str());
296
+ LOG_ERR("Failed to open file for writing: %s\n", filename.c_str());
274
297
  return;
275
298
  }
276
299
 
@@ -289,7 +312,7 @@ static void clip_image_write_image_to_ppm(const clip_image_u8& img, const std::s
289
312
  static void clip_image_save_to_bmp(const clip_image_u8& img, const std::string& filename) {
290
313
  std::ofstream file(filename, std::ios::binary);
291
314
  if (!file.is_open()) {
292
- LOG_TEE("Failed to open file for writing: %s\n", filename.c_str());
315
+ LOG_ERR("Failed to open file for writing: %s\n", filename.c_str());
293
316
  return;
294
317
  }
295
318
 
@@ -492,12 +515,34 @@ struct clip_vision_model {
492
515
  struct ggml_tensor * mm_model_mlp_2_b;
493
516
  struct ggml_tensor * mm_model_peg_0_w;
494
517
  struct ggml_tensor * mm_model_peg_0_b;
518
+
519
+ // MINICPMV projection
520
+ struct ggml_tensor * mm_model_pos_embed_k;
521
+ struct ggml_tensor * mm_model_query;
522
+ struct ggml_tensor * mm_model_proj;
523
+ struct ggml_tensor * mm_model_kv_proj;
524
+ struct ggml_tensor * mm_model_attn_q_w;
525
+ struct ggml_tensor * mm_model_attn_q_b;
526
+ struct ggml_tensor * mm_model_attn_k_w;
527
+ struct ggml_tensor * mm_model_attn_k_b;
528
+ struct ggml_tensor * mm_model_attn_v_w;
529
+ struct ggml_tensor * mm_model_attn_v_b;
530
+ struct ggml_tensor * mm_model_attn_o_w;
531
+ struct ggml_tensor * mm_model_attn_o_b;
532
+ struct ggml_tensor * mm_model_ln_q_w;
533
+ struct ggml_tensor * mm_model_ln_q_b;
534
+ struct ggml_tensor * mm_model_ln_kv_w;
535
+ struct ggml_tensor * mm_model_ln_kv_b;
536
+ struct ggml_tensor * mm_model_ln_post_w;
537
+ struct ggml_tensor * mm_model_ln_post_b;
495
538
  };
496
539
 
497
540
  struct clip_ctx {
498
541
  bool has_text_encoder = false;
499
542
  bool has_vision_encoder = false;
500
543
  bool has_llava_projector = false;
544
+ bool has_minicpmv_projector = false;
545
+ int minicpmv_version = 2;
501
546
 
502
547
  struct clip_vision_model vision_model;
503
548
  projector_type proj_type = PROJECTOR_TYPE_MLP;
@@ -522,31 +567,46 @@ struct clip_ctx {
522
567
 
523
568
  ggml_backend_t backend = NULL;
524
569
  ggml_gallocr_t compute_alloc = NULL;
570
+
571
+ struct clip_image_size * load_image_size;
525
572
  };
526
573
 
527
- static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch * imgs) {
574
+ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch * imgs, struct clip_image_size * load_image_size, bool is_inf = false) {
528
575
  if (!ctx->has_vision_encoder) {
529
- LOG_TEE("This gguf file seems to have no vision encoder\n");
576
+ LOG_ERR("This gguf file seems to have no vision encoder\n");
530
577
  return nullptr;
531
578
  }
532
579
 
533
580
  const auto & model = ctx->vision_model;
534
581
  const auto & hparams = model.hparams;
535
582
 
536
- const int image_size = hparams.image_size;
583
+ const int image_size = hparams.image_size;
584
+ int image_size_width = image_size;
585
+ int image_size_height = image_size;
586
+ if (ctx->has_minicpmv_projector) {
587
+ if (load_image_size == nullptr) {
588
+ load_image_size = clip_image_size_init();
589
+ }
590
+ LOG_DBG("%s: %d %d\n", __func__, load_image_size->width, load_image_size->height);
591
+ image_size_width = load_image_size->width;
592
+ image_size_height = load_image_size->height;
593
+ if (is_inf) {
594
+ image_size_width = imgs->data->nx;
595
+ image_size_height = imgs->data->ny;
596
+ }
597
+ }
537
598
  const int patch_size = hparams.patch_size;
538
- const int num_patches = ((image_size / patch_size) * (image_size / patch_size));
539
- const int num_patches_per_side = image_size / patch_size; GGML_UNUSED(num_patches_per_side);
599
+ const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size));
540
600
  const int num_positions = num_patches + (ctx->has_class_embedding ? 1 : 0);
541
601
  const int hidden_size = hparams.hidden_size;
542
602
  const int n_head = hparams.n_head;
543
603
  const int d_head = hidden_size / n_head;
544
- const int n_layer = hparams.n_layer;
604
+ int n_layer = hparams.n_layer;
545
605
  const float eps = hparams.eps;
546
606
 
547
607
  const int batch_size = imgs->size;
548
608
 
549
- if (ctx->has_llava_projector) {
609
+ if (ctx->has_llava_projector || ctx->has_minicpmv_projector) {
550
610
  GGML_ASSERT(batch_size == 1);
551
611
  }
552
612
 
@@ -559,7 +619,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
559
619
  struct ggml_context * ctx0 = ggml_init(params);
560
620
  struct ggml_cgraph * gf = ggml_new_graph(ctx0);
561
621
 
562
- struct ggml_tensor * inp_raw = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, image_size, image_size, 3, batch_size);
622
+ struct ggml_tensor * inp_raw = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, image_size_width, image_size_height, 3, batch_size);
563
623
  ggml_set_name(inp_raw, "inp_raw");
564
624
  ggml_set_input(inp_raw);
565
625
 
@@ -572,19 +632,21 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
572
632
  // inp = ggml_add(ctx0, inp, ggml_repeat(ctx0, model.patch_bias, inp));
573
633
  inp = ggml_add(ctx0, inp, model.patch_bias);
574
634
  }
575
-
576
- // concat class_embeddings and patch_embeddings
577
635
  struct ggml_tensor * embeddings = inp;
578
- if (ctx->has_class_embedding) {
579
- embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size);
580
- ggml_set_name(embeddings, "embeddings");
581
- ggml_set_input(embeddings);
582
- embeddings = ggml_acc(ctx0, embeddings, model.class_embedding,
583
- embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0);
584
- embeddings = ggml_acc(ctx0, embeddings, inp,
585
- embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]);
586
- }
636
+ struct ggml_tensor * pos_embed = nullptr;
587
637
 
638
+ if (ctx->has_llava_projector) {
639
+ // concat class_embeddings and patch_embeddings
640
+ if (ctx->has_class_embedding) {
641
+ embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size);
642
+ ggml_set_name(embeddings, "embeddings");
643
+ ggml_set_input(embeddings);
644
+ embeddings = ggml_acc(ctx0, embeddings, model.class_embedding,
645
+ embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0);
646
+ embeddings = ggml_acc(ctx0, embeddings, inp,
647
+ embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]);
648
+ }
649
+ }
588
650
 
589
651
  struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_positions);
590
652
  ggml_set_name(positions, "positions");
@@ -593,6 +655,19 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
593
655
  embeddings =
594
656
  ggml_add(ctx0, embeddings, ggml_get_rows(ctx0, model.position_embeddings, positions));
595
657
 
658
+ if (ctx->has_minicpmv_projector) {
659
+ int pos_w = image_size_width/patch_size;
660
+ int pos_h = image_size_height/patch_size;
661
+ if (ctx->minicpmv_version == 2) {
662
+ pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 4096, pos_w * pos_h, 1);
663
+ }
664
+ else if (ctx->minicpmv_version == 3) {
665
+ pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 3584, pos_w * pos_h, 1);
666
+ }
667
+ ggml_set_name(pos_embed, "pos_embed");
668
+ ggml_set_input(pos_embed);
669
+ }
670
+
596
671
  // pre-layernorm
597
672
  if (ctx->has_pre_norm) {
598
673
  embeddings = ggml_norm(ctx0, embeddings, eps);
@@ -602,6 +677,9 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
602
677
  }
603
678
 
604
679
  // loop over layers
680
+ if (ctx->has_minicpmv_projector) {
681
+ n_layer += 1;
682
+ }
605
683
  for (int il = 0; il < n_layer - 1; il++) {
606
684
  struct ggml_tensor * cur = embeddings; // embeddings = residual, cur = hidden_states
607
685
 
@@ -691,7 +769,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
691
769
  }
692
770
 
693
771
  // llava projector
694
- {
772
+ if (ctx->has_llava_projector) {
695
773
  embeddings = ggml_reshape_2d(ctx0, embeddings, embeddings->ne[0], embeddings->ne[1]);
696
774
 
697
775
  struct ggml_tensor * patches = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_patches);
@@ -712,8 +790,8 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
712
790
  embeddings = ggml_gelu(ctx0, embeddings);
713
791
  embeddings = ggml_mul_mat(ctx0, model.mm_2_w, embeddings);
714
792
  embeddings = ggml_add(ctx0, embeddings, model.mm_2_b);
715
-
716
- } else if (ctx->proj_type == PROJECTOR_TYPE_MLP_NORM) {
793
+ }
794
+ else if (ctx->proj_type == PROJECTOR_TYPE_MLP_NORM) {
717
795
  embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
718
796
  embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);
719
797
  // ggml_tensor_printf(embeddings, "mm_0_w",0,true,false);
@@ -872,6 +950,75 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
872
950
  GGML_ABORT("fatal error");
873
951
  }
874
952
  }
953
+ // minicpmv projector
954
+ else if (ctx->has_minicpmv_projector)
955
+ {
956
+ if (ctx->proj_type == PROJECTOR_TYPE_RESAMPLER) {
957
+ struct ggml_tensor * q = model.mm_model_query;
958
+ { // layernorm
959
+ q = ggml_norm(ctx0, q, eps);
960
+ q = ggml_add(ctx0, ggml_mul(ctx0, q, model.mm_model_ln_q_w), model.mm_model_ln_q_b);
961
+ }
962
+ struct ggml_tensor * v = ggml_mul_mat(ctx0, model.mm_model_kv_proj, embeddings);
963
+ { // layernorm
964
+ v = ggml_norm(ctx0, v, eps);
965
+ v = ggml_add(ctx0, ggml_mul(ctx0, v, model.mm_model_ln_kv_w), model.mm_model_ln_kv_b);
966
+ }
967
+ struct ggml_tensor * k;
968
+ { // position
969
+ // q = ggml_add(ctx0, q, model.mm_model_pos_embed);
970
+ k = ggml_add(ctx0, v, pos_embed);
971
+ }
972
+
973
+ { // attention
974
+ int hidden_size = 4096;
975
+ const int d_head = 128;
976
+ int n_head = hidden_size/d_head;
977
+ int num_query = 96;
978
+ if (ctx->minicpmv_version == 2) {
979
+ hidden_size = 4096;
980
+ n_head = hidden_size/d_head;
981
+ num_query = 96;
982
+ }
983
+ else if (ctx->minicpmv_version == 3) {
984
+ hidden_size = 3584;
985
+ n_head = hidden_size/d_head;
986
+ num_query = 64;
987
+ }
988
+
989
+ struct ggml_tensor * Q = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_q_w, q), model.mm_model_attn_q_b);
990
+ Q = ggml_scale_inplace(ctx0, Q, 1.0f / sqrt((float)d_head));
991
+ struct ggml_tensor * K = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_k_w, k), model.mm_model_attn_k_b);
992
+ struct ggml_tensor * V = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_v_w, v), model.mm_model_attn_v_b);
993
+ // permute
994
+ Q = ggml_reshape_4d(ctx0, Q, d_head, n_head, num_query, batch_size);
995
+ Q = ggml_cont(ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3));
996
+ Q = ggml_reshape_3d(ctx0, Q, d_head, num_query, n_head * batch_size);
997
+ K = ggml_reshape_4d(ctx0, K, d_head, n_head, num_positions, batch_size);
998
+ K = ggml_cont(ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3));
999
+ K = ggml_reshape_3d(ctx0, K, d_head, num_positions, n_head * batch_size);
1000
+ V = ggml_reshape_4d(ctx0, V, d_head, n_head, num_positions, batch_size);
1001
+ V = ggml_cont(ctx0, ggml_permute(ctx0, V, 1, 2, 0, 3));
1002
+ V = ggml_reshape_3d(ctx0, V, num_positions, d_head, n_head * batch_size);
1003
+ struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
1004
+ KQ = ggml_soft_max_inplace(ctx0, KQ);
1005
+ struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ);
1006
+ KQV = ggml_reshape_4d(ctx0, KQV, d_head, num_query, n_head, batch_size);
1007
+ KQV = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
1008
+ KQV = ggml_cont_3d(ctx0, KQV, hidden_size, num_query, batch_size);
1009
+
1010
+ embeddings = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_o_w, KQV), model.mm_model_attn_o_b);
1011
+ }
1012
+ { // layernorm
1013
+ embeddings = ggml_norm(ctx0, embeddings, eps);
1014
+ embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_model_ln_post_w), model.mm_model_ln_post_b);
1015
+ }
1016
+ embeddings = ggml_mul_mat(ctx0, model.mm_model_proj, embeddings);
1017
+ }
1018
+ else {
1019
+ GGML_ASSERT(false);
1020
+ }
1021
+ }
875
1022
 
876
1023
  // build the graph
877
1024
  ggml_build_forward_expand(gf, embeddings);
@@ -905,21 +1052,21 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
905
1052
  const int idx_name = gguf_find_key(ctx, KEY_NAME);
906
1053
  if (idx_name != -1) { // make name optional temporarily as some of the uploaded models missing it due to a bug
907
1054
  const std::string name = gguf_get_val_str(ctx, idx_name);
908
- LOG_TEE("%s: model name: %s\n", __func__, name.c_str());
1055
+ LOG_INF("%s: model name: %s\n", __func__, name.c_str());
909
1056
  }
910
- LOG_TEE("%s: description: %s\n", __func__, description.c_str());
911
- LOG_TEE("%s: GGUF version: %d\n", __func__, gguf_get_version(ctx));
912
- LOG_TEE("%s: alignment: %zu\n", __func__, gguf_get_alignment(ctx));
913
- LOG_TEE("%s: n_tensors: %d\n", __func__, n_tensors);
914
- LOG_TEE("%s: n_kv: %d\n", __func__, n_kv);
915
- LOG_TEE("%s: ftype: %s\n", __func__, ftype_str.c_str());
916
- LOG_TEE("\n");
1057
+ LOG_INF("%s: description: %s\n", __func__, description.c_str());
1058
+ LOG_INF("%s: GGUF version: %d\n", __func__, gguf_get_version(ctx));
1059
+ LOG_INF("%s: alignment: %zu\n", __func__, gguf_get_alignment(ctx));
1060
+ LOG_INF("%s: n_tensors: %d\n", __func__, n_tensors);
1061
+ LOG_INF("%s: n_kv: %d\n", __func__, n_kv);
1062
+ LOG_INF("%s: ftype: %s\n", __func__, ftype_str.c_str());
1063
+ LOG_INF("\n");
917
1064
  }
918
1065
  const int n_tensors = gguf_get_n_tensors(ctx);
919
1066
 
920
1067
  // kv
921
1068
  const int n_kv = gguf_get_n_kv(ctx);
922
- LOG_TEE("%s: loaded meta data with %d key-value pairs and %d tensors from %s\n",
1069
+ LOG_INF("%s: loaded meta data with %d key-value pairs and %d tensors from %s\n",
923
1070
  __func__, n_kv, n_tensors, fname);
924
1071
  {
925
1072
  std::map<enum ggml_type, uint32_t> n_type;
@@ -930,7 +1077,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
930
1077
  n_type[type]++;
931
1078
  }
932
1079
 
933
- LOG_TEE("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__);
1080
+ LOG_INF("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__);
934
1081
  for (int i = 0; i < n_kv; i++) {
935
1082
  const char * name = gguf_get_key(ctx, i);
936
1083
  const enum gguf_type type = gguf_get_kv_type(ctx, i);
@@ -946,7 +1093,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
946
1093
  }
947
1094
  replace_all(value, "\n", "\\n");
948
1095
 
949
- LOG_TEE("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), value.c_str());
1096
+ LOG_INF("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), value.c_str());
950
1097
  }
951
1098
 
952
1099
  // print type counts
@@ -955,7 +1102,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
955
1102
  continue;
956
1103
  }
957
1104
 
958
- LOG_TEE("%s: - type %4s: %4d tensors\n", __func__, ggml_type_name(kv.first), kv.second);
1105
+ LOG_INF("%s: - type %4s: %4d tensors\n", __func__, ggml_type_name(kv.first), kv.second);
959
1106
  }
960
1107
  }
961
1108
 
@@ -970,13 +1117,13 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
970
1117
  size_t tensor_size = ggml_nbytes(cur);
971
1118
  model_size += tensor_size;
972
1119
  if (verbosity >= 3) {
973
- LOG_TEE("%s: tensor[%d]: n_dims = %d, name = %s, tensor_size=%zu, offset=%zu, shape:[%" PRIu64 ", %" PRIu64 ", %" PRIu64 ", %" PRIu64 "], type = %s\n",
1120
+ LOG_INF("%s: tensor[%d]: n_dims = %d, name = %s, tensor_size=%zu, offset=%zu, shape:[%" PRIu64 ", %" PRIu64 ", %" PRIu64 ", %" PRIu64 "], type = %s\n",
974
1121
  __func__, i, ggml_n_dims(cur), cur->name, tensor_size, offset, cur->ne[0], cur->ne[1], cur->ne[2], cur->ne[3], ggml_type_name(type));
975
1122
  }
976
1123
  }
977
1124
  }
978
1125
 
979
- clip_ctx * new_clip = new clip_ctx;
1126
+ clip_ctx * new_clip = new clip_ctx{};
980
1127
 
981
1128
  // update projector type
982
1129
  {
@@ -997,23 +1144,27 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
997
1144
 
998
1145
  #ifdef GGML_USE_CUDA
999
1146
  new_clip->backend = ggml_backend_cuda_init(0);
1000
- LOG_TEE("%s: CLIP using CUDA backend\n", __func__);
1147
+ LOG_INF("%s: CLIP using CUDA backend\n", __func__);
1001
1148
  #endif
1002
1149
 
1003
1150
  #ifdef GGML_USE_METAL
1004
1151
  new_clip->backend = ggml_backend_metal_init();
1005
- LOG_TEE("%s: CLIP using Metal backend\n", __func__);
1152
+ LOG_INF("%s: CLIP using Metal backend\n", __func__);
1006
1153
  #endif
1007
1154
 
1008
1155
  #ifdef GGML_USE_CANN
1009
1156
  new_clip->backend = ggml_backend_cann_init(0);
1010
- LOG_TEE("%s: CLIP using CANN backend\n", __func__);
1157
+ LOG_INF("%s: CLIP using CANN backend\n", __func__);
1011
1158
  #endif
1012
1159
 
1160
+ #ifdef GGML_USE_VULKAN
1161
+ new_clip->backend = ggml_backend_vk_init(0);
1162
+ LOG_INF("%s: CLIP using Vulkan backend\n", __func__);
1163
+ #endif
1013
1164
 
1014
1165
  if (!new_clip->backend) {
1015
1166
  new_clip->backend = ggml_backend_cpu_init();
1016
- LOG_TEE("%s: CLIP using CPU backend\n", __func__);
1167
+ LOG_INF("%s: CLIP using CPU backend\n", __func__);
1017
1168
  }
1018
1169
 
1019
1170
  // model size and capabilities
@@ -1029,7 +1180,18 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
1029
1180
  new_clip->has_llava_projector = gguf_get_val_bool(ctx, idx);
1030
1181
  }
1031
1182
 
1032
- GGML_ASSERT(new_clip->has_llava_projector); // see monatis/clip.cpp for image and/or text encoding for semantic search
1183
+ idx = gguf_find_key(ctx, KEY_HAS_MINICPMV_PROJ);
1184
+ if (idx != -1) {
1185
+ new_clip->has_minicpmv_projector = gguf_get_val_bool(ctx, idx);
1186
+ }
1187
+
1188
+ idx = gguf_find_key(ctx, KEY_MINICPMV_VERSION);
1189
+ if (idx != -1) {
1190
+ new_clip->minicpmv_version = gguf_get_val_i32(ctx, idx);
1191
+ }
1192
+
1193
+ // GGML_ASSERT(new_clip->has_llava_projector); // see monatis/clip.cpp for image and/or text encoding for semantic search
1194
+
1033
1195
  GGML_ASSERT(new_clip->has_vision_encoder);
1034
1196
  GGML_ASSERT(!new_clip->has_text_encoder);
1035
1197
 
@@ -1037,15 +1199,16 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
1037
1199
  new_clip->use_gelu = gguf_get_val_bool(ctx, idx);
1038
1200
 
1039
1201
  if (verbosity >= 1) {
1040
- LOG_TEE("%s: text_encoder: %d\n", __func__, new_clip->has_text_encoder);
1041
- LOG_TEE("%s: vision_encoder: %d\n", __func__, new_clip->has_vision_encoder);
1042
- LOG_TEE("%s: llava_projector: %d\n", __func__, new_clip->has_llava_projector);
1043
- LOG_TEE("%s: model size: %.2f MB\n", __func__, model_size / 1024.0 / 1024.0);
1044
- LOG_TEE("%s: metadata size: %.2f MB\n", __func__, ggml_get_mem_size(meta) / 1024.0 / 1024.0);
1202
+ LOG_INF("%s: text_encoder: %d\n", __func__, new_clip->has_text_encoder);
1203
+ LOG_INF("%s: vision_encoder: %d\n", __func__, new_clip->has_vision_encoder);
1204
+ LOG_INF("%s: llava_projector: %d\n", __func__, new_clip->has_llava_projector);
1205
+ LOG_INF("%s: minicpmv_projector: %d\n", __func__, new_clip->has_minicpmv_projector);
1206
+ LOG_INF("%s: model size: %.2f MB\n", __func__, model_size / 1024.0 / 1024.0);
1207
+ LOG_INF("%s: metadata size: %.2f MB\n", __func__, ggml_get_mem_size(meta) / 1024.0 / 1024.0);
1045
1208
  }
1046
1209
  }
1047
1210
 
1048
- LOG_TEE("%s: params backend buffer size = % 6.2f MB (%i tensors)\n", __func__, model_size / (1024.0 * 1024.0), n_tensors);
1211
+ LOG_INF("%s: params backend buffer size = % 6.2f MB (%i tensors)\n", __func__, model_size / (1024.0 * 1024.0), n_tensors);
1049
1212
 
1050
1213
  // load tensors
1051
1214
  {
@@ -1058,7 +1221,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
1058
1221
 
1059
1222
  new_clip->ctx_data = ggml_init(params);
1060
1223
  if (!new_clip->ctx_data) {
1061
- LOG_TEE("%s: ggml_init() failed\n", __func__);
1224
+ LOG_ERR("%s: ggml_init() failed\n", __func__);
1062
1225
  clip_free(new_clip);
1063
1226
  gguf_free(ctx);
1064
1227
  return nullptr;
@@ -1066,7 +1229,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
1066
1229
 
1067
1230
  auto fin = std::ifstream(fname, std::ios::binary);
1068
1231
  if (!fin) {
1069
- LOG_TEE("cannot open model file for loading tensors\n");
1232
+ LOG_ERR("cannot open model file for loading tensors\n");
1070
1233
  clip_free(new_clip);
1071
1234
  gguf_free(ctx);
1072
1235
  return nullptr;
@@ -1088,7 +1251,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
1088
1251
  const size_t offset = gguf_get_data_offset(ctx) + gguf_get_tensor_offset(ctx, i);
1089
1252
  fin.seekg(offset, std::ios::beg);
1090
1253
  if (!fin) {
1091
- LOG_TEE("%s: failed to seek for tensor %s\n", __func__, name);
1254
+ LOG_ERR("%s: failed to seek for tensor %s\n", __func__, name);
1092
1255
  clip_free(new_clip);
1093
1256
  gguf_free(ctx);
1094
1257
  return nullptr;
@@ -1159,23 +1322,23 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
1159
1322
  }
1160
1323
 
1161
1324
  if (verbosity >= 2) {
1162
- LOG_TEE("\n%s: vision model hparams\n", __func__);
1163
- LOG_TEE("image_size %d\n", hparams.image_size);
1164
- LOG_TEE("patch_size %d\n", hparams.patch_size);
1165
- LOG_TEE("v_hidden_size %d\n", hparams.hidden_size);
1166
- LOG_TEE("v_n_intermediate %d\n", hparams.n_intermediate);
1167
- LOG_TEE("v_projection_dim %d\n", hparams.projection_dim);
1168
- LOG_TEE("v_n_head %d\n", hparams.n_head);
1169
- LOG_TEE("v_n_layer %d\n", hparams.n_layer);
1170
- LOG_TEE("v_eps %f\n", hparams.eps);
1171
- LOG_TEE("v_image_mean %f %f %f\n", new_clip->image_mean[0], new_clip->image_mean[1], new_clip->image_mean[2]);
1172
- LOG_TEE("v_image_std %f %f %f\n", new_clip->image_std[0], new_clip->image_std[1], new_clip->image_std[2]);
1173
- LOG_TEE("v_image_grid_pinpoints: ");
1325
+ LOG_INF("\n%s: vision model hparams\n", __func__);
1326
+ LOG_INF("image_size %d\n", hparams.image_size);
1327
+ LOG_INF("patch_size %d\n", hparams.patch_size);
1328
+ LOG_INF("v_hidden_size %d\n", hparams.hidden_size);
1329
+ LOG_INF("v_n_intermediate %d\n", hparams.n_intermediate);
1330
+ LOG_INF("v_projection_dim %d\n", hparams.projection_dim);
1331
+ LOG_INF("v_n_head %d\n", hparams.n_head);
1332
+ LOG_INF("v_n_layer %d\n", hparams.n_layer);
1333
+ LOG_INF("v_eps %f\n", hparams.eps);
1334
+ LOG_INF("v_image_mean %f %f %f\n", new_clip->image_mean[0], new_clip->image_mean[1], new_clip->image_mean[2]);
1335
+ LOG_INF("v_image_std %f %f %f\n", new_clip->image_std[0], new_clip->image_std[1], new_clip->image_std[2]);
1336
+ LOG_INF("v_image_grid_pinpoints: ");
1174
1337
  for (int i = 0; i < 32 && (hparams.image_grid_pinpoints[i] != 0); ++i) {
1175
- LOG_TEE("%d ", hparams.image_grid_pinpoints[i]);
1338
+ LOG_INF("%d ", hparams.image_grid_pinpoints[i]);
1176
1339
  }
1177
- LOG_TEE("\n");
1178
- LOG_TEE("v_mm_patch_merge_type: %s\n", hparams.mm_patch_merge_type);
1340
+ LOG_INF("\n");
1341
+ LOG_INF("v_mm_patch_merge_type: %s\n", hparams.mm_patch_merge_type);
1179
1342
 
1180
1343
  }
1181
1344
 
@@ -1213,7 +1376,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
1213
1376
  vision_model.patch_embeddings = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD);
1214
1377
  vision_model.position_embeddings = get_tensor(new_clip->ctx_data, format(TN_POS_EMBD, "v"));
1215
1378
  } catch(const std::exception& /*e*/) {
1216
- LOG_TEE("%s: failed to load vision model tensors\n", __func__);
1379
+ LOG_ERR("%s: failed to load vision model tensors\n", __func__);
1217
1380
  }
1218
1381
 
1219
1382
  // LLaVA projection
@@ -1242,7 +1405,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
1242
1405
  } catch (std::runtime_error & /*e*/) { }
1243
1406
  try {
1244
1407
  vision_model.image_newline = get_tensor(new_clip->ctx_data, TN_IMAGE_NEWLINE);
1245
- // LOG_TEE("%s: image_newline tensor (llava-1.6) found\n", __func__);
1408
+ // LOG_INF("%s: image_newline tensor (llava-1.6) found\n", __func__);
1246
1409
  } catch (std::runtime_error & /*e*/) { }
1247
1410
  } else if (new_clip->proj_type == PROJECTOR_TYPE_LDP) {
1248
1411
  // MobileVLM projection
@@ -1281,6 +1444,27 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
1281
1444
  vision_model.mm_model_peg_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_PEG, 0, "weight"));
1282
1445
  vision_model.mm_model_peg_0_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_PEG, 0, "bias"));
1283
1446
  }
1447
+ else if (new_clip->proj_type == PROJECTOR_TYPE_RESAMPLER) {
1448
+ // vision_model.mm_model_pos_embed = get_tensor(new_clip->ctx_data, TN_MINICPMV_POS_EMBD);
1449
+ vision_model.mm_model_pos_embed_k = get_tensor(new_clip->ctx_data, TN_MINICPMV_POS_EMBD_K);
1450
+ vision_model.mm_model_query = get_tensor(new_clip->ctx_data, TN_MINICPMV_QUERY);
1451
+ vision_model.mm_model_proj = get_tensor(new_clip->ctx_data, TN_MINICPMV_PROJ);
1452
+ vision_model.mm_model_kv_proj = get_tensor(new_clip->ctx_data, TN_MINICPMV_KV_PROJ);
1453
+ vision_model.mm_model_attn_q_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "q", "weight"));
1454
+ vision_model.mm_model_attn_k_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "k", "weight"));
1455
+ vision_model.mm_model_attn_v_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "v", "weight"));
1456
+ vision_model.mm_model_attn_q_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "q", "bias"));
1457
+ vision_model.mm_model_attn_k_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "k", "bias"));
1458
+ vision_model.mm_model_attn_v_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "v", "bias"));
1459
+ vision_model.mm_model_attn_o_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "out", "weight"));
1460
+ vision_model.mm_model_attn_o_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "out", "bias"));
1461
+ vision_model.mm_model_ln_q_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "q", "weight"));
1462
+ vision_model.mm_model_ln_q_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "q", "bias"));
1463
+ vision_model.mm_model_ln_kv_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "kv", "weight"));
1464
+ vision_model.mm_model_ln_kv_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "kv", "bias"));
1465
+ vision_model.mm_model_ln_post_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "post", "weight"));
1466
+ vision_model.mm_model_ln_post_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "post", "bias"));
1467
+ }
1284
1468
  else {
1285
1469
  std::string proj_type = PROJECTOR_TYPE_NAMES[new_clip->proj_type];
1286
1470
  throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str()));
@@ -1319,15 +1503,26 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
1319
1503
  new_clip->compute_alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(new_clip->backend));
1320
1504
  clip_image_f32_batch batch;
1321
1505
  batch.size = 1;
1322
- ggml_cgraph * gf = clip_image_build_graph(new_clip, &batch);
1506
+ ggml_cgraph * gf = clip_image_build_graph(new_clip, &batch, nullptr, false);
1323
1507
  ggml_gallocr_reserve(new_clip->compute_alloc, gf);
1324
1508
  size_t compute_memory_buffer_size = ggml_gallocr_get_buffer_size(new_clip->compute_alloc, 0);
1325
- LOG_TEE("%s: compute allocated memory: %.2f MB\n", __func__, compute_memory_buffer_size /1024.0/1024.0);
1509
+ LOG_INF("%s: compute allocated memory: %.2f MB\n", __func__, compute_memory_buffer_size /1024.0/1024.0);
1326
1510
  }
1327
1511
 
1328
1512
  return new_clip;
1329
1513
  }
1330
1514
 
1515
+ void clip_add_load_image_size(struct clip_ctx * ctx_clip, struct clip_image_size * load_image_size) {
1516
+ ctx_clip->load_image_size = load_image_size;
1517
+ }
1518
+
1519
+ struct clip_image_size * clip_image_size_init() {
1520
+ struct clip_image_size * load_image_size = new struct clip_image_size();
1521
+ load_image_size->width = 448;
1522
+ load_image_size->height = 448;
1523
+ return load_image_size;
1524
+ }
1525
+
1331
1526
  struct clip_image_u8 * clip_image_u8_init() {
1332
1527
  return new clip_image_u8();
1333
1528
  }
@@ -1362,7 +1557,7 @@ bool clip_image_load_from_file(const char * fname, clip_image_u8 * img) {
1362
1557
  int nx, ny, nc;
1363
1558
  auto * data = stbi_load(fname, &nx, &ny, &nc, 3);
1364
1559
  if (!data) {
1365
- LOG_TEE("%s: failed to load image '%s'\n", __func__, fname);
1560
+ LOG_ERR("%s: failed to load image '%s'\n", __func__, fname);
1366
1561
  return false;
1367
1562
  }
1368
1563
  build_clip_img_from_data(data, nx, ny, img);
@@ -1374,7 +1569,7 @@ bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length
1374
1569
  int nx, ny, nc;
1375
1570
  auto * data = stbi_load_from_memory(bytes, bytes_length, &nx, &ny, &nc, 3);
1376
1571
  if (!data) {
1377
- LOG_TEE("%s: failed to decode image bytes\n", __func__);
1572
+ LOG_ERR("%s: failed to decode image bytes\n", __func__);
1378
1573
  return false;
1379
1574
  }
1380
1575
  build_clip_img_from_data(data, nx, ny, img);
@@ -1433,7 +1628,7 @@ static void normalize_image_u8_to_f32(const clip_image_u8* src, clip_image_f32*
1433
1628
  }
1434
1629
  }
1435
1630
 
1436
- inline float clip(float x, float lower, float upper) {
1631
// Clamp `x` into [lower, upper]: values above `upper` become `upper`,
// then anything below `lower` becomes `lower`.
inline int clip(int x, int lower, int upper) {
    const int capped = x < upper ? x : upper;
    return capped < lower ? lower : capped;
}
1439
1634
 
@@ -1564,7 +1759,7 @@ static std::pair<int, int> select_best_resolution(const std::pair<int, int> & or
1564
1759
  int downscaled_height = static_cast<int>(original_height * scale);
1565
1760
  int effective_resolution = std::min(downscaled_width * downscaled_height, original_width * original_height);
1566
1761
  int wasted_resolution = (width * height) - effective_resolution;
1567
- // LOG_TEE("resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution);
1762
+ // LOG_INF("resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution);
1568
1763
  if (effective_resolution > max_effective_resolution || (effective_resolution == max_effective_resolution && wasted_resolution < min_wasted_resolution)) {
1569
1764
  max_effective_resolution = effective_resolution;
1570
1765
  min_wasted_resolution = wasted_resolution;
@@ -1598,12 +1793,185 @@ static std::vector<clip_image_u8*> divide_to_patches_u8(const clip_image_u8 & im
1598
1793
  return patches;
1599
1794
  }
1600
1795
 
1796
// Round `length` to the nearest multiple of `patch_size` (computed in float),
// and return at least `patch_size`.
static int ensure_divide(int length, int patch_size) {
    const float n_units = std::round(static_cast<float>(length) / patch_size);
    const int   snapped = static_cast<int>(n_units * patch_size);
    return std::max(snapped, patch_size);
}
1799
+
1800
+ static std::pair<int, int> uhd_find_best_resize(std::pair<int, int> original_size, int scale_resolution, int patch_size, bool allow_upscale = false) {
1801
+ int width = original_size.first;
1802
+ int height = original_size.second;
1803
+ if ((width * height > scale_resolution * scale_resolution) || allow_upscale) {
1804
+ float r = static_cast<float>(width) / height;
1805
+ height = static_cast<int>(scale_resolution / std::sqrt(r));
1806
+ width = static_cast<int>(height * r);
1807
+ }
1808
+ int best_width = ensure_divide(width, patch_size);
1809
+ int best_height = ensure_divide(height, patch_size);
1810
+ return std::make_pair(best_width, best_height);
1811
+ }
1812
+
1813
+ static std::pair<int, int> uhd_get_refine_size(std::pair<int, int> original_size, std::pair<int, int> grid, int scale_resolution, int patch_size, bool allow_upscale = false) {
1814
+ int width, height;
1815
+ std::tie(width, height) = original_size;
1816
+ int grid_x, grid_y;
1817
+ std::tie(grid_x, grid_y) = grid;
1818
+
1819
+ int refine_width = ensure_divide(width, grid_x);
1820
+ int refine_height = ensure_divide(height, grid_y);
1821
+
1822
+ int grid_width = refine_width / grid_x;
1823
+ int grid_height = refine_height / grid_y;
1824
+
1825
+ // auto best_grid_size = find_best_resize(std::make_tuple(grid_width, grid_height), scale_resolution, patch_size, allow_upscale); (old line)
1826
+ auto best_grid_size = uhd_find_best_resize(std::make_pair(grid_width, grid_height), scale_resolution, patch_size, allow_upscale); // (new line) => fixes conversion for make_tuple to make_pair
1827
+ int best_grid_width, best_grid_height;
1828
+ std::tie(best_grid_width, best_grid_height) = best_grid_size;
1829
+
1830
+ // std::pair<int, int> refine_size = std::make_tuple(best_grid_width * grid_x, best_grid_height * grid_y); (old line)
1831
+ std::pair<int, int> refine_size = std::make_pair(best_grid_width * grid_x, best_grid_height * grid_y); // (new line)
1832
+ return refine_size;
1833
+ }
1834
+
1835
// Choose the slicing grid (columns, rows) whose log aspect ratio is closest to `log_ratio`.
// Candidate slice counts are multiple-1, multiple and multiple+1, skipping a count of 1 and
// anything above `max_slice_nums`; every factorisation cols * rows of a candidate is tried.
// Ties keep the first candidate found; {1, 1} is returned when there is no candidate at all.
static std::pair<int, int> uhd_best_grid(const int max_slice_nums, const int multiple, const float log_ratio) {
    std::pair<int, int> best_grid{1, 1};
    float min_error = std::numeric_limits<float>::infinity();

    for (int n_slices = multiple - 1; n_slices <= multiple + 1; ++n_slices) {
        if (n_slices == 1 || n_slices > max_slice_nums) {
            continue;
        }
        // enumerate every (cols, rows) with cols * rows == n_slices
        for (int cols = 1; cols <= n_slices; ++cols) {
            if (n_slices % cols != 0) {
                continue;
            }
            const int rows = n_slices / cols;
            const float error = std::abs(log_ratio - std::log(1.0 * cols / rows));
            if (error < min_error) {
                best_grid = std::make_pair(cols, rows);
                min_error = error;
            }
        }
    }
    return best_grid;
}
1866
+
1867
+ // inspired from LLaVA-UHD:
1868
+ // -> https://arxiv.org/pdf/2403.11703
1869
+ // -> https://github.com/thunlp/LLaVA-UHD
1870
+ // -> https://github.com/thunlp/LLaVA-UHD/blob/302301bc2175f7e717fb8548516188e89f649753/llava_uhd/train/llava-uhd/slice_logic.py#L118
1871
+ static std::vector<std::vector<clip_image_u8 *>> uhd_slice_image(const clip_image_u8 * img, const int max_slice_nums=9, const int scale_resolution=448, const int patch_size=14) {
1872
+ const std::pair<int, int> original_size={img->nx,img->ny};
1873
+ const int original_width = img->nx;
1874
+ const int original_height = img->ny;
1875
+ const float log_ratio = log(1.0*original_width/original_height);
1876
+ const float ratio = 1.0 * original_width * original_height/ (scale_resolution * scale_resolution);
1877
+ const int multiple = fmin(ceil(ratio), max_slice_nums);
1878
+
1879
+ std::vector<std::vector<clip_image_u8 *>> images;
1880
+ LOG_INF("%s: multiple %d\n", __func__, multiple);
1881
+ images.push_back(std::vector<clip_image_u8 *>());
1882
+
1883
+ if (multiple <= 1) {
1884
+ auto best_size = uhd_find_best_resize(original_size, scale_resolution, patch_size, true);
1885
+ clip_image_u8 * source_image = clip_image_u8_init();
1886
+ bicubic_resize(*img, *source_image, best_size.first, best_size.second);
1887
+ // source_image = image.resize(best_size, Image.Resampling.BICUBIC)
1888
+ images[images.size()-1].push_back(source_image);
1889
+ }
1890
+ else if (multiple > 1) {
1891
+ auto best_size = uhd_find_best_resize(original_size, scale_resolution, patch_size);
1892
+ clip_image_u8 * source_image = clip_image_u8_init();
1893
+ bicubic_resize(*img, *source_image, best_size.first, best_size.second);
1894
+ // source_image = image.copy().resize(best_resize, Image.Resampling.BICUBIC)
1895
+ LOG_INF("%s: image_size: %d %d; source_image size: %d %d\n", __func__, img->nx, img->ny, best_size.first, best_size.second);
1896
+ images[images.size()-1].push_back(source_image);
1897
+
1898
+ std::pair<int, int> best_grid = uhd_best_grid(max_slice_nums, multiple, log_ratio);
1899
+ LOG_INF("%s: image_size: %d %d; best_grid: %d %d\n", __func__, img->nx, img->ny, best_grid.first, best_grid.second);
1900
+
1901
+ auto refine_size = uhd_get_refine_size(original_size, best_grid, scale_resolution, patch_size, true);
1902
+ clip_image_u8 * refine_image = clip_image_u8_init();
1903
+ bicubic_resize(*img, *refine_image, refine_size.first, refine_size.second);
1904
+
1905
+ LOG_INF("%s: refine_image_size: %d %d; refine_size: %d %d\n", __func__, refine_image->nx, refine_image->ny, refine_size.first, refine_size.second);
1906
+
1907
+ // split_to_patches
1908
+ int width = refine_image->nx;
1909
+ int height = refine_image->ny;
1910
+ int grid_x = int(width / best_grid.first);
1911
+ int grid_y = int(height / best_grid.second);
1912
+ for (int patches_i = 0, ic = 0; patches_i < height && ic < best_grid.second; patches_i += grid_y, ic += 1){
1913
+ images.push_back(std::vector<clip_image_u8 *>());
1914
+ for(int patches_j = 0, jc = 0; patches_j < width && jc < best_grid.first; patches_j += grid_x, jc += 1){
1915
+ clip_image_u8 * patch = clip_image_u8_init();
1916
+ patch->nx = grid_x;
1917
+ patch->ny = grid_y;
1918
+ patch->buf.resize(3 * patch->nx * patch->ny);
1919
+ for (int y = patches_i; y < patches_i + grid_y; ++y) {
1920
+ for (int x = patches_j; x < patches_j + grid_x; ++x) {
1921
+ const int i = 3 * (y * refine_image->nx + x);
1922
+ const int j = 3 * ((y-patches_i) * patch->nx + (x-patches_j));
1923
+ patch->buf[j] = refine_image->buf[i];
1924
+ patch->buf[j+1] = refine_image->buf[i+1];
1925
+ patch->buf[j+2] = refine_image->buf[i+2];
1926
+ }
1927
+ }
1928
+ images[images.size()-1].push_back(patch);
1929
+ }
1930
+ }
1931
+ }
1932
+ return images;
1933
+ }
1934
+
1935
+ int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip) {
1936
+ const int max_slice_nums=9;
1937
+ const int scale_resolution=448;
1938
+ const int original_width = ctx_clip->load_image_size->width;
1939
+ const int original_height = ctx_clip->load_image_size->height;
1940
+ const float log_ratio = log(1.0*original_width/original_height);
1941
+ const float ratio = 1.0 * original_width * original_height/ (scale_resolution * scale_resolution);
1942
+ const int multiple = fmin(ceil(ratio), max_slice_nums);
1943
+ std::pair<int, int> best_grid = uhd_best_grid(max_slice_nums, multiple, log_ratio);
1944
+ return best_grid.first;
1945
+ }
1946
+
1601
1947
  // returns the normalized float tensor for llava-1.5, for spatial_unpad with anyres processing for llava-1.6 it returns the normalized image patch tensors as a vector
1602
1948
  // res_imgs memory is being allocated here, previous allocations will be freed if found
1603
1949
  bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32_batch * res_imgs) {
1950
+
1951
+ if(clip_is_minicpmv(ctx)){
1952
+ int max_slice_nums = 9;
1953
+ std::vector<std::vector<clip_image_u8 *>> imgs = uhd_slice_image(img, max_slice_nums);
1954
+ res_imgs->size = 0;
1955
+ for (size_t i = 0; i < imgs.size(); ++i){
1956
+ res_imgs->size += imgs[i].size();
1957
+ }
1958
+ res_imgs->data = new clip_image_f32[res_imgs->size];
1959
+ int idx = 0;
1960
+ for (size_t i = 0; i < imgs.size(); ++i) {
1961
+ for (size_t j = 0; j < imgs[i].size(); ++j) {
1962
+ LOG_DBG("%s: %d %d\n", __func__,imgs[i][j]->nx,imgs[i][j]->ny);
1963
+ clip_image_f32 * res = clip_image_f32_init();
1964
+ normalize_image_u8_to_f32(imgs[i][j], res, ctx->image_mean, ctx->image_std);
1965
+ res_imgs->data[idx++] = *res;
1966
+ clip_image_f32_free(res);
1967
+ }
1968
+ }
1969
+ return true;
1970
+ }
1971
+
1604
1972
  bool pad_to_square = true;
1605
1973
  if (!ctx->has_vision_encoder) {
1606
- LOG_TEE("This gguf file seems to have no vision encoder\n");
1974
+ LOG_ERR("This gguf file seems to have no vision encoder\n");
1607
1975
  return false;
1608
1976
  }
1609
1977
  auto & params = ctx->vision_model.hparams;
@@ -1680,7 +2048,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
1680
2048
  }
1681
2049
 
1682
2050
  for (size_t i = 0; i < patches.size(); i++) {
1683
- // LOG_TEE("patch %d: %d %d\n", i, patches[i]->nx, patches[i]->ny);
2051
+ // LOG_DBG("patch %d: %d %d\n", i, patches[i]->nx, patches[i]->ny);
1684
2052
  clip_image_u8_free(patches[i]);
1685
2053
  }
1686
2054
 
@@ -1816,14 +2184,107 @@ int clip_n_patches(const struct clip_ctx * ctx) {
1816
2184
 
1817
2185
  if (ctx->proj_type == PROJECTOR_TYPE_LDP || ctx->proj_type == PROJECTOR_TYPE_LDPV2) {
1818
2186
  n_patches /= 4;
2187
+ } else if (ctx->proj_type == PROJECTOR_TYPE_RESAMPLER) {
2188
+ if (ctx->minicpmv_version == 2) {
2189
+ n_patches = 96;
2190
+ }
2191
+ else if (ctx->minicpmv_version == 3) {
2192
+ n_patches = 64;
2193
+ }
1819
2194
  }
1820
2195
 
1821
2196
  return n_patches;
1822
2197
  }
1823
2198
 
2199
// Builds 1-D sine/cosine positional embeddings for every entry of a 2-D grid of positions.
//
// embed_dim : width of each output vector; must be even (first half holds sin, second half cos).
// pos       : H x W grid of scalar positions. W is taken from the first row, so every row
//             must have at least pos[0].size() entries.
// returns   : H x W x embed_dim values where, for d in [0, embed_dim / 2):
//               emb[h][w][d]                 = sin(pos[h][w] * omega[d])
//               emb[h][w][d + embed_dim / 2] = cos(pos[h][w] * omega[d])
//             with omega[d] = 1 / 10000^(d / (embed_dim / 2)).
//             An empty grid (no rows) yields an empty result.
static std::vector<std::vector<std::vector<float>>> get_1d_sincos_pos_embed_from_grid_new(int embed_dim, const std::vector<std::vector<float>> & pos) {
    assert(embed_dim % 2 == 0);

    // pos[0] must not be touched when the grid has no rows at all.
    if (pos.empty()) {
        return {};
    }

    const int H    = static_cast<int>(pos.size());
    const int W    = static_cast<int>(pos[0].size());
    const int half = embed_dim / 2;

    // Per-channel frequencies, shared by every grid position.
    std::vector<float> omega(half);
    for (int i = 0; i < half; ++i) {
        omega[i] = 1.0 / pow(10000.0, static_cast<float>(i) / half);
    }

    std::vector<std::vector<std::vector<float>>> emb(H, std::vector<std::vector<float>>(W, std::vector<float>(embed_dim)));
    for (int h = 0; h < H; ++h) {
        for (int w = 0; w < W; ++w) {
            for (int d = 0; d < half; ++d) {
                float out_value = pos[h][w] * omega[d];
                emb[h][w][d]        = sin(out_value);
                emb[h][w][d + half] = cos(out_value);
            }
        }
    }

    return emb;
}
2222
+
2223
+ static std::vector<std::vector<std::vector<float>>> get_2d_sincos_pos_embed_from_grid(int embed_dim, const std::vector<std::vector<std::vector<float>>> & grid) {
2224
+ assert(embed_dim % 2 == 0);
2225
+ std::vector<std::vector<std::vector<float>>> emb_h = get_1d_sincos_pos_embed_from_grid_new(embed_dim / 2, grid[0]); // (H, W, D/2)
2226
+ std::vector<std::vector<std::vector<float>>> emb_w = get_1d_sincos_pos_embed_from_grid_new(embed_dim / 2, grid[1]); // (H, W, D/2)
2227
+
2228
+ int H = emb_h.size();
2229
+ int W = emb_h[0].size();
2230
+ std::vector<std::vector<std::vector<float>>> emb(H, std::vector<std::vector<float>>(W, std::vector<float>(embed_dim)));
2231
+
2232
+ for (int h = 0; h < H; ++h) {
2233
+ for (int w = 0; w < W; ++w) {
2234
+ for (int d = 0; d < embed_dim / 2; ++d) {
2235
+ emb[h][w][d] = emb_h[h][w][d];
2236
+ emb[h][w][d + embed_dim / 2] = emb_w[h][w][d];
2237
+ }
2238
+ }
2239
+ }
2240
+ return emb;
2241
+ }
2242
+
2243
+ static std::vector<std::vector<float>> get_2d_sincos_pos_embed(int embed_dim, const std::pair<int, int> image_size) {
2244
+ int grid_h_size = image_size.first;
2245
+ int grid_w_size = image_size.second;
2246
+
2247
+ std::vector<float> grid_h(grid_h_size);
2248
+ std::vector<float> grid_w(grid_w_size);
2249
+
2250
+ for (int i = 0; i < grid_h_size; ++i) {
2251
+ grid_h[i] = static_cast<float>(i);
2252
+ }
2253
+ for (int i = 0; i < grid_w_size; ++i) {
2254
+ grid_w[i] = static_cast<float>(i);
2255
+ }
2256
+
2257
+ std::vector<std::vector<float>> grid(grid_h_size, std::vector<float>(grid_w_size));
2258
+ for (int h = 0; h < grid_h_size; ++h) {
2259
+ for (int w = 0; w < grid_w_size; ++w) {
2260
+ grid[h][w] = grid_w[w];
2261
+ }
2262
+ }
2263
+ std::vector<std::vector<std::vector<float>>> grid_2d = {grid, grid};
2264
+ for (int h = 0; h < grid_h_size; ++h) {
2265
+ for (int w = 0; w < grid_w_size; ++w) {
2266
+ grid_2d[0][h][w] = grid_h[h];
2267
+ grid_2d[1][h][w] = grid_w[w];
2268
+ }
2269
+ }
2270
+
2271
+ std::vector<std::vector<std::vector<float>>> pos_embed_3d = get_2d_sincos_pos_embed_from_grid(embed_dim, grid_2d);
2272
+
2273
+ int H = image_size.first;
2274
+ int W = image_size.second;
2275
+ std::vector<std::vector<float>> pos_embed_2d(H * W, std::vector<float>(embed_dim));
2276
+ for (int h = 0; h < H; ++h) {
2277
+ for (int w = 0; w < W; ++w) {
2278
+ pos_embed_2d[w * H + h] = pos_embed_3d[h][w];
2279
+ }
2280
+ }
2281
+
2282
+ return pos_embed_2d;
2283
+ }
2284
+
1824
2285
  bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f32 * img, float * vec) {
1825
2286
  if (!ctx->has_vision_encoder) {
1826
- LOG_TEE("This gguf file seems to have no vision encoder\n");
2287
+ LOG_ERR("This gguf file seems to have no vision encoder\n");
1827
2288
  return false;
1828
2289
  }
1829
2290
 
@@ -1835,7 +2296,7 @@ bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f3
1835
2296
 
1836
2297
  bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs, float * vec) {
1837
2298
  if (!ctx->has_vision_encoder) {
1838
- LOG_TEE("This gguf file seems to have no vision encoder\n");
2299
+ LOG_ERR("This gguf file seems to have no vision encoder\n");
1839
2300
  return false;
1840
2301
  }
1841
2302
 
@@ -1843,19 +2304,33 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
1843
2304
  if (ctx->has_llava_projector) {
1844
2305
  GGML_ASSERT(batch_size == 1); // TODO: support multiple images
1845
2306
  }
2307
+ if (ctx->has_minicpmv_projector) {
2308
+ GGML_ASSERT(batch_size == 1);
2309
+ }
1846
2310
 
1847
2311
  // build the inference graph
1848
- ggml_cgraph * gf = clip_image_build_graph(ctx, imgs);
2312
+ ggml_cgraph * gf = clip_image_build_graph(ctx, imgs, ctx->load_image_size, true);
1849
2313
  ggml_gallocr_alloc_graph(ctx->compute_alloc, gf);
1850
2314
 
1851
2315
  // set inputs
1852
2316
  const auto & model = ctx->vision_model;
1853
2317
  const auto & hparams = model.hparams;
1854
2318
 
1855
- const int image_size = hparams.image_size;
2319
+ const int image_size = hparams.image_size;
2320
+ int image_size_width = image_size;
2321
+ int image_size_height = image_size;
2322
+ if (ctx->has_minicpmv_projector) {
2323
+ image_size_width = imgs->data[0].nx;
2324
+ image_size_height = imgs->data[0].ny;
2325
+ }
1856
2326
  const int patch_size = hparams.patch_size;
1857
- const int num_patches = ((image_size / patch_size) * (image_size / patch_size));
2327
+ const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size));
1858
2328
  const int num_positions = num_patches + (ctx->has_class_embedding ? 1 : 0);
2329
+ if(ctx->load_image_size==nullptr){
2330
+ ctx->load_image_size= clip_image_size_init();
2331
+ }
2332
+ const int pos_w = ctx->load_image_size->width/patch_size;
2333
+ const int pos_h = ctx->load_image_size->height/patch_size;
1859
2334
 
1860
2335
  {
1861
2336
  struct ggml_tensor * inp_raw = ggml_graph_get_tensor(gf, "inp_raw");
@@ -1864,7 +2339,9 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
1864
2339
  for (size_t i = 0; i < imgs->size; i++) {
1865
2340
  const int nx = imgs->data[i].nx;
1866
2341
  const int ny = imgs->data[i].ny;
1867
- GGML_ASSERT(nx == image_size && ny == image_size);
2342
+ if (!ctx->has_minicpmv_projector) {
2343
+ GGML_ASSERT(nx == image_size && ny == image_size);
2344
+ }
1868
2345
 
1869
2346
  const int n = nx * ny;
1870
2347
 
@@ -1881,53 +2358,97 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
1881
2358
  ggml_backend_tensor_set(inp_raw, data, 0, ggml_nbytes(inp_raw));
1882
2359
  free(data);
1883
2360
  }
2361
+ if (ctx->has_minicpmv_projector) {
2362
+ {
2363
+ // inspired from siglip:
2364
+ // -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit
2365
+ // -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit/blob/d66538faeba44480d0bfaa42145eef26f9423199/modeling_siglip.py#L316
2366
+ struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions");
2367
+ int* positions_data = (int*)malloc(ggml_nbytes(positions));
2368
+ int bucket_coords_h[70];
2369
+ int bucket_coords_w[70];
2370
+ for (int i = 0; i < pos_h; i++){
2371
+ bucket_coords_h[i] = std::floor(70.0*i/pos_h);
2372
+ }
2373
+ for (int i = 0; i < pos_w; i++){
2374
+ bucket_coords_w[i] = std::floor(70.0*i/pos_w);
2375
+ }
2376
+ for (int i = 0, id = 0; i < pos_h; i++){
2377
+ for (int j = 0; j < pos_w; j++){
2378
+ positions_data[id++] = bucket_coords_h[i]*70 + bucket_coords_w[j];
2379
+ }
2380
+ }
2381
+ ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions));
2382
+ free(positions_data);
2383
+ }
1884
2384
 
1885
- {
1886
- if (ctx->has_class_embedding) {
1887
- struct ggml_tensor * embeddings = ggml_graph_get_tensor(gf, "embeddings");
2385
+ {
2386
+ // inspired from resampler of Qwen-VL:
2387
+ // -> https://huggingface.co/Qwen/Qwen-VL/tree/main
2388
+ // -> https://huggingface.co/Qwen/Qwen-VL/blob/0547ed36a86561e2e42fecec8fd0c4f6953e33c4/visual.py#L23
2389
+ struct ggml_tensor * pos_embed = ggml_graph_get_tensor(gf, "pos_embed");
2390
+ int embed_dim = 4096;
2391
+ if (ctx->minicpmv_version == 2) {
2392
+ embed_dim = 4096;
2393
+ }
2394
+ else if (ctx->minicpmv_version == 3) {
2395
+ embed_dim = 3584;
2396
+ }
2397
+ auto pos_embed_t = get_2d_sincos_pos_embed(embed_dim, std::make_pair(pos_w, pos_h));
1888
2398
 
1889
- void* zero_mem = malloc(ggml_nbytes(embeddings));
1890
- memset(zero_mem, 0, ggml_nbytes(embeddings));
1891
- ggml_backend_tensor_set(embeddings, zero_mem, 0, ggml_nbytes(embeddings));
1892
- free(zero_mem);
2399
+ float * pos_embed_data = (float *)malloc(ggml_nbytes(pos_embed));
2400
+ for(int i=0;i<pos_w * pos_h;++i){
2401
+ for(int j=0;j<embed_dim;++j){
2402
+ pos_embed_data[i*embed_dim+j]=pos_embed_t[i][j];
2403
+ }
2404
+ }
2405
+
2406
+ ggml_backend_tensor_set(pos_embed, pos_embed_data, 0, ggml_nbytes(pos_embed));
2407
+ free(pos_embed_data);
1893
2408
  }
1894
2409
  }
2410
+ else{
2411
+ {
2412
+ if (ctx->has_class_embedding) {
2413
+ struct ggml_tensor * embeddings = ggml_graph_get_tensor(gf, "embeddings");
1895
2414
 
1896
- {
1897
- struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions");
2415
+ void* zero_mem = malloc(ggml_nbytes(embeddings));
2416
+ memset(zero_mem, 0, ggml_nbytes(embeddings));
2417
+ ggml_backend_tensor_set(embeddings, zero_mem, 0, ggml_nbytes(embeddings));
2418
+ free(zero_mem);
2419
+ }
2420
+ }
2421
+
2422
+ {
2423
+ struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions");
1898
2424
 
1899
- int* positions_data = (int*)malloc(ggml_nbytes(positions));
1900
- for (int i = 0; i < num_positions; i++) {
1901
- positions_data[i] = i;
2425
+ int* positions_data = (int*)malloc(ggml_nbytes(positions));
2426
+ for (int i = 0; i < num_positions; i++) {
2427
+ positions_data[i] = i;
2428
+ }
2429
+ ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions));
2430
+ free(positions_data);
1902
2431
  }
1903
- ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions));
1904
- free(positions_data);
1905
- }
1906
2432
 
1907
- {
1908
- struct ggml_tensor * patches = ggml_graph_get_tensor(gf, "patches");
1909
- int* patches_data = (int*)malloc(ggml_nbytes(patches));
1910
- for (int i = 0; i < num_patches; i++) {
1911
- patches_data[i] = i + 1;
2433
+ {
2434
+ struct ggml_tensor * patches = ggml_graph_get_tensor(gf, "patches");
2435
+ int* patches_data = (int*)malloc(ggml_nbytes(patches));
2436
+ for (int i = 0; i < num_patches; i++) {
2437
+ patches_data[i] = i + 1;
2438
+ }
2439
+ ggml_backend_tensor_set(patches, patches_data, 0, ggml_nbytes(patches));
2440
+ free(patches_data);
1912
2441
  }
1913
- ggml_backend_tensor_set(patches, patches_data, 0, ggml_nbytes(patches));
1914
- free(patches_data);
1915
2442
  }
1916
2443
 
1917
2444
  if (ggml_backend_is_cpu(ctx->backend)) {
1918
2445
  ggml_backend_cpu_set_n_threads(ctx->backend, n_threads);
1919
2446
  }
1920
2447
 
1921
- #ifdef GGML_USE_METAL
1922
- if (ggml_backend_is_metal(ctx->backend)) {
1923
- ggml_backend_metal_set_n_cb(ctx->backend, n_threads);
1924
- }
1925
- #endif
1926
-
1927
2448
  ggml_backend_graph_compute(ctx->backend, gf);
1928
2449
 
1929
2450
  // the last node is the embedding tensor
1930
- struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 1];
2451
+ struct ggml_tensor * embeddings = ggml_graph_node(gf, -1);
1931
2452
 
1932
2453
  // copy the embeddings to the location passed by the user
1933
2454
  ggml_backend_tensor_get(embeddings, vec, 0, ggml_nbytes(embeddings));
@@ -1999,7 +2520,7 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
1999
2520
  new_type = type;
2000
2521
  if (new_type >= GGML_TYPE_Q2_K && name.find("embd") != std::string::npos) {
2001
2522
  new_type = GGML_TYPE_Q8_0; // ggml_get_rows needs non K type
2002
- // LOG_TEE("%s: quantizing %s to %s\n", __func__, name.c_str(), ggml_type_name(new_type));
2523
+ // LOG_ERR("%s: quantizing %s to %s\n", __func__, name.c_str(), ggml_type_name(new_type));
2003
2524
  }
2004
2525
  const size_t n_elms = ggml_nelements(cur);
2005
2526
  float * f32_data;
@@ -2018,7 +2539,7 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
2018
2539
  f32_data = (float *)conv_buf.data();
2019
2540
  break;
2020
2541
  default:
2021
- LOG_TEE("Please use an input file in f32 or f16\n");
2542
+ LOG_ERR("Please use an input file in f32 or f16\n");
2022
2543
  gguf_free(ctx_out);
2023
2544
  return false;
2024
2545
  }
@@ -2045,7 +2566,7 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
2045
2566
  fout.put(0);
2046
2567
  }
2047
2568
 
2048
- LOG_TEE("%s: n_dims = %d | quantize=%d | size = %f MB -> %f MB\n", name.c_str(), ggml_n_dims(cur), quantize,
2569
+ LOG_INF("%s: n_dims = %d | quantize=%d | size = %f MB -> %f MB\n", name.c_str(), ggml_n_dims(cur), quantize,
2049
2570
  orig_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
2050
2571
  }
2051
2572
 
@@ -2061,8 +2582,8 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
2061
2582
  gguf_free(ctx_out);
2062
2583
 
2063
2584
  {
2064
- LOG_TEE("%s: original size = %8.2f MB\n", __func__, total_size_org / 1024.0 / 1024.0);
2065
- LOG_TEE("%s: quantized size = %8.2f MB\n", __func__, total_size_new / 1024.0 / 1024.0);
2585
+ LOG_INF("%s: original size = %8.2f MB\n", __func__, total_size_org / 1024.0 / 1024.0);
2586
+ LOG_INF("%s: quantized size = %8.2f MB\n", __func__, total_size_new / 1024.0 / 1024.0);
2066
2587
  }
2067
2588
 
2068
2589
  return true;
@@ -2081,7 +2602,22 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
2081
2602
  if (ctx->proj_type == PROJECTOR_TYPE_MLP_NORM) {
2082
2603
  return ctx->vision_model.mm_3_b->ne[0];
2083
2604
  }
2605
+ if (ctx->proj_type == PROJECTOR_TYPE_RESAMPLER) {
2606
+ if (ctx->minicpmv_version == 2) {
2607
+ return 4096;
2608
+ }
2609
+ else if (ctx->minicpmv_version == 3) {
2610
+ return 3584;
2611
+ }
2612
+ }
2084
2613
 
2085
2614
  std::string proj_type = PROJECTOR_TYPE_NAMES[ctx->proj_type];
2086
2615
  throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str()));
2087
2616
  }
2617
+
2618
+ int clip_is_minicpmv(const struct clip_ctx * ctx) {
2619
+ if (ctx->has_minicpmv_projector) {
2620
+ return ctx->minicpmv_version;
2621
+ }
2622
+ return 0;
2623
+ }