@fugood/llama.node 0.3.16 → 0.3.17

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (202)
  1. package/CMakeLists.txt +3 -0
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  7. package/bin/linux-cuda/x64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  9. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  10. package/bin/win32/arm64/llama-node.node +0 -0
  11. package/bin/win32/arm64/node.lib +0 -0
  12. package/bin/win32/x64/llama-node.node +0 -0
  13. package/bin/win32/x64/node.lib +0 -0
  14. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/arm64/node.lib +0 -0
  16. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  17. package/bin/win32-vulkan/x64/node.lib +0 -0
  18. package/lib/binding.ts +5 -0
  19. package/package.json +1 -1
  20. package/src/LlamaCompletionWorker.cpp +8 -0
  21. package/src/LlamaCompletionWorker.h +1 -0
  22. package/src/LlamaContext.cpp +3 -2
  23. package/src/llama.cpp/.github/workflows/build-linux-cross.yml +124 -0
  24. package/src/llama.cpp/.github/workflows/build.yml +70 -27
  25. package/src/llama.cpp/.github/workflows/docker.yml +6 -6
  26. package/src/llama.cpp/.github/workflows/server.yml +7 -11
  27. package/src/llama.cpp/CMakeLists.txt +23 -1
  28. package/src/llama.cpp/common/CMakeLists.txt +6 -3
  29. package/src/llama.cpp/common/arg.cpp +809 -105
  30. package/src/llama.cpp/common/arg.h +9 -0
  31. package/src/llama.cpp/common/chat.cpp +1 -1
  32. package/src/llama.cpp/common/common.cpp +31 -521
  33. package/src/llama.cpp/common/common.h +17 -36
  34. package/src/llama.cpp/common/json-schema-to-grammar.cpp +3 -0
  35. package/src/llama.cpp/common/llguidance.cpp +30 -47
  36. package/src/llama.cpp/common/minja/chat-template.hpp +15 -7
  37. package/src/llama.cpp/common/minja/minja.hpp +119 -93
  38. package/src/llama.cpp/common/sampling.cpp +3 -0
  39. package/src/llama.cpp/docs/build.md +122 -7
  40. package/src/llama.cpp/examples/CMakeLists.txt +0 -9
  41. package/src/llama.cpp/examples/batched/batched.cpp +1 -1
  42. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +1 -1
  43. package/src/llama.cpp/examples/embedding/embedding.cpp +7 -1
  44. package/src/llama.cpp/examples/export-lora/export-lora.cpp +1 -1
  45. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +15 -16
  46. package/src/llama.cpp/examples/gritlm/gritlm.cpp +1 -1
  47. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +210 -8
  48. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
  49. package/src/llama.cpp/examples/llava/CMakeLists.txt +39 -24
  50. package/src/llama.cpp/examples/llava/clip-impl.h +345 -0
  51. package/src/llama.cpp/examples/llava/clip.cpp +2152 -1803
  52. package/src/llama.cpp/examples/llava/clip.h +39 -22
  53. package/src/llama.cpp/examples/llava/deprecation-warning.cpp +22 -0
  54. package/src/llama.cpp/examples/llava/llava.cpp +64 -52
  55. package/src/llama.cpp/examples/llava/mtmd-cli.cpp +344 -0
  56. package/src/llama.cpp/examples/llava/mtmd.cpp +708 -0
  57. package/src/llama.cpp/examples/llava/mtmd.h +168 -0
  58. package/src/llama.cpp/examples/llava/{qwen2vl-cli.cpp → qwen2vl-test.cpp} +83 -31
  59. package/src/llama.cpp/examples/main/main.cpp +16 -5
  60. package/src/llama.cpp/examples/parallel/parallel.cpp +3 -1
  61. package/src/llama.cpp/examples/passkey/passkey.cpp +1 -1
  62. package/src/llama.cpp/examples/perplexity/perplexity.cpp +17 -3
  63. package/src/llama.cpp/examples/quantize/quantize.cpp +115 -2
  64. package/src/llama.cpp/examples/rpc/CMakeLists.txt +4 -2
  65. package/src/llama.cpp/examples/rpc/rpc-server.cpp +163 -8
  66. package/src/llama.cpp/examples/run/CMakeLists.txt +12 -1
  67. package/src/llama.cpp/examples/run/run.cpp +14 -28
  68. package/src/llama.cpp/examples/server/httplib.h +313 -247
  69. package/src/llama.cpp/examples/server/server.cpp +238 -139
  70. package/src/llama.cpp/examples/server/utils.hpp +51 -2
  71. package/src/llama.cpp/examples/speculative/speculative.cpp +1 -1
  72. package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +1 -1
  73. package/src/llama.cpp/examples/sycl/build.sh +2 -2
  74. package/src/llama.cpp/examples/sycl/win-build-sycl.bat +2 -2
  75. package/src/llama.cpp/examples/tts/tts.cpp +6 -9
  76. package/src/llama.cpp/ggml/CMakeLists.txt +8 -2
  77. package/src/llama.cpp/ggml/cmake/GitVars.cmake +22 -0
  78. package/src/llama.cpp/ggml/include/ggml-cpu.h +5 -0
  79. package/src/llama.cpp/ggml/include/ggml-rpc.h +6 -1
  80. package/src/llama.cpp/ggml/include/ggml.h +66 -99
  81. package/src/llama.cpp/ggml/src/CMakeLists.txt +10 -7
  82. package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +0 -2
  83. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +8 -4
  84. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +5 -5
  85. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +692 -1534
  86. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +613 -122
  87. package/src/llama.cpp/ggml/src/ggml-cann/common.h +135 -1
  88. package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +507 -137
  89. package/src/llama.cpp/ggml/src/ggml-common.h +12 -6
  90. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +48 -22
  91. package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.cpp +158 -0
  92. package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.h +16 -0
  93. package/src/llama.cpp/ggml/src/ggml-cpu/common.h +72 -0
  94. package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +1 -1
  95. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +896 -192
  96. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +2 -21
  97. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +754 -404
  98. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1003 -13519
  99. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +2 -0
  100. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +2 -7
  101. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +0 -1
  102. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +3 -4
  103. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +533 -88
  104. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +8809 -0
  105. package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +110 -0
  106. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +892 -0
  107. package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.cpp +186 -0
  108. package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.h +28 -0
  109. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +258 -0
  110. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +802 -0
  111. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +7 -0
  112. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +1 -0
  113. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +0 -4
  114. package/src/llama.cpp/ggml/src/ggml-impl.h +52 -18
  115. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +70 -3
  116. package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +67 -119
  117. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +1023 -260
  118. package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +293 -40
  119. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +96 -22
  120. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
  121. package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +350 -0
  122. package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.hpp +39 -0
  123. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +0 -35
  124. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +2 -292
  125. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +79 -90
  126. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +967 -438
  127. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +22 -23
  128. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +24 -20
  129. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +1 -4
  130. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +204 -280
  131. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +84 -74
  132. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +1 -3
  133. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +37 -49
  134. package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +7 -22
  135. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +4 -14
  136. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +204 -118
  137. package/src/llama.cpp/ggml/src/ggml-sycl/rope.hpp +1 -3
  138. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +23 -0
  139. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +646 -114
  140. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +12 -0
  141. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +17 -8
  142. package/src/llama.cpp/ggml/src/ggml.c +141 -245
  143. package/src/llama.cpp/ggml/src/gguf.cpp +1 -0
  144. package/src/llama.cpp/include/llama.h +30 -11
  145. package/src/llama.cpp/models/ggml-vocab-llama4.gguf.inp +112 -0
  146. package/src/llama.cpp/models/ggml-vocab-llama4.gguf.out +46 -0
  147. package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.inp +112 -0
  148. package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.out +46 -0
  149. package/src/llama.cpp/requirements/requirements-all.txt +2 -0
  150. package/src/llama.cpp/requirements/requirements-gguf_editor_gui.txt +3 -0
  151. package/src/llama.cpp/src/CMakeLists.txt +3 -2
  152. package/src/llama.cpp/src/llama-adapter.cpp +37 -1
  153. package/src/llama.cpp/src/llama-arch.cpp +160 -17
  154. package/src/llama.cpp/src/llama-arch.h +16 -0
  155. package/src/llama.cpp/src/llama-chat.cpp +82 -17
  156. package/src/llama.cpp/src/llama-chat.h +6 -2
  157. package/src/llama.cpp/src/llama-context.cpp +108 -92
  158. package/src/llama.cpp/src/llama-context.h +1 -2
  159. package/src/llama.cpp/src/llama-graph.cpp +189 -119
  160. package/src/llama.cpp/src/llama-graph.h +26 -6
  161. package/src/llama.cpp/src/llama-hparams.h +13 -0
  162. package/src/llama.cpp/src/llama-kv-cache.cpp +70 -123
  163. package/src/llama.cpp/src/llama-kv-cache.h +41 -115
  164. package/src/llama.cpp/src/llama-memory.h +1 -1
  165. package/src/llama.cpp/src/llama-mmap.cpp +1 -1
  166. package/src/llama.cpp/src/llama-model-loader.cpp +10 -5
  167. package/src/llama.cpp/src/llama-model-loader.h +5 -3
  168. package/src/llama.cpp/src/llama-model.cpp +1760 -534
  169. package/src/llama.cpp/src/llama-model.h +13 -1
  170. package/src/llama.cpp/src/llama-quant.cpp +29 -8
  171. package/src/llama.cpp/src/llama-sampling.cpp +7 -1
  172. package/src/llama.cpp/src/llama-vocab.cpp +44 -6
  173. package/src/llama.cpp/src/llama.cpp +1 -1
  174. package/src/llama.cpp/tests/CMakeLists.txt +43 -30
  175. package/src/llama.cpp/tests/test-arg-parser.cpp +51 -4
  176. package/src/llama.cpp/tests/test-backend-ops.cpp +82 -43
  177. package/src/llama.cpp/tests/test-chat-template.cpp +34 -13
  178. package/src/llama.cpp/tests/test-chat.cpp +12 -2
  179. package/src/llama.cpp/{examples/gbnf-validator/gbnf-validator.cpp → tests/test-gbnf-validator.cpp} +2 -2
  180. package/src/llama.cpp/tests/test-grammar-integration.cpp +3 -2
  181. package/src/llama.cpp/tests/test-grammar-llguidance.cpp +63 -2
  182. package/src/llama.cpp/tests/test-grammar-parser.cpp +3 -1
  183. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +17 -1
  184. package/src/llama.cpp/tests/test-llama-grammar.cpp +2 -1
  185. package/src/llama.cpp/{examples/quantize-stats/quantize-stats.cpp → tests/test-quantize-stats.cpp} +3 -1
  186. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +2 -1
  187. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +2 -1
  188. package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +0 -5
  189. package/src/llama.cpp/examples/llava/gemma3-cli.cpp +0 -341
  190. package/src/llama.cpp/examples/llava/llava-cli.cpp +0 -332
  191. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +0 -354
  192. package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +0 -6
  193. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +0 -30
  194. package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +0 -19
  195. package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +0 -234
  196. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +0 -197
  197. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +0 -190
  198. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +0 -204
  199. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +0 -191
  200. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +0 -218
  201. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +0 -216
  202. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +0 -295
package/src/llama.cpp/examples/llava/clip.cpp
@@ -3,6 +3,7 @@
  // I'll gradually clean and extend it
  // Note: Even when using identical normalized image inputs (see normalize_image_u8_to_f32()) we have a significant difference in resulting embeddings compared to pytorch
  #include "clip.h"
+ #include "clip-impl.h"
  #include "ggml.h"
  #include "ggml-cpp.h"
  #include "ggml-cpu.h"
@@ -26,285 +27,13 @@
  #include <sstream>
  #include <cinttypes>
  #include <limits>
+ #include <array>
+ #include <numeric>

- #if defined(LLAVA_LOG_OFF)
- # define LOG_INF(...)
- # define LOG_WRN(...)
- # define LOG_ERR(...)
- # define LOG_DBG(...)
- #else // defined(LLAVA_LOG_OFF)
- # define LOG_INF(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
- # define LOG_WRN(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
- # define LOG_ERR(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
- # define LOG_DBG(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
- #endif // defined(LLAVA_LOG_OFF)
+ struct clip_logger_state g_logger_state = {GGML_LOG_LEVEL_CONT, clip_log_callback_default, NULL};

  //#define CLIP_DEBUG_FUNCTIONS

- // RGB uint8 image
- struct clip_image_u8 {
- int nx;
- int ny;
-
- std::vector<uint8_t> buf;
- };
-
- // RGB float32 image (NHWC)
- // Memory layout: RGBRGBRGB...
- struct clip_image_f32 {
- int nx;
- int ny;
-
- std::vector<float> buf;
- };
-
- static std::string format(const char * fmt, ...) {
- va_list ap;
- va_list ap2;
- va_start(ap, fmt);
- va_copy(ap2, ap);
- int size = vsnprintf(NULL, 0, fmt, ap);
- GGML_ASSERT(size >= 0 && size < INT_MAX); // NOLINT
- std::vector<char> buf(size + 1);
- int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
- GGML_ASSERT(size2 == size);
- va_end(ap2);
- va_end(ap);
- return std::string(buf.data(), buf.size());
- }
-
- //
- // key constants
- //
-
- #define KEY_FTYPE "general.file_type"
- #define KEY_NAME "general.name"
- #define KEY_DESCRIPTION "general.description"
- #define KEY_HAS_TEXT_ENC "clip.has_text_encoder"
- #define KEY_HAS_VIS_ENC "clip.has_vision_encoder"
- #define KEY_HAS_LLAVA_PROJ "clip.has_llava_projector"
- #define KEY_HAS_MINICPMV_PROJ "clip.has_minicpmv_projector"
- #define KEY_HAS_GLM_PROJ "clip.has_glm_projector"
- #define KEY_MINICPMV_VERSION "clip.minicpmv_version"
- #define KEY_HAS_QWEN2VL_MERGER "clip.has_qwen2vl_merger"
- #define KEY_USE_GELU "clip.use_gelu"
- #define KEY_USE_SILU "clip.use_silu"
- #define KEY_N_EMBD "clip.%s.embedding_length"
- #define KEY_N_FF "clip.%s.feed_forward_length"
- #define KEY_N_BLOCK "clip.%s.block_count"
- #define KEY_N_HEAD "clip.%s.attention.head_count"
- #define KEY_LAYER_NORM_EPS "clip.%s.attention.layer_norm_epsilon"
- #define KEY_PROJ_DIM "clip.%s.projection_dim"
- #define KEY_TOKENS "tokenizer.ggml.tokens"
- #define KEY_N_POSITIONS "clip.text.context_length"
- #define KEY_IMAGE_SIZE "clip.vision.image_size"
- #define KEY_PATCH_SIZE "clip.vision.patch_size"
- #define KEY_IMAGE_MEAN "clip.vision.image_mean"
- #define KEY_IMAGE_STD "clip.vision.image_std"
- #define KEY_PROJ_TYPE "clip.projector_type"
- #define KEY_FEATURE_LAYER "clip.vision.feature_layer"
-
- #define KEY_MM_PATCH_MERGE_TYPE "clip.vision.mm_patch_merge_type"
- #define KEY_IMAGE_GRID_PINPOINTS "clip.vision.image_grid_pinpoints"
- #define KEY_IMAGE_CROP_RESOLUTION "clip.vision.image_crop_resolution"
-
-
- //
- // tensor name constants
- //
-
- #define TN_TOKEN_EMBD "%s.token_embd.weight"
- #define TN_POS_EMBD "%s.position_embd.weight"
- #define TN_CLASS_EMBD "v.class_embd"
- #define TN_PATCH_EMBD "v.patch_embd.weight" // not rename tensor with ".0" postfix for backwrad compat
- #define TN_PATCH_EMBD_1 "v.patch_embd.weight.1"
- #define TN_PATCH_BIAS "v.patch_embd.bias"
- #define TN_ATTN_K "%s.blk.%d.attn_k.%s"
- #define TN_ATTN_Q "%s.blk.%d.attn_q.%s"
- #define TN_ATTN_V "%s.blk.%d.attn_v.%s"
- #define TN_ATTN_OUTPUT "%s.blk.%d.attn_out.%s"
- #define TN_FFN_DOWN "%s.blk.%d.ffn_down.%s"
- #define TN_FFN_UP "%s.blk.%d.ffn_up.%s"
- #define TN_LN_1 "%s.blk.%d.ln1.%s"
- #define TN_LN_2 "%s.blk.%d.ln2.%s"
- #define TN_LN_PRE "%s.pre_ln.%s"
- #define TN_LN_POST "%s.post_ln.%s"
- #define TN_TEXT_PROJ "text_projection.weight"
- #define TN_VIS_PROJ "visual_projection.weight"
- #define TN_LLAVA_PROJ "mm.%d.%s"
- #define TN_MVLM_PROJ_MLP "mm.model.mlp.%d.%s"
- #define TN_MVLM_PROJ_BLOCK "mm.model.mb_block.%d.block.%d.%s"
- #define TN_MVLM_PROJ_PEG "mm.model.peg.%d.%s"
- #define TN_IMAGE_NEWLINE "model.image_newline"
- #define TN_MM_INP_PROJ "mm.input_projection.weight" // gemma3
- #define TN_MM_SOFT_EMB_N "mm.soft_emb_norm.weight" // gemma3
-
- #define TN_MINICPMV_POS_EMBD_K "resampler.pos_embed_k"
- #define TN_MINICPMV_QUERY "resampler.query"
- #define TN_MINICPMV_PROJ "resampler.proj.weight"
- #define TN_MINICPMV_KV_PROJ "resampler.kv.weight"
- #define TN_MINICPMV_ATTN "resampler.attn.%s.%s"
- #define TN_MINICPMV_LN "resampler.ln_%s.%s"
-
- #define TN_GLM_ADAPER_CONV "adapter.conv.%s"
- #define TN_GLM_ADAPTER_LINEAR "adapter.linear.linear.%s"
- #define TN_GLM_ADAPTER_NORM_1 "adapter.linear.norm1.%s"
- #define TN_GLM_ADAPTER_D_H_2_4H "adapter.linear.dense_h_to_4h.%s"
- #define TN_GLM_ADAPTER_GATE "adapter.linear.gate.%s"
- #define TN_GLM_ADAPTER_D_4H_2_H "adapter.linear.dense_4h_to_h.%s"
- #define TN_GLM_BOI_W "adapter.boi"
- #define TN_GLM_EOI_W "adapter.eoi"
-
-
- enum projector_type {
- PROJECTOR_TYPE_MLP,
- PROJECTOR_TYPE_MLP_NORM,
- PROJECTOR_TYPE_LDP,
- PROJECTOR_TYPE_LDPV2,
- PROJECTOR_TYPE_RESAMPLER,
- PROJECTOR_TYPE_GLM_EDGE,
- PROJECTOR_TYPE_MERGER,
- PROJECTOR_TYPE_GEMMA3,
- PROJECTOR_TYPE_UNKNOWN,
- };
-
- static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
- { PROJECTOR_TYPE_MLP, "mlp" },
- { PROJECTOR_TYPE_LDP, "ldp" },
- { PROJECTOR_TYPE_LDPV2, "ldpv2"},
- { PROJECTOR_TYPE_RESAMPLER, "resampler"},
- { PROJECTOR_TYPE_GLM_EDGE, "adapter"},
- { PROJECTOR_TYPE_MERGER, "qwen2vl_merger"},
- { PROJECTOR_TYPE_GEMMA3, "gemma3"},
- };
-
-
- //
- // utilities to get data from a gguf file
- //
-
- static int get_key_idx(const gguf_context * ctx, const char * key) {
- int i = gguf_find_key(ctx, key);
- if (i == -1) {
- LOG_ERR("key %s not found in file\n", key);
- throw std::runtime_error(format("Missing required key: %s", key));
- }
-
- return i;
- }
-
- static uint32_t get_u32(const gguf_context * ctx, const std::string & key) {
- const int i = get_key_idx(ctx, key.c_str());
-
- return gguf_get_val_u32(ctx, i);
- }
-
- static float get_f32(const gguf_context * ctx, const std::string & key) {
- const int i = get_key_idx(ctx, key.c_str());
-
- return gguf_get_val_f32(ctx, i);
- }
-
- static struct ggml_tensor * get_tensor(struct ggml_context * ctx, const std::string & name) {
- struct ggml_tensor * cur = ggml_get_tensor(ctx, name.c_str());
- if (!cur) {
- throw std::runtime_error(format("%s: unable to find tensor %s\n", __func__, name.c_str()));
- }
-
- return cur;
- }
-
- static std::string get_ftype(int ftype) {
- return ggml_type_name(static_cast<ggml_type>(ftype));
- }
-
- static std::string gguf_data_to_str(enum gguf_type type, const void * data, int i) {
- switch (type) {
- case GGUF_TYPE_UINT8: return std::to_string(((const uint8_t *)data)[i]);
- case GGUF_TYPE_INT8: return std::to_string(((const int8_t *)data)[i]);
- case GGUF_TYPE_UINT16: return std::to_string(((const uint16_t *)data)[i]);
- case GGUF_TYPE_INT16: return std::to_string(((const int16_t *)data)[i]);
- case GGUF_TYPE_UINT32: return std::to_string(((const uint32_t *)data)[i]);
- case GGUF_TYPE_INT32: return std::to_string(((const int32_t *)data)[i]);
- case GGUF_TYPE_UINT64: return std::to_string(((const uint64_t *)data)[i]);
- case GGUF_TYPE_INT64: return std::to_string(((const int64_t *)data)[i]);
- case GGUF_TYPE_FLOAT32: return std::to_string(((const float *)data)[i]);
- case GGUF_TYPE_FLOAT64: return std::to_string(((const double *)data)[i]);
- case GGUF_TYPE_BOOL: return ((const bool *)data)[i] ? "true" : "false";
- default: return format("unknown type %d", type);
- }
- }
-
- static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
- if (search.empty()) {
- return;
- }
- std::string builder;
- builder.reserve(s.length());
- size_t pos = 0;
- size_t last_pos = 0;
- while ((pos = s.find(search, last_pos)) != std::string::npos) {
- builder.append(s, last_pos, pos - last_pos);
- builder.append(replace);
- last_pos = pos + search.length();
- }
- builder.append(s, last_pos, std::string::npos);
- s = std::move(builder);
- }
-
- static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) {
- const enum gguf_type type = gguf_get_kv_type(ctx_gguf, i);
-
- switch (type) {
- case GGUF_TYPE_STRING:
- return gguf_get_val_str(ctx_gguf, i);
- case GGUF_TYPE_ARRAY:
- {
- const enum gguf_type arr_type = gguf_get_arr_type(ctx_gguf, i);
- int arr_n = gguf_get_arr_n(ctx_gguf, i);
- const void * data = arr_type == GGUF_TYPE_STRING ? nullptr : gguf_get_arr_data(ctx_gguf, i);
- std::stringstream ss;
- ss << "[";
- for (int j = 0; j < arr_n; j++) {
- if (arr_type == GGUF_TYPE_STRING) {
- std::string val = gguf_get_arr_str(ctx_gguf, i, j);
- // escape quotes
- replace_all(val, "\\", "\\\\");
- replace_all(val, "\"", "\\\"");
- ss << '"' << val << '"';
- } else if (arr_type == GGUF_TYPE_ARRAY) {
- ss << "???";
- } else {
- ss << gguf_data_to_str(arr_type, data, j);
- }
- if (j < arr_n - 1) {
- ss << ", ";
- }
- }
- ss << "]";
- return ss.str();
- }
- default:
- return gguf_data_to_str(type, gguf_get_val_data(ctx_gguf, i), 0);
- }
- }
-
- static void print_tensor_info(const ggml_tensor * tensor, const char * prefix = "") {
- size_t tensor_size = ggml_nbytes(tensor);
- LOG_INF("%s: n_dims = %d, name = %s, tensor_size=%zu, shape:[%" PRId64 ", %" PRId64 ", %" PRId64 ", %" PRId64 "], type = %s\n",
- prefix, ggml_n_dims(tensor), tensor->name, tensor_size,
- tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3], ggml_type_name(tensor->type));
- }
-
- static projector_type clip_projector_type_from_string(const std::string & name) {
- for (const auto & kv : PROJECTOR_TYPE_NAMES) { // NOLINT
- if (kv.second == name) {
- return kv.first;
- }
- }
- throw std::runtime_error(format("Unknown projector type: %s", name.c_str()));
- }
-
  #ifdef CLIP_DEBUG_FUNCTIONS
  static void clip_image_write_image_to_ppm(const clip_image_u8& img, const std::string& filename) {
  std::ofstream file(filename, std::ios::binary);
@@ -418,6 +147,11 @@ static void clip_image_convert_f32_to_u8(const clip_image_f32& src, clip_image_u
  // clip layers
  //

+ enum patch_merge_type {
+ PATCH_MERGE_FLAT,
+ PATCH_MERGE_SPATIAL_UNPAD,
+ };
+
  struct clip_hparams {
  int32_t image_size;
  int32_t patch_size;
@@ -426,56 +160,69 @@ struct clip_hparams {
  int32_t projection_dim;
  int32_t n_head;
  int32_t n_layer;
+ int32_t proj_scale_factor = 0; // idefics3

- float eps;
+ patch_merge_type mm_patch_merge_type = PATCH_MERGE_FLAT;

- char mm_patch_merge_type[32] = "flat"; // spatial_unpad or flat (default)
+ float eps = 1e-6;
+ float rope_theta = 0.0;

  std::vector<int32_t> image_grid_pinpoints;
  int32_t image_crop_resolution;
  std::unordered_set<int32_t> vision_feature_layer;
+ int32_t attn_window_size = 0;
+ int32_t n_wa_pattern = 0;
  };

  struct clip_layer {
  // attention
- struct ggml_tensor * k_w;
- struct ggml_tensor * k_b;
- struct ggml_tensor * q_w;
- struct ggml_tensor * q_b;
- struct ggml_tensor * v_w;
- struct ggml_tensor * v_b;
+ struct ggml_tensor * k_w = nullptr;
+ struct ggml_tensor * k_b = nullptr;
+ struct ggml_tensor * q_w = nullptr;
+ struct ggml_tensor * q_b = nullptr;
+ struct ggml_tensor * v_w = nullptr;
+ struct ggml_tensor * v_b = nullptr;

- struct ggml_tensor * o_w;
- struct ggml_tensor * o_b;
+ struct ggml_tensor * o_w = nullptr;
+ struct ggml_tensor * o_b = nullptr;

  // layernorm 1
- struct ggml_tensor * ln_1_w;
- struct ggml_tensor * ln_1_b;
+ struct ggml_tensor * ln_1_w = nullptr;
+ struct ggml_tensor * ln_1_b = nullptr;

  // ff
- struct ggml_tensor * ff_i_w;
- struct ggml_tensor * ff_i_b;
+ struct ggml_tensor * ff_i_w = nullptr; // legacy naming
+ struct ggml_tensor * ff_i_b = nullptr; // legacy naming
+ struct ggml_tensor * ff_o_w = nullptr; // legacy naming
+ struct ggml_tensor * ff_o_b = nullptr; // legacy naming

- struct ggml_tensor * ff_o_w;
- struct ggml_tensor * ff_o_b;
+ struct ggml_tensor * ff_up_w = nullptr;
+ struct ggml_tensor * ff_up_b = nullptr;
+ struct ggml_tensor * ff_gate_w = nullptr;
+ struct ggml_tensor * ff_gate_b = nullptr;
+ struct ggml_tensor * ff_down_w = nullptr;
+ struct ggml_tensor * ff_down_b = nullptr;
+
+ struct ggml_tensor * ff_g_w = NULL;
+ struct ggml_tensor * ff_g_b = NULL;

  // layernorm 2
- struct ggml_tensor * ln_2_w;
- struct ggml_tensor * ln_2_b;
+ struct ggml_tensor * ln_2_w = nullptr;
+ struct ggml_tensor * ln_2_b = nullptr;
  };

  struct clip_vision_model {
  struct clip_hparams hparams;

  // embeddings
- struct ggml_tensor * class_embedding;
- struct ggml_tensor * patch_embeddings_0;
- struct ggml_tensor * patch_embeddings_1; // second Conv2D kernel when we decouple Conv3D along temproal dimension (Qwen2VL)
- struct ggml_tensor * patch_bias;
- struct ggml_tensor * position_embeddings;
+ struct ggml_tensor * class_embedding = nullptr;
+ struct ggml_tensor * patch_embeddings_0 = nullptr;
+ struct ggml_tensor * patch_embeddings_1 = nullptr; // second Conv2D kernel when we decouple Conv3D along temproal dimension (Qwen2VL)
+ struct ggml_tensor * patch_bias = nullptr;
+ struct ggml_tensor * position_embeddings = nullptr;

- struct ggml_tensor * pre_ln_w;
- struct ggml_tensor * pre_ln_b;
+ struct ggml_tensor * pre_ln_w = nullptr;
+ struct ggml_tensor * pre_ln_b = nullptr;

  std::vector<clip_layer> layers;

@@ -485,94 +232,90 @@ struct clip_vision_model {
  struct ggml_tensor * projection;

  // LLaVA projection
- struct ggml_tensor * mm_0_w = NULL;
- struct ggml_tensor * mm_0_b = NULL;
- struct ggml_tensor * mm_2_w = NULL;
- struct ggml_tensor * mm_2_b = NULL;
+ struct ggml_tensor * mm_0_w = nullptr;
+ struct ggml_tensor * mm_0_b = nullptr;
+ struct ggml_tensor * mm_2_w = nullptr;
+ struct ggml_tensor * mm_2_b = nullptr;

- struct ggml_tensor * image_newline = NULL;
+ struct ggml_tensor * image_newline = nullptr;

  // Yi type models with mlp+normalization projection
- struct ggml_tensor * mm_1_w = NULL; // Yi type models have 0, 1, 3, 4
- struct ggml_tensor * mm_1_b = NULL;
- struct ggml_tensor * mm_3_w = NULL;
- struct ggml_tensor * mm_3_b = NULL;
- struct ggml_tensor * mm_4_w = NULL;
- struct ggml_tensor * mm_4_b = NULL;
+ struct ggml_tensor * mm_1_w = nullptr; // Yi type models have 0, 1, 3, 4
+ struct ggml_tensor * mm_1_b = nullptr;
+ struct ggml_tensor * mm_3_w = nullptr;
+ struct ggml_tensor * mm_3_b = nullptr;
+ struct ggml_tensor * mm_4_w = nullptr;
+ struct ggml_tensor * mm_4_b = nullptr;

  //GLMV-Edge projection
- struct ggml_tensor * mm_model_adapter_conv_w;
- struct ggml_tensor * mm_model_adapter_conv_b;
- struct ggml_tensor * boi_w;
- struct ggml_tensor * eoi_w;
+ struct ggml_tensor * mm_model_adapter_conv_w = nullptr;
+ struct ggml_tensor * mm_model_adapter_conv_b = nullptr;

  // MobileVLM projection
- struct ggml_tensor * mm_model_mlp_1_w;
- struct ggml_tensor * mm_model_mlp_1_b;
- struct ggml_tensor * mm_model_mlp_3_w;
- struct ggml_tensor * mm_model_mlp_3_b;
- struct ggml_tensor * mm_model_block_1_block_0_0_w;
- struct ggml_tensor * mm_model_block_1_block_0_1_w;
- struct ggml_tensor * mm_model_block_1_block_0_1_b;
- struct ggml_tensor * mm_model_block_1_block_1_fc1_w;
- struct ggml_tensor * mm_model_block_1_block_1_fc1_b;
- struct ggml_tensor * mm_model_block_1_block_1_fc2_w;
- struct ggml_tensor * mm_model_block_1_block_1_fc2_b;
- struct ggml_tensor * mm_model_block_1_block_2_0_w;
- struct ggml_tensor * mm_model_block_1_block_2_1_w;
- struct ggml_tensor * mm_model_block_1_block_2_1_b;
- struct ggml_tensor * mm_model_block_2_block_0_0_w;
- struct ggml_tensor * mm_model_block_2_block_0_1_w;
- struct ggml_tensor * mm_model_block_2_block_0_1_b;
- struct ggml_tensor * mm_model_block_2_block_1_fc1_w;
- struct ggml_tensor * mm_model_block_2_block_1_fc1_b;
- struct ggml_tensor * mm_model_block_2_block_1_fc2_w;
- struct ggml_tensor * mm_model_block_2_block_1_fc2_b;
- struct ggml_tensor * mm_model_block_2_block_2_0_w;
- struct ggml_tensor * mm_model_block_2_block_2_1_w;
- struct ggml_tensor * mm_model_block_2_block_2_1_b;
+ struct ggml_tensor * mm_model_mlp_1_w = nullptr;
+ struct ggml_tensor * mm_model_mlp_1_b = nullptr;
+ struct ggml_tensor * mm_model_mlp_3_w = nullptr;
+ struct ggml_tensor * mm_model_mlp_3_b = nullptr;
+ struct ggml_tensor * mm_model_block_1_block_0_0_w = nullptr;
+ struct ggml_tensor * mm_model_block_1_block_0_1_w = nullptr;
+ struct ggml_tensor * mm_model_block_1_block_0_1_b = nullptr;
+ struct ggml_tensor * mm_model_block_1_block_1_fc1_w = nullptr;
+ struct ggml_tensor * mm_model_block_1_block_1_fc1_b = nullptr;
+ struct ggml_tensor * mm_model_block_1_block_1_fc2_w = nullptr;
+ struct ggml_tensor * mm_model_block_1_block_1_fc2_b = nullptr;
+ struct ggml_tensor * mm_model_block_1_block_2_0_w = nullptr;
+ struct ggml_tensor * mm_model_block_1_block_2_1_w = nullptr;
+ struct ggml_tensor * mm_model_block_1_block_2_1_b = nullptr;
+ struct ggml_tensor * mm_model_block_2_block_0_0_w = nullptr;
+ struct ggml_tensor * mm_model_block_2_block_0_1_w = nullptr;
+ struct ggml_tensor * mm_model_block_2_block_0_1_b = nullptr;
+ struct ggml_tensor * mm_model_block_2_block_1_fc1_w = nullptr;
+ struct ggml_tensor * mm_model_block_2_block_1_fc1_b = nullptr;
+ struct ggml_tensor * mm_model_block_2_block_1_fc2_w = nullptr;
+ struct ggml_tensor * mm_model_block_2_block_1_fc2_b = nullptr;
+ struct ggml_tensor * mm_model_block_2_block_2_0_w = nullptr;
+ struct ggml_tensor * mm_model_block_2_block_2_1_w = nullptr;
+ struct ggml_tensor * mm_model_block_2_block_2_1_b = nullptr;

  // MobileVLM_V2 projection
- struct ggml_tensor * mm_model_mlp_0_w;
- struct ggml_tensor * mm_model_mlp_0_b;
- struct ggml_tensor * mm_model_mlp_2_w;
- struct ggml_tensor * mm_model_mlp_2_b;
- struct ggml_tensor * mm_model_peg_0_w;
- struct ggml_tensor * mm_model_peg_0_b;
+ struct ggml_tensor * mm_model_mlp_0_w = nullptr;
+ struct ggml_tensor * mm_model_mlp_0_b = nullptr;
+ struct ggml_tensor * mm_model_mlp_2_w = nullptr;
+ struct ggml_tensor * mm_model_mlp_2_b = nullptr;
+ struct ggml_tensor * mm_model_peg_0_w = nullptr;
+ struct ggml_tensor * mm_model_peg_0_b = nullptr;

  // MINICPMV projection
- struct ggml_tensor * mm_model_pos_embed_k;
- struct ggml_tensor * mm_model_query;
- struct ggml_tensor * mm_model_proj;
- struct ggml_tensor * mm_model_kv_proj;
- struct ggml_tensor * mm_model_attn_q_w;
- struct ggml_tensor * mm_model_attn_q_b;
- struct ggml_tensor * mm_model_attn_k_w;
- struct ggml_tensor * mm_model_attn_k_b;
- struct ggml_tensor * mm_model_attn_v_w;
- struct ggml_tensor * mm_model_attn_v_b;
- struct ggml_tensor * mm_model_attn_o_w;
- struct ggml_tensor * mm_model_attn_o_b;
- struct ggml_tensor * mm_model_ln_q_w;
- struct ggml_tensor * mm_model_ln_q_b;
- struct ggml_tensor * mm_model_ln_kv_w;
- struct ggml_tensor * mm_model_ln_kv_b;
- struct ggml_tensor * mm_model_ln_post_w;
- struct ggml_tensor * mm_model_ln_post_b;
+ struct ggml_tensor * mm_model_pos_embed_k = nullptr;
+ struct ggml_tensor * mm_model_query = nullptr;
+ struct ggml_tensor * mm_model_proj = nullptr;
+ struct ggml_tensor * mm_model_kv_proj = nullptr;
+ struct ggml_tensor * mm_model_attn_q_w = nullptr;
+ struct ggml_tensor * mm_model_attn_q_b = nullptr;
+ struct ggml_tensor * mm_model_attn_k_w = nullptr;
+ struct ggml_tensor * mm_model_attn_k_b = nullptr;
+ struct ggml_tensor * mm_model_attn_v_w = nullptr;
+ struct ggml_tensor * mm_model_attn_v_b = nullptr;
+ struct ggml_tensor * mm_model_attn_o_w = nullptr;
+ struct ggml_tensor * mm_model_attn_o_b = nullptr;
+ struct ggml_tensor * mm_model_ln_q_w = nullptr;
+ struct ggml_tensor * mm_model_ln_q_b = nullptr;
+ struct ggml_tensor * mm_model_ln_kv_w = nullptr;
+ struct ggml_tensor * mm_model_ln_kv_b = nullptr;
+ struct ggml_tensor * mm_model_ln_post_w = nullptr;
+ struct ggml_tensor * mm_model_ln_post_b = nullptr;

  // gemma3
- struct ggml_tensor * mm_input_proj_w;
- struct ggml_tensor * mm_soft_emb_norm_w;
+ struct ggml_tensor * mm_input_proj_w = nullptr;
+ struct ggml_tensor * mm_soft_emb_norm_w = nullptr;
+
+ // pixtral
+ struct ggml_tensor * token_embd_img_break = nullptr;
  };

  struct clip_ctx {
- bool has_text_encoder = false;
- bool has_vision_encoder = false;
  bool has_llava_projector = false;
- bool has_minicpmv_projector = false;
- bool has_glm_projector = false;
- bool has_qwen2vl_merger = false;
- int minicpmv_version = 2;
+ int minicpmv_version = 0;

  struct clip_vision_model vision_model;
  projector_type proj_type = PROJECTOR_TYPE_MLP;
@@ -582,28 +325,23 @@ struct clip_ctx {
  float image_std[3];
  bool use_gelu = false;
  bool use_silu = false;
- int32_t ftype = 1;

- bool has_class_embedding = true;
- bool has_pre_norm = true;
- bool has_post_norm = false;
- bool has_patch_bias = false;
-
- struct gguf_context * ctx_gguf = nullptr;
- struct ggml_context * ctx_data = nullptr;
+ gguf_context_ptr ctx_gguf;
+ ggml_context_ptr ctx_data;

  std::vector<uint8_t> buf_compute_meta;

  std::vector<ggml_backend_t> backend_ptrs;
  std::vector<ggml_backend_buffer_type_t> backend_buft;

- ggml_backend_t backend = nullptr;
- ggml_backend_t backend_cpu = nullptr;
- ggml_backend_buffer_t buf = nullptr;
+ ggml_backend_t backend;
+ ggml_backend_t backend_cpu;
+ ggml_backend_buffer_ptr buf;

+ int max_nodes = 8192;
  ggml_backend_sched_ptr sched;

- struct clip_image_size * load_image_size = nullptr;
+ clip_image_size load_image_size;

  clip_ctx(clip_context_params & ctx_params) {
  backend_cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr);
@@ -629,33 +367,27 @@
  }

  ~clip_ctx() {
- ggml_free(ctx_data);
- gguf_free(ctx_gguf);
- ggml_backend_buffer_free(buf);
  ggml_backend_free(backend);
- if (backend_cpu != backend) {
+ if (backend != backend_cpu) {
  ggml_backend_free(backend_cpu);
  }
  }
  };

- static ggml_cgraph * clip_image_build_graph_siglip(clip_ctx * ctx, const clip_image_f32_batch * imgs) {
+ static ggml_cgraph * clip_image_build_graph_siglip(clip_ctx * ctx, const clip_image_f32 & img) {
  const auto & model = ctx->vision_model;
  const auto & hparams = model.hparams;

- const int image_size = hparams.image_size;
- int image_size_width = image_size;
- int image_size_height = image_size;
-
- const int patch_size = hparams.patch_size;
- const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size));
- const int hidden_size = hparams.hidden_size;
- const int n_head = hparams.n_head;
- const int d_head = hidden_size / n_head;
- const int n_layer = hparams.n_layer;
- const float eps = hparams.eps;
+ int image_size_width = img.nx;
+ int image_size_height = img.ny;

- GGML_ASSERT(imgs->size == 1); // batch_size == 1
+ const int patch_size = hparams.patch_size;
+ const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size));
+ const int hidden_size = hparams.hidden_size;
+ const int n_head = hparams.n_head;
+ const int d_head = hidden_size / n_head;
+ const int n_layer = hparams.n_layer;
+ const float eps = hparams.eps;

  struct ggml_init_params params = {
  /*.mem_size =*/ ctx->buf_compute_meta.size(),
@@ -663,7 +395,9 @@ static ggml_cgraph * clip_image_build_graph_siglip(clip_ctx * ctx, const clip_im
  /*.no_alloc =*/ true,
  };

- struct ggml_context * ctx0 = ggml_init(params);
+ ggml_context_ptr ctx0_ptr(ggml_init(params));
+ auto ctx0 = ctx0_ptr.get();
+
  struct ggml_cgraph * gf = ggml_new_graph(ctx0);

  // input raw
@@ -711,8 +445,7 @@ static ggml_cgraph * clip_image_build_graph_siglip(clip_ctx * ctx, const clip_im
  V = ggml_cont(ctx0, ggml_permute(ctx0, V, 1, 2, 0, 3));

  struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
- KQ = ggml_scale_inplace(ctx0, KQ, 1.0f / sqrtf((float)d_head));
- KQ = ggml_soft_max_inplace(ctx0, KQ);
+ KQ = ggml_soft_max_ext(ctx0, KQ, nullptr, 1.0f / sqrtf((float)d_head), 0.0f);

  struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ);
  KQV = ggml_reshape_3d(ctx0, KQV, d_head, num_patches, n_head);
@@ -751,7 +484,7 @@ static ggml_cgraph * clip_image_build_graph_siglip(clip_ctx * ctx, const clip_im
  }

  // post-layernorm
- if (ctx->has_post_norm) {
+ if (model.post_ln_w) {
  embeddings = ggml_norm(ctx0, embeddings, eps);
  ggml_set_name(embeddings, "post_ln");

@@ -781,63 +514,534 @@ static ggml_cgraph * clip_image_build_graph_siglip(clip_ctx * ctx, const clip_im
781
514
  embeddings = ggml_mul_mat(ctx0,
782
515
  ggml_cont(ctx0, ggml_transpose(ctx0, model.mm_input_proj_w)),
783
516
  embeddings);
517
+
518
+ } else if (ctx->proj_type == PROJECTOR_TYPE_IDEFICS3) {
519
+ // https://github.com/huggingface/transformers/blob/0a950e0bbe1ed58d5401a6b547af19f15f0c195e/src/transformers/models/idefics3/modeling_idefics3.py#L578
520
+
521
+ ggml_tensor * cur = embeddings;
522
+ const int scale_factor = model.hparams.proj_scale_factor;
523
+ const int n_embd = cur->ne[0];
524
+ const int seq = cur->ne[1];
525
+ const int bsz = 1; // batch size, always 1 for now since we don't support batching
526
+ const int height = std::sqrt(seq);
527
+ const int width = std::sqrt(seq);
528
+ GGML_ASSERT(scale_factor != 0);
529
+ cur = ggml_reshape_4d(ctx0, cur, n_embd * scale_factor, width / scale_factor, height, bsz);
530
+ cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
531
+ cur = ggml_reshape_4d(ctx0, ggml_cont(ctx0, cur),
532
+ n_embd * scale_factor * scale_factor,
533
+ height / scale_factor,
534
+ width / scale_factor,
535
+ bsz);
536
+ cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
537
+ cur = ggml_reshape_3d(ctx0, ggml_cont(ctx0, cur),
538
+ n_embd * scale_factor * scale_factor,
539
+ seq / (scale_factor * scale_factor),
540
+ bsz);
541
+
542
+ cur = ggml_mul_mat(ctx0, model.projection, cur);
543
+ embeddings = cur;
544
+ } else {
545
+ GGML_ABORT("SigLIP: Unsupported projector type");
784
546
  }
785
547
 
786
548
  // build the graph
787
549
  ggml_build_forward_expand(gf, embeddings);
788
550
 
789
- ggml_free(ctx0);
551
+ return gf;
552
+ }
553
+
554
+ // implementation of the 2D RoPE without adding a new op in ggml
555
+ // this is not efficient (use double the memory), but works on all backends
556
+ // TODO: there was a more efficient which relies on ggml_view and ggml_rope_ext_inplace, but the rope inplace does not work well with non-contiguous tensors ; we should fix that and revert back to the original implementation in https://github.com/ggml-org/llama.cpp/pull/13065
557
+ static ggml_tensor * build_rope_2d(
558
+ ggml_context * ctx0,
559
+ ggml_tensor * cur,
560
+ ggml_tensor * pos_h,
561
+ ggml_tensor * pos_w,
562
+ const float freq_base
563
+ ) {
564
+ const int64_t n_dim = cur->ne[0];
565
+ const int64_t n_head = cur->ne[1];
566
+ const int64_t n_pos = cur->ne[2];
567
+
568
+ // for example, if we have cur tensor of shape (n_dim=8, n_head, n_pos)
569
+ // we will have a list of 4 inv_freq: 1e-0, 1e-1, 1e-2, 1e-3
570
+ // first half of cur will use 1e-0, 1e-2 (even)
571
+ // second half of cur will use 1e-1, 1e-3 (odd)
572
+ // the trick here is to rotate just half of n_dim, so inv_freq will automatically be even
573
+ // ^ don't ask me why, it's math! -2(2i) / n_dim == -2i / (n_dim/2)
574
+ // then for the second half, we use freq_scale to shift the inv_freq
575
+ // ^ why? replace (2i) with (2i+1) in the above equation
576
+ const float freq_scale_odd = std::pow(freq_base, (float)-2/n_dim);
577
+
578
+ // first half
579
+ ggml_tensor * first;
580
+ {
581
+ first = ggml_view_3d(ctx0, cur,
582
+ n_dim/2, n_head, n_pos,
583
+ ggml_row_size(cur->type, n_dim),
584
+ ggml_row_size(cur->type, n_dim*n_head),
585
+ 0);
586
+ first = ggml_rope_ext(
587
+ ctx0,
588
+ first,
589
+ pos_h, // positions
590
+ nullptr, // freq factors
591
+ n_dim/2, // n_dims
592
+ 0, 0, freq_base,
593
+ 1.0f, 0.0f, 1.0f, 0.0f, 0.0f
594
+ );
595
+ }
596
+
597
+ // second half
598
+ ggml_tensor * second;
599
+ {
600
+ second = ggml_view_3d(ctx0, cur,
601
+ n_dim/2, n_head, n_pos,
602
+ ggml_row_size(cur->type, n_dim),
603
+ ggml_row_size(cur->type, n_dim*n_head),
604
+ n_dim/2 * ggml_element_size(cur));
605
+ second = ggml_cont(ctx0, second); // copy, because ggml_rope don't play well with non-contiguous tensors
606
+ second = ggml_rope_ext(
607
+ ctx0,
608
+ second,
609
+ pos_w, // positions
610
+ nullptr, // freq factors
611
+ n_dim/2, // n_dims
612
+ 0, 0, freq_base,
613
+ freq_scale_odd,
614
+ 0.0f, 1.0f, 0.0f, 0.0f
615
+ );
616
+ }
617
+
618
+ cur = ggml_concat(ctx0, first, second, 0);
619
+ return cur;
620
+ }
621
+
622
+ static ggml_cgraph * clip_image_build_graph_pixtral(clip_ctx * ctx, const clip_image_f32 & img) {
623
+ const auto & model = ctx->vision_model;
624
+ const auto & hparams = model.hparams;
625
+
626
+ GGML_ASSERT(ctx->proj_type == PROJECTOR_TYPE_PIXTRAL);
627
+
628
+ int image_size_width = img.nx;
629
+ int image_size_height = img.ny;
630
+
631
+ const int patch_size = hparams.patch_size;
632
+ const int n_patches_x = image_size_width / patch_size;
633
+ const int n_patches_y = image_size_height / patch_size;
634
+ const int num_patches = n_patches_x * n_patches_y;
635
+ const int hidden_size = hparams.hidden_size;
636
+ const int n_head = hparams.n_head;
637
+ const int d_head = hidden_size / n_head;
638
+ const int n_layer = hparams.n_layer;
639
+ const float eps = hparams.eps;
640
+
641
+ struct ggml_init_params params = {
642
+ /*.mem_size =*/ ctx->buf_compute_meta.size(),
643
+ /*.mem_buffer =*/ ctx->buf_compute_meta.data(),
644
+ /*.no_alloc =*/ true,
645
+ };
646
+
647
+ ggml_context_ptr ctx0_ptr(ggml_init(params));
648
+ auto ctx0 = ctx0_ptr.get();
649
+
650
+ struct ggml_cgraph * gf = ggml_new_graph(ctx0);
651
+
652
+ // input raw
653
+ struct ggml_tensor * inp_raw = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, image_size_width, image_size_height, 3);
654
+ ggml_set_name(inp_raw, "inp_raw");
655
+ ggml_set_input(inp_raw);
656
+
657
+ // 2D input positions
658
+ struct ggml_tensor * pos_h = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_patches);
659
+ ggml_set_name(pos_h, "pos_h");
660
+ ggml_set_input(pos_h);
661
+ struct ggml_tensor * pos_w = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_patches);
662
+ ggml_set_name(pos_w, "pos_w");
663
+ ggml_set_input(pos_w);
664
+
665
+ struct ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
666
+ inp = ggml_reshape_2d(ctx0, inp, num_patches, hidden_size);
667
+ inp = ggml_cont(ctx0, ggml_transpose(ctx0, inp));
668
+
669
+ struct ggml_tensor * embeddings = inp;
670
+
671
+ // pre-layer norm
672
+ embeddings = ggml_mul(ctx0, ggml_rms_norm(ctx0, embeddings, eps), model.pre_ln_w);
673
+
674
+ // loop over layers
675
+ for (int il = 0; il < n_layer; il++) {
676
+ struct ggml_tensor * cur = embeddings;
677
+
678
+ // pre-attention norm
679
+ cur = ggml_mul(ctx0, ggml_rms_norm(ctx0, cur, eps), model.layers[il].ln_1_w);
680
+
681
+ // self-attention
682
+ {
683
+ struct ggml_tensor * Q = ggml_mul_mat(ctx0, model.layers[il].q_w, cur);
684
+
685
+ Q = ggml_reshape_3d(ctx0, Q, d_head, n_head, num_patches);
686
+ Q = build_rope_2d(ctx0, Q, pos_h, pos_w, hparams.rope_theta);
687
+ Q = ggml_cont(ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3));
688
+
689
+ struct ggml_tensor * K = ggml_mul_mat(ctx0, model.layers[il].k_w, cur);
690
+
691
+ K = ggml_reshape_3d(ctx0, K, d_head, n_head, num_patches);
692
+ K = build_rope_2d(ctx0, K, pos_h, pos_w, hparams.rope_theta);
693
+ K = ggml_cont(ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3));
694
+
695
+ struct ggml_tensor * V = ggml_mul_mat(ctx0, model.layers[il].v_w, cur);
696
+
697
+ V = ggml_reshape_3d(ctx0, V, d_head, n_head, num_patches);
698
+ V = ggml_cont(ctx0, ggml_permute(ctx0, V, 1, 2, 0, 3));
699
+
700
+ struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
701
+ KQ = ggml_soft_max_ext(ctx0, KQ, nullptr, 1.0f / sqrtf((float)d_head), 0.0f);
702
+
703
+ struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ);
704
+ KQV = ggml_reshape_3d(ctx0, KQV, d_head, num_patches, n_head);
705
+ KQV = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
706
+
707
+ cur = ggml_cont_2d(ctx0, KQV, hidden_size, num_patches);
708
+
709
+ cur = ggml_mul_mat(ctx0, model.layers[il].o_w, cur);
710
+ }
711
+
712
+ // re-add the layer input, e.g., residual
713
+ cur = ggml_add(ctx0, cur, embeddings);
714
+
715
+ embeddings = cur; // embeddings = residual, cur = hidden_states
716
+
717
+ // pre-ffn norm
718
+ cur = ggml_mul(ctx0, ggml_rms_norm(ctx0, cur, eps), model.layers[il].ln_2_w);
719
+
720
+ // feed-forward
721
+ {
722
+ ggml_tensor * gate_proj = ggml_mul_mat(ctx0, model.layers[il].ff_gate_w, cur);
723
+ ggml_tensor * up_proj = ggml_mul_mat(ctx0, model.layers[il].ff_up_w, cur);
724
+ gate_proj = ggml_silu(ctx0, gate_proj); // pixtral uses silu
725
+ cur = ggml_mul(ctx0, up_proj, gate_proj);
726
+ cur = ggml_mul_mat(ctx0, model.layers[il].ff_down_w, cur);
727
+ }
728
+
729
+ // residual 2
730
+ cur = ggml_add(ctx0, embeddings, cur);
731
+
732
+ embeddings = cur;
733
+ }
734
+
735
+ // LlavaMultiModalProjector (with GELU activation)
736
+ {
737
+ embeddings = ggml_mul_mat(ctx0, model.mm_1_w, embeddings);
738
+ embeddings = ggml_add(ctx0, embeddings, model.mm_1_b);
739
+
740
+ embeddings = ggml_gelu(ctx0, embeddings);
741
+ embeddings = ggml_mul_mat(ctx0, model.mm_2_w, embeddings);
742
+ embeddings = ggml_add(ctx0, embeddings, model.mm_2_b);
743
+ }
744
+
745
+ // arrangement of the [IMG_BREAK] token
746
+ {
747
+ // not efficient, but works
748
+ // the trick is to view the embeddings as a 3D tensor with shape [hidden_size, n_patches_per_row, n_rows]
749
+ // and then concatenate the [IMG_BREAK] token to the end of each row, aka n_patches_per_row dimension
750
+ // after the concatenation, we have a tensor with shape [hidden_size, n_patches_per_row + 1, n_rows]
751
+
752
+ const int n_embd_text = embeddings->ne[0];
753
+ const int n_tokens_output = num_patches + n_patches_y - 1; // one [IMG_BREAK] per row, except the last row
754
+
755
+ ggml_tensor * cur = ggml_reshape_3d(ctx0, embeddings, n_embd_text, n_patches_x, n_patches_y);
756
+ ggml_tensor * tok = ggml_new_tensor_3d(ctx0, embeddings->type, n_embd_text, 1, n_patches_y);
757
+ tok = ggml_scale(ctx0, tok, 0.0); // clear the tensor
758
+ tok = ggml_add(ctx0, tok, model.token_embd_img_break);
759
+ cur = ggml_concat(ctx0, cur, tok, 1);
760
+ embeddings = ggml_view_2d(ctx0, cur,
761
+ n_embd_text, n_tokens_output,
762
+ ggml_row_size(cur->type, n_embd_text), 0);
763
+ }
764
+
765
+ // build the graph
766
+ ggml_build_forward_expand(gf, embeddings);
790
767
 
791
768
  return gf;
792
769
  }
793
770
 
794
- static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_image_f32_batch * imgs, struct clip_image_size * load_image_size, bool is_inf = false) {
795
- if (!ctx->has_vision_encoder) {
796
- LOG_ERR("This gguf file seems to have no vision encoder\n");
797
- return nullptr;
771
+ static ggml_cgraph * clip_image_build_graph_qwen25vl(clip_ctx * ctx, const clip_image_f32_batch & imgs) {
772
+ const auto & model = ctx->vision_model;
773
+ const auto & hparams = model.hparams;
774
+
775
+ const int image_size_width = imgs.entries[0]->nx;
776
+ const int image_size_height = imgs.entries[0]->ny;
777
+
778
+ const bool use_window_attn = hparams.n_wa_pattern > 0;
779
+
780
+ const int n_wa_pattern = hparams.n_wa_pattern;
781
+ const int patch_size = hparams.patch_size;
782
+ const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size));
783
+ const int patches_w = image_size_width / patch_size;
784
+ const int patches_h = image_size_height / patch_size;
785
+ const int num_positions = num_patches + (model.class_embedding ? 1 : 0);
786
+ const int num_position_ids = num_positions * 4; // m-rope requires 4 dim per position
787
+ const int hidden_size = hparams.hidden_size;
788
+ const int n_head = hparams.n_head;
789
+ const int d_head = hidden_size / n_head;
790
+ const int n_layer = hparams.n_layer;
791
+ const float eps = hparams.eps;
792
+
793
+ int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4};
794
+
795
+ const int batch_size = imgs.entries.size();
796
+ GGML_ASSERT(batch_size == 1);
797
+
798
+ struct ggml_init_params params = {
799
+ /*.mem_size =*/ ctx->buf_compute_meta.size(),
800
+ /*.mem_buffer =*/ ctx->buf_compute_meta.data(),
801
+ /*.no_alloc =*/ true,
802
+ };
803
+
804
+ ggml_context_ptr ctx0_ptr(ggml_init(params));
805
+ auto ctx0 = ctx0_ptr.get();
806
+
807
+ struct ggml_cgraph * gf = ggml_new_graph(ctx0);
808
+
809
+ struct ggml_tensor * inp_raw = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, image_size_width, image_size_height, 3, batch_size);
810
+ ggml_set_name(inp_raw, "inp_raw");
811
+ ggml_set_input(inp_raw);
812
+
813
+ struct ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
814
+
815
+ GGML_ASSERT(image_size_width % (patch_size * 2) == 0);
816
+ GGML_ASSERT(image_size_height % (patch_size * 2) == 0);
817
+
818
+ auto inp_1 = ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
819
+ inp = ggml_add(ctx0, inp, inp_1);
820
+
821
+ inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 2, 0, 3)); // [w, h, c, b] -> [c, w, h, b]
822
+ inp = ggml_reshape_4d(
823
+ ctx0, inp,
824
+ hidden_size * 2, patches_w / 2, patches_h, batch_size);
825
+ inp = ggml_reshape_4d(
826
+ ctx0, inp,
827
+ hidden_size * 2, patches_w / 2, 2, batch_size * (patches_h / 2));
828
+ inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 0, 2, 1, 3));
829
+ inp = ggml_reshape_3d(
830
+ ctx0, inp,
831
+ hidden_size, patches_w * patches_h, batch_size);
832
+
833
+ if (model.patch_bias) {
834
+ // inp = ggml_add(ctx0, inp, ggml_repeat(ctx0, model.patch_bias, inp));
835
+ inp = ggml_add(ctx0, inp, model.patch_bias);
836
+ }
837
+ struct ggml_tensor * embeddings = inp;
838
+ struct ggml_tensor * window_mask = nullptr;
839
+ struct ggml_tensor * window_idx = nullptr;
840
+ struct ggml_tensor * inv_window_idx = nullptr;
841
+
842
+ struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_position_ids);
843
+ ggml_set_name(positions, "positions");
844
+ ggml_set_input(positions);
845
+
846
+ // pre-layernorm
847
+ if (model.pre_ln_w) {
848
+ embeddings = ggml_rms_norm(ctx0, embeddings, eps);
849
+ ggml_set_name(embeddings, "pre_ln");
850
+
851
+ embeddings = ggml_mul(ctx0, embeddings, model.pre_ln_w);
852
+ }
853
+
854
+ if (use_window_attn) {
855
+ // handle window attention inputs
856
+ inv_window_idx = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_positions / 4);
857
+ ggml_set_name(inv_window_idx, "inv_window_idx");
858
+ ggml_set_input(inv_window_idx);
859
+ // mask for window attention
860
+ window_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, num_positions, num_positions);
861
+ ggml_set_name(window_mask, "window_mask");
862
+ ggml_set_input(window_mask);
863
+
864
+ // embeddings shape: [hidden_size, patches_w * patches_h, batch_size]
865
+ GGML_ASSERT(batch_size == 1);
866
+ embeddings = ggml_reshape_2d(ctx0, embeddings, hidden_size * 4, patches_w * patches_h * batch_size / 4);
867
+ embeddings = ggml_get_rows(ctx0, embeddings, inv_window_idx);
868
+ embeddings = ggml_reshape_3d(ctx0, embeddings, hidden_size, patches_w * patches_h, batch_size);
869
+ }
870
+
871
+ // loop over layers
872
+ for (int il = 0; il < n_layer; il++) {
873
+ struct ggml_tensor * cur = embeddings; // embeddings = residual, cur = hidden_states
874
+
875
+ // rmsnorm1
876
+ cur = ggml_rms_norm(ctx0, cur, eps);
877
+ cur = ggml_mul(ctx0, cur, model.layers[il].ln_1_w);
878
+
879
+ // self-attention
880
+ {
881
+
882
+ struct ggml_tensor * Q =
883
+ ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].q_w, cur), model.layers[il].q_b);
884
+
885
+ Q = ggml_reshape_4d(ctx0, Q, d_head, n_head, num_positions, batch_size);
886
+ Q = ggml_rope_multi(
887
+ ctx0, Q, positions, nullptr,
888
+ d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1);
889
+ Q = ggml_cont(ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3));
890
+ Q = ggml_reshape_3d(ctx0, Q, d_head, num_positions, n_head * batch_size);
891
+
892
+ struct ggml_tensor * K =
893
+ ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].k_w, cur), model.layers[il].k_b);
894
+
895
+ K = ggml_reshape_4d(ctx0, K, d_head, n_head, num_positions, batch_size);
896
+ K = ggml_rope_multi(
897
+ ctx0, K, positions, nullptr,
898
+ d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1);
899
+ K = ggml_cont(ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3));
900
+ K = ggml_reshape_3d(ctx0, K, d_head, num_positions, n_head * batch_size);
901
+
902
+ struct ggml_tensor * V =
903
+ ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].v_w, cur), model.layers[il].v_b);
904
+
905
+ V = ggml_reshape_4d(ctx0, V, d_head, n_head, num_positions, batch_size);
906
+ V = ggml_cont(ctx0, ggml_permute(ctx0, V, 1, 2, 0, 3));
907
+ V = ggml_reshape_3d(ctx0, V, num_positions, d_head, n_head * batch_size);
908
+
909
+ struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
910
+ const bool full_attn = use_window_attn ? (il + 1) % n_wa_pattern == 0 : true;
911
+ if (full_attn) {
912
+ KQ = ggml_soft_max_ext(ctx0, KQ, nullptr, 1.0f / sqrtf((float)d_head), 0.0f);
913
+ } else {
914
+ KQ = ggml_soft_max_ext(ctx0, KQ, window_mask, 1.0f / sqrtf((float)d_head), 0.0f);
915
+ }
916
+
917
+ struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ);
918
+ KQV = ggml_reshape_4d(ctx0, KQV, d_head, num_positions, n_head, batch_size);
919
+ KQV = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
920
+
921
+ cur = ggml_cont_3d(ctx0, KQV, hidden_size, num_positions, batch_size);
922
+ }
923
+
924
+ // attention output
925
+ cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].o_w, cur), model.layers[il].o_b);
926
+
927
+ // re-add the layer input, i.e., the residual
928
+ cur = ggml_add(ctx0, cur, embeddings);
929
+
930
+ embeddings = cur; // embeddings = residual, cur = hidden_states
931
+
932
+ // rms norm2
933
+ cur = ggml_rms_norm(ctx0, cur, eps);
934
+ cur = ggml_mul(ctx0, cur, model.layers[il].ln_2_w);
935
+
936
+ // mlp
937
+ // ffn_up
938
+ auto cur_up = ggml_mul_mat(ctx0, model.layers[il].ff_o_w, cur);
939
+ cur_up = ggml_add(ctx0, cur_up, model.layers[il].ff_o_b);
940
+
941
+ auto cur_gate = ggml_mul_mat(ctx0, model.layers[il].ff_g_w, cur);
942
+ cur_gate = ggml_add(ctx0, cur_gate, model.layers[il].ff_g_b);
943
+ // TODO: only 2 of these 3 are actually used, should we remove one of them?
944
+ if (ctx->use_gelu) {
945
+ cur_gate = ggml_gelu_inplace(ctx0, cur_gate);
946
+ } else if (ctx->use_silu) {
947
+ cur_gate = ggml_silu_inplace(ctx0, cur_gate);
948
+ } else {
949
+ cur_gate = ggml_gelu_quick_inplace(ctx0, cur_gate);
950
+ }
951
+ cur = ggml_mul(ctx0, cur_gate, cur_up);
952
+
953
+ // ffn_down
954
+ cur = ggml_mul_mat(ctx0, model.layers[il].ff_i_w, cur);
955
+ cur = ggml_add(ctx0, cur, model.layers[il].ff_i_b);
956
+
957
+ // residual 2
958
+ cur = ggml_add(ctx0, embeddings, cur);
959
+
960
+ embeddings = cur;
961
+ }
962
+
963
+ // post-layernorm
964
+ if (model.post_ln_w) {
965
+ embeddings = ggml_rms_norm(ctx0, embeddings, eps);
966
+ ggml_set_name(embeddings, "post_ln");
967
+
968
+ embeddings = ggml_mul(ctx0, embeddings, model.post_ln_w);
969
+ }
970
+
971
+ embeddings = ggml_reshape_3d(ctx0, embeddings, hidden_size * 4, num_positions / 4, batch_size);
972
+
973
+ embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
974
+ embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);
975
+
976
+ // GELU activation
977
+ embeddings = ggml_gelu(ctx0, embeddings);
978
+
979
+ // Second linear layer
980
+ embeddings = ggml_mul_mat(ctx0, model.mm_1_w, embeddings);
981
+ embeddings = ggml_add(ctx0, embeddings, model.mm_1_b);
982
+
983
+ if (use_window_attn) {
984
+ window_idx = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_positions / 4);
985
+ ggml_set_name(window_idx, "window_idx");
986
+ ggml_set_input(window_idx);
987
+
988
+ // embeddings shape: [hidden_size, patches_w * patches_h, batch_size]
989
+ GGML_ASSERT(batch_size == 1);
990
+ embeddings = ggml_reshape_2d(ctx0, embeddings, hparams.projection_dim, patches_w * patches_h / 4);
991
+ embeddings = ggml_get_rows(ctx0, embeddings, window_idx);
992
+ embeddings = ggml_reshape_3d(ctx0, embeddings, hparams.projection_dim, patches_w * patches_h / 4, batch_size);
798
993
  }
799
994
 
995
+ // build the graph
996
+ ggml_build_forward_expand(gf, embeddings);
997
+
998
+ return gf;
999
+ }
1000
+
1001
+ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_image_f32_batch & imgs, struct clip_image_size load_image_size, bool is_inf = false) {
800
1002
  const auto & model = ctx->vision_model;
801
1003
  const auto & hparams = model.hparams;
802
1004
 
803
1005
  const int image_size = hparams.image_size;
804
1006
  int image_size_width = image_size;
805
1007
  int image_size_height = image_size;
806
- if (ctx->has_minicpmv_projector) {
807
- if (load_image_size == nullptr) {
808
- load_image_size = clip_image_size_init();
809
- }
810
- LOG_DBG("%s: %d %d\n", __func__, load_image_size->width, load_image_size->height);
811
- image_size_width = load_image_size->width;
812
- image_size_height = load_image_size->height;
1008
+
1009
+ if (ctx->proj_type == PROJECTOR_TYPE_MINICPMV) {
1010
+ LOG_DBG("%s: %d %d\n", __func__, load_image_size.width, load_image_size.height);
1011
+ image_size_width = load_image_size.width;
1012
+ image_size_height = load_image_size.height;
813
1013
  if (is_inf) {
814
- image_size_width = imgs->data->nx;
815
- image_size_height = imgs->data->ny;
1014
+ image_size_width = imgs.entries[0]->nx;
1015
+ image_size_height = imgs.entries[0]->ny;
816
1016
  }
817
1017
  }
818
- else if (ctx->has_qwen2vl_merger) {
1018
+
1019
+ else if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL) {
819
1020
  // use the image's native resolution when the image is available
820
1021
  if (is_inf) {
821
1022
  // if (imgs->data->nx && imgs->data->ny) {
822
- image_size_width = imgs->data->nx;
823
- image_size_height = imgs->data->ny;
1023
+ image_size_width = imgs.entries[0]->nx;
1024
+ image_size_height = imgs.entries[0]->ny;
824
1025
  }
825
1026
  }
1027
+
826
1028
  const int patch_size = hparams.patch_size;
827
1029
  const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size));
828
1030
  const int patches_w = image_size_width / patch_size;
829
1031
  const int patches_h = image_size_height / patch_size;
830
- const int num_positions = num_patches + (ctx->has_class_embedding ? 1 : 0);
831
- const int num_position_ids = ctx->has_qwen2vl_merger ? num_positions * 4 : num_positions;
1032
+ const int num_positions = num_patches + (model.class_embedding ? 1 : 0);
1033
+ const int num_position_ids = ctx->proj_type == PROJECTOR_TYPE_QWEN2VL ? num_positions * 4 : num_positions;
832
1034
  const int hidden_size = hparams.hidden_size;
833
1035
  const int n_head = hparams.n_head;
834
1036
  const int d_head = hidden_size / n_head;
835
1037
  const float eps = hparams.eps;
836
1038
  int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4};
837
1039
 
838
- const int batch_size = imgs->size;
1040
+ const int batch_size = imgs.entries.size();
839
1041
 
840
- if (ctx->has_llava_projector || ctx->has_minicpmv_projector || ctx->has_glm_projector) {
1042
+ if (ctx->has_llava_projector
1043
+ || ctx->proj_type == PROJECTOR_TYPE_MINICPMV
1044
+ || ctx->proj_type == PROJECTOR_TYPE_GLM_EDGE) {
841
1045
  GGML_ASSERT(batch_size == 1);
842
1046
  }
843
1047
 
@@ -847,7 +1051,9 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im
847
1051
  /*.no_alloc =*/ true,
848
1052
  };
849
1053
 
850
- struct ggml_context * ctx0 = ggml_init(params);
1054
+ ggml_context_ptr ctx0_ptr(ggml_init(params));
1055
+ auto ctx0 = ctx0_ptr.get();
1056
+
851
1057
  struct ggml_cgraph * gf = ggml_new_graph(ctx0);
852
1058
 
853
1059
  struct ggml_tensor * inp_raw = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, image_size_width, image_size_height, 3, batch_size);
@@ -856,8 +1062,8 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im
856
1062
 
857
1063
  struct ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
858
1064
 
859
- if (ctx->has_qwen2vl_merger) {
860
- GGML_ASSERT(image_size_width % (patch_size * 2) == 0);
1065
+ if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL) {
1066
+ GGML_ASSERT(image_size_width % (patch_size * 2) == 0);
861
1067
  GGML_ASSERT(image_size_height % (patch_size * 2) == 0);
862
1068
 
863
1069
  auto inp_1 = ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
@@ -879,53 +1085,43 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im
879
1085
  inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 0, 2, 3));
880
1086
  }
881
1087
 
882
- if (ctx->has_patch_bias) {
1088
+ if (model.patch_bias) {
883
1089
  // inp = ggml_add(ctx0, inp, ggml_repeat(ctx0, model.patch_bias, inp));
884
1090
  inp = ggml_add(ctx0, inp, model.patch_bias);
885
1091
  }
886
1092
  struct ggml_tensor * embeddings = inp;
887
1093
  struct ggml_tensor * pos_embed = nullptr;
888
1094
 
889
- if (ctx->has_llava_projector) {
890
- // concat class_embeddings and patch_embeddings
891
- if (ctx->has_class_embedding) {
892
- embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size);
893
- ggml_set_name(embeddings, "embeddings");
894
- ggml_set_input(embeddings);
895
- embeddings = ggml_acc(ctx0, embeddings, model.class_embedding,
896
- embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0);
897
- embeddings = ggml_acc(ctx0, embeddings, inp,
898
- embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]);
899
- }
1095
+ // concat class_embeddings and patch_embeddings
1096
+ if (model.class_embedding) {
1097
+ embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size);
1098
+ embeddings = ggml_scale(ctx0, embeddings, 0.0f); // set to all zeros
1099
+ embeddings = ggml_acc(ctx0, embeddings, model.class_embedding,
1100
+ embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0);
1101
+ embeddings = ggml_acc(ctx0, embeddings, inp,
1102
+ embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]);
900
1103
  }
901
1104
 
902
1105
  struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_position_ids);
903
1106
  ggml_set_name(positions, "positions");
904
1107
  ggml_set_input(positions);
905
1108
 
906
- if (!ctx->has_qwen2vl_merger) { // qwen2vl use rope position embedding
1109
+ if (ctx->proj_type != PROJECTOR_TYPE_QWEN2VL) { // qwen2vl does NOT use learned position embeddings
907
1110
  embeddings =
908
1111
  ggml_add(ctx0, embeddings, ggml_get_rows(ctx0, model.position_embeddings, positions));
909
1112
  }
910
1113
 
911
- if (ctx->has_minicpmv_projector) {
1114
+ if (ctx->proj_type == PROJECTOR_TYPE_MINICPMV) {
912
1115
  int pos_w = image_size_width/patch_size;
913
1116
  int pos_h = image_size_height/patch_size;
914
- if (ctx->minicpmv_version == 2) {
915
- pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 4096, pos_w * pos_h, 1);
916
- }
917
- else if (ctx->minicpmv_version == 3) {
918
- pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 3584, pos_w * pos_h, 1);
919
- }
920
- else if (ctx->minicpmv_version == 4) {
921
- pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 3584, pos_w * pos_h, 1);
922
- }
1117
+ int n_output_dim = clip_n_mmproj_embd(ctx);
1118
+ pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_output_dim, pos_w * pos_h, 1);
923
1119
  ggml_set_name(pos_embed, "pos_embed");
924
1120
  ggml_set_input(pos_embed);
925
1121
  }
926
1122
 
927
1123
  // pre-layernorm
928
- if (ctx->has_pre_norm) {
1124
+ if (model.pre_ln_w) {
929
1125
  embeddings = ggml_norm(ctx0, embeddings, eps);
930
1126
  ggml_set_name(embeddings, "pre_ln");
931
1127
 
@@ -962,12 +1158,11 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im
962
1158
  ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].q_w, cur), model.layers[il].q_b);
963
1159
 
964
1160
  Q = ggml_reshape_4d(ctx0, Q, d_head, n_head, num_positions, batch_size);
965
- if (ctx->has_qwen2vl_merger) {
1161
+ if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL) {
966
1162
  Q = ggml_rope_multi(
967
1163
  ctx0, Q, positions, nullptr,
968
1164
  d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1);
969
1165
  }
970
- Q = ggml_scale_inplace(ctx0, Q, 1.0f / sqrt((float)d_head));
971
1166
  Q = ggml_cont(ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3));
972
1167
  Q = ggml_reshape_3d(ctx0, Q, d_head, num_positions, n_head * batch_size);
973
1168
 
@@ -975,7 +1170,7 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im
975
1170
  ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].k_w, cur), model.layers[il].k_b);
976
1171
 
977
1172
  K = ggml_reshape_4d(ctx0, K, d_head, n_head, num_positions, batch_size);
978
- if (ctx->has_qwen2vl_merger) {
1173
+ if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL) {
979
1174
  K = ggml_rope_multi(
980
1175
  ctx0, K, positions, nullptr,
981
1176
  d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1);
@@ -991,7 +1186,7 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im
991
1186
  V = ggml_reshape_3d(ctx0, V, num_positions, d_head, n_head * batch_size);
992
1187
 
993
1188
  struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
994
- KQ = ggml_soft_max_inplace(ctx0, KQ);
1189
+ KQ = ggml_soft_max_ext(ctx0, KQ, nullptr, 1.0f / sqrtf((float)d_head), 0.0f);
995
1190
  struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ);
996
1191
  KQV = ggml_reshape_4d(ctx0, KQV, d_head, num_positions, n_head, batch_size);
997
1192
  KQV = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
@@ -1035,7 +1230,7 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im
1035
1230
  }
1036
1231
 
1037
1232
  // post-layernorm
1038
- if (ctx->has_post_norm) {
1233
+ if (model.post_ln_w) {
1039
1234
  embeddings = ggml_norm(ctx0, embeddings, eps);
1040
1235
  ggml_set_name(embeddings, "post_ln");
1041
1236
 
@@ -1075,8 +1270,10 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im
1075
1270
  embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);
1076
1271
 
1077
1272
  embeddings = ggml_gelu(ctx0, embeddings);
1078
- embeddings = ggml_mul_mat(ctx0, model.mm_2_w, embeddings);
1079
- embeddings = ggml_add(ctx0, embeddings, model.mm_2_b);
1273
+ if (model.mm_2_w) {
1274
+ embeddings = ggml_mul_mat(ctx0, model.mm_2_w, embeddings);
1275
+ embeddings = ggml_add(ctx0, embeddings, model.mm_2_b);
1276
+ }
1080
1277
  }
1081
1278
  else if (ctx->proj_type == PROJECTOR_TYPE_MLP_NORM) {
1082
1279
  embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
@@ -1238,107 +1435,92 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im
1238
1435
  }
1239
1436
  }
1240
1437
  // minicpmv projector
1241
- else if (ctx->has_minicpmv_projector)
1242
- {
1243
- if (ctx->proj_type == PROJECTOR_TYPE_RESAMPLER) {
1244
- struct ggml_tensor * q = model.mm_model_query;
1245
- { // layernorm
1246
- q = ggml_norm(ctx0, q, eps);
1247
- q = ggml_add(ctx0, ggml_mul(ctx0, q, model.mm_model_ln_q_w), model.mm_model_ln_q_b);
1438
+ else if (ctx->proj_type == PROJECTOR_TYPE_MINICPMV) {
1439
+ struct ggml_tensor * q = model.mm_model_query;
1440
+ { // layernorm
1441
+ q = ggml_norm(ctx0, q, eps);
1442
+ q = ggml_add(ctx0, ggml_mul(ctx0, q, model.mm_model_ln_q_w), model.mm_model_ln_q_b);
1443
+ }
1444
+ struct ggml_tensor * v = ggml_mul_mat(ctx0, model.mm_model_kv_proj, embeddings);
1445
+ { // layernorm
1446
+ v = ggml_norm(ctx0, v, eps);
1447
+ v = ggml_add(ctx0, ggml_mul(ctx0, v, model.mm_model_ln_kv_w), model.mm_model_ln_kv_b);
1448
+ }
1449
+ struct ggml_tensor * k;
1450
+ { // position
1451
+ // q = ggml_add(ctx0, q, model.mm_model_pos_embed);
1452
+ k = ggml_add(ctx0, v, pos_embed);
1453
+ }
1454
+
1455
+ { // attention
1456
+ int hidden_size = clip_n_mmproj_embd(ctx);
1457
+ const int d_head = 128;
1458
+ int n_head = hidden_size/d_head;
1459
+ int num_query = 96;
1460
+ if (ctx->minicpmv_version == 2) {
1461
+ num_query = 96;
1248
1462
  }
1249
- struct ggml_tensor * v = ggml_mul_mat(ctx0, model.mm_model_kv_proj, embeddings);
1250
- { // layernorm
1251
- v = ggml_norm(ctx0, v, eps);
1252
- v = ggml_add(ctx0, ggml_mul(ctx0, v, model.mm_model_ln_kv_w), model.mm_model_ln_kv_b);
1463
+ else if (ctx->minicpmv_version == 3) {
1464
+ num_query = 64;
1253
1465
  }
1254
- struct ggml_tensor * k;
1255
- { // position
1256
- // q = ggml_add(ctx0, q, model.mm_model_pos_embed);
1257
- k = ggml_add(ctx0, v, pos_embed);
1466
+ else if (ctx->minicpmv_version == 4) {
1467
+ num_query = 64;
1258
1468
  }
1259
1469
 
1260
- { // attention
1261
- int hidden_size = 4096;
1262
- const int d_head = 128;
1263
- int n_head = hidden_size/d_head;
1264
- int num_query = 96;
1265
- if (ctx->minicpmv_version == 2) {
1266
- hidden_size = 4096;
1267
- n_head = hidden_size/d_head;
1268
- num_query = 96;
1269
- }
1270
- else if (ctx->minicpmv_version == 3) {
1271
- hidden_size = 3584;
1272
- n_head = hidden_size/d_head;
1273
- num_query = 64;
1274
- }
1275
- else if (ctx->minicpmv_version == 4) {
1276
- hidden_size = 3584;
1277
- n_head = hidden_size/d_head;
1278
- num_query = 64;
1279
- }
1470
+ struct ggml_tensor * Q = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_q_w, q), model.mm_model_attn_q_b);
1471
+ struct ggml_tensor * K = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_k_w, k), model.mm_model_attn_k_b);
1472
+ struct ggml_tensor * V = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_v_w, v), model.mm_model_attn_v_b);
1473
+ // permute
1474
+ Q = ggml_reshape_4d(ctx0, Q, d_head, n_head, num_query, batch_size);
1475
+ Q = ggml_cont(ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3));
1476
+ Q = ggml_reshape_3d(ctx0, Q, d_head, num_query, n_head * batch_size);
1477
+ K = ggml_reshape_4d(ctx0, K, d_head, n_head, num_positions, batch_size);
1478
+ K = ggml_cont(ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3));
1479
+ K = ggml_reshape_3d(ctx0, K, d_head, num_positions, n_head * batch_size);
1480
+ V = ggml_reshape_4d(ctx0, V, d_head, n_head, num_positions, batch_size);
1481
+ V = ggml_cont(ctx0, ggml_permute(ctx0, V, 1, 2, 0, 3));
1482
+ V = ggml_reshape_3d(ctx0, V, num_positions, d_head, n_head * batch_size);
1483
+ struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
1484
+ KQ = ggml_soft_max_ext(ctx0, KQ, nullptr, 1.0f / sqrtf((float)d_head), 0.0f);
1485
+ struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ);
1486
+ KQV = ggml_reshape_4d(ctx0, KQV, d_head, num_query, n_head, batch_size);
1487
+ KQV = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
1488
+ KQV = ggml_cont_3d(ctx0, KQV, hidden_size, num_query, batch_size);
1280
1489
 
1281
- struct ggml_tensor * Q = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_q_w, q), model.mm_model_attn_q_b);
1282
- Q = ggml_scale_inplace(ctx0, Q, 1.0f / sqrt((float)d_head));
1283
- struct ggml_tensor * K = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_k_w, k), model.mm_model_attn_k_b);
1284
- struct ggml_tensor * V = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_v_w, v), model.mm_model_attn_v_b);
1285
- // permute
1286
- Q = ggml_reshape_4d(ctx0, Q, d_head, n_head, num_query, batch_size);
1287
- Q = ggml_cont(ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3));
1288
- Q = ggml_reshape_3d(ctx0, Q, d_head, num_query, n_head * batch_size);
1289
- K = ggml_reshape_4d(ctx0, K, d_head, n_head, num_positions, batch_size);
1290
- K = ggml_cont(ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3));
1291
- K = ggml_reshape_3d(ctx0, K, d_head, num_positions, n_head * batch_size);
1292
- V = ggml_reshape_4d(ctx0, V, d_head, n_head, num_positions, batch_size);
1293
- V = ggml_cont(ctx0, ggml_permute(ctx0, V, 1, 2, 0, 3));
1294
- V = ggml_reshape_3d(ctx0, V, num_positions, d_head, n_head * batch_size);
1295
- struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
1296
- KQ = ggml_soft_max_inplace(ctx0, KQ);
1297
- struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ);
1298
- KQV = ggml_reshape_4d(ctx0, KQV, d_head, num_query, n_head, batch_size);
1299
- KQV = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
1300
- KQV = ggml_cont_3d(ctx0, KQV, hidden_size, num_query, batch_size);
1301
-
1302
- embeddings = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_o_w, KQV), model.mm_model_attn_o_b);
1303
- }
1304
- { // layernorm
1305
- embeddings = ggml_norm(ctx0, embeddings, eps);
1306
- embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_model_ln_post_w), model.mm_model_ln_post_b);
1307
- }
1308
- embeddings = ggml_mul_mat(ctx0, model.mm_model_proj, embeddings);
1490
+ embeddings = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_o_w, KQV), model.mm_model_attn_o_b);
1309
1491
  }
1310
- else {
1311
- GGML_ASSERT(false);
1492
+ { // layernorm
1493
+ embeddings = ggml_norm(ctx0, embeddings, eps);
1494
+ embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_model_ln_post_w), model.mm_model_ln_post_b);
1312
1495
  }
1496
+ embeddings = ggml_mul_mat(ctx0, model.mm_model_proj, embeddings);
1313
1497
  }
1498
+
1314
1499
  // glm projector
1315
- else if (ctx->has_glm_projector) {
1316
- if (ctx->proj_type == PROJECTOR_TYPE_GLM_EDGE) {
1317
- size_t gridsz = (size_t)sqrt(embeddings->ne[1]);
1318
- embeddings = ggml_cont(ctx0, ggml_permute(ctx0,embeddings,1,0,2,3));
1319
- embeddings = ggml_reshape_3d(ctx0, embeddings, gridsz, gridsz, embeddings->ne[1]);
1320
- embeddings = ggml_conv_2d(ctx0, model.mm_model_adapter_conv_w, embeddings, 2, 2, 0, 0, 1, 1);
1321
- embeddings = ggml_reshape_3d(ctx0, embeddings,embeddings->ne[0]*embeddings->ne[1] , embeddings->ne[2], batch_size);
1322
- embeddings = ggml_cont(ctx0, ggml_permute(ctx0,embeddings, 1, 0, 2, 3));
1323
- embeddings = ggml_add(ctx0, embeddings, model.mm_model_adapter_conv_b);
1324
- //GLU
1325
- {
1326
- embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_0_w, embeddings);
1327
- embeddings = ggml_norm(ctx0, embeddings, eps);
1328
- embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_model_ln_q_w), model.mm_model_ln_q_b);
1329
- embeddings = ggml_gelu_inplace(ctx0, embeddings);
1330
- struct ggml_tensor * x = embeddings;
1331
- embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_2_w, embeddings);
1332
- x = ggml_mul_mat(ctx0, model.mm_model_mlp_1_w,x);
1333
- embeddings = ggml_silu_inplace(ctx0, embeddings);
1334
- embeddings = ggml_mul(ctx0, embeddings,x);
1335
- embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_3_w, embeddings);
1336
- }
1337
- } else {
1338
- GGML_ABORT("fatel error");
1500
+ else if (ctx->proj_type == PROJECTOR_TYPE_GLM_EDGE) {
1501
+ size_t gridsz = (size_t)sqrt(embeddings->ne[1]);
1502
+ embeddings = ggml_cont(ctx0, ggml_permute(ctx0,embeddings,1,0,2,3));
1503
+ embeddings = ggml_reshape_3d(ctx0, embeddings, gridsz, gridsz, embeddings->ne[1]);
1504
+ embeddings = ggml_conv_2d(ctx0, model.mm_model_adapter_conv_w, embeddings, 2, 2, 0, 0, 1, 1);
1505
+ embeddings = ggml_reshape_3d(ctx0, embeddings,embeddings->ne[0]*embeddings->ne[1] , embeddings->ne[2], batch_size);
1506
+ embeddings = ggml_cont(ctx0, ggml_permute(ctx0,embeddings, 1, 0, 2, 3));
1507
+ embeddings = ggml_add(ctx0, embeddings, model.mm_model_adapter_conv_b);
1508
+ // GLU
1509
+ {
1510
+ embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_0_w, embeddings);
1511
+ embeddings = ggml_norm(ctx0, embeddings, eps);
1512
+ embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_model_ln_q_w), model.mm_model_ln_q_b);
1513
+ embeddings = ggml_gelu_inplace(ctx0, embeddings);
1514
+ struct ggml_tensor * x = embeddings;
1515
+ embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_2_w, embeddings);
1516
+ x = ggml_mul_mat(ctx0, model.mm_model_mlp_1_w,x);
1517
+ embeddings = ggml_silu_inplace(ctx0, embeddings);
1518
+ embeddings = ggml_mul(ctx0, embeddings,x);
1519
+ embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_3_w, embeddings);
1339
1520
  }
1340
1521
  }
1341
- else if (ctx->proj_type == PROJECTOR_TYPE_MERGER) {
1522
+
1523
+ else if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL) {
1342
1524
  embeddings = ggml_reshape_3d(ctx0, embeddings, hidden_size * 4, num_positions / 4, batch_size);
1343
1525
 
1344
1526
  embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
@@ -1355,561 +1537,493 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im
1355
1537
  // build the graph
1356
1538
  ggml_build_forward_expand(gf, embeddings);
1357
1539
 
1358
- ggml_free(ctx0);
1359
-
1360
1540
  return gf;
1361
1541
  }
1362
1542
 
1363
- static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch * imgs, struct clip_image_size * load_image_size, bool is_inf = false) {
1364
- if (ctx->proj_type == PROJECTOR_TYPE_GEMMA3) {
1365
- return clip_image_build_graph_siglip(ctx, imgs);
1366
- } else {
1367
- // TODO: we should have one build_* function per model
1368
- return clip_image_build_graph_legacy(ctx, imgs, load_image_size, is_inf);
1543
+ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch & imgs, struct clip_image_size load_image_size, bool is_inf = false) {
1544
+ ggml_cgraph * res;
1545
+ switch (ctx->proj_type) {
1546
+ case PROJECTOR_TYPE_GEMMA3:
1547
+ case PROJECTOR_TYPE_IDEFICS3:
1548
+ {
1549
+ GGML_ASSERT(imgs.entries.size() == 1);
1550
+ res = clip_image_build_graph_siglip(ctx, *imgs.entries[0]);
1551
+ } break;
1552
+ case PROJECTOR_TYPE_PIXTRAL:
1553
+ {
1554
+ GGML_ASSERT(imgs.entries.size() == 1);
1555
+ res = clip_image_build_graph_pixtral(ctx, *imgs.entries[0]);
1556
+ } break;
1557
+ case PROJECTOR_TYPE_QWEN25VL:
1558
+ {
1559
+ res = clip_image_build_graph_qwen25vl(ctx, imgs);
1560
+ } break;
1561
+ default:
1562
+ {
1563
+ // TODO: we should have one build_* function per model
1564
+ res = clip_image_build_graph_legacy(ctx, imgs, load_image_size, is_inf);
1565
+ } break;
1369
1566
  }
1567
+ return res;
1370
1568
  }
1371
1569
 
1372
- // read and create ggml_context containing the tensors and their data
1373
- struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
1374
- return clip_init(fname, clip_context_params{
1375
- /* use_gpu */ true,
1376
- /* verbosity */ verbosity,
1377
- });
1378
- }
1570
+ struct clip_model_loader {
1571
+ ggml_context_ptr ctx_meta;
1572
+ gguf_context_ptr ctx_gguf;
1379
1573
 
1380
- struct clip_ctx * clip_init(const char * fname, struct clip_context_params ctx_params) {
1381
- int verbosity = ctx_params.verbosity;
1382
- struct ggml_context * meta = NULL;
1574
+ clip_ctx & ctx_clip;
1575
+ std::string fname;
1383
1576
 
1384
- struct gguf_init_params params = {
1385
- /*.no_alloc = */ true,
1386
- /*.ctx = */ &meta,
1387
- };
1577
+ size_t model_size = 0; // in bytes
1388
1578
 
1389
- struct gguf_context * ctx = gguf_init_from_file(fname, params);
1390
- if (!ctx) {
1391
- throw std::runtime_error(format("%s: failed to load CLIP model from %s. Does this file exist?\n", __func__, fname));
1392
- }
1393
-
1394
- if (verbosity >= 1) {
1395
- const int n_tensors = gguf_get_n_tensors(ctx);
1396
- const int n_kv = gguf_get_n_kv(ctx);
1397
- const int ftype = get_u32(ctx, KEY_FTYPE);
1398
- const std::string ftype_str = get_ftype(ftype);
1399
- const int idx_desc = get_key_idx(ctx, KEY_DESCRIPTION);
1400
- const std::string description = gguf_get_val_str(ctx, idx_desc);
1401
- const int idx_name = gguf_find_key(ctx, KEY_NAME);
1402
- if (idx_name != -1) { // make name optional temporarily as some of the uploaded models missing it due to a bug
1403
- const std::string name = gguf_get_val_str(ctx, idx_name);
1404
- LOG_INF("%s: model name: %s\n", __func__, name.c_str());
1405
- }
1406
- LOG_INF("%s: description: %s\n", __func__, description.c_str());
1407
- LOG_INF("%s: GGUF version: %d\n", __func__, gguf_get_version(ctx));
1408
- LOG_INF("%s: alignment: %zu\n", __func__, gguf_get_alignment(ctx));
1409
- LOG_INF("%s: n_tensors: %d\n", __func__, n_tensors);
1410
- LOG_INF("%s: n_kv: %d\n", __func__, n_kv);
1411
- LOG_INF("%s: ftype: %s\n", __func__, ftype_str.c_str());
1412
- LOG_INF("\n");
1413
- }
1414
- const int n_tensors = gguf_get_n_tensors(ctx);
1415
-
1416
- // kv
1417
- const int n_kv = gguf_get_n_kv(ctx);
1418
- LOG_INF("%s: loaded meta data with %d key-value pairs and %d tensors from %s\n",
1419
- __func__, n_kv, n_tensors, fname);
1420
- {
1421
- std::map<enum ggml_type, uint32_t> n_type;
1579
+ // TODO @ngxson : we should not pass clip_ctx here, it should be clip_vision_model
1580
+ clip_model_loader(const char * fname, clip_ctx & ctx_clip) : ctx_clip(ctx_clip), fname(fname) {
1581
+ struct ggml_context * meta = nullptr;
1422
1582
 
1423
- for (int i = 0; i < n_tensors; i++) {
1424
- enum ggml_type type = gguf_get_tensor_type(ctx, i);
1583
+ struct gguf_init_params params = {
1584
+ /*.no_alloc = */ true,
1585
+ /*.ctx = */ &meta,
1586
+ };
1425
1587
 
1426
- n_type[type]++;
1588
+ ctx_gguf = gguf_context_ptr(gguf_init_from_file(fname, params));
1589
+ if (!ctx_gguf.get()) {
1590
+ throw std::runtime_error(string_format("%s: failed to load CLIP model from %s. Does this file exist?\n", __func__, fname));
1427
1591
  }
1428
1592
 
1429
- LOG_INF("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__);
1430
- for (int i = 0; i < n_kv; i++) {
1431
- const char * name = gguf_get_key(ctx, i);
1432
- const enum gguf_type type = gguf_get_kv_type(ctx, i);
1433
- const std::string type_name =
1434
- type == GGUF_TYPE_ARRAY
1435
- ? format("%s[%s,%d]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(ctx, i)), gguf_get_arr_n(ctx, i))
1436
- : gguf_type_name(type);
1593
+ ctx_meta.reset(meta);
1437
1594
 
1438
- std::string value = gguf_kv_to_str(ctx, i);
1439
- const size_t MAX_VALUE_LEN = 40;
1440
- if (value.size() > MAX_VALUE_LEN) {
1441
- value = format("%s...", value.substr(0, MAX_VALUE_LEN - 3).c_str());
1442
- }
1443
- replace_all(value, "\n", "\\n");
1444
-
1445
- LOG_INF("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), value.c_str());
1446
- }
1447
-
1448
- // print type counts
1449
- for (auto & kv : n_type) {
1450
- if (kv.second == 0) {
1451
- continue;
1452
- }
1595
+ const int n_tensors = gguf_get_n_tensors(ctx_gguf.get());
1453
1596
 
1454
- LOG_INF("%s: - type %4s: %4d tensors\n", __func__, ggml_type_name(kv.first), kv.second);
1597
+ // print gguf info
1598
+ {
1599
+ std::string name;
1600
+ get_string(KEY_NAME, name, false);
1601
+ std::string description;
1602
+ get_string(KEY_DESCRIPTION, description, false);
1603
+ LOG_INF("%s: model name: %s\n", __func__, name.c_str());
1604
+ LOG_INF("%s: description: %s\n", __func__, description.c_str());
1605
+ LOG_INF("%s: GGUF version: %d\n", __func__, gguf_get_version(ctx_gguf.get()));
1606
+ LOG_INF("%s: alignment: %zu\n", __func__, gguf_get_alignment(ctx_gguf.get()));
1607
+ LOG_INF("%s: n_tensors: %d\n", __func__, n_tensors);
1608
+ LOG_INF("%s: n_kv: %d\n", __func__, (int)gguf_get_n_kv(ctx_gguf.get()));
1609
+ LOG_INF("\n");
1455
1610
  }
1456
- }
1457
1611
 
1458
- // data
1459
- size_t model_size = 0;
1460
- {
1461
- for (int i = 0; i < n_tensors; ++i) {
1462
- const char * name = gguf_get_tensor_name(ctx, i);
1463
- const size_t offset = gguf_get_tensor_offset(ctx, i);
1464
- enum ggml_type type = gguf_get_tensor_type(ctx, i);
1465
- struct ggml_tensor * cur = ggml_get_tensor(meta, name);
1466
- size_t tensor_size = ggml_nbytes(cur);
1467
- model_size += tensor_size;
1468
- if (verbosity >= 3) {
1469
- LOG_INF("%s: tensor[%d]: n_dims = %d, name = %s, tensor_size=%zu, offset=%zu, shape:[%" PRIu64 ", %" PRIu64 ", %" PRIu64 ", %" PRIu64 "], type = %s\n",
1470
- __func__, i, ggml_n_dims(cur), cur->name, tensor_size, offset, cur->ne[0], cur->ne[1], cur->ne[2], cur->ne[3], ggml_type_name(type));
1612
+ // tensors
1613
+ {
1614
+ for (int i = 0; i < n_tensors; ++i) {
1615
+ const char * name = gguf_get_tensor_name(ctx_gguf.get(), i);
1616
+ const size_t offset = gguf_get_tensor_offset(ctx_gguf.get(), i);
1617
+ enum ggml_type type = gguf_get_tensor_type(ctx_gguf.get(), i);
1618
+ struct ggml_tensor * cur = ggml_get_tensor(meta, name);
1619
+ size_t tensor_size = ggml_nbytes(cur);
1620
+ model_size += tensor_size;
1621
+ LOG_DBG("%s: tensor[%d]: n_dims = %d, name = %s, tensor_size=%zu, offset=%zu, shape:[%" PRIu64 ", %" PRIu64 ", %" PRIu64 ", %" PRIu64 "], type = %s\n",
1622
+ __func__, i, ggml_n_dims(cur), cur->name, tensor_size, offset, cur->ne[0], cur->ne[1], cur->ne[2], cur->ne[3], ggml_type_name(type));
1471
1623
  }
1472
1624
  }
1473
1625
  }
1474
1626
 
1475
- clip_ctx * new_clip = new clip_ctx(ctx_params);
1476
-
1477
- // update projector type
1478
- {
1479
- int idx = gguf_find_key(ctx, KEY_PROJ_TYPE);
1480
- if (idx != -1) {
1481
- const std::string proj_type = gguf_get_val_str(ctx, idx);
1482
- new_clip->proj_type = clip_projector_type_from_string(proj_type);
1483
- } else {
1484
- new_clip->proj_type = PROJECTOR_TYPE_MLP;
1485
- }
1627
+ void load_hparams() {
1628
+ auto & hparams = ctx_clip.vision_model.hparams;
1486
1629
 
1487
- if (new_clip->proj_type == PROJECTOR_TYPE_MLP) {
1488
- if (gguf_find_tensor(ctx, format(TN_LLAVA_PROJ, 3, "weight").c_str()) != -1) {
1489
- new_clip->proj_type = PROJECTOR_TYPE_MLP_NORM;
1630
+ // projector type
1631
+ std::string proj_type;
1632
+ {
1633
+ get_string(KEY_PROJ_TYPE, proj_type, false);
1634
+ if (!proj_type.empty()) {
1635
+ ctx_clip.proj_type = clip_projector_type_from_string(proj_type);
1636
+ }
1637
+ if (ctx_clip.proj_type == PROJECTOR_TYPE_UNKNOWN) {
1638
+ throw std::runtime_error(string_format("%s: unknown projector type: %s\n", __func__, proj_type.c_str()));
1490
1639
  }
1491
1640
  }
1492
- }
1493
-
1494
- // model size and capabilities
1495
- {
1496
- int idx = get_key_idx(ctx, KEY_HAS_TEXT_ENC);
1497
- new_clip->has_text_encoder = gguf_get_val_bool(ctx, idx);
1498
1641
 
1499
- idx = get_key_idx(ctx, KEY_HAS_VIS_ENC);
1500
- new_clip->has_vision_encoder = gguf_get_val_bool(ctx, idx);
1642
+ // other hparams
1643
+ {
1644
+ get_i32(KEY_MINICPMV_VERSION, ctx_clip.minicpmv_version, false);
1645
+
1646
+ get_bool(KEY_USE_GELU, ctx_clip.use_gelu, false);
1647
+ get_bool(KEY_USE_SILU, ctx_clip.use_silu, false);
1648
+
1649
+ get_u32(KEY_N_EMBD, hparams.hidden_size);
1650
+ get_u32(KEY_N_HEAD, hparams.n_head);
1651
+ get_u32(KEY_N_FF, hparams.n_intermediate);
1652
+ get_u32(KEY_N_BLOCK, hparams.n_layer);
1653
+ get_u32(KEY_PROJ_DIM, hparams.projection_dim);
1654
+ get_f32(KEY_LAYER_NORM_EPS, hparams.eps);
1655
+ get_u32(KEY_IMAGE_SIZE, hparams.image_size);
1656
+ get_u32(KEY_PATCH_SIZE, hparams.patch_size);
1657
+ get_u32(KEY_IMAGE_CROP_RESOLUTION, hparams.image_crop_resolution, false);
1658
+ get_arr_int(KEY_IMAGE_GRID_PINPOINTS, hparams.image_grid_pinpoints, false);
1659
+
1660
+ ctx_clip.has_llava_projector = ctx_clip.proj_type == PROJECTOR_TYPE_MLP
1661
+ || ctx_clip.proj_type == PROJECTOR_TYPE_MLP_NORM
1662
+ || ctx_clip.proj_type == PROJECTOR_TYPE_LDP
1663
+ || ctx_clip.proj_type == PROJECTOR_TYPE_LDPV2;
1501
1664
 
1502
- idx = gguf_find_key(ctx, KEY_HAS_LLAVA_PROJ);
1503
- if (idx != -1) {
1504
- new_clip->has_llava_projector = gguf_get_val_bool(ctx, idx);
1505
- }
1665
+ {
1666
+ std::string mm_patch_merge_type;
1667
+ get_string(KEY_MM_PATCH_MERGE_TYPE, mm_patch_merge_type, false);
1668
+ if (mm_patch_merge_type == "spatial_unpad") {
1669
+ hparams.mm_patch_merge_type = PATCH_MERGE_SPATIAL_UNPAD;
1670
+ }
1671
+ }
1506
1672
 
1507
- idx = gguf_find_key(ctx, KEY_HAS_MINICPMV_PROJ);
1508
- if (idx != -1) {
1509
- new_clip->has_minicpmv_projector = gguf_get_val_bool(ctx, idx);
1510
- }
1673
+ {
1674
+ int idx_mean = gguf_find_key(ctx_gguf.get(), KEY_IMAGE_MEAN);
1675
+ int idx_std = gguf_find_key(ctx_gguf.get(), KEY_IMAGE_STD);
1676
+ GGML_ASSERT(idx_mean >= 0 && "image_mean not found");
1677
+ GGML_ASSERT(idx_std >= 0 && "image_std not found");
1678
+ const float * mean_data = (const float *) gguf_get_arr_data(ctx_gguf.get(), idx_mean);
1679
+ const float * std_data = (const float *) gguf_get_arr_data(ctx_gguf.get(), idx_std);
1680
+ for (int i = 0; i < 3; ++i) {
1681
+ ctx_clip.image_mean[i] = mean_data[i];
1682
+ ctx_clip.image_std[i] = std_data[i];
1683
+ }
1684
+ }
1511
1685
 
1512
- idx = gguf_find_key(ctx, KEY_MINICPMV_VERSION);
1513
- if (idx != -1) {
1514
- new_clip->minicpmv_version = gguf_get_val_i32(ctx, idx);
1515
- }
1686
+ // Load the vision feature layer indices if they are explicitly provided;
1687
+ // if multiple vision feature layers are present, the values will be concatenated
1688
+ // to form the final visual features.
1689
+ // NOTE: gguf conversions should standardize the values of the vision feature layer to
1690
+ // be non-negative, since we use -1 to mark values as unset here.
1691
+ std::vector<int> vision_feature_layer;
1692
+ get_arr_int(KEY_FEATURE_LAYER, vision_feature_layer, false);
1693
+ // convert std::vector to std::unordered_set
1694
+ for (auto & layer : vision_feature_layer) {
1695
+ hparams.vision_feature_layer.insert(layer);
1696
+ }
1516
1697
 
1517
- idx = gguf_find_key(ctx, KEY_HAS_GLM_PROJ);
1518
- if (idx != -1) {
1519
- new_clip->has_glm_projector = gguf_get_val_bool(ctx, idx);
1520
- }
1698
+ // Calculate the deepest feature layer based on hparams and projector type
1699
+ // NOTE: This is only used by build_graph_legacy()
1700
+ {
1701
+ // Get the index of the second to last layer; this is the default for models that have a llava projector
1702
+ int n_layer = hparams.n_layer - 1;
1703
+ int deepest_feature_layer = -1;
1704
+
1705
+ if (ctx_clip.proj_type == PROJECTOR_TYPE_MINICPMV
1706
+ || ctx_clip.proj_type == PROJECTOR_TYPE_GLM_EDGE
1707
+ || ctx_clip.proj_type == PROJECTOR_TYPE_QWEN2VL
1708
+ || ctx_clip.proj_type == PROJECTOR_TYPE_QWEN25VL) {
1709
+ n_layer += 1;
1710
+ }
1521
1711
 
1522
- idx = gguf_find_key(ctx, KEY_HAS_QWEN2VL_MERGER);
1523
- if (idx != -1) {
1524
- new_clip->has_qwen2vl_merger = gguf_get_val_bool(ctx, idx);
1525
- }
1526
- // GGML_ASSERT(new_clip->has_llava_projector); // see monatis/clip.cpp for image and/or text encoding for semantic search
1712
+ // If we set explicit vision feature layers, only go up to the deepest one
1713
+ // NOTE: only used by granite-vision models for now
1714
+ for (const auto & feature_layer : hparams.vision_feature_layer) {
1715
+ if (feature_layer > deepest_feature_layer) {
1716
+ deepest_feature_layer = feature_layer;
1717
+ }
1718
+ }
1719
+ ctx_clip.max_feature_layer = deepest_feature_layer < 0 ? n_layer : deepest_feature_layer;
1720
+ }
1527
1721
 
1528
- GGML_ASSERT(new_clip->has_vision_encoder);
1529
- GGML_ASSERT(!new_clip->has_text_encoder);
1722
+ // model-specific params
1723
+ switch (ctx_clip.proj_type) {
1724
+ case PROJECTOR_TYPE_MINICPMV:
1725
+ {
1726
+ if (ctx_clip.minicpmv_version == 0) {
1727
+ ctx_clip.minicpmv_version = 2; // default to 2 if not set
1728
+ }
1729
+ } break;
1730
+ case PROJECTOR_TYPE_IDEFICS3:
1731
+ {
1732
+ get_u32(KEY_PROJ_SCALE_FACTOR, hparams.proj_scale_factor, false);
1733
+ } break;
1734
+ case PROJECTOR_TYPE_PIXTRAL:
1735
+ {
1736
+ hparams.rope_theta = 10000.0f;
1737
+ } break;
1738
+ case PROJECTOR_TYPE_QWEN25VL:
1739
+ {
1740
+ get_u32(KEY_WIN_ATTN_PATTERN, hparams.n_wa_pattern);
1741
+ } break;
1742
+ default:
1743
+ break;
1744
+ }
1530
1745
 
1531
- try {
1532
- idx = get_key_idx(ctx, KEY_USE_GELU);
1533
- new_clip->use_gelu = gguf_get_val_bool(ctx, idx);
1534
- } catch (std::runtime_error & /*e*/) {
1535
- new_clip->use_gelu = false;
1746
+ LOG_INF("%s: projector: %s\n", __func__, proj_type.c_str());
1747
+ LOG_INF("%s: has_llava_proj: %d\n", __func__, ctx_clip.has_llava_projector);
1748
+ LOG_INF("%s: minicpmv_version: %d\n", __func__, ctx_clip.minicpmv_version);
1749
+ LOG_INF("%s: proj_scale_factor: %d\n", __func__, hparams.proj_scale_factor);
1750
+ LOG_INF("%s: n_wa_pattern: %d\n", __func__, hparams.n_wa_pattern);
1751
+ LOG_INF("%s: use_silu: %d\n", __func__, ctx_clip.use_silu);
1752
+ LOG_INF("%s: use_gelu: %d\n", __func__, ctx_clip.use_gelu);
1753
+ LOG_INF("%s: model size: %.2f MiB\n", __func__, model_size / 1024.0 / 1024.0);
1754
+ LOG_INF("%s: metadata size: %.2f MiB\n", __func__, ggml_get_mem_size(ctx_meta.get()) / 1024.0 / 1024.0);
1536
1755
  }
1756
+ }
1537
1757
 
1538
- try {
1539
- idx = get_key_idx(ctx, KEY_USE_SILU);
1540
- new_clip->use_silu = gguf_get_val_bool(ctx, idx);
1541
- } catch (std::runtime_error & /*e*/) {
1542
- new_clip->use_silu = false;
1543
- }
1758
+ void load_tensors() {
1759
+ std::map<std::string, size_t> tensor_offset;
1760
+ std::vector<ggml_tensor *> tensors_to_load;
1544
1761
 
1545
- if (verbosity >= 1) {
1546
- LOG_INF("%s: text_encoder: %d\n", __func__, new_clip->has_text_encoder);
1547
- LOG_INF("%s: vision_encoder: %d\n", __func__, new_clip->has_vision_encoder);
1548
- LOG_INF("%s: llava_projector: %d\n", __func__, new_clip->has_llava_projector);
1549
- LOG_INF("%s: minicpmv_projector: %d\n", __func__, new_clip->has_minicpmv_projector);
1550
- LOG_INF("%s: minicpmv_version: %d\n", __func__, new_clip->minicpmv_version);
1551
- LOG_INF("%s: glm_projector: %d\n", __func__, new_clip->has_glm_projector);
1552
- LOG_INF("%s: model size: %.2f MB\n", __func__, model_size / 1024.0 / 1024.0);
1553
- LOG_INF("%s: metadata size: %.2f MB\n", __func__, ggml_get_mem_size(meta) / 1024.0 / 1024.0);
1762
+ // get offsets
1763
+ for (int64_t i = 0; i < gguf_get_n_tensors(ctx_gguf.get()); ++i) {
1764
+ const char * name = gguf_get_tensor_name(ctx_gguf.get(), i);
1765
+ tensor_offset[name] = gguf_get_data_offset(ctx_gguf.get()) + gguf_get_tensor_offset(ctx_gguf.get(), i);
1554
1766
  }
1555
- }
1556
-
1557
- LOG_INF("%s: params backend buffer size = % 6.2f MB (%i tensors)\n", __func__, model_size / (1024.0 * 1024.0), n_tensors);
1558
1767
 
1559
- // load tensors
1560
- {
1561
- std::vector<uint8_t> read_buf;
1768
+ // create data context
1562
1769
  struct ggml_init_params params = {
1563
- /*.mem_size =*/ (n_tensors + 1) * ggml_tensor_overhead(),
1770
+ /*.mem_size =*/ (gguf_get_n_tensors(ctx_gguf.get()) + 1) * ggml_tensor_overhead(),
1564
1771
  /*.mem_buffer =*/ NULL,
1565
1772
  /*.no_alloc =*/ true,
1566
1773
  };
1774
+ ctx_clip.ctx_data.reset(ggml_init(params));
1775
+ if (!ctx_clip.ctx_data) {
1776
+ throw std::runtime_error(string_format("%s: failed to init ggml context\n", __func__));
1777
+ }
1567
1778
 
1568
- new_clip->ctx_data = ggml_init(params);
1569
- if (!new_clip->ctx_data) {
1570
- LOG_ERR("%s: ggml_init() failed\n", __func__);
1571
- clip_free(new_clip);
1572
- gguf_free(ctx);
1573
- return nullptr;
1574
- }
1575
-
1576
- auto fin = std::ifstream(fname, std::ios::binary);
1577
- if (!fin) {
1578
- LOG_ERR("cannot open model file for loading tensors\n");
1579
- clip_free(new_clip);
1580
- gguf_free(ctx);
1581
- return nullptr;
1582
- }
1583
-
1584
- // add tensors to context
1585
- for (int i = 0; i < n_tensors; ++i) {
1586
- const char * name = gguf_get_tensor_name(ctx, i);
1587
- struct ggml_tensor * t = ggml_get_tensor(meta, name);
1588
- struct ggml_tensor * cur = ggml_dup_tensor(new_clip->ctx_data, t);
1589
- ggml_set_name(cur, name);
1590
- }
1591
-
1592
- // alloc memory and offload data
1593
- ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(new_clip->backend);
1594
- new_clip->buf = ggml_backend_alloc_ctx_tensors_from_buft(new_clip->ctx_data, buft);
1595
- ggml_backend_buffer_set_usage(new_clip->buf, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
1596
- for (int i = 0; i < n_tensors; ++i) {
1597
- const char * name = gguf_get_tensor_name(ctx, i);
1598
- struct ggml_tensor * cur = ggml_get_tensor(new_clip->ctx_data, name);
1599
- const size_t offset = gguf_get_data_offset(ctx) + gguf_get_tensor_offset(ctx, i);
1600
- fin.seekg(offset, std::ios::beg);
1601
- if (!fin) {
1602
- LOG_ERR("%s: failed to seek for tensor %s\n", __func__, name);
1603
- clip_free(new_clip);
1604
- gguf_free(ctx);
1605
- return nullptr;
1606
- }
1607
- int num_bytes = ggml_nbytes(cur);
1608
- if (ggml_backend_buft_is_host(buft)) {
1609
- // for the CPU and Metal backend, we can read directly into the tensor
1610
- fin.read(reinterpret_cast<char *>(cur->data), num_bytes);
1611
- } else {
1612
- // read into a temporary buffer first, then copy to device memory
1613
- read_buf.resize(num_bytes);
1614
- fin.read(reinterpret_cast<char *>(read_buf.data()), num_bytes);
1615
- ggml_backend_tensor_set(cur, read_buf.data(), 0, num_bytes);
1779
+ // helper function
1780
+ auto get_tensor = [&](const std::string & name, bool required = true) {
1781
+ struct ggml_tensor * cur = ggml_get_tensor(ctx_meta.get(), name.c_str());
1782
+ if (!cur && required) {
1783
+ throw std::runtime_error(string_format("%s: unable to find tensor %s\n", __func__, name.c_str()));
1616
1784
  }
1617
- }
1618
- fin.close();
1619
- }
1620
-
1621
- // vision model
1622
- if (new_clip->has_vision_encoder) {
1623
- // load vision model
1624
- auto & vision_model = new_clip->vision_model;
1625
- auto & hparams = vision_model.hparams;
1626
- hparams.hidden_size = get_u32(ctx, format(KEY_N_EMBD, "vision"));
1627
- hparams.n_head = get_u32(ctx, format(KEY_N_HEAD, "vision"));
1628
- hparams.n_intermediate = get_u32(ctx, format(KEY_N_FF, "vision"));
1629
- hparams.n_layer = get_u32(ctx, format(KEY_N_BLOCK, "vision"));
1630
- hparams.image_size = get_u32(ctx, KEY_IMAGE_SIZE);
1631
- hparams.patch_size = get_u32(ctx, KEY_PATCH_SIZE);
1632
- hparams.projection_dim = get_u32(ctx, format(KEY_PROJ_DIM, "vision"));
1633
- hparams.eps = get_f32(ctx, format(KEY_LAYER_NORM_EPS, "vision"));
1634
-
1635
- try {
1636
- int idx = get_key_idx(ctx, KEY_IMAGE_GRID_PINPOINTS);
1637
- int n = gguf_get_arr_n(ctx, idx);
1638
- const int32_t * pinpoints = (const int32_t *)gguf_get_arr_data(ctx, idx);
1639
- for (int i = 0; i < n; ++i) {
1640
- hparams.image_grid_pinpoints.push_back(pinpoints[i]);
1785
+ if (cur) {
1786
+ tensors_to_load.push_back(cur);
1787
+ // add tensors to context
1788
+ struct ggml_tensor * data_tensor = ggml_dup_tensor(ctx_clip.ctx_data.get(), cur);
1789
+ ggml_set_name(data_tensor, cur->name);
1790
+ cur = data_tensor;
1641
1791
  }
1642
- } catch (std::runtime_error & /*e*/) { }
1792
+ return cur;
1793
+ };
1643
1794
 
1644
- // Load the vision feature layer indices if they are explicitly provided;
1645
- // if multiple vision feature layers are present, the values will be concatenated
1646
- // to form the final visual features.
1647
- // NOTE: gguf conversions should standardize the values of the vision feature layer to
1648
- // be non-negative, since we use -1 to mark values as unset here.
1649
- try {
1650
- int idx = get_key_idx(ctx, KEY_FEATURE_LAYER);
1651
- int n = gguf_get_arr_n(ctx, idx);
1795
+ auto & vision_model = ctx_clip.vision_model;
1652
1796
 
1653
- const int32_t * vision_feature_layer = (const int32_t *)gguf_get_arr_data(ctx, idx);
1797
+ vision_model.class_embedding = get_tensor(TN_CLASS_EMBD, false);
1654
1798
 
1655
- for (int i = 0; i < n; ++i) {
1656
- hparams.vision_feature_layer.insert(vision_feature_layer[i]);
1657
- }
1658
- } catch (std::runtime_error & /*e*/) { }
1659
-
1660
- try {
1661
- int idx = get_key_idx(ctx, KEY_MM_PATCH_MERGE_TYPE);
1662
- strcpy(hparams.mm_patch_merge_type, gguf_get_val_str(ctx, idx));
1663
- } catch (std::runtime_error & /*e*/) {
1664
- strcpy(hparams.mm_patch_merge_type, "flat");
1665
- }
1666
-
1667
- try {
1668
- hparams.image_crop_resolution = get_u32(ctx, KEY_IMAGE_CROP_RESOLUTION); // llava-1.6
1669
- } catch(const std::exception& /*e*/) {
1670
- hparams.image_crop_resolution = hparams.image_size;
1671
- }
1672
-
1673
- int idx_mean = get_key_idx(ctx, KEY_IMAGE_MEAN);
1674
- int idx_std = get_key_idx(ctx, KEY_IMAGE_STD);
1675
-
1676
- const float * mean_data = (const float *)gguf_get_arr_data(ctx, idx_mean);
1677
- const float * std_data = (const float *)gguf_get_arr_data(ctx, idx_std);
1678
-
1679
- for (int i = 0; i < 3; ++i) {
1680
- new_clip->image_mean[i] = mean_data[i];
1681
- new_clip->image_std[i] = std_data[i];
1682
- }
1683
-
1684
- // Calculate the deepest feature layer based on hparams and projector type
1685
- new_clip->max_feature_layer = get_deepest_feature_layer(new_clip);
1686
-
1687
- if (verbosity >= 2) {
1688
- LOG_INF("\n%s: vision model hparams\n", __func__);
1689
- LOG_INF("image_size %d\n", hparams.image_size);
1690
- LOG_INF("patch_size %d\n", hparams.patch_size);
1691
- LOG_INF("v_hidden_size %d\n", hparams.hidden_size);
1692
- LOG_INF("v_n_intermediate %d\n", hparams.n_intermediate);
1693
- LOG_INF("v_projection_dim %d\n", hparams.projection_dim);
1694
- LOG_INF("v_n_head %d\n", hparams.n_head);
1695
- LOG_INF("v_n_layer %d\n", hparams.n_layer);
1696
- LOG_INF("v_eps %f\n", hparams.eps);
1697
- LOG_INF("v_image_mean %f %f %f\n", new_clip->image_mean[0], new_clip->image_mean[1], new_clip->image_mean[2]);
1698
- LOG_INF("v_image_std %f %f %f\n", new_clip->image_std[0], new_clip->image_std[1], new_clip->image_std[2]);
1699
- LOG_INF("v_image_grid_pinpoints: ");
1700
- for (const auto & pp : hparams.image_grid_pinpoints) {
1701
- LOG_INF("%d ", pp);
1799
+ vision_model.pre_ln_w = get_tensor(string_format(TN_LN_PRE, "v", "weight"), false);
1800
+ vision_model.pre_ln_b = get_tensor(string_format(TN_LN_PRE, "v", "bias"), false);
1801
+
1802
+ vision_model.post_ln_w = get_tensor(string_format(TN_LN_POST, "v", "weight"), false);
1803
+ vision_model.post_ln_b = get_tensor(string_format(TN_LN_POST, "v", "bias"), false);
1804
+
1805
+ vision_model.patch_bias = get_tensor(TN_PATCH_BIAS, false);
1806
+ vision_model.patch_embeddings_0 = get_tensor(TN_PATCH_EMBD, false);
1807
+ vision_model.patch_embeddings_1 = get_tensor(TN_PATCH_EMBD_1, false);
1808
+
1809
+ vision_model.position_embeddings = get_tensor(string_format(TN_POS_EMBD, "v"), false);
1810
+
1811
+ // layers
1812
+ vision_model.layers.resize(vision_model.hparams.n_layer);
1813
+ for (int il = 0; il < vision_model.hparams.n_layer; ++il) {
1814
+ auto & layer = vision_model.layers[il];
1815
+ layer.k_w = get_tensor(string_format(TN_ATTN_K, "v", il, "weight"));
1816
+ layer.q_w = get_tensor(string_format(TN_ATTN_Q, "v", il, "weight"));
1817
+ layer.v_w = get_tensor(string_format(TN_ATTN_V, "v", il, "weight"));
1818
+ layer.o_w = get_tensor(string_format(TN_ATTN_OUTPUT, "v", il, "weight"));
1819
+ layer.ln_1_w = get_tensor(string_format(TN_LN_1, "v", il, "weight"), false);
1820
+ layer.ln_2_w = get_tensor(string_format(TN_LN_2, "v", il, "weight"), false);
1821
+ layer.k_b = get_tensor(string_format(TN_ATTN_K, "v", il, "bias"), false);
1822
+ layer.q_b = get_tensor(string_format(TN_ATTN_Q, "v", il, "bias"), false);
1823
+ layer.v_b = get_tensor(string_format(TN_ATTN_V, "v", il, "bias"), false);
1824
+ layer.o_b = get_tensor(string_format(TN_ATTN_OUTPUT, "v", il, "bias"), false);
1825
+ layer.ln_1_b = get_tensor(string_format(TN_LN_1, "v", il, "bias"), false);
1826
+ layer.ln_2_b = get_tensor(string_format(TN_LN_2, "v", il, "bias"), false);
1827
+
1828
+ // new naming
1829
+ layer.ff_up_w = get_tensor(string_format(TN_FFN_UP, "v", il, "weight"));
1830
+ layer.ff_up_b = get_tensor(string_format(TN_FFN_UP, "v", il, "bias"), false);
1831
+ layer.ff_gate_w = get_tensor(string_format(TN_FFN_GATE, "v", il, "weight"), false);
1832
+ layer.ff_gate_b = get_tensor(string_format(TN_FFN_GATE, "v", il, "bias"), false);
1833
+ layer.ff_down_w = get_tensor(string_format(TN_FFN_DOWN, "v", il, "weight"));
1834
+ layer.ff_down_b = get_tensor(string_format(TN_FFN_DOWN, "v", il, "bias"), false);
1835
+
1836
+ // legacy naming (the in and out are reversed! don't ask me why)
1837
+ layer.ff_i_w = layer.ff_down_w;
1838
+ layer.ff_o_w = layer.ff_up_w;
1839
+ layer.ff_g_w = layer.ff_gate_w;
1840
+ layer.ff_i_b = layer.ff_down_b;
1841
+ layer.ff_o_b = layer.ff_up_b;
1842
+ layer.ff_g_b = layer.ff_gate_b;
1843
+ }
1844
+
1845
+ switch (ctx_clip.proj_type) {
1846
+ case PROJECTOR_TYPE_MLP:
1847
+ case PROJECTOR_TYPE_MLP_NORM:
1848
+ {
1849
+ // LLaVA projection
1850
+ vision_model.mm_0_w = get_tensor(string_format(TN_LLAVA_PROJ, 0, "weight"), false);
1851
+ vision_model.mm_0_b = get_tensor(string_format(TN_LLAVA_PROJ, 0, "bias"), false);
1852
+ // Yi-type llava
1853
+ vision_model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 1, "weight"), false);
1854
+ vision_model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 1, "bias"), false);
1855
+ // missing in Yi-type llava
1856
+ vision_model.mm_2_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight"), false);
1857
+ vision_model.mm_2_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"), false);
1858
+ // Yi-type llava
1859
+ vision_model.mm_3_w = get_tensor(string_format(TN_LLAVA_PROJ, 3, "weight"), false);
1860
+ vision_model.mm_3_b = get_tensor(string_format(TN_LLAVA_PROJ, 3, "bias"), false);
1861
+ vision_model.mm_4_w = get_tensor(string_format(TN_LLAVA_PROJ, 4, "weight"), false);
1862
+ vision_model.mm_4_b = get_tensor(string_format(TN_LLAVA_PROJ, 4, "bias"), false);
1863
+ if (vision_model.mm_3_w) {
1864
+ // TODO: this is a hack to support Yi-type llava
1865
+ ctx_clip.proj_type = PROJECTOR_TYPE_MLP_NORM;
1866
+ }
1867
+ vision_model.image_newline = get_tensor(TN_IMAGE_NEWLINE, false);
1868
+ } break;
1869
+ case PROJECTOR_TYPE_LDP:
1870
+ {
1871
+ // MobileVLM projection
1872
+ vision_model.mm_model_mlp_1_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 1, "weight"));
1873
+ vision_model.mm_model_mlp_1_b = get_tensor(string_format(TN_MVLM_PROJ_MLP, 1, "bias"));
1874
+ vision_model.mm_model_mlp_3_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 3, "weight"));
1875
+ vision_model.mm_model_mlp_3_b = get_tensor(string_format(TN_MVLM_PROJ_MLP, 3, "bias"));
1876
+ vision_model.mm_model_block_1_block_0_0_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 0, "0.weight"));
1877
+ vision_model.mm_model_block_1_block_0_1_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 0, "1.weight"));
1878
+ vision_model.mm_model_block_1_block_0_1_b = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 0, "1.bias"));
1879
+ vision_model.mm_model_block_1_block_1_fc1_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc1.weight"));
1880
+ vision_model.mm_model_block_1_block_1_fc1_b = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc1.bias"));
1881
+ vision_model.mm_model_block_1_block_1_fc2_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc2.weight"));
1882
+ vision_model.mm_model_block_1_block_1_fc2_b = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc2.bias"));
1883
+ vision_model.mm_model_block_1_block_2_0_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 2, "0.weight"));
1884
+ vision_model.mm_model_block_1_block_2_1_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 2, "1.weight"));
1885
+ vision_model.mm_model_block_1_block_2_1_b = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 2, "1.bias"));
1886
+ vision_model.mm_model_block_2_block_0_0_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 0, "0.weight"));
1887
+ vision_model.mm_model_block_2_block_0_1_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 0, "1.weight"));
1888
+ vision_model.mm_model_block_2_block_0_1_b = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 0, "1.bias"));
1889
+ vision_model.mm_model_block_2_block_1_fc1_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc1.weight"));
1890
+ vision_model.mm_model_block_2_block_1_fc1_b = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc1.bias"));
1891
+ vision_model.mm_model_block_2_block_1_fc2_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc2.weight"));
1892
+ vision_model.mm_model_block_2_block_1_fc2_b = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc2.bias"));
1893
+ vision_model.mm_model_block_2_block_2_0_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 2, "0.weight"));
1894
+ vision_model.mm_model_block_2_block_2_1_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 2, "1.weight"));
1895
+ vision_model.mm_model_block_2_block_2_1_b = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 2, "1.bias"));
1896
+ } break;
1897
+ case PROJECTOR_TYPE_LDPV2:
1898
+ {
1899
+ // MobileVLM_V2 projection
1900
+ vision_model.mm_model_mlp_0_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 0, "weight"));
1901
+ vision_model.mm_model_mlp_0_b = get_tensor(string_format(TN_MVLM_PROJ_MLP, 0, "bias"));
1902
+ vision_model.mm_model_mlp_2_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 2, "weight"));
1903
+ vision_model.mm_model_mlp_2_b = get_tensor(string_format(TN_MVLM_PROJ_MLP, 2, "bias"));
1904
+ vision_model.mm_model_peg_0_w = get_tensor(string_format(TN_MVLM_PROJ_PEG, 0, "weight"));
1905
+ vision_model.mm_model_peg_0_b = get_tensor(string_format(TN_MVLM_PROJ_PEG, 0, "bias"));
1906
+ } break;
1907
+ case PROJECTOR_TYPE_MINICPMV:
1908
+ {
1909
+ // vision_model.mm_model_pos_embed = get_tensor(new_clip->ctx_data, TN_MINICPMV_POS_EMBD);
1910
+ vision_model.mm_model_pos_embed_k = get_tensor(TN_MINICPMV_POS_EMBD_K);
1911
+ vision_model.mm_model_query = get_tensor(TN_MINICPMV_QUERY);
1912
+ vision_model.mm_model_proj = get_tensor(TN_MINICPMV_PROJ);
1913
+ vision_model.mm_model_kv_proj = get_tensor(TN_MINICPMV_KV_PROJ);
1914
+ vision_model.mm_model_attn_q_w = get_tensor(string_format(TN_MINICPMV_ATTN, "q", "weight"));
1915
+ vision_model.mm_model_attn_k_w = get_tensor(string_format(TN_MINICPMV_ATTN, "k", "weight"));
1916
+ vision_model.mm_model_attn_v_w = get_tensor(string_format(TN_MINICPMV_ATTN, "v", "weight"));
1917
+ vision_model.mm_model_attn_q_b = get_tensor(string_format(TN_MINICPMV_ATTN, "q", "bias"));
1918
+ vision_model.mm_model_attn_k_b = get_tensor(string_format(TN_MINICPMV_ATTN, "k", "bias"));
1919
+ vision_model.mm_model_attn_v_b = get_tensor(string_format(TN_MINICPMV_ATTN, "v", "bias"));
1920
+ vision_model.mm_model_attn_o_w = get_tensor(string_format(TN_MINICPMV_ATTN, "out", "weight"));
1921
+ vision_model.mm_model_attn_o_b = get_tensor(string_format(TN_MINICPMV_ATTN, "out", "bias"));
1922
+ vision_model.mm_model_ln_q_w = get_tensor(string_format(TN_MINICPMV_LN, "q", "weight"));
1923
+ vision_model.mm_model_ln_q_b = get_tensor(string_format(TN_MINICPMV_LN, "q", "bias"));
1924
+ vision_model.mm_model_ln_kv_w = get_tensor(string_format(TN_MINICPMV_LN, "kv", "weight"));
1925
+ vision_model.mm_model_ln_kv_b = get_tensor(string_format(TN_MINICPMV_LN, "kv", "bias"));
1926
+ vision_model.mm_model_ln_post_w = get_tensor(string_format(TN_MINICPMV_LN, "post", "weight"));
1927
+ vision_model.mm_model_ln_post_b = get_tensor(string_format(TN_MINICPMV_LN, "post", "bias"));
1928
+ } break;
1929
+ case PROJECTOR_TYPE_GLM_EDGE:
1930
+ {
1931
+ vision_model.mm_model_adapter_conv_w = get_tensor(string_format(TN_GLM_ADAPER_CONV, "weight"));
1932
+ vision_model.mm_model_adapter_conv_b = get_tensor(string_format(TN_GLM_ADAPER_CONV, "bias"));
1933
+ vision_model.mm_model_mlp_0_w = get_tensor(string_format(TN_GLM_ADAPTER_LINEAR,"weight"));
1934
+ vision_model.mm_model_ln_q_w = get_tensor(string_format(TN_GLM_ADAPTER_NORM_1,"weight"));
1935
+ vision_model.mm_model_ln_q_b = get_tensor(string_format(TN_GLM_ADAPTER_NORM_1,"bias"));
1936
+ vision_model.mm_model_mlp_1_w = get_tensor(string_format(TN_GLM_ADAPTER_D_H_2_4H,"weight"));
1937
+ vision_model.mm_model_mlp_2_w = get_tensor(string_format(TN_GLM_ADAPTER_GATE,"weight"));
1938
+ vision_model.mm_model_mlp_3_w = get_tensor(string_format(TN_GLM_ADAPTER_D_4H_2_H,"weight"));
1939
+ } break;
1940
+ case PROJECTOR_TYPE_QWEN2VL:
1941
+ case PROJECTOR_TYPE_QWEN25VL:
1942
+ {
1943
+ vision_model.mm_0_w = get_tensor(string_format(TN_LLAVA_PROJ, 0, "weight"));
1944
+ vision_model.mm_0_b = get_tensor(string_format(TN_LLAVA_PROJ, 0, "bias"));
1945
+ vision_model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight"));
1946
+ vision_model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"));
1947
+ } break;
1948
+ case PROJECTOR_TYPE_GEMMA3:
1949
+ {
1950
+ vision_model.mm_input_proj_w = get_tensor(TN_MM_INP_PROJ);
1951
+ vision_model.mm_soft_emb_norm_w = get_tensor(TN_MM_SOFT_EMB_N);
1952
+ } break;
1953
+ case PROJECTOR_TYPE_IDEFICS3:
1954
+ {
1955
+ vision_model.projection = get_tensor(TN_MM_PROJECTOR);
1956
+ } break;
1957
+ case PROJECTOR_TYPE_PIXTRAL:
1958
+ {
1959
+ vision_model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 1, "weight"));
1960
+ vision_model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 1, "bias"));
1961
+ vision_model.mm_2_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight"));
1962
+ vision_model.mm_2_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"));
1963
+ // [IMG_BREAK] token embedding
1964
+ vision_model.token_embd_img_break = get_tensor(TN_TOK_IMG_BREAK);
1965
+ } break;
1966
+ default:
1967
+ GGML_ASSERT(false && "unknown projector type");
1968
+ }
1969
+
1970
+ // load data
1971
+ {
1972
+ std::vector<uint8_t> read_buf;
1973
+
1974
+ auto fin = std::ifstream(fname, std::ios::binary);
1975
+ if (!fin) {
1976
+ throw std::runtime_error(string_format("%s: failed to open %s\n", __func__, fname.c_str()));
1702
1977
  }
1703
- LOG_INF("\n");
1704
- LOG_INF("v_vision_feature_layer: ");
1705
- for (const auto & feature_layer: hparams.vision_feature_layer) {
1706
- LOG_INF("%d ", feature_layer);
1978
+
1979
+ // alloc memory and offload data
1980
+ ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(ctx_clip.backend);
1981
+ ctx_clip.buf.reset(ggml_backend_alloc_ctx_tensors_from_buft(ctx_clip.ctx_data.get(), buft));
1982
+ ggml_backend_buffer_set_usage(ctx_clip.buf.get(), GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
1983
+ for (auto & t : tensors_to_load) {
1984
+ struct ggml_tensor * cur = ggml_get_tensor(ctx_clip.ctx_data.get(), t->name);
1985
+ const size_t offset = tensor_offset[t->name];
1986
+ fin.seekg(offset, std::ios::beg);
1987
+ if (!fin) {
1988
+ throw std::runtime_error(string_format("%s: failed to seek for tensor %s\n", __func__, t->name));
1989
+ }
1990
+ size_t num_bytes = ggml_nbytes(cur);
1991
+ if (ggml_backend_buft_is_host(buft)) {
1992
+ // for the CPU and Metal backend, we can read directly into the tensor
1993
+ fin.read(reinterpret_cast<char *>(cur->data), num_bytes);
1994
+ } else {
1995
+ // read into a temporary buffer first, then copy to device memory
1996
+ read_buf.resize(num_bytes);
1997
+ fin.read(reinterpret_cast<char *>(read_buf.data()), num_bytes);
1998
+ ggml_backend_tensor_set(cur, read_buf.data(), 0, num_bytes);
1999
+ }
1707
2000
  }
1708
- LOG_INF("\n");
1709
- LOG_INF("v_mm_patch_merge_type: %s\n", hparams.mm_patch_merge_type);
1710
-
1711
- }
1712
-
1713
- try {
1714
- vision_model.class_embedding = get_tensor(new_clip->ctx_data, TN_CLASS_EMBD);
1715
- new_clip->has_class_embedding = true;
1716
- } catch (const std::exception& /*e*/) {
1717
- new_clip->has_class_embedding = false;
1718
- }
1719
-
1720
- try {
1721
- vision_model.pre_ln_w = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "weight"));
1722
- vision_model.pre_ln_b = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "bias"));
1723
- new_clip->has_pre_norm = true;
1724
- } catch (std::exception & /*e*/) {
1725
- new_clip->has_pre_norm = false;
1726
- }
1727
-
1728
- try {
1729
- vision_model.post_ln_w = get_tensor(new_clip->ctx_data, format(TN_LN_POST, "v", "weight"));
1730
- vision_model.post_ln_b = get_tensor(new_clip->ctx_data, format(TN_LN_POST, "v", "bias"));
1731
- new_clip->has_post_norm = true;
1732
- } catch (std::exception & /*e*/) {
1733
- new_clip->has_post_norm = false;
1734
- }
1735
-
1736
- try {
1737
- vision_model.patch_bias = get_tensor(new_clip->ctx_data, TN_PATCH_BIAS);
1738
- new_clip->has_patch_bias = true;
1739
- } catch (std::exception & /*e*/) {
1740
- new_clip->has_patch_bias = false;
1741
- }
1742
-
1743
- try {
1744
- vision_model.patch_embeddings_0 = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD);
1745
- } catch(const std::exception& /*e*/) {
1746
- vision_model.patch_embeddings_0 = nullptr;
1747
- }
1748
-
1749
- try {
1750
- vision_model.position_embeddings = get_tensor(new_clip->ctx_data, format(TN_POS_EMBD, "v"));
1751
- } catch(const std::exception& /*e*/) {
1752
- vision_model.position_embeddings = nullptr;
1753
- }
1754
-
1755
- try {
1756
- vision_model.patch_embeddings_1 = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD_1);
1757
- } catch(const std::exception& /*e*/) {
1758
- new_clip->has_qwen2vl_merger = false;
1759
- }
1760
-
1761
- // LLaVA projection
1762
- if (new_clip->proj_type == PROJECTOR_TYPE_MLP || new_clip->proj_type == PROJECTOR_TYPE_MLP_NORM) {
1763
- vision_model.mm_0_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "weight"));
1764
- vision_model.mm_0_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "bias"));
1765
- try {
1766
- // Yi-type llava
1767
- vision_model.mm_1_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 1, "weight"));
1768
- vision_model.mm_1_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 1, "bias"));
1769
- } catch (std::runtime_error & /*e*/) { }
1770
- try {
1771
- // missing in Yi-type llava
1772
- vision_model.mm_2_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "weight"));
1773
- vision_model.mm_2_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "bias"));
1774
- } catch (std::runtime_error & /*e*/) { }
1775
- try {
1776
- // Yi-type llava
1777
- vision_model.mm_3_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 3, "weight"));
1778
- vision_model.mm_3_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 3, "bias"));
1779
- } catch (std::runtime_error & /*e*/) { }
1780
- try {
1781
- // Yi-type llava
1782
- vision_model.mm_4_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 4, "weight"));
1783
- vision_model.mm_4_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 4, "bias"));
1784
- } catch (std::runtime_error & /*e*/) { }
1785
- try {
1786
- vision_model.image_newline = get_tensor(new_clip->ctx_data, TN_IMAGE_NEWLINE);
1787
- // LOG_INF("%s: image_newline tensor (llava-1.6) found\n", __func__);
1788
- } catch (std::runtime_error & /*e*/) { }
1789
- } else if (new_clip->proj_type == PROJECTOR_TYPE_LDP) {
1790
- // MobileVLM projection
1791
- vision_model.mm_model_mlp_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 1, "weight"));
1792
- vision_model.mm_model_mlp_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 1, "bias"));
1793
- vision_model.mm_model_mlp_3_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 3, "weight"));
1794
- vision_model.mm_model_mlp_3_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 3, "bias"));
1795
- vision_model.mm_model_block_1_block_0_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 0, "0.weight"));
1796
- vision_model.mm_model_block_1_block_0_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 0, "1.weight"));
1797
- vision_model.mm_model_block_1_block_0_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 0, "1.bias"));
1798
- vision_model.mm_model_block_1_block_1_fc1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc1.weight"));
1799
- vision_model.mm_model_block_1_block_1_fc1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc1.bias"));
1800
- vision_model.mm_model_block_1_block_1_fc2_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc2.weight"));
1801
- vision_model.mm_model_block_1_block_1_fc2_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc2.bias"));
1802
- vision_model.mm_model_block_1_block_2_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 2, "0.weight"));
1803
- vision_model.mm_model_block_1_block_2_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 2, "1.weight"));
1804
- vision_model.mm_model_block_1_block_2_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 2, "1.bias"));
1805
- vision_model.mm_model_block_2_block_0_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 0, "0.weight"));
1806
- vision_model.mm_model_block_2_block_0_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 0, "1.weight"));
1807
- vision_model.mm_model_block_2_block_0_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 0, "1.bias"));
1808
- vision_model.mm_model_block_2_block_1_fc1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc1.weight"));
1809
- vision_model.mm_model_block_2_block_1_fc1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc1.bias"));
1810
- vision_model.mm_model_block_2_block_1_fc2_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc2.weight"));
1811
- vision_model.mm_model_block_2_block_1_fc2_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc2.bias"));
1812
- vision_model.mm_model_block_2_block_2_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 2, "0.weight"));
1813
- vision_model.mm_model_block_2_block_2_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 2, "1.weight"));
1814
- vision_model.mm_model_block_2_block_2_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 2, "1.bias"));
1815
- }
1816
- else if (new_clip->proj_type == PROJECTOR_TYPE_LDPV2)
1817
- {
1818
- // MobilVLM_V2 projection
1819
- vision_model.mm_model_mlp_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 0, "weight"));
1820
- vision_model.mm_model_mlp_0_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 0, "bias"));
1821
- vision_model.mm_model_mlp_2_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 2, "weight"));
1822
- vision_model.mm_model_mlp_2_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 2, "bias"));
1823
- vision_model.mm_model_peg_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_PEG, 0, "weight"));
1824
- vision_model.mm_model_peg_0_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_PEG, 0, "bias"));
1825
- }
1826
- else if (new_clip->proj_type == PROJECTOR_TYPE_RESAMPLER) {
1827
- // vision_model.mm_model_pos_embed = get_tensor(new_clip->ctx_data, TN_MINICPMV_POS_EMBD);
1828
- vision_model.mm_model_pos_embed_k = get_tensor(new_clip->ctx_data, TN_MINICPMV_POS_EMBD_K);
1829
- vision_model.mm_model_query = get_tensor(new_clip->ctx_data, TN_MINICPMV_QUERY);
1830
- vision_model.mm_model_proj = get_tensor(new_clip->ctx_data, TN_MINICPMV_PROJ);
1831
- vision_model.mm_model_kv_proj = get_tensor(new_clip->ctx_data, TN_MINICPMV_KV_PROJ);
1832
- vision_model.mm_model_attn_q_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "q", "weight"));
1833
- vision_model.mm_model_attn_k_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "k", "weight"));
1834
- vision_model.mm_model_attn_v_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "v", "weight"));
1835
- vision_model.mm_model_attn_q_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "q", "bias"));
1836
- vision_model.mm_model_attn_k_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "k", "bias"));
1837
- vision_model.mm_model_attn_v_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "v", "bias"));
1838
- vision_model.mm_model_attn_o_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "out", "weight"));
1839
- vision_model.mm_model_attn_o_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "out", "bias"));
1840
- vision_model.mm_model_ln_q_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "q", "weight"));
1841
- vision_model.mm_model_ln_q_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "q", "bias"));
1842
- vision_model.mm_model_ln_kv_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "kv", "weight"));
1843
- vision_model.mm_model_ln_kv_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "kv", "bias"));
1844
- vision_model.mm_model_ln_post_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "post", "weight"));
1845
- vision_model.mm_model_ln_post_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "post", "bias"));
1846
- }
1847
- else if (new_clip->proj_type == PROJECTOR_TYPE_GLM_EDGE) {
1848
- vision_model.mm_model_adapter_conv_w = get_tensor(new_clip->ctx_data, format(TN_GLM_ADAPER_CONV, "weight"));
1849
- vision_model.mm_model_adapter_conv_b = get_tensor(new_clip->ctx_data, format(TN_GLM_ADAPER_CONV, "bias"));
1850
- vision_model.mm_model_mlp_0_w = get_tensor(new_clip->ctx_data, format(TN_GLM_ADAPTER_LINEAR,"weight"));
1851
- vision_model.mm_model_ln_q_w = get_tensor(new_clip->ctx_data, format(TN_GLM_ADAPTER_NORM_1,"weight"));
1852
- vision_model.mm_model_ln_q_b = get_tensor(new_clip->ctx_data, format(TN_GLM_ADAPTER_NORM_1,"bias"));
1853
- vision_model.mm_model_mlp_1_w = get_tensor(new_clip->ctx_data, format(TN_GLM_ADAPTER_D_H_2_4H,"weight"));
1854
- vision_model.mm_model_mlp_2_w = get_tensor(new_clip->ctx_data, format(TN_GLM_ADAPTER_GATE,"weight"));
1855
- vision_model.mm_model_mlp_3_w = get_tensor(new_clip->ctx_data, format(TN_GLM_ADAPTER_D_4H_2_H,"weight"));
1856
- vision_model.boi_w = get_tensor(new_clip->ctx_data, TN_GLM_BOI_W);
1857
- vision_model.eoi_w = get_tensor(new_clip->ctx_data, TN_GLM_EOI_W);
1858
- }
1859
- else if (new_clip->proj_type == PROJECTOR_TYPE_MERGER) {
1860
- vision_model.mm_0_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "weight"));
1861
- vision_model.mm_0_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "bias"));
1862
- vision_model.mm_1_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "weight"));
1863
- vision_model.mm_1_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "bias"));
1864
- }
1865
- else if (new_clip->proj_type == PROJECTOR_TYPE_GEMMA3) {
1866
- vision_model.mm_input_proj_w = get_tensor(new_clip->ctx_data, TN_MM_INP_PROJ);
1867
- vision_model.mm_soft_emb_norm_w = get_tensor(new_clip->ctx_data, TN_MM_SOFT_EMB_N);
1868
- }
1869
- else {
1870
- std::string proj_type = PROJECTOR_TYPE_NAMES[new_clip->proj_type];
1871
- throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str()));
2001
+ fin.close();
2002
+
2003
+ LOG_DBG("%s: loaded %zu tensors from %s\n", __func__, tensors_to_load.size(), fname.c_str());
1872
2004
  }
2005
+ }
1873
2006
 
1874
- vision_model.layers.resize(hparams.n_layer);
2007
+ void alloc_compute_meta() {
2008
+ ctx_clip.buf_compute_meta.resize(ctx_clip.max_nodes * ggml_tensor_overhead() + ggml_graph_overhead());
1875
2009
 
1876
- for (int il = 0; il < hparams.n_layer; ++il) {
1877
- auto & layer = vision_model.layers[il];
1878
- layer.k_w = get_tensor(new_clip->ctx_data, format(TN_ATTN_K, "v", il, "weight"));
1879
- layer.q_w = get_tensor(new_clip->ctx_data, format(TN_ATTN_Q, "v", il, "weight"));
1880
- layer.v_w = get_tensor(new_clip->ctx_data, format(TN_ATTN_V, "v", il, "weight"));
1881
- layer.o_w = get_tensor(new_clip->ctx_data, format(TN_ATTN_OUTPUT, "v", il, "weight"));
1882
- layer.ln_1_w = get_tensor(new_clip->ctx_data, format(TN_LN_1, "v", il, "weight"));
1883
- layer.ln_2_w = get_tensor(new_clip->ctx_data, format(TN_LN_2, "v", il, "weight"));
1884
- layer.ff_i_w = get_tensor(new_clip->ctx_data, format(TN_FFN_DOWN, "v", il, "weight"));
1885
- layer.ff_o_w = get_tensor(new_clip->ctx_data, format(TN_FFN_UP, "v", il, "weight"));
1886
- layer.k_b = get_tensor(new_clip->ctx_data, format(TN_ATTN_K, "v", il, "bias"));
1887
- layer.q_b = get_tensor(new_clip->ctx_data, format(TN_ATTN_Q, "v", il, "bias"));
1888
- layer.v_b = get_tensor(new_clip->ctx_data, format(TN_ATTN_V, "v", il, "bias"));
1889
- layer.o_b = get_tensor(new_clip->ctx_data, format(TN_ATTN_OUTPUT, "v", il, "bias"));
1890
- layer.ln_1_b = get_tensor(new_clip->ctx_data, format(TN_LN_1, "v", il, "bias"));
1891
- layer.ln_2_b = get_tensor(new_clip->ctx_data, format(TN_LN_2, "v", il, "bias"));
1892
- layer.ff_i_b = get_tensor(new_clip->ctx_data, format(TN_FFN_DOWN, "v", il, "bias"));
1893
- layer.ff_o_b = get_tensor(new_clip->ctx_data, format(TN_FFN_UP, "v", il, "bias"));
1894
- }
1895
- }
1896
-
1897
- ggml_free(meta);
1898
-
1899
- new_clip->ctx_gguf = ctx;
1900
-
1901
- // measure mem requirement and allocate
1902
- {
1903
- new_clip->buf_compute_meta.resize(GGML_DEFAULT_GRAPH_SIZE * ggml_tensor_overhead() + ggml_graph_overhead());
2010
+ // create a fake batch
1904
2011
  clip_image_f32_batch batch;
1905
- batch.size = 1;
1906
- batch.data = nullptr;
1907
- ggml_cgraph * gf = clip_image_build_graph(new_clip, &batch, nullptr, false);
1908
- ggml_backend_sched_reserve(new_clip->sched.get(), gf);
1909
- for (size_t i = 0; i < new_clip->backend_ptrs.size(); ++i) {
1910
- ggml_backend_t backend = new_clip->backend_ptrs[i];
1911
- ggml_backend_buffer_type_t buft = new_clip->backend_buft[i];
1912
- size_t size = ggml_backend_sched_get_buffer_size(new_clip->sched.get(), backend);
2012
+ clip_image_f32_ptr img(clip_image_f32_init());
2013
+ clip_image_size image_size;
2014
+ image_size.width = ctx_clip.vision_model.hparams.image_size;
2015
+ image_size.height = ctx_clip.vision_model.hparams.image_size;
2016
+ img->nx = image_size.width;
2017
+ img->ny = image_size.height;
2018
+ img->buf.resize(image_size.width * image_size.height * 3);
2019
+ batch.entries.push_back(std::move(img));
2020
+
2021
+ ggml_cgraph * gf = clip_image_build_graph(&ctx_clip, batch, image_size, false);
2022
+ ggml_backend_sched_reserve(ctx_clip.sched.get(), gf);
2023
+ for (size_t i = 0; i < ctx_clip.backend_ptrs.size(); ++i) {
2024
+ ggml_backend_t backend = ctx_clip.backend_ptrs[i];
2025
+ ggml_backend_buffer_type_t buft = ctx_clip.backend_buft[i];
2026
+ size_t size = ggml_backend_sched_get_buffer_size(ctx_clip.sched.get(), backend);
1913
2027
  if (size > 1) {
1914
2028
  LOG_INF("%s: %10s compute buffer size = %8.2f MiB\n", __func__,
1915
2029
  ggml_backend_buft_name(buft),
@@ -1918,15 +2032,98 @@ struct clip_ctx * clip_init(const char * fname, struct clip_context_params ctx_p
1918
2032
  }
1919
2033
  }
1920
2034
 
1921
- return new_clip;
2035
+ void get_bool(const std::string & key, bool & output, bool required = true) {
2036
+ const int i = gguf_find_key(ctx_gguf.get(), key.c_str());
2037
+ if (i < 0) {
2038
+ if (required) throw std::runtime_error("Key not found: " + key);
2039
+ return;
2040
+ }
2041
+ output = gguf_get_val_bool(ctx_gguf.get(), i);
2042
+ }
2043
+
2044
+ void get_i32(const std::string & key, int & output, bool required = true) {
2045
+ const int i = gguf_find_key(ctx_gguf.get(), key.c_str());
2046
+ if (i < 0) {
2047
+ if (required) throw std::runtime_error("Key not found: " + key);
2048
+ return;
2049
+ }
2050
+ output = gguf_get_val_i32(ctx_gguf.get(), i);
2051
+ }
2052
+
2053
+ void get_u32(const std::string & key, int & output, bool required = true) {
2054
+ const int i = gguf_find_key(ctx_gguf.get(), key.c_str());
2055
+ if (i < 0) {
2056
+ if (required) throw std::runtime_error("Key not found: " + key);
2057
+ return;
2058
+ }
2059
+ output = gguf_get_val_u32(ctx_gguf.get(), i);
2060
+ }
2061
+
2062
+ void get_f32(const std::string & key, float & output, bool required = true) {
2063
+ const int i = gguf_find_key(ctx_gguf.get(), key.c_str());
2064
+ if (i < 0) {
2065
+ if (required) throw std::runtime_error("Key not found: " + key);
2066
+ return;
2067
+ }
2068
+ output = gguf_get_val_f32(ctx_gguf.get(), i);
2069
+ }
2070
+
2071
+ void get_string(const std::string & key, std::string & output, bool required = true) {
2072
+ const int i = gguf_find_key(ctx_gguf.get(), key.c_str());
2073
+ if (i < 0) {
2074
+ if (required) throw std::runtime_error("Key not found: " + key);
2075
+ return;
2076
+ }
2077
+ output = std::string(gguf_get_val_str(ctx_gguf.get(), i));
2078
+ }
2079
+
2080
+ void get_arr_int(const std::string & key, std::vector<int> & output, bool required = true) {
2081
+ const int i = gguf_find_key(ctx_gguf.get(), key.c_str());
2082
+ if (i < 0) {
2083
+ if (required) throw std::runtime_error("Key not found: " + key);
2084
+ return;
2085
+ }
2086
+ int n = gguf_get_arr_n(ctx_gguf.get(), i);
2087
+ output.resize(n);
2088
+ const int32_t * values = (const int32_t *)gguf_get_arr_data(ctx_gguf.get(), i);
2089
+ for (int i = 0; i < n; ++i) {
2090
+ output[i] = values[i];
2091
+ }
2092
+ }
2093
+ };
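The get_* helpers added above all follow the same pattern: look up the GGUF key, throw if it is required but missing, and otherwise leave the output value untouched. A minimal usage sketch follows; the key names and the standalone function are illustrative assumptions, not part of this change.

    // sketch only: key names below are assumed for illustration
    static void read_vision_hparams_sketch(clip_model_loader & loader) {
        int   image_size = 0;
        int   patch_size = 0;
        float eps        = 1e-6f;
        loader.get_u32("clip.vision.image_size", image_size);         // required: throws std::runtime_error if the key is absent
        loader.get_u32("clip.vision.patch_size", patch_size);
        loader.get_f32("clip.vision.layer_norm_epsilon", eps, false); // optional: eps keeps its default if the key is absent
    }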
2094
+
2095
+ // read and create ggml_context containing the tensors and their data
2096
+ struct clip_ctx * clip_model_load(const char * fname, const int verbosity) {
2097
+ return clip_init(fname, clip_context_params{
2098
+ /* use_gpu */ true,
2099
+ /* verbosity */ static_cast<ggml_log_level>(verbosity),
2100
+ });
2101
+ }
2102
+
2103
+ struct clip_ctx * clip_init(const char * fname, struct clip_context_params ctx_params) {
2104
+ g_logger_state.verbosity_thold = ctx_params.verbosity;
2105
+ clip_ctx * ctx_clip = new clip_ctx(ctx_params);
2106
+
2107
+ try {
2108
+ clip_model_loader loader(fname, *ctx_clip);
2109
+ loader.load_hparams();
2110
+ loader.load_tensors();
2111
+ loader.alloc_compute_meta();
2112
+ } catch (const std::exception & e) {
2113
+ LOG_ERR("%s: failed to load model '%s': %s\n", __func__, fname, e.what());
2114
+ delete ctx_clip;
2115
+ return nullptr;
2116
+ }
2117
+
2118
+ return ctx_clip;
1922
2119
  }
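For context, a caller-side sketch of the reworked entry point; the model path is a placeholder, not taken from this diff.

    clip_context_params params;
    params.use_gpu   = true;
    params.verbosity = GGML_LOG_LEVEL_INFO;
    clip_ctx * ctx = clip_init("mmproj-model.gguf", params);
    if (ctx == nullptr) {
        // loading failed: clip_init already logged the reason and freed the context
    }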
1923
2120
 
1924
2121
  void clip_add_load_image_size(struct clip_ctx * ctx_clip, struct clip_image_size * load_image_size) {
1925
- ctx_clip->load_image_size = load_image_size;
2122
+ ctx_clip->load_image_size = *load_image_size; // copy
1926
2123
  }
1927
2124
 
1928
2125
  struct clip_image_size * clip_get_load_image_size(struct clip_ctx * ctx_clip) {
1929
- return ctx_clip->load_image_size;
2126
+ return &ctx_clip->load_image_size;
1930
2127
  }
1931
2128
 
1932
2129
  struct clip_image_size * clip_image_size_init() {
@@ -1944,19 +2141,53 @@ struct clip_image_f32 * clip_image_f32_init() {
1944
2141
  return new clip_image_f32();
1945
2142
  }
1946
2143
 
1947
- void clip_image_u8_free(struct clip_image_u8 * img) { delete img; }
1948
- void clip_image_f32_free(struct clip_image_f32 * img) { delete img; }
1949
- void clip_image_u8_batch_free(struct clip_image_u8_batch * batch) {
1950
- if (batch->size > 0) {
1951
- delete[] batch->data;
1952
- batch->size = 0;
2144
+ struct clip_image_f32_batch * clip_image_f32_batch_init() {
2145
+ return new clip_image_f32_batch();
2146
+ }
2147
+
2148
+ unsigned char * clip_image_u8_get_data(struct clip_image_u8 * img, uint32_t * nx, uint32_t * ny) {
2149
+ if (nx) *nx = img->nx;
2150
+ if (ny) *ny = img->ny;
2151
+ return img->buf.data();
2152
+ }
2153
+
2154
+ void clip_image_size_free(struct clip_image_size * load_image_size) {
2155
+ if (load_image_size == nullptr) {
2156
+ return;
2157
+ }
2158
+ delete load_image_size;
2159
+ }
2160
+ void clip_image_u8_free(struct clip_image_u8 * img) { if (img) delete img; }
2161
+ void clip_image_f32_free(struct clip_image_f32 * img) { if (img) delete img; }
2162
+ void clip_image_u8_batch_free(struct clip_image_u8_batch * batch) { if (batch) delete batch; }
2163
+ void clip_image_f32_batch_free(struct clip_image_f32_batch * batch) { if (batch) delete batch; }
2164
+
2165
+ size_t clip_image_f32_batch_n_images(const struct clip_image_f32_batch * batch) {
2166
+ return batch->entries.size();
2167
+ }
2168
+
2169
+ size_t clip_image_f32_batch_nx(const struct clip_image_f32_batch * batch, int idx) {
2170
+ if (idx < 0 || idx >= (int)batch->entries.size()) {
2171
+ LOG_ERR("%s: invalid index %d\n", __func__, idx);
2172
+ return 0;
2173
+ }
2174
+ return batch->entries[idx]->nx;
2175
+ }
2176
+
2177
+ size_t clip_image_f32_batch_ny(const struct clip_image_f32_batch * batch, int idx) {
2178
+ if (idx < 0 || idx >= (int)batch->entries.size()) {
2179
+ LOG_ERR("%s: invalid index %d\n", __func__, idx);
2180
+ return 0;
1953
2181
  }
2182
+ return batch->entries[idx]->ny;
1954
2183
  }
1955
- void clip_image_f32_batch_free(struct clip_image_f32_batch * batch) {
1956
- if (batch->size > 0) {
1957
- delete[] batch->data;
1958
- batch->size = 0;
2184
+
2185
+ clip_image_f32 * clip_image_f32_get_img(const struct clip_image_f32_batch * batch, int idx) {
2186
+ if (idx < 0 || idx >= (int)batch->entries.size()) {
2187
+ LOG_ERR("%s: invalid index %d\n", __func__, idx);
2188
+ return nullptr;
1959
2189
  }
2190
+ return batch->entries[idx].get();
1960
2191
  }
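A short sketch of how the new batch accessors are meant to be used together; obtaining `batch` (e.g. from preprocessing) is assumed and not shown in this hunk.

    // batch is assumed to be a populated clip_image_f32_batch *
    const size_t n_imgs = clip_image_f32_batch_n_images(batch);
    for (size_t i = 0; i < n_imgs; ++i) {
        const size_t nx = clip_image_f32_batch_nx(batch, (int) i);
        const size_t ny = clip_image_f32_batch_ny(batch, (int) i);
        // nx/ny are the per-image dimensions; an invalid index logs an error and returns 0
    }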
1961
2192
 
1962
2193
  void clip_build_img_from_pixels(const unsigned char * rgb_pixels, int nx, int ny, clip_image_u8 * img) {
@@ -1990,605 +2221,597 @@ bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length
1990
2221
  return true;
1991
2222
  }
1992
2223
 
1993
- // Linear interpolation between two points
1994
- inline float clip_lerp(float s, float e, float t) {
1995
- return s + (e - s) * t;
1996
- }
1997
- // Bilinear resize function
1998
- static void bilinear_resize(const clip_image_u8& src, clip_image_u8& dst, int target_width, int target_height) {
1999
- dst.nx = target_width;
2000
- dst.ny = target_height;
2001
- dst.buf.resize(3 * target_width * target_height);
2002
-
2003
- float x_ratio = static_cast<float>(src.nx - 1) / target_width;
2004
- float y_ratio = static_cast<float>(src.ny - 1) / target_height;
2005
-
2006
- for (int y = 0; y < target_height; y++) {
2007
- for (int x = 0; x < target_width; x++) {
2008
- float px = x_ratio * x;
2009
- float py = y_ratio * y;
2010
- int x_floor = static_cast<int>(px);
2011
- int y_floor = static_cast<int>(py);
2012
- float x_lerp = px - x_floor;
2013
- float y_lerp = py - y_floor;
2014
-
2015
- for (int c = 0; c < 3; c++) {
2016
- float top = clip_lerp(
2017
- static_cast<float>(src.buf[3 * (y_floor * src.nx + x_floor) + c]),
2018
- static_cast<float>(src.buf[3 * (y_floor * src.nx + (x_floor + 1)) + c]),
2019
- x_lerp
2020
- );
2021
- float bottom = clip_lerp(
2022
- static_cast<float>(src.buf[3 * ((y_floor + 1) * src.nx + x_floor) + c]),
2023
- static_cast<float>(src.buf[3 * ((y_floor + 1) * src.nx + (x_floor + 1)) + c]),
2024
- x_lerp
2025
- );
2026
- dst.buf[3 * (y * target_width + x) + c] = static_cast<uint8_t>(clip_lerp(top, bottom, y_lerp));
2224
+ // Normalize image to float32 - careful with pytorch .to(model.device, dtype=torch.float16) - this sometimes reduces precision (32>16>32), sometimes not
2225
+ static void normalize_image_u8_to_f32(const clip_image_u8 & src, clip_image_f32 & dst, const float mean[3], const float std[3]) {
2226
+ dst.nx = src.nx;
2227
+ dst.ny = src.ny;
2228
+ dst.buf.resize(src.buf.size());
2229
+
2230
+ // TODO @ngxson : seems like this could be done more efficiently on cgraph
2231
+ for (size_t i = 0; i < src.buf.size(); ++i) {
2232
+ int c = i % 3; // rgb
2233
+ dst.buf[i] = (static_cast<float>(src.buf[i]) / 255.0f - mean[c]) / std[c];
2234
+ }
2235
+ }
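As a usage sketch, the commonly used OpenAI CLIP image statistics are shown below; the mean/std a given model actually expects come from its GGUF metadata, so treat these numbers and `img_u8` as illustrative assumptions.

    // img_u8 is assumed to be an already-decoded clip_image_u8
    const float mean[3] = {0.48145466f, 0.4578275f, 0.40821073f};
    const float stdv[3] = {0.26862954f, 0.26130258f, 0.27577711f};
    clip_image_f32 normalized;
    normalize_image_u8_to_f32(*img_u8, normalized, mean, stdv);
    // each byte b of channel c becomes (b / 255 - mean[c]) / stdv[c], e.g. 128 -> ~0.076 for R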
2236
+
2237
+ // set of tools to manipulate images
2238
+ // in the future, we can have HW acceleration by allowing this struct to access 3rd party lib like imagick or opencv
2239
+ struct image_manipulation {
2240
+ // Bilinear resize function
2241
+ static void bilinear_resize(const clip_image_u8& src, clip_image_u8& dst, int target_width, int target_height) {
2242
+ dst.nx = target_width;
2243
+ dst.ny = target_height;
2244
+ dst.buf.resize(3 * target_width * target_height);
2245
+
2246
+ float x_ratio = static_cast<float>(src.nx - 1) / target_width;
2247
+ float y_ratio = static_cast<float>(src.ny - 1) / target_height;
2248
+
2249
+ for (int y = 0; y < target_height; y++) {
2250
+ for (int x = 0; x < target_width; x++) {
2251
+ float px = x_ratio * x;
2252
+ float py = y_ratio * y;
2253
+ int x_floor = static_cast<int>(px);
2254
+ int y_floor = static_cast<int>(py);
2255
+ float x_lerp = px - x_floor;
2256
+ float y_lerp = py - y_floor;
2257
+
2258
+ for (int c = 0; c < 3; c++) {
2259
+ float top = lerp(
2260
+ static_cast<float>(src.buf[3 * (y_floor * src.nx + x_floor) + c]),
2261
+ static_cast<float>(src.buf[3 * (y_floor * src.nx + (x_floor + 1)) + c]),
2262
+ x_lerp
2263
+ );
2264
+ float bottom = lerp(
2265
+ static_cast<float>(src.buf[3 * ((y_floor + 1) * src.nx + x_floor) + c]),
2266
+ static_cast<float>(src.buf[3 * ((y_floor + 1) * src.nx + (x_floor + 1)) + c]),
2267
+ x_lerp
2268
+ );
2269
+ dst.buf[3 * (y * target_width + x) + c] = static_cast<uint8_t>(lerp(top, bottom, y_lerp));
2270
+ }
2027
2271
  }
2028
2272
  }
2029
2273
  }
2030
- }
2031
2274
 
2032
- // Normalize image to float32 - careful with pytorch .to(model.device, dtype=torch.float16) - this sometimes reduces precision (32>16>32), sometimes not
2033
- static void normalize_image_u8_to_f32(const clip_image_u8* src, clip_image_f32* dst, const float mean[3], const float std[3]) {
2034
- dst->nx = src->nx;
2035
- dst->ny = src->ny;
2036
- dst->buf.resize(src->buf.size());
2275
+ // Bicubic resize function
2276
+ // part of image will be cropped if the aspect ratio is different
2277
+ static bool bicubic_resize(const clip_image_u8 & img, clip_image_u8 & dst, int target_width, int target_height) {
2278
+ const int nx = img.nx;
2279
+ const int ny = img.ny;
2280
+
2281
+ dst.nx = target_width;
2282
+ dst.ny = target_height;
2283
+ dst.buf.resize(3 * target_width * target_height);
2284
+
2285
+ float Cc;
2286
+ float C[5];
2287
+ float d0, d2, d3, a0, a1, a2, a3;
2288
+ int i, j, k, jj;
2289
+ int x, y;
2290
+ float dx, dy;
2291
+ float tx, ty;
2292
+
2293
+ tx = (float)nx / (float)target_width;
2294
+ ty = (float)ny / (float)target_height;
2295
+
2296
+ // Bicubic interpolation; adapted from ViT.cpp, inspired from :
2297
+ // -> https://github.com/yglukhov/bicubic-interpolation-image-processing/blob/master/libimage.c#L36
2298
+ // -> https://en.wikipedia.org/wiki/Bicubic_interpolation
2299
+
2300
+ for (i = 0; i < target_height; i++) {
2301
+ for (j = 0; j < target_width; j++) {
2302
+ x = (int)(tx * j);
2303
+ y = (int)(ty * i);
2304
+
2305
+ dx = tx * j - x;
2306
+ dy = ty * i - y;
2307
+
2308
+ for (k = 0; k < 3; k++) {
2309
+ for (jj = 0; jj <= 3; jj++) {
2310
+ d0 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x - 1, 0, nx - 1)) * 3 + k] - img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k];
2311
+ d2 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x + 1, 0, nx - 1)) * 3 + k] - img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k];
2312
+ d3 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x + 2, 0, nx - 1)) * 3 + k] - img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k];
2313
+ a0 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k];
2314
+
2315
+ a1 = -1.0 / 3 * d0 + d2 - 1.0 / 6 * d3;
2316
+ a2 = 1.0 / 2 * d0 + 1.0 / 2 * d2;
2317
+ a3 = -1.0 / 6 * d0 - 1.0 / 2 * d2 + 1.0 / 6 * d3;
2318
+
2319
+ C[jj] = a0 + a1 * dx + a2 * dx * dx + a3 * dx * dx * dx;
2320
+
2321
+ d0 = C[0] - C[1];
2322
+ d2 = C[2] - C[1];
2323
+ d3 = C[3] - C[1];
2324
+ a0 = C[1];
2325
+ a1 = -1.0 / 3 * d0 + d2 - 1.0 / 6 * d3;
2326
+ a2 = 1.0 / 2 * d0 + 1.0 / 2 * d2;
2327
+ a3 = -1.0 / 6 * d0 - 1.0 / 2 * d2 + 1.0 / 6 * d3;
2328
+ Cc = a0 + a1 * dy + a2 * dy * dy + a3 * dy * dy * dy;
2329
+
2330
+ const uint8_t Cc2 = std::min(std::max(std::round(Cc), 0.0f), 255.0f);
2331
+ dst.buf[(i * target_width + j) * 3 + k] = float(Cc2);
2332
+ }
2333
+ }
2334
+ }
2335
+ }
2037
2336
 
2038
- for (size_t i = 0; i < src->buf.size(); ++i) {
2039
- int c = i % 3; // rgb
2040
- dst->buf[i] = (static_cast<float>(src->buf[i]) / 255.0f - mean[c]) / std[c];
2337
+ return true;
2041
2338
  }
2042
- }
2043
2339
 
2044
- inline int clip(int x, int lower, int upper) {
2045
- return std::max(lower, std::min(x, upper));
2046
- }
2340
+ // llava-1.6 type of resize_and_pad
2341
+ // if the ratio is not 1:1, padding with pad_color will be applied
2342
+ // pad_color is per RGB channel, default is {0, 0, 0} (black)
2343
+ static void resize_and_pad_image(const clip_image_u8 & image, clip_image_u8 & dst, const clip_image_size & target_resolution, std::array<uint8_t, 3> pad_color = {0, 0, 0}) {
2344
+ int target_width = target_resolution.width;
2345
+ int target_height = target_resolution.height;
2346
+
2347
+ float scale_w = static_cast<float>(target_width) / image.nx;
2348
+ float scale_h = static_cast<float>(target_height) / image.ny;
2349
+
2350
+ int new_width, new_height;
2351
+
2352
+ if (scale_w < scale_h) {
2353
+ new_width = target_width;
2354
+ new_height = std::min(static_cast<int>(std::ceil(image.ny * scale_w)), target_height);
2355
+ } else {
2356
+ new_height = target_height;
2357
+ new_width = std::min(static_cast<int>(std::ceil(image.nx * scale_h)), target_width);
2358
+ }
2359
+
2360
+ clip_image_u8 resized_image;
2361
+ bicubic_resize(image, resized_image, new_width, new_height);
2362
+
2363
+ clip_image_u8 padded_image;
2364
+ padded_image.nx = target_width;
2365
+ padded_image.ny = target_height;
2366
+ padded_image.buf.resize(3 * target_width * target_height);
2047
2367
 
2048
- static bool bicubic_resize(const clip_image_u8 &img, clip_image_u8 &dst, int target_width, int target_height) {
2049
- const int nx = img.nx;
2050
- const int ny = img.ny;
2051
-
2052
- dst.nx = target_width;
2053
- dst.ny = target_height;
2054
- dst.buf.resize(3 * target_width * target_height);
2055
-
2056
- float Cc;
2057
- float C[5];
2058
- float d0, d2, d3, a0, a1, a2, a3;
2059
- int i, j, k, jj;
2060
- int x, y;
2061
- float dx, dy;
2062
- float tx, ty;
2063
-
2064
- tx = (float)nx / (float)target_width;
2065
- ty = (float)ny / (float)target_height;
2066
-
2067
- // Bicubic interpolation; adapted from ViT.cpp, inspired from :
2068
- // -> https://github.com/yglukhov/bicubic-interpolation-image-processing/blob/master/libimage.c#L36
2069
- // -> https://en.wikipedia.org/wiki/Bicubic_interpolation
2070
-
2071
- for (i = 0; i < target_height; i++) {
2072
- for (j = 0; j < target_width; j++) {
2073
- x = (int)(tx * j);
2074
- y = (int)(ty * i);
2075
-
2076
- dx = tx * j - x;
2077
- dy = ty * i - y;
2078
-
2079
- for (k = 0; k < 3; k++) {
2080
- for (jj = 0; jj <= 3; jj++) {
2081
- d0 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x - 1, 0, nx - 1)) * 3 + k] - img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k];
2082
- d2 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x + 1, 0, nx - 1)) * 3 + k] - img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k];
2083
- d3 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x + 2, 0, nx - 1)) * 3 + k] - img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k];
2084
- a0 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k];
2085
-
2086
- a1 = -1.0 / 3 * d0 + d2 - 1.0 / 6 * d3;
2087
- a2 = 1.0 / 2 * d0 + 1.0 / 2 * d2;
2088
- a3 = -1.0 / 6 * d0 - 1.0 / 2 * d2 + 1.0 / 6 * d3;
2089
-
2090
- C[jj] = a0 + a1 * dx + a2 * dx * dx + a3 * dx * dx * dx;
2091
-
2092
- d0 = C[0] - C[1];
2093
- d2 = C[2] - C[1];
2094
- d3 = C[3] - C[1];
2095
- a0 = C[1];
2096
- a1 = -1.0 / 3 * d0 + d2 - 1.0 / 6 * d3;
2097
- a2 = 1.0 / 2 * d0 + 1.0 / 2 * d2;
2098
- a3 = -1.0 / 6 * d0 - 1.0 / 2 * d2 + 1.0 / 6 * d3;
2099
- Cc = a0 + a1 * dy + a2 * dy * dy + a3 * dy * dy * dy;
2100
-
2101
- const uint8_t Cc2 = std::min(std::max(std::round(Cc), 0.0f), 255.0f);
2102
- dst.buf[(i * target_width + j) * 3 + k] = float(Cc2);
2368
+ // Fill the padded image with the fill color
2369
+ for (size_t i = 0; i < padded_image.buf.size(); i += 3) {
2370
+ padded_image.buf[i] = pad_color[0];
2371
+ padded_image.buf[i + 1] = pad_color[1];
2372
+ padded_image.buf[i + 2] = pad_color[2];
2373
+ }
2374
+
2375
+ // Calculate padding offsets
2376
+ int pad_x = (target_width - new_width) / 2;
2377
+ int pad_y = (target_height - new_height) / 2;
2378
+
2379
+ // Copy the resized image into the center of the padded buffer
2380
+ for (int y = 0; y < new_height; ++y) {
2381
+ for (int x = 0; x < new_width; ++x) {
2382
+ for (int c = 0; c < 3; ++c) {
2383
+ padded_image.buf[3 * ((y + pad_y) * target_width + (x + pad_x)) + c] = resized_image.buf[3 * (y * new_width + x) + c];
2103
2384
  }
2104
2385
  }
2105
2386
  }
2387
+ dst = std::move(padded_image);
2106
2388
  }
2107
2389
 
2108
- return true;
2109
- }
2390
+ static void crop_image(const clip_image_u8 & image, clip_image_u8 & dst, int x, int y, int w, int h) {
2391
+ dst.nx = w;
2392
+ dst.ny = h;
2393
+ dst.buf.resize(3 * w * h);
2110
2394
 
2111
- // llava-1.6 type of resize_and_pad (black)
2112
- static void resize_and_pad_image(const clip_image_u8& image, clip_image_u8 &image_output, const std::pair<int, int>& target_resolution) {
2113
- int target_width = target_resolution.first;
2114
- int target_height = target_resolution.second;
2395
+ for (int i = 0; i < h; ++i) {
2396
+ for (int j = 0; j < w; ++j) {
2397
+ int src_idx = 3 * ((y + i)*image.nx + (x + j));
2398
+ int dst_idx = 3 * (i*w + j);
2399
+ dst.buf[dst_idx] = image.buf[src_idx];
2400
+ dst.buf[dst_idx + 1] = image.buf[src_idx + 1];
2401
+ dst.buf[dst_idx + 2] = image.buf[src_idx + 2];
2402
+ }
2403
+ }
2404
+ }
2115
2405
 
2116
- float scale_w = static_cast<float>(target_width) / image.nx;
2117
- float scale_h = static_cast<float>(target_height) / image.ny;
2406
+ // calculate the size of the **resized** image, while preserving the aspect ratio
2407
+ // the calculated size will be aligned to the nearest multiple of align_size
2408
+ // if H or W size is larger than max_dimension, it will be resized to max_dimension
2409
+ static clip_image_size calc_size_preserved_ratio(const clip_image_size & inp_size, const int align_size, const int max_dimension) {
2410
+ if (inp_size.width <= 0 || inp_size.height <= 0 || align_size <= 0 || max_dimension <= 0) {
2411
+ return {0, 0};
2412
+ }
2118
2413
 
2119
- int new_width, new_height;
2414
+ float scale = std::min(1.0f, std::min(static_cast<float>(max_dimension) / inp_size.width,
2415
+ static_cast<float>(max_dimension) / inp_size.height));
2120
2416
 
2121
- if (scale_w < scale_h) {
2122
- new_width = target_width;
2123
- new_height = std::min(static_cast<int>(std::ceil(image.ny * scale_w)), target_height);
2124
- } else {
2125
- new_height = target_height;
2126
- new_width = std::min(static_cast<int>(std::ceil(image.nx * scale_h)), target_width);
2127
- }
2417
+ float target_width_f = static_cast<float>(inp_size.width) * scale;
2418
+ float target_height_f = static_cast<float>(inp_size.height) * scale;
2128
2419
 
2129
- clip_image_u8 resized_image;
2130
- // bilinear_resize(image, resized_image, new_width, new_height);
2131
- bicubic_resize(image, resized_image, new_width, new_height);
2420
+ int aligned_width = GGML_PAD((int)target_width_f, align_size);
2421
+ int aligned_height = GGML_PAD((int)target_height_f, align_size);
2132
2422
 
2133
- clip_image_u8 padded_image;
2134
- padded_image.nx = target_width;
2135
- padded_image.ny = target_height;
2136
- padded_image.buf.resize(3 * target_width * target_height, 0); // Initialize with black
2423
+ return {aligned_width, aligned_height};
2424
+ }
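A worked example of the sizing rule above, with illustrative numbers:

    // input 1000x600, align_size = 14, max_dimension = 448:
    //   scale  = min(1, 448/1000, 448/600) = 0.448
    //   target = 448.0 x 268.8, truncated and padded up to multiples of 14 -> 448 x 280
    clip_image_size out = image_manipulation::calc_size_preserved_ratio(clip_image_size{1000, 600}, 14, 448);
    // out.width == 448, out.height == 280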
2137
2425
 
2138
- // Calculate padding offsets
2139
- int pad_x = (target_width - new_width) / 2;
2140
- int pad_y = (target_height - new_height) / 2;
2426
+ private:
2427
+ static inline int clip(int x, int lower, int upper) {
2428
+ return std::max(lower, std::min(x, upper));
2429
+ }
2141
2430
 
2142
- // Copy the resized image into the center of the padded buffer
2143
- for (int y = 0; y < new_height; ++y) {
2144
- for (int x = 0; x < new_width; ++x) {
2145
- for (int c = 0; c < 3; ++c) {
2146
- padded_image.buf[3 * ((y + pad_y) * target_width + (x + pad_x)) + c] = resized_image.buf[3 * (y * new_width + x) + c];
2147
- }
2148
- }
2431
+ // Linear interpolation between two points
2432
+ static inline float lerp(float s, float e, float t) {
2433
+ return s + (e - s) * t;
2149
2434
  }
2150
- image_output = std::move(padded_image);
2151
- }
2435
+ };
2152
2436
 
2153
2437
  /**
2154
- * Selects the best resolution from a list of possible resolutions based on the original size.
2438
+ * implementation of LLaVA-UHD:
2439
+ * - https://arxiv.org/pdf/2403.11703
2440
+ * - https://github.com/thunlp/LLaVA-UHD
2441
+ * - https://github.com/thunlp/LLaVA-UHD/blob/302301bc2175f7e717fb8548516188e89f649753/llava_uhd/train/llava-uhd/slice_logic.py#L118
2442
+ *
2443
+ * overview:
2444
+ * - an image always has a single overview (downscaled image)
2445
+ * - an image can have 0 or multiple slices, depending on the image size
2446
+ * - each slice can then be considered as a separate image
2155
2447
  *
2156
- * @param original_size The original size of the image in the format (width, height).
2157
- * @param possible_resolutions A list of possible resolutions in the format [(width1, height1), (width2, height2), ...].
2158
- * @return The best fit resolution in the format (width, height).
2448
+ * for example:
2449
+ *
2450
+ * [overview] --> [slice 1] --> [slice 2]
2451
+ * | |
2452
+ * +--> [slice 3] --> [slice 4]
2159
2453
  */
2160
- static std::pair<int, int> select_best_resolution(const std::pair<int, int> & original_size, const std::vector<std::pair<int, int>> & possible_resolutions) {
2161
- int original_width = original_size.first;
2162
- int original_height = original_size.second;
2163
- std::pair<int, int> best_fit;
2164
- int max_effective_resolution = 0;
2165
- int min_wasted_resolution = std::numeric_limits<int>::max();
2166
-
2167
- for (const auto& resolution : possible_resolutions) {
2168
- int width = resolution.first;
2169
- int height = resolution.second;
2170
- float scale = std::min(static_cast<float>(width) / original_width, static_cast<float>(height) / original_height);
2171
- int downscaled_width = static_cast<int>(original_width * scale);
2172
- int downscaled_height = static_cast<int>(original_height * scale);
2173
- int effective_resolution = std::min(downscaled_width * downscaled_height, original_width * original_height);
2174
- int wasted_resolution = (width * height) - effective_resolution;
2175
- // LOG_INF("resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution);
2176
- if (effective_resolution > max_effective_resolution || (effective_resolution == max_effective_resolution && wasted_resolution < min_wasted_resolution)) {
2177
- max_effective_resolution = effective_resolution;
2178
- min_wasted_resolution = wasted_resolution;
2179
- best_fit = resolution;
2180
- }
2181
- }
2182
-
2183
- return best_fit;
2184
- }
2454
+ struct llava_uhd {
2455
+ struct slice_coordinates {
2456
+ int x;
2457
+ int y;
2458
+ clip_image_size size;
2459
+ };
2460
+
2461
+ struct slice_instructions {
2462
+ clip_image_size overview_size; // size of downscaled image
2463
+ clip_image_size refined_size; // size of image right before slicing (must be multiple of slice size)
2464
+ clip_image_size grid_size; // grid_size.width * grid_size.height = number of slices
2465
+ std::vector<slice_coordinates> slices;
2466
+ bool padding_refined = false; // if true, the refined image will be padded to the grid size (e.g. llava-1.6)
2467
+ };
2185
2468
 
2186
- static std::vector<clip_image_u8*> divide_to_patches_u8(const clip_image_u8 & image, int patch_size) {
2187
- std::vector<clip_image_u8*> patches;
2188
- int width = image.nx;
2189
- int height = image.ny;
2190
- for (int i = 0; i < height; i += patch_size) {
2191
- for (int j = 0; j < width; j += patch_size) {
2192
- clip_image_u8 *patch = clip_image_u8_init();
2193
- patch->nx = std::min(patch_size, width - j);
2194
- patch->ny = std::min(patch_size, height - i);
2195
- patch->buf.resize(3 * patch->nx * patch->ny);
2196
- for (int y = 0; y < patch->ny; ++y) {
2197
- for (int x = 0; x < patch->nx; ++x) {
2198
- for (int c = 0; c < 3; ++c) {
2199
- patch->buf[3 * (y * patch->nx + x) + c] = image.buf[3 * ((i + y) * width + (j + x)) + c];
2469
+ static int get_max_slices(struct clip_ctx * ctx) {
2470
+ if (clip_is_minicpmv(ctx)) {
2471
+ return 9;
2472
+ }
2473
+ return 0;
2474
+ }
2475
+
2476
+ static slice_instructions get_slice_instructions(struct clip_ctx * ctx, const clip_image_size & original_size) {
2477
+ slice_instructions res;
2478
+ const int patch_size = clip_get_patch_size(ctx);
2479
+ const int slice_size = clip_get_image_size(ctx);
2480
+ const int max_slice_nums = get_max_slices(ctx);
2481
+ const int original_width = original_size.width;
2482
+ const int original_height = original_size.height;
2483
+ const float log_ratio = log((float)original_width / original_height);
2484
+ const float ratio = (float)original_width * original_height / (slice_size * slice_size);
2485
+ const int multiple = fmin(ceil(ratio), max_slice_nums);
2486
+ const bool has_slices = (multiple > 1);
2487
+ const bool has_pinpoints = !ctx->vision_model.hparams.image_grid_pinpoints.empty();
2488
+
2489
+ if (has_pinpoints) {
2490
+ // has pinpoints, use them to calculate the grid size (e.g. llava-1.6)
2491
+ auto refine_size = llava_uhd::select_best_resolution(
2492
+ ctx->vision_model.hparams.image_grid_pinpoints,
2493
+ original_size);
2494
+ res.overview_size = clip_image_size{slice_size, slice_size};
2495
+ res.refined_size = refine_size;
2496
+ res.grid_size = clip_image_size{0, 0};
2497
+ res.padding_refined = true;
2498
+
2499
+ for (int y = 0; y < refine_size.height; y += slice_size) {
2500
+ for (int x = 0; x < refine_size.width; x += slice_size) {
2501
+ slice_coordinates slice;
2502
+ slice.x = x;
2503
+ slice.y = y;
2504
+ slice.size.width = std::min(slice_size, refine_size.width - x);
2505
+ slice.size.height = std::min(slice_size, refine_size.height - y);
2506
+ res.slices.push_back(slice);
2507
+ if (x == 0) {
2508
+ res.grid_size.width++;
2200
2509
  }
2201
2510
  }
2511
+ res.grid_size.height++;
2202
2512
  }
2203
- patches.push_back(patch);
2204
- }
2205
- }
2206
- return patches;
2207
- }
2208
2513
 
2209
- static int ensure_divide(int length, int patch_size) {
2210
- return std::max(static_cast<int>(std::round(static_cast<float>(length) / patch_size) * patch_size), patch_size);
2211
- }
2514
+ return res;
2515
+ }
2212
2516
 
2213
- static std::pair<int, int> uhd_find_best_resize(std::pair<int, int> original_size, int scale_resolution, int patch_size, bool allow_upscale = false) {
2214
- int width = original_size.first;
2215
- int height = original_size.second;
2216
- if ((width * height > scale_resolution * scale_resolution) || allow_upscale) {
2217
- float r = static_cast<float>(width) / height;
2218
- height = static_cast<int>(scale_resolution / std::sqrt(r));
2219
- width = static_cast<int>(height * r);
2220
- }
2221
- int best_width = ensure_divide(width, patch_size);
2222
- int best_height = ensure_divide(height, patch_size);
2223
- return std::make_pair(best_width, best_height);
2224
- }
2517
+ // no pinpoints, dynamically calculate the grid size (e.g. minicpmv)
2225
2518
 
2226
- static std::pair<int, int> uhd_get_refine_size(std::pair<int, int> original_size, std::pair<int, int> grid, int scale_resolution, int patch_size, bool allow_upscale = false) {
2227
- int width, height;
2228
- std::tie(width, height) = original_size;
2229
- int grid_x, grid_y;
2230
- std::tie(grid_x, grid_y) = grid;
2519
+ auto best_size = get_best_resize(original_size, slice_size, patch_size, has_slices);
2520
+ res.overview_size = best_size;
2231
2521
 
2232
- int refine_width = ensure_divide(width, grid_x);
2233
- int refine_height = ensure_divide(height, grid_y);
2522
+ if (!has_slices) {
2523
+ // skip slicing logic
2524
+ res.refined_size = clip_image_size{0, 0};
2525
+ res.grid_size = clip_image_size{0, 0};
2234
2526
 
2235
- int grid_width = refine_width / grid_x;
2236
- int grid_height = refine_height / grid_y;
2527
+ } else {
2528
+ auto best_grid = get_best_grid(max_slice_nums, multiple, log_ratio);
2529
+ auto refine_size = get_refine_size(original_size, best_grid, slice_size, patch_size, true);
2530
+ res.grid_size = best_grid;
2531
+ res.refined_size = refine_size;
2532
+
2533
+ int width = refine_size.width;
2534
+ int height = refine_size.height;
2535
+ int grid_x = int(width / best_grid.width);
2536
+ int grid_y = int(height / best_grid.height);
2537
+ for (int patches_y = 0, ic = 0;
2538
+ patches_y < refine_size.height && ic < best_grid.height;
2539
+ patches_y += grid_y, ic += 1) {
2540
+ for (int patches_x = 0, jc = 0;
2541
+ patches_x < refine_size.width && jc < best_grid.width;
2542
+ patches_x += grid_x, jc += 1) {
2543
+ slice_coordinates slice;
2544
+ slice.x = patches_x;
2545
+ slice.y = patches_y;
2546
+ slice.size.width = grid_x;
2547
+ slice.size.height = grid_y;
2548
+ res.slices.push_back(slice);
2549
+ // LOG_INF("slice %d: %d %d %d %d\n", ic, patches_i, patches_j, grid_x, grid_y);
2550
+ }
2551
+ }
2552
+ }
2237
2553
 
2238
- // auto best_grid_size = find_best_resize(std::make_tuple(grid_width, grid_height), scale_resolution, patch_size, allow_upscale); (old line)
2239
- auto best_grid_size = uhd_find_best_resize(std::make_pair(grid_width, grid_height), scale_resolution, patch_size, allow_upscale); // (new line) => fixes conversion for make_tuple to make_pair
2240
- int best_grid_width, best_grid_height;
2241
- std::tie(best_grid_width, best_grid_height) = best_grid_size;
2554
+ return res;
2555
+ }
2242
2556
 
2243
- // std::pair<int, int> refine_size = std::make_tuple(best_grid_width * grid_x, best_grid_height * grid_y); (old line)
2244
- std::pair<int, int> refine_size = std::make_pair(best_grid_width * grid_x, best_grid_height * grid_y); // (new line)
2245
- return refine_size;
2246
- }
2557
+ static std::vector<clip_image_u8_ptr> slice_image(const clip_image_u8 * img, const slice_instructions & inst) {
2558
+ std::vector<clip_image_u8_ptr> output;
2247
2559
 
2248
- static std::pair<int, int> uhd_best_grid(const int max_slice_nums, const int multiple, const float log_ratio) {
2249
- std::vector<int> candidate_split_grids_nums;
2250
- for (int i : {multiple - 1, multiple, multiple + 1}) {
2251
- if (i == 1 || i > max_slice_nums) {
2252
- continue;
2560
+ // resize to overview size
2561
+ clip_image_u8_ptr resized_img(clip_image_u8_init());
2562
+ image_manipulation::bicubic_resize(*img, *resized_img, inst.overview_size.width, inst.overview_size.height);
2563
+ output.push_back(std::move(resized_img));
2564
+ if (inst.slices.empty()) {
2565
+ // no slices, just return the resized image
2566
+ return output;
2253
2567
  }
2254
- candidate_split_grids_nums.push_back(i);
2255
- }
2256
2568
 
2257
- std::vector<std::pair<int, int>> candidate_grids;
2258
- for (int split_grids_nums : candidate_split_grids_nums) {
2259
- int m = 1;
2260
- while (m <= split_grids_nums) {
2261
- if (split_grids_nums % m == 0) {
2262
- candidate_grids.emplace_back(m, split_grids_nums / m);
2569
+ // resize to refined size
2570
+ clip_image_u8_ptr refined_img(clip_image_u8_init());
2571
+ if (inst.padding_refined) {
2572
+ image_manipulation::resize_and_pad_image(*img, *refined_img, inst.refined_size);
2573
+ } else {
2574
+ image_manipulation::bilinear_resize(*img, *refined_img, inst.refined_size.width, inst.refined_size.height);
2575
+ }
2576
+
2577
+ // create slices
2578
+ for (const auto & slice : inst.slices) {
2579
+ int x = slice.x;
2580
+ int y = slice.y;
2581
+ int w = slice.size.width;
2582
+ int h = slice.size.height;
2583
+
2584
+ clip_image_u8_ptr img_slice(clip_image_u8_init());
2585
+ image_manipulation::crop_image(*refined_img, *img_slice, x, y, w, h);
2586
+ output.push_back(std::move(img_slice));
2587
+ }
2588
+
2589
+ return output;
2590
+ }
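Putting the two helpers together, a sketch of the intended call sequence; `ctx` (a loaded clip_ctx) and `img` (a decoded clip_image_u8) are assumed here.

    clip_image_size original_size{img->nx, img->ny};
    llava_uhd::slice_instructions inst = llava_uhd::get_slice_instructions(ctx, original_size);
    std::vector<clip_image_u8_ptr> views = llava_uhd::slice_image(img, inst);
    // views[0] is the downscaled overview; views[1..] are the slices laid out row by row on the grid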
2591
+
2592
+ private:
2593
+ static clip_image_size get_best_resize(const clip_image_size & original_size, int scale_resolution, int patch_size, bool allow_upscale = false) {
2594
+ int width = original_size.width;
2595
+ int height = original_size.height;
2596
+ if ((width * height > scale_resolution * scale_resolution) || allow_upscale) {
2597
+ float r = static_cast<float>(width) / height;
2598
+ height = static_cast<int>(scale_resolution / std::sqrt(r));
2599
+ width = static_cast<int>(height * r);
2600
+ }
2601
+ clip_image_size res;
2602
+ res.width = ensure_divide(width, patch_size);
2603
+ res.height = ensure_divide(height, patch_size);
2604
+ return res;
2605
+ }
2606
+
2607
+ /**
2608
+ * Selects the best resolution from a list of possible resolutions based on the original size.
2609
+ *
2610
+ * @param original_size The original size of the image
2611
+ * @param possible_resolutions A list of possible resolutions
2612
+ * @return The best fit resolution
2613
+ */
2614
+ static clip_image_size select_best_resolution(const clip_image_size & original_size, const std::vector<clip_image_size> & possible_resolutions) {
2615
+ int original_width = original_size.width;
2616
+ int original_height = original_size.height;
2617
+ clip_image_size best_fit;
2618
+ int max_effective_resolution = 0;
2619
+ int min_wasted_resolution = std::numeric_limits<int>::max();
2620
+
2621
+ for (const auto & resolution : possible_resolutions) {
2622
+ int width = resolution.width;
2623
+ int height = resolution.height;
2624
+ float scale = std::min(static_cast<float>(width) / original_width, static_cast<float>(height) / original_height);
2625
+ int downscaled_width = static_cast<int>(original_width * scale);
2626
+ int downscaled_height = static_cast<int>(original_height * scale);
2627
+ int effective_resolution = std::min(downscaled_width * downscaled_height, original_width * original_height);
2628
+ int wasted_resolution = (width * height) - effective_resolution;
2629
+ // LOG_INF("resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution);
2630
+ if (effective_resolution > max_effective_resolution || (effective_resolution == max_effective_resolution && wasted_resolution < min_wasted_resolution)) {
2631
+ max_effective_resolution = effective_resolution;
2632
+ min_wasted_resolution = wasted_resolution;
2633
+ best_fit = resolution;
2263
2634
  }
2264
- ++m;
2265
2635
  }
2636
+
2637
+ return best_fit;
2266
2638
  }
2267
2639
 
2268
- std::pair<int, int> best_grid{1, 1};
2269
- float min_error = std::numeric_limits<float>::infinity();
2270
- for (const auto& grid : candidate_grids) {
2271
- float error = std::abs(log_ratio - std::log(1.0 * grid.first / grid.second));
2272
- if (error < min_error) {
2273
- best_grid = grid;
2274
- min_error = error;
2640
+ // used by llava 1.6 with custom list of pinpoints
2641
+ static clip_image_size select_best_resolution(const std::vector<int32_t> & pinpoints, const clip_image_size & original_size) {
2642
+ std::vector<clip_image_size> possible_resolutions;
2643
+ for (size_t i = 0; i < pinpoints.size(); i += 2) {
2644
+ possible_resolutions.push_back(clip_image_size{pinpoints[i], pinpoints[i+1]});
2275
2645
  }
2646
+ return select_best_resolution(original_size, possible_resolutions);
2276
2647
  }
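To make the selection logic above concrete, here is a small standalone sketch (illustrative only, not part of this patch) of the same best-fit scoring: each candidate resolution is ranked by the effective resolution it preserves after downscaling, with wasted canvas area as the tie-breaker. The `Size` struct and `pick_best_fit` name are stand-ins for `clip_image_size` and `select_best_resolution`.

    #include <algorithm>
    #include <climits>
    #include <vector>

    struct Size { int width, height; };

    // Score each candidate by the effective resolution it preserves after
    // downscaling; on ties, prefer the one that wastes the least canvas area.
    // Assumes a non-empty candidate list.
    static Size pick_best_fit(Size orig, const std::vector<Size> & candidates) {
        Size best = candidates.front();
        int max_effective = 0;
        int min_wasted = INT_MAX;
        for (const Size & c : candidates) {
            float scale = std::min((float)c.width / orig.width, (float)c.height / orig.height);
            int eff = std::min((int)(orig.width * scale) * (int)(orig.height * scale),
                               orig.width * orig.height);
            int wasted = c.width * c.height - eff;
            if (eff > max_effective || (eff == max_effective && wasted < min_wasted)) {
                max_effective = eff;
                min_wasted    = wasted;
                best          = c;
            }
        }
        return best;
    }
    // For an 800x600 image, {672x672, 336x1008} resolves to 672x672: it can hold a
    // 672x504 downscale (338,688 px) while 336x1008 only holds 336x252 (84,672 px).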
2277
- return best_grid;
2278
- }
2279
2648
 
2280
- // inspired from LLaVA-UHD:
2281
- // -> https://arxiv.org/pdf/2403.11703
2282
- // -> https://github.com/thunlp/LLaVA-UHD
2283
- // -> https://github.com/thunlp/LLaVA-UHD/blob/302301bc2175f7e717fb8548516188e89f649753/llava_uhd/train/llava-uhd/slice_logic.py#L118
2284
- static std::vector<std::vector<clip_image_u8 *>> uhd_slice_image(const clip_image_u8 * img, const int max_slice_nums=9, const int scale_resolution=448, const int patch_size=14) {
2285
- const std::pair<int, int> original_size={img->nx,img->ny};
2286
- const int original_width = img->nx;
2287
- const int original_height = img->ny;
2288
- const float log_ratio = log(1.0*original_width/original_height);
2289
- const float ratio = 1.0 * original_width * original_height/ (scale_resolution * scale_resolution);
2290
- const int multiple = fmin(ceil(ratio), max_slice_nums);
2291
-
2292
- std::vector<std::vector<clip_image_u8 *>> images;
2293
- LOG_INF("%s: multiple %d\n", __func__, multiple);
2294
- images.push_back(std::vector<clip_image_u8 *>());
2295
-
2296
- if (multiple <= 1) {
2297
- auto best_size = uhd_find_best_resize(original_size, scale_resolution, patch_size, true);
2298
- clip_image_u8 * source_image = clip_image_u8_init();
2299
- bicubic_resize(*img, *source_image, best_size.first, best_size.second);
2300
- // source_image = image.resize(best_size, Image.Resampling.BICUBIC)
2301
- images[images.size()-1].push_back(source_image);
2302
- }
2303
- else if (multiple > 1) {
2304
- auto best_size = uhd_find_best_resize(original_size, scale_resolution, patch_size);
2305
- clip_image_u8 * source_image = clip_image_u8_init();
2306
- bicubic_resize(*img, *source_image, best_size.first, best_size.second);
2307
- // source_image = image.copy().resize(best_resize, Image.Resampling.BICUBIC)
2308
- LOG_INF("%s: image_size: %d %d; source_image size: %d %d\n", __func__, img->nx, img->ny, best_size.first, best_size.second);
2309
- images[images.size()-1].push_back(source_image);
2310
-
2311
- std::pair<int, int> best_grid = uhd_best_grid(max_slice_nums, multiple, log_ratio);
2312
- LOG_INF("%s: image_size: %d %d; best_grid: %d %d\n", __func__, img->nx, img->ny, best_grid.first, best_grid.second);
2313
-
2314
- auto refine_size = uhd_get_refine_size(original_size, best_grid, scale_resolution, patch_size, true);
2315
- clip_image_u8 * refine_image = clip_image_u8_init();
2316
- bicubic_resize(*img, *refine_image, refine_size.first, refine_size.second);
2317
-
2318
- LOG_INF("%s: refine_image_size: %d %d; refine_size: %d %d\n", __func__, refine_image->nx, refine_image->ny, refine_size.first, refine_size.second);
2319
-
2320
- // split_to_patches
2321
- int width = refine_image->nx;
2322
- int height = refine_image->ny;
2323
- int grid_x = int(width / best_grid.first);
2324
- int grid_y = int(height / best_grid.second);
2325
- for (int patches_i = 0, ic = 0; patches_i < height && ic < best_grid.second; patches_i += grid_y, ic += 1){
2326
- images.push_back(std::vector<clip_image_u8 *>());
2327
- for(int patches_j = 0, jc = 0; patches_j < width && jc < best_grid.first; patches_j += grid_x, jc += 1){
2328
- clip_image_u8 * patch = clip_image_u8_init();
2329
- patch->nx = grid_x;
2330
- patch->ny = grid_y;
2331
- patch->buf.resize(3 * patch->nx * patch->ny);
2332
- for (int y = patches_i; y < patches_i + grid_y; ++y) {
2333
- for (int x = patches_j; x < patches_j + grid_x; ++x) {
2334
- const int i = 3 * (y * refine_image->nx + x);
2335
- const int j = 3 * ((y-patches_i) * patch->nx + (x-patches_j));
2336
- patch->buf[j] = refine_image->buf[i];
2337
- patch->buf[j+1] = refine_image->buf[i+1];
2338
- patch->buf[j+2] = refine_image->buf[i+2];
2339
- }
2649
+ static int ensure_divide(int length, int patch_size) {
2650
+ return std::max(static_cast<int>(std::round(static_cast<float>(length) / patch_size) * patch_size), patch_size);
2651
+ }
2652
+
2653
+ static clip_image_size get_refine_size(const clip_image_size & original_size, const clip_image_size & grid, int scale_resolution, int patch_size, bool allow_upscale = false) {
2654
+ int width = original_size.width;
2655
+ int height = original_size.height;
2656
+ int grid_x = grid.width;
2657
+ int grid_y = grid.height;
2658
+
2659
+ int refine_width = ensure_divide(width, grid_x);
2660
+ int refine_height = ensure_divide(height, grid_y);
2661
+
2662
+ clip_image_size grid_size;
2663
+ grid_size.width = refine_width / grid_x;
2664
+ grid_size.height = refine_height / grid_y;
2665
+
2666
+ auto best_grid_size = get_best_resize(grid_size, scale_resolution, patch_size, allow_upscale);
2667
+ int best_grid_width = best_grid_size.width;
2668
+ int best_grid_height = best_grid_size.height;
2669
+
2670
+ clip_image_size refine_size;
2671
+ refine_size.width = best_grid_width * grid_x;
2672
+ refine_size.height = best_grid_height * grid_y;
2673
+ return refine_size;
2674
+ }
2675
+
2676
+ static clip_image_size get_best_grid(const int max_slice_nums, const int multiple, const float log_ratio) {
2677
+ std::vector<int> candidate_split_grids_nums;
2678
+ for (int i : {multiple - 1, multiple, multiple + 1}) {
2679
+ if (i == 1 || i > max_slice_nums) {
2680
+ continue;
2681
+ }
2682
+ candidate_split_grids_nums.push_back(i);
2683
+ }
2684
+
2685
+ std::vector<clip_image_size> candidate_grids;
2686
+ for (int split_grids_nums : candidate_split_grids_nums) {
2687
+ int m = 1;
2688
+ while (m <= split_grids_nums) {
2689
+ if (split_grids_nums % m == 0) {
2690
+ candidate_grids.push_back(clip_image_size{m, split_grids_nums / m});
2340
2691
  }
2341
- images[images.size()-1].push_back(patch);
2692
+ ++m;
2342
2693
  }
2343
2694
  }
2344
- clip_image_u8_free(refine_image);
2695
+
2696
+ clip_image_size best_grid{1, 1};
2697
+ float min_error = std::numeric_limits<float>::infinity();
2698
+ for (const auto& grid : candidate_grids) {
2699
+ float error = std::abs(log_ratio - std::log(1.0 * grid.width / grid.height));
2700
+ if (error < min_error) {
2701
+ best_grid = grid;
2702
+ min_error = error;
2703
+ }
2704
+ }
2705
+ return best_grid;
2345
2706
  }
2346
- return images;
2347
- }
2707
+ };
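The grid search in `get_best_grid` above can be hard to follow inside the diff; the following standalone sketch (illustrative only, not part of the patch) restates it: enumerate the factor pairs of each candidate slice count and keep the pair whose aspect ratio is closest, in log space, to the image's aspect ratio.

    #include <cmath>
    #include <limits>
    #include <utility>
    #include <vector>

    // Return the (cols, rows) grid whose aspect ratio is closest, in log space,
    // to log_ratio = log(image_width / image_height).
    static std::pair<int, int> pick_grid(int max_slice_nums, int multiple, float log_ratio) {
        std::vector<std::pair<int, int>> grids;
        for (int n : {multiple - 1, multiple, multiple + 1}) {
            if (n <= 1 || n > max_slice_nums) {
                continue; // skip degenerate counts and anything above the slice limit
            }
            for (int m = 1; m <= n; ++m) {
                if (n % m == 0) {
                    grids.emplace_back(m, n / m); // every factor pair of n
                }
            }
        }
        std::pair<int, int> best{1, 1};
        float min_err = std::numeric_limits<float>::infinity();
        for (const auto & g : grids) {
            float err = std::fabs(log_ratio - std::log((float)g.first / g.second));
            if (err < min_err) {
                min_err = err;
                best    = g;
            }
        }
        return best;
    }
    // A 1344x896 image has ratio 1.5 and multiple 6, so the candidates include
    // 1x6, 2x3, 3x2 and 6x1; 3x2 wins because log(3/2) is closest to log(1.5).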
2348
2708
 
2709
+ // TODO @ngxson : deprecate the load_image_size singleton pattern
2349
2710
  int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip) {
2350
- const int max_slice_nums=9;
2351
- const int scale_resolution=448;
2352
- const int original_width = ctx_clip->load_image_size->width;
2353
- const int original_height = ctx_clip->load_image_size->height;
2354
- const float log_ratio = log(1.0*original_width/original_height);
2355
- const float ratio = 1.0 * original_width * original_height/ (scale_resolution * scale_resolution);
2356
- const int multiple = fmin(ceil(ratio), max_slice_nums);
2357
- std::pair<int, int> best_grid = uhd_best_grid(max_slice_nums, multiple, log_ratio);
2358
- return best_grid.first;
2711
+ const auto inst = llava_uhd::get_slice_instructions(ctx_clip, ctx_clip->load_image_size);
2712
+ return inst.grid_size.width;
2359
2713
  }
2360
2714
 
2361
2715
  // returns the normalized float tensor for llava-1.5, for spatial_unpad with anyres processing for llava-1.6 it returns the normalized image patch tensors as a vector
2362
2716
  // res_imgs memory is being allocated here, previous allocations will be freed if found
2363
- bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32_batch * res_imgs) {
2717
+ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, struct clip_image_f32_batch * res_imgs) {
2718
+ clip_image_size original_size{img->nx, img->ny};
2719
+ bool pad_to_square = true;
2720
+ auto & params = ctx->vision_model.hparams;
2721
+ // The model config actually contains all we need to decide on how to preprocess; here we automatically switch to the new llava-1.6 preprocessing
2722
+ if (params.mm_patch_merge_type == PATCH_MERGE_SPATIAL_UNPAD) {
2723
+ pad_to_square = false;
2724
+ }
2725
+
2726
+ if (clip_is_minicpmv(ctx)) {
2727
+ auto const inst = llava_uhd::get_slice_instructions(ctx, original_size);
2728
+ std::vector<clip_image_u8_ptr> imgs = llava_uhd::slice_image(img, inst);
2364
2729
 
2365
- if(clip_is_minicpmv(ctx)){
2366
- int max_slice_nums = 9;
2367
- std::vector<std::vector<clip_image_u8 *>> imgs = uhd_slice_image(img, max_slice_nums);
2368
- res_imgs->size = 0;
2369
- for (size_t i = 0; i < imgs.size(); ++i){
2370
- res_imgs->size += imgs[i].size();
2371
- }
2372
- res_imgs->data = new clip_image_f32[res_imgs->size];
2373
- int idx = 0;
2374
- for (size_t i = 0; i < imgs.size(); ++i) {
2375
- for (size_t j = 0; j < imgs[i].size(); ++j) {
2376
- LOG_DBG("%s: %d %d\n", __func__,imgs[i][j]->nx,imgs[i][j]->ny);
2377
- clip_image_f32 * res = clip_image_f32_init();
2378
- normalize_image_u8_to_f32(imgs[i][j], res, ctx->image_mean, ctx->image_std);
2379
- res_imgs->data[idx++] = *res;
2380
- clip_image_f32_free(res);
2381
- }
2382
- }
2383
2730
  for (size_t i = 0; i < imgs.size(); ++i) {
2384
- for (size_t j = 0; j < imgs[i].size(); ++j) {
2385
- if (imgs[i][j] != nullptr) {
2386
- clip_image_u8_free(imgs[i][j]);
2387
- }
2388
- }
2731
+ // clip_image_save_to_bmp(*imgs[i], "slice_" + std::to_string(i) + ".bmp");
2732
+ clip_image_f32_ptr res(clip_image_f32_init());
2733
+ normalize_image_u8_to_f32(*imgs[i], *res, ctx->image_mean, ctx->image_std);
2734
+ res_imgs->entries.push_back(std::move(res));
2389
2735
  }
2390
2736
  return true;
2391
2737
  }
2392
- else if (ctx->has_qwen2vl_merger) {
2393
- clip_image_u8 * resized = clip_image_u8_init();
2394
- auto patch_size = clip_patch_size(ctx) * 2;
2738
+ else if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type == PROJECTOR_TYPE_QWEN25VL) {
2739
+ clip_image_u8 resized;
2740
+ auto patch_size = clip_get_patch_size(ctx) * 2;
2395
2741
  int nx = ceil((float)img->nx / patch_size) * patch_size;
2396
2742
  int ny = ceil((float)img->ny / patch_size) * patch_size;
2397
- bicubic_resize(*img, *resized, nx, ny);
2743
+ image_manipulation::bicubic_resize(*img, resized, nx, ny);
2398
2744
 
2399
- res_imgs->data = new clip_image_f32[1];
2400
- // clip_image_f32 * res = clip_image_f32_init();
2401
- normalize_image_u8_to_f32(resized, res_imgs->data, ctx->image_mean, ctx->image_std);
2745
+ clip_image_f32_ptr img_f32(clip_image_f32_init());
2746
+ // clip_image_f32_ptr res(clip_image_f32_init());
2747
+ normalize_image_u8_to_f32(resized, *img_f32, ctx->image_mean, ctx->image_std);
2402
2748
  // res_imgs->data[0] = *res;
2403
- res_imgs->size = 1;
2404
-
2405
- // clip_image_f32_free(res);
2406
- clip_image_u8_free(resized);
2749
+ res_imgs->entries.push_back(std::move(img_f32));
2407
2750
  return true;
2408
2751
  }
2409
-
2410
- if (ctx->has_glm_projector || ctx->proj_type == PROJECTOR_TYPE_GEMMA3) {
2411
- res_imgs->size = 1;
2412
- res_imgs->data = new clip_image_f32[res_imgs->size];
2752
+ else if (ctx->proj_type == PROJECTOR_TYPE_GLM_EDGE
2753
+ || ctx->proj_type == PROJECTOR_TYPE_GEMMA3
2754
+ || ctx->proj_type == PROJECTOR_TYPE_IDEFICS3) {
2413
2755
  clip_image_u8 resized_image;
2414
- int32_t sz=ctx->vision_model.hparams.image_size;
2415
- bicubic_resize(*img, resized_image,sz,sz);
2416
- clip_image_f32 * res = clip_image_f32_init();
2756
+ int sz = params.image_size;
2757
+ image_manipulation::resize_and_pad_image(*img, resized_image, {sz, sz});
2758
+ clip_image_f32_ptr img_f32(clip_image_f32_init());
2417
2759
  //clip_image_save_to_bmp(resized_image, "resized.bmp");
2418
- normalize_image_u8_to_f32(&resized_image, res, ctx->image_mean, ctx->image_std);
2419
- res_imgs->data[0] = *res;
2420
- clip_image_f32_free(res);
2760
+ normalize_image_u8_to_f32(resized_image, *img_f32, ctx->image_mean, ctx->image_std);
2761
+ res_imgs->entries.push_back(std::move(img_f32));
2421
2762
  return true;
2422
2763
  }
2423
-
2424
- bool pad_to_square = true;
2425
- if (!ctx->has_vision_encoder) {
2426
- LOG_ERR("This gguf file seems to have no vision encoder\n");
2427
- return false;
2428
- }
2429
- auto & params = ctx->vision_model.hparams;
2430
- // The model config actually contains all we need to decide on how to preprocess, here we automatically switch to the new llava-1.6 preprocessing
2431
- if (strcmp(params.mm_patch_merge_type, "spatial_unpad") == 0) {
2432
- pad_to_square = false;
2433
- }
2434
- // free the previous res_imgs if any set
2435
- if (res_imgs->size > 0) {
2436
- clip_image_f32_batch_free(res_imgs);
2764
+ else if (ctx->proj_type == PROJECTOR_TYPE_PIXTRAL) {
2765
+ clip_image_u8 resized_image;
2766
+ auto new_size = image_manipulation::calc_size_preserved_ratio(original_size, params.patch_size, params.image_size);
2767
+ image_manipulation::bilinear_resize(*img, resized_image, new_size.width, new_size.height);
2768
+ clip_image_f32_ptr img_f32(clip_image_f32_init());
2769
+ normalize_image_u8_to_f32(resized_image, *img_f32, ctx->image_mean, ctx->image_std);
2770
+ res_imgs->entries.push_back(std::move(img_f32));
2771
+ return true;
2437
2772
  }
2438
- res_imgs->data = nullptr;
2439
- res_imgs->size = 0;
2440
2773
 
2441
2774
  // the logic below is to pad the shorter side to the longer side with a background color: rgb(122, 116, 104)
2442
2775
  // see https://github.com/haotian-liu/LLaVA/blob/e854a2bf85118c504f6f16bf5c3c7c92f8fa8c6b/llava/conversation.py#L113-L156
2443
2776
 
2444
- clip_image_u8 * temp = clip_image_u8_init(); // we will keep the input image data here temporarily
2445
- if (pad_to_square && img->nx != img->ny) {
2446
- int longer_side = std::max(img->nx, img->ny);
2777
+ clip_image_u8_ptr temp(clip_image_u8_init()); // we will keep the input image data here temporarily
2778
+
2779
+ if (pad_to_square) {
2780
+ // for llava-1.5, we resize image to a square, and pad the shorter side with a background color
2781
+ // see https://github.com/haotian-liu/LLaVA/blob/e854a2bf85118c504f6f16bf5c3c7c92f8fa8c6b/llava/conversation.py#L113-L156
2782
+ const int longer_side = std::max(img->nx, img->ny);
2447
2783
  temp->nx = longer_side;
2448
2784
  temp->ny = longer_side;
2449
2785
  temp->buf.resize(3 * longer_side * longer_side);
2450
- const uint8_t bc[3] = {122, 116, 104}; // background color in RGB from LLaVA (this is the mean rgb color * 255)
2451
2786
 
2452
- // fill with background color
2453
- for (size_t i = 0; i < temp->buf.size(); i++) {
2454
- temp->buf[i] = bc[i % 3];
2455
- }
2787
+ // background color in RGB from LLaVA (this is the mean rgb color * 255)
2788
+ const std::array<uint8_t, 3> pad_color = {122, 116, 104};
2456
2789
 
2457
- // copy from the input image
2458
- for (int y = 0; y < img->ny; y++) {
2459
- for (int x = 0; x < img->nx; x++) {
2460
- const int i = 3 * (y * img->nx + x);
2461
- const int j = 3 * (y * temp->nx + x);
2462
- temp->buf[j] = img->buf[i];
2463
- temp->buf[j+1] = img->buf[i+1];
2464
- temp->buf[j+2] = img->buf[i+2];
2465
- }
2466
- }
2467
- } else {
2468
- if (!params.image_grid_pinpoints.empty()) {
2469
- // "spatial_unpad" with "anyres" processing for llava-1.6
2470
- std::vector<std::pair<int, int>> possible_resolutions;
2471
- for (size_t i = 0; i < params.image_grid_pinpoints.size(); i+=2) {
2472
- possible_resolutions.push_back({params.image_grid_pinpoints[i], params.image_grid_pinpoints[i+1]});
2473
- }
2474
- std::pair<int, int> best_resolution = select_best_resolution({img->nx, img->ny}, possible_resolutions);
2475
- // clip_image_save_to_bmp(*img, "input.bmp");
2476
- resize_and_pad_image(*img, *temp, best_resolution); // we do not pad with mean-bg color anymore in llava-1.6
2477
- // clip_image_save_to_bmp(*temp, "resized.bmp");
2478
- // visually verify normalized image:
2479
- // normalize_image_u8_to_f32(*temp, *res, ctx->image_mean, ctx->image_std);
2480
- // {
2481
- // clip_image_u8 * temp2 = clip_image_u8_init();
2482
- // clip_image_convert_f32_to_u8(*res, *temp2);
2483
- // clip_image_save_to_bmp(*temp2, "resized_normalized_f32.bmp");
2484
- // clip_image_u8_free(temp2);
2485
- // }
2486
-
2487
- std::vector<clip_image_u8 *> patches = divide_to_patches_u8(*temp, params.image_size); // prepare spatial sorted main patches of image_size each (336 in llava-1.6)
2488
-
2489
- clip_image_u8 *image_original_resize = clip_image_u8_init();
2490
- // bilinear_resize(*img, *image_original_resize, params.image_size, params.image_size); // in python this is "shortest_edge", but all CLIP are square
2491
- bicubic_resize(*img, *image_original_resize, params.image_size, params.image_size); // in python this is "shortest_edge", but all CLIP are square
2492
- patches.insert(patches.begin(), image_original_resize);
2493
- // clip_image_f32_batch_init(patches.size());
2494
- res_imgs->size = patches.size();
2495
- res_imgs->data = new clip_image_f32[res_imgs->size];
2496
- int num=0;
2497
- for (auto& patch : patches) {
2498
- normalize_image_u8_to_f32(patch, &res_imgs->data[num], ctx->image_mean, ctx->image_std);
2499
- num++;
2500
- }
2790
+ // resize the image to the target_size
2791
+ image_manipulation::resize_and_pad_image(*img, *temp, clip_image_size{params.image_size, params.image_size}, pad_color);
2501
2792
 
2502
- for (size_t i = 0; i < patches.size(); i++) {
2503
- // LOG_DBG("patch %d: %d %d\n", i, patches[i]->nx, patches[i]->ny);
2504
- clip_image_u8_free(patches[i]);
2505
- }
2793
+ clip_image_f32_ptr res(clip_image_f32_init());
2794
+ normalize_image_u8_to_f32(*temp, *res, ctx->image_mean, ctx->image_std);
2795
+ res_imgs->entries.push_back(std::move(res));
2796
+ return true;
2506
2797
 
2507
- clip_image_u8_free(temp);
2798
+ } else if (!params.image_grid_pinpoints.empty()) {
2799
+ // "spatial_unpad" with "anyres" processing for llava-1.6
2800
+ auto const inst = llava_uhd::get_slice_instructions(ctx, original_size);
2801
+ std::vector<clip_image_u8_ptr> imgs = llava_uhd::slice_image(img, inst);
2508
2802
 
2509
- return true;
2510
- } else {
2511
- temp->nx = img->nx;
2512
- temp->ny = img->ny;
2513
- temp->buf.resize(img->buf.size());
2514
- memcpy(temp->buf.data(), img->buf.data(), temp->buf.size());
2803
+ for (size_t i = 0; i < imgs.size(); ++i) {
2804
+ // clip_image_save_to_bmp(*imgs[i], "slice_" + std::to_string(i) + ".bmp");
2805
+ clip_image_f32_ptr res(clip_image_f32_init());
2806
+ normalize_image_u8_to_f32(*imgs[i], *res, ctx->image_mean, ctx->image_std);
2807
+ res_imgs->entries.push_back(std::move(res));
2515
2808
  }
2516
- }
2517
-
2518
- const int nx = temp->nx;
2519
- const int ny = temp->ny;
2520
- // clip_image_save_to_bmp(*temp, "resized_vanilla.bmp");
2521
-
2522
- const int nx2 = ctx->vision_model.hparams.image_size;
2523
- const int ny2 = ctx->vision_model.hparams.image_size;
2524
- clip_image_f32 * res = clip_image_f32_init();
2525
- res->nx = nx2;
2526
- res->ny = ny2;
2527
- res->buf.resize(3 * nx2 * ny2);
2528
-
2529
- const float scale = std::max(nx, ny) / (float)ctx->vision_model.hparams.image_size;
2530
-
2531
- const int nx3 = int(nx / scale + 0.5f);
2532
- const int ny3 = int(ny / scale + 0.5f);
2533
-
2534
- const auto & m3 = ctx->image_mean; // {0.48145466f, 0.4578275f, 0.40821073f};
2535
- const auto & s3 = ctx->image_std; // {0.26862954f, 0.26130258f, 0.27577711f};
2536
-
2537
- for (int y = 0; y < ny3; y++) {
2538
- for (int x = 0; x < nx3; x++) {
2539
- for (int c = 0; c < 3; c++) {
2540
- // linear interpolation
2541
- const float sx = (x + 0.5f) * scale - 0.5f;
2542
- const float sy = (y + 0.5f) * scale - 0.5f;
2543
-
2544
- const int x0 = std::max(0, (int)std::floor(sx));
2545
- const int y0 = std::max(0, (int)std::floor(sy));
2546
2809
 
2547
- const int x1 = std::min(x0 + 1, nx - 1);
2548
- const int y1 = std::min(y0 + 1, ny - 1);
2549
-
2550
- const float dx = sx - x0;
2551
- const float dy = sy - y0;
2552
-
2553
- const int j00 = 3 * (y0 * nx + x0) + c;
2554
- const int j01 = 3 * (y0 * nx + x1) + c;
2555
- const int j10 = 3 * (y1 * nx + x0) + c;
2556
- const int j11 = 3 * (y1 * nx + x1) + c;
2557
-
2558
- const float v00 = temp->buf[j00];
2559
- const float v01 = temp->buf[j01];
2560
- const float v10 = temp->buf[j10];
2561
- const float v11 = temp->buf[j11];
2562
-
2563
- const float v0 = v00 * (1.0f - dx) + v01 * dx;
2564
- const float v1 = v10 * (1.0f - dx) + v11 * dx;
2565
-
2566
- const float v = v0 * (1.0f - dy) + v1 * dy;
2567
-
2568
- const uint8_t v2 = std::min(std::max(std::round(v), 0.0f), 255.0f);
2569
-
2570
- const int i = 3 * (y * nx3 + x) + c;
2810
+ return true;
2571
2811
 
2572
- res->buf[i] = ((float(v2) / 255.0f) - m3[c]) / s3[c];
2573
- }
2574
- }
2575
2812
  }
2576
- clip_image_u8_free(temp);
2577
-
2578
- // {
2579
- // clip_image_u8 * temp2 = clip_image_u8_init();
2580
- // clip_image_convert_f32_to_u8(*res, *temp2);
2581
- // clip_image_save_to_bmp(*temp2, "resized_normalized_f32_vanilla.bmp");
2582
- // clip_image_u8_free(temp2);
2583
- // }
2584
- // res_imgs.push_back(res);
2585
-
2586
- res_imgs->size = 1;
2587
- res_imgs->data = new clip_image_f32[res_imgs->size];
2588
- res_imgs->data[0] = *res;
2589
- clip_image_f32_free(res);
2590
2813
 
2591
- return true;
2814
+ GGML_ASSERT(false && "Unknown image preprocessing type");
2592
2815
  }
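Because `clip_image_f32_batch` now exposes an `entries` vector of smart pointers instead of the old `data`/`size` pair, callers iterate the result differently. A minimal usage sketch under that assumption (the `entries` member is visible to callers in this file; error handling and image loading are omitted):

    // ctx is a loaded clip_ctx, img a populated clip_image_u8 (RGB bytes).
    clip_image_f32_batch batch;
    if (!clip_image_preprocess(ctx, img, &batch)) {
        LOG_ERR("%s: image preprocessing failed\n", __func__);
        return false;
    }
    // One entry per preprocessed image: several slices for minicpmv / llava-1.6,
    // a single normalized image for the other projectors.
    for (const auto & entry : batch.entries) {
        LOG_INF("%s: preprocessed image %d x %d\n", __func__, entry->nx, entry->ny);
    }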
2593
2816
 
2594
2817
  ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx) {
@@ -2596,35 +2819,40 @@ ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx) {
2596
2819
  }
2597
2820
 
2598
2821
  void clip_free(clip_ctx * ctx) {
2822
+ if (ctx == nullptr) {
2823
+ return;
2824
+ }
2599
2825
  delete ctx;
2600
2826
  }
2601
2827
 
2828
+ // deprecated
2602
2829
  size_t clip_embd_nbytes(const struct clip_ctx * ctx) {
2603
- int extra_tokens = ctx->has_glm_projector ? 2 : 0;
2604
- return (clip_n_patches(ctx) + extra_tokens) * clip_n_mmproj_embd(ctx) * sizeof(float);
2830
+ const int32_t nx = ctx->vision_model.hparams.image_size;
2831
+ const int32_t ny = ctx->vision_model.hparams.image_size;
2832
+ return clip_embd_nbytes_by_img(ctx, nx, ny);
2605
2833
  }
2606
2834
 
2607
- size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_h, int img_w) {
2835
+ size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_w, int img_h) {
2608
2836
  clip_image_f32 img;
2609
2837
  img.nx = img_w;
2610
2838
  img.ny = img_h;
2611
- return clip_n_patches_by_img(ctx, &img) * clip_n_mmproj_embd(ctx) * sizeof(float);
2839
+ return clip_n_output_tokens(ctx, &img) * clip_n_mmproj_embd(ctx) * sizeof(float);
2612
2840
  }
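Note that the signature now takes width before height. A short sketch of how a caller might size its embedding buffer with the new argument order (the dimensions are illustrative only):

    // Illustrative dimensions; they come from the preprocessed image, not the raw file.
    const int img_w = 672;
    const int img_h = 448;
    std::vector<float> embd(clip_embd_nbytes_by_img(ctx, img_w, img_h) / sizeof(float));
    // i.e. clip_n_output_tokens(ctx, &img) rows of clip_n_mmproj_embd(ctx) floats.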
2613
2841
 
2614
- int32_t clip_image_size(const struct clip_ctx * ctx) {
2842
+ int32_t clip_get_image_size(const struct clip_ctx * ctx) {
2615
2843
  return ctx->vision_model.hparams.image_size;
2616
2844
  }
2617
2845
 
2618
- int32_t clip_patch_size(const struct clip_ctx * ctx) {
2846
+ int32_t clip_get_patch_size(const struct clip_ctx * ctx) {
2619
2847
  return ctx->vision_model.hparams.patch_size;
2620
2848
  }
2621
2849
 
2622
- int32_t clip_hidden_size(const struct clip_ctx * ctx) {
2850
+ int32_t clip_get_hidden_size(const struct clip_ctx * ctx) {
2623
2851
  return ctx->vision_model.hparams.hidden_size;
2624
2852
  }
2625
2853
 
2626
2854
  const char * clip_patch_merge_type(const struct clip_ctx * ctx) {
2627
- return ctx->vision_model.hparams.mm_patch_merge_type;
2855
+ return ctx->vision_model.hparams.mm_patch_merge_type == PATCH_MERGE_SPATIAL_UNPAD ? "spatial_unpad" : "flat";
2628
2856
  }
2629
2857
 
2630
2858
  const int32_t * clip_image_grid(const struct clip_ctx * ctx) {
@@ -2638,21 +2866,44 @@ size_t get_clip_image_grid_size(const struct clip_ctx * ctx) {
2638
2866
  return ctx->vision_model.hparams.image_grid_pinpoints.size();
2639
2867
  }
2640
2868
 
2869
+ // deprecated
2641
2870
  int clip_n_patches(const struct clip_ctx * ctx) {
2642
2871
  clip_image_f32 img;
2643
2872
  img.nx = ctx->vision_model.hparams.image_size;
2644
2873
  img.ny = ctx->vision_model.hparams.image_size;
2645
- return clip_n_patches_by_img(ctx, &img);
2874
+ return clip_n_output_tokens(ctx, &img);
2646
2875
  }
2647
2876
 
2877
+ // deprecated
2648
2878
  int clip_n_patches_by_img(const struct clip_ctx * ctx, struct clip_image_f32 * img) {
2879
+ return clip_n_output_tokens(ctx, img);
2880
+ }
2881
+
2882
+ int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 * img) {
2883
+ const auto & params = ctx->vision_model.hparams;
2884
+ const int n_total = clip_n_output_tokens(ctx, img);
2885
+ if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type == PROJECTOR_TYPE_QWEN25VL) {
2886
+ return img->nx / (params.patch_size * 2) + (int)(img->nx % params.patch_size > 0);
2887
+ }
2888
+ return n_total;
2889
+ }
2890
+
2891
+ int clip_n_output_tokens_y(const struct clip_ctx * ctx, struct clip_image_f32 * img) {
2892
+ const auto & params = ctx->vision_model.hparams;
2893
+ if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type == PROJECTOR_TYPE_QWEN25VL) {
2894
+ return img->ny / (params.patch_size * 2) + (int)(img->ny % params.patch_size > 0);
2895
+ }
2896
+ return 1;
2897
+ }
2898
+
2899
+ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * img) {
2649
2900
  const auto & params = ctx->vision_model.hparams;
2650
2901
 
2651
2902
  int n_patches = (params.image_size / params.patch_size) * (params.image_size / params.patch_size);
2652
2903
 
2653
2904
  if (ctx->proj_type == PROJECTOR_TYPE_LDP || ctx->proj_type == PROJECTOR_TYPE_LDPV2 || ctx->proj_type == PROJECTOR_TYPE_GLM_EDGE) {
2654
2905
  n_patches /= 4;
2655
- } else if (ctx->proj_type == PROJECTOR_TYPE_RESAMPLER) {
2906
+ } else if (ctx->proj_type == PROJECTOR_TYPE_MINICPMV) {
2656
2907
  if (ctx->minicpmv_version == 2) {
2657
2908
  n_patches = 96;
2658
2909
  }
@@ -2662,11 +2913,22 @@ int clip_n_patches_by_img(const struct clip_ctx * ctx, struct clip_image_f32 * i
2662
2913
  else if (ctx->minicpmv_version == 4) {
2663
2914
  n_patches = 64;
2664
2915
  }
2665
- } else if (ctx->proj_type == PROJECTOR_TYPE_MERGER) {
2916
+ else {
2917
+ GGML_ABORT("Unknown minicpmv version");
2918
+ }
2919
+ } else if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type == PROJECTOR_TYPE_QWEN25VL) {
2666
2920
  int patch_size = params.patch_size * 2;
2667
2921
  int x_patch = img->nx / patch_size + (int)(img->nx % patch_size > 0);
2668
2922
  int y_patch = img->ny / patch_size + (int)(img->ny % patch_size > 0);
2669
2923
  n_patches = x_patch * y_patch;
2924
+ } else if (ctx->proj_type == PROJECTOR_TYPE_GEMMA3) {
2925
+ n_patches = 256;
2926
+ } else if (ctx->proj_type == PROJECTOR_TYPE_IDEFICS3) {
2927
+ n_patches /= ctx->vision_model.hparams.proj_scale_factor;
2928
+ } else if (ctx->proj_type == PROJECTOR_TYPE_PIXTRAL) {
2929
+ int n_patches_x = img->nx / params.patch_size;
2930
+ int n_patches_y = img->ny / params.patch_size;
2931
+ n_patches = n_patches_y*n_patches_x + n_patches_y - 1; // + one [IMG_BREAK] per row, except the last row
2670
2932
  }
2671
2933
 
2672
2934
  return n_patches;
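As a worked example of the PROJECTOR_TYPE_PIXTRAL branch above: with a patch size of 16, a 512x400 input yields 32 patches per row and 25 rows, so n_patches = 25*32 + 25 - 1 = 824 tokens; the extra 24 account for one [IMG_BREAK] token after every row except the last.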
@@ -2759,35 +3021,22 @@ static std::vector<std::vector<float>> get_2d_sincos_pos_embed(int embed_dim, co
2759
3021
  }
2760
3022
 
2761
3023
  bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f32 * img, float * vec) {
2762
- if (!ctx->has_vision_encoder) {
2763
- LOG_ERR("This gguf file seems to have no vision encoder\n");
2764
- return false;
2765
- }
3024
+ clip_image_f32_batch imgs;
3025
+ clip_image_f32_ptr img_copy(clip_image_f32_init());
3026
+ *img_copy = *img;
3027
+ imgs.entries.push_back(std::move(img_copy));
2766
3028
 
2767
- clip_image_f32_batch imgs{};
2768
- imgs.size = 1;
2769
- imgs.data = img;
2770
3029
  return clip_image_batch_encode(ctx, n_threads, &imgs, vec);
2771
3030
  }
2772
3031
 
2773
- bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs, float * vec) {
2774
- if (!ctx->has_vision_encoder) {
2775
- LOG_ERR("This gguf file seems to have no vision encoder\n");
2776
- return false;
2777
- }
3032
+ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs_c_ptr, float * vec) {
3033
+ const clip_image_f32_batch & imgs = *imgs_c_ptr;
3034
+ int batch_size = imgs.entries.size();
2778
3035
 
2779
- int batch_size = imgs->size;
2780
- if (ctx->has_llava_projector) {
2781
- GGML_ASSERT(batch_size == 1); // TODO: support multiple images
2782
- }
2783
- if (ctx->has_minicpmv_projector) {
2784
- GGML_ASSERT(batch_size == 1);
2785
- }
2786
- if (ctx->has_glm_projector) {
3036
+ if (ctx->has_llava_projector
3037
+ || ctx->proj_type == PROJECTOR_TYPE_MINICPMV
3038
+ || ctx->proj_type == PROJECTOR_TYPE_GLM_EDGE) {
2787
3039
  GGML_ASSERT(batch_size == 1);
2788
- ggml_tensor * boi = ctx->vision_model.boi_w;
2789
- ggml_backend_tensor_get(boi,vec,0,ggml_nbytes(boi));
2790
- vec = (float*)(vec+ggml_nelements(boi)); //offset for boi
2791
3040
  }
2792
3041
 
2793
3042
  // build the inference graph
@@ -2796,169 +3045,283 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
2796
3045
  ggml_backend_sched_alloc_graph(ctx->sched.get(), gf);
2797
3046
 
2798
3047
  // set inputs
2799
- const auto & model = ctx->vision_model;
3048
+ const auto & model = ctx->vision_model;
2800
3049
  const auto & hparams = model.hparams;
2801
3050
 
2802
- const int image_size = hparams.image_size;
2803
- int image_size_width = image_size;
2804
- int image_size_height = image_size;
2805
- if (ctx->has_minicpmv_projector | ctx->has_qwen2vl_merger) {
2806
- image_size_width = imgs->data[0].nx;
2807
- image_size_height = imgs->data[0].ny;
2808
- }
3051
+ const int image_size_width = imgs.entries[0]->nx;
3052
+ const int image_size_height = imgs.entries[0]->ny;
3053
+
2809
3054
  const int patch_size = hparams.patch_size;
2810
3055
  const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size));
2811
- const int num_positions = num_patches + (ctx->has_class_embedding ? 1 : 0);
2812
- if(ctx->load_image_size==nullptr){
2813
- ctx->load_image_size= clip_image_size_init();
2814
- }
2815
- const int pos_w = ctx->load_image_size->width/patch_size;
2816
- const int pos_h = ctx->load_image_size->height/patch_size;
3056
+ const int num_positions = num_patches + (model.class_embedding ? 1 : 0);
3057
+ const int pos_w = ctx->load_image_size.width / patch_size;
3058
+ const int pos_h = ctx->load_image_size.height / patch_size;
2817
3059
 
2818
- {
2819
- struct ggml_tensor * inp_raw = ggml_graph_get_tensor(gf, "inp_raw");
2820
- float * data = (float *)malloc(ggml_nbytes(inp_raw));
2821
-
2822
- for (size_t i = 0; i < imgs->size; i++) {
2823
- const int nx = imgs->data[i].nx;
2824
- const int ny = imgs->data[i].ny;
2825
- if (!(ctx->has_minicpmv_projector | ctx->has_qwen2vl_merger)) {
2826
- GGML_ASSERT(nx == image_size && ny == image_size);
2827
- }
3060
+ const bool use_window_attn = hparams.n_wa_pattern > 0; // for qwen2.5vl
3061
+
3062
+ auto get_inp_tensor = [&gf](const char * name) {
3063
+ struct ggml_tensor * inp = ggml_graph_get_tensor(gf, name);
3064
+ if (inp == nullptr) {
3065
+ GGML_ABORT("Failed to get tensor %s", name);
3066
+ }
3067
+ if (!(inp->flags & GGML_TENSOR_FLAG_INPUT)) {
3068
+ GGML_ABORT("Tensor %s is not an input tensor", name);
3069
+ }
3070
+ return inp;
3071
+ };
2828
3072
 
3073
+ auto set_input_f32 = [&get_inp_tensor](const char * name, std::vector<float> & values) {
3074
+ ggml_tensor * cur = get_inp_tensor(name);
3075
+ GGML_ASSERT(cur->type == GGML_TYPE_F32);
3076
+ GGML_ASSERT(ggml_nelements(cur) == (int64_t)values.size());
3077
+ ggml_backend_tensor_set(cur, values.data(), 0, ggml_nbytes(cur));
3078
+ };
3079
+
3080
+ auto set_input_i32 = [&get_inp_tensor](const char * name, std::vector<int32_t> & values) {
3081
+ ggml_tensor * cur = get_inp_tensor(name);
3082
+ GGML_ASSERT(cur->type == GGML_TYPE_I32);
3083
+ GGML_ASSERT(ggml_nelements(cur) == (int64_t)values.size());
3084
+ ggml_backend_tensor_set(cur, values.data(), 0, ggml_nbytes(cur));
3085
+ };
3086
+
3087
+ // set input pixel values
3088
+ {
3089
+ size_t nelem = 0;
3090
+ for (const auto & img : imgs.entries) {
3091
+ nelem += img->nx * img->ny * 3;
3092
+ }
3093
+ std::vector<float> inp_raw(nelem);
3094
+
3095
+ // layout of data (note: the channel dim is unrolled to better visualize the layout):
3096
+ //
3097
+ // ┌──W──┐
3098
+ // │ H │ channel = R
3099
+ // ├─────┤ │
3100
+ // │ H │ channel = G
3101
+ // ├─────┤ │
3102
+ // │ H │ channel = B
3103
+ // └─────┘ │
3104
+ // ──────┘ x B
3105
+
3106
+ for (size_t i = 0; i < imgs.entries.size(); i++) {
3107
+ const int nx = imgs.entries[i]->nx;
3108
+ const int ny = imgs.entries[i]->ny;
2829
3109
  const int n = nx * ny;
2830
3110
 
2831
3111
  for (int b = 0; b < batch_size; b++) {
2832
- for (int k = 0; k < 3; k++) {
2833
- for (int y = 0; y < ny; y++) {
2834
- for (int x = 0; x < nx; x++) {
2835
- data[(b * 3 * n) + k * n + y * nx + x] = imgs->data[b].buf[3 * (y * nx + x) + k];
2836
- }
3112
+ float * batch_entry = inp_raw.data() + b * (3*n);
3113
+ for (int y = 0; y < ny; y++) {
3114
+ for (int x = 0; x < nx; x++) {
3115
+ size_t base_src = 3*(y * nx + x); // idx of the first channel
3116
+ size_t base_dst = y * nx + x; // idx of the first channel
3117
+ batch_entry[ base_dst] = imgs.entries[b]->buf[base_src ];
3118
+ batch_entry[1*n + base_dst] = imgs.entries[b]->buf[base_src + 1];
3119
+ batch_entry[2*n + base_dst] = imgs.entries[b]->buf[base_src + 2];
2837
3120
  }
2838
3121
  }
2839
3122
  }
2840
3123
  }
2841
- ggml_backend_tensor_set(inp_raw, data, 0, ggml_nbytes(inp_raw));
2842
- free(data);
3124
+ set_input_f32("inp_raw", inp_raw);
2843
3125
  }
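The loop above converts the usual interleaved RGBRGB... byte layout into the planar float layout the graph expects (the full R plane, then G, then B, per batch entry). A tiny standalone sketch of the same index mapping for a single image (not part of the patch):

    #include <cstdint>

    // src holds interleaved RGB bytes (3 * nx * ny), dst receives the planar layout.
    static void interleaved_to_planar(const uint8_t * src, float * dst, int nx, int ny) {
        const int n = nx * ny;
        for (int y = 0; y < ny; y++) {
            for (int x = 0; x < nx; x++) {
                const int p = y * nx + x;
                dst[0 * n + p] = src[3 * p + 0]; // R plane
                dst[1 * n + p] = src[3 * p + 1]; // G plane
                dst[2 * n + p] = src[3 * p + 2]; // B plane
            }
        }
    }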
2844
- if (ctx->has_minicpmv_projector) {
2845
- {
2846
- // inspired from siglip:
2847
- // -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit
2848
- // -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit/blob/d66538faeba44480d0bfaa42145eef26f9423199/modeling_siglip.py#L316
2849
- struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions");
2850
- int* positions_data = (int*)malloc(ggml_nbytes(positions));
2851
- int bucket_coords_h[1024];
2852
- int bucket_coords_w[1024];
2853
- for (int i = 0; i < pos_h; i++){
2854
- bucket_coords_h[i] = std::floor(70.0*i/pos_h);
2855
- }
2856
- for (int i = 0; i < pos_w; i++){
2857
- bucket_coords_w[i] = std::floor(70.0*i/pos_w);
2858
- }
2859
- for (int i = 0, id = 0; i < pos_h; i++){
2860
- for (int j = 0; j < pos_w; j++){
2861
- positions_data[id++] = bucket_coords_h[i]*70 + bucket_coords_w[j];
2862
- }
2863
- }
2864
- ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions));
2865
- free(positions_data);
2866
- }
2867
-
2868
- {
2869
- // inspired from resampler of Qwen-VL:
2870
- // -> https://huggingface.co/Qwen/Qwen-VL/tree/main
2871
- // -> https://huggingface.co/Qwen/Qwen-VL/blob/0547ed36a86561e2e42fecec8fd0c4f6953e33c4/visual.py#L23
2872
- struct ggml_tensor * pos_embed = ggml_graph_get_tensor(gf, "pos_embed");
2873
- int embed_dim = 4096;
2874
- if (ctx->minicpmv_version == 2) {
2875
- embed_dim = 4096;
2876
- }
2877
- else if (ctx->minicpmv_version == 3) {
2878
- embed_dim = 3584;
2879
- }
2880
- else if (ctx->minicpmv_version == 4) {
2881
- embed_dim = 3584;
2882
- }
2883
- auto pos_embed_t = get_2d_sincos_pos_embed(embed_dim, std::make_pair(pos_w, pos_h));
2884
3126
 
2885
- float * pos_embed_data = (float *)malloc(ggml_nbytes(pos_embed));
2886
- for(int i=0;i < pos_w * pos_h; ++i){
2887
- for(int j=0; j < embed_dim; ++j){
2888
- pos_embed_data[i * embed_dim + j] = pos_embed_t[i][j];
3127
+ // set input per projector
3128
+ switch (ctx->proj_type) {
3129
+ case PROJECTOR_TYPE_MINICPMV:
3130
+ {
3131
+ // inspired from siglip:
3132
+ // -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit
3133
+ // -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit/blob/d66538faeba44480d0bfaa42145eef26f9423199/modeling_siglip.py#L316
3134
+ std::vector<int32_t> positions(pos_h * pos_w);
3135
+ int bucket_coords_h[1024];
3136
+ int bucket_coords_w[1024];
3137
+ for (int i = 0; i < pos_h; i++){
3138
+ bucket_coords_h[i] = std::floor(70.0*i/pos_h);
2889
3139
  }
2890
- }
3140
+ for (int i = 0; i < pos_w; i++){
3141
+ bucket_coords_w[i] = std::floor(70.0*i/pos_w);
3142
+ }
3143
+ for (int i = 0, id = 0; i < pos_h; i++){
3144
+ for (int j = 0; j < pos_w; j++){
3145
+ positions[id++] = bucket_coords_h[i]*70 + bucket_coords_w[j];
3146
+ }
3147
+ }
3148
+ set_input_i32("positions", positions);
2891
3149
 
2892
- ggml_backend_tensor_set(pos_embed, pos_embed_data, 0, ggml_nbytes(pos_embed));
2893
- free(pos_embed_data);
2894
- }
2895
- }
2896
- else{
2897
- {
2898
- if (ctx->has_class_embedding) {
2899
- struct ggml_tensor * embeddings = ggml_graph_get_tensor(gf, "embeddings");
3150
+ // inspired from resampler of Qwen-VL:
3151
+ // -> https://huggingface.co/Qwen/Qwen-VL/tree/main
3152
+ // -> https://huggingface.co/Qwen/Qwen-VL/blob/0547ed36a86561e2e42fecec8fd0c4f6953e33c4/visual.py#L23
3153
+ int embed_dim = clip_n_mmproj_embd(ctx);
2900
3154
 
2901
- void* zero_mem = malloc(ggml_nbytes(embeddings));
2902
- memset(zero_mem, 0, ggml_nbytes(embeddings));
2903
- ggml_backend_tensor_set(embeddings, zero_mem, 0, ggml_nbytes(embeddings));
2904
- free(zero_mem);
2905
- }
2906
- }
3155
+ // TODO @ngxson : this is very inefficient, can we do this using ggml_sin and ggml_cos?
3156
+ auto pos_embed_t = get_2d_sincos_pos_embed(embed_dim, std::make_pair(pos_w, pos_h));
2907
3157
 
2908
- if (ctx->has_qwen2vl_merger) {
2909
- struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions");
3158
+ std::vector<float> pos_embed(embed_dim * pos_w * pos_h);
3159
+ for(int i = 0; i < pos_w * pos_h; ++i){
3160
+ for(int j = 0; j < embed_dim; ++j){
3161
+ pos_embed[i * embed_dim + j] = pos_embed_t[i][j];
3162
+ }
3163
+ }
2910
3164
 
2911
- const int pw = image_size_width / patch_size;
2912
- const int ph = image_size_height / patch_size;
2913
- int* positions_data = (int*)malloc(ggml_nbytes(positions));
3165
+ set_input_f32("pos_embed", pos_embed);
3166
+ } break;
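The bucket coordinates in the minicpmv branch above quantize each patch position onto a fixed 70x70 grid, so the position id of patch (i, j) is floor(70*i/pos_h)*70 + floor(70*j/pos_w). As a worked example, with pos_h = pos_w = 32, patch (10, 3) maps to floor(70*10/32)*70 + floor(70*3/32) = 21*70 + 6 = 1476.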
3167
+ case PROJECTOR_TYPE_QWEN2VL:
3168
+ {
3169
+ const int merge_ratio = 2;
3170
+ const int pw = image_size_width / patch_size;
3171
+ const int ph = image_size_height / patch_size;
3172
+ std::vector<int> positions(num_positions * 4);
3173
+ int ptr = 0;
3174
+ for (int y = 0; y < ph; y += merge_ratio) {
3175
+ for (int x = 0; x < pw; x += merge_ratio) {
3176
+ for (int dy = 0; dy < 2; dy++) {
3177
+ for (int dx = 0; dx < 2; dx++) {
3178
+ positions[ ptr] = y + dy;
3179
+ positions[ num_patches + ptr] = x + dx;
3180
+ positions[2 * num_patches + ptr] = y + dy;
3181
+ positions[3 * num_patches + ptr] = x + dx;
3182
+ ptr++;
3183
+ }
3184
+ }
3185
+ }
3186
+ }
2914
3187
 
2915
- int ptr = 0;
2916
- for (int y = 0; y < ph; y+=2)
3188
+ set_input_i32("positions", positions);
3189
+ } break;
3190
+ case PROJECTOR_TYPE_QWEN25VL:
2917
3191
  {
2918
- for (int x = 0; x < pw; x+=2)
2919
- {
2920
- for (int dy = 0; dy < 2; dy++) {
2921
- for (int dx = 0; dx < 2; dx++) {
2922
- positions_data[ptr] = y + dy;
2923
- positions_data[num_patches + ptr] = x + dx;
2924
- positions_data[num_patches * 2 + ptr] = y + dy;
2925
- positions_data[num_patches * 3 + ptr] = x + dx;
2926
- ptr++;
3192
+ // pw * ph = number of tokens output by ViT after applying the patch merger
3193
+ // ipw * iph = number of vision tokens processed inside ViT
3194
+ const int merge_ratio = 2;
3195
+ const int pw = image_size_width / patch_size / merge_ratio;
3196
+ const int ph = image_size_height / patch_size / merge_ratio;
3197
+ const int ipw = image_size_width / patch_size;
3198
+ const int iph = image_size_height / patch_size;
3199
+
3200
+ std::vector<int> idx (ph * pw);
3201
+ std::vector<int> inv_idx(ph * pw);
3202
+
3203
+ if (use_window_attn) {
3204
+ const int attn_window_size = 112;
3205
+ const int grid_window = attn_window_size / patch_size / merge_ratio;
3206
+ int dst = 0;
3207
+ // [num_vision_tokens, num_vision_tokens] attention mask tensor
3208
+ std::vector<float> mask(pow(ipw * iph, 2), std::numeric_limits<float>::lowest());
3209
+ int mask_row = 0;
3210
+
3211
+ for (int y = 0; y < ph; y += grid_window) {
3212
+ for (int x = 0; x < pw; x += grid_window) {
3213
+ const int win_h = std::min(grid_window, ph - y);
3214
+ const int win_w = std::min(grid_window, pw - x);
3215
+ const int dst_0 = dst;
3216
+ // group all tokens belonging to the same window together (into a contiguous range)
3217
+ for (int dy = 0; dy < win_h; dy++) {
3218
+ for (int dx = 0; dx < win_w; dx++) {
3219
+ const int src = (y + dy) * pw + (x + dx);
3220
+ GGML_ASSERT(src < (int)idx.size());
3221
+ GGML_ASSERT(dst < (int)inv_idx.size());
3222
+ idx [src] = dst;
3223
+ inv_idx[dst] = src;
3224
+ dst++;
3225
+ }
3226
+ }
3227
+
3228
+ for (int r=0; r < win_h * win_w * merge_ratio * merge_ratio; r++) {
3229
+ int row_offset = mask_row * (ipw * iph);
3230
+ std::fill(
3231
+ mask.begin() + row_offset + (dst_0 * merge_ratio * merge_ratio),
3232
+ mask.begin() + row_offset + (dst * merge_ratio * merge_ratio),
3233
+ 0.0);
3234
+ mask_row++;
3235
+ }
2927
3236
  }
2928
3237
  }
3238
+
3239
+ set_input_i32("window_idx", idx);
3240
+ set_input_i32("inv_window_idx", inv_idx);
3241
+ set_input_f32("window_mask", mask);
3242
+ } else {
3243
+ for (int i = 0; i < ph * pw; i++) {
3244
+ idx[i] = i;
3245
+ }
2929
3246
  }
2930
- }
2931
3247
 
2932
- ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions));
2933
- free(positions_data);
2934
- }
2935
- else if (ctx->proj_type == PROJECTOR_TYPE_GEMMA3) {
2936
- // do nothing
2937
- }
2938
- else {
2939
- struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions");
3248
+ const int mpow = merge_ratio * merge_ratio;
3249
+ std::vector<int> positions(num_positions * 4);
3250
+
3251
+ int ptr = 0;
3252
+ for (int y = 0; y < iph; y += merge_ratio) {
3253
+ for (int x = 0; x < ipw; x += merge_ratio) {
3254
+ for (int dy = 0; dy < 2; dy++) {
3255
+ for (int dx = 0; dx < 2; dx++) {
3256
+ auto remap = idx[ptr / mpow];
3257
+ remap = (remap * mpow) + (ptr % mpow);
3258
+
3259
+ positions[ remap] = y + dy;
3260
+ positions[ num_patches + remap] = x + dx;
3261
+ positions[2 * num_patches + remap] = y + dy;
3262
+ positions[3 * num_patches + remap] = x + dx;
3263
+ ptr++;
3264
+ }
3265
+ }
3266
+ }
3267
+ }
2940
3268
 
2941
- int* positions_data = (int*)malloc(ggml_nbytes(positions));
3269
+ set_input_i32("positions", positions);
3270
+ } break;
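The `idx`/`inv_idx` maps above reorder the merged tokens so that all tokens of one attention window occupy a contiguous range. A small illustration (not from the patch) for a 4x4 merged-token grid with 2x2 windows, which is what the loop computes for that shape:

    // 4x4 merged-token grid, 2x2 windows (grid_window = 2):
    //
    //   src (row-major)      windows              dst order after remapping
    //    0  1  2  3          A = {0, 1, 4, 5}     idx[0]=0 idx[1]=1 idx[4]=2 idx[5]=3
    //    4  5  6  7          B = {2, 3, 6, 7}     idx[2]=4 idx[3]=5 idx[6]=6 idx[7]=7
    //    8  9 10 11          C = {8, 9,12,13}     idx[8]=8 ... and so on for C and D
    //   12 13 14 15          D = {10,11,14,15}
    //
    // inv_idx[dst] = src undoes the permutation, and the attention mask is zero
    // only inside each contiguous dst block, so tokens attend within their window.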
3271
+ case PROJECTOR_TYPE_PIXTRAL:
3272
+ {
3273
+ // set the 2D positions
3274
+ int n_patches_per_col = image_size_width / patch_size;
3275
+ std::vector<int> pos_data(num_positions);
3276
+ // dimension H
3277
+ for (int i = 0; i < num_positions; i++) {
3278
+ pos_data[i] = i / n_patches_per_col;
3279
+ }
3280
+ set_input_i32("pos_h", pos_data);
3281
+ // dimension W
3282
+ for (int i = 0; i < num_positions; i++) {
3283
+ pos_data[i] = i % n_patches_per_col;
3284
+ }
3285
+ set_input_i32("pos_w", pos_data);
3286
+ } break;
3287
+ case PROJECTOR_TYPE_GLM_EDGE:
3288
+ {
3289
+ // llava and other models
3290
+ std::vector<int32_t> positions(num_positions);
2942
3291
  for (int i = 0; i < num_positions; i++) {
2943
- positions_data[i] = i;
3292
+ positions[i] = i;
2944
3293
  }
2945
- ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions));
2946
- free(positions_data);
3294
+ set_input_i32("positions", positions);
3295
+ } break;
3296
+ case PROJECTOR_TYPE_MLP:
3297
+ case PROJECTOR_TYPE_MLP_NORM:
3298
+ case PROJECTOR_TYPE_LDP:
3299
+ case PROJECTOR_TYPE_LDPV2:
3300
+ {
3301
+ // llava and other models
3302
+ std::vector<int32_t> positions(num_positions);
3303
+ for (int i = 0; i < num_positions; i++) {
3304
+ positions[i] = i;
3305
+ }
3306
+ set_input_i32("positions", positions);
2947
3307
 
2948
- if (!ctx->has_glm_projector) {
2949
- struct ggml_tensor * patches = ggml_graph_get_tensor(gf, "patches");
2950
3308
  // The patches vector is used to get rows to index into the embeds with;
2951
3309
  // we should skip dim 0 only if we have CLS to avoid going out of bounds
2952
3310
  // when retrieving the rows.
2953
- int patch_offset = ctx->has_class_embedding ? 1 : 0;
2954
- int* patches_data = (int*)malloc(ggml_nbytes(patches));
3311
+ int patch_offset = model.class_embedding ? 1 : 0;
3312
+ std::vector<int32_t> patches(num_patches);
2955
3313
  for (int i = 0; i < num_patches; i++) {
2956
- patches_data[i] = i + patch_offset;
3314
+ patches[i] = i + patch_offset;
2957
3315
  }
2958
- ggml_backend_tensor_set(patches, patches_data, 0, ggml_nbytes(patches));
2959
- free(patches_data);
2960
- }
2961
- }
3316
+ set_input_i32("patches", patches);
3317
+ } break;
3318
+ case PROJECTOR_TYPE_GEMMA3:
3319
+ case PROJECTOR_TYPE_IDEFICS3:
3320
+ {
3321
+ // do nothing
3322
+ } break;
3323
+ default:
3324
+ GGML_ABORT("Unknown projector type");
2962
3325
  }
2963
3326
 
2964
3327
  ggml_backend_cpu_set_n_threads(ctx->backend_cpu, n_threads);
@@ -2975,13 +3338,6 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
2975
3338
  // copy the embeddings to the location passed by the user
2976
3339
  ggml_backend_tensor_get(embeddings, vec, 0, ggml_nbytes(embeddings));
2977
3340
 
2978
- if (ctx->has_glm_projector) {
2979
- //eoi
2980
- ggml_tensor * eoi = ctx->vision_model.eoi_w;
2981
- int offset = ggml_nelements(embeddings);
2982
- ggml_backend_tensor_get(eoi, vec+offset, 0, ggml_nbytes(eoi));
2983
- }
2984
-
2985
3341
  return true;
2986
3342
  }
2987
3343
 
@@ -2989,10 +3345,13 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
2989
3345
  assert(itype < GGML_TYPE_COUNT);
2990
3346
  ggml_type type = static_cast<ggml_type>(itype);
2991
3347
 
2992
- auto * ctx_clip = clip_model_load(fname_inp, 2);
3348
+ auto * ctx_clip = clip_init(fname_inp, clip_context_params{
3349
+ /* use_gpu */ false,
3350
+ /* verbosity */ GGML_LOG_LEVEL_ERROR,
3351
+ });
2993
3352
 
2994
- const auto & ctx_src = ctx_clip->ctx_gguf;
2995
- const auto & ctx_data = ctx_clip->ctx_data;
3353
+ const auto & ctx_src = ctx_clip->ctx_gguf.get();
3354
+ const auto & ctx_data = ctx_clip->ctx_data.get();
2996
3355
 
2997
3356
  auto * ctx_out = gguf_init_empty();
2998
3357
  gguf_set_kv(ctx_out, ctx_src);
@@ -3066,7 +3425,7 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
3066
3425
  f32_data = (float *)conv_buf.data();
3067
3426
  break;
3068
3427
  default:
3069
- LOG_ERR("Please use an input file in f32 or f16\n");
3428
+ LOG_ERR("%s: Please use an input file in f32 or f16\n", __func__);
3070
3429
  gguf_free(ctx_out);
3071
3430
  return false;
3072
3431
  }
@@ -3118,78 +3477,60 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
3118
3477
  }
3119
3478
 
3120
3479
  int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
3121
- if (ctx->proj_type == PROJECTOR_TYPE_LDP) {
3122
- return ctx->vision_model.mm_model_block_1_block_2_1_b->ne[0];
3123
- }
3124
- if (ctx->proj_type == PROJECTOR_TYPE_LDPV2) {
3125
- return ctx->vision_model.mm_model_peg_0_b->ne[0];
3126
- }
3127
- if (ctx->proj_type == PROJECTOR_TYPE_MLP) {
3128
- return ctx->vision_model.mm_2_b->ne[0];
3129
- }
3130
- if (ctx->proj_type == PROJECTOR_TYPE_MLP_NORM) {
3131
- return ctx->vision_model.mm_3_b->ne[0];
3132
- }
3133
- if (ctx->proj_type == PROJECTOR_TYPE_RESAMPLER) {
3134
- if (ctx->minicpmv_version == 2) {
3135
- return 4096;
3136
- }
3137
- else if (ctx->minicpmv_version == 3) {
3138
- return 3584;
3139
- }
3140
- else if (ctx->minicpmv_version == 4) {
3141
- return 3584;
3142
- }
3143
- }
3144
- if (ctx->proj_type == PROJECTOR_TYPE_GLM_EDGE){
3145
- return ctx->vision_model.mm_model_mlp_3_w->ne[1];
3146
- }
3147
- if (ctx->proj_type == PROJECTOR_TYPE_MERGER) {
3148
- return ctx->vision_model.mm_1_b->ne[0];
3149
- }
3150
- if (ctx->proj_type == PROJECTOR_TYPE_GEMMA3) {
3151
- return ctx->vision_model.mm_input_proj_w->ne[0];
3480
+ switch (ctx->proj_type) {
3481
+ case PROJECTOR_TYPE_LDP:
3482
+ return ctx->vision_model.mm_model_block_1_block_2_1_b->ne[0];
3483
+ case PROJECTOR_TYPE_LDPV2:
3484
+ return ctx->vision_model.mm_model_peg_0_b->ne[0];
3485
+ case PROJECTOR_TYPE_MLP:
3486
+ case PROJECTOR_TYPE_PIXTRAL:
3487
+ return ctx->vision_model.mm_2_b->ne[0];
3488
+ case PROJECTOR_TYPE_MLP_NORM:
3489
+ return ctx->vision_model.mm_3_b->ne[0];
3490
+ case PROJECTOR_TYPE_MINICPMV:
3491
+ if (ctx->minicpmv_version == 2) {
3492
+ return 4096;
3493
+ } else if (ctx->minicpmv_version == 3) {
3494
+ return 3584;
3495
+ } else if (ctx->minicpmv_version == 4) {
3496
+ return 3584;
3497
+ }
3498
+ GGML_ABORT("Unknown minicpmv version");
3499
+ case PROJECTOR_TYPE_GLM_EDGE:
3500
+ return ctx->vision_model.mm_model_mlp_3_w->ne[1];
3501
+ case PROJECTOR_TYPE_QWEN2VL:
3502
+ case PROJECTOR_TYPE_QWEN25VL:
3503
+ return ctx->vision_model.mm_1_b->ne[0];
3504
+ case PROJECTOR_TYPE_GEMMA3:
3505
+ return ctx->vision_model.mm_input_proj_w->ne[0];
3506
+ case PROJECTOR_TYPE_IDEFICS3:
3507
+ return ctx->vision_model.projection->ne[1];
3508
+ default:
3509
+ GGML_ABORT("Unknown projector type");
3152
3510
  }
3153
-
3154
- std::string proj_type = PROJECTOR_TYPE_NAMES[ctx->proj_type];
3155
- throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str()));
3156
3511
  }
3157
3512
 
3158
3513
  int clip_is_minicpmv(const struct clip_ctx * ctx) {
3159
- if (ctx->has_minicpmv_projector) {
3514
+ if (ctx->proj_type == PROJECTOR_TYPE_MINICPMV) {
3160
3515
  return ctx->minicpmv_version;
3161
3516
  }
3162
3517
  return 0;
3163
3518
  }
3164
3519
 
3165
3520
  bool clip_is_glm(const struct clip_ctx * ctx) {
3166
- return ctx->has_glm_projector;
3521
+ return ctx->proj_type == PROJECTOR_TYPE_GLM_EDGE;
3167
3522
  }
3523
+
3168
3524
  bool clip_is_qwen2vl(const struct clip_ctx * ctx) {
3169
- return ctx->has_qwen2vl_merger;
3525
+ return ctx->proj_type == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type == PROJECTOR_TYPE_QWEN25VL;
3170
3526
  }
3171
3527
 
3172
- // Determine the number of encoder layers to iterate over
3173
- int get_deepest_feature_layer(const struct clip_ctx * ctx) {
3174
- // Get the index of the second to last layer; this is the
3175
- // default for models that have a llava projector
3176
- const auto & hparams = ctx->vision_model.hparams;
3177
- int n_layer = hparams.n_layer - 1;
3178
- int deepest_feature_layer = -1;
3179
-
3180
- // Handle other projectors; incrementing here indicates that we
3181
- // should use the last encoder layer for the vision features.
3182
- if (ctx->has_minicpmv_projector || ctx->has_glm_projector || ctx->has_qwen2vl_merger) {
3183
- n_layer += 1;
3184
- }
3528
+ bool clip_is_llava(const struct clip_ctx * ctx) {
3529
+ return ctx->has_llava_projector;
3530
+ }
3185
3531
 
3186
- // If we set explicit vision feature layers, only go up to the deepest one
3187
- for (const auto & feature_layer : hparams.vision_feature_layer) {
3188
- if (feature_layer > deepest_feature_layer) {
3189
- deepest_feature_layer = feature_layer;
3190
- }
3191
- }
3192
- return deepest_feature_layer < 0 ? n_layer : deepest_feature_layer;
3532
+ bool clip_is_gemma3(const struct clip_ctx * ctx) {
3533
+ return ctx->proj_type == PROJECTOR_TYPE_GEMMA3;
3193
3534
  }
3194
3535
 
3195
3536
  bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec) {
@@ -3204,3 +3545,11 @@ bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img,
3204
3545
  clip_image_encode(ctx, n_threads, &clip_img, vec);
3205
3546
  return true;
3206
3547
  }
3548
+
3549
+ //
3550
+ // API used internally with mtmd
3551
+ //
3552
+
3553
+ projector_type clip_get_projector_type(const struct clip_ctx * ctx) {
3554
+ return ctx->proj_type;
3555
+ }
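A sketch of how code on the mtmd side might branch on this accessor; the helper below is a hypothetical call site, not part of this diff — only clip_get_projector_type() and the projector enum values are real.

    // Hypothetical mtmd-side helper.
    static bool wants_image_slicing(const clip_ctx * ctx) {
        switch (clip_get_projector_type(ctx)) {
            case PROJECTOR_TYPE_MINICPMV:
                return true;   // minicpmv feeds several slices per input image
            default:
                return false;  // other projectors feed a single preprocessed image
        }
    }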