@fugood/llama.node 0.3.17 → 0.4.0

This diff shows the changes between publicly released versions of the package as they appear in their respective public registries, and is provided for informational purposes only.
Files changed (193)
  1. package/CMakeLists.txt +3 -1
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  7. package/bin/linux-cuda/x64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  9. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  10. package/bin/win32/arm64/llama-node.node +0 -0
  11. package/bin/win32/arm64/node.lib +0 -0
  12. package/bin/win32/x64/llama-node.node +0 -0
  13. package/bin/win32/x64/node.lib +0 -0
  14. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/arm64/node.lib +0 -0
  16. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  17. package/bin/win32-vulkan/x64/node.lib +0 -0
  18. package/lib/binding.ts +39 -2
  19. package/lib/index.js +132 -1
  20. package/lib/index.ts +203 -3
  21. package/package.json +2 -1
  22. package/src/EmbeddingWorker.cpp +1 -1
  23. package/src/LlamaCompletionWorker.cpp +366 -19
  24. package/src/LlamaCompletionWorker.h +30 -10
  25. package/src/LlamaContext.cpp +213 -5
  26. package/src/LlamaContext.h +12 -0
  27. package/src/common.hpp +15 -0
  28. package/src/llama.cpp/.github/workflows/build-linux-cross.yml +133 -24
  29. package/src/llama.cpp/.github/workflows/build.yml +41 -762
  30. package/src/llama.cpp/.github/workflows/docker.yml +5 -2
  31. package/src/llama.cpp/.github/workflows/release.yml +716 -0
  32. package/src/llama.cpp/.github/workflows/server.yml +12 -12
  33. package/src/llama.cpp/CMakeLists.txt +5 -17
  34. package/src/llama.cpp/cmake/build-info.cmake +8 -2
  35. package/src/llama.cpp/cmake/x64-windows-llvm.cmake +0 -6
  36. package/src/llama.cpp/common/CMakeLists.txt +31 -3
  37. package/src/llama.cpp/common/arg.cpp +48 -29
  38. package/src/llama.cpp/common/chat.cpp +128 -106
  39. package/src/llama.cpp/common/chat.h +2 -0
  40. package/src/llama.cpp/common/common.cpp +37 -1
  41. package/src/llama.cpp/common/common.h +18 -9
  42. package/src/llama.cpp/common/llguidance.cpp +1 -0
  43. package/src/llama.cpp/common/minja/chat-template.hpp +9 -5
  44. package/src/llama.cpp/common/minja/minja.hpp +69 -36
  45. package/src/llama.cpp/common/regex-partial.cpp +204 -0
  46. package/src/llama.cpp/common/regex-partial.h +56 -0
  47. package/src/llama.cpp/common/sampling.cpp +57 -50
  48. package/src/llama.cpp/examples/CMakeLists.txt +2 -23
  49. package/src/llama.cpp/examples/embedding/embedding.cpp +2 -11
  50. package/src/llama.cpp/examples/parallel/parallel.cpp +86 -14
  51. package/src/llama.cpp/examples/training/CMakeLists.txt +5 -0
  52. package/src/llama.cpp/examples/training/finetune.cpp +96 -0
  53. package/src/llama.cpp/ggml/CMakeLists.txt +27 -0
  54. package/src/llama.cpp/ggml/include/ggml-backend.h +4 -4
  55. package/src/llama.cpp/ggml/include/ggml-cpp.h +1 -1
  56. package/src/llama.cpp/ggml/include/ggml-opt.h +47 -28
  57. package/src/llama.cpp/ggml/include/ggml.h +10 -7
  58. package/src/llama.cpp/ggml/src/CMakeLists.txt +1 -1
  59. package/src/llama.cpp/ggml/src/ggml-alloc.c +4 -1
  60. package/src/llama.cpp/ggml/src/ggml-backend.cpp +9 -5
  61. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +20 -13
  62. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +0 -2
  63. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +306 -6
  64. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +4 -13
  65. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +29 -16
  66. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +88 -5
  67. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +47 -12
  68. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +264 -69
  69. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +501 -0
  70. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +0 -13
  71. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +0 -6
  72. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +23 -4
  73. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +36 -11
  74. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +0 -2
  75. package/src/llama.cpp/ggml/src/ggml-opt.cpp +368 -190
  76. package/src/llama.cpp/ggml/src/ggml-quants.c +0 -6
  77. package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +41 -27
  78. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +29 -23
  79. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +9 -8
  80. package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +121 -232
  81. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +7 -15
  82. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +72 -25
  83. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +14 -7
  84. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +59 -21
  85. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +7 -1
  86. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +0 -23
  87. package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +37 -8
  88. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +338 -166
  89. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +185 -89
  90. package/src/llama.cpp/ggml/src/ggml-sycl/quants.hpp +83 -0
  91. package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +128 -53
  92. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +81 -70
  93. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +657 -193
  94. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +20 -0
  95. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +123 -29
  96. package/src/llama.cpp/ggml/src/ggml.c +29 -20
  97. package/src/llama.cpp/ggml/src/gguf.cpp +33 -33
  98. package/src/llama.cpp/include/llama.h +52 -11
  99. package/src/llama.cpp/requirements/requirements-all.txt +3 -3
  100. package/src/llama.cpp/scripts/xxd.cmake +1 -1
  101. package/src/llama.cpp/src/CMakeLists.txt +1 -0
  102. package/src/llama.cpp/src/llama-adapter.cpp +6 -0
  103. package/src/llama.cpp/src/llama-arch.cpp +3 -0
  104. package/src/llama.cpp/src/llama-batch.cpp +5 -1
  105. package/src/llama.cpp/src/llama-batch.h +2 -1
  106. package/src/llama.cpp/src/llama-chat.cpp +17 -7
  107. package/src/llama.cpp/src/llama-chat.h +1 -0
  108. package/src/llama.cpp/src/llama-context.cpp +389 -501
  109. package/src/llama.cpp/src/llama-context.h +44 -32
  110. package/src/llama.cpp/src/llama-cparams.h +1 -0
  111. package/src/llama.cpp/src/llama-graph.cpp +20 -38
  112. package/src/llama.cpp/src/llama-graph.h +12 -8
  113. package/src/llama.cpp/src/llama-kv-cache.cpp +1503 -389
  114. package/src/llama.cpp/src/llama-kv-cache.h +271 -85
  115. package/src/llama.cpp/src/llama-memory.h +11 -1
  116. package/src/llama.cpp/src/llama-model-loader.cpp +24 -15
  117. package/src/llama.cpp/src/llama-model-saver.cpp +281 -0
  118. package/src/llama.cpp/src/llama-model-saver.h +37 -0
  119. package/src/llama.cpp/src/llama-model.cpp +316 -69
  120. package/src/llama.cpp/src/llama-model.h +8 -1
  121. package/src/llama.cpp/src/llama-quant.cpp +15 -13
  122. package/src/llama.cpp/src/llama-sampling.cpp +18 -6
  123. package/src/llama.cpp/src/llama-vocab.cpp +42 -4
  124. package/src/llama.cpp/src/llama-vocab.h +6 -0
  125. package/src/llama.cpp/src/llama.cpp +14 -0
  126. package/src/llama.cpp/tests/CMakeLists.txt +10 -2
  127. package/src/llama.cpp/tests/test-backend-ops.cpp +107 -47
  128. package/src/llama.cpp/tests/test-chat-template.cpp +10 -11
  129. package/src/llama.cpp/tests/test-chat.cpp +3 -1
  130. package/src/llama.cpp/tests/test-mtmd-c-api.c +63 -0
  131. package/src/llama.cpp/tests/test-opt.cpp +33 -21
  132. package/src/llama.cpp/tests/test-regex-partial.cpp +288 -0
  133. package/src/llama.cpp/tests/test-sampling.cpp +1 -1
  134. package/src/llama.cpp/tools/CMakeLists.txt +39 -0
  135. package/src/llama.cpp/{examples → tools}/batched-bench/batched-bench.cpp +2 -2
  136. package/src/llama.cpp/{examples → tools}/imatrix/imatrix.cpp +11 -9
  137. package/src/llama.cpp/{examples → tools}/llama-bench/llama-bench.cpp +495 -348
  138. package/src/llama.cpp/{examples → tools}/main/main.cpp +6 -9
  139. package/src/llama.cpp/{examples/llava → tools/mtmd}/CMakeLists.txt +1 -35
  140. package/src/llama.cpp/{examples/llava → tools/mtmd}/clip-impl.h +25 -5
  141. package/src/llama.cpp/{examples/llava → tools/mtmd}/clip.cpp +1440 -1349
  142. package/src/llama.cpp/tools/mtmd/clip.h +99 -0
  143. package/src/llama.cpp/{examples/llava → tools/mtmd}/mtmd-cli.cpp +70 -44
  144. package/src/llama.cpp/tools/mtmd/mtmd-helper.cpp +310 -0
  145. package/src/llama.cpp/{examples/llava → tools/mtmd}/mtmd.cpp +251 -281
  146. package/src/llama.cpp/tools/mtmd/mtmd.h +331 -0
  147. package/src/llama.cpp/{examples → tools}/perplexity/perplexity.cpp +4 -2
  148. package/src/llama.cpp/{examples → tools}/quantize/quantize.cpp +13 -76
  149. package/src/llama.cpp/{examples → tools}/rpc/rpc-server.cpp +70 -74
  150. package/src/llama.cpp/{examples → tools}/run/run.cpp +18 -4
  151. package/src/llama.cpp/{examples → tools}/server/CMakeLists.txt +2 -1
  152. package/src/llama.cpp/{examples → tools}/server/server.cpp +291 -76
  153. package/src/llama.cpp/{examples → tools}/server/utils.hpp +377 -5
  154. package/src/llama.cpp/cmake/arm64-windows-msvc.cmake +0 -6
  155. package/src/llama.cpp/examples/infill/CMakeLists.txt +0 -5
  156. package/src/llama.cpp/examples/infill/infill.cpp +0 -590
  157. package/src/llama.cpp/examples/llava/android/build_64.sh +0 -8
  158. package/src/llama.cpp/examples/llava/clip-quantize-cli.cpp +0 -59
  159. package/src/llama.cpp/examples/llava/clip.h +0 -135
  160. package/src/llama.cpp/examples/llava/llava.cpp +0 -586
  161. package/src/llama.cpp/examples/llava/llava.h +0 -49
  162. package/src/llama.cpp/examples/llava/mtmd.h +0 -168
  163. package/src/llama.cpp/examples/llava/qwen2vl-test.cpp +0 -636
  164. /package/src/llama.cpp/{examples → tools}/batched-bench/CMakeLists.txt +0 -0
  165. /package/src/llama.cpp/{examples → tools}/cvector-generator/CMakeLists.txt +0 -0
  166. /package/src/llama.cpp/{examples → tools}/cvector-generator/completions.txt +0 -0
  167. /package/src/llama.cpp/{examples → tools}/cvector-generator/cvector-generator.cpp +0 -0
  168. /package/src/llama.cpp/{examples → tools}/cvector-generator/mean.hpp +0 -0
  169. /package/src/llama.cpp/{examples → tools}/cvector-generator/negative.txt +0 -0
  170. /package/src/llama.cpp/{examples → tools}/cvector-generator/pca.hpp +0 -0
  171. /package/src/llama.cpp/{examples → tools}/cvector-generator/positive.txt +0 -0
  172. /package/src/llama.cpp/{examples → tools}/export-lora/CMakeLists.txt +0 -0
  173. /package/src/llama.cpp/{examples → tools}/export-lora/export-lora.cpp +0 -0
  174. /package/src/llama.cpp/{examples → tools}/gguf-split/CMakeLists.txt +0 -0
  175. /package/src/llama.cpp/{examples → tools}/gguf-split/gguf-split.cpp +0 -0
  176. /package/src/llama.cpp/{examples → tools}/imatrix/CMakeLists.txt +0 -0
  177. /package/src/llama.cpp/{examples → tools}/llama-bench/CMakeLists.txt +0 -0
  178. /package/src/llama.cpp/{examples → tools}/main/CMakeLists.txt +0 -0
  179. /package/src/llama.cpp/{examples/llava → tools/mtmd}/deprecation-warning.cpp +0 -0
  180. /package/src/llama.cpp/{examples/llava → tools/mtmd}/requirements.txt +0 -0
  181. /package/src/llama.cpp/{examples → tools}/perplexity/CMakeLists.txt +0 -0
  182. /package/src/llama.cpp/{examples → tools}/quantize/CMakeLists.txt +0 -0
  183. /package/src/llama.cpp/{examples → tools}/rpc/CMakeLists.txt +0 -0
  184. /package/src/llama.cpp/{examples → tools}/run/CMakeLists.txt +0 -0
  185. /package/src/llama.cpp/{examples → tools}/run/linenoise.cpp/linenoise.cpp +0 -0
  186. /package/src/llama.cpp/{examples → tools}/run/linenoise.cpp/linenoise.h +0 -0
  187. /package/src/llama.cpp/{examples → tools}/server/bench/requirements.txt +0 -0
  188. /package/src/llama.cpp/{examples → tools}/server/httplib.h +0 -0
  189. /package/src/llama.cpp/{examples → tools}/server/tests/requirements.txt +0 -0
  190. /package/src/llama.cpp/{examples → tools}/tokenize/CMakeLists.txt +0 -0
  191. /package/src/llama.cpp/{examples → tools}/tokenize/tokenize.cpp +0 -0
  192. /package/src/llama.cpp/{examples → tools}/tts/CMakeLists.txt +0 -0
  193. /package/src/llama.cpp/{examples → tools}/tts/tts.cpp +0 -0
package/src/llama.cpp/{examples/llava → tools/mtmd}/clip.cpp
@@ -29,9 +29,21 @@
  #include <limits>
  #include <array>
  #include <numeric>
+ #include <functional>

  struct clip_logger_state g_logger_state = {GGML_LOG_LEVEL_CONT, clip_log_callback_default, NULL};

+ enum ffn_op_type {
+ FFN_GELU,
+ FFN_SILU,
+ FFN_GELU_QUICK,
+ };
+
+ enum norm_type {
+ NORM_TYPE_NORMAL,
+ NORM_TYPE_RMS,
+ };
+
  //#define CLIP_DEBUG_FUNCTIONS

  #ifdef CLIP_DEBUG_FUNCTIONS
@@ -155,13 +167,19 @@ enum patch_merge_type {
  struct clip_hparams {
  int32_t image_size;
  int32_t patch_size;
- int32_t hidden_size;
- int32_t n_intermediate;
+ int32_t n_embd;
+ int32_t n_ff;
  int32_t projection_dim;
  int32_t n_head;
  int32_t n_layer;
  int32_t proj_scale_factor = 0; // idefics3

+ // for models using dynamic image size, we need to have a smaller image size to warmup
+ // otherwise, user will get OOM everytime they load the model
+ int32_t warmup_image_size = 0;
+
+ ffn_op_type ffn_op = FFN_GELU;
+
  patch_merge_type mm_patch_merge_type = PATCH_MERGE_FLAT;

  float eps = 1e-6;
@@ -172,145 +190,148 @@ struct clip_hparams {
  std::unordered_set<int32_t> vision_feature_layer;
  int32_t attn_window_size = 0;
  int32_t n_wa_pattern = 0;
+ int32_t spatial_merge_size = 0;
  };

  struct clip_layer {
  // attention
- struct ggml_tensor * k_w = nullptr;
- struct ggml_tensor * k_b = nullptr;
- struct ggml_tensor * q_w = nullptr;
- struct ggml_tensor * q_b = nullptr;
- struct ggml_tensor * v_w = nullptr;
- struct ggml_tensor * v_b = nullptr;
+ ggml_tensor * k_w = nullptr;
+ ggml_tensor * k_b = nullptr;
+ ggml_tensor * q_w = nullptr;
+ ggml_tensor * q_b = nullptr;
+ ggml_tensor * v_w = nullptr;
+ ggml_tensor * v_b = nullptr;

- struct ggml_tensor * o_w = nullptr;
- struct ggml_tensor * o_b = nullptr;
+ ggml_tensor * o_w = nullptr;
+ ggml_tensor * o_b = nullptr;

- // layernorm 1
- struct ggml_tensor * ln_1_w = nullptr;
- struct ggml_tensor * ln_1_b = nullptr;
-
- // ff
- struct ggml_tensor * ff_i_w = nullptr; // legacy naming
- struct ggml_tensor * ff_i_b = nullptr; // legacy naming
- struct ggml_tensor * ff_o_w = nullptr; // legacy naming
- struct ggml_tensor * ff_o_b = nullptr; // legacy naming
+ ggml_tensor * k_norm = nullptr;
+ ggml_tensor * q_norm = nullptr;

- struct ggml_tensor * ff_up_w = nullptr;
- struct ggml_tensor * ff_up_b = nullptr;
- struct ggml_tensor * ff_gate_w = nullptr;
- struct ggml_tensor * ff_gate_b = nullptr;
- struct ggml_tensor * ff_down_w = nullptr;
- struct ggml_tensor * ff_down_b = nullptr;
+ // layernorm 1
+ ggml_tensor * ln_1_w = nullptr;
+ ggml_tensor * ln_1_b = nullptr;

- struct ggml_tensor * ff_g_w = NULL;
- struct ggml_tensor * ff_g_b = NULL;
+ ggml_tensor * ff_up_w = nullptr;
+ ggml_tensor * ff_up_b = nullptr;
+ ggml_tensor * ff_gate_w = nullptr;
+ ggml_tensor * ff_gate_b = nullptr;
+ ggml_tensor * ff_down_w = nullptr;
+ ggml_tensor * ff_down_b = nullptr;

  // layernorm 2
- struct ggml_tensor * ln_2_w = nullptr;
- struct ggml_tensor * ln_2_b = nullptr;
+ ggml_tensor * ln_2_w = nullptr;
+ ggml_tensor * ln_2_b = nullptr;
+
+ // layer scale (no bias)
+ ggml_tensor * ls_1_w = nullptr;
+ ggml_tensor * ls_2_w = nullptr;
  };

  struct clip_vision_model {
  struct clip_hparams hparams;

  // embeddings
- struct ggml_tensor * class_embedding = nullptr;
- struct ggml_tensor * patch_embeddings_0 = nullptr;
- struct ggml_tensor * patch_embeddings_1 = nullptr; // second Conv2D kernel when we decouple Conv3D along temproal dimension (Qwen2VL)
- struct ggml_tensor * patch_bias = nullptr;
- struct ggml_tensor * position_embeddings = nullptr;
+ ggml_tensor * class_embedding = nullptr;
+ ggml_tensor * patch_embeddings_0 = nullptr;
+ ggml_tensor * patch_embeddings_1 = nullptr; // second Conv2D kernel when we decouple Conv3D along temproal dimension (Qwen2VL)
+ ggml_tensor * patch_bias = nullptr;
+ ggml_tensor * position_embeddings = nullptr;

- struct ggml_tensor * pre_ln_w = nullptr;
- struct ggml_tensor * pre_ln_b = nullptr;
+ ggml_tensor * pre_ln_w = nullptr;
+ ggml_tensor * pre_ln_b = nullptr;

  std::vector<clip_layer> layers;

- struct ggml_tensor * post_ln_w;
- struct ggml_tensor * post_ln_b;
+ ggml_tensor * post_ln_w;
+ ggml_tensor * post_ln_b;

- struct ggml_tensor * projection;
+ ggml_tensor * projection;

  // LLaVA projection
- struct ggml_tensor * mm_0_w = nullptr;
- struct ggml_tensor * mm_0_b = nullptr;
- struct ggml_tensor * mm_2_w = nullptr;
- struct ggml_tensor * mm_2_b = nullptr;
+ ggml_tensor * mm_input_norm_w = nullptr;
+ ggml_tensor * mm_0_w = nullptr;
+ ggml_tensor * mm_0_b = nullptr;
+ ggml_tensor * mm_2_w = nullptr;
+ ggml_tensor * mm_2_b = nullptr;

- struct ggml_tensor * image_newline = nullptr;
+ ggml_tensor * image_newline = nullptr;

  // Yi type models with mlp+normalization projection
- struct ggml_tensor * mm_1_w = nullptr; // Yi type models have 0, 1, 3, 4
- struct ggml_tensor * mm_1_b = nullptr;
- struct ggml_tensor * mm_3_w = nullptr;
- struct ggml_tensor * mm_3_b = nullptr;
- struct ggml_tensor * mm_4_w = nullptr;
- struct ggml_tensor * mm_4_b = nullptr;
-
- //GLMV-Edge projection
- struct ggml_tensor * mm_model_adapter_conv_w = nullptr;
- struct ggml_tensor * mm_model_adapter_conv_b = nullptr;
+ ggml_tensor * mm_1_w = nullptr; // Yi type models have 0, 1, 3, 4
+ ggml_tensor * mm_1_b = nullptr;
+ ggml_tensor * mm_3_w = nullptr;
+ ggml_tensor * mm_3_b = nullptr;
+ ggml_tensor * mm_4_w = nullptr;
+ ggml_tensor * mm_4_b = nullptr;
+
+ // GLMV-Edge projection
+ ggml_tensor * mm_model_adapter_conv_w = nullptr;
+ ggml_tensor * mm_model_adapter_conv_b = nullptr;
+ ggml_tensor * mm_glm_tok_boi = nullptr;
+ ggml_tensor * mm_glm_tok_eoi = nullptr;

  // MobileVLM projection
- struct ggml_tensor * mm_model_mlp_1_w = nullptr;
- struct ggml_tensor * mm_model_mlp_1_b = nullptr;
- struct ggml_tensor * mm_model_mlp_3_w = nullptr;
- struct ggml_tensor * mm_model_mlp_3_b = nullptr;
- struct ggml_tensor * mm_model_block_1_block_0_0_w = nullptr;
- struct ggml_tensor * mm_model_block_1_block_0_1_w = nullptr;
- struct ggml_tensor * mm_model_block_1_block_0_1_b = nullptr;
- struct ggml_tensor * mm_model_block_1_block_1_fc1_w = nullptr;
- struct ggml_tensor * mm_model_block_1_block_1_fc1_b = nullptr;
- struct ggml_tensor * mm_model_block_1_block_1_fc2_w = nullptr;
- struct ggml_tensor * mm_model_block_1_block_1_fc2_b = nullptr;
- struct ggml_tensor * mm_model_block_1_block_2_0_w = nullptr;
- struct ggml_tensor * mm_model_block_1_block_2_1_w = nullptr;
- struct ggml_tensor * mm_model_block_1_block_2_1_b = nullptr;
- struct ggml_tensor * mm_model_block_2_block_0_0_w = nullptr;
- struct ggml_tensor * mm_model_block_2_block_0_1_w = nullptr;
- struct ggml_tensor * mm_model_block_2_block_0_1_b = nullptr;
- struct ggml_tensor * mm_model_block_2_block_1_fc1_w = nullptr;
- struct ggml_tensor * mm_model_block_2_block_1_fc1_b = nullptr;
- struct ggml_tensor * mm_model_block_2_block_1_fc2_w = nullptr;
- struct ggml_tensor * mm_model_block_2_block_1_fc2_b = nullptr;
- struct ggml_tensor * mm_model_block_2_block_2_0_w = nullptr;
- struct ggml_tensor * mm_model_block_2_block_2_1_w = nullptr;
- struct ggml_tensor * mm_model_block_2_block_2_1_b = nullptr;
+ ggml_tensor * mm_model_mlp_1_w = nullptr;
+ ggml_tensor * mm_model_mlp_1_b = nullptr;
+ ggml_tensor * mm_model_mlp_3_w = nullptr;
+ ggml_tensor * mm_model_mlp_3_b = nullptr;
+ ggml_tensor * mm_model_block_1_block_0_0_w = nullptr;
+ ggml_tensor * mm_model_block_1_block_0_1_w = nullptr;
+ ggml_tensor * mm_model_block_1_block_0_1_b = nullptr;
+ ggml_tensor * mm_model_block_1_block_1_fc1_w = nullptr;
+ ggml_tensor * mm_model_block_1_block_1_fc1_b = nullptr;
+ ggml_tensor * mm_model_block_1_block_1_fc2_w = nullptr;
+ ggml_tensor * mm_model_block_1_block_1_fc2_b = nullptr;
+ ggml_tensor * mm_model_block_1_block_2_0_w = nullptr;
+ ggml_tensor * mm_model_block_1_block_2_1_w = nullptr;
+ ggml_tensor * mm_model_block_1_block_2_1_b = nullptr;
+ ggml_tensor * mm_model_block_2_block_0_0_w = nullptr;
+ ggml_tensor * mm_model_block_2_block_0_1_w = nullptr;
+ ggml_tensor * mm_model_block_2_block_0_1_b = nullptr;
+ ggml_tensor * mm_model_block_2_block_1_fc1_w = nullptr;
+ ggml_tensor * mm_model_block_2_block_1_fc1_b = nullptr;
+ ggml_tensor * mm_model_block_2_block_1_fc2_w = nullptr;
+ ggml_tensor * mm_model_block_2_block_1_fc2_b = nullptr;
+ ggml_tensor * mm_model_block_2_block_2_0_w = nullptr;
+ ggml_tensor * mm_model_block_2_block_2_1_w = nullptr;
+ ggml_tensor * mm_model_block_2_block_2_1_b = nullptr;

  // MobileVLM_V2 projection
- struct ggml_tensor * mm_model_mlp_0_w = nullptr;
- struct ggml_tensor * mm_model_mlp_0_b = nullptr;
- struct ggml_tensor * mm_model_mlp_2_w = nullptr;
- struct ggml_tensor * mm_model_mlp_2_b = nullptr;
- struct ggml_tensor * mm_model_peg_0_w = nullptr;
- struct ggml_tensor * mm_model_peg_0_b = nullptr;
+ ggml_tensor * mm_model_mlp_0_w = nullptr;
+ ggml_tensor * mm_model_mlp_0_b = nullptr;
+ ggml_tensor * mm_model_mlp_2_w = nullptr;
+ ggml_tensor * mm_model_mlp_2_b = nullptr;
+ ggml_tensor * mm_model_peg_0_w = nullptr;
+ ggml_tensor * mm_model_peg_0_b = nullptr;

  // MINICPMV projection
- struct ggml_tensor * mm_model_pos_embed_k = nullptr;
- struct ggml_tensor * mm_model_query = nullptr;
- struct ggml_tensor * mm_model_proj = nullptr;
- struct ggml_tensor * mm_model_kv_proj = nullptr;
- struct ggml_tensor * mm_model_attn_q_w = nullptr;
- struct ggml_tensor * mm_model_attn_q_b = nullptr;
- struct ggml_tensor * mm_model_attn_k_w = nullptr;
- struct ggml_tensor * mm_model_attn_k_b = nullptr;
- struct ggml_tensor * mm_model_attn_v_w = nullptr;
- struct ggml_tensor * mm_model_attn_v_b = nullptr;
- struct ggml_tensor * mm_model_attn_o_w = nullptr;
- struct ggml_tensor * mm_model_attn_o_b = nullptr;
- struct ggml_tensor * mm_model_ln_q_w = nullptr;
- struct ggml_tensor * mm_model_ln_q_b = nullptr;
- struct ggml_tensor * mm_model_ln_kv_w = nullptr;
- struct ggml_tensor * mm_model_ln_kv_b = nullptr;
- struct ggml_tensor * mm_model_ln_post_w = nullptr;
- struct ggml_tensor * mm_model_ln_post_b = nullptr;
+ ggml_tensor * mm_model_pos_embed_k = nullptr;
+ ggml_tensor * mm_model_query = nullptr;
+ ggml_tensor * mm_model_proj = nullptr;
+ ggml_tensor * mm_model_kv_proj = nullptr;
+ ggml_tensor * mm_model_attn_q_w = nullptr;
+ ggml_tensor * mm_model_attn_q_b = nullptr;
+ ggml_tensor * mm_model_attn_k_w = nullptr;
+ ggml_tensor * mm_model_attn_k_b = nullptr;
+ ggml_tensor * mm_model_attn_v_w = nullptr;
+ ggml_tensor * mm_model_attn_v_b = nullptr;
+ ggml_tensor * mm_model_attn_o_w = nullptr;
+ ggml_tensor * mm_model_attn_o_b = nullptr;
+ ggml_tensor * mm_model_ln_q_w = nullptr;
+ ggml_tensor * mm_model_ln_q_b = nullptr;
+ ggml_tensor * mm_model_ln_kv_w = nullptr;
+ ggml_tensor * mm_model_ln_kv_b = nullptr;
+ ggml_tensor * mm_model_ln_post_w = nullptr;
+ ggml_tensor * mm_model_ln_post_b = nullptr;

  // gemma3
- struct ggml_tensor * mm_input_proj_w = nullptr;
- struct ggml_tensor * mm_soft_emb_norm_w = nullptr;
+ ggml_tensor * mm_input_proj_w = nullptr;
+ ggml_tensor * mm_soft_emb_norm_w = nullptr;

  // pixtral
- struct ggml_tensor * token_embd_img_break = nullptr;
+ ggml_tensor * token_embd_img_break = nullptr;
+ ggml_tensor * mm_patch_merger_w = nullptr;
  };

  struct clip_ctx {
@@ -320,11 +341,8 @@ struct clip_ctx {
  struct clip_vision_model vision_model;
  projector_type proj_type = PROJECTOR_TYPE_MLP;

- int32_t max_feature_layer; // unused in newer models like gemma3
  float image_mean[3];
  float image_std[3];
- bool use_gelu = false;
- bool use_silu = false;

  gguf_context_ptr ctx_gguf;
  ggml_context_ptr ctx_data;
@@ -345,9 +363,12 @@ struct clip_ctx {

  clip_ctx(clip_context_params & ctx_params) {
  backend_cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr);
- backend = ctx_params.use_gpu
- ? ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_GPU, nullptr)
- : nullptr;
+ if (!backend_cpu) {
+ throw std::runtime_error("failed to initialize CPU backend");
+ }
+ backend = ctx_params.use_gpu
+ ? ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_GPU, nullptr)
+ : nullptr;

  if (backend) {
  LOG_INF("%s: CLIP using %s backend\n", __func__, ggml_backend_name(backend));
@@ -362,7 +383,7 @@ struct clip_ctx {
  backend_buft.push_back(ggml_backend_get_default_buffer_type(backend_cpu));

  sched.reset(
- ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), 8192, false)
+ ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), 8192, false, true)
  );
  }

@@ -374,1194 +395,1337 @@ struct clip_ctx {
  }
  };

- static ggml_cgraph * clip_image_build_graph_siglip(clip_ctx * ctx, const clip_image_f32 & img) {
- const auto & model = ctx->vision_model;
- const auto & hparams = model.hparams;
-
- int image_size_width = img.nx;
- int image_size_height = img.ny;
-
- const int patch_size = hparams.patch_size;
- const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size));
- const int hidden_size = hparams.hidden_size;
- const int n_head = hparams.n_head;
- const int d_head = hidden_size / n_head;
- const int n_layer = hparams.n_layer;
- const float eps = hparams.eps;
-
- struct ggml_init_params params = {
- /*.mem_size =*/ ctx->buf_compute_meta.size(),
- /*.mem_buffer =*/ ctx->buf_compute_meta.data(),
- /*.no_alloc =*/ true,
- };
-
- ggml_context_ptr ctx0_ptr(ggml_init(params));
- auto ctx0 = ctx0_ptr.get();
-
- struct ggml_cgraph * gf = ggml_new_graph(ctx0);
-
- // input raw
- struct ggml_tensor * inp_raw = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, image_size_width, image_size_height, 3);
- ggml_set_name(inp_raw, "inp_raw");
- ggml_set_input(inp_raw);
-
- struct ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
- inp = ggml_reshape_2d(ctx0, inp, num_patches, hidden_size);
- inp = ggml_cont(ctx0, ggml_transpose(ctx0, inp));
- inp = ggml_add(ctx0, inp, model.patch_bias);
-
- // position embeddings
- struct ggml_tensor * embeddings = ggml_add(ctx0, inp, model.position_embeddings);
-
- // loop over layers
- for (int il = 0; il < n_layer; il++) {
- struct ggml_tensor * cur = embeddings; // embeddings = residual, cur = hidden_states
-
- // layernorm1
- {
- cur = ggml_norm(ctx0, cur, eps);
- cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].ln_1_w), model.layers[il].ln_1_b);
+ struct clip_graph {
+ clip_ctx * ctx;
+ const clip_vision_model & model;
+ const clip_hparams & hparams;
+
+ // we only support single image per batch
+ const clip_image_f32 & img;
+
+ const int patch_size;
+ const int n_patches_x;
+ const int n_patches_y;
+ const int n_patches;
+ const int n_embd;
+ const int n_head;
+ const int d_head;
+ const int n_layer;
+ const float eps;
+ const float kq_scale;
+
+ ggml_context_ptr ctx0_ptr;
+ ggml_context * ctx0;
+ ggml_cgraph * gf;
+
+ clip_graph(clip_ctx * ctx, const clip_image_f32 & img) :
+ ctx(ctx),
+ model(ctx->vision_model),
+ hparams(model.hparams),
+ img(img),
+ patch_size(hparams.patch_size),
+ n_patches_x(img.nx / patch_size),
+ n_patches_y(img.ny / patch_size),
+ n_patches(n_patches_x * n_patches_y),
+ n_embd(hparams.n_embd),
+ n_head(hparams.n_head),
+ d_head(n_embd / n_head),
+ n_layer(hparams.n_layer),
+ eps(hparams.eps),
+ kq_scale(1.0f / sqrtf((float)d_head)) {
+ struct ggml_init_params params = {
+ /*.mem_size =*/ ctx->buf_compute_meta.size(),
+ /*.mem_buffer =*/ ctx->buf_compute_meta.data(),
+ /*.no_alloc =*/ true,
+ };
+ ctx0_ptr.reset(ggml_init(params));
+ ctx0 = ctx0_ptr.get();
+ gf = ggml_new_graph(ctx0);
+ }
+
+ ggml_cgraph * build_siglip() {
+ ggml_tensor * inp = build_inp();
+ ggml_tensor * cur = build_vit(
+ inp, n_patches,
+ NORM_TYPE_NORMAL,
+ hparams.ffn_op,
+ model.position_embeddings,
+ nullptr);
+
+ if (ctx->proj_type == PROJECTOR_TYPE_GEMMA3) {
+ const int batch_size = 1;
+ GGML_ASSERT(n_patches_x == n_patches_y);
+ const int patches_per_image = n_patches_x;
+ const int kernel_size = hparams.proj_scale_factor;
+
+ cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur));
+ cur = ggml_reshape_4d(ctx0, cur, patches_per_image, patches_per_image, n_embd, batch_size);
+
+ // doing a pool2d to reduce the number of output tokens
+ cur = ggml_pool_2d(ctx0, cur, GGML_OP_POOL_AVG, kernel_size, kernel_size, kernel_size, kernel_size, 0, 0);
+ cur = ggml_reshape_3d(ctx0, cur, cur->ne[0] * cur->ne[0], n_embd, batch_size);
+ cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur));
+
+ // apply norm before projection
+ cur = ggml_rms_norm(ctx0, cur, eps);
+ cur = ggml_mul(ctx0, cur, model.mm_soft_emb_norm_w);
+
+ // apply projection
+ cur = ggml_mul_mat(ctx0,
+ ggml_cont(ctx0, ggml_transpose(ctx0, model.mm_input_proj_w)),
+ cur);
+
+ } else if (ctx->proj_type == PROJECTOR_TYPE_IDEFICS3) {
+ // https://github.com/huggingface/transformers/blob/0a950e0bbe1ed58d5401a6b547af19f15f0c195e/src/transformers/models/idefics3/modeling_idefics3.py#L578
+
+ const int scale_factor = model.hparams.proj_scale_factor;
+ const int n_embd = cur->ne[0];
+ const int seq = cur->ne[1];
+ const int bsz = 1; // batch size, always 1 for now since we don't support batching
+ const int height = std::sqrt(seq);
+ const int width = std::sqrt(seq);
+ GGML_ASSERT(scale_factor != 0);
+ cur = ggml_reshape_4d(ctx0, cur, n_embd * scale_factor, width / scale_factor, height, bsz);
+ cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
+ cur = ggml_reshape_4d(ctx0, ggml_cont(ctx0, cur),
+ n_embd * scale_factor * scale_factor,
+ height / scale_factor,
+ width / scale_factor,
+ bsz);
+ cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
+ cur = ggml_reshape_3d(ctx0, ggml_cont(ctx0, cur),
+ n_embd * scale_factor * scale_factor,
+ seq / (scale_factor * scale_factor),
+ bsz);
+
+ cur = ggml_mul_mat(ctx0, model.projection, cur);
+ } else {
+ GGML_ABORT("SigLIP: Unsupported projector type");
  }

- // self-attention
- {
-
- struct ggml_tensor * Q =
- ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].q_w, cur), model.layers[il].q_b);
+ // build the graph
+ ggml_build_forward_expand(gf, cur);

- Q = ggml_reshape_3d(ctx0, Q, d_head, n_head, num_patches);
- Q = ggml_cont(ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3));
+ return gf;
+ }

- struct ggml_tensor * K =
- ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].k_w, cur), model.layers[il].k_b);
+ ggml_cgraph * build_pixtral() {
+ const int n_merge = hparams.spatial_merge_size;

- K = ggml_reshape_3d(ctx0, K, d_head, n_head, num_patches);
- K = ggml_cont(ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3));
+ // 2D input positions
+ ggml_tensor * pos_h = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches);
+ ggml_set_name(pos_h, "pos_h");
+ ggml_set_input(pos_h);

- struct ggml_tensor * V =
- ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].v_w, cur), model.layers[il].v_b);
+ ggml_tensor * pos_w = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches);
+ ggml_set_name(pos_w, "pos_w");
+ ggml_set_input(pos_w);

- V = ggml_reshape_3d(ctx0, V, d_head, n_head, num_patches);
- V = ggml_cont(ctx0, ggml_permute(ctx0, V, 1, 2, 0, 3));
+ auto add_pos = [&](ggml_tensor * cur, const clip_layer &) {
+ return build_rope_2d(ctx0, cur, pos_h, pos_w, hparams.rope_theta);
+ };

- struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
- KQ = ggml_soft_max_ext(ctx0, KQ, nullptr, 1.0f / sqrtf((float)d_head), 0.0f);
+ ggml_tensor * inp = build_inp();
+ ggml_tensor * cur = build_vit(
+ inp, n_patches,
+ NORM_TYPE_RMS,
+ hparams.ffn_op,
+ nullptr, // no learned pos embd
+ add_pos);

- struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ);
- KQV = ggml_reshape_3d(ctx0, KQV, d_head, num_patches, n_head);
- KQV = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
+ // mistral small 3.1 patch merger
+ // ref: https://github.com/huggingface/transformers/blob/7a3e208892c06a5e278144eaf38c8599a42f53e7/src/transformers/models/mistral3/modeling_mistral3.py#L67
+ if (model.mm_patch_merger_w) {
+ GGML_ASSERT(hparams.spatial_merge_size > 0);

- cur = ggml_cont_2d(ctx0, KQV, hidden_size, num_patches);
- }
+ cur = ggml_mul(ctx0, ggml_rms_norm(ctx0, cur, eps), model.mm_input_norm_w);

- // attention output
- cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].o_w, cur), model.layers[il].o_b);
+ // reshape image tokens to 2D grid
+ cur = ggml_reshape_3d(ctx0, cur, n_embd, n_patches_x, n_patches_y);
+ cur = ggml_permute(ctx0, cur, 2, 0, 1, 3); // [x, y, n_embd]
+ cur = ggml_cont(ctx0, cur);

- // re-add the layer input, e.g., residual
- cur = ggml_add(ctx0, cur, embeddings);
+ // torch.nn.functional.unfold is just an im2col under the hood
+ // we just need a dummy kernel to make it work
+ ggml_tensor * kernel = ggml_view_3d(ctx0, cur, n_merge, n_merge, cur->ne[2], 0, 0, 0);
+ cur = ggml_im2col(ctx0, kernel, cur, n_merge, n_merge, 0, 0, 1, 1, true, inp->type);

- embeddings = cur; // embeddings = residual, cur = hidden_states
-
- // layernorm2
- {
- cur = ggml_norm(ctx0, cur, eps);
- cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].ln_2_w), model.layers[il].ln_2_b);
+ // project to n_embd
+ cur = ggml_reshape_2d(ctx0, cur, cur->ne[0], cur->ne[1] * cur->ne[2]);
+ cur = ggml_mul_mat(ctx0, model.mm_patch_merger_w, cur);
  }

- cur = ggml_mul_mat(ctx0, model.layers[il].ff_i_w, cur);
- cur = ggml_add(ctx0, cur, model.layers[il].ff_i_b);
-
- // siglip uses gelu
- cur = ggml_gelu(ctx0, cur);
-
- cur = ggml_mul_mat(ctx0, model.layers[il].ff_o_w, cur);
- cur = ggml_add(ctx0, cur, model.layers[il].ff_o_b);
-
- // residual 2
- cur = ggml_add(ctx0, embeddings, cur);
+ // LlavaMultiModalProjector (always using GELU activation)
+ {
+ cur = ggml_mul_mat(ctx0, model.mm_1_w, cur);
+ if (model.mm_1_b) {
+ cur = ggml_add(ctx0, cur, model.mm_1_b);
+ }

- embeddings = cur;
- }
+ cur = ggml_gelu(ctx0, cur);
+ cur = ggml_mul_mat(ctx0, model.mm_2_w, cur);
+ if (model.mm_2_b) {
+ cur = ggml_add(ctx0, cur, model.mm_2_b);
+ }
+ }

- // post-layernorm
- if (model.post_ln_w) {
- embeddings = ggml_norm(ctx0, embeddings, eps);
- ggml_set_name(embeddings, "post_ln");
+ // arrangement of the [IMG_BREAK] token
+ {
+ // not efficient, but works
+ // the trick is to view the embeddings as a 3D tensor with shape [n_embd, n_patches_per_row, n_rows]
+ // and then concatenate the [IMG_BREAK] token to the end of each row, aka n_patches_per_row dimension
+ // after the concatenation, we have a tensor with shape [n_embd, n_patches_per_row + 1, n_rows]

- embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.post_ln_w), model.post_ln_b);
- }
+ const int p_y = n_merge > 0 ? n_patches_y / n_merge : n_patches_y;
+ const int p_x = n_merge > 0 ? n_patches_x / n_merge : n_patches_x;
+ const int p_total = p_x * p_y;
+ const int n_embd_text = cur->ne[0];
+ const int n_tokens_output = p_total + p_y - 1; // one [IMG_BREAK] per row, except the last row

- if (ctx->proj_type == PROJECTOR_TYPE_GEMMA3) {
- const int batch_size = 1;
- const int mm_tokens_per_image = 256; // default value for gemma3
- const int tokens_per_side = sqrt(mm_tokens_per_image);
- const int patches_per_image = sqrt(num_patches);
- const int kernel_size = patches_per_image / tokens_per_side;
-
- embeddings = ggml_cont(ctx0, ggml_transpose(ctx0, embeddings));
- embeddings = ggml_reshape_4d(ctx0, embeddings, patches_per_image, patches_per_image, hidden_size, batch_size);
-
- // doing a pool2d to reduce the number of output tokens to 256
- embeddings = ggml_pool_2d(ctx0, embeddings, GGML_OP_POOL_AVG, kernel_size, kernel_size, kernel_size, kernel_size, 0, 0);
- embeddings = ggml_reshape_3d(ctx0, embeddings, embeddings->ne[0] * embeddings->ne[0], hidden_size, batch_size);
- embeddings = ggml_cont(ctx0, ggml_transpose(ctx0, embeddings));
-
- // apply norm before projection
- embeddings = ggml_rms_norm(ctx0, embeddings, eps);
- embeddings = ggml_mul(ctx0, embeddings, model.mm_soft_emb_norm_w);
-
- // apply projection
- embeddings = ggml_mul_mat(ctx0,
- ggml_cont(ctx0, ggml_transpose(ctx0, model.mm_input_proj_w)),
- embeddings);
-
- } else if (ctx->proj_type == PROJECTOR_TYPE_IDEFICS3) {
- // https://github.com/huggingface/transformers/blob/0a950e0bbe1ed58d5401a6b547af19f15f0c195e/src/transformers/models/idefics3/modeling_idefics3.py#L578
-
- ggml_tensor * cur = embeddings;
- const int scale_factor = model.hparams.proj_scale_factor;
- const int n_embd = cur->ne[0];
- const int seq = cur->ne[1];
- const int bsz = 1; // batch size, always 1 for now since we don't support batching
- const int height = std::sqrt(seq);
- const int width = std::sqrt(seq);
- GGML_ASSERT(scale_factor != 0);
- cur = ggml_reshape_4d(ctx0, cur, n_embd * scale_factor, width / scale_factor, height, bsz);
- cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
- cur = ggml_reshape_4d(ctx0, ggml_cont(ctx0, cur),
- n_embd * scale_factor * scale_factor,
- height / scale_factor,
- width / scale_factor,
- bsz);
- cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
- cur = ggml_reshape_3d(ctx0, ggml_cont(ctx0, cur),
- n_embd * scale_factor * scale_factor,
- seq / (scale_factor * scale_factor),
- bsz);
-
- cur = ggml_mul_mat(ctx0, model.projection, cur);
- embeddings = cur;
- } else {
- GGML_ABORT("SigLIP: Unsupported projector type");
- }
-
- // build the graph
- ggml_build_forward_expand(gf, embeddings);
-
- return gf;
- }
+ ggml_tensor * tmp = ggml_reshape_3d(ctx0, cur, n_embd_text, p_x, p_y);
+ ggml_tensor * tok = ggml_new_tensor_3d(ctx0, tmp->type, n_embd_text, 1, p_y);
+ tok = ggml_scale(ctx0, tok, 0.0); // clear the tensor
+ tok = ggml_add(ctx0, tok, model.token_embd_img_break);
+ tmp = ggml_concat(ctx0, tmp, tok, 1);
+ cur = ggml_view_2d(ctx0, tmp,
+ n_embd_text, n_tokens_output,
+ ggml_row_size(tmp->type, n_embd_text), 0);
+ }

- // implementation of the 2D RoPE without adding a new op in ggml
- // this is not efficient (use double the memory), but works on all backends
- // TODO: there was a more efficient which relies on ggml_view and ggml_rope_ext_inplace, but the rope inplace does not work well with non-contiguous tensors ; we should fix that and revert back to the original implementation in https://github.com/ggml-org/llama.cpp/pull/13065
- static ggml_tensor * build_rope_2d(
- ggml_context * ctx0,
- ggml_tensor * cur,
- ggml_tensor * pos_h,
- ggml_tensor * pos_w,
- const float freq_base
- ) {
- const int64_t n_dim = cur->ne[0];
- const int64_t n_head = cur->ne[1];
- const int64_t n_pos = cur->ne[2];
-
- // for example, if we have cur tensor of shape (n_dim=8, n_head, n_pos)
- // we will have a list of 4 inv_freq: 1e-0, 1e-1, 1e-2, 1e-3
- // first half of cur will use 1e-0, 1e-2 (even)
- // second half of cur will use 1e-1, 1e-3 (odd)
- // the trick here is to rotate just half of n_dim, so inv_freq will automatically be even
- // ^ don't ask me why, it's math! -2(2i) / n_dim == -2i / (n_dim/2)
- // then for the second half, we use freq_scale to shift the inv_freq
- // ^ why? replace (2i) with (2i+1) in the above equation
- const float freq_scale_odd = std::pow(freq_base, (float)-2/n_dim);
-
- // first half
- ggml_tensor * first;
- {
- first = ggml_view_3d(ctx0, cur,
- n_dim/2, n_head, n_pos,
- ggml_row_size(cur->type, n_dim),
- ggml_row_size(cur->type, n_dim*n_head),
- 0);
- first = ggml_rope_ext(
- ctx0,
- first,
- pos_h, // positions
- nullptr, // freq factors
- n_dim/2, // n_dims
- 0, 0, freq_base,
- 1.0f, 0.0f, 1.0f, 0.0f, 0.0f
- );
- }
+ // build the graph
+ ggml_build_forward_expand(gf, cur);

- // second half
- ggml_tensor * second;
- {
- second = ggml_view_3d(ctx0, cur,
- n_dim/2, n_head, n_pos,
- ggml_row_size(cur->type, n_dim),
- ggml_row_size(cur->type, n_dim*n_head),
- n_dim/2 * ggml_element_size(cur));
- second = ggml_cont(ctx0, second); // copy, because ggml_rope don't play well with non-contiguous tensors
- second = ggml_rope_ext(
- ctx0,
- second,
- pos_w, // positions
- nullptr, // freq factors
- n_dim/2, // n_dims
- 0, 0, freq_base,
- freq_scale_odd,
- 0.0f, 1.0f, 0.0f, 0.0f
- );
+ return gf;
  }

- cur = ggml_concat(ctx0, first, second, 0);
- return cur;
- }
+ // Qwen2VL and Qwen2.5VL use M-RoPE
+ ggml_cgraph * build_qwen2vl() {
+ GGML_ASSERT(model.patch_bias == nullptr);
+ GGML_ASSERT(model.class_embedding == nullptr);

- static ggml_cgraph * clip_image_build_graph_pixtral(clip_ctx * ctx, const clip_image_f32 & img) {
- const auto & model = ctx->vision_model;
- const auto & hparams = model.hparams;
-
- GGML_ASSERT(ctx->proj_type == PROJECTOR_TYPE_PIXTRAL);
-
- int image_size_width = img.nx;
- int image_size_height = img.ny;
-
- const int patch_size = hparams.patch_size;
- const int n_patches_x = image_size_width / patch_size;
- const int n_patches_y = image_size_height / patch_size;
- const int num_patches = n_patches_x * n_patches_y;
- const int hidden_size = hparams.hidden_size;
- const int n_head = hparams.n_head;
- const int d_head = hidden_size / n_head;
- const int n_layer = hparams.n_layer;
- const float eps = hparams.eps;
-
- struct ggml_init_params params = {
- /*.mem_size =*/ ctx->buf_compute_meta.size(),
- /*.mem_buffer =*/ ctx->buf_compute_meta.data(),
- /*.no_alloc =*/ true,
- };
+ const int batch_size = 1;
+ const bool use_window_attn = hparams.n_wa_pattern > 0;
+ const int n_wa_pattern = hparams.n_wa_pattern;
+ const int n_pos = n_patches;
+ const int num_position_ids = n_pos * 4; // m-rope requires 4 dim per position

- ggml_context_ptr ctx0_ptr(ggml_init(params));
- auto ctx0 = ctx0_ptr.get();
+ norm_type norm_t = ctx->proj_type == PROJECTOR_TYPE_QWEN25VL
+ ? NORM_TYPE_RMS // qwen 2.5 vl
+ : NORM_TYPE_NORMAL; // qwen 2 vl

- struct ggml_cgraph * gf = ggml_new_graph(ctx0);
+ int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4};

- // input raw
- struct ggml_tensor * inp_raw = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, image_size_width, image_size_height, 3);
- ggml_set_name(inp_raw, "inp_raw");
- ggml_set_input(inp_raw);
+ ggml_tensor * inp_raw = build_inp_raw();
+ ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1);

- // 2D input positions
- struct ggml_tensor * pos_h = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_patches);
- ggml_set_name(pos_h, "pos_h");
- ggml_set_input(pos_h);
- struct ggml_tensor * pos_w = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_patches);
- ggml_set_name(pos_w, "pos_w");
- ggml_set_input(pos_w);
+ GGML_ASSERT(img.nx % (patch_size * 2) == 0);
+ GGML_ASSERT(img.ny % (patch_size * 2) == 0);

- struct ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
- inp = ggml_reshape_2d(ctx0, inp, num_patches, hidden_size);
- inp = ggml_cont(ctx0, ggml_transpose(ctx0, inp));
+ // second conv dimension
+ {
+ auto inp_1 = ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
+ inp = ggml_add(ctx0, inp, inp_1);
+
+ inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 2, 0, 3)); // [w, h, c, b] -> [c, w, h, b]
+ inp = ggml_reshape_4d(
+ ctx0, inp,
+ n_embd * 2, n_patches_x / 2, n_patches_y, batch_size);
+ inp = ggml_reshape_4d(
+ ctx0, inp,
+ n_embd * 2, n_patches_x / 2, 2, batch_size * (n_patches_y / 2));
+ inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 0, 2, 1, 3));
+ inp = ggml_reshape_3d(
+ ctx0, inp,
+ n_embd, n_patches_x * n_patches_y, batch_size);
+ }
+
+ ggml_tensor * inpL = inp;
+ ggml_tensor * window_mask = nullptr;
+ ggml_tensor * window_idx = nullptr;
+ ggml_tensor * inv_window_idx = nullptr;
+
+ ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_position_ids);
+ ggml_set_name(positions, "positions");
+ ggml_set_input(positions);
+
+ // pre-layernorm
+ if (model.pre_ln_w) {
+ inpL = build_norm(inpL, model.pre_ln_w, model.pre_ln_b, norm_t, eps, -1);
+ }
+
+ if (use_window_attn) {
+ // handle window attention inputs
+ inv_window_idx = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos / 4);
+ ggml_set_name(inv_window_idx, "inv_window_idx");
+ ggml_set_input(inv_window_idx);
+ // mask for window attention
+ window_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_pos, n_pos);
+ ggml_set_name(window_mask, "window_mask");
+ ggml_set_input(window_mask);
+
+ // inpL shape: [n_embd, n_patches_x * n_patches_y, batch_size]
+ GGML_ASSERT(batch_size == 1);
+ inpL = ggml_reshape_2d(ctx0, inpL, n_embd * 4, n_patches_x * n_patches_y * batch_size / 4);
+ inpL = ggml_get_rows(ctx0, inpL, inv_window_idx);
+ inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_patches_x * n_patches_y, batch_size);
+ }
+
+ // loop over layers
+ for (int il = 0; il < n_layer; il++) {
+ auto & layer = model.layers[il];
+ const bool full_attn = use_window_attn ? (il + 1) % n_wa_pattern == 0 : true;

- struct ggml_tensor * embeddings = inp;
+ ggml_tensor * cur = inpL; // inpL = residual, cur = hidden_states

- // pre-layer norm
- embeddings = ggml_mul(ctx0, ggml_rms_norm(ctx0, embeddings, eps), model.pre_ln_w);
+ // layernorm1
+ cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, norm_t, eps, il);
+ cb(cur, "ln1", il);

- // loop over layers
- for (int il = 0; il < n_layer; il++) {
- struct ggml_tensor * cur = embeddings;
+ // self-attention
+ {
+ ggml_tensor * Qcur = ggml_add(ctx0,
+ ggml_mul_mat(ctx0, layer.q_w, cur), layer.q_b);
+ ggml_tensor * Kcur = ggml_add(ctx0,
+ ggml_mul_mat(ctx0, layer.k_w, cur), layer.k_b);
+ ggml_tensor * Vcur = ggml_add(ctx0,
+ ggml_mul_mat(ctx0, layer.v_w, cur), layer.v_b);
+
+ Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, n_patches);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, n_patches);
+ Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, n_patches);
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ // apply M-RoPE
+ Qcur = ggml_rope_multi(
+ ctx0, Qcur, positions, nullptr,
+ d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1);
+ Kcur = ggml_rope_multi(
+ ctx0, Kcur, positions, nullptr,
+ d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1);

- // pre-attention norm
- cur = ggml_mul(ctx0, ggml_rms_norm(ctx0, cur, eps), model.layers[il].ln_1_w);
+ cb(Qcur, "Qcur_rope", il);
+ cb(Kcur, "Kcur_rope", il);

- // self-attention
- {
- struct ggml_tensor * Q = ggml_mul_mat(ctx0, model.layers[il].q_w, cur);
+ ggml_tensor * attn_mask = full_attn ? nullptr : window_mask;

- Q = ggml_reshape_3d(ctx0, Q, d_head, n_head, num_patches);
- Q = build_rope_2d(ctx0, Q, pos_h, pos_w, hparams.rope_theta);
- Q = ggml_cont(ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3));
+ cur = build_attn(layer.o_w, layer.o_b,
+ Qcur, Kcur, Vcur, attn_mask, kq_scale, il);
+ cb(cur, "attn_out", il);
+ }

- struct ggml_tensor * K = ggml_mul_mat(ctx0, model.layers[il].k_w, cur);
+ // re-add the layer input, e.g., residual
+ cur = ggml_add(ctx0, cur, inpL);

- K = ggml_reshape_3d(ctx0, K, d_head, n_head, num_patches);
- K = build_rope_2d(ctx0, K, pos_h, pos_w, hparams.rope_theta);
- K = ggml_cont(ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3));
+ inpL = cur; // inpL = residual, cur = hidden_states

- struct ggml_tensor * V = ggml_mul_mat(ctx0, model.layers[il].v_w, cur);
+ cb(cur, "ffn_inp", il);

- V = ggml_reshape_3d(ctx0, V, d_head, n_head, num_patches);
- V = ggml_cont(ctx0, ggml_permute(ctx0, V, 1, 2, 0, 3));
+ // layernorm2
+ cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, norm_t, eps, il);
+ cb(cur, "ffn_inp_normed", il);

- struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
- KQ = ggml_soft_max_ext(ctx0, KQ, nullptr, 1.0f / sqrtf((float)d_head), 0.0f);
+ // ffn
+ cur = build_ffn(cur,
+ layer.ff_up_w, layer.ff_up_b,
+ layer.ff_gate_w, layer.ff_gate_b,
+ layer.ff_down_w, layer.ff_down_b,
+ hparams.ffn_op, il);

- struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ);
- KQV = ggml_reshape_3d(ctx0, KQV, d_head, num_patches, n_head);
- KQV = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
+ cb(cur, "ffn_out", il);

- cur = ggml_cont_2d(ctx0, KQV, hidden_size, num_patches);
+ // residual 2
+ cur = ggml_add(ctx0, inpL, cur);
+ cb(cur, "layer_out", il);

- cur = ggml_mul_mat(ctx0, model.layers[il].o_w, cur);
+ inpL = cur;
  }

- // re-add the layer input, e.g., residual
- cur = ggml_add(ctx0, cur, embeddings);
-
- embeddings = cur; // embeddings = residual, cur = hidden_states
-
- // pre-ffn norm
- cur = ggml_mul(ctx0, ggml_rms_norm(ctx0, cur, eps), model.layers[il].ln_2_w);
-
- // feed-forward
- {
- ggml_tensor * gate_proj = ggml_mul_mat(ctx0, model.layers[il].ff_gate_w, cur);
- ggml_tensor * up_proj = ggml_mul_mat(ctx0, model.layers[il].ff_up_w, cur);
- gate_proj = ggml_silu(ctx0, gate_proj); // pixtral uses silu
- cur = ggml_mul(ctx0, up_proj, gate_proj);
- cur = ggml_mul_mat(ctx0, model.layers[il].ff_down_w, cur);
+ // post-layernorm
+ if (model.post_ln_w) {
+ inpL = build_norm(inpL, model.post_ln_w, model.post_ln_b, norm_t, eps, n_layer);
  }

- // residual 2
- cur = ggml_add(ctx0, embeddings, cur);
+ // multimodal projection
+ ggml_tensor * embeddings = inpL;
+ embeddings = ggml_reshape_3d(ctx0, embeddings, n_embd * 4, n_pos / 4, batch_size);

- embeddings = cur;
- }
+ embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
+ embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);

- // LlavaMultiModalProjector (with GELU activation)
- {
+ // GELU activation
+ embeddings = ggml_gelu(ctx0, embeddings);
+
+ // Second linear layer
  embeddings = ggml_mul_mat(ctx0, model.mm_1_w, embeddings);
  embeddings = ggml_add(ctx0, embeddings, model.mm_1_b);

- embeddings = ggml_gelu(ctx0, embeddings);
- embeddings = ggml_mul_mat(ctx0, model.mm_2_w, embeddings);
- embeddings = ggml_add(ctx0, embeddings, model.mm_2_b);
- }
+ if (use_window_attn) {
+ window_idx = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos / 4);
+ ggml_set_name(window_idx, "window_idx");
+ ggml_set_input(window_idx);

- // arrangement of the [IMG_BREAK] token
- {
- // not efficient, but works
- // the trick is to view the embeddings as a 3D tensor with shape [hidden_size, n_patches_per_row, n_rows]
- // and then concatenate the [IMG_BREAK] token to the end of each row, aka n_patches_per_row dimension
- // after the concatenation, we have a tensor with shape [hidden_size, n_patches_per_row + 1, n_rows]
+ // embeddings shape: [n_embd, n_patches_x * n_patches_y, batch_size]
+ GGML_ASSERT(batch_size == 1);
+ embeddings = ggml_reshape_2d(ctx0, embeddings, hparams.projection_dim, n_patches_x * n_patches_y / 4);
+ embeddings = ggml_get_rows(ctx0, embeddings, window_idx);
+ embeddings = ggml_reshape_3d(ctx0, embeddings, hparams.projection_dim, n_patches_x * n_patches_y / 4, batch_size);
+ }

- const int n_embd_text = embeddings->ne[0];
- const int n_tokens_output = num_patches + n_patches_y - 1; // one [IMG_BREAK] per row, except the last row
+ // build the graph
+ ggml_build_forward_expand(gf, embeddings);

- ggml_tensor * cur = ggml_reshape_3d(ctx0, embeddings, n_embd_text, n_patches_x, n_patches_y);
- ggml_tensor * tok = ggml_new_tensor_3d(ctx0, embeddings->type, n_embd_text, 1, n_patches_y);
- tok = ggml_scale(ctx0, tok, 0.0); // clear the tensor
- tok = ggml_add(ctx0, tok, model.token_embd_img_break);
- cur = ggml_concat(ctx0, cur, tok, 1);
- embeddings = ggml_view_2d(ctx0, cur,
- n_embd_text, n_tokens_output,
- ggml_row_size(cur->type, n_embd_text), 0);
+ return gf;
  }

- // build the graph
- ggml_build_forward_expand(gf, embeddings);
+ ggml_cgraph * build_minicpmv() {
+ const int batch_size = 1;

- return gf;
- }
+ GGML_ASSERT(model.class_embedding == nullptr);
+ const int n_pos = n_patches;

- static ggml_cgraph * clip_image_build_graph_qwen25vl(clip_ctx * ctx, const clip_image_f32_batch & imgs) {
- const auto & model = ctx->vision_model;
- const auto & hparams = model.hparams;
+ // position embeddings for the projector (not for ViT)
+ int n_output_dim = clip_n_mmproj_embd(ctx);
+ ggml_tensor * pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_output_dim, n_pos, batch_size);
+ ggml_set_name(pos_embed, "pos_embed");
+ ggml_set_input(pos_embed);

- const int image_size_width = imgs.entries[0]->nx;
- const int image_size_height = imgs.entries[0]->ny;
+ // for selecting learned pos embd, used by ViT
+ struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos);
+ ggml_set_name(positions, "positions");
+ ggml_set_input(positions);

- const bool use_window_attn = hparams.n_wa_pattern > 0;
-
- const int n_wa_pattern = hparams.n_wa_pattern;
- const int patch_size = hparams.patch_size;
- const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size));
- const int patches_w = image_size_width / patch_size;
- const int patches_h = image_size_height / patch_size;
- const int num_positions = num_patches + (model.class_embedding ? 1 : 0);
- const int num_position_ids = num_positions * 4; // m-rope requires 4 dim per position
- const int hidden_size = hparams.hidden_size;
- const int n_head = hparams.n_head;
- const int d_head = hidden_size / n_head;
- const int n_layer = hparams.n_layer;
- const float eps = hparams.eps;
-
- int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4};
-
- const int batch_size = imgs.entries.size();
- GGML_ASSERT(batch_size == 1);
-
- struct ggml_init_params params = {
- /*.mem_size =*/ ctx->buf_compute_meta.size(),
- /*.mem_buffer =*/ ctx->buf_compute_meta.data(),
- /*.no_alloc =*/ true,
- };
+ ggml_tensor * learned_pos_embd = ggml_get_rows(ctx0, model.position_embeddings, positions);
+
+ ggml_tensor * inp = build_inp();
+ ggml_tensor * embeddings = build_vit(
+ inp, n_patches,
+ NORM_TYPE_NORMAL,
+ hparams.ffn_op,
+ learned_pos_embd,
+ nullptr);

- ggml_context_ptr ctx0_ptr(ggml_init(params));
- auto ctx0 = ctx0_ptr.get();
+ // resampler projector (it is just another transformer)

- struct ggml_cgraph * gf = ggml_new_graph(ctx0);
+ ggml_tensor * q = model.mm_model_query;
+ ggml_tensor * v = ggml_mul_mat(ctx0, model.mm_model_kv_proj, embeddings);

- struct ggml_tensor * inp_raw = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, image_size_width, image_size_height, 3, batch_size);
- ggml_set_name(inp_raw, "inp_raw");
- ggml_set_input(inp_raw);
+ // norm
+ q = build_norm(q, model.mm_model_ln_q_w, model.mm_model_ln_q_b, NORM_TYPE_NORMAL, eps, -1);
+ v = build_norm(v, model.mm_model_ln_kv_w, model.mm_model_ln_kv_b, NORM_TYPE_NORMAL, eps, -1);

- struct ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
+ // k = v + pos_embed
+ ggml_tensor * k = ggml_add(ctx0, v, pos_embed);

- GGML_ASSERT(image_size_width % (patch_size * 2) == 0);
- GGML_ASSERT(image_size_height % (patch_size * 2) == 0);
+ // attention
+ {
+ int n_embd = clip_n_mmproj_embd(ctx);
+ const int d_head = 128;
+ int n_head = n_embd/d_head;
+ int num_query = 96;
+ if (ctx->minicpmv_version == 2) {
+ num_query = 96;
+ } else if (ctx->minicpmv_version == 3) {
+ num_query = 64;
+ } else if (ctx->minicpmv_version == 4) {
833
+ num_query = 64;
834
+ }
817
835
 
818
- auto inp_1 = ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
819
- inp = ggml_add(ctx0, inp, inp_1);
836
+ ggml_tensor * Q = ggml_add(ctx0,
837
+ ggml_mul_mat(ctx0, model.mm_model_attn_q_w, q),
838
+ model.mm_model_attn_q_b);
839
+ ggml_tensor * K = ggml_add(ctx0,
840
+ ggml_mul_mat(ctx0, model.mm_model_attn_k_w, k),
841
+ model.mm_model_attn_k_b);
842
+ ggml_tensor * V = ggml_add(ctx0,
843
+ ggml_mul_mat(ctx0, model.mm_model_attn_v_w, v),
844
+ model.mm_model_attn_v_b);
845
+
846
+ Q = ggml_reshape_3d(ctx0, Q, d_head, n_head, num_query);
847
+ K = ggml_reshape_3d(ctx0, K, d_head, n_head, n_pos);
848
+ V = ggml_reshape_3d(ctx0, V, d_head, n_head, n_pos);
849
+
850
+ cb(Q, "resampler_Q", -1);
851
+ cb(K, "resampler_K", -1);
852
+ cb(V, "resampler_V", -1);
853
+
854
+ embeddings = build_attn(
855
+ model.mm_model_attn_o_w,
856
+ model.mm_model_attn_o_b,
857
+ Q, K, V, nullptr, kq_scale, -1);
858
+ cb(embeddings, "resampler_attn_out", -1);
859
+ }
860
+ // layernorm
861
+ embeddings = build_norm(embeddings, model.mm_model_ln_post_w, model.mm_model_ln_post_b, NORM_TYPE_NORMAL, eps, -1);
862
+
863
+ // projection
864
+ embeddings = ggml_mul_mat(ctx0, model.mm_model_proj, embeddings);
820
865
 
821
- inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 2, 0, 3)); // [w, h, c, b] -> [c, w, h, b]
822
- inp = ggml_reshape_4d(
823
- ctx0, inp,
824
- hidden_size * 2, patches_w / 2, patches_h, batch_size);
825
- inp = ggml_reshape_4d(
826
- ctx0, inp,
827
- hidden_size * 2, patches_w / 2, 2, batch_size * (patches_h / 2));
828
- inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 0, 2, 1, 3));
829
- inp = ggml_reshape_3d(
830
- ctx0, inp,
831
- hidden_size, patches_w * patches_h, batch_size);
866
+ // build the graph
867
+ ggml_build_forward_expand(gf, embeddings);
832
868
 
833
- if (model.patch_bias) {
834
- // inp = ggml_add(ctx0, inp, ggml_repeat(ctx0, model.patch_bias, inp));
835
- inp = ggml_add(ctx0, inp, model.patch_bias);
869
+ return gf;
836
870
  }
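
The resampler above is a small cross-attention block: a fixed set of learned queries (96 or 64 depending on minicpmv_version) attends over the ViT patch embeddings, compressing them to a fixed number of output tokens. A toy single-head version of that computation, independent of ggml and with made-up sizes:

// Illustrative sketch of query-to-patch cross-attention (not the ggml graph above).
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

using Mat = std::vector<std::vector<float>>;

static Mat cross_attention(const Mat & Q, const Mat & K, const Mat & V) {
    const size_t d = Q[0].size();
    const float scale = 1.0f / std::sqrt((float) d);   // kq_scale
    Mat out(Q.size(), std::vector<float>(V[0].size(), 0.0f));
    for (size_t i = 0; i < Q.size(); ++i) {
        // scaled dot-product scores over all key positions
        std::vector<float> s(K.size());
        float max_s = -1e30f;
        for (size_t j = 0; j < K.size(); ++j) {
            float dot = 0.0f;
            for (size_t c = 0; c < d; ++c) dot += Q[i][c] * K[j][c];
            s[j] = dot * scale;
            max_s = std::max(max_s, s[j]);
        }
        // softmax
        float sum = 0.0f;
        for (float & x : s) { x = std::exp(x - max_s); sum += x; }
        for (float & x : s) x /= sum;
        // weighted sum of values
        for (size_t j = 0; j < V.size(); ++j)
            for (size_t c = 0; c < V[0].size(); ++c)
                out[i][c] += s[j] * V[j][c];
    }
    return out;
}

int main() {
    // 2 learned queries attending over 3 patch embeddings, d = 4 (toy sizes)
    Mat Q = {{1,0,0,0},{0,1,0,0}};
    Mat K = {{1,0,0,0},{0,1,0,0},{0,0,1,0}};
    Mat V = {{1,1,1,1},{2,2,2,2},{3,3,3,3}};
    Mat out = cross_attention(Q, K, V);
    printf("out[0][0] = %.3f, out[1][0] = %.3f\n", out[0][0], out[1][0]);
    return 0;
}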
837
- struct ggml_tensor * embeddings = inp;
838
- struct ggml_tensor * window_mask = nullptr;
839
- struct ggml_tensor * window_idx = nullptr;
840
- struct ggml_tensor * inv_window_idx = nullptr;
841
871
 
842
- struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_position_ids);
843
- ggml_set_name(positions, "positions");
844
- ggml_set_input(positions);
872
+ ggml_cgraph * build_internvl() {
873
+ GGML_ASSERT(model.class_embedding != nullptr);
874
+ GGML_ASSERT(model.position_embeddings != nullptr);
845
875
 
846
- // pre-layernorm
847
- if (model.pre_ln_w) {
848
- embeddings = ggml_rms_norm(ctx0, embeddings, eps);
849
- ggml_set_name(embeddings, "pre_ln");
876
+ const int n_pos = n_patches + 1;
877
+ ggml_tensor * inp = build_inp();
850
878
 
851
- embeddings = ggml_mul(ctx0, embeddings, model.pre_ln_w);
852
- }
853
-
854
- if (use_window_attn) {
855
- // handle window attention inputs
856
- inv_window_idx = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_positions / 4);
857
- ggml_set_name(inv_window_idx, "inv_window_idx");
858
- ggml_set_input(inv_window_idx);
859
- // mask for window attention
860
- window_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, num_positions, num_positions);
861
- ggml_set_name(window_mask, "window_mask");
862
- ggml_set_input(window_mask);
879
+ // add CLS token
880
+ inp = ggml_concat(ctx0, inp, model.class_embedding, 1);
863
881
 
864
- // embeddings shape: [hidden_size, patches_w * patches_h, batch_size]
865
- GGML_ASSERT(batch_size == 1);
866
- embeddings = ggml_reshape_2d(ctx0, embeddings, hidden_size * 4, patches_w * patches_h * batch_size / 4);
867
- embeddings = ggml_get_rows(ctx0, embeddings, inv_window_idx);
868
- embeddings = ggml_reshape_3d(ctx0, embeddings, hidden_size, patches_w * patches_h, batch_size);
869
- }
882
+ // The larger models use a different ViT, which uses RMS norm instead of layer norm
883
+ // ref: https://github.com/ggml-org/llama.cpp/pull/13443#issuecomment-2869786188
884
+ norm_type norm_t = (hparams.n_embd == 3200 && hparams.n_layer == 45)
885
+ ? NORM_TYPE_RMS // 6B ViT (Used by InternVL 2.5/3 - 26B, 38B, 78B)
886
+ : NORM_TYPE_NORMAL; // 300M ViT (Used by all smaller InternVL models)
870
887
 
871
- // loop over layers
872
- for (int il = 0; il < n_layer; il++) {
873
- struct ggml_tensor * cur = embeddings; // embeddings = residual, cur = hidden_states
888
+ ggml_tensor * cur = build_vit(
889
+ inp, n_pos,
890
+ norm_t,
891
+ hparams.ffn_op,
892
+ model.position_embeddings,
893
+ nullptr);
874
894
 
875
- // rmsnorm1
876
- cur = ggml_rms_norm(ctx0, cur, eps);
877
- cur = ggml_mul(ctx0, cur, model.layers[il].ln_1_w);
895
+ // remove CLS token
896
+ cur = ggml_view_2d(ctx0, cur,
897
+ n_embd, n_patches,
898
+ ggml_row_size(cur->type, n_embd), 0);
878
899
 
879
- // self-attention
900
+ // pixel shuffle
880
901
  {
902
+ const int scale_factor = model.hparams.proj_scale_factor;
903
+ const int bsz = 1; // batch size, always 1 for now since we don't support batching
904
+ const int height = n_patches_y;
905
+ const int width = n_patches_x;
906
+ GGML_ASSERT(scale_factor > 0);
907
+ cur = ggml_reshape_4d(ctx0, cur, n_embd * scale_factor, height / scale_factor, width, bsz);
908
+ cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
909
+ cur = ggml_reshape_4d(ctx0, ggml_cont(ctx0, cur),
910
+ n_embd * scale_factor * scale_factor,
911
+ height / scale_factor,
912
+ width / scale_factor,
913
+ bsz);
914
+ cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
915
+ // flatten to 2D
916
+ cur = ggml_reshape_2d(ctx0, ggml_cont(ctx0, cur),
917
+ n_embd * scale_factor * scale_factor,
918
+ cur->ne[1] * cur->ne[2]);
919
+ }
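
The pixel-shuffle block above folds each scale_factor x scale_factor group of neighbouring patches into the channel dimension, cutting the token count by scale_factor^2 before the projector. A standalone sketch of the same rearrangement on a plain array (toy sizes; not the ggml reshape/permute sequence):

#include <cstdio>
#include <vector>

int main() {
    const int H = 4, W = 4, C = 2, s = 2;   // toy patch grid and scale factor
    std::vector<float> grid(H * W * C);
    for (int i = 0; i < (int) grid.size(); ++i) grid[i] = (float) i;

    const int Ho = H / s, Wo = W / s, Co = C * s * s;
    std::vector<float> out(Ho * Wo * Co);
    for (int y = 0; y < Ho; ++y) {
        for (int x = 0; x < Wo; ++x) {
            int co = 0;
            // each s x s block of patches becomes extra channels of one token
            for (int dy = 0; dy < s; ++dy)
                for (int dx = 0; dx < s; ++dx)
                    for (int c = 0; c < C; ++c)
                        out[(y * Wo + x) * Co + co++] =
                            grid[((y * s + dy) * W + (x * s + dx)) * C + c];
        }
    }
    printf("tokens: %d -> %d, channels: %d -> %d\n", H * W, Ho * Wo, C, Co);
    return 0;
}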
920
+
921
+ // projector (always using GELU activation)
922
+ {
923
+ // projector LayerNorm uses pytorch's default eps = 1e-5
924
+ // ref: https://huggingface.co/OpenGVLab/InternVL3-8B-Instruct/blob/a34d3e4e129a5856abfd6aa6de79776484caa14e/modeling_internvl_chat.py#L79
925
+ cur = build_norm(cur, model.mm_0_w, model.mm_0_b, NORM_TYPE_NORMAL, 1e-5, -1);
926
+ cur = ggml_mul_mat(ctx0, model.mm_1_w, cur);
927
+ cur = ggml_add(ctx0, cur, model.mm_1_b);
928
+ cur = ggml_gelu(ctx0, cur);
929
+ cur = ggml_mul_mat(ctx0, model.mm_3_w, cur);
930
+ cur = ggml_add(ctx0, cur, model.mm_3_b);
931
+ }
881
932
 
882
- struct ggml_tensor * Q =
883
- ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].q_w, cur), model.layers[il].q_b);
933
+ // build the graph
934
+ ggml_build_forward_expand(gf, cur);
884
935
 
885
- Q = ggml_reshape_4d(ctx0, Q, d_head, n_head, num_positions, batch_size);
886
- Q = ggml_rope_multi(
887
- ctx0, Q, positions, nullptr,
888
- d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1);
889
- Q = ggml_cont(ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3));
890
- Q = ggml_reshape_3d(ctx0, Q, d_head, num_positions, n_head * batch_size);
936
+ return gf;
937
+ }
891
938
 
892
- struct ggml_tensor * K =
893
- ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].k_w, cur), model.layers[il].k_b);
939
+ // this graph is used by llava, granite and glm
940
+ // due to having embedding_stack (used by granite), we cannot reuse build_vit
941
+ ggml_cgraph * build_llava() {
942
+ const int batch_size = 1;
943
+ const int n_pos = n_patches + (model.class_embedding ? 1 : 0);
894
944
 
895
- K = ggml_reshape_4d(ctx0, K, d_head, n_head, num_positions, batch_size);
896
- K = ggml_rope_multi(
897
- ctx0, K, positions, nullptr,
898
- d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1);
899
- K = ggml_cont(ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3));
900
- K = ggml_reshape_3d(ctx0, K, d_head, num_positions, n_head * batch_size);
945
+ GGML_ASSERT(n_patches_x == n_patches_y && "only square images supported");
901
946
 
902
- struct ggml_tensor * V =
903
- ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].v_w, cur), model.layers[il].v_b);
947
+ // Calculate the deepest feature layer based on hparams and projector type
948
+ int max_feature_layer = n_layer;
949
+ {
950
+ // Get the index of the second to last layer; this is the default for models that have a llava projector
951
+ int il_last = hparams.n_layer - 1;
952
+ int deepest_feature_layer = -1;
904
953
 
905
- V = ggml_reshape_4d(ctx0, V, d_head, n_head, num_positions, batch_size);
906
- V = ggml_cont(ctx0, ggml_permute(ctx0, V, 1, 2, 0, 3));
907
- V = ggml_reshape_3d(ctx0, V, num_positions, d_head, n_head * batch_size);
954
+ if (ctx->proj_type == PROJECTOR_TYPE_MINICPMV || ctx->proj_type == PROJECTOR_TYPE_GLM_EDGE) {
955
+ il_last += 1;
956
+ }
908
957
 
909
- struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
910
- const bool full_attn = use_window_attn ? (il + 1) % n_wa_pattern == 0 : true;
911
- if (full_attn) {
912
- KQ = ggml_soft_max_ext(ctx0, KQ, nullptr, 1.0f / sqrtf((float)d_head), 0.0f);
913
- } else {
914
- KQ = ggml_soft_max_ext(ctx0, KQ, window_mask, 1.0f / sqrtf((float)d_head), 0.0f);
958
+ // If we set explicit vision feature layers, only go up to the deepest one
959
+ // NOTE: only used by granite-vision models for now
960
+ for (const auto & feature_layer : hparams.vision_feature_layer) {
961
+ if (feature_layer > deepest_feature_layer) {
962
+ deepest_feature_layer = feature_layer;
963
+ }
915
964
  }
965
+ max_feature_layer = deepest_feature_layer < 0 ? il_last : deepest_feature_layer;
966
+ }
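
A compact sketch of the feature-layer selection above: when explicit vision feature layers are configured (currently only granite-vision), the encoder is only built up to the deepest requested layer; otherwise it falls back to the usual second-to-last layer. The values below are hypothetical:

#include <algorithm>
#include <cstdio>
#include <set>

int main() {
    const int n_layer = 24;
    int il_last = n_layer - 1;                            // default for llava-style projectors
    std::set<int> vision_feature_layer = {3, 7, 15, 23};  // hypothetical config

    int deepest = -1;
    for (int l : vision_feature_layer) deepest = std::max(deepest, l);

    const int max_feature_layer = deepest < 0 ? il_last : deepest;
    printf("deepest requested layer: %d of %d encoder layers\n", max_feature_layer, n_layer);
    return 0;
}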
916
967
 
917
- struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ);
918
- KQV = ggml_reshape_4d(ctx0, KQV, d_head, num_positions, n_head, batch_size);
919
- KQV = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
968
+ ggml_tensor * inp = build_inp();
920
969
 
921
- cur = ggml_cont_3d(ctx0, KQV, hidden_size, num_positions, batch_size);
970
+ // concat class_embeddings and patch_embeddings
971
+ if (model.class_embedding) {
972
+ inp = ggml_concat(ctx0, inp, model.class_embedding, 1);
922
973
  }
923
974
 
924
- // attention output
925
- cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].o_w, cur), model.layers[il].o_b);
975
+ ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos);
976
+ ggml_set_name(positions, "positions");
977
+ ggml_set_input(positions);
926
978
 
927
- // re-add the layer input, e.g., residual
928
- cur = ggml_add(ctx0, cur, embeddings);
979
+ inp = ggml_add(ctx0, inp, ggml_get_rows(ctx0, model.position_embeddings, positions));
929
980
 
930
- embeddings = cur; // embeddings = residual, cur = hidden_states
981
+ ggml_tensor * inpL = inp;
931
982
 
932
- // rms norm2
933
- cur = ggml_rms_norm(ctx0, cur, eps);
934
- cur = ggml_mul(ctx0, cur, model.layers[il].ln_2_w);
983
+ // pre-layernorm
984
+ if (model.pre_ln_w) {
985
+ inpL = build_norm(inpL, model.pre_ln_w, model.pre_ln_b, NORM_TYPE_NORMAL, eps, -1);
986
+ cb(inpL, "pre_ln", -1);
987
+ }
935
988
 
936
- // mlp
937
- // ffn_up
938
- auto cur_up = ggml_mul_mat(ctx0, model.layers[il].ff_o_w, cur);
939
- cur_up = ggml_add(ctx0, cur_up, model.layers[il].ff_o_b);
989
+ std::vector<ggml_tensor *> embedding_stack;
990
+ const auto & vision_feature_layer = hparams.vision_feature_layer;
940
991
 
941
- auto cur_gate = ggml_mul_mat(ctx0, model.layers[il].ff_g_w, cur);
942
- cur_gate = ggml_add(ctx0, cur_gate, model.layers[il].ff_g_b);
943
- // TODO : only 2 of these 3 are actually used, should we remove one of them?
944
- if (ctx->use_gelu) {
945
- cur_gate = ggml_gelu_inplace(ctx0, cur_gate);
946
- } else if (ctx->use_silu) {
947
- cur_gate = ggml_silu_inplace(ctx0, cur_gate);
948
- } else {
949
- cur_gate = ggml_gelu_quick_inplace(ctx0, cur_gate);
950
- }
951
- cur = ggml_mul(ctx0, cur_gate, cur_up);
992
+ // loop over layers
993
+ for (int il = 0; il < max_feature_layer; il++) {
994
+ auto & layer = model.layers[il];
995
+ ggml_tensor * cur = inpL; // inpL = residual, cur = hidden_states
952
996
 
953
- // ffn_down
954
- cur = ggml_mul_mat(ctx0, model.layers[il].ff_i_w, cur);
955
- cur = ggml_add(ctx0, cur, model.layers[il].ff_i_b);
997
+ // If this is an embedding feature layer, save the output.
998
+ // NOTE: 0 index here refers to the input to the encoder.
999
+ if (vision_feature_layer.find(il) != vision_feature_layer.end()) {
1000
+ embedding_stack.push_back(cur);
1001
+ }
956
1002
 
957
- // residual 2
958
- cur = ggml_add(ctx0, embeddings, cur);
1003
+ // layernorm1
1004
+ cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, NORM_TYPE_NORMAL, eps, il);
1005
+ cb(cur, "layer_inp_normed", il);
959
1006
 
960
- embeddings = cur;
961
- }
1007
+ // self-attention
1008
+ {
1009
+ ggml_tensor * Qcur = ggml_mul_mat(ctx0, layer.q_w, cur);
1010
+ if (layer.q_b) {
1011
+ Qcur = ggml_add(ctx0, Qcur, layer.q_b);
1012
+ }
962
1013
 
963
- // post-layernorm
964
- if (model.post_ln_w) {
965
- embeddings = ggml_rms_norm(ctx0, embeddings, eps);
966
- ggml_set_name(embeddings, "post_ln");
1014
+ ggml_tensor * Kcur = ggml_mul_mat(ctx0, layer.k_w, cur);
1015
+ if (layer.k_b) {
1016
+ Kcur = ggml_add(ctx0, Kcur, layer.k_b);
1017
+ }
967
1018
 
968
- embeddings = ggml_mul(ctx0, embeddings, model.post_ln_w);
969
- }
1019
+ ggml_tensor * Vcur = ggml_mul_mat(ctx0, layer.v_w, cur);
1020
+ if (layer.v_b) {
1021
+ Vcur = ggml_add(ctx0, Vcur, layer.v_b);
1022
+ }
970
1023
 
971
- embeddings = ggml_reshape_3d(ctx0, embeddings, hidden_size * 4, num_positions / 4, batch_size);
1024
+ Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, n_pos);
1025
+ Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, n_pos);
1026
+ Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, n_pos);
972
1027
 
973
- embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
974
- embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);
1028
+ cb(Qcur, "Qcur", il);
1029
+ cb(Kcur, "Kcur", il);
1030
+ cb(Vcur, "Vcur", il);
975
1031
 
976
- // GELU activation
977
- embeddings = ggml_gelu(ctx0, embeddings);
1032
+ cur = build_attn(layer.o_w, layer.o_b,
1033
+ Qcur, Kcur, Vcur, nullptr, kq_scale, il);
1034
+ cb(cur, "attn_out", il);
1035
+ }
978
1036
 
979
- // Second linear layer
980
- embeddings = ggml_mul_mat(ctx0, model.mm_1_w, embeddings);
981
- embeddings = ggml_add(ctx0, embeddings, model.mm_1_b);
1037
+ // re-add the layer input, e.g., residual
1038
+ cur = ggml_add(ctx0, cur, inpL);
982
1039
 
983
- if (use_window_attn) {
984
- window_idx = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_positions / 4);
985
- ggml_set_name(window_idx, "window_idx");
986
- ggml_set_input(window_idx);
1040
+ inpL = cur; // inpL = residual, cur = hidden_states
987
1041
 
988
- // embeddings shape: [hidden_size, patches_w * patches_h, batch_size]
989
- GGML_ASSERT(batch_size == 1);
990
- embeddings = ggml_reshape_2d(ctx0, embeddings, hparams.projection_dim, patches_w * patches_h / 4);
991
- embeddings = ggml_get_rows(ctx0, embeddings, window_idx);
992
- embeddings = ggml_reshape_3d(ctx0, embeddings, hparams.projection_dim, patches_w * patches_h / 4, batch_size);
993
- }
1042
+ cb(cur, "ffn_inp", il);
994
1043
 
995
- // build the graph
996
- ggml_build_forward_expand(gf, embeddings);
1044
+ // layernorm2
1045
+ cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, NORM_TYPE_NORMAL, eps, il);
1046
+ cb(cur, "ffn_inp_normed", il);
997
1047
 
998
- return gf;
999
- }
1048
+ // ffn
1049
+ cur = build_ffn(cur,
1050
+ layer.ff_up_w, layer.ff_up_b,
1051
+ layer.ff_gate_w, layer.ff_gate_b,
1052
+ layer.ff_down_w, layer.ff_down_b,
1053
+ hparams.ffn_op, il);
1000
1054
 
1001
- static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_image_f32_batch & imgs, struct clip_image_size load_image_size, bool is_inf = false) {
1002
- const auto & model = ctx->vision_model;
1003
- const auto & hparams = model.hparams;
1055
+ cb(cur, "ffn_out", il);
1004
1056
 
1005
- const int image_size = hparams.image_size;
1006
- int image_size_width = image_size;
1007
- int image_size_height = image_size;
1057
+ // residual 2
1058
+ cur = ggml_add(ctx0, inpL, cur);
1059
+ cb(cur, "layer_out", il);
1008
1060
 
1009
- if (ctx->proj_type == PROJECTOR_TYPE_MINICPMV) {
1010
- LOG_DBG("%s: %d %d\n", __func__, load_image_size.width, load_image_size.height);
1011
- image_size_width = load_image_size.width;
1012
- image_size_height = load_image_size.height;
1013
- if (is_inf) {
1014
- image_size_width = imgs.entries[0]->nx;
1015
- image_size_height = imgs.entries[0]->ny;
1061
+ inpL = cur;
1016
1062
  }
1017
- }
1018
1063
 
1019
- else if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL) {
1020
- // use the image's native resolution when image is avaible
1021
- if (is_inf) {
1022
- // if (imgs->data->nx && imgs->data->ny) {
1023
- image_size_width = imgs.entries[0]->nx;
1024
- image_size_height = imgs.entries[0]->ny;
1064
+ // post-layernorm
1065
+ if (model.post_ln_w) {
1066
+ inpL = build_norm(inpL, model.post_ln_w, model.post_ln_b, NORM_TYPE_NORMAL, eps, -1);
1025
1067
  }
1026
- }
1027
-
1028
- const int patch_size = hparams.patch_size;
1029
- const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size));
1030
- const int patches_w = image_size_width / patch_size;
1031
- const int patches_h = image_size_height / patch_size;
1032
- const int num_positions = num_patches + (model.class_embedding ? 1 : 0);
1033
- const int num_position_ids = ctx->proj_type == PROJECTOR_TYPE_QWEN2VL ? num_positions * 4 : num_positions;
1034
- const int hidden_size = hparams.hidden_size;
1035
- const int n_head = hparams.n_head;
1036
- const int d_head = hidden_size / n_head;
1037
- const float eps = hparams.eps;
1038
- int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4};
1039
-
1040
- const int batch_size = imgs.entries.size();
1041
-
1042
- if (ctx->has_llava_projector
1043
- || ctx->proj_type == PROJECTOR_TYPE_MINICPMV
1044
- || ctx->proj_type == PROJECTOR_TYPE_GLM_EDGE) {
1045
- GGML_ASSERT(batch_size == 1);
1046
- }
1047
1068
 
1048
- struct ggml_init_params params = {
1049
- /*.mem_size =*/ ctx->buf_compute_meta.size(),
1050
- /*.mem_buffer =*/ ctx->buf_compute_meta.data(),
1051
- /*.no_alloc =*/ true,
1052
- };
1069
+ ggml_tensor * embeddings = inpL;
1053
1070
 
1054
- ggml_context_ptr ctx0_ptr(ggml_init(params));
1055
- auto ctx0 = ctx0_ptr.get();
1071
+ // process vision feature layers (used by granite)
1072
+ {
1073
+ // final layer is a vision feature layer
1074
+ if (vision_feature_layer.find(max_feature_layer) != vision_feature_layer.end()) {
1075
+ embedding_stack.push_back(inpL);
1076
+ }
1056
1077
 
1057
- struct ggml_cgraph * gf = ggml_new_graph(ctx0);
1078
+ // If feature layers are explicitly set, stack them (if we have multiple)
1079
+ if (!embedding_stack.empty()) {
1080
+ embeddings = embedding_stack[0];
1081
+ for (size_t i = 1; i < embedding_stack.size(); i++) {
1082
+ embeddings = ggml_concat(ctx0, embeddings, embedding_stack[i], 0);
1083
+ }
1084
+ }
1085
+ }
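
The embedding_stack handling above concatenates the saved per-layer outputs along the feature dimension (dim 0 in ggml terms), so each patch ends up with one wider feature vector for the projector. A toy illustration:

#include <cstdio>
#include <vector>

int main() {
    // two saved layers, 3 patches each, 4 features per patch (toy sizes)
    std::vector<std::vector<float>> layer_a(3, std::vector<float>(4, 1.0f));
    std::vector<std::vector<float>> layer_b(3, std::vector<float>(4, 2.0f));

    std::vector<std::vector<float>> stacked(3);
    for (size_t p = 0; p < stacked.size(); ++p) {
        stacked[p] = layer_a[p];                                      // features from layer A
        stacked[p].insert(stacked[p].end(), layer_b[p].begin(), layer_b[p].end()); // then layer B
    }
    printf("per-patch features: %zu -> %zu\n", layer_a[0].size(), stacked[0].size());
    return 0;
}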
1058
1086
 
1059
- struct ggml_tensor * inp_raw = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, image_size_width, image_size_height, 3, batch_size);
1060
- ggml_set_name(inp_raw, "inp_raw");
1061
- ggml_set_input(inp_raw);
1087
+ // llava projector (also used by granite)
1088
+ if (ctx->has_llava_projector) {
1089
+ embeddings = ggml_reshape_2d(ctx0, embeddings, embeddings->ne[0], embeddings->ne[1]);
1062
1090
 
1063
- struct ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
1091
+ ggml_tensor * patches = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches);
1092
+ ggml_set_name(patches, "patches");
1093
+ ggml_set_input(patches);
1064
1094
 
1065
- if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL) {
1066
- GGML_ASSERT(image_size_width % (patch_size * 2) == 0);
1067
- GGML_ASSERT(image_size_height % (patch_size * 2) == 0);
1095
+ // shape [1, 576, 1024]
1096
+ // ne is whcn, ne = [1024, 576, 1, 1]
1097
+ embeddings = ggml_get_rows(ctx0, embeddings, patches);
1068
1098
 
1069
- auto inp_1 = ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
1070
- inp = ggml_add(ctx0, inp, inp_1);
1071
- inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 2, 0, 3)); // [w, h, c, b] -> [c, w, h, b]
1072
- inp = ggml_reshape_4d(
1073
- ctx0, inp,
1074
- hidden_size * 2, patches_w / 2, patches_h, batch_size);
1075
- inp = ggml_reshape_4d(
1076
- ctx0, inp,
1077
- hidden_size * 2, patches_w / 2, 2, batch_size * (patches_h / 2));
1078
- inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 0, 2, 1, 3));
1079
- inp = ggml_reshape_3d(
1080
- ctx0, inp,
1081
- hidden_size, patches_w * patches_h, batch_size);
1082
- }
1083
- else {
1084
- inp = ggml_reshape_3d(ctx0, inp, num_patches, hidden_size, batch_size);
1085
- inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 0, 2, 3));
1086
- }
1099
+ // print_tensor_info(embeddings, "embeddings");
1087
1100
 
1088
- if (model.patch_bias) {
1089
- // inp = ggml_add(ctx0, inp, ggml_repeat(ctx0, model.patch_bias, inp));
1090
- inp = ggml_add(ctx0, inp, model.patch_bias);
1091
- }
1092
- struct ggml_tensor * embeddings = inp;
1093
- struct ggml_tensor * pos_embed = nullptr;
1101
+ // llava projector
1102
+ if (ctx->proj_type == PROJECTOR_TYPE_MLP) {
1103
+ embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
1104
+ embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);
1094
1105
 
1095
- // concat class_embeddings and patch_embeddings
1096
- if (model.class_embedding) {
1097
- embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size);
1098
- embeddings = ggml_scale(ctx0, embeddings, 0.0f); // set to all zeros
1099
- embeddings = ggml_acc(ctx0, embeddings, model.class_embedding,
1100
- embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0);
1101
- embeddings = ggml_acc(ctx0, embeddings, inp,
1102
- embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]);
1103
- }
1106
+ embeddings = ggml_gelu(ctx0, embeddings);
1107
+ if (model.mm_2_w) {
1108
+ embeddings = ggml_mul_mat(ctx0, model.mm_2_w, embeddings);
1109
+ embeddings = ggml_add(ctx0, embeddings, model.mm_2_b);
1110
+ }
1111
+ }
1112
+ else if (ctx->proj_type == PROJECTOR_TYPE_MLP_NORM) {
1113
+ embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
1114
+ embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);
1115
+ // ggml_tensor_printf(embeddings, "mm_0_w",0,true,false);
1116
+ // First LayerNorm
1117
+ embeddings = ggml_norm(ctx0, embeddings, eps);
1118
+ embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_1_w),
1119
+ model.mm_1_b);
1120
+
1121
+ // GELU activation
1122
+ embeddings = ggml_gelu(ctx0, embeddings);
1123
+
1124
+ // Second linear layer
1125
+ embeddings = ggml_mul_mat(ctx0, model.mm_3_w, embeddings);
1126
+ embeddings = ggml_add(ctx0, embeddings, model.mm_3_b);
1127
+
1128
+ // Second LayerNorm
1129
+ embeddings = ggml_norm(ctx0, embeddings, eps);
1130
+ embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_4_w),
1131
+ model.mm_4_b);
1132
+ }
1133
+ else if (ctx->proj_type == PROJECTOR_TYPE_LDP) {
1134
+ // MobileVLM projector
1135
+ int n_patch = 24;
1136
+ ggml_tensor * mlp_1 = ggml_mul_mat(ctx0, model.mm_model_mlp_1_w, embeddings);
1137
+ mlp_1 = ggml_add(ctx0, mlp_1, model.mm_model_mlp_1_b);
1138
+ mlp_1 = ggml_gelu(ctx0, mlp_1);
1139
+ ggml_tensor * mlp_3 = ggml_mul_mat(ctx0, model.mm_model_mlp_3_w, mlp_1);
1140
+ mlp_3 = ggml_add(ctx0, mlp_3, model.mm_model_mlp_3_b);
1141
+ // mlp_3 shape = [1, 576, 2048], ne = [2048, 576, 1, 1]
1142
+
1143
+ // block 1
1144
+ ggml_tensor * block_1 = nullptr;
1145
+ {
1146
+ // transpose from [1, 576, 2048] --> [1, 2048, 576] --> [1, 2048, 24, 24]
1147
+ mlp_3 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_3, 1, 0, 2, 3));
1148
+ mlp_3 = ggml_reshape_4d(ctx0, mlp_3, n_patch, n_patch, mlp_3->ne[1], mlp_3->ne[2]);
1149
+ // stride = 1, padding = 1, bias is nullptr
1150
+ block_1 = ggml_conv_2d_dw(ctx0, model.mm_model_block_1_block_0_0_w, mlp_3, 1, 1, 1, 1, 1, 1);
1151
+
1152
+ // layer norm
1153
+ // // block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1]
1154
+ block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 2, 0, 3));
1155
+ // block_1 shape = [1, 24, 24, 2048], ne = [2048, 24, 24, 1]
1156
+ block_1 = ggml_norm(ctx0, block_1, eps);
1157
+ block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_1_block_0_1_w), model.mm_model_block_1_block_0_1_b);
1158
+ block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3));
1159
+
1160
+ // block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1]
1161
+ // hardswish
1162
+ ggml_tensor * block_1_hw = ggml_hardswish(ctx0, block_1);
1163
+
1164
+ block_1 = ggml_pool_2d(ctx0, block_1_hw, GGML_OP_POOL_AVG, block_1_hw->ne[0], block_1_hw->ne[1], block_1_hw->ne[0], block_1_hw->ne[1], 0, 0);
1165
+ // block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1]
1166
+ // pointwise conv
1167
+ block_1 = ggml_reshape_2d(ctx0, block_1, block_1->ne[0]*block_1->ne[1]*block_1->ne[2], block_1->ne[3]);
1168
+ block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_1_fc1_w, block_1);
1169
+ block_1 = ggml_add(ctx0, block_1, model.mm_model_block_1_block_1_fc1_b);
1170
+ block_1 = ggml_relu(ctx0, block_1);
1171
+ block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_1_fc2_w, block_1);
1172
+ block_1 = ggml_add(ctx0, block_1, model.mm_model_block_1_block_1_fc2_b);
1173
+ block_1 = ggml_hardsigmoid(ctx0, block_1);
1174
+ // block_1_hw shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1], block_1 shape = [1, 2048], ne = [2048, 1, 1, 1]
1175
+ block_1 = ggml_reshape_4d(ctx0, block_1, 1, 1, block_1->ne[0], block_1->ne[1]);
1176
+ block_1 = ggml_mul(ctx0, block_1_hw, block_1);
1177
+
1178
+ int w = block_1->ne[0], h = block_1->ne[1];
1179
+ block_1 = ggml_reshape_3d(ctx0, block_1, w*h, block_1->ne[2], block_1->ne[3]);
1180
+ block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 0, 2, 3));
1181
+
1182
+ // block_1 shape = [1, 24*24, 2048], ne = [24*24, 2048, 1]
1183
+ block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_2_0_w, block_1);
1184
+ block_1 = ggml_reshape_4d(ctx0, block_1, block_1->ne[0], w, h, block_1->ne[3]);
1185
+
1186
+ // block_1 shape = [1, 24, 24, 2048], ne = [2048, 24, 24, 1]
1187
+ block_1 = ggml_norm(ctx0, block_1, eps);
1188
+ block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_1_block_2_1_w), model.mm_model_block_1_block_2_1_b);
1189
+ block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3));
1190
+ // block1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1]
1191
+ // residual
1192
+ block_1 = ggml_add(ctx0, mlp_3, block_1);
1193
+ }
1104
1194
 
1105
- struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_position_ids);
1106
- ggml_set_name(positions, "positions");
1107
- ggml_set_input(positions);
1195
+ // block_2
1196
+ {
1197
+ // stride = 2
1198
+ block_1 = ggml_conv_2d_dw(ctx0, model.mm_model_block_2_block_0_0_w, block_1, 2, 2, 1, 1, 1, 1);
1199
+
1200
+ // block_1 shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1]
1201
+ // layer norm
1202
+ block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 2, 0, 3));
1203
+ // block_1 shape = [1, 12, 12, 2048], ne = [2048, 12, 12, 1]
1204
+ block_1 = ggml_norm(ctx0, block_1, eps);
1205
+ block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_2_block_0_1_w), model.mm_model_block_2_block_0_1_b);
1206
+ block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3));
1207
+ // block_1 shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1]
1208
+ // hardswish
1209
+ ggml_tensor * block_1_hw = ggml_hardswish(ctx0, block_1);
1210
+
1211
+ // not sure the parameters are right for globalAvgPooling
1212
+ block_1 = ggml_pool_2d(ctx0, block_1_hw, GGML_OP_POOL_AVG, block_1_hw->ne[0], block_1_hw->ne[1], block_1_hw->ne[0], block_1_hw->ne[1], 0, 0);
1213
+ // block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1]
1214
+ // pointwise conv
1215
+ block_1 = ggml_reshape_2d(ctx0, block_1, block_1->ne[0]*block_1->ne[1]*block_1->ne[2], block_1->ne[3]);
1216
+ block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_1_fc1_w, block_1);
1217
+ block_1 = ggml_add(ctx0, block_1, model.mm_model_block_2_block_1_fc1_b);
1218
+ block_1 = ggml_relu(ctx0, block_1);
1219
+ block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_1_fc2_w, block_1);
1220
+ block_1 = ggml_add(ctx0, block_1, model.mm_model_block_2_block_1_fc2_b);
1221
+ block_1 = ggml_hardsigmoid(ctx0, block_1);
1222
+
1223
+ // block_1_hw shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1], block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1]
1224
+ block_1 = ggml_reshape_4d(ctx0, block_1, 1, 1, block_1->ne[0], block_1->ne[1]);
1225
+ block_1 = ggml_mul(ctx0, block_1_hw, block_1);
1226
+
1227
+ int w = block_1->ne[0], h = block_1->ne[1];
1228
+ block_1 = ggml_reshape_3d(ctx0, block_1, w*h, block_1->ne[2], block_1->ne[3]);
1229
+ block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 0, 2, 3));
1230
+ // block_1 shape = [1, 24*24, 2048], ne = [24*24, 2048, 1]
1231
+ block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_2_0_w, block_1);
1232
+ block_1 = ggml_reshape_4d(ctx0, block_1, block_1->ne[0], w, h, block_1->ne[3]);
1233
+
1234
+
1235
+ // block_1 shape = [1, 12, 12, 2048], ne = [2048, 12, 12, 1]
1236
+ block_1 = ggml_norm(ctx0, block_1, eps);
1237
+ block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_2_block_2_1_w), model.mm_model_block_2_block_2_1_b);
1238
+ block_1 = ggml_reshape_3d(ctx0, block_1, block_1->ne[0], block_1->ne[1] * block_1->ne[2], block_1->ne[3]);
1239
+ // block_1 shape = [1, 144, 2048], ne = [2048, 144, 1]
1240
+ }
1241
+ embeddings = block_1;
1242
+ }
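
The LDP squeeze-and-excitation path above relies on ggml_hardswish and ggml_hardsigmoid; assuming the usual MobileNetV3 definitions, these activations are:

#include <algorithm>
#include <cstdio>

// hardsigmoid(x) = clamp(x/6 + 1/2, 0, 1)
// hardswish(x)   = x * hardsigmoid(x)
static float hardsigmoid(float x) { return std::min(1.0f, std::max(0.0f, x / 6.0f + 0.5f)); }
static float hardswish(float x)   { return x * hardsigmoid(x); }

int main() {
    for (float x : {-4.0f, -1.0f, 0.0f, 1.0f, 4.0f}) {
        printf("x=%5.1f  hardsigmoid=%.3f  hardswish=%.3f\n", x, hardsigmoid(x), hardswish(x));
    }
    return 0;
}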
1243
+ else if (ctx->proj_type == PROJECTOR_TYPE_LDPV2)
1244
+ {
1245
+ int n_patch = 24;
1246
+ ggml_tensor * mlp_0 = ggml_mul_mat(ctx0, model.mm_model_mlp_0_w, embeddings);
1247
+ mlp_0 = ggml_add(ctx0, mlp_0, model.mm_model_mlp_0_b);
1248
+ mlp_0 = ggml_gelu(ctx0, mlp_0);
1249
+ ggml_tensor * mlp_2 = ggml_mul_mat(ctx0, model.mm_model_mlp_2_w, mlp_0);
1250
+ mlp_2 = ggml_add(ctx0, mlp_2, model.mm_model_mlp_2_b);
1251
+ // mlp_2 ne = [2048, 576, 1, 1]
1252
+ // // AVG Pool Layer 2*2, strides = 2
1253
+ mlp_2 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_2, 1, 0, 2, 3));
1254
+ // mlp_2 ne = [576, 2048, 1, 1]
1255
+ mlp_2 = ggml_reshape_4d(ctx0, mlp_2, n_patch, n_patch, mlp_2->ne[1], mlp_2->ne[2]);
1256
+ // mlp_2 ne [24, 24, 2048, 1]
1257
+ mlp_2 = ggml_pool_2d(ctx0, mlp_2, GGML_OP_POOL_AVG, 2, 2, 2, 2, 0, 0);
1258
+ // weight ne = [3, 3, 2048, 1]
1259
+ ggml_tensor * peg_0 = ggml_conv_2d_dw(ctx0, model.mm_model_peg_0_w, mlp_2, 1, 1, 1, 1, 1, 1);
1260
+ peg_0 = ggml_cont(ctx0, ggml_permute(ctx0, peg_0, 1, 2, 0, 3));
1261
+ peg_0 = ggml_add(ctx0, peg_0, model.mm_model_peg_0_b);
1262
+ mlp_2 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_2, 1, 2, 0, 3));
1263
+ peg_0 = ggml_add(ctx0, peg_0, mlp_2);
1264
+ peg_0 = ggml_reshape_3d(ctx0, peg_0, peg_0->ne[0], peg_0->ne[1] * peg_0->ne[2], peg_0->ne[3]);
1265
+ embeddings = peg_0;
1266
+ }
1267
+ else {
1268
+ GGML_ABORT("fatal error");
1269
+ }
1270
+ }
1108
1271
 
1109
- if (ctx->proj_type != PROJECTOR_TYPE_QWEN2VL) { // qwen2vl does NOT use learned position embeddings
1110
- embeddings =
1111
- ggml_add(ctx0, embeddings, ggml_get_rows(ctx0, model.position_embeddings, positions));
1112
- }
1272
+ // glm projector
1273
+ else if (ctx->proj_type == PROJECTOR_TYPE_GLM_EDGE) {
1274
+ size_t gridsz = (size_t)sqrt(embeddings->ne[1]);
1275
+ embeddings = ggml_cont(ctx0, ggml_permute(ctx0,embeddings,1,0,2,3));
1276
+ embeddings = ggml_reshape_3d(ctx0, embeddings, gridsz, gridsz, embeddings->ne[1]);
1277
+ embeddings = ggml_conv_2d(ctx0, model.mm_model_adapter_conv_w, embeddings, 2, 2, 0, 0, 1, 1);
1278
+ embeddings = ggml_reshape_3d(ctx0, embeddings,embeddings->ne[0]*embeddings->ne[1] , embeddings->ne[2], batch_size);
1279
+ embeddings = ggml_cont(ctx0, ggml_permute(ctx0,embeddings, 1, 0, 2, 3));
1280
+ embeddings = ggml_add(ctx0, embeddings, model.mm_model_adapter_conv_b);
1281
+ // GLU
1282
+ {
1283
+ embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_0_w, embeddings);
1284
+ embeddings = ggml_norm(ctx0, embeddings, eps);
1285
+ embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_model_ln_q_w), model.mm_model_ln_q_b);
1286
+ embeddings = ggml_gelu_inplace(ctx0, embeddings);
1287
+ ggml_tensor * x = embeddings;
1288
+ embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_2_w, embeddings);
1289
+ x = ggml_mul_mat(ctx0, model.mm_model_mlp_1_w,x);
1290
+ embeddings = ggml_silu_inplace(ctx0, embeddings);
1291
+ embeddings = ggml_mul(ctx0, embeddings,x);
1292
+ embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_3_w, embeddings);
1293
+ }
1294
+ // arrangement of BOI/EOI token embeddings
1295
+ // note: these embeddings are not present in the text model, hence we cannot process them as text tokens
1296
+ // see: https://huggingface.co/THUDM/glm-edge-v-2b/blob/main/siglip.py#L53
1297
+ {
1298
+ embeddings = ggml_concat(ctx0, model.mm_glm_tok_boi, embeddings, 1); // BOI
1299
+ embeddings = ggml_concat(ctx0, embeddings, model.mm_glm_tok_eoi, 1); // EOI
1300
+ }
1301
+ }
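
The GLU block above gates one linear branch of the hidden state with a SiLU-activated second branch before the final projection. A toy element-wise version of that gating step (the weights are made-up stand-ins, not the mm_model_mlp_* tensors):

#include <cmath>
#include <cstdio>
#include <vector>

static float silu(float x) { return x / (1.0f + std::exp(-x)); }

int main() {
    const std::vector<float> h      = {0.5f, -1.0f, 2.0f};   // hidden vector (toy)
    const std::vector<float> w_gate = {1.0f, 0.5f, -0.5f};   // toy per-element weights
    const std::vector<float> w_up   = {2.0f, 1.0f,  1.0f};

    std::vector<float> out(h.size());
    for (size_t i = 0; i < h.size(); ++i) {
        const float gate = silu(w_gate[i] * h[i]);   // SiLU branch
        const float up   = w_up[i] * h[i];           // plain linear branch
        out[i] = gate * up;                          // element-wise gating
    }
    for (float v : out) printf("%.4f ", v);
    printf("\n");
    return 0;
}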
1113
1302
 
1114
- if (ctx->proj_type == PROJECTOR_TYPE_MINICPMV) {
1115
- int pos_w = image_size_width/patch_size;
1116
- int pos_h = image_size_height/patch_size;
1117
- int n_output_dim = clip_n_mmproj_embd(ctx);
1118
- pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_output_dim, pos_w * pos_h, 1);
1119
- ggml_set_name(pos_embed, "pos_embed");
1120
- ggml_set_input(pos_embed);
1121
- }
1303
+ else {
1304
+ GGML_ABORT("llava: unknown projector type");
1305
+ }
1122
1306
 
1123
- // pre-layernorm
1124
- if (model.pre_ln_w) {
1125
- embeddings = ggml_norm(ctx0, embeddings, eps);
1126
- ggml_set_name(embeddings, "pre_ln");
1307
+ // build the graph
1308
+ ggml_build_forward_expand(gf, embeddings);
1127
1309
 
1128
- embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.pre_ln_w), model.pre_ln_b);
1310
+ return gf;
1129
1311
  }
1130
1312
 
1131
- std::vector<struct ggml_tensor *> embedding_stack;
1132
- const auto & vision_feature_layer = hparams.vision_feature_layer;
1313
+ private:
1314
+ //
1315
+ // utility functions
1316
+ //
1317
+
1318
+ void cb(ggml_tensor * cur, const char * name, int il) const {
1319
+ // TODO: implement this
1320
+ GGML_UNUSED(cur);
1321
+ GGML_UNUSED(name);
1322
+ GGML_UNUSED(il);
1323
+ }
1324
+
1325
+ // build vision transformer (ViT) cgraph
1326
+ // this function should cover most of the models
1327
+ // if your model has specific features, you should probably duplicate this function
1328
+ ggml_tensor * build_vit(
1329
+ ggml_tensor * inp,
1330
+ int64_t n_pos,
1331
+ norm_type norm_t,
1332
+ ffn_op_type ffn_t,
1333
+ ggml_tensor * learned_pos_embd,
1334
+ std::function<ggml_tensor *(ggml_tensor *, const clip_layer &)> add_pos
1335
+ ) {
1336
+ if (learned_pos_embd) {
1337
+ inp = ggml_add(ctx0, inp, learned_pos_embd);
1338
+ cb(inp, "pos_embed", -1);
1339
+ }
1340
+
1341
+ ggml_tensor * inpL = inp;
1342
+
1343
+ // pre-layernorm
1344
+ if (model.pre_ln_w) {
1345
+ inpL = build_norm(inpL, model.pre_ln_w, model.pre_ln_b, norm_t, eps, -1);
1346
+ cb(inpL, "pre_ln", -1);
1347
+ }
1348
+
1349
+ // loop over layers
1350
+ for (int il = 0; il < n_layer; il++) {
1351
+ auto & layer = model.layers[il];
1352
+ ggml_tensor * cur = inpL; // inpL = residual, cur = hidden_states
1353
+
1354
+ // layernorm1
1355
+ cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, norm_t, eps, il);
1356
+ cb(cur, "layer_inp_normed", il);
1357
+
1358
+ // self-attention
1359
+ {
1360
+ ggml_tensor * Qcur = ggml_mul_mat(ctx0, layer.q_w, cur);
1361
+ if (layer.q_b) {
1362
+ Qcur = ggml_add(ctx0, Qcur, layer.q_b);
1363
+ }
1133
1364
 
1134
- // loop over layers
1135
- for (int il = 0; il < ctx->max_feature_layer; il++) {
1136
- struct ggml_tensor * cur = embeddings; // embeddings = residual, cur = hidden_states
1365
+ ggml_tensor * Kcur = ggml_mul_mat(ctx0, layer.k_w, cur);
1366
+ if (layer.k_b) {
1367
+ Kcur = ggml_add(ctx0, Kcur, layer.k_b);
1368
+ }
1137
1369
 
1138
- // If this is an embedding feature layer, save the output.
1139
- // NOTE: 0 index here refers to the input to the encoder.
1140
- if (vision_feature_layer.find(il) != vision_feature_layer.end()) {
1141
- embedding_stack.push_back(embeddings);
1142
- }
1370
+ ggml_tensor * Vcur = ggml_mul_mat(ctx0, layer.v_w, cur);
1371
+ if (layer.v_b) {
1372
+ Vcur = ggml_add(ctx0, Vcur, layer.v_b);
1373
+ }
1143
1374
 
1144
- //const size_t nb_q_w = model.layers[il].q_w->nb[0];
1375
+ if (layer.q_norm) {
1376
+ Qcur = build_norm(Qcur, layer.q_norm, NULL, norm_t, eps, il);
1377
+ cb(Qcur, "Qcur_norm", il);
1378
+ }
1145
1379
 
1146
- // layernorm1
1147
- {
1148
- cur = ggml_norm(ctx0, cur, eps);
1380
+ if (layer.k_norm) {
1381
+ Kcur = build_norm(Kcur, layer.k_norm, NULL, norm_t, eps, il);
1382
+ cb(Kcur, "Kcur_norm", il);
1383
+ }
1149
1384
 
1150
- cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].ln_1_w),
1151
- model.layers[il].ln_1_b);
1152
- }
1385
+ Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, n_pos);
1386
+ Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, n_pos);
1387
+ Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, n_pos);
1153
1388
 
1154
- // self-attention
1155
- {
1389
+ cb(Qcur, "Qcur", il);
1390
+ cb(Kcur, "Kcur", il);
1391
+ cb(Vcur, "Vcur", il);
1156
1392
 
1157
- struct ggml_tensor * Q =
1158
- ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].q_w, cur), model.layers[il].q_b);
1393
+ if (add_pos) {
1394
+ Qcur = add_pos(Qcur, layer);
1395
+ Kcur = add_pos(Kcur, layer);
1396
+ cb(Qcur, "Qcur_pos", il);
1397
+ cb(Kcur, "Kcur_pos", il);
1398
+ }
1159
1399
 
1160
- Q = ggml_reshape_4d(ctx0, Q, d_head, n_head, num_positions, batch_size);
1161
- if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL) {
1162
- Q = ggml_rope_multi(
1163
- ctx0, Q, positions, nullptr,
1164
- d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1);
1400
+ cur = build_attn(layer.o_w, layer.o_b,
1401
+ Qcur, Kcur, Vcur, nullptr, kq_scale, il);
1402
+ cb(cur, "attn_out", il);
1165
1403
  }
1166
- Q = ggml_cont(ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3));
1167
- Q = ggml_reshape_3d(ctx0, Q, d_head, num_positions, n_head * batch_size);
1168
-
1169
- struct ggml_tensor * K =
1170
- ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].k_w, cur), model.layers[il].k_b);
1171
1404
 
1172
- K = ggml_reshape_4d(ctx0, K, d_head, n_head, num_positions, batch_size);
1173
- if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL) {
1174
- K = ggml_rope_multi(
1175
- ctx0, K, positions, nullptr,
1176
- d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1);
1405
+ if (layer.ls_1_w) {
1406
+ cur = ggml_mul(ctx0, cur, layer.ls_1_w);
1407
+ cb(cur, "attn_out_scaled", il);
1177
1408
  }
1178
- K = ggml_cont(ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3));
1179
- K = ggml_reshape_3d(ctx0, K, d_head, num_positions, n_head * batch_size);
1180
1409
 
1181
- struct ggml_tensor * V =
1182
- ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].v_w, cur), model.layers[il].v_b);
1410
+ // re-add the layer input, e.g., residual
1411
+ cur = ggml_add(ctx0, cur, inpL);
1183
1412
 
1184
- V = ggml_reshape_4d(ctx0, V, d_head, n_head, num_positions, batch_size);
1185
- V = ggml_cont(ctx0, ggml_permute(ctx0, V, 1, 2, 0, 3));
1186
- V = ggml_reshape_3d(ctx0, V, num_positions, d_head, n_head * batch_size);
1413
+ inpL = cur; // inpL = residual, cur = hidden_states
1187
1414
 
1188
- struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
1189
- KQ = ggml_soft_max_ext(ctx0, KQ, nullptr, 1.0f / sqrtf((float)d_head), 0.0f);
1190
- struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ);
1191
- KQV = ggml_reshape_4d(ctx0, KQV, d_head, num_positions, n_head, batch_size);
1192
- KQV = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
1415
+ cb(cur, "ffn_inp", il);
1193
1416
 
1194
- cur = ggml_cont_3d(ctx0, KQV, hidden_size, num_positions, batch_size);
1195
- }
1417
+ // layernorm2
1418
+ cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, norm_t, eps, il);
1419
+ cb(cur, "ffn_inp_normed", il);
1196
1420
 
1197
- // attention output
1198
- cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].o_w, cur), model.layers[il].o_b);
1421
+ // ffn
1422
+ cur = build_ffn(cur,
1423
+ layer.ff_up_w, layer.ff_up_b,
1424
+ layer.ff_gate_w, layer.ff_gate_b,
1425
+ layer.ff_down_w, layer.ff_down_b,
1426
+ ffn_t, il);
1199
1427
 
1200
- // re-add the layer input, e.g., residual
1201
- cur = ggml_add(ctx0, cur, embeddings);
1428
+ cb(cur, "ffn_out", il);
1202
1429
 
1203
- embeddings = cur; // embeddings = residual, cur = hidden_states
1430
+ if (layer.ls_2_w) {
1431
+ cur = ggml_mul(ctx0, cur, layer.ls_2_w);
1432
+ cb(cur, "ffn_out_scaled", il);
1433
+ }
1204
1434
 
1205
- // layernorm2
1206
- {
1207
- cur = ggml_norm(ctx0, cur, eps);
1435
+ // residual 2
1436
+ cur = ggml_add(ctx0, inpL, cur);
1437
+ cb(cur, "layer_out", il);
1208
1438
 
1209
- cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].ln_2_w), model.layers[il].ln_2_b);
1439
+ inpL = cur;
1210
1440
  }
1211
1441
 
1212
- cur = ggml_mul_mat(ctx0, model.layers[il].ff_i_w, cur);
1213
- cur = ggml_add(ctx0, cur, model.layers[il].ff_i_b);
1214
-
1215
- if (ctx->use_gelu) {
1216
- cur = ggml_gelu_inplace(ctx0, cur);
1217
- } else if (ctx->use_silu) {
1218
- cur = ggml_silu_inplace(ctx0, cur);
1219
- } else {
1220
- cur = ggml_gelu_quick_inplace(ctx0, cur);
1442
+ // post-layernorm
1443
+ if (model.post_ln_w) {
1444
+ inpL = build_norm(inpL, model.post_ln_w, model.post_ln_b, norm_t, eps, -1);
1221
1445
  }
1446
+ return inpL;
1447
+ }
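
The optional ls_1_w / ls_2_w multiplications in build_vit above are a layer-scale: the attention or FFN branch is multiplied by a small learned per-channel vector before being added back to the residual stream. A sketch with toy values:

#include <cstdio>
#include <vector>

int main() {
    std::vector<float> residual = {1.0f, 1.0f, 1.0f};
    std::vector<float> branch   = {4.0f, -2.0f, 8.0f};   // attention or FFN output (toy)
    std::vector<float> ls       = {0.1f, 0.1f, 0.1f};    // learned scale (toy)

    for (size_t i = 0; i < residual.size(); ++i) {
        residual[i] += ls[i] * branch[i];                // scaled residual update
    }
    printf("out = [%.2f, %.2f, %.2f]\n", residual[0], residual[1], residual[2]);
    return 0;
}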
1222
1448
 
1223
- cur = ggml_mul_mat(ctx0, model.layers[il].ff_o_w, cur);
1224
- cur = ggml_add(ctx0, cur, model.layers[il].ff_o_b);
1225
-
1226
- // residual 2
1227
- cur = ggml_add(ctx0, embeddings, cur);
1449
+ // build the input after conv2d (inp_raw --> patches)
1450
+ // returns tensor with shape [n_embd, n_patches]
1451
+ ggml_tensor * build_inp() {
1452
+ ggml_tensor * inp_raw = build_inp_raw();
1453
+ ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
1454
+ inp = ggml_reshape_2d(ctx0, inp, n_patches, n_embd);
1455
+ inp = ggml_cont(ctx0, ggml_transpose(ctx0, inp));
1456
+ if (model.patch_bias) {
1457
+ inp = ggml_add(ctx0, inp, model.patch_bias);
1458
+ cb(inp, "patch_bias", -1);
1459
+ }
1460
+ return inp;
1461
+ }
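
build_inp above turns the raw image into one embedding per patch via a conv2d whose kernel and stride both equal patch_size, then reshapes the result to an [n_embd, n_patches] matrix. A quick shape check using typical CLIP-ViT-L/336 numbers (illustrative only):

#include <cstdio>

int main() {
    const int image_size = 336;
    const int patch_size = 14;
    const int n_embd     = 1024;

    const int n_patches_x = image_size / patch_size;
    const int n_patches_y = image_size / patch_size;
    const int n_patches   = n_patches_x * n_patches_y;

    printf("patch grid: %d x %d = %d patches, embedding matrix: [%d, %d]\n",
           n_patches_x, n_patches_y, n_patches, n_embd, n_patches);
    return 0;
}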
1228
1462
 
1229
- embeddings = cur;
1463
+ ggml_tensor * build_inp_raw() {
1464
+ ggml_tensor * inp_raw = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, img.nx, img.ny, 3);
1465
+ ggml_set_name(inp_raw, "inp_raw");
1466
+ ggml_set_input(inp_raw);
1467
+ return inp_raw;
1230
1468
  }
1231
1469
 
1232
- // post-layernorm
1233
- if (model.post_ln_w) {
1234
- embeddings = ggml_norm(ctx0, embeddings, eps);
1235
- ggml_set_name(embeddings, "post_ln");
1470
+ ggml_tensor * build_norm(
1471
+ ggml_tensor * cur,
1472
+ ggml_tensor * mw,
1473
+ ggml_tensor * mb,
1474
+ norm_type type,
1475
+ float norm_eps,
1476
+ int il) const {
1236
1477
 
1237
- embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.post_ln_w), model.post_ln_b);
1238
- }
1478
+ cur = type == NORM_TYPE_RMS
1479
+ ? ggml_rms_norm(ctx0, cur, norm_eps)
1480
+ : ggml_norm(ctx0, cur, norm_eps);
1239
1481
 
1240
- // final layer is a vision feature layer
1241
- if (vision_feature_layer.find(ctx->max_feature_layer) != vision_feature_layer.end()) {
1242
- embedding_stack.push_back(embeddings);
1243
- }
1482
+ if (mw || mb) {
1483
+ cb(cur, "norm", il);
1484
+ }
1244
1485
 
1245
- // If feature layers are explicitly set, stack them (if we have multiple)
1246
- if (!embedding_stack.empty()) {
1247
- embeddings = embedding_stack[0];
1248
- for (size_t i = 1; i < embedding_stack.size(); i++) {
1249
- embeddings = ggml_concat(ctx0, embeddings, embedding_stack[i], 0);
1486
+ if (mw) {
1487
+ cur = ggml_mul(ctx0, cur, mw);
1488
+ if (mb) {
1489
+ cb(cur, "norm_w", il);
1490
+ }
1250
1491
  }
1251
- }
1252
1492
 
1253
- // llava projector
1254
- if (ctx->has_llava_projector) {
1255
- embeddings = ggml_reshape_2d(ctx0, embeddings, embeddings->ne[0], embeddings->ne[1]);
1493
+ if (mb) {
1494
+ cur = ggml_add(ctx0, cur, mb);
1495
+ }
1496
+
1497
+ return cur;
1498
+ }
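
build_norm above switches between classic layer norm and RMS norm depending on norm_type; the weight/bias application is handled separately. Reference formulas as a small standalone program:

#include <cmath>
#include <cstdio>
#include <vector>

static std::vector<float> layer_norm(std::vector<float> x, float eps) {
    float mean = 0.0f;
    for (float v : x) mean += v;
    mean /= x.size();
    float var = 0.0f;
    for (float v : x) var += (v - mean) * (v - mean);
    var /= x.size();
    for (float & v : x) v = (v - mean) / std::sqrt(var + eps);   // subtract mean, divide by std
    return x;
}

static std::vector<float> rms_norm(std::vector<float> x, float eps) {
    float ms = 0.0f;
    for (float v : x) ms += v * v;
    ms /= x.size();
    for (float & v : x) v = v / std::sqrt(ms + eps);             // divide by root-mean-square only
    return x;
}

int main() {
    std::vector<float> x = {1.0f, 2.0f, 3.0f, 4.0f};
    std::vector<float> a = layer_norm(x, 1e-6f);
    std::vector<float> b = rms_norm(x, 1e-6f);
    printf("layer_norm[0]=%.4f  rms_norm[0]=%.4f\n", a[0], b[0]);
    return 0;
}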
1256
1499
 
1257
- struct ggml_tensor * patches = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_patches);
1258
- ggml_set_name(patches, "patches");
1259
- ggml_set_input(patches);
1500
+ ggml_tensor * build_ffn(
1501
+ ggml_tensor * cur,
1502
+ ggml_tensor * up,
1503
+ ggml_tensor * up_b,
1504
+ ggml_tensor * gate,
1505
+ ggml_tensor * gate_b,
1506
+ ggml_tensor * down,
1507
+ ggml_tensor * down_b,
1508
+ ffn_op_type type_op,
1509
+ int il) const {
1260
1510
 
1261
- // shape [1, 576, 1024]
1262
- // ne is whcn, ne = [1024, 576, 1, 1]
1263
- embeddings = ggml_get_rows(ctx0, embeddings, patches);
1511
+ ggml_tensor * tmp = up ? ggml_mul_mat(ctx0, up, cur) : cur;
1512
+ cb(tmp, "ffn_up", il);
1264
1513
 
1265
- // print_tensor_info(embeddings, "embeddings");
1514
+ if (up_b) {
1515
+ tmp = ggml_add(ctx0, tmp, up_b);
1516
+ cb(tmp, "ffn_up_b", il);
1517
+ }
1266
1518
 
1267
- // llava projector
1268
- if (ctx->proj_type == PROJECTOR_TYPE_MLP) {
1269
- embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
1270
- embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);
1519
+ if (gate) {
1520
+ cur = ggml_mul_mat(ctx0, gate, cur);
1521
+ cb(cur, "ffn_gate", il);
1271
1522
 
1272
- embeddings = ggml_gelu(ctx0, embeddings);
1273
- if (model.mm_2_w) {
1274
- embeddings = ggml_mul_mat(ctx0, model.mm_2_w, embeddings);
1275
- embeddings = ggml_add(ctx0, embeddings, model.mm_2_b);
1523
+ if (gate_b) {
1524
+ cur = ggml_add(ctx0, cur, gate_b);
1525
+ cb(cur, "ffn_gate_b", il);
1276
1526
  }
1527
+ } else {
1528
+ cur = tmp;
1277
1529
  }
1278
- else if (ctx->proj_type == PROJECTOR_TYPE_MLP_NORM) {
1279
- embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
1280
- embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);
1281
- // ggml_tensor_printf(embeddings, "mm_0_w",0,true,false);
1282
- // First LayerNorm
1283
- embeddings = ggml_norm(ctx0, embeddings, eps);
1284
- embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_1_w),
1285
- model.mm_1_b);
1286
-
1287
- // GELU activation
1288
- embeddings = ggml_gelu(ctx0, embeddings);
1289
-
1290
- // Second linear layer
1291
- embeddings = ggml_mul_mat(ctx0, model.mm_3_w, embeddings);
1292
- embeddings = ggml_add(ctx0, embeddings, model.mm_3_b);
1293
-
1294
- // Second LayerNorm
1295
- embeddings = ggml_norm(ctx0, embeddings, eps);
1296
- embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_4_w),
1297
- model.mm_4_b);
1298
- }
1299
- else if (ctx->proj_type == PROJECTOR_TYPE_LDP) {
1300
- // MobileVLM projector
1301
- int n_patch = 24;
1302
- struct ggml_tensor * mlp_1 = ggml_mul_mat(ctx0, model.mm_model_mlp_1_w, embeddings);
1303
- mlp_1 = ggml_add(ctx0, mlp_1, model.mm_model_mlp_1_b);
1304
- mlp_1 = ggml_gelu(ctx0, mlp_1);
1305
- struct ggml_tensor * mlp_3 = ggml_mul_mat(ctx0, model.mm_model_mlp_3_w, mlp_1);
1306
- mlp_3 = ggml_add(ctx0, mlp_3, model.mm_model_mlp_3_b);
1307
- // mlp_3 shape = [1, 576, 2048], ne = [2048, 576, 1, 1]
1308
-
1309
- // block 1
1310
- struct ggml_tensor * block_1 = nullptr;
1311
- {
1312
- // transpose from [1, 576, 2048] --> [1, 2048, 576] --> [1, 2048, 24, 24]
1313
- mlp_3 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_3, 1, 0, 2, 3));
1314
- mlp_3 = ggml_reshape_4d(ctx0, mlp_3, n_patch, n_patch, mlp_3->ne[1], mlp_3->ne[2]);
1315
- // stride = 1, padding = 1, bias is nullptr
1316
- block_1 = ggml_conv_2d_dw(ctx0, model.mm_model_block_1_block_0_0_w, mlp_3, 1, 1, 1, 1, 1, 1);
1317
-
1318
- // layer norm
1319
- // // block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1]
1320
- block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 2, 0, 3));
1321
- // block_1 shape = [1, 24, 24, 2048], ne = [2048, 24, 24, 1]
1322
- block_1 = ggml_norm(ctx0, block_1, eps);
1323
- block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_1_block_0_1_w), model.mm_model_block_1_block_0_1_b);
1324
- block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3));
1325
-
1326
- // block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1]
1327
- // hardswish
1328
- struct ggml_tensor * block_1_hw = ggml_hardswish(ctx0, block_1);
1329
-
1330
- block_1 = ggml_pool_2d(ctx0, block_1_hw, GGML_OP_POOL_AVG, block_1_hw->ne[0], block_1_hw->ne[1], block_1_hw->ne[0], block_1_hw->ne[1], 0, 0);
1331
- // block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1]
1332
- // pointwise conv
1333
- block_1 = ggml_reshape_2d(ctx0, block_1, block_1->ne[0]*block_1->ne[1]*block_1->ne[2], block_1->ne[3]);
1334
- block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_1_fc1_w, block_1);
1335
- block_1 = ggml_add(ctx0, block_1, model.mm_model_block_1_block_1_fc1_b);
1336
- block_1 = ggml_relu(ctx0, block_1);
1337
- block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_1_fc2_w, block_1);
1338
- block_1 = ggml_add(ctx0, block_1, model.mm_model_block_1_block_1_fc2_b);
1339
- block_1 = ggml_hardsigmoid(ctx0, block_1);
1340
- // block_1_hw shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1], block_1 shape = [1, 2048], ne = [2048, 1, 1, 1]
1341
- block_1 = ggml_reshape_4d(ctx0, block_1, 1, 1, block_1->ne[0], block_1->ne[1]);
1342
- block_1 = ggml_mul(ctx0, block_1_hw, block_1);
1343
-
1344
- int w = block_1->ne[0], h = block_1->ne[1];
1345
- block_1 = ggml_reshape_3d(ctx0, block_1, w*h, block_1->ne[2], block_1->ne[3]);
1346
- block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 0, 2, 3));
1347
-
1348
- // block_1 shape = [1, 24*24, 2048], ne = [24*24, 2048, 1]
1349
- block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_2_0_w, block_1);
1350
- block_1 = ggml_reshape_4d(ctx0, block_1, block_1->ne[0], w, h, block_1->ne[3]);
1351
-
1352
- // block_1 shape = [1, 24, 24, 2048], ne = [2048, 24, 24, 1]
1353
- block_1 = ggml_norm(ctx0, block_1, eps);
1354
- block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_1_block_2_1_w), model.mm_model_block_1_block_2_1_b);
1355
- block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3));
1356
- // block1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1]
1357
- // residual
1358
- block_1 = ggml_add(ctx0, mlp_3, block_1);
1359
- }
1360
1530
 
1361
- // block_2
1362
- {
1363
- // stride = 2
1364
- block_1 = ggml_conv_2d_dw(ctx0, model.mm_model_block_2_block_0_0_w, block_1, 2, 2, 1, 1, 1, 1);
1365
-
1366
- // block_1 shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1]
1367
- // layer norm
1368
- block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 2, 0, 3));
1369
- // block_1 shape = [1, 12, 12, 2048], ne = [2048, 12, 12, 1]
1370
- block_1 = ggml_norm(ctx0, block_1, eps);
1371
- block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_2_block_0_1_w), model.mm_model_block_2_block_0_1_b);
1372
- block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3));
1373
- // block_1 shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1]
1374
- // hardswish
1375
- struct ggml_tensor * block_1_hw = ggml_hardswish(ctx0, block_1);
1376
-
1377
- // not sure the parameters is right for globalAvgPooling
1378
- block_1 = ggml_pool_2d(ctx0, block_1_hw, GGML_OP_POOL_AVG, block_1_hw->ne[0], block_1_hw->ne[1], block_1_hw->ne[0], block_1_hw->ne[1], 0, 0);
1379
- // block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1]
1380
- // pointwise conv
1381
- block_1 = ggml_reshape_2d(ctx0, block_1, block_1->ne[0]*block_1->ne[1]*block_1->ne[2], block_1->ne[3]);
1382
- block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_1_fc1_w, block_1);
1383
- block_1 = ggml_add(ctx0, block_1, model.mm_model_block_2_block_1_fc1_b);
1384
- block_1 = ggml_relu(ctx0, block_1);
1385
- block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_1_fc2_w, block_1);
1386
- block_1 = ggml_add(ctx0, block_1, model.mm_model_block_2_block_1_fc2_b);
1387
- block_1 = ggml_hardsigmoid(ctx0, block_1);
1388
-
1389
- // block_1_hw shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1], block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1]
1390
- block_1 = ggml_reshape_4d(ctx0, block_1, 1, 1, block_1->ne[0], block_1->ne[1]);
1391
- block_1 = ggml_mul(ctx0, block_1_hw, block_1);
1392
-
1393
- int w = block_1->ne[0], h = block_1->ne[1];
1394
- block_1 = ggml_reshape_3d(ctx0, block_1, w*h, block_1->ne[2], block_1->ne[3]);
1395
- block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 0, 2, 3));
1396
- // block_1 shape = [1, 24*24, 2048], ne = [24*24, 2048, 1]
1397
- block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_2_0_w, block_1);
1398
- block_1 = ggml_reshape_4d(ctx0, block_1, block_1->ne[0], w, h, block_1->ne[3]);
1399
-
1400
-
1401
- // block_1 shape = [1, 12, 12, 2048], ne = [2048, 12, 12, 1]
1402
- block_1 = ggml_norm(ctx0, block_1, eps);
1403
- block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_2_block_2_1_w), model.mm_model_block_2_block_2_1_b);
1404
- block_1 = ggml_reshape_3d(ctx0, block_1, block_1->ne[0], block_1->ne[1] * block_1->ne[2], block_1->ne[3]);
1405
- // block_1 shape = [1, 144, 2048], ne = [2048, 144, 1]
1406
- }
1407
- embeddings = block_1;
1531
+ switch (type_op) {
1532
+ case FFN_SILU:
1533
+ {
1534
+ cur = ggml_silu(ctx0, cur);
1535
+ cb(cur, "ffn_silu", il);
1536
+ } break;
1537
+ case FFN_GELU:
1538
+ {
1539
+ cur = ggml_gelu(ctx0, cur);
1540
+ cb(cur, "ffn_gelu", il);
1541
+ } break;
1542
+ case FFN_GELU_QUICK:
1543
+ {
1544
+ cur = ggml_gelu_quick(ctx0, cur);
1545
+ cb(cur, "ffn_relu", il);
1546
+ } break;
1408
1547
  }
1409
- else if (ctx->proj_type == PROJECTOR_TYPE_LDPV2)
1410
- {
1411
- int n_patch = 24;
1412
- struct ggml_tensor * mlp_0 = ggml_mul_mat(ctx0, model.mm_model_mlp_0_w, embeddings);
1413
- mlp_0 = ggml_add(ctx0, mlp_0, model.mm_model_mlp_0_b);
1414
- mlp_0 = ggml_gelu(ctx0, mlp_0);
1415
- struct ggml_tensor * mlp_2 = ggml_mul_mat(ctx0, model.mm_model_mlp_2_w, mlp_0);
1416
- mlp_2 = ggml_add(ctx0, mlp_2, model.mm_model_mlp_2_b);
1417
- // mlp_2 ne = [2048, 576, 1, 1]
1418
- // // AVG Pool Layer 2*2, strides = 2
1419
- mlp_2 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_2, 1, 0, 2, 3));
1420
- // mlp_2 ne = [576, 2048, 1, 1]
1421
- mlp_2 = ggml_reshape_4d(ctx0, mlp_2, n_patch, n_patch, mlp_2->ne[1], mlp_2->ne[2]);
1422
- // mlp_2 ne [24, 24, 2048, 1]
1423
- mlp_2 = ggml_pool_2d(ctx0, mlp_2, GGML_OP_POOL_AVG, 2, 2, 2, 2, 0, 0);
1424
- // weight ne = [3, 3, 2048, 1]
1425
- struct ggml_tensor * peg_0 = ggml_conv_2d_dw(ctx0, model.mm_model_peg_0_w, mlp_2, 1, 1, 1, 1, 1, 1);
1426
- peg_0 = ggml_cont(ctx0, ggml_permute(ctx0, peg_0, 1, 2, 0, 3));
1427
- peg_0 = ggml_add(ctx0, peg_0, model.mm_model_peg_0_b);
1428
- mlp_2 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_2, 1, 2, 0, 3));
1429
- peg_0 = ggml_add(ctx0, peg_0, mlp_2);
1430
- peg_0 = ggml_reshape_3d(ctx0, peg_0, peg_0->ne[0], peg_0->ne[1] * peg_0->ne[2], peg_0->ne[3]);
1431
- embeddings = peg_0;
1548
+
1549
+ // we only support parallel ffn for now
1550
+ if (gate) {
1551
+ cur = ggml_mul(ctx0, cur, tmp);
1552
+ cb(cur, "ffn_gate_par", il);
1432
1553
  }
1433
- else {
1434
- GGML_ABORT("fatal error");
1554
+
1555
+ if (down) {
1556
+ cur = ggml_mul_mat(ctx0, down, cur);
1435
1557
  }
1436
- }
1437
- // minicpmv projector
1438
- else if (ctx->proj_type == PROJECTOR_TYPE_MINICPMV) {
1439
- struct ggml_tensor * q = model.mm_model_query;
1440
- { // layernorm
1441
- q = ggml_norm(ctx0, q, eps);
1442
- q = ggml_add(ctx0, ggml_mul(ctx0, q, model.mm_model_ln_q_w), model.mm_model_ln_q_b);
1558
+
1559
+ if (down_b) {
1560
+ cb(cur, "ffn_down", il);
1443
1561
  }
1444
- struct ggml_tensor * v = ggml_mul_mat(ctx0, model.mm_model_kv_proj, embeddings);
1445
- { // layernorm
1446
- v = ggml_norm(ctx0, v, eps);
1447
- v = ggml_add(ctx0, ggml_mul(ctx0, v, model.mm_model_ln_kv_w), model.mm_model_ln_kv_b);
1562
+
1563
+ if (down_b) {
1564
+ cur = ggml_add(ctx0, cur, down_b);
1448
1565
  }
1449
- struct ggml_tensor * k;
1450
- { // position
1451
- // q = ggml_add(ctx0, q, model.mm_model_pos_embed);
1452
- k = ggml_add(ctx0, v, pos_embed);
1566
+
1567
+ return cur;
1568
+ }
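A reading aid for the new build_ffn helper that ends here: it assembles a standard, optionally gated MLP in which the activated up projection is multiplied element-wise with a parallel gate branch, then pushed through the down projection and bias. A minimal sketch of the equivalent ggml dataflow, assuming hypothetical weight tensors up, gate, down and down_b and an input inp (the up/gate projections themselves are built earlier in the helper, outside this excerpt):

    ggml_tensor * cur = ggml_mul_mat(ctx0, up,   inp);  // up projection
    ggml_tensor * tmp = ggml_mul_mat(ctx0, gate, inp);  // parallel gate branch
    cur = ggml_silu(ctx0, cur);                         // FFN_SILU case of the switch above
    cur = ggml_mul(ctx0, cur, tmp);                     // "ffn_gate_par": gate the activated branch
    cur = ggml_mul_mat(ctx0, down, cur);                // down projection
    cur = ggml_add(ctx0, cur, down_b);                  // optional bias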
1569
+
1570
+ ggml_tensor * build_attn(
1571
+ ggml_tensor * wo,
1572
+ ggml_tensor * wo_b,
1573
+ ggml_tensor * q_cur,
1574
+ ggml_tensor * k_cur,
1575
+ ggml_tensor * v_cur,
1576
+ ggml_tensor * kq_mask,
1577
+ float kq_scale,
1578
+ int il) const {
1579
+ // these nodes are added to the graph together so that they are not reordered
1580
+ // by doing so, the number of splits in the graph is reduced
1581
+ ggml_build_forward_expand(gf, q_cur);
1582
+ ggml_build_forward_expand(gf, k_cur);
1583
+ ggml_build_forward_expand(gf, v_cur);
1584
+
1585
+ ggml_tensor * q = ggml_permute(ctx0, q_cur, 0, 2, 1, 3);
1586
+ //cb(q, "q", il);
1587
+
1588
+ ggml_tensor * k = ggml_permute(ctx0, k_cur, 0, 2, 1, 3);
1589
+ //cb(k, "k", il);
1590
+
1591
+ ggml_tensor * v = ggml_permute(ctx0, v_cur, 1, 2, 0, 3);
1592
+ v = ggml_cont(ctx0, v);
1593
+ //cb(k, "v", il);
1594
+
1595
+ ggml_tensor * cur;
1596
+
1597
+ // TODO @ngxson : support flash attention
1598
+ {
1599
+ const auto n_tokens = q->ne[1];
1600
+ const auto n_head = q->ne[2];
1601
+ // const auto n_kv = k->ne[1]; // for flash attention
1602
+
1603
+ ggml_tensor * kq = ggml_mul_mat(ctx0, k, q);
1604
+ // F32 may not be needed for vision encoders?
1605
+ // ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
1606
+
1607
+ kq = ggml_soft_max_ext(ctx0, kq, kq_mask, kq_scale, 0.0f);
1608
+
1609
+ ggml_tensor * kqv = ggml_mul_mat(ctx0, v, kq);
1610
+ cur = ggml_permute(ctx0, kqv, 0, 2, 1, 3);
1611
+ cur = ggml_cont_2d(ctx0, cur, cur->ne[0]*n_head, n_tokens);
1453
1612
  }
1454
1613
 
1455
- { // attention
1456
- int hidden_size = clip_n_mmproj_embd(ctx);
1457
- const int d_head = 128;
1458
- int n_head = hidden_size/d_head;
1459
- int num_query = 96;
1460
- if (ctx->minicpmv_version == 2) {
1461
- num_query = 96;
1462
- }
1463
- else if (ctx->minicpmv_version == 3) {
1464
- num_query = 64;
1465
- }
1466
- else if (ctx->minicpmv_version == 4) {
1467
- num_query = 64;
1468
- }
1614
+ cb(cur, "kqv_out", il);
1469
1615
 
1470
- struct ggml_tensor * Q = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_q_w, q), model.mm_model_attn_q_b);
1471
- struct ggml_tensor * K = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_k_w, k), model.mm_model_attn_k_b);
1472
- struct ggml_tensor * V = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_v_w, v), model.mm_model_attn_v_b);
1473
- // permute
1474
- Q = ggml_reshape_4d(ctx0, Q, d_head, n_head, num_query, batch_size);
1475
- Q = ggml_cont(ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3));
1476
- Q = ggml_reshape_3d(ctx0, Q, d_head, num_query, n_head * batch_size);
1477
- K = ggml_reshape_4d(ctx0, K, d_head, n_head, num_positions, batch_size);
1478
- K = ggml_cont(ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3));
1479
- K = ggml_reshape_3d(ctx0, K, d_head, num_positions, n_head * batch_size);
1480
- V = ggml_reshape_4d(ctx0, V, d_head, n_head, num_positions, batch_size);
1481
- V = ggml_cont(ctx0, ggml_permute(ctx0, V, 1, 2, 0, 3));
1482
- V = ggml_reshape_3d(ctx0, V, num_positions, d_head, n_head * batch_size);
1483
- struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
1484
- KQ = ggml_soft_max_ext(ctx0, KQ, nullptr, 1.0f / sqrtf((float)d_head), 0.0f);
1485
- struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ);
1486
- KQV = ggml_reshape_4d(ctx0, KQV, d_head, num_query, n_head, batch_size);
1487
- KQV = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
1488
- KQV = ggml_cont_3d(ctx0, KQV, hidden_size, num_query, batch_size);
1489
-
1490
- embeddings = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_o_w, KQV), model.mm_model_attn_o_b);
1491
- }
1492
- { // layernorm
1493
- embeddings = ggml_norm(ctx0, embeddings, eps);
1494
- embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_model_ln_post_w), model.mm_model_ln_post_b);
1616
+ if (wo) {
1617
+ cur = ggml_mul_mat(ctx0, wo, cur);
1495
1618
  }
1496
- embeddings = ggml_mul_mat(ctx0, model.mm_model_proj, embeddings);
1497
- }
1498
1619
 
1499
- // glm projector
1500
- else if (ctx->proj_type == PROJECTOR_TYPE_GLM_EDGE) {
1501
- size_t gridsz = (size_t)sqrt(embeddings->ne[1]);
1502
- embeddings = ggml_cont(ctx0, ggml_permute(ctx0,embeddings,1,0,2,3));
1503
- embeddings = ggml_reshape_3d(ctx0, embeddings, gridsz, gridsz, embeddings->ne[1]);
1504
- embeddings = ggml_conv_2d(ctx0, model.mm_model_adapter_conv_w, embeddings, 2, 2, 0, 0, 1, 1);
1505
- embeddings = ggml_reshape_3d(ctx0, embeddings,embeddings->ne[0]*embeddings->ne[1] , embeddings->ne[2], batch_size);
1506
- embeddings = ggml_cont(ctx0, ggml_permute(ctx0,embeddings, 1, 0, 2, 3));
1507
- embeddings = ggml_add(ctx0, embeddings, model.mm_model_adapter_conv_b);
1508
- // GLU
1509
- {
1510
- embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_0_w, embeddings);
1511
- embeddings = ggml_norm(ctx0, embeddings, eps);
1512
- embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_model_ln_q_w), model.mm_model_ln_q_b);
1513
- embeddings = ggml_gelu_inplace(ctx0, embeddings);
1514
- struct ggml_tensor * x = embeddings;
1515
- embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_2_w, embeddings);
1516
- x = ggml_mul_mat(ctx0, model.mm_model_mlp_1_w,x);
1517
- embeddings = ggml_silu_inplace(ctx0, embeddings);
1518
- embeddings = ggml_mul(ctx0, embeddings,x);
1519
- embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_3_w, embeddings);
1620
+ if (wo_b) {
1621
+ cur = ggml_add(ctx0, cur, wo_b);
1520
1622
  }
1623
+
1624
+ return cur;
1521
1625
  }
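build_attn above is plain scaled-dot-product attention (flash attention is still a TODO). Per head it evaluates, roughly,

    out = wo * softmax(kq_scale * Q K^T + kq_mask) * V (+ wo_b)

with callers conventionally passing kq_scale = 1 / sqrt(d_head), the same scale the removed MiniCPM-V path used explicitly via 1.0f / sqrtf((float)d_head).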
1522
1626
 
1523
- else if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL) {
1524
- embeddings = ggml_reshape_3d(ctx0, embeddings, hidden_size * 4, num_positions / 4, batch_size);
1627
+ // implementation of the 2D RoPE without adding a new op in ggml
1628
+ // this is not efficient (use double the memory), but works on all backends
1629
+ // TODO: there was a more efficient implementation which relied on ggml_view and ggml_rope_ext_inplace, but the in-place rope does not work well with non-contiguous tensors; we should fix that and revert to the original implementation in https://github.com/ggml-org/llama.cpp/pull/13065
1630
+ static ggml_tensor * build_rope_2d(
1631
+ ggml_context * ctx0,
1632
+ ggml_tensor * cur,
1633
+ ggml_tensor * pos_h,
1634
+ ggml_tensor * pos_w,
1635
+ const float freq_base
1636
+ ) {
1637
+ const int64_t n_dim = cur->ne[0];
1638
+ const int64_t n_head = cur->ne[1];
1639
+ const int64_t n_pos = cur->ne[2];
1525
1640
 
1526
- embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
1527
- embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);
1641
+ // for example, if we have cur tensor of shape (n_dim=8, n_head, n_pos)
1642
+ // we will have a list of 4 inv_freq: 1e-0, 1e-1, 1e-2, 1e-3
1643
+ // first half of cur will use 1e-0, 1e-2 (even)
1644
+ // second half of cur will use 1e-1, 1e-3 (odd)
1645
+ // the trick here is to rotate just half of n_dim, so inv_freq will automatically be even
1646
+ // ^ don't ask me why, it's math! -2(2i) / n_dim == -2i / (n_dim/2)
1647
+ // then for the second half, we use freq_scale to shift the inv_freq
1648
+ // ^ why? replace (2i) with (2i+1) in the above equation
1649
+ const float freq_scale_odd = std::pow(freq_base, (float)-2/n_dim);
1528
1650
 
1529
- // GELU activation
1530
- embeddings = ggml_gelu(ctx0, embeddings);
1531
-
1532
- // Second linear layer
1533
- embeddings = ggml_mul_mat(ctx0, model.mm_1_w, embeddings);
1534
- embeddings = ggml_add(ctx0, embeddings, model.mm_1_b);
1651
+ // first half
1652
+ ggml_tensor * first;
1653
+ {
1654
+ first = ggml_view_3d(ctx0, cur,
1655
+ n_dim/2, n_head, n_pos,
1656
+ ggml_row_size(cur->type, n_dim),
1657
+ ggml_row_size(cur->type, n_dim*n_head),
1658
+ 0);
1659
+ first = ggml_rope_ext(
1660
+ ctx0,
1661
+ first,
1662
+ pos_h, // positions
1663
+ nullptr, // freq factors
1664
+ n_dim/2, // n_dims
1665
+ 0, 0, freq_base,
1666
+ 1.0f, 0.0f, 1.0f, 0.0f, 0.0f
1667
+ );
1668
+ }
1669
+
1670
+ // second half
1671
+ ggml_tensor * second;
1672
+ {
1673
+ second = ggml_view_3d(ctx0, cur,
1674
+ n_dim/2, n_head, n_pos,
1675
+ ggml_row_size(cur->type, n_dim),
1676
+ ggml_row_size(cur->type, n_dim*n_head),
1677
+ n_dim/2 * ggml_element_size(cur));
1678
+ second = ggml_cont(ctx0, second); // copy, because ggml_rope doesn't play well with non-contiguous tensors
1679
+ second = ggml_rope_ext(
1680
+ ctx0,
1681
+ second,
1682
+ pos_w, // positions
1683
+ nullptr, // freq factors
1684
+ n_dim/2, // n_dims
1685
+ 0, 0, freq_base,
1686
+ freq_scale_odd,
1687
+ 0.0f, 1.0f, 0.0f, 0.0f
1688
+ );
1689
+ }
1690
+
1691
+ cur = ggml_concat(ctx0, first, second, 0);
1692
+ return cur;
1535
1693
  }
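The frequency-splitting trick documented inside build_rope_2d can be checked numerically: rotating only n_dim/2 dimensions with base freq_base reproduces the even-indexed RoPE frequencies, and multiplying those by freq_scale_odd = freq_base^(-2/n_dim) reproduces the odd-indexed ones. A standalone sanity check of that identity (illustrative sketch, not taken from the package sources):

    #include <cmath>
    #include <cstdio>

    int main() {
        const double freq_base      = 10000.0;
        const int    n_dim          = 8; // the example size used in the comment above
        const double freq_scale_odd = std::pow(freq_base, -2.0 / n_dim);

        for (int j = 0; j < n_dim / 4; ++j) {
            // frequency produced by rotating only n_dim/2 dims with the same base
            const double f_half = std::pow(freq_base, -2.0 * j / (n_dim / 2));
            // even- and odd-indexed frequencies of full n_dim RoPE
            const double f_even = std::pow(freq_base, -2.0 * (2 * j)     / n_dim);
            const double f_odd  = std::pow(freq_base, -2.0 * (2 * j + 1) / n_dim);
            // f_half == f_even, and f_half * freq_scale_odd == f_odd
            printf("j=%d  half=%g even=%g  shifted=%g odd=%g\n",
                   j, f_half, f_even, f_half * freq_scale_odd, f_odd);
        }
        return 0;
    }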
1536
1694
 
1537
- // build the graph
1538
- ggml_build_forward_expand(gf, embeddings);
1695
+ };
1539
1696
 
1540
- return gf;
1541
- }
1697
+ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch & imgs) {
1698
+ GGML_ASSERT(imgs.entries.size() == 1 && "n_batch > 1 is not supported");
1699
+ clip_graph graph(ctx, *imgs.entries[0]);
1542
1700
 
1543
- static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch & imgs, struct clip_image_size load_image_size, bool is_inf = false) {
1544
1701
  ggml_cgraph * res;
1702
+
1545
1703
  switch (ctx->proj_type) {
1546
1704
  case PROJECTOR_TYPE_GEMMA3:
1547
1705
  case PROJECTOR_TYPE_IDEFICS3:
1548
1706
  {
1549
- GGML_ASSERT(imgs.entries.size() == 1);
1550
- res = clip_image_build_graph_siglip(ctx, *imgs.entries[0]);
1707
+ res = graph.build_siglip();
1551
1708
  } break;
1552
1709
  case PROJECTOR_TYPE_PIXTRAL:
1553
1710
  {
1554
- GGML_ASSERT(imgs.entries.size() == 1);
1555
- res = clip_image_build_graph_pixtral(ctx, *imgs.entries[0]);
1711
+ res = graph.build_pixtral();
1556
1712
  } break;
1713
+ case PROJECTOR_TYPE_QWEN2VL:
1557
1714
  case PROJECTOR_TYPE_QWEN25VL:
1558
1715
  {
1559
- res = clip_image_build_graph_qwen25vl(ctx, imgs);
1716
+ res = graph.build_qwen2vl();
1717
+ } break;
1718
+ case PROJECTOR_TYPE_MINICPMV:
1719
+ {
1720
+ res = graph.build_minicpmv();
1721
+ } break;
1722
+ case PROJECTOR_TYPE_INTERNVL:
1723
+ {
1724
+ res = graph.build_internvl();
1560
1725
  } break;
1561
1726
  default:
1562
1727
  {
1563
- // TODO: we should have one build_* function per model
1564
- res = clip_image_build_graph_legacy(ctx, imgs, load_image_size, is_inf);
1728
+ res = graph.build_llava();
1565
1729
  } break;
1566
1730
  }
1567
1731
  return res;
@@ -1615,7 +1779,7 @@ struct clip_model_loader {
1615
1779
  const char * name = gguf_get_tensor_name(ctx_gguf.get(), i);
1616
1780
  const size_t offset = gguf_get_tensor_offset(ctx_gguf.get(), i);
1617
1781
  enum ggml_type type = gguf_get_tensor_type(ctx_gguf.get(), i);
1618
- struct ggml_tensor * cur = ggml_get_tensor(meta, name);
1782
+ ggml_tensor * cur = ggml_get_tensor(meta, name);
1619
1783
  size_t tensor_size = ggml_nbytes(cur);
1620
1784
  model_size += tensor_size;
1621
1785
  LOG_DBG("%s: tensor[%d]: n_dims = %d, name = %s, tensor_size=%zu, offset=%zu, shape:[%" PRIu64 ", %" PRIu64 ", %" PRIu64 ", %" PRIu64 "], type = %s\n",
@@ -1626,6 +1790,7 @@ struct clip_model_loader {
1626
1790
 
1627
1791
  void load_hparams() {
1628
1792
  auto & hparams = ctx_clip.vision_model.hparams;
1793
+ std::string log_ffn_op; // for logging
1629
1794
 
1630
1795
  // projector type
1631
1796
  std::string proj_type;
@@ -1641,14 +1806,11 @@ struct clip_model_loader {
1641
1806
 
1642
1807
  // other hparams
1643
1808
  {
1644
- get_i32(KEY_MINICPMV_VERSION, ctx_clip.minicpmv_version, false);
1809
+ get_i32(KEY_MINICPMV_VERSION, ctx_clip.minicpmv_version, false); // legacy
1645
1810
 
1646
- get_bool(KEY_USE_GELU, ctx_clip.use_gelu, false);
1647
- get_bool(KEY_USE_SILU, ctx_clip.use_silu, false);
1648
-
1649
- get_u32(KEY_N_EMBD, hparams.hidden_size);
1811
+ get_u32(KEY_N_EMBD, hparams.n_embd);
1650
1812
  get_u32(KEY_N_HEAD, hparams.n_head);
1651
- get_u32(KEY_N_FF, hparams.n_intermediate);
1813
+ get_u32(KEY_N_FF, hparams.n_ff);
1652
1814
  get_u32(KEY_N_BLOCK, hparams.n_layer);
1653
1815
  get_u32(KEY_PROJ_DIM, hparams.projection_dim);
1654
1816
  get_f32(KEY_LAYER_NORM_EPS, hparams.eps);
@@ -1657,11 +1819,34 @@ struct clip_model_loader {
1657
1819
  get_u32(KEY_IMAGE_CROP_RESOLUTION, hparams.image_crop_resolution, false);
1658
1820
  get_arr_int(KEY_IMAGE_GRID_PINPOINTS, hparams.image_grid_pinpoints, false);
1659
1821
 
1822
+ // default warmup value
1823
+ hparams.warmup_image_size = hparams.image_size;
1824
+
1660
1825
  ctx_clip.has_llava_projector = ctx_clip.proj_type == PROJECTOR_TYPE_MLP
1661
1826
  || ctx_clip.proj_type == PROJECTOR_TYPE_MLP_NORM
1662
1827
  || ctx_clip.proj_type == PROJECTOR_TYPE_LDP
1663
1828
  || ctx_clip.proj_type == PROJECTOR_TYPE_LDPV2;
1664
1829
 
1830
+ {
1831
+ bool use_gelu = false;
1832
+ bool use_silu = false;
1833
+ get_bool(KEY_USE_GELU, use_gelu, false);
1834
+ get_bool(KEY_USE_SILU, use_silu, false);
1835
+ if (use_gelu && use_silu) {
1836
+ throw std::runtime_error(string_format("%s: both use_gelu and use_silu are set to true\n", __func__));
1837
+ }
1838
+ if (use_gelu) {
1839
+ hparams.ffn_op = FFN_GELU;
1840
+ log_ffn_op = "gelu";
1841
+ } else if (use_silu) {
1842
+ hparams.ffn_op = FFN_SILU;
1843
+ log_ffn_op = "silu";
1844
+ } else {
1845
+ hparams.ffn_op = FFN_GELU_QUICK;
1846
+ log_ffn_op = "gelu_quick";
1847
+ }
1848
+ }
1849
+
1665
1850
  {
1666
1851
  std::string mm_patch_merge_type;
1667
1852
  get_string(KEY_MM_PATCH_MERGE_TYPE, mm_patch_merge_type, false);
@@ -1695,30 +1880,6 @@ struct clip_model_loader {
1695
1880
  hparams.vision_feature_layer.insert(layer);
1696
1881
  }
1697
1882
 
1698
- // Calculate the deepest feature layer based on hparams and projector type
1699
- // NOTE: This is only used by build_graph_legacy()
1700
- {
1701
- // Get the index of the second to last layer; this is the default for models that have a llava projector
1702
- int n_layer = hparams.n_layer - 1;
1703
- int deepest_feature_layer = -1;
1704
-
1705
- if (ctx_clip.proj_type == PROJECTOR_TYPE_MINICPMV
1706
- || ctx_clip.proj_type == PROJECTOR_TYPE_GLM_EDGE
1707
- || ctx_clip.proj_type == PROJECTOR_TYPE_QWEN2VL
1708
- || ctx_clip.proj_type == PROJECTOR_TYPE_QWEN25VL) {
1709
- n_layer += 1;
1710
- }
1711
-
1712
- // If we set explicit vision feature layers, only go up to the deepest one
1713
- // NOTE: only used by granite-vision models for now
1714
- for (const auto & feature_layer : hparams.vision_feature_layer) {
1715
- if (feature_layer > deepest_feature_layer) {
1716
- deepest_feature_layer = feature_layer;
1717
- }
1718
- }
1719
- ctx_clip.max_feature_layer = deepest_feature_layer < 0 ? n_layer : deepest_feature_layer;
1720
- }
1721
-
1722
1883
  // model-specific params
1723
1884
  switch (ctx_clip.proj_type) {
1724
1885
  case PROJECTOR_TYPE_MINICPMV:
@@ -1728,15 +1889,41 @@ struct clip_model_loader {
1728
1889
  }
1729
1890
  } break;
1730
1891
  case PROJECTOR_TYPE_IDEFICS3:
1892
+ case PROJECTOR_TYPE_INTERNVL:
1731
1893
  {
1732
1894
  get_u32(KEY_PROJ_SCALE_FACTOR, hparams.proj_scale_factor, false);
1733
1895
  } break;
1734
1896
  case PROJECTOR_TYPE_PIXTRAL:
1735
1897
  {
1736
1898
  hparams.rope_theta = 10000.0f;
1899
+ hparams.warmup_image_size = hparams.patch_size * 8;
1900
+ get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.spatial_merge_size, false);
1901
+ } break;
1902
+ case PROJECTOR_TYPE_GEMMA3:
1903
+ {
1904
+ // default value (used by all model sizes in gemma 3 family)
1905
+ // number of patches for each **side** is reduced by a factor of 4
1906
+ hparams.proj_scale_factor = 4;
1907
+ // test model (tinygemma3) has a different value, we optionally read it
1908
+ get_u32(KEY_PROJ_SCALE_FACTOR, hparams.proj_scale_factor, false);
1909
+ } break;
1910
+ case PROJECTOR_TYPE_QWEN2VL:
1911
+ {
1912
+ // max image size = sqrt(max_pixels) = 3584
1913
+ // ref: https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct/blob/main/preprocessor_config.json
1914
+ // however, the model uses an unreasonable amount of memory past a size of 1024, so we force it to 1024; otherwise it is unusable
1915
+ // ref: https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct/discussions/10
1916
+ hparams.image_size = 1024;
1917
+ hparams.warmup_image_size = hparams.patch_size * 8;
1737
1918
  } break;
1738
1919
  case PROJECTOR_TYPE_QWEN25VL:
1739
1920
  {
1921
+ // max image size = sqrt(max_pixels)
1922
+ // https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct/blob/main/preprocessor_config.json
1923
+ // however, the model uses an unreasonable amount of memory past a size of 1024, so we force it to 1024; otherwise it is unusable
1924
+ // ref: https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct/discussions/10
1925
+ hparams.image_size = 1024;
1926
+ hparams.warmup_image_size = hparams.patch_size * 8;
1740
1927
  get_u32(KEY_WIN_ATTN_PATTERN, hparams.n_wa_pattern);
1741
1928
  } break;
1742
1929
  default:
@@ -1744,18 +1931,26 @@ struct clip_model_loader {
1744
1931
  }
1745
1932
 
1746
1933
  LOG_INF("%s: projector: %s\n", __func__, proj_type.c_str());
1934
+ LOG_INF("%s: n_embd: %d\n", __func__, hparams.n_embd);
1935
+ LOG_INF("%s: n_head: %d\n", __func__, hparams.n_head);
1936
+ LOG_INF("%s: n_ff: %d\n", __func__, hparams.n_ff);
1937
+ LOG_INF("%s: n_layer: %d\n", __func__, hparams.n_layer);
1938
+ LOG_INF("%s: projection_dim: %d\n", __func__, hparams.projection_dim);
1939
+ LOG_INF("%s: image_size: %d\n", __func__, hparams.image_size);
1940
+ LOG_INF("%s: patch_size: %d\n", __func__, hparams.patch_size);
1941
+ LOG_INF("\n");
1747
1942
  LOG_INF("%s: has_llava_proj: %d\n", __func__, ctx_clip.has_llava_projector);
1748
1943
  LOG_INF("%s: minicpmv_version: %d\n", __func__, ctx_clip.minicpmv_version);
1749
1944
  LOG_INF("%s: proj_scale_factor: %d\n", __func__, hparams.proj_scale_factor);
1750
1945
  LOG_INF("%s: n_wa_pattern: %d\n", __func__, hparams.n_wa_pattern);
1751
- LOG_INF("%s: use_silu: %d\n", __func__, ctx_clip.use_silu);
1752
- LOG_INF("%s: use_gelu: %d\n", __func__, ctx_clip.use_gelu);
1946
+ LOG_INF("%s: ffn_op: %s\n", __func__, log_ffn_op.c_str());
1753
1947
  LOG_INF("%s: model size: %.2f MiB\n", __func__, model_size / 1024.0 / 1024.0);
1754
1948
  LOG_INF("%s: metadata size: %.2f MiB\n", __func__, ggml_get_mem_size(ctx_meta.get()) / 1024.0 / 1024.0);
1755
1949
  }
1756
1950
  }
1757
1951
 
1758
1952
  void load_tensors() {
1953
+ auto & hparams = ctx_clip.vision_model.hparams;
1759
1954
  std::map<std::string, size_t> tensor_offset;
1760
1955
  std::vector<ggml_tensor *> tensors_to_load;
1761
1956
 
@@ -1778,14 +1973,14 @@ struct clip_model_loader {
1778
1973
 
1779
1974
  // helper function
1780
1975
  auto get_tensor = [&](const std::string & name, bool required = true) {
1781
- struct ggml_tensor * cur = ggml_get_tensor(ctx_meta.get(), name.c_str());
1976
+ ggml_tensor * cur = ggml_get_tensor(ctx_meta.get(), name.c_str());
1782
1977
  if (!cur && required) {
1783
1978
  throw std::runtime_error(string_format("%s: unable to find tensor %s\n", __func__, name.c_str()));
1784
1979
  }
1785
1980
  if (cur) {
1786
1981
  tensors_to_load.push_back(cur);
1787
1982
  // add tensors to context
1788
- struct ggml_tensor * data_tensor = ggml_dup_tensor(ctx_clip.ctx_data.get(), cur);
1983
+ ggml_tensor * data_tensor = ggml_dup_tensor(ctx_clip.ctx_data.get(), cur);
1789
1984
  ggml_set_name(data_tensor, cur->name);
1790
1985
  cur = data_tensor;
1791
1986
  }
@@ -1809,15 +2004,20 @@ struct clip_model_loader {
1809
2004
  vision_model.position_embeddings = get_tensor(string_format(TN_POS_EMBD, "v"), false);
1810
2005
 
1811
2006
  // layers
1812
- vision_model.layers.resize(vision_model.hparams.n_layer);
1813
- for (int il = 0; il < vision_model.hparams.n_layer; ++il) {
2007
+ vision_model.layers.resize(hparams.n_layer);
2008
+ for (int il = 0; il < hparams.n_layer; ++il) {
1814
2009
  auto & layer = vision_model.layers[il];
1815
2010
  layer.k_w = get_tensor(string_format(TN_ATTN_K, "v", il, "weight"));
1816
2011
  layer.q_w = get_tensor(string_format(TN_ATTN_Q, "v", il, "weight"));
1817
2012
  layer.v_w = get_tensor(string_format(TN_ATTN_V, "v", il, "weight"));
1818
2013
  layer.o_w = get_tensor(string_format(TN_ATTN_OUTPUT, "v", il, "weight"));
2014
+ layer.k_norm = get_tensor(string_format(TN_ATTN_K_NORM, "v", il, "weight"), false);
2015
+ layer.q_norm = get_tensor(string_format(TN_ATTN_Q_NORM, "v", il, "weight"), false);
1819
2016
  layer.ln_1_w = get_tensor(string_format(TN_LN_1, "v", il, "weight"), false);
1820
2017
  layer.ln_2_w = get_tensor(string_format(TN_LN_2, "v", il, "weight"), false);
2018
+ layer.ls_1_w = get_tensor(string_format(TN_LS_1, "v", il, "weight"), false); // no bias
2019
+ layer.ls_2_w = get_tensor(string_format(TN_LS_2, "v", il, "weight"), false); // no bias
2020
+
1821
2021
  layer.k_b = get_tensor(string_format(TN_ATTN_K, "v", il, "bias"), false);
1822
2022
  layer.q_b = get_tensor(string_format(TN_ATTN_Q, "v", il, "bias"), false);
1823
2023
  layer.v_b = get_tensor(string_format(TN_ATTN_V, "v", il, "bias"), false);
@@ -1825,7 +2025,7 @@ struct clip_model_loader {
1825
2025
  layer.ln_1_b = get_tensor(string_format(TN_LN_1, "v", il, "bias"), false);
1826
2026
  layer.ln_2_b = get_tensor(string_format(TN_LN_2, "v", il, "bias"), false);
1827
2027
 
1828
- // new naming
2028
+ // ffn
1829
2029
  layer.ff_up_w = get_tensor(string_format(TN_FFN_UP, "v", il, "weight"));
1830
2030
  layer.ff_up_b = get_tensor(string_format(TN_FFN_UP, "v", il, "bias"), false);
1831
2031
  layer.ff_gate_w = get_tensor(string_format(TN_FFN_GATE, "v", il, "weight"), false);
@@ -1833,13 +2033,18 @@ struct clip_model_loader {
1833
2033
  layer.ff_down_w = get_tensor(string_format(TN_FFN_DOWN, "v", il, "weight"));
1834
2034
  layer.ff_down_b = get_tensor(string_format(TN_FFN_DOWN, "v", il, "bias"), false);
1835
2035
 
1836
- // legacy naming (the in and out is reversed! don't ask me why)
1837
- layer.ff_i_w = layer.ff_down_w;
1838
- layer.ff_o_w = layer.ff_up_w;
1839
- layer.ff_g_w = layer.ff_gate_w;
1840
- layer.ff_i_b = layer.ff_down_b;
1841
- layer.ff_o_b = layer.ff_up_b;
1842
- layer.ff_g_b = layer.ff_gate_b;
2036
+ // some models were already exported with legacy (incorrect) naming, which is quite messy; fix it here
2037
+ // note: Qwen models converted with the old surgery script have n_ff = 0, so we cannot use n_ff to check!
2038
+ if (layer.ff_up_w && layer.ff_down_w && layer.ff_down_w->ne[0] == hparams.n_embd) {
2039
+ // swap up and down weights
2040
+ ggml_tensor * tmp = layer.ff_up_w;
2041
+ layer.ff_up_w = layer.ff_down_w;
2042
+ layer.ff_down_w = tmp;
2043
+ // swap up and down biases
2044
+ tmp = layer.ff_up_b;
2045
+ layer.ff_up_b = layer.ff_down_b;
2046
+ layer.ff_down_b = tmp;
2047
+ }
1843
2048
  }
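The swap above relies on a shape heuristic: in ggml, a weight tensor is stored as [n_in, n_out], so a correct FFN down projection (n_ff -> n_embd) has ne[0] == n_ff. If ne[0] equals n_embd instead, the tensor named "down" is really the up projection and the pair was exported with the legacy reversed naming. A minimal sketch of the same check, assuming that layout (the helper name is ours, not part of the package):

    // returns true when the exported ffn_up / ffn_down tensors use the legacy reversed naming
    static bool ffn_weights_look_swapped(const ggml_tensor * up_w,
                                         const ggml_tensor * down_w,
                                         int64_t n_embd) {
        // a proper down projection consumes n_ff inputs, not n_embd inputs
        return up_w && down_w && down_w->ne[0] == n_embd;
    }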
1844
2049
 
1845
2050
  switch (ctx_clip.proj_type) {
@@ -1930,12 +2135,14 @@ struct clip_model_loader {
1930
2135
  {
1931
2136
  vision_model.mm_model_adapter_conv_w = get_tensor(string_format(TN_GLM_ADAPER_CONV, "weight"));
1932
2137
  vision_model.mm_model_adapter_conv_b = get_tensor(string_format(TN_GLM_ADAPER_CONV, "bias"));
1933
- vision_model.mm_model_mlp_0_w = get_tensor(string_format(TN_GLM_ADAPTER_LINEAR,"weight"));
1934
- vision_model.mm_model_ln_q_w = get_tensor(string_format(TN_GLM_ADAPTER_NORM_1,"weight"));
1935
- vision_model.mm_model_ln_q_b = get_tensor(string_format(TN_GLM_ADAPTER_NORM_1,"bias"));
1936
- vision_model.mm_model_mlp_1_w = get_tensor(string_format(TN_GLM_ADAPTER_D_H_2_4H,"weight"));
1937
- vision_model.mm_model_mlp_2_w = get_tensor(string_format(TN_GLM_ADAPTER_GATE,"weight"));
1938
- vision_model.mm_model_mlp_3_w = get_tensor(string_format(TN_GLM_ADAPTER_D_4H_2_H,"weight"));
2138
+ vision_model.mm_model_mlp_0_w = get_tensor(string_format(TN_GLM_ADAPTER_LINEAR, "weight"));
2139
+ vision_model.mm_model_ln_q_w = get_tensor(string_format(TN_GLM_ADAPTER_NORM_1, "weight"));
2140
+ vision_model.mm_model_ln_q_b = get_tensor(string_format(TN_GLM_ADAPTER_NORM_1, "bias"));
2141
+ vision_model.mm_model_mlp_1_w = get_tensor(string_format(TN_GLM_ADAPTER_D_H_2_4H, "weight"));
2142
+ vision_model.mm_model_mlp_2_w = get_tensor(string_format(TN_GLM_ADAPTER_GATE, "weight"));
2143
+ vision_model.mm_model_mlp_3_w = get_tensor(string_format(TN_GLM_ADAPTER_D_4H_2_H, "weight"));
2144
+ vision_model.mm_glm_tok_boi = get_tensor(string_format(TN_TOK_GLM_BOI, "weight"));
2145
+ vision_model.mm_glm_tok_eoi = get_tensor(string_format(TN_TOK_GLM_EOI, "weight"));
1939
2146
  } break;
1940
2147
  case PROJECTOR_TYPE_QWEN2VL:
1941
2148
  case PROJECTOR_TYPE_QWEN25VL:
@@ -1957,11 +2164,23 @@ struct clip_model_loader {
1957
2164
  case PROJECTOR_TYPE_PIXTRAL:
1958
2165
  {
1959
2166
  vision_model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 1, "weight"));
1960
- vision_model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 1, "bias"));
2167
+ vision_model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 1, "bias"), false);
1961
2168
  vision_model.mm_2_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight"));
1962
- vision_model.mm_2_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"));
2169
+ vision_model.mm_2_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"), false);
1963
2170
  // [IMG_BREAK] token embedding
1964
2171
  vision_model.token_embd_img_break = get_tensor(TN_TOK_IMG_BREAK);
2172
+ // for mistral small 3.1
2173
+ vision_model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM, false);
2174
+ vision_model.mm_patch_merger_w = get_tensor(TN_MM_PATCH_MERGER, false);
2175
+ } break;
2176
+ case PROJECTOR_TYPE_INTERNVL:
2177
+ {
2178
+ vision_model.mm_0_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 0, "weight"));
2179
+ vision_model.mm_0_b = get_tensor(string_format(TN_MVLM_PROJ_MLP, 0, "bias"));
2180
+ vision_model.mm_1_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 1, "weight"));
2181
+ vision_model.mm_1_b = get_tensor(string_format(TN_MVLM_PROJ_MLP, 1, "bias"));
2182
+ vision_model.mm_3_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 3, "weight"));
2183
+ vision_model.mm_3_b = get_tensor(string_format(TN_MVLM_PROJ_MLP, 3, "bias"));
1965
2184
  } break;
1966
2185
  default:
1967
2186
  GGML_ASSERT(false && "unknown projector type");
@@ -1981,7 +2200,7 @@ struct clip_model_loader {
1981
2200
  ctx_clip.buf.reset(ggml_backend_alloc_ctx_tensors_from_buft(ctx_clip.ctx_data.get(), buft));
1982
2201
  ggml_backend_buffer_set_usage(ctx_clip.buf.get(), GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
1983
2202
  for (auto & t : tensors_to_load) {
1984
- struct ggml_tensor * cur = ggml_get_tensor(ctx_clip.ctx_data.get(), t->name);
2203
+ ggml_tensor * cur = ggml_get_tensor(ctx_clip.ctx_data.get(), t->name);
1985
2204
  const size_t offset = tensor_offset[t->name];
1986
2205
  fin.seekg(offset, std::ios::beg);
1987
2206
  if (!fin) {
@@ -2010,16 +2229,14 @@ struct clip_model_loader {
2010
2229
  // create a fake batch
2011
2230
  clip_image_f32_batch batch;
2012
2231
  clip_image_f32_ptr img(clip_image_f32_init());
2013
- clip_image_size image_size;
2014
- image_size.width = ctx_clip.vision_model.hparams.image_size;
2015
- image_size.height = ctx_clip.vision_model.hparams.image_size;
2016
- img->nx = image_size.width;
2017
- img->ny = image_size.height;
2018
- img->buf.resize(image_size.width * image_size.height * 3);
2232
+ img->nx = ctx_clip.vision_model.hparams.warmup_image_size;
2233
+ img->ny = ctx_clip.vision_model.hparams.warmup_image_size;
2234
+ img->buf.resize(img->nx * img->ny * 3);
2019
2235
  batch.entries.push_back(std::move(img));
2020
2236
 
2021
- ggml_cgraph * gf = clip_image_build_graph(&ctx_clip, batch, image_size, false);
2237
+ ggml_cgraph * gf = clip_image_build_graph(&ctx_clip, batch);
2022
2238
  ggml_backend_sched_reserve(ctx_clip.sched.get(), gf);
2239
+
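The warmup now reserves the compute graph with warmup_image_size rather than the full image_size; for the models that set warmup_image_size = patch_size * 8 above (Pixtral, Qwen2-VL, Qwen2.5-VL) this presumably avoids reserving worst-case buffers for models whose input resolution is dynamic. As a rough worked example, assuming a patch size of 14: the warmup image is 112 x 112, i.e. 8 * 8 = 64 patches, instead of the (1024 / 14)^2 ≈ 5300 patches a full-size warmup would reserve for.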
2023
2240
  for (size_t i = 0; i < ctx_clip.backend_ptrs.size(); ++i) {
2024
2241
  ggml_backend_t backend = ctx_clip.backend_ptrs[i];
2025
2242
  ggml_backend_buffer_type_t buft = ctx_clip.backend_buft[i];
@@ -2092,19 +2309,12 @@ struct clip_model_loader {
2092
2309
  }
2093
2310
  };
2094
2311
 
2095
- // read and create ggml_context containing the tensors and their data
2096
- struct clip_ctx * clip_model_load(const char * fname, const int verbosity) {
2097
- return clip_init(fname, clip_context_params{
2098
- /* use_gpu */ true,
2099
- /* verbosity */ static_cast<ggml_log_level>(verbosity),
2100
- });
2101
- }
2102
-
2103
2312
  struct clip_ctx * clip_init(const char * fname, struct clip_context_params ctx_params) {
2104
2313
  g_logger_state.verbosity_thold = ctx_params.verbosity;
2105
- clip_ctx * ctx_clip = new clip_ctx(ctx_params);
2314
+ clip_ctx * ctx_clip = nullptr;
2106
2315
 
2107
2316
  try {
2317
+ ctx_clip = new clip_ctx(ctx_params);
2108
2318
  clip_model_loader loader(fname, *ctx_clip);
2109
2319
  loader.load_hparams();
2110
2320
  loader.load_tensors();
@@ -2417,8 +2627,8 @@ struct image_manipulation {
2417
2627
  float target_width_f = static_cast<float>(inp_size.width) * scale;
2418
2628
  float target_height_f = static_cast<float>(inp_size.height) * scale;
2419
2629
 
2420
- int aligned_width = GGML_PAD((int)target_width_f, align_size);
2421
- int aligned_height = GGML_PAD((int)target_height_f, align_size);
2630
+ int aligned_width = CLIP_ALIGN((int)target_width_f, align_size);
2631
+ int aligned_height = CLIP_ALIGN((int)target_height_f, align_size);
2422
2632
 
2423
2633
  return {aligned_width, aligned_height};
2424
2634
  }
@@ -2516,7 +2726,7 @@ struct llava_uhd {
2516
2726
 
2517
2727
  // no pinpoints, dynamically calculate the grid size (e.g. minicpmv)
2518
2728
 
2519
- auto best_size = get_best_resize(original_size, slice_size, patch_size, has_slices);
2729
+ auto best_size = get_best_resize(original_size, slice_size, patch_size, !has_slices);
2520
2730
  res.overview_size = best_size;
2521
2731
 
2522
2732
  if (!has_slices) {
@@ -2737,10 +2947,9 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
2737
2947
  }
2738
2948
  else if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type == PROJECTOR_TYPE_QWEN25VL) {
2739
2949
  clip_image_u8 resized;
2740
- auto patch_size = clip_get_patch_size(ctx) * 2;
2741
- int nx = ceil((float)img->nx / patch_size) * patch_size;
2742
- int ny = ceil((float)img->ny / patch_size) * patch_size;
2743
- image_manipulation::bicubic_resize(*img, resized, nx, ny);
2950
+ auto patch_size = params.patch_size * 2;
2951
+ auto new_size = image_manipulation::calc_size_preserved_ratio(original_size, patch_size, params.image_size);
2952
+ image_manipulation::bicubic_resize(*img, resized, new_size.width, new_size.height);
2744
2953
 
2745
2954
  clip_image_f32_ptr img_f32(clip_image_f32_init());
2746
2955
  // clip_image_f32_ptr res(clip_image_f32_init());
@@ -2751,7 +2960,9 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
2751
2960
  }
2752
2961
  else if (ctx->proj_type == PROJECTOR_TYPE_GLM_EDGE
2753
2962
  || ctx->proj_type == PROJECTOR_TYPE_GEMMA3
2754
- || ctx->proj_type == PROJECTOR_TYPE_IDEFICS3) {
2963
+ || ctx->proj_type == PROJECTOR_TYPE_IDEFICS3
2964
+ || ctx->proj_type == PROJECTOR_TYPE_INTERNVL // TODO @ngxson : support dynamic resolution
2965
+ ) {
2755
2966
  clip_image_u8 resized_image;
2756
2967
  int sz = params.image_size;
2757
2968
  image_manipulation::resize_and_pad_image(*img, resized_image, {sz, sz});
@@ -2848,7 +3059,7 @@ int32_t clip_get_patch_size(const struct clip_ctx * ctx) {
2848
3059
  }
2849
3060
 
2850
3061
  int32_t clip_get_hidden_size(const struct clip_ctx * ctx) {
2851
- return ctx->vision_model.hparams.hidden_size;
3062
+ return ctx->vision_model.hparams.n_embd;
2852
3063
  }
2853
3064
 
2854
3065
  const char * clip_patch_merge_type(const struct clip_ctx * ctx) {
@@ -2866,19 +3077,6 @@ size_t get_clip_image_grid_size(const struct clip_ctx * ctx) {
2866
3077
  return ctx->vision_model.hparams.image_grid_pinpoints.size();
2867
3078
  }
2868
3079
 
2869
- // deprecated
2870
- int clip_n_patches(const struct clip_ctx * ctx) {
2871
- clip_image_f32 img;
2872
- img.nx = ctx->vision_model.hparams.image_size;
2873
- img.ny = ctx->vision_model.hparams.image_size;
2874
- return clip_n_output_tokens(ctx, &img);
2875
- }
2876
-
2877
- // deprecated
2878
- int clip_n_patches_by_img(const struct clip_ctx * ctx, struct clip_image_f32 * img) {
2879
- return clip_n_output_tokens(ctx, img);
2880
- }
2881
-
2882
3080
  int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 * img) {
2883
3081
  const auto & params = ctx->vision_model.hparams;
2884
3082
  const int n_total = clip_n_output_tokens(ctx, img);
@@ -2901,8 +3099,13 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
2901
3099
 
2902
3100
  int n_patches = (params.image_size / params.patch_size) * (params.image_size / params.patch_size);
2903
3101
 
2904
- if (ctx->proj_type == PROJECTOR_TYPE_LDP || ctx->proj_type == PROJECTOR_TYPE_LDPV2 || ctx->proj_type == PROJECTOR_TYPE_GLM_EDGE) {
3102
+ if (ctx->proj_type == PROJECTOR_TYPE_LDP
3103
+ || ctx->proj_type == PROJECTOR_TYPE_LDPV2
3104
+ || ctx->proj_type == PROJECTOR_TYPE_GLM_EDGE) {
2905
3105
  n_patches /= 4;
3106
+ if (ctx->vision_model.mm_glm_tok_boi) {
3107
+ n_patches += 2; // for BOI and EOI token embeddings
3108
+ }
2906
3109
  } else if (ctx->proj_type == PROJECTOR_TYPE_MINICPMV) {
2907
3110
  if (ctx->minicpmv_version == 2) {
2908
3111
  n_patches = 96;
@@ -2922,12 +3125,16 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
2922
3125
  int y_patch = img->ny / patch_size + (int)(img->ny % patch_size > 0);
2923
3126
  n_patches = x_patch * y_patch;
2924
3127
  } else if (ctx->proj_type == PROJECTOR_TYPE_GEMMA3) {
2925
- n_patches = 256;
2926
- } else if (ctx->proj_type == PROJECTOR_TYPE_IDEFICS3) {
2927
- n_patches /= ctx->vision_model.hparams.proj_scale_factor;
3128
+ int n_per_side = params.image_size / params.patch_size;
3129
+ int n_per_side_2d_pool = n_per_side / params.proj_scale_factor;
3130
+ n_patches = n_per_side_2d_pool * n_per_side_2d_pool;
3131
+ } else if (ctx->proj_type == PROJECTOR_TYPE_IDEFICS3 || ctx->proj_type == PROJECTOR_TYPE_INTERNVL) {
3132
+ // both W and H are divided by proj_scale_factor
3133
+ n_patches /= (params.proj_scale_factor * params.proj_scale_factor);
2928
3134
  } else if (ctx->proj_type == PROJECTOR_TYPE_PIXTRAL) {
2929
- int n_patches_x = img->nx / params.patch_size;
2930
- int n_patches_y = img->ny / params.patch_size;
3135
+ int n_merge = params.spatial_merge_size;
3136
+ int n_patches_x = img->nx / params.patch_size / (n_merge > 0 ? n_merge : 1);
3137
+ int n_patches_y = img->ny / params.patch_size / (n_merge > 0 ? n_merge : 1);
2931
3138
  n_patches = n_patches_y*n_patches_x + n_patches_y - 1; // + one [IMG_BREAK] per row, except the last row
2932
3139
  }
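A worked example for the Gemma 3 branch above, using the values this diff implies (the old code hard-coded 256 tokens, and the loader now defaults proj_scale_factor to 4; image_size 896 and patch_size 14 are the assumed SigLIP settings):

    #include <cstdio>

    int main() {
        const int image_size        = 896;  // assumed Gemma 3 vision input size
        const int patch_size        = 14;   // assumed SigLIP patch size
        const int proj_scale_factor = 4;    // default set in load_hparams above

        const int n_per_side         = image_size / patch_size;        // 64 patches per side
        const int n_per_side_2d_pool = n_per_side / proj_scale_factor; // 16 after 2D pooling
        printf("n_patches = %d\n", n_per_side_2d_pool * n_per_side_2d_pool); // 256, matching the old hard-coded value
        return 0;
    }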
2933
3140
 
@@ -3033,15 +3240,15 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
3033
3240
  const clip_image_f32_batch & imgs = *imgs_c_ptr;
3034
3241
  int batch_size = imgs.entries.size();
3035
3242
 
3036
- if (ctx->has_llava_projector
3037
- || ctx->proj_type == PROJECTOR_TYPE_MINICPMV
3038
- || ctx->proj_type == PROJECTOR_TYPE_GLM_EDGE) {
3039
- GGML_ASSERT(batch_size == 1);
3243
+ // TODO @ngxson : implement batch size > 1 as a loop
3244
+ // we don't need true batching support because the cgraph is going to be big anyway
3245
+ if (batch_size != 1) {
3246
+ return false; // only support batch size of 1
3040
3247
  }
3041
3248
 
3042
3249
  // build the inference graph
3043
3250
  ggml_backend_sched_reset(ctx->sched.get());
3044
- ggml_cgraph * gf = clip_image_build_graph(ctx, imgs, ctx->load_image_size, true);
3251
+ ggml_cgraph * gf = clip_image_build_graph(ctx, imgs);
3045
3252
  ggml_backend_sched_alloc_graph(ctx->sched.get(), gf);
3046
3253
 
3047
3254
  // set inputs
@@ -3053,14 +3260,14 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
3053
3260
 
3054
3261
  const int patch_size = hparams.patch_size;
3055
3262
  const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size));
3056
- const int num_positions = num_patches + (model.class_embedding ? 1 : 0);
3263
+ const int n_pos = num_patches + (model.class_embedding ? 1 : 0);
3057
3264
  const int pos_w = ctx->load_image_size.width / patch_size;
3058
3265
  const int pos_h = ctx->load_image_size.height / patch_size;
3059
3266
 
3060
3267
  const bool use_window_attn = hparams.n_wa_pattern > 0; // for qwen2.5vl
3061
3268
 
3062
3269
  auto get_inp_tensor = [&gf](const char * name) {
3063
- struct ggml_tensor * inp = ggml_graph_get_tensor(gf, name);
3270
+ ggml_tensor * inp = ggml_graph_get_tensor(gf, name);
3064
3271
  if (inp == nullptr) {
3065
3272
  GGML_ABORT("Failed to get tensor %s", name);
3066
3273
  }
@@ -3169,7 +3376,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
3169
3376
  const int merge_ratio = 2;
3170
3377
  const int pw = image_size_width / patch_size;
3171
3378
  const int ph = image_size_height / patch_size;
3172
- std::vector<int> positions(num_positions * 4);
3379
+ std::vector<int> positions(n_pos * 4);
3173
3380
  int ptr = 0;
3174
3381
  for (int y = 0; y < ph; y += merge_ratio) {
3175
3382
  for (int x = 0; x < pw; x += merge_ratio) {
@@ -3246,7 +3453,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
3246
3453
  }
3247
3454
 
3248
3455
  const int mpow = merge_ratio * merge_ratio;
3249
- std::vector<int> positions(num_positions * 4);
3456
+ std::vector<int> positions(n_pos * 4);
3250
3457
 
3251
3458
  int ptr = 0;
3252
3459
  for (int y = 0; y < iph; y += merge_ratio) {
@@ -3272,14 +3479,14 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
3272
3479
  {
3273
3480
  // set the 2D positions
3274
3481
  int n_patches_per_col = image_size_width / patch_size;
3275
- std::vector<int> pos_data(num_positions);
3482
+ std::vector<int> pos_data(n_pos);
3276
3483
  // dimension H
3277
- for (int i = 0; i < num_positions; i++) {
3484
+ for (int i = 0; i < n_pos; i++) {
3278
3485
  pos_data[i] = i / n_patches_per_col;
3279
3486
  }
3280
3487
  set_input_i32("pos_h", pos_data);
3281
3488
  // dimension W
3282
- for (int i = 0; i < num_positions; i++) {
3489
+ for (int i = 0; i < n_pos; i++) {
3283
3490
  pos_data[i] = i % n_patches_per_col;
3284
3491
  }
3285
3492
  set_input_i32("pos_w", pos_data);
@@ -3287,8 +3494,8 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
3287
3494
  case PROJECTOR_TYPE_GLM_EDGE:
3288
3495
  {
3289
3496
  // llava and other models
3290
- std::vector<int32_t> positions(num_positions);
3291
- for (int i = 0; i < num_positions; i++) {
3497
+ std::vector<int32_t> positions(n_pos);
3498
+ for (int i = 0; i < n_pos; i++) {
3292
3499
  positions[i] = i;
3293
3500
  }
3294
3501
  set_input_i32("positions", positions);
@@ -3299,8 +3506,8 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
3299
3506
  case PROJECTOR_TYPE_LDPV2:
3300
3507
  {
3301
3508
  // llava and other models
3302
- std::vector<int32_t> positions(num_positions);
3303
- for (int i = 0; i < num_positions; i++) {
3509
+ std::vector<int32_t> positions(n_pos);
3510
+ for (int i = 0; i < n_pos; i++) {
3304
3511
  positions[i] = i;
3305
3512
  }
3306
3513
  set_input_i32("positions", positions);
@@ -3317,6 +3524,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
3317
3524
  } break;
3318
3525
  case PROJECTOR_TYPE_GEMMA3:
3319
3526
  case PROJECTOR_TYPE_IDEFICS3:
3527
+ case PROJECTOR_TYPE_INTERNVL:
3320
3528
  {
3321
3529
  // do nothing
3322
3530
  } break;
@@ -3324,7 +3532,15 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
3324
3532
  GGML_ABORT("Unknown projector type");
3325
3533
  }
3326
3534
 
3327
- ggml_backend_cpu_set_n_threads(ctx->backend_cpu, n_threads);
3535
+ // ggml_backend_cpu_set_n_threads(ctx->backend_cpu, n_threads);
3536
+ ggml_backend_dev_t dev = ggml_backend_get_device(ctx->backend_cpu);
3537
+ ggml_backend_reg_t reg = dev ? ggml_backend_dev_backend_reg(dev) : nullptr;
3538
+ if (reg) {
3539
+ auto ggml_backend_set_n_threads_fn = (ggml_backend_set_n_threads_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_n_threads");
3540
+ if (ggml_backend_set_n_threads_fn) {
3541
+ ggml_backend_set_n_threads_fn(ctx->backend_cpu, n_threads);
3542
+ }
3543
+ }
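The thread-count plumbing above replaces the CPU-specific ggml_backend_cpu_set_n_threads call with a lookup through the backend registry, so any backend that exposes a "ggml_backend_set_n_threads" entry point (e.g. the CPU backend) can honour n_threads, while others are simply skipped. A small free-standing helper equivalent to that lookup (sketch; the helper name is ours):

    static void set_backend_n_threads(ggml_backend_t backend, int n_threads) {
        ggml_backend_dev_t dev = ggml_backend_get_device(backend);
        ggml_backend_reg_t reg = dev ? ggml_backend_dev_backend_reg(dev) : nullptr;
        if (!reg) {
            return; // backend has no registry entry, nothing to configure
        }
        auto set_n_threads_fn = (ggml_backend_set_n_threads_t)
            ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_n_threads");
        if (set_n_threads_fn) {
            // only present for backends that expose a thread-count knob
            set_n_threads_fn(backend, n_threads);
        }
    }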
3328
3544
 
3329
3545
  auto status = ggml_backend_sched_graph_compute(ctx->sched.get(), gf);
3330
3546
  if (status != GGML_STATUS_SUCCESS) {
@@ -3333,145 +3549,18 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
3333
3549
  }
3334
3550
 
3335
3551
  // the last node is the embedding tensor
3336
- struct ggml_tensor * embeddings = ggml_graph_node(gf, -1);
3552
+ ggml_tensor * embeddings = ggml_graph_node(gf, -1);
3337
3553
 
3338
- // copy the embeddings to the location passed by the user
3339
- ggml_backend_tensor_get(embeddings, vec, 0, ggml_nbytes(embeddings));
3340
-
3341
- return true;
3342
- }
3343
-
3344
- bool clip_model_quantize(const char * fname_inp, const char * fname_out, const int itype) {
3345
- assert(itype < GGML_TYPE_COUNT);
3346
- ggml_type type = static_cast<ggml_type>(itype);
3347
-
3348
- auto * ctx_clip = clip_init(fname_inp, clip_context_params{
3349
- /* use_gpu */ false,
3350
- /* verbosity */ GGML_LOG_LEVEL_ERROR,
3351
- });
3352
-
3353
- const auto & ctx_src = ctx_clip->ctx_gguf.get();
3354
- const auto & ctx_data = ctx_clip->ctx_data.get();
3355
-
3356
- auto * ctx_out = gguf_init_empty();
3357
- gguf_set_kv(ctx_out, ctx_src);
3358
- gguf_set_val_u32(ctx_out, "general.quantization_version", GGML_QNT_VERSION);
3359
- gguf_set_val_u32(ctx_out, "general.file_type", itype);
3360
-
3361
- auto fout = std::ofstream(fname_out, std::ios::binary);
3362
-
3363
- const int n_tensors = gguf_get_n_tensors(ctx_src);
3364
-
3365
- for (int i = 0; i < n_tensors; ++i) {
3366
- const char * name = gguf_get_tensor_name(ctx_src, i);
3367
- struct ggml_tensor * cur = ggml_get_tensor(ctx_data, name);
3368
- gguf_add_tensor(ctx_out, cur);
3554
+ // sanity check (only support batch size of 1 for now)
3555
+ const int n_tokens_out = embeddings->ne[1];
3556
+ const int expected_n_tokens_out = clip_n_output_tokens(ctx, imgs.entries[0].get());
3557
+ if (n_tokens_out != expected_n_tokens_out) {
3558
+ LOG_ERR("%s: expected %d tokens, got %d\n", __func__, expected_n_tokens_out, n_tokens_out);
3559
+ GGML_ABORT("Invalid number of output tokens");
3369
3560
  }
3370
3561
 
3371
- const size_t meta_size = gguf_get_meta_size(ctx_out);
3372
- for (size_t i = 0; i < meta_size; ++i) {
3373
- fout.put(0);
3374
- }
3375
-
3376
- // regexes of tensor names to be quantized
3377
- const std::vector<std::string> k_names = {
3378
- ".*weight",
3379
- };
3380
-
3381
- std::vector<uint8_t> work(512);
3382
- std::vector<float> conv_buf(512);
3383
- size_t total_size_org = 0;
3384
- size_t total_size_new = 0;
3385
-
3386
- for (int i = 0; i < n_tensors; ++i) {
3387
- const std::string name = gguf_get_tensor_name(ctx_src, i);
3388
- struct ggml_tensor * cur = ggml_get_tensor(ctx_data, name.c_str());
3389
-
3390
- enum ggml_type new_type;
3391
- void * new_data;
3392
- size_t new_size;
3393
-
3394
- bool quantize = false;
3395
- for (const auto & s : k_names) {
3396
- if (std::regex_match(name, std::regex(s))) {
3397
- quantize = true;
3398
- break;
3399
- }
3400
- }
3401
-
3402
- // quantize only 2D tensors and bigger than block size
3403
- quantize &= (ggml_n_dims(cur) == 2) && cur->ne[0] > ggml_blck_size(type);
3404
-
3405
- if (quantize) {
3406
- new_type = type;
3407
- if (new_type >= GGML_TYPE_Q2_K && name.find("embd") != std::string::npos) {
3408
- new_type = GGML_TYPE_Q8_0; // ggml_get_rows needs non K type
3409
- // LOG_ERR("%s: quantizing %s to %s\n", __func__, name.c_str(), ggml_type_name(new_type));
3410
- }
3411
- const size_t n_elms = ggml_nelements(cur);
3412
- float * f32_data;
3413
-
3414
- switch (cur->type) {
3415
- case GGML_TYPE_F32:
3416
- f32_data = (float *)cur->data;
3417
- break;
3418
- case GGML_TYPE_F16:
3419
- if (conv_buf.size() < n_elms) {
3420
- conv_buf.resize(n_elms);
3421
- }
3422
- for (size_t j = 0; j < n_elms; ++j) {
3423
- conv_buf[j] = ggml_fp16_to_fp32(((ggml_fp16_t *)cur->data)[j]);
3424
- }
3425
- f32_data = (float *)conv_buf.data();
3426
- break;
3427
- default:
3428
- LOG_ERR("%s: Please use an input file in f32 or f16\n", __func__);
3429
- gguf_free(ctx_out);
3430
- return false;
3431
- }
3432
-
3433
- if (work.size() < n_elms * 4) {
3434
- work.resize(n_elms * 4);
3435
- }
3436
- new_data = work.data();
3437
-
3438
- new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, n_elms/cur->ne[0], cur->ne[0], nullptr);
3439
- } else {
3440
- new_type = cur->type;
3441
- new_data = cur->data;
3442
- new_size = ggml_nbytes(cur);
3443
- }
3444
- const size_t orig_size = ggml_nbytes(cur);
3445
- total_size_org += orig_size;
3446
- total_size_new += new_size;
3447
- gguf_set_tensor_type(ctx_out, name.c_str(), new_type);
3448
- GGML_ASSERT(gguf_get_tensor_size(ctx_out, gguf_find_tensor(ctx_out, name.c_str())) == new_size);
3449
- gguf_set_tensor_data(ctx_out, name.c_str(), new_data);
3450
- fout.write((const char *)new_data, new_size);
3451
- size_t pad = GGML_PAD(new_size, gguf_get_alignment(ctx_out)) - new_size;
3452
- for (size_t j = 0; j < pad; ++j) {
3453
- fout.put(0);
3454
- }
3455
-
3456
- LOG_INF("%s: n_dims = %d | quantize=%d | size = %f MB -> %f MB\n", name.c_str(), ggml_n_dims(cur), quantize,
3457
- orig_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
3458
- }
3459
-
3460
- // go back to beginning of file and write the updated metadata
3461
- fout.seekp(0, std::ios::beg);
3462
- std::vector<uint8_t> meta(meta_size);
3463
- gguf_get_meta_data(ctx_out, meta.data());
3464
- fout.write((const char *)meta.data(), meta_size);
3465
-
3466
- fout.close();
3467
-
3468
- clip_free(ctx_clip);
3469
- gguf_free(ctx_out);
3470
-
3471
- {
3472
- LOG_INF("%s: original size = %8.2f MB\n", __func__, total_size_org / 1024.0 / 1024.0);
3473
- LOG_INF("%s: quantized size = %8.2f MB\n", __func__, total_size_new / 1024.0 / 1024.0);
3474
- }
3562
+ // copy the embeddings to the location passed by the user
3563
+ ggml_backend_tensor_get(embeddings, vec, 0, ggml_nbytes(embeddings));
3475
3564
 
3476
3565
  return true;
3477
3566
  }
@@ -3484,7 +3573,7 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
3484
3573
  return ctx->vision_model.mm_model_peg_0_b->ne[0];
3485
3574
  case PROJECTOR_TYPE_MLP:
3486
3575
  case PROJECTOR_TYPE_PIXTRAL:
3487
- return ctx->vision_model.mm_2_b->ne[0];
3576
+ return ctx->vision_model.mm_2_w->ne[1];
3488
3577
  case PROJECTOR_TYPE_MLP_NORM:
3489
3578
  return ctx->vision_model.mm_3_b->ne[0];
3490
3579
  case PROJECTOR_TYPE_MINICPMV:
@@ -3505,6 +3594,8 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
3505
3594
  return ctx->vision_model.mm_input_proj_w->ne[0];
3506
3595
  case PROJECTOR_TYPE_IDEFICS3:
3507
3596
  return ctx->vision_model.projection->ne[1];
3597
+ case PROJECTOR_TYPE_INTERNVL:
3598
+ return ctx->vision_model.mm_3_w->ne[1];
3508
3599
  default:
3509
3600
  GGML_ABORT("Unknown projector type");
3510
3601
  }