@fugood/llama.node 0.3.0 → 0.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (187) hide show
  1. package/CMakeLists.txt +1 -10
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  8. package/bin/win32/arm64/llama-node.node +0 -0
  9. package/bin/win32/arm64/node.lib +0 -0
  10. package/bin/win32/x64/llama-node.node +0 -0
  11. package/bin/win32/x64/node.lib +0 -0
  12. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  13. package/bin/win32-vulkan/arm64/node.lib +0 -0
  14. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/x64/node.lib +0 -0
  16. package/package.json +6 -4
  17. package/src/LlamaCompletionWorker.cpp +6 -6
  18. package/src/LlamaContext.cpp +7 -9
  19. package/src/common.hpp +2 -1
  20. package/src/llama.cpp/.github/workflows/build.yml +98 -24
  21. package/src/llama.cpp/.github/workflows/close-issue.yml +5 -0
  22. package/src/llama.cpp/.github/workflows/docker.yml +43 -34
  23. package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +7 -0
  24. package/src/llama.cpp/.github/workflows/nix-ci.yml +7 -0
  25. package/src/llama.cpp/.github/workflows/python-check-requirements.yml +2 -4
  26. package/src/llama.cpp/.github/workflows/python-type-check.yml +3 -1
  27. package/src/llama.cpp/.github/workflows/server.yml +7 -0
  28. package/src/llama.cpp/CMakeLists.txt +20 -8
  29. package/src/llama.cpp/common/CMakeLists.txt +12 -10
  30. package/src/llama.cpp/common/arg.cpp +2006 -0
  31. package/src/llama.cpp/common/arg.h +77 -0
  32. package/src/llama.cpp/common/common.cpp +496 -1632
  33. package/src/llama.cpp/common/common.h +161 -63
  34. package/src/llama.cpp/common/console.cpp +3 -0
  35. package/src/llama.cpp/common/log.cpp +401 -0
  36. package/src/llama.cpp/common/log.h +66 -698
  37. package/src/llama.cpp/common/ngram-cache.cpp +3 -0
  38. package/src/llama.cpp/common/sampling.cpp +348 -350
  39. package/src/llama.cpp/common/sampling.h +62 -139
  40. package/src/llama.cpp/common/stb_image.h +5990 -6398
  41. package/src/llama.cpp/common/train.cpp +2 -0
  42. package/src/llama.cpp/docs/build.md +36 -1
  43. package/src/llama.cpp/examples/CMakeLists.txt +0 -1
  44. package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +1 -2
  45. package/src/llama.cpp/examples/batched/batched.cpp +39 -55
  46. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +34 -44
  47. package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +55 -52
  48. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +15 -15
  49. package/src/llama.cpp/examples/cvector-generator/pca.hpp +3 -13
  50. package/src/llama.cpp/examples/embedding/embedding.cpp +143 -87
  51. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +33 -33
  52. package/src/llama.cpp/examples/export-lora/export-lora.cpp +36 -35
  53. package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +14 -39
  54. package/src/llama.cpp/examples/gen-docs/CMakeLists.txt +5 -0
  55. package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +83 -0
  56. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +58 -39
  57. package/src/llama.cpp/examples/gritlm/gritlm.cpp +34 -27
  58. package/src/llama.cpp/examples/imatrix/imatrix.cpp +59 -62
  59. package/src/llama.cpp/examples/infill/infill.cpp +117 -132
  60. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +265 -58
  61. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +29 -22
  62. package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
  63. package/src/llama.cpp/examples/llava/clip.cpp +685 -150
  64. package/src/llama.cpp/examples/llava/clip.h +11 -2
  65. package/src/llama.cpp/examples/llava/llava-cli.cpp +47 -58
  66. package/src/llama.cpp/examples/llava/llava.cpp +110 -24
  67. package/src/llama.cpp/examples/llava/llava.h +2 -3
  68. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +323 -0
  69. package/src/llama.cpp/examples/llava/requirements.txt +1 -0
  70. package/src/llama.cpp/examples/lookahead/lookahead.cpp +42 -43
  71. package/src/llama.cpp/examples/lookup/lookup-create.cpp +10 -8
  72. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +23 -22
  73. package/src/llama.cpp/examples/lookup/lookup.cpp +40 -43
  74. package/src/llama.cpp/examples/main/main.cpp +210 -262
  75. package/src/llama.cpp/examples/parallel/parallel.cpp +49 -49
  76. package/src/llama.cpp/examples/passkey/passkey.cpp +42 -50
  77. package/src/llama.cpp/examples/perplexity/perplexity.cpp +187 -200
  78. package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
  79. package/src/llama.cpp/examples/quantize/quantize.cpp +27 -9
  80. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +2 -3
  81. package/src/llama.cpp/examples/retrieval/retrieval.cpp +49 -44
  82. package/src/llama.cpp/examples/rpc/rpc-server.cpp +24 -1
  83. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +32 -35
  84. package/src/llama.cpp/examples/server/CMakeLists.txt +3 -5
  85. package/src/llama.cpp/examples/server/server.cpp +1027 -1073
  86. package/src/llama.cpp/examples/server/tests/requirements.txt +2 -1
  87. package/src/llama.cpp/examples/server/utils.hpp +107 -105
  88. package/src/llama.cpp/examples/simple/simple.cpp +35 -41
  89. package/src/llama.cpp/examples/speculative/speculative.cpp +129 -103
  90. package/src/llama.cpp/examples/sycl/run-llama2.sh +10 -19
  91. package/src/llama.cpp/examples/sycl/win-run-llama2.bat +1 -1
  92. package/src/llama.cpp/examples/tokenize/tokenize.cpp +25 -27
  93. package/src/llama.cpp/ggml/CMakeLists.txt +14 -3
  94. package/src/llama.cpp/ggml/include/ggml-alloc.h +3 -3
  95. package/src/llama.cpp/ggml/include/ggml-backend.h +145 -60
  96. package/src/llama.cpp/ggml/include/ggml-blas.h +3 -3
  97. package/src/llama.cpp/ggml/include/ggml-cann.h +15 -19
  98. package/src/llama.cpp/ggml/include/ggml-cuda.h +16 -16
  99. package/src/llama.cpp/ggml/include/ggml-metal.h +5 -8
  100. package/src/llama.cpp/ggml/include/ggml-rpc.h +5 -5
  101. package/src/llama.cpp/ggml/include/ggml-sycl.h +8 -8
  102. package/src/llama.cpp/ggml/include/ggml-vulkan.h +7 -7
  103. package/src/llama.cpp/ggml/include/ggml.h +293 -186
  104. package/src/llama.cpp/ggml/src/CMakeLists.txt +86 -44
  105. package/src/llama.cpp/ggml/src/ggml-aarch64.c +2135 -1119
  106. package/src/llama.cpp/ggml/src/ggml-alloc.c +6 -0
  107. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +152 -70
  108. package/src/llama.cpp/ggml/src/{ggml-backend.c → ggml-backend.cpp} +606 -286
  109. package/src/llama.cpp/ggml/src/ggml-blas.cpp +9 -10
  110. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +4 -27
  111. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +32 -4
  112. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +179 -41
  113. package/src/llama.cpp/ggml/src/ggml-cann/common.h +1 -0
  114. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -1
  115. package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +2 -0
  116. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +278 -0
  117. package/src/llama.cpp/ggml/src/ggml-cann.cpp +215 -216
  118. package/src/llama.cpp/ggml/src/ggml-common.h +20 -0
  119. package/src/llama.cpp/ggml/src/ggml-cpu-impl.h +614 -0
  120. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +14 -0
  121. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +178 -0
  122. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +134 -0
  123. package/src/llama.cpp/ggml/src/ggml-impl.h +49 -603
  124. package/src/llama.cpp/ggml/src/ggml-kompute.cpp +4 -24
  125. package/src/llama.cpp/ggml/src/ggml-quants.c +972 -92
  126. package/src/llama.cpp/ggml/src/ggml-quants.h +15 -0
  127. package/src/llama.cpp/ggml/src/ggml-rpc.cpp +116 -66
  128. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +3 -0
  129. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +11 -0
  130. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +52 -0
  131. package/src/llama.cpp/ggml/src/ggml-sycl/conv.cpp +99 -0
  132. package/src/llama.cpp/ggml/src/ggml-sycl/conv.hpp +21 -0
  133. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +57 -57
  134. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +1 -1
  135. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +106 -106
  136. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +4 -4
  137. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +16 -3
  138. package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +101 -0
  139. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +125 -0
  140. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +23 -0
  141. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +1 -1
  142. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +6 -3
  143. package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +2 -0
  144. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +1 -1
  145. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +71 -0
  146. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.hpp +21 -0
  147. package/src/llama.cpp/ggml/src/ggml-sycl.cpp +97 -169
  148. package/src/llama.cpp/ggml/src/ggml-vulkan.cpp +1508 -1124
  149. package/src/llama.cpp/ggml/src/ggml.c +3001 -1647
  150. package/src/llama.cpp/ggml/src/llamafile/sgemm.cpp +192 -0
  151. package/src/llama.cpp/ggml/src/vulkan-shaders/CMakeLists.txt +2 -0
  152. package/src/llama.cpp/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp +88 -40
  153. package/src/llama.cpp/include/llama.h +241 -264
  154. package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.inp +112 -0
  155. package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.out +46 -0
  156. package/src/llama.cpp/requirements/requirements-convert_legacy_llama.txt +1 -1
  157. package/src/llama.cpp/src/llama-grammar.cpp +721 -122
  158. package/src/llama.cpp/src/llama-grammar.h +120 -15
  159. package/src/llama.cpp/src/llama-impl.h +156 -1
  160. package/src/llama.cpp/src/llama-sampling.cpp +1375 -303
  161. package/src/llama.cpp/src/llama-sampling.h +20 -47
  162. package/src/llama.cpp/src/llama-vocab.cpp +343 -120
  163. package/src/llama.cpp/src/llama-vocab.h +33 -17
  164. package/src/llama.cpp/src/llama.cpp +4247 -1525
  165. package/src/llama.cpp/src/unicode-data.cpp +6 -4
  166. package/src/llama.cpp/src/unicode-data.h +4 -4
  167. package/src/llama.cpp/src/unicode.cpp +15 -7
  168. package/src/llama.cpp/tests/CMakeLists.txt +3 -0
  169. package/src/llama.cpp/tests/test-arg-parser.cpp +131 -0
  170. package/src/llama.cpp/tests/test-backend-ops.cpp +1592 -289
  171. package/src/llama.cpp/tests/test-barrier.cpp +93 -0
  172. package/src/llama.cpp/tests/test-grad0.cpp +187 -70
  173. package/src/llama.cpp/tests/test-grammar-integration.cpp +23 -38
  174. package/src/llama.cpp/tests/test-grammar-parser.cpp +6 -4
  175. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +6 -4
  176. package/src/llama.cpp/tests/test-llama-grammar.cpp +9 -8
  177. package/src/llama.cpp/tests/test-log.cpp +39 -0
  178. package/src/llama.cpp/tests/test-quantize-fns.cpp +6 -0
  179. package/src/llama.cpp/tests/test-rope.cpp +1 -1
  180. package/src/llama.cpp/tests/test-sampling.cpp +157 -98
  181. package/src/llama.cpp/tests/test-tokenizer-0.cpp +55 -35
  182. package/patches/llama.patch +0 -22
  183. package/src/llama.cpp/.github/workflows/bench.yml +0 -310
  184. package/src/llama.cpp/common/grammar-parser.cpp +0 -536
  185. package/src/llama.cpp/common/grammar-parser.h +0 -29
  186. package/src/llama.cpp/examples/benchmark/CMakeLists.txt +0 -6
  187. package/src/llama.cpp/examples/benchmark/benchmark-matmult.cpp +0 -275
@@ -3,7 +3,6 @@
3
3
  // I'll gradually clean and extend it
4
4
  // Note: Even when using identical normalized image inputs (see normalize_image_u8_to_f32()) we have a significant difference in resulting embeddings compared to pytorch
5
5
  #include "clip.h"
6
- #include "log.h"
7
6
  #include "ggml.h"
8
7
  #include "ggml-alloc.h"
9
8
  #include "ggml-backend.h"
@@ -20,6 +19,10 @@
20
19
  #include "ggml-cann.h"
21
20
  #endif
22
21
 
22
+ #ifdef GGML_USE_VULKAN
23
+ #include "ggml-vulkan.h"
24
+ #endif
25
+
23
26
  #define STB_IMAGE_IMPLEMENTATION
24
27
  #include "stb_image.h"
25
28
 
@@ -36,6 +39,11 @@
36
39
  #include <cinttypes>
37
40
  #include <limits>
38
41
 
42
+ #define LOG_INF(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
43
+ #define LOG_WRN(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
44
+ #define LOG_ERR(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
45
+ #define LOG_DBG(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
46
+
39
47
  //#define CLIP_DEBUG_FUNCTIONS
40
48
 
41
49
  // RGB uint8 image
@@ -74,26 +82,28 @@ static std::string format(const char * fmt, ...) {
74
82
  // key constants
75
83
  //
76
84
 
77
- #define KEY_FTYPE "general.file_type"
78
- #define KEY_NAME "general.name"
79
- #define KEY_DESCRIPTION "general.description"
80
- #define KEY_HAS_TEXT_ENC "clip.has_text_encoder"
81
- #define KEY_HAS_VIS_ENC "clip.has_vision_encoder"
82
- #define KEY_HAS_LLAVA_PROJ "clip.has_llava_projector"
83
- #define KEY_USE_GELU "clip.use_gelu"
84
- #define KEY_N_EMBD "clip.%s.embedding_length"
85
- #define KEY_N_FF "clip.%s.feed_forward_length"
86
- #define KEY_N_BLOCK "clip.%s.block_count"
87
- #define KEY_N_HEAD "clip.%s.attention.head_count"
88
- #define KEY_LAYER_NORM_EPS "clip.%s.attention.layer_norm_epsilon"
89
- #define KEY_PROJ_DIM "clip.%s.projection_dim"
90
- #define KEY_TOKENS "tokenizer.ggml.tokens"
91
- #define KEY_N_POSITIONS "clip.text.context_length"
92
- #define KEY_IMAGE_SIZE "clip.vision.image_size"
93
- #define KEY_PATCH_SIZE "clip.vision.patch_size"
94
- #define KEY_IMAGE_MEAN "clip.vision.image_mean"
95
- #define KEY_IMAGE_STD "clip.vision.image_std"
96
- #define KEY_PROJ_TYPE "clip.projector_type"
85
+ #define KEY_FTYPE "general.file_type"
86
+ #define KEY_NAME "general.name"
87
+ #define KEY_DESCRIPTION "general.description"
88
+ #define KEY_HAS_TEXT_ENC "clip.has_text_encoder"
89
+ #define KEY_HAS_VIS_ENC "clip.has_vision_encoder"
90
+ #define KEY_HAS_LLAVA_PROJ "clip.has_llava_projector"
91
+ #define KEY_HAS_MINICPMV_PROJ "clip.has_minicpmv_projector"
92
+ #define KEY_MINICPMV_VERSION "clip.minicpmv_version"
93
+ #define KEY_USE_GELU "clip.use_gelu"
94
+ #define KEY_N_EMBD "clip.%s.embedding_length"
95
+ #define KEY_N_FF "clip.%s.feed_forward_length"
96
+ #define KEY_N_BLOCK "clip.%s.block_count"
97
+ #define KEY_N_HEAD "clip.%s.attention.head_count"
98
+ #define KEY_LAYER_NORM_EPS "clip.%s.attention.layer_norm_epsilon"
99
+ #define KEY_PROJ_DIM "clip.%s.projection_dim"
100
+ #define KEY_TOKENS "tokenizer.ggml.tokens"
101
+ #define KEY_N_POSITIONS "clip.text.context_length"
102
+ #define KEY_IMAGE_SIZE "clip.vision.image_size"
103
+ #define KEY_PATCH_SIZE "clip.vision.patch_size"
104
+ #define KEY_IMAGE_MEAN "clip.vision.image_mean"
105
+ #define KEY_IMAGE_STD "clip.vision.image_std"
106
+ #define KEY_PROJ_TYPE "clip.projector_type"
97
107
 
98
108
  #define KEY_MM_PATCH_MERGE_TYPE "clip.vision.mm_patch_merge_type"
99
109
  #define KEY_IMAGE_GRID_PINPOINTS "clip.vision.image_grid_pinpoints"
@@ -127,12 +137,20 @@ static std::string format(const char * fmt, ...) {
127
137
  #define TN_MVLM_PROJ_PEG "mm.model.peg.%d.%s"
128
138
  #define TN_IMAGE_NEWLINE "model.image_newline"
129
139
 
140
+ #define TN_MINICPMV_POS_EMBD_K "resampler.pos_embed_k"
141
+ #define TN_MINICPMV_QUERY "resampler.query"
142
+ #define TN_MINICPMV_PROJ "resampler.proj.weight"
143
+ #define TN_MINICPMV_KV_PROJ "resampler.kv.weight"
144
+ #define TN_MINICPMV_ATTN "resampler.attn.%s.%s"
145
+ #define TN_MINICPMV_LN "resampler.ln_%s.%s"
146
+
130
147
 
131
148
  enum projector_type {
132
149
  PROJECTOR_TYPE_MLP,
133
150
  PROJECTOR_TYPE_MLP_NORM,
134
151
  PROJECTOR_TYPE_LDP,
135
152
  PROJECTOR_TYPE_LDPV2,
153
+ PROJECTOR_TYPE_RESAMPLER,
136
154
  PROJECTOR_TYPE_UNKNOWN,
137
155
  };
138
156
 
@@ -140,6 +158,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
140
158
  { PROJECTOR_TYPE_MLP, "mlp" },
141
159
  { PROJECTOR_TYPE_LDP, "ldp" },
142
160
  { PROJECTOR_TYPE_LDPV2, "ldpv2"},
161
+ { PROJECTOR_TYPE_RESAMPLER, "resampler"},
143
162
  };
144
163
 
145
164
 
@@ -150,7 +169,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
150
169
  static int get_key_idx(const gguf_context * ctx, const char * key) {
151
170
  int i = gguf_find_key(ctx, key);
152
171
  if (i == -1) {
153
- LOG_TEE("key %s not found in file\n", key);
172
+ LOG_ERR("key %s not found in file\n", key);
154
173
  throw std::runtime_error(format("Missing required key: %s", key));
155
174
  }
156
175
 
@@ -200,17 +219,20 @@ static std::string gguf_data_to_str(enum gguf_type type, const void * data, int
200
219
  }
201
220
 
202
221
  static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
203
- std::string result;
204
- for (size_t pos = 0; ; pos += search.length()) {
205
- auto new_pos = s.find(search, pos);
206
- if (new_pos == std::string::npos) {
207
- result += s.substr(pos, s.size() - pos);
208
- break;
209
- }
210
- result += s.substr(pos, new_pos - pos) + replace;
211
- pos = new_pos;
222
+ if (search.empty()) {
223
+ return;
212
224
  }
213
- s = std::move(result);
225
+ std::string builder;
226
+ builder.reserve(s.length());
227
+ size_t pos = 0;
228
+ size_t last_pos = 0;
229
+ while ((pos = s.find(search, last_pos)) != std::string::npos) {
230
+ builder.append(s, last_pos, pos - last_pos);
231
+ builder.append(replace);
232
+ last_pos = pos + search.length();
233
+ }
234
+ builder.append(s, last_pos, std::string::npos);
235
+ s = std::move(builder);
214
236
  }
215
237
 
216
238
  static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) {
@@ -252,7 +274,7 @@ static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) {
252
274
 
253
275
  static void print_tensor_info(const ggml_tensor * tensor, const char * prefix = "") {
254
276
  size_t tensor_size = ggml_nbytes(tensor);
255
- LOG_TEE("%s: n_dims = %d, name = %s, tensor_size=%zu, shape:[%" PRId64 ", %" PRId64 ", %" PRId64 ", %" PRId64 "], type = %s\n",
277
+ LOG_INF("%s: n_dims = %d, name = %s, tensor_size=%zu, shape:[%" PRId64 ", %" PRId64 ", %" PRId64 ", %" PRId64 "], type = %s\n",
256
278
  prefix, ggml_n_dims(tensor), tensor->name, tensor_size,
257
279
  tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3], ggml_type_name(tensor->type));
258
280
  }
@@ -270,7 +292,7 @@ static projector_type clip_projector_type_from_string(const std::string & name)
270
292
  static void clip_image_write_image_to_ppm(const clip_image_u8& img, const std::string& filename) {
271
293
  std::ofstream file(filename, std::ios::binary);
272
294
  if (!file.is_open()) {
273
- LOG_TEE("Failed to open file for writing: %s\n", filename.c_str());
295
+ LOG_ERR("Failed to open file for writing: %s\n", filename.c_str());
274
296
  return;
275
297
  }
276
298
 
@@ -289,7 +311,7 @@ static void clip_image_write_image_to_ppm(const clip_image_u8& img, const std::s
289
311
  static void clip_image_save_to_bmp(const clip_image_u8& img, const std::string& filename) {
290
312
  std::ofstream file(filename, std::ios::binary);
291
313
  if (!file.is_open()) {
292
- LOG_TEE("Failed to open file for writing: %s\n", filename.c_str());
314
+ LOG_ERR("Failed to open file for writing: %s\n", filename.c_str());
293
315
  return;
294
316
  }
295
317
 
@@ -492,12 +514,34 @@ struct clip_vision_model {
492
514
  struct ggml_tensor * mm_model_mlp_2_b;
493
515
  struct ggml_tensor * mm_model_peg_0_w;
494
516
  struct ggml_tensor * mm_model_peg_0_b;
517
+
518
+ // MINICPMV projection
519
+ struct ggml_tensor * mm_model_pos_embed_k;
520
+ struct ggml_tensor * mm_model_query;
521
+ struct ggml_tensor * mm_model_proj;
522
+ struct ggml_tensor * mm_model_kv_proj;
523
+ struct ggml_tensor * mm_model_attn_q_w;
524
+ struct ggml_tensor * mm_model_attn_q_b;
525
+ struct ggml_tensor * mm_model_attn_k_w;
526
+ struct ggml_tensor * mm_model_attn_k_b;
527
+ struct ggml_tensor * mm_model_attn_v_w;
528
+ struct ggml_tensor * mm_model_attn_v_b;
529
+ struct ggml_tensor * mm_model_attn_o_w;
530
+ struct ggml_tensor * mm_model_attn_o_b;
531
+ struct ggml_tensor * mm_model_ln_q_w;
532
+ struct ggml_tensor * mm_model_ln_q_b;
533
+ struct ggml_tensor * mm_model_ln_kv_w;
534
+ struct ggml_tensor * mm_model_ln_kv_b;
535
+ struct ggml_tensor * mm_model_ln_post_w;
536
+ struct ggml_tensor * mm_model_ln_post_b;
495
537
  };
496
538
 
497
539
  struct clip_ctx {
498
540
  bool has_text_encoder = false;
499
541
  bool has_vision_encoder = false;
500
542
  bool has_llava_projector = false;
543
+ bool has_minicpmv_projector = false;
544
+ int minicpmv_version = 2;
501
545
 
502
546
  struct clip_vision_model vision_model;
503
547
  projector_type proj_type = PROJECTOR_TYPE_MLP;
@@ -522,31 +566,46 @@ struct clip_ctx {
522
566
 
523
567
  ggml_backend_t backend = NULL;
524
568
  ggml_gallocr_t compute_alloc = NULL;
569
+
570
+ struct clip_image_size * load_image_size;
525
571
  };
526
572
 
527
- static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch * imgs) {
573
+ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch * imgs, struct clip_image_size * load_image_size, bool is_inf = false) {
528
574
  if (!ctx->has_vision_encoder) {
529
- LOG_TEE("This gguf file seems to have no vision encoder\n");
575
+ LOG_ERR("This gguf file seems to have no vision encoder\n");
530
576
  return nullptr;
531
577
  }
532
578
 
533
579
  const auto & model = ctx->vision_model;
534
580
  const auto & hparams = model.hparams;
535
581
 
536
- const int image_size = hparams.image_size;
582
+ const int image_size = hparams.image_size;
583
+ int image_size_width = image_size;
584
+ int image_size_height = image_size;
585
+ if (ctx->has_minicpmv_projector) {
586
+ if (load_image_size == nullptr) {
587
+ load_image_size = clip_image_size_init();
588
+ }
589
+ LOG_DBG("%s: %d %d\n", __func__, load_image_size->width, load_image_size->height);
590
+ image_size_width = load_image_size->width;
591
+ image_size_height = load_image_size->height;
592
+ if (is_inf) {
593
+ image_size_width = imgs->data->nx;
594
+ image_size_height = imgs->data->ny;
595
+ }
596
+ }
537
597
  const int patch_size = hparams.patch_size;
538
- const int num_patches = ((image_size / patch_size) * (image_size / patch_size));
539
- const int num_patches_per_side = image_size / patch_size; GGML_UNUSED(num_patches_per_side);
598
+ const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size));
540
599
  const int num_positions = num_patches + (ctx->has_class_embedding ? 1 : 0);
541
600
  const int hidden_size = hparams.hidden_size;
542
601
  const int n_head = hparams.n_head;
543
602
  const int d_head = hidden_size / n_head;
544
- const int n_layer = hparams.n_layer;
603
+ int n_layer = hparams.n_layer;
545
604
  const float eps = hparams.eps;
546
605
 
547
606
  const int batch_size = imgs->size;
548
607
 
549
- if (ctx->has_llava_projector) {
608
+ if (ctx->has_llava_projector || ctx->has_minicpmv_projector) {
550
609
  GGML_ASSERT(batch_size == 1);
551
610
  }
552
611
 
@@ -559,7 +618,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
559
618
  struct ggml_context * ctx0 = ggml_init(params);
560
619
  struct ggml_cgraph * gf = ggml_new_graph(ctx0);
561
620
 
562
- struct ggml_tensor * inp_raw = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, image_size, image_size, 3, batch_size);
621
+ struct ggml_tensor * inp_raw = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, image_size_width, image_size_height, 3, batch_size);
563
622
  ggml_set_name(inp_raw, "inp_raw");
564
623
  ggml_set_input(inp_raw);
565
624
 
@@ -572,19 +631,21 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
572
631
  // inp = ggml_add(ctx0, inp, ggml_repeat(ctx0, model.patch_bias, inp));
573
632
  inp = ggml_add(ctx0, inp, model.patch_bias);
574
633
  }
575
-
576
- // concat class_embeddings and patch_embeddings
577
634
  struct ggml_tensor * embeddings = inp;
578
- if (ctx->has_class_embedding) {
579
- embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size);
580
- ggml_set_name(embeddings, "embeddings");
581
- ggml_set_input(embeddings);
582
- embeddings = ggml_acc(ctx0, embeddings, model.class_embedding,
583
- embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0);
584
- embeddings = ggml_acc(ctx0, embeddings, inp,
585
- embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]);
586
- }
635
+ struct ggml_tensor * pos_embed = nullptr;
587
636
 
637
+ if (ctx->has_llava_projector) {
638
+ // concat class_embeddings and patch_embeddings
639
+ if (ctx->has_class_embedding) {
640
+ embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size);
641
+ ggml_set_name(embeddings, "embeddings");
642
+ ggml_set_input(embeddings);
643
+ embeddings = ggml_acc(ctx0, embeddings, model.class_embedding,
644
+ embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0);
645
+ embeddings = ggml_acc(ctx0, embeddings, inp,
646
+ embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]);
647
+ }
648
+ }
588
649
 
589
650
  struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_positions);
590
651
  ggml_set_name(positions, "positions");
@@ -593,6 +654,19 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
593
654
  embeddings =
594
655
  ggml_add(ctx0, embeddings, ggml_get_rows(ctx0, model.position_embeddings, positions));
595
656
 
657
+ if (ctx->has_minicpmv_projector) {
658
+ int pos_w = image_size_width/patch_size;
659
+ int pos_h = image_size_height/patch_size;
660
+ if (ctx->minicpmv_version == 2) {
661
+ pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 4096, pos_w * pos_h, 1);
662
+ }
663
+ else if (ctx->minicpmv_version == 3) {
664
+ pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 3584, pos_w * pos_h, 1);
665
+ }
666
+ ggml_set_name(pos_embed, "pos_embed");
667
+ ggml_set_input(pos_embed);
668
+ }
669
+
596
670
  // pre-layernorm
597
671
  if (ctx->has_pre_norm) {
598
672
  embeddings = ggml_norm(ctx0, embeddings, eps);
@@ -602,6 +676,9 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
602
676
  }
603
677
 
604
678
  // loop over layers
679
+ if (ctx->has_minicpmv_projector) {
680
+ n_layer += 1;
681
+ }
605
682
  for (int il = 0; il < n_layer - 1; il++) {
606
683
  struct ggml_tensor * cur = embeddings; // embeddings = residual, cur = hidden_states
607
684
 
@@ -691,7 +768,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
691
768
  }
692
769
 
693
770
  // llava projector
694
- {
771
+ if (ctx->has_llava_projector) {
695
772
  embeddings = ggml_reshape_2d(ctx0, embeddings, embeddings->ne[0], embeddings->ne[1]);
696
773
 
697
774
  struct ggml_tensor * patches = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_patches);
@@ -712,8 +789,8 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
712
789
  embeddings = ggml_gelu(ctx0, embeddings);
713
790
  embeddings = ggml_mul_mat(ctx0, model.mm_2_w, embeddings);
714
791
  embeddings = ggml_add(ctx0, embeddings, model.mm_2_b);
715
-
716
- } else if (ctx->proj_type == PROJECTOR_TYPE_MLP_NORM) {
792
+ }
793
+ else if (ctx->proj_type == PROJECTOR_TYPE_MLP_NORM) {
717
794
  embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
718
795
  embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);
719
796
  // ggml_tensor_printf(embeddings, "mm_0_w",0,true,false);
@@ -872,6 +949,75 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
872
949
  GGML_ABORT("fatal error");
873
950
  }
874
951
  }
952
+ // minicpmv projector
953
+ else if (ctx->has_minicpmv_projector)
954
+ {
955
+ if (ctx->proj_type == PROJECTOR_TYPE_RESAMPLER) {
956
+ struct ggml_tensor * q = model.mm_model_query;
957
+ { // layernorm
958
+ q = ggml_norm(ctx0, q, eps);
959
+ q = ggml_add(ctx0, ggml_mul(ctx0, q, model.mm_model_ln_q_w), model.mm_model_ln_q_b);
960
+ }
961
+ struct ggml_tensor * v = ggml_mul_mat(ctx0, model.mm_model_kv_proj, embeddings);
962
+ { // layernorm
963
+ v = ggml_norm(ctx0, v, eps);
964
+ v = ggml_add(ctx0, ggml_mul(ctx0, v, model.mm_model_ln_kv_w), model.mm_model_ln_kv_b);
965
+ }
966
+ struct ggml_tensor * k;
967
+ { // position
968
+ // q = ggml_add(ctx0, q, model.mm_model_pos_embed);
969
+ k = ggml_add(ctx0, v, pos_embed);
970
+ }
971
+
972
+ { // attention
973
+ int hidden_size = 4096;
974
+ const int d_head = 128;
975
+ int n_head = hidden_size/d_head;
976
+ int num_query = 96;
977
+ if (ctx->minicpmv_version == 2) {
978
+ hidden_size = 4096;
979
+ n_head = hidden_size/d_head;
980
+ num_query = 96;
981
+ }
982
+ else if (ctx->minicpmv_version == 3) {
983
+ hidden_size = 3584;
984
+ n_head = hidden_size/d_head;
985
+ num_query = 64;
986
+ }
987
+
988
+ struct ggml_tensor * Q = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_q_w, q), model.mm_model_attn_q_b);
989
+ Q = ggml_scale_inplace(ctx0, Q, 1.0f / sqrt((float)d_head));
990
+ struct ggml_tensor * K = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_k_w, k), model.mm_model_attn_k_b);
991
+ struct ggml_tensor * V = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_v_w, v), model.mm_model_attn_v_b);
992
+ // permute
993
+ Q = ggml_reshape_4d(ctx0, Q, d_head, n_head, num_query, batch_size);
994
+ Q = ggml_cont(ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3));
995
+ Q = ggml_reshape_3d(ctx0, Q, d_head, num_query, n_head * batch_size);
996
+ K = ggml_reshape_4d(ctx0, K, d_head, n_head, num_positions, batch_size);
997
+ K = ggml_cont(ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3));
998
+ K = ggml_reshape_3d(ctx0, K, d_head, num_positions, n_head * batch_size);
999
+ V = ggml_reshape_4d(ctx0, V, d_head, n_head, num_positions, batch_size);
1000
+ V = ggml_cont(ctx0, ggml_permute(ctx0, V, 1, 2, 0, 3));
1001
+ V = ggml_reshape_3d(ctx0, V, num_positions, d_head, n_head * batch_size);
1002
+ struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
1003
+ KQ = ggml_soft_max_inplace(ctx0, KQ);
1004
+ struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ);
1005
+ KQV = ggml_reshape_4d(ctx0, KQV, d_head, num_query, n_head, batch_size);
1006
+ KQV = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
1007
+ KQV = ggml_cont_3d(ctx0, KQV, hidden_size, num_query, batch_size);
1008
+
1009
+ embeddings = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_o_w, KQV), model.mm_model_attn_o_b);
1010
+ }
1011
+ { // layernorm
1012
+ embeddings = ggml_norm(ctx0, embeddings, eps);
1013
+ embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_model_ln_post_w), model.mm_model_ln_post_b);
1014
+ }
1015
+ embeddings = ggml_mul_mat(ctx0, model.mm_model_proj, embeddings);
1016
+ }
1017
+ else {
1018
+ GGML_ASSERT(false);
1019
+ }
1020
+ }
875
1021
 
876
1022
  // build the graph
877
1023
  ggml_build_forward_expand(gf, embeddings);
@@ -905,21 +1051,21 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
905
1051
  const int idx_name = gguf_find_key(ctx, KEY_NAME);
906
1052
  if (idx_name != -1) { // make name optional temporarily as some of the uploaded models missing it due to a bug
907
1053
  const std::string name = gguf_get_val_str(ctx, idx_name);
908
- LOG_TEE("%s: model name: %s\n", __func__, name.c_str());
1054
+ LOG_INF("%s: model name: %s\n", __func__, name.c_str());
909
1055
  }
910
- LOG_TEE("%s: description: %s\n", __func__, description.c_str());
911
- LOG_TEE("%s: GGUF version: %d\n", __func__, gguf_get_version(ctx));
912
- LOG_TEE("%s: alignment: %zu\n", __func__, gguf_get_alignment(ctx));
913
- LOG_TEE("%s: n_tensors: %d\n", __func__, n_tensors);
914
- LOG_TEE("%s: n_kv: %d\n", __func__, n_kv);
915
- LOG_TEE("%s: ftype: %s\n", __func__, ftype_str.c_str());
916
- LOG_TEE("\n");
1056
+ LOG_INF("%s: description: %s\n", __func__, description.c_str());
1057
+ LOG_INF("%s: GGUF version: %d\n", __func__, gguf_get_version(ctx));
1058
+ LOG_INF("%s: alignment: %zu\n", __func__, gguf_get_alignment(ctx));
1059
+ LOG_INF("%s: n_tensors: %d\n", __func__, n_tensors);
1060
+ LOG_INF("%s: n_kv: %d\n", __func__, n_kv);
1061
+ LOG_INF("%s: ftype: %s\n", __func__, ftype_str.c_str());
1062
+ LOG_INF("\n");
917
1063
  }
918
1064
  const int n_tensors = gguf_get_n_tensors(ctx);
919
1065
 
920
1066
  // kv
921
1067
  const int n_kv = gguf_get_n_kv(ctx);
922
- LOG_TEE("%s: loaded meta data with %d key-value pairs and %d tensors from %s\n",
1068
+ LOG_INF("%s: loaded meta data with %d key-value pairs and %d tensors from %s\n",
923
1069
  __func__, n_kv, n_tensors, fname);
924
1070
  {
925
1071
  std::map<enum ggml_type, uint32_t> n_type;
@@ -930,7 +1076,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
930
1076
  n_type[type]++;
931
1077
  }
932
1078
 
933
- LOG_TEE("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__);
1079
+ LOG_INF("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__);
934
1080
  for (int i = 0; i < n_kv; i++) {
935
1081
  const char * name = gguf_get_key(ctx, i);
936
1082
  const enum gguf_type type = gguf_get_kv_type(ctx, i);
@@ -946,7 +1092,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
946
1092
  }
947
1093
  replace_all(value, "\n", "\\n");
948
1094
 
949
- LOG_TEE("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), value.c_str());
1095
+ LOG_INF("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), value.c_str());
950
1096
  }
951
1097
 
952
1098
  // print type counts
@@ -955,7 +1101,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
955
1101
  continue;
956
1102
  }
957
1103
 
958
- LOG_TEE("%s: - type %4s: %4d tensors\n", __func__, ggml_type_name(kv.first), kv.second);
1104
+ LOG_INF("%s: - type %4s: %4d tensors\n", __func__, ggml_type_name(kv.first), kv.second);
959
1105
  }
960
1106
  }
961
1107
 
@@ -970,13 +1116,13 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
970
1116
  size_t tensor_size = ggml_nbytes(cur);
971
1117
  model_size += tensor_size;
972
1118
  if (verbosity >= 3) {
973
- LOG_TEE("%s: tensor[%d]: n_dims = %d, name = %s, tensor_size=%zu, offset=%zu, shape:[%" PRIu64 ", %" PRIu64 ", %" PRIu64 ", %" PRIu64 "], type = %s\n",
1119
+ LOG_INF("%s: tensor[%d]: n_dims = %d, name = %s, tensor_size=%zu, offset=%zu, shape:[%" PRIu64 ", %" PRIu64 ", %" PRIu64 ", %" PRIu64 "], type = %s\n",
974
1120
  __func__, i, ggml_n_dims(cur), cur->name, tensor_size, offset, cur->ne[0], cur->ne[1], cur->ne[2], cur->ne[3], ggml_type_name(type));
975
1121
  }
976
1122
  }
977
1123
  }
978
1124
 
979
- clip_ctx * new_clip = new clip_ctx;
1125
+ clip_ctx * new_clip = new clip_ctx{};
980
1126
 
981
1127
  // update projector type
982
1128
  {
@@ -997,23 +1143,27 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
997
1143
 
998
1144
  #ifdef GGML_USE_CUDA
999
1145
  new_clip->backend = ggml_backend_cuda_init(0);
1000
- LOG_TEE("%s: CLIP using CUDA backend\n", __func__);
1146
+ LOG_INF("%s: CLIP using CUDA backend\n", __func__);
1001
1147
  #endif
1002
1148
 
1003
1149
  #ifdef GGML_USE_METAL
1004
1150
  new_clip->backend = ggml_backend_metal_init();
1005
- LOG_TEE("%s: CLIP using Metal backend\n", __func__);
1151
+ LOG_INF("%s: CLIP using Metal backend\n", __func__);
1006
1152
  #endif
1007
1153
 
1008
1154
  #ifdef GGML_USE_CANN
1009
1155
  new_clip->backend = ggml_backend_cann_init(0);
1010
- LOG_TEE("%s: CLIP using CANN backend\n", __func__);
1156
+ LOG_INF("%s: CLIP using CANN backend\n", __func__);
1011
1157
  #endif
1012
1158
 
1159
+ #ifdef GGML_USE_VULKAN
1160
+ new_clip->backend = ggml_backend_vk_init(0);
1161
+ LOG_INF("%s: CLIP using Vulkan backend\n", __func__);
1162
+ #endif
1013
1163
 
1014
1164
  if (!new_clip->backend) {
1015
1165
  new_clip->backend = ggml_backend_cpu_init();
1016
- LOG_TEE("%s: CLIP using CPU backend\n", __func__);
1166
+ LOG_INF("%s: CLIP using CPU backend\n", __func__);
1017
1167
  }
1018
1168
 
1019
1169
  // model size and capabilities
@@ -1029,7 +1179,18 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
1029
1179
  new_clip->has_llava_projector = gguf_get_val_bool(ctx, idx);
1030
1180
  }
1031
1181
 
1032
- GGML_ASSERT(new_clip->has_llava_projector); // see monatis/clip.cpp for image and/or text encoding for semantic search
1182
+ idx = gguf_find_key(ctx, KEY_HAS_MINICPMV_PROJ);
1183
+ if (idx != -1) {
1184
+ new_clip->has_minicpmv_projector = gguf_get_val_bool(ctx, idx);
1185
+ }
1186
+
1187
+ idx = gguf_find_key(ctx, KEY_MINICPMV_VERSION);
1188
+ if (idx != -1) {
1189
+ new_clip->minicpmv_version = gguf_get_val_i32(ctx, idx);
1190
+ }
1191
+
1192
+ // GGML_ASSERT(new_clip->has_llava_projector); // see monatis/clip.cpp for image and/or text encoding for semantic search
1193
+
1033
1194
  GGML_ASSERT(new_clip->has_vision_encoder);
1034
1195
  GGML_ASSERT(!new_clip->has_text_encoder);
1035
1196
 
@@ -1037,15 +1198,16 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
1037
1198
  new_clip->use_gelu = gguf_get_val_bool(ctx, idx);
1038
1199
 
1039
1200
  if (verbosity >= 1) {
1040
- LOG_TEE("%s: text_encoder: %d\n", __func__, new_clip->has_text_encoder);
1041
- LOG_TEE("%s: vision_encoder: %d\n", __func__, new_clip->has_vision_encoder);
1042
- LOG_TEE("%s: llava_projector: %d\n", __func__, new_clip->has_llava_projector);
1043
- LOG_TEE("%s: model size: %.2f MB\n", __func__, model_size / 1024.0 / 1024.0);
1044
- LOG_TEE("%s: metadata size: %.2f MB\n", __func__, ggml_get_mem_size(meta) / 1024.0 / 1024.0);
1201
+ LOG_INF("%s: text_encoder: %d\n", __func__, new_clip->has_text_encoder);
1202
+ LOG_INF("%s: vision_encoder: %d\n", __func__, new_clip->has_vision_encoder);
1203
+ LOG_INF("%s: llava_projector: %d\n", __func__, new_clip->has_llava_projector);
1204
+ LOG_INF("%s: minicpmv_projector: %d\n", __func__, new_clip->has_minicpmv_projector);
1205
+ LOG_INF("%s: model size: %.2f MB\n", __func__, model_size / 1024.0 / 1024.0);
1206
+ LOG_INF("%s: metadata size: %.2f MB\n", __func__, ggml_get_mem_size(meta) / 1024.0 / 1024.0);
1045
1207
  }
1046
1208
  }
1047
1209
 
1048
- LOG_TEE("%s: params backend buffer size = % 6.2f MB (%i tensors)\n", __func__, model_size / (1024.0 * 1024.0), n_tensors);
1210
+ LOG_INF("%s: params backend buffer size = % 6.2f MB (%i tensors)\n", __func__, model_size / (1024.0 * 1024.0), n_tensors);
1049
1211
 
1050
1212
  // load tensors
1051
1213
  {
@@ -1058,7 +1220,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
1058
1220
 
1059
1221
  new_clip->ctx_data = ggml_init(params);
1060
1222
  if (!new_clip->ctx_data) {
1061
- LOG_TEE("%s: ggml_init() failed\n", __func__);
1223
+ LOG_ERR("%s: ggml_init() failed\n", __func__);
1062
1224
  clip_free(new_clip);
1063
1225
  gguf_free(ctx);
1064
1226
  return nullptr;
@@ -1066,7 +1228,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
1066
1228
 
1067
1229
  auto fin = std::ifstream(fname, std::ios::binary);
1068
1230
  if (!fin) {
1069
- LOG_TEE("cannot open model file for loading tensors\n");
1231
+ LOG_ERR("cannot open model file for loading tensors\n");
1070
1232
  clip_free(new_clip);
1071
1233
  gguf_free(ctx);
1072
1234
  return nullptr;
@@ -1088,7 +1250,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
1088
1250
  const size_t offset = gguf_get_data_offset(ctx) + gguf_get_tensor_offset(ctx, i);
1089
1251
  fin.seekg(offset, std::ios::beg);
1090
1252
  if (!fin) {
1091
- LOG_TEE("%s: failed to seek for tensor %s\n", __func__, name);
1253
+ LOG_ERR("%s: failed to seek for tensor %s\n", __func__, name);
1092
1254
  clip_free(new_clip);
1093
1255
  gguf_free(ctx);
1094
1256
  return nullptr;
@@ -1159,23 +1321,23 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
1159
1321
  }
1160
1322
 
1161
1323
  if (verbosity >= 2) {
1162
- LOG_TEE("\n%s: vision model hparams\n", __func__);
1163
- LOG_TEE("image_size %d\n", hparams.image_size);
1164
- LOG_TEE("patch_size %d\n", hparams.patch_size);
1165
- LOG_TEE("v_hidden_size %d\n", hparams.hidden_size);
1166
- LOG_TEE("v_n_intermediate %d\n", hparams.n_intermediate);
1167
- LOG_TEE("v_projection_dim %d\n", hparams.projection_dim);
1168
- LOG_TEE("v_n_head %d\n", hparams.n_head);
1169
- LOG_TEE("v_n_layer %d\n", hparams.n_layer);
1170
- LOG_TEE("v_eps %f\n", hparams.eps);
1171
- LOG_TEE("v_image_mean %f %f %f\n", new_clip->image_mean[0], new_clip->image_mean[1], new_clip->image_mean[2]);
1172
- LOG_TEE("v_image_std %f %f %f\n", new_clip->image_std[0], new_clip->image_std[1], new_clip->image_std[2]);
1173
- LOG_TEE("v_image_grid_pinpoints: ");
1324
+ LOG_INF("\n%s: vision model hparams\n", __func__);
1325
+ LOG_INF("image_size %d\n", hparams.image_size);
1326
+ LOG_INF("patch_size %d\n", hparams.patch_size);
1327
+ LOG_INF("v_hidden_size %d\n", hparams.hidden_size);
1328
+ LOG_INF("v_n_intermediate %d\n", hparams.n_intermediate);
1329
+ LOG_INF("v_projection_dim %d\n", hparams.projection_dim);
1330
+ LOG_INF("v_n_head %d\n", hparams.n_head);
1331
+ LOG_INF("v_n_layer %d\n", hparams.n_layer);
1332
+ LOG_INF("v_eps %f\n", hparams.eps);
1333
+ LOG_INF("v_image_mean %f %f %f\n", new_clip->image_mean[0], new_clip->image_mean[1], new_clip->image_mean[2]);
1334
+ LOG_INF("v_image_std %f %f %f\n", new_clip->image_std[0], new_clip->image_std[1], new_clip->image_std[2]);
1335
+ LOG_INF("v_image_grid_pinpoints: ");
1174
1336
  for (int i = 0; i < 32 && (hparams.image_grid_pinpoints[i] != 0); ++i) {
1175
- LOG_TEE("%d ", hparams.image_grid_pinpoints[i]);
1337
+ LOG_INF("%d ", hparams.image_grid_pinpoints[i]);
1176
1338
  }
1177
- LOG_TEE("\n");
1178
- LOG_TEE("v_mm_patch_merge_type: %s\n", hparams.mm_patch_merge_type);
1339
+ LOG_INF("\n");
1340
+ LOG_INF("v_mm_patch_merge_type: %s\n", hparams.mm_patch_merge_type);
1179
1341
 
1180
1342
  }
1181
1343
 
@@ -1213,7 +1375,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
1213
1375
  vision_model.patch_embeddings = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD);
1214
1376
  vision_model.position_embeddings = get_tensor(new_clip->ctx_data, format(TN_POS_EMBD, "v"));
1215
1377
  } catch(const std::exception& /*e*/) {
1216
- LOG_TEE("%s: failed to load vision model tensors\n", __func__);
1378
+ LOG_ERR("%s: failed to load vision model tensors\n", __func__);
1217
1379
  }
1218
1380
 
1219
1381
  // LLaVA projection
@@ -1242,7 +1404,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
1242
1404
  } catch (std::runtime_error & /*e*/) { }
1243
1405
  try {
1244
1406
  vision_model.image_newline = get_tensor(new_clip->ctx_data, TN_IMAGE_NEWLINE);
1245
- // LOG_TEE("%s: image_newline tensor (llava-1.6) found\n", __func__);
1407
+ // LOG_INF("%s: image_newline tensor (llava-1.6) found\n", __func__);
1246
1408
  } catch (std::runtime_error & /*e*/) { }
1247
1409
  } else if (new_clip->proj_type == PROJECTOR_TYPE_LDP) {
1248
1410
  // MobileVLM projection
@@ -1281,6 +1443,27 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
1281
1443
  vision_model.mm_model_peg_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_PEG, 0, "weight"));
1282
1444
  vision_model.mm_model_peg_0_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_PEG, 0, "bias"));
1283
1445
  }
1446
+ else if (new_clip->proj_type == PROJECTOR_TYPE_RESAMPLER) {
1447
+ // vision_model.mm_model_pos_embed = get_tensor(new_clip->ctx_data, TN_MINICPMV_POS_EMBD);
1448
+ vision_model.mm_model_pos_embed_k = get_tensor(new_clip->ctx_data, TN_MINICPMV_POS_EMBD_K);
1449
+ vision_model.mm_model_query = get_tensor(new_clip->ctx_data, TN_MINICPMV_QUERY);
1450
+ vision_model.mm_model_proj = get_tensor(new_clip->ctx_data, TN_MINICPMV_PROJ);
1451
+ vision_model.mm_model_kv_proj = get_tensor(new_clip->ctx_data, TN_MINICPMV_KV_PROJ);
1452
+ vision_model.mm_model_attn_q_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "q", "weight"));
1453
+ vision_model.mm_model_attn_k_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "k", "weight"));
1454
+ vision_model.mm_model_attn_v_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "v", "weight"));
1455
+ vision_model.mm_model_attn_q_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "q", "bias"));
1456
+ vision_model.mm_model_attn_k_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "k", "bias"));
1457
+ vision_model.mm_model_attn_v_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "v", "bias"));
1458
+ vision_model.mm_model_attn_o_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "out", "weight"));
1459
+ vision_model.mm_model_attn_o_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "out", "bias"));
1460
+ vision_model.mm_model_ln_q_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "q", "weight"));
1461
+ vision_model.mm_model_ln_q_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "q", "bias"));
1462
+ vision_model.mm_model_ln_kv_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "kv", "weight"));
1463
+ vision_model.mm_model_ln_kv_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "kv", "bias"));
1464
+ vision_model.mm_model_ln_post_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "post", "weight"));
1465
+ vision_model.mm_model_ln_post_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "post", "bias"));
1466
+ }
1284
1467
  else {
1285
1468
  std::string proj_type = PROJECTOR_TYPE_NAMES[new_clip->proj_type];
1286
1469
  throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str()));
@@ -1319,15 +1502,26 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
1319
1502
  new_clip->compute_alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(new_clip->backend));
1320
1503
  clip_image_f32_batch batch;
1321
1504
  batch.size = 1;
1322
- ggml_cgraph * gf = clip_image_build_graph(new_clip, &batch);
1505
+ ggml_cgraph * gf = clip_image_build_graph(new_clip, &batch, nullptr, false);
1323
1506
  ggml_gallocr_reserve(new_clip->compute_alloc, gf);
1324
1507
  size_t compute_memory_buffer_size = ggml_gallocr_get_buffer_size(new_clip->compute_alloc, 0);
1325
- LOG_TEE("%s: compute allocated memory: %.2f MB\n", __func__, compute_memory_buffer_size /1024.0/1024.0);
1508
+ LOG_INF("%s: compute allocated memory: %.2f MB\n", __func__, compute_memory_buffer_size /1024.0/1024.0);
1326
1509
  }
1327
1510
 
1328
1511
  return new_clip;
1329
1512
  }
1330
1513
 
1514
+ void clip_add_load_image_size(struct clip_ctx * ctx_clip, struct clip_image_size * load_image_size) {
1515
+ ctx_clip->load_image_size = load_image_size;
1516
+ }
1517
+
1518
+ struct clip_image_size * clip_image_size_init() {
1519
+ struct clip_image_size * load_image_size = new struct clip_image_size();
1520
+ load_image_size->width = 448;
1521
+ load_image_size->height = 448;
1522
+ return load_image_size;
1523
+ }
1524
+
1331
1525
  struct clip_image_u8 * clip_image_u8_init() {
1332
1526
  return new clip_image_u8();
1333
1527
  }
@@ -1362,7 +1556,7 @@ bool clip_image_load_from_file(const char * fname, clip_image_u8 * img) {
1362
1556
  int nx, ny, nc;
1363
1557
  auto * data = stbi_load(fname, &nx, &ny, &nc, 3);
1364
1558
  if (!data) {
1365
- LOG_TEE("%s: failed to load image '%s'\n", __func__, fname);
1559
+ LOG_ERR("%s: failed to load image '%s'\n", __func__, fname);
1366
1560
  return false;
1367
1561
  }
1368
1562
  build_clip_img_from_data(data, nx, ny, img);
@@ -1374,7 +1568,7 @@ bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length
1374
1568
  int nx, ny, nc;
1375
1569
  auto * data = stbi_load_from_memory(bytes, bytes_length, &nx, &ny, &nc, 3);
1376
1570
  if (!data) {
1377
- LOG_TEE("%s: failed to decode image bytes\n", __func__);
1571
+ LOG_ERR("%s: failed to decode image bytes\n", __func__);
1378
1572
  return false;
1379
1573
  }
1380
1574
  build_clip_img_from_data(data, nx, ny, img);
@@ -1433,7 +1627,7 @@ static void normalize_image_u8_to_f32(const clip_image_u8* src, clip_image_f32*
1433
1627
  }
1434
1628
  }
1435
1629
 
1436
- inline float clip(float x, float lower, float upper) {
1630
// Clamps x into [lower, upper].
// The upper bound is applied first and the lower bound last, so when
// lower > upper the result is lower.
inline int clip(int x, int lower, int upper) {
    const int capped = (upper < x) ? upper : x;
    return (lower < capped) ? capped : lower;
}
1439
1633
 
@@ -1564,7 +1758,7 @@ static std::pair<int, int> select_best_resolution(const std::pair<int, int> & or
1564
1758
  int downscaled_height = static_cast<int>(original_height * scale);
1565
1759
  int effective_resolution = std::min(downscaled_width * downscaled_height, original_width * original_height);
1566
1760
  int wasted_resolution = (width * height) - effective_resolution;
1567
- // LOG_TEE("resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution);
1761
+ // LOG_INF("resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution);
1568
1762
  if (effective_resolution > max_effective_resolution || (effective_resolution == max_effective_resolution && wasted_resolution < min_wasted_resolution)) {
1569
1763
  max_effective_resolution = effective_resolution;
1570
1764
  min_wasted_resolution = wasted_resolution;
@@ -1598,12 +1792,185 @@ static std::vector<clip_image_u8*> divide_to_patches_u8(const clip_image_u8 & im
1598
1792
  return patches;
1599
1793
  }
1600
1794
 
1795
// Rounds `length` to the nearest multiple of `patch_size` (halves round away
// from zero, as std::round does) and never returns less than `patch_size`.
// The division is done in float; patch_size is used as a divisor, so it must
// not be zero.
static int ensure_divide(int length, int patch_size) {
    const float units   = std::round(static_cast<float>(length) / patch_size);
    const int   snapped = static_cast<int>(units * patch_size);
    return (snapped < patch_size) ? patch_size : snapped;
}
1798
+
1799
+ static std::pair<int, int> uhd_find_best_resize(std::pair<int, int> original_size, int scale_resolution, int patch_size, bool allow_upscale = false) {
1800
+ int width = original_size.first;
1801
+ int height = original_size.second;
1802
+ if ((width * height > scale_resolution * scale_resolution) || allow_upscale) {
1803
+ float r = static_cast<float>(width) / height;
1804
+ height = static_cast<int>(scale_resolution / std::sqrt(r));
1805
+ width = static_cast<int>(height * r);
1806
+ }
1807
+ int best_width = ensure_divide(width, patch_size);
1808
+ int best_height = ensure_divide(height, patch_size);
1809
+ return std::make_pair(best_width, best_height);
1810
+ }
1811
+
1812
+ static std::pair<int, int> uhd_get_refine_size(std::pair<int, int> original_size, std::pair<int, int> grid, int scale_resolution, int patch_size, bool allow_upscale = false) {
1813
+ int width, height;
1814
+ std::tie(width, height) = original_size;
1815
+ int grid_x, grid_y;
1816
+ std::tie(grid_x, grid_y) = grid;
1817
+
1818
+ int refine_width = ensure_divide(width, grid_x);
1819
+ int refine_height = ensure_divide(height, grid_y);
1820
+
1821
+ int grid_width = refine_width / grid_x;
1822
+ int grid_height = refine_height / grid_y;
1823
+
1824
+ // auto best_grid_size = find_best_resize(std::make_tuple(grid_width, grid_height), scale_resolution, patch_size, allow_upscale); (old line)
1825
+ auto best_grid_size = uhd_find_best_resize(std::make_pair(grid_width, grid_height), scale_resolution, patch_size, allow_upscale); // (new line) => fixes conversion for make_tuple to make_pair
1826
+ int best_grid_width, best_grid_height;
1827
+ std::tie(best_grid_width, best_grid_height) = best_grid_size;
1828
+
1829
+ // std::pair<int, int> refine_size = std::make_tuple(best_grid_width * grid_x, best_grid_height * grid_y); (old line)
1830
+ std::pair<int, int> refine_size = std::make_pair(best_grid_width * grid_x, best_grid_height * grid_y); // (new line)
1831
+ return refine_size;
1832
+ }
1833
+
1834
// Picks the slicing grid {columns, rows} whose aspect ratio is closest to the
// image's.
//
// Candidate slice counts are multiple-1, multiple and multiple+1, skipping a
// count of exactly 1 and anything above max_slice_nums. For each remaining
// count every factorisation cols * rows is tried, and the one minimising
// |log_ratio - log(cols / rows)| wins. Ties keep the earliest candidate.
// Returns {1, 1} when no candidate grid exists.
static std::pair<int, int> uhd_best_grid(const int max_slice_nums, const int multiple, const float log_ratio) {
    std::pair<int, int> best_grid{1, 1};
    float min_error = std::numeric_limits<float>::infinity();

    for (int slices = multiple - 1; slices <= multiple + 1; ++slices) {
        if (slices == 1 || slices > max_slice_nums) {
            continue;
        }
        // enumerate every cols x rows factorisation of this slice count
        for (int cols = 1; cols <= slices; ++cols) {
            if (slices % cols != 0) {
                continue;
            }
            const int rows = slices / cols;
            const float error = std::abs(log_ratio - std::log(1.0 * cols / rows));
            if (error < min_error) {
                best_grid = std::make_pair(cols, rows);
                min_error = error;
            }
        }
    }
    return best_grid;
}
1865
+
1866
+ // inspired from LLaVA-UHD:
1867
+ // -> https://arxiv.org/pdf/2403.11703
1868
+ // -> https://github.com/thunlp/LLaVA-UHD
1869
+ // -> https://github.com/thunlp/LLaVA-UHD/blob/302301bc2175f7e717fb8548516188e89f649753/llava_uhd/train/llava-uhd/slice_logic.py#L118
1870
+ static std::vector<std::vector<clip_image_u8 *>> uhd_slice_image(const clip_image_u8 * img, const int max_slice_nums=9, const int scale_resolution=448, const int patch_size=14) {
1871
+ const std::pair<int, int> original_size={img->nx,img->ny};
1872
+ const int original_width = img->nx;
1873
+ const int original_height = img->ny;
1874
+ const float log_ratio = log(1.0*original_width/original_height);
1875
+ const float ratio = 1.0 * original_width * original_height/ (scale_resolution * scale_resolution);
1876
+ const int multiple = fmin(ceil(ratio), max_slice_nums);
1877
+
1878
+ std::vector<std::vector<clip_image_u8 *>> images;
1879
+ LOG_INF("%s: multiple %d\n", __func__, multiple);
1880
+ images.push_back(std::vector<clip_image_u8 *>());
1881
+
1882
+ if (multiple <= 1) {
1883
+ auto best_size = uhd_find_best_resize(original_size, scale_resolution, patch_size, true);
1884
+ clip_image_u8 * source_image = clip_image_u8_init();
1885
+ bicubic_resize(*img, *source_image, best_size.first, best_size.second);
1886
+ // source_image = image.resize(best_size, Image.Resampling.BICUBIC)
1887
+ images[images.size()-1].push_back(source_image);
1888
+ }
1889
+ else if (multiple > 1) {
1890
+ auto best_size = uhd_find_best_resize(original_size, scale_resolution, patch_size);
1891
+ clip_image_u8 * source_image = clip_image_u8_init();
1892
+ bicubic_resize(*img, *source_image, best_size.first, best_size.second);
1893
+ // source_image = image.copy().resize(best_resize, Image.Resampling.BICUBIC)
1894
+ LOG_INF("%s: image_size: %d %d; source_image size: %d %d\n", __func__, img->nx, img->ny, best_size.first, best_size.second);
1895
+ images[images.size()-1].push_back(source_image);
1896
+
1897
+ std::pair<int, int> best_grid = uhd_best_grid(max_slice_nums, multiple, log_ratio);
1898
+ LOG_INF("%s: image_size: %d %d; best_grid: %d %d\n", __func__, img->nx, img->ny, best_grid.first, best_grid.second);
1899
+
1900
+ auto refine_size = uhd_get_refine_size(original_size, best_grid, scale_resolution, patch_size, true);
1901
+ clip_image_u8 * refine_image = clip_image_u8_init();
1902
+ bicubic_resize(*img, *refine_image, refine_size.first, refine_size.second);
1903
+
1904
+ LOG_INF("%s: refine_image_size: %d %d; refine_size: %d %d\n", __func__, refine_image->nx, refine_image->ny, refine_size.first, refine_size.second);
1905
+
1906
+ // split_to_patches
1907
+ int width = refine_image->nx;
1908
+ int height = refine_image->ny;
1909
+ int grid_x = int(width / best_grid.first);
1910
+ int grid_y = int(height / best_grid.second);
1911
+ for (int patches_i = 0, ic = 0; patches_i < height && ic < best_grid.second; patches_i += grid_y, ic += 1){
1912
+ images.push_back(std::vector<clip_image_u8 *>());
1913
+ for(int patches_j = 0, jc = 0; patches_j < width && jc < best_grid.first; patches_j += grid_x, jc += 1){
1914
+ clip_image_u8 * patch = clip_image_u8_init();
1915
+ patch->nx = grid_x;
1916
+ patch->ny = grid_y;
1917
+ patch->buf.resize(3 * patch->nx * patch->ny);
1918
+ for (int y = patches_i; y < patches_i + grid_y; ++y) {
1919
+ for (int x = patches_j; x < patches_j + grid_x; ++x) {
1920
+ const int i = 3 * (y * refine_image->nx + x);
1921
+ const int j = 3 * ((y-patches_i) * patch->nx + (x-patches_j));
1922
+ patch->buf[j] = refine_image->buf[i];
1923
+ patch->buf[j+1] = refine_image->buf[i+1];
1924
+ patch->buf[j+2] = refine_image->buf[i+2];
1925
+ }
1926
+ }
1927
+ images[images.size()-1].push_back(patch);
1928
+ }
1929
+ }
1930
+ }
1931
+ return images;
1932
+ }
1933
+
1934
+ int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip) {
1935
+ const int max_slice_nums=9;
1936
+ const int scale_resolution=448;
1937
+ const int original_width = ctx_clip->load_image_size->width;
1938
+ const int original_height = ctx_clip->load_image_size->height;
1939
+ const float log_ratio = log(1.0*original_width/original_height);
1940
+ const float ratio = 1.0 * original_width * original_height/ (scale_resolution * scale_resolution);
1941
+ const int multiple = fmin(ceil(ratio), max_slice_nums);
1942
+ std::pair<int, int> best_grid = uhd_best_grid(max_slice_nums, multiple, log_ratio);
1943
+ return best_grid.first;
1944
+ }
1945
+
1601
1946
  // returns the normalized float tensor for llava-1.5, for spatial_unpad with anyres processing for llava-1.6 it returns the normalized image patch tensors as a vector
1602
1947
  // res_imgs memory is being allocated here, previous allocations will be freed if found
1603
1948
  bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32_batch * res_imgs) {
1949
+
1950
+ if(clip_is_minicpmv(ctx)){
1951
+ int max_slice_nums = 9;
1952
+ std::vector<std::vector<clip_image_u8 *>> imgs = uhd_slice_image(img, max_slice_nums);
1953
+ res_imgs->size = 0;
1954
+ for (size_t i = 0; i < imgs.size(); ++i){
1955
+ res_imgs->size += imgs[i].size();
1956
+ }
1957
+ res_imgs->data = new clip_image_f32[res_imgs->size];
1958
+ int idx = 0;
1959
+ for (size_t i = 0; i < imgs.size(); ++i) {
1960
+ for (size_t j = 0; j < imgs[i].size(); ++j) {
1961
+ LOG_DBG("%s: %d %d\n", __func__,imgs[i][j]->nx,imgs[i][j]->ny);
1962
+ clip_image_f32 * res = clip_image_f32_init();
1963
+ normalize_image_u8_to_f32(imgs[i][j], res, ctx->image_mean, ctx->image_std);
1964
+ res_imgs->data[idx++] = *res;
1965
+ clip_image_f32_free(res);
1966
+ }
1967
+ }
1968
+ return true;
1969
+ }
1970
+
1604
1971
  bool pad_to_square = true;
1605
1972
  if (!ctx->has_vision_encoder) {
1606
- LOG_TEE("This gguf file seems to have no vision encoder\n");
1973
+ LOG_ERR("This gguf file seems to have no vision encoder\n");
1607
1974
  return false;
1608
1975
  }
1609
1976
  auto & params = ctx->vision_model.hparams;
@@ -1680,7 +2047,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
1680
2047
  }
1681
2048
 
1682
2049
  for (size_t i = 0; i < patches.size(); i++) {
1683
- // LOG_TEE("patch %d: %d %d\n", i, patches[i]->nx, patches[i]->ny);
2050
+ // LOG_DBG("patch %d: %d %d\n", i, patches[i]->nx, patches[i]->ny);
1684
2051
  clip_image_u8_free(patches[i]);
1685
2052
  }
1686
2053
 
@@ -1816,14 +2183,107 @@ int clip_n_patches(const struct clip_ctx * ctx) {
1816
2183
 
1817
2184
  if (ctx->proj_type == PROJECTOR_TYPE_LDP || ctx->proj_type == PROJECTOR_TYPE_LDPV2) {
1818
2185
  n_patches /= 4;
2186
+ } else if (ctx->proj_type == PROJECTOR_TYPE_RESAMPLER) {
2187
+ if (ctx->minicpmv_version == 2) {
2188
+ n_patches = 96;
2189
+ }
2190
+ else if (ctx->minicpmv_version == 3) {
2191
+ n_patches = 64;
2192
+ }
1819
2193
  }
1820
2194
 
1821
2195
  return n_patches;
1822
2196
  }
1823
2197
 
2198
// Builds a sine/cosine embedding for every entry of a 2-D grid of positions.
//
// embed_dim : size of each output embedding; asserted to be even.
// pos       : H x W grid of positions. Every row is indexed up to the length
//             of the first row, so all rows must be at least that long.
//
// Returns an H x W x embed_dim array where, for d in [0, embed_dim/2):
//   out[h][w][d]               = sin(pos[h][w] * omega[d])
//   out[h][w][d + embed_dim/2] = cos(pos[h][w] * omega[d])
// with omega[d] = 1 / 10000^(d / (embed_dim/2)).
// An empty `pos` yields an empty result.
static std::vector<std::vector<std::vector<float>>> get_1d_sincos_pos_embed_from_grid_new(int embed_dim, const std::vector<std::vector<float>> & pos) {
    assert(embed_dim % 2 == 0);
    const int half = embed_dim / 2;
    const int H = pos.size();
    // pos[0] does not exist for an empty grid; treat that as zero columns
    const int W = pos.empty() ? 0 : pos[0].size();

    std::vector<float> omega(half);
    for (int i = 0; i < half; ++i) {
        omega[i] = 1.0 / pow(10000.0, static_cast<float>(i) / half);
    }

    std::vector<std::vector<std::vector<float>>> emb(H, std::vector<std::vector<float>>(W, std::vector<float>(embed_dim)));
    for (int h = 0; h < H; ++h) {
        for (int w = 0; w < W; ++w) {
            for (int d = 0; d < half; ++d) {
                float out_value = pos[h][w] * omega[d];
                emb[h][w][d]        = sin(out_value);
                emb[h][w][d + half] = cos(out_value);
            }
        }
    }

    return emb;
}
2221
+
2222
+ static std::vector<std::vector<std::vector<float>>> get_2d_sincos_pos_embed_from_grid(int embed_dim, const std::vector<std::vector<std::vector<float>>> & grid) {
2223
+ assert(embed_dim % 2 == 0);
2224
+ std::vector<std::vector<std::vector<float>>> emb_h = get_1d_sincos_pos_embed_from_grid_new(embed_dim / 2, grid[0]); // (H, W, D/2)
2225
+ std::vector<std::vector<std::vector<float>>> emb_w = get_1d_sincos_pos_embed_from_grid_new(embed_dim / 2, grid[1]); // (H, W, D/2)
2226
+
2227
+ int H = emb_h.size();
2228
+ int W = emb_h[0].size();
2229
+ std::vector<std::vector<std::vector<float>>> emb(H, std::vector<std::vector<float>>(W, std::vector<float>(embed_dim)));
2230
+
2231
+ for (int h = 0; h < H; ++h) {
2232
+ for (int w = 0; w < W; ++w) {
2233
+ for (int d = 0; d < embed_dim / 2; ++d) {
2234
+ emb[h][w][d] = emb_h[h][w][d];
2235
+ emb[h][w][d + embed_dim / 2] = emb_w[h][w][d];
2236
+ }
2237
+ }
2238
+ }
2239
+ return emb;
2240
+ }
2241
+
2242
+ static std::vector<std::vector<float>> get_2d_sincos_pos_embed(int embed_dim, const std::pair<int, int> image_size) {
2243
+ int grid_h_size = image_size.first;
2244
+ int grid_w_size = image_size.second;
2245
+
2246
+ std::vector<float> grid_h(grid_h_size);
2247
+ std::vector<float> grid_w(grid_w_size);
2248
+
2249
+ for (int i = 0; i < grid_h_size; ++i) {
2250
+ grid_h[i] = static_cast<float>(i);
2251
+ }
2252
+ for (int i = 0; i < grid_w_size; ++i) {
2253
+ grid_w[i] = static_cast<float>(i);
2254
+ }
2255
+
2256
+ std::vector<std::vector<float>> grid(grid_h_size, std::vector<float>(grid_w_size));
2257
+ for (int h = 0; h < grid_h_size; ++h) {
2258
+ for (int w = 0; w < grid_w_size; ++w) {
2259
+ grid[h][w] = grid_w[w];
2260
+ }
2261
+ }
2262
+ std::vector<std::vector<std::vector<float>>> grid_2d = {grid, grid};
2263
+ for (int h = 0; h < grid_h_size; ++h) {
2264
+ for (int w = 0; w < grid_w_size; ++w) {
2265
+ grid_2d[0][h][w] = grid_h[h];
2266
+ grid_2d[1][h][w] = grid_w[w];
2267
+ }
2268
+ }
2269
+
2270
+ std::vector<std::vector<std::vector<float>>> pos_embed_3d = get_2d_sincos_pos_embed_from_grid(embed_dim, grid_2d);
2271
+
2272
+ int H = image_size.first;
2273
+ int W = image_size.second;
2274
+ std::vector<std::vector<float>> pos_embed_2d(H * W, std::vector<float>(embed_dim));
2275
+ for (int h = 0; h < H; ++h) {
2276
+ for (int w = 0; w < W; ++w) {
2277
+ pos_embed_2d[w * H + h] = pos_embed_3d[h][w];
2278
+ }
2279
+ }
2280
+
2281
+ return pos_embed_2d;
2282
+ }
2283
+
1824
2284
  bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f32 * img, float * vec) {
1825
2285
  if (!ctx->has_vision_encoder) {
1826
- LOG_TEE("This gguf file seems to have no vision encoder\n");
2286
+ LOG_ERR("This gguf file seems to have no vision encoder\n");
1827
2287
  return false;
1828
2288
  }
1829
2289
 
@@ -1835,7 +2295,7 @@ bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f3
1835
2295
 
1836
2296
  bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs, float * vec) {
1837
2297
  if (!ctx->has_vision_encoder) {
1838
- LOG_TEE("This gguf file seems to have no vision encoder\n");
2298
+ LOG_ERR("This gguf file seems to have no vision encoder\n");
1839
2299
  return false;
1840
2300
  }
1841
2301
 
@@ -1843,19 +2303,33 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
1843
2303
  if (ctx->has_llava_projector) {
1844
2304
  GGML_ASSERT(batch_size == 1); // TODO: support multiple images
1845
2305
  }
2306
+ if (ctx->has_minicpmv_projector) {
2307
+ GGML_ASSERT(batch_size == 1);
2308
+ }
1846
2309
 
1847
2310
  // build the inference graph
1848
- ggml_cgraph * gf = clip_image_build_graph(ctx, imgs);
2311
+ ggml_cgraph * gf = clip_image_build_graph(ctx, imgs, ctx->load_image_size, true);
1849
2312
  ggml_gallocr_alloc_graph(ctx->compute_alloc, gf);
1850
2313
 
1851
2314
  // set inputs
1852
2315
  const auto & model = ctx->vision_model;
1853
2316
  const auto & hparams = model.hparams;
1854
2317
 
1855
- const int image_size = hparams.image_size;
2318
+ const int image_size = hparams.image_size;
2319
+ int image_size_width = image_size;
2320
+ int image_size_height = image_size;
2321
+ if (ctx->has_minicpmv_projector) {
2322
+ image_size_width = imgs->data[0].nx;
2323
+ image_size_height = imgs->data[0].ny;
2324
+ }
1856
2325
  const int patch_size = hparams.patch_size;
1857
- const int num_patches = ((image_size / patch_size) * (image_size / patch_size));
2326
+ const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size));
1858
2327
  const int num_positions = num_patches + (ctx->has_class_embedding ? 1 : 0);
2328
+ if(ctx->load_image_size==nullptr){
2329
+ ctx->load_image_size= clip_image_size_init();
2330
+ }
2331
+ const int pos_w = ctx->load_image_size->width/patch_size;
2332
+ const int pos_h = ctx->load_image_size->height/patch_size;
1859
2333
 
1860
2334
  {
1861
2335
  struct ggml_tensor * inp_raw = ggml_graph_get_tensor(gf, "inp_raw");
@@ -1864,7 +2338,9 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
1864
2338
  for (size_t i = 0; i < imgs->size; i++) {
1865
2339
  const int nx = imgs->data[i].nx;
1866
2340
  const int ny = imgs->data[i].ny;
1867
- GGML_ASSERT(nx == image_size && ny == image_size);
2341
+ if (!ctx->has_minicpmv_projector) {
2342
+ GGML_ASSERT(nx == image_size && ny == image_size);
2343
+ }
1868
2344
 
1869
2345
  const int n = nx * ny;
1870
2346
 
@@ -1881,53 +2357,97 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
1881
2357
  ggml_backend_tensor_set(inp_raw, data, 0, ggml_nbytes(inp_raw));
1882
2358
  free(data);
1883
2359
  }
2360
+ if (ctx->has_minicpmv_projector) {
2361
+ {
2362
+ // inspired from siglip:
2363
+ // -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit
2364
+ // -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit/blob/d66538faeba44480d0bfaa42145eef26f9423199/modeling_siglip.py#L316
2365
+ struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions");
2366
+ int* positions_data = (int*)malloc(ggml_nbytes(positions));
2367
+ int bucket_coords_h[70];
2368
+ int bucket_coords_w[70];
2369
+ for (int i = 0; i < pos_h; i++){
2370
+ bucket_coords_h[i] = std::floor(70.0*i/pos_h);
2371
+ }
2372
+ for (int i = 0; i < pos_w; i++){
2373
+ bucket_coords_w[i] = std::floor(70.0*i/pos_w);
2374
+ }
2375
+ for (int i = 0, id = 0; i < pos_h; i++){
2376
+ for (int j = 0; j < pos_w; j++){
2377
+ positions_data[id++] = bucket_coords_h[i]*70 + bucket_coords_w[j];
2378
+ }
2379
+ }
2380
+ ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions));
2381
+ free(positions_data);
2382
+ }
1884
2383
 
1885
- {
1886
- if (ctx->has_class_embedding) {
1887
- struct ggml_tensor * embeddings = ggml_graph_get_tensor(gf, "embeddings");
2384
+ {
2385
+ // inspired from resampler of Qwen-VL:
2386
+ // -> https://huggingface.co/Qwen/Qwen-VL/tree/main
2387
+ // -> https://huggingface.co/Qwen/Qwen-VL/blob/0547ed36a86561e2e42fecec8fd0c4f6953e33c4/visual.py#L23
2388
+ struct ggml_tensor * pos_embed = ggml_graph_get_tensor(gf, "pos_embed");
2389
+ int embed_dim = 4096;
2390
+ if (ctx->minicpmv_version == 2) {
2391
+ embed_dim = 4096;
2392
+ }
2393
+ else if (ctx->minicpmv_version == 3) {
2394
+ embed_dim = 3584;
2395
+ }
2396
+ auto pos_embed_t = get_2d_sincos_pos_embed(embed_dim, std::make_pair(pos_w, pos_h));
1888
2397
 
1889
- void* zero_mem = malloc(ggml_nbytes(embeddings));
1890
- memset(zero_mem, 0, ggml_nbytes(embeddings));
1891
- ggml_backend_tensor_set(embeddings, zero_mem, 0, ggml_nbytes(embeddings));
1892
- free(zero_mem);
2398
+ float * pos_embed_data = (float *)malloc(ggml_nbytes(pos_embed));
2399
+ for(int i=0;i<pos_w * pos_h;++i){
2400
+ for(int j=0;j<embed_dim;++j){
2401
+ pos_embed_data[i*embed_dim+j]=pos_embed_t[i][j];
2402
+ }
2403
+ }
2404
+
2405
+ ggml_backend_tensor_set(pos_embed, pos_embed_data, 0, ggml_nbytes(pos_embed));
2406
+ free(pos_embed_data);
1893
2407
  }
1894
2408
  }
2409
+ else{
2410
+ {
2411
+ if (ctx->has_class_embedding) {
2412
+ struct ggml_tensor * embeddings = ggml_graph_get_tensor(gf, "embeddings");
1895
2413
 
1896
- {
1897
- struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions");
2414
+ void* zero_mem = malloc(ggml_nbytes(embeddings));
2415
+ memset(zero_mem, 0, ggml_nbytes(embeddings));
2416
+ ggml_backend_tensor_set(embeddings, zero_mem, 0, ggml_nbytes(embeddings));
2417
+ free(zero_mem);
2418
+ }
2419
+ }
2420
+
2421
+ {
2422
+ struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions");
1898
2423
 
1899
- int* positions_data = (int*)malloc(ggml_nbytes(positions));
1900
- for (int i = 0; i < num_positions; i++) {
1901
- positions_data[i] = i;
2424
+ int* positions_data = (int*)malloc(ggml_nbytes(positions));
2425
+ for (int i = 0; i < num_positions; i++) {
2426
+ positions_data[i] = i;
2427
+ }
2428
+ ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions));
2429
+ free(positions_data);
1902
2430
  }
1903
- ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions));
1904
- free(positions_data);
1905
- }
1906
2431
 
1907
- {
1908
- struct ggml_tensor * patches = ggml_graph_get_tensor(gf, "patches");
1909
- int* patches_data = (int*)malloc(ggml_nbytes(patches));
1910
- for (int i = 0; i < num_patches; i++) {
1911
- patches_data[i] = i + 1;
2432
+ {
2433
+ struct ggml_tensor * patches = ggml_graph_get_tensor(gf, "patches");
2434
+ int* patches_data = (int*)malloc(ggml_nbytes(patches));
2435
+ for (int i = 0; i < num_patches; i++) {
2436
+ patches_data[i] = i + 1;
2437
+ }
2438
+ ggml_backend_tensor_set(patches, patches_data, 0, ggml_nbytes(patches));
2439
+ free(patches_data);
1912
2440
  }
1913
- ggml_backend_tensor_set(patches, patches_data, 0, ggml_nbytes(patches));
1914
- free(patches_data);
1915
2441
  }
1916
2442
 
1917
2443
  if (ggml_backend_is_cpu(ctx->backend)) {
1918
2444
  ggml_backend_cpu_set_n_threads(ctx->backend, n_threads);
1919
2445
  }
1920
2446
 
1921
- #ifdef GGML_USE_METAL
1922
- if (ggml_backend_is_metal(ctx->backend)) {
1923
- ggml_backend_metal_set_n_cb(ctx->backend, n_threads);
1924
- }
1925
- #endif
1926
-
1927
2447
  ggml_backend_graph_compute(ctx->backend, gf);
1928
2448
 
1929
2449
  // the last node is the embedding tensor
1930
- struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 1];
2450
+ struct ggml_tensor * embeddings = ggml_graph_node(gf, -1);
1931
2451
 
1932
2452
  // copy the embeddings to the location passed by the user
1933
2453
  ggml_backend_tensor_get(embeddings, vec, 0, ggml_nbytes(embeddings));
@@ -1999,7 +2519,7 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
1999
2519
  new_type = type;
2000
2520
  if (new_type >= GGML_TYPE_Q2_K && name.find("embd") != std::string::npos) {
2001
2521
  new_type = GGML_TYPE_Q8_0; // ggml_get_rows needs non K type
2002
- // LOG_TEE("%s: quantizing %s to %s\n", __func__, name.c_str(), ggml_type_name(new_type));
2522
+ // LOG_ERR("%s: quantizing %s to %s\n", __func__, name.c_str(), ggml_type_name(new_type));
2003
2523
  }
2004
2524
  const size_t n_elms = ggml_nelements(cur);
2005
2525
  float * f32_data;
@@ -2018,7 +2538,7 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
2018
2538
  f32_data = (float *)conv_buf.data();
2019
2539
  break;
2020
2540
  default:
2021
- LOG_TEE("Please use an input file in f32 or f16\n");
2541
+ LOG_ERR("Please use an input file in f32 or f16\n");
2022
2542
  gguf_free(ctx_out);
2023
2543
  return false;
2024
2544
  }
@@ -2045,7 +2565,7 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
2045
2565
  fout.put(0);
2046
2566
  }
2047
2567
 
2048
- LOG_TEE("%s: n_dims = %d | quantize=%d | size = %f MB -> %f MB\n", name.c_str(), ggml_n_dims(cur), quantize,
2568
+ LOG_INF("%s: n_dims = %d | quantize=%d | size = %f MB -> %f MB\n", name.c_str(), ggml_n_dims(cur), quantize,
2049
2569
  orig_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
2050
2570
  }
2051
2571
 
@@ -2061,8 +2581,8 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
2061
2581
  gguf_free(ctx_out);
2062
2582
 
2063
2583
  {
2064
- LOG_TEE("%s: original size = %8.2f MB\n", __func__, total_size_org / 1024.0 / 1024.0);
2065
- LOG_TEE("%s: quantized size = %8.2f MB\n", __func__, total_size_new / 1024.0 / 1024.0);
2584
+ LOG_INF("%s: original size = %8.2f MB\n", __func__, total_size_org / 1024.0 / 1024.0);
2585
+ LOG_INF("%s: quantized size = %8.2f MB\n", __func__, total_size_new / 1024.0 / 1024.0);
2066
2586
  }
2067
2587
 
2068
2588
  return true;
@@ -2081,7 +2601,22 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
2081
2601
  if (ctx->proj_type == PROJECTOR_TYPE_MLP_NORM) {
2082
2602
  return ctx->vision_model.mm_3_b->ne[0];
2083
2603
  }
2604
+ if (ctx->proj_type == PROJECTOR_TYPE_RESAMPLER) {
2605
+ if (ctx->minicpmv_version == 2) {
2606
+ return 4096;
2607
+ }
2608
+ else if (ctx->minicpmv_version == 3) {
2609
+ return 3584;
2610
+ }
2611
+ }
2084
2612
 
2085
2613
  std::string proj_type = PROJECTOR_TYPE_NAMES[ctx->proj_type];
2086
2614
  throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str()));
2087
2615
  }
2616
+
2617
+ int clip_is_minicpmv(const struct clip_ctx * ctx) {
2618
+ if (ctx->has_minicpmv_projector) {
2619
+ return ctx->minicpmv_version;
2620
+ }
2621
+ return 0;
2622
+ }